2024-02-19 890e1df1bec891d9203724541e81f8fbe5183388
kernel/mm/slub.c
@@ -28,6 +28,7 @@
2828 #include <linux/ctype.h>
2929 #include <linux/debugobjects.h>
3030 #include <linux/kallsyms.h>
31
+#include <linux/kfence.h>
3132 #include <linux/memory.h>
3233 #include <linux/math64.h>
3334 #include <linux/fault-inject.h>
@@ -36,7 +37,9 @@
3637 #include <linux/memcontrol.h>
3738 #include <linux/random.h>
3839
40
+#include <linux/debugfs.h>
3941 #include <trace/events/kmem.h>
42
+#include <trace/hooks/mm.h>
4043
4144 #include "internal.h"
4245
@@ -59,10 +62,11 @@
5962 * D. page->frozen -> frozen state
6063 *
6164 * If a slab is frozen then it is exempt from list management. It is not
62
- * on any list. The processor that froze the slab is the one who can
63
- * perform list operations on the page. Other processors may put objects
64
- * onto the freelist but the processor that froze the slab is the only
65
- * one that can retrieve the objects from the page's freelist.
65
+ * on any list except per cpu partial list. The processor that froze the
66
+ * slab is the one who can perform list operations on the page. Other
67
+ * processors may put objects onto the freelist but the processor that
68
+ * froze the slab is the only one that can retrieve the objects from the
69
+ * page's freelist.
6670 *
6771 * The list_lock protects the partial and full list on each node and
6872 * the partial slab counter. If taken then no new slabs may be added or
@@ -93,9 +97,7 @@
9397 * minimal so we rely on the page allocators per cpu caches for
9498 * fast frees and allocs.
9599 *
96
- * Overloading of page flags that are otherwise used for LRU management.
97
- *
98
- * PageActive The slab is frozen and exempt from list processing.
100
+ * page->frozen The slab is frozen and exempt from list processing.
99101 * This means that the slab is dedicated to a purpose
100102 * such as satisfying allocations for a specific
101103 * processor. Objects may be freed in the slab while
@@ -111,23 +113,27 @@
111113 * free objects in addition to the regular freelist
112114 * that requires the slab lock.
113115 *
114
- * PageError Slab requires special handling due to debug
116
+ * SLAB_DEBUG_FLAGS Slab requires special handling due to debug
115117 * options set. This moves slab handling out of
116118 * the fast path and disables lockless freelists.
117119 */
118120
119
-static inline int kmem_cache_debug(struct kmem_cache *s)
120
-{
121121 #ifdef CONFIG_SLUB_DEBUG
122
- return unlikely(s->flags & SLAB_DEBUG_FLAGS);
122
+#ifdef CONFIG_SLUB_DEBUG_ON
123
+DEFINE_STATIC_KEY_TRUE(slub_debug_enabled);
123124 #else
124
- return 0;
125
+DEFINE_STATIC_KEY_FALSE(slub_debug_enabled);
125126 #endif
127
+#endif
128
+
129
+static inline bool kmem_cache_debug(struct kmem_cache *s)
130
+{
131
+ return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS);
126132 }
127133
128134 void *fixup_red_left(struct kmem_cache *s, void *p)
129135 {
130
- if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE)
136
+ if (kmem_cache_debug_flags(s, SLAB_RED_ZONE))
131137 p += s->red_left_pad;
132138
133139 return p;
@@ -197,33 +203,19 @@
197203 /* Use cmpxchg_double */
198204 #define __CMPXCHG_DOUBLE ((slab_flags_t __force)0x40000000U)
199205
200
-/*
201
- * Tracking user of a slab.
202
- */
203
-#define TRACK_ADDRS_COUNT 16
204
-struct track {
205
- unsigned long addr; /* Called from address */
206
-#ifdef CONFIG_STACKTRACE
207
- unsigned long addrs[TRACK_ADDRS_COUNT]; /* Called from address */
208
-#endif
209
- int cpu; /* Was running on cpu */
210
- int pid; /* Pid context */
211
- unsigned long when; /* When did the operation occur */
212
-};
213
-
214
-enum track_item { TRACK_ALLOC, TRACK_FREE };
215
-
216206 #ifdef CONFIG_SLUB_SYSFS
217207 static int sysfs_slab_add(struct kmem_cache *);
218208 static int sysfs_slab_alias(struct kmem_cache *, const char *);
219
-static void memcg_propagate_slab_attrs(struct kmem_cache *s);
220
-static void sysfs_slab_remove(struct kmem_cache *s);
221209 #else
222210 static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
223211 static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
224212 { return 0; }
225
-static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { }
226
-static inline void sysfs_slab_remove(struct kmem_cache *s) { }
213
+#endif
214
+
215
+#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_SLUB_DEBUG)
216
+static void debugfs_slab_add(struct kmem_cache *);
217
+#else
218
+static inline void debugfs_slab_add(struct kmem_cache *s) { }
227219 #endif
228220
229221 static inline void stat(const struct kmem_cache *s, enum stat_item si)
@@ -251,7 +243,7 @@
251243 {
252244 #ifdef CONFIG_SLAB_FREELIST_HARDENED
253245 /*
254
- * When CONFIG_KASAN_SW_TAGS is enabled, ptr_addr might be tagged.
246
+ * When CONFIG_KASAN_SW/HW_TAGS is enabled, ptr_addr might be tagged.
255247 * Normally, this doesn't cause any issues, as both set_freepointer()
256248 * and get_freepointer() are called with a pointer with the same tag.
257249 * However, there are some issues with CONFIG_SLUB_DEBUG code. For
@@ -277,6 +269,7 @@
277269
278270 static inline void *get_freepointer(struct kmem_cache *s, void *object)
279271 {
272
+ object = kasan_reset_tag(object);
280273 return freelist_dereference(s, object + s->offset);
281274 }
282275
@@ -290,11 +283,12 @@
290283 unsigned long freepointer_addr;
291284 void *p;
292285
293
- if (!debug_pagealloc_enabled())
286
+ if (!debug_pagealloc_enabled_static())
294287 return get_freepointer(s, object);
295288
289
+ object = kasan_reset_tag(object);
296290 freepointer_addr = (unsigned long)object + s->offset;
297
- probe_kernel_read(&p, (void **)freepointer_addr, sizeof(p));
291
+ copy_from_kernel_nofault(&p, (void **)freepointer_addr, sizeof(p));
298292 return freelist_ptr(s, p, freepointer_addr);
299293 }
300294
@@ -306,6 +300,7 @@
306300 BUG_ON(object == fp); /* naive detection of double free or corruption */
307301 #endif
308302
303
+ freeptr_addr = (unsigned long)kasan_reset_tag((void *)freeptr_addr);
309304 *(void **)freeptr_addr = freelist_ptr(s, fp, freeptr_addr);
310305 }
311306
@@ -314,12 +309,6 @@
314309 for (__p = fixup_red_left(__s, __addr); \
315310 __p < (__addr) + (__objects) * (__s)->size; \
316311 __p += (__s)->size)
317
-
318
-/* Determine object index from a given position */
319
-static inline unsigned int slab_index(void *p, struct kmem_cache *s, void *addr)
320
-{
321
- return (kasan_reset_tag(p) - addr) / s->size;
322
-}
323312
324313 static inline unsigned int order_objects(unsigned int order, unsigned int size)
325314 {
@@ -441,19 +430,43 @@
441430 }
442431
443432 #ifdef CONFIG_SLUB_DEBUG
433
+static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)];
434
+static DEFINE_SPINLOCK(object_map_lock);
435
+
436
+static void __fill_map(unsigned long *obj_map, struct kmem_cache *s,
437
+ struct page *page)
438
+{
439
+ void *addr = page_address(page);
440
+ void *p;
441
+
442
+ bitmap_zero(obj_map, page->objects);
443
+
444
+ for (p = page->freelist; p; p = get_freepointer(s, p))
445
+ set_bit(__obj_to_index(s, addr, p), obj_map);
446
+}
447
+
444448 /*
445449 * Determine a map of object in use on a page.
446450 *
447451 * Node listlock must be held to guarantee that the page does
448452 * not vanish from under us.
449453 */
450
-static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map)
454
+static unsigned long *get_map(struct kmem_cache *s, struct page *page)
455
+ __acquires(&object_map_lock)
451456 {
452
- void *p;
453
- void *addr = page_address(page);
457
+ VM_BUG_ON(!irqs_disabled());
454458
455
- for (p = page->freelist; p; p = get_freepointer(s, p))
456
- set_bit(slab_index(p, s, addr), map);
459
+ spin_lock(&object_map_lock);
460
+
461
+ __fill_map(object_map, s, page);
462
+
463
+ return object_map;
464
+}
465
+
466
+static void put_map(unsigned long *map) __releases(&object_map_lock)
467
+{
468
+ VM_BUG_ON(map != object_map);
469
+ spin_unlock(&object_map_lock);
457470 }
458471
459472 static inline unsigned int size_from_object(struct kmem_cache *s)
@@ -476,12 +489,12 @@
476489 * Debug settings:
477490 */
478491 #if defined(CONFIG_SLUB_DEBUG_ON)
479
-static slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS;
492
+slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS;
480493 #else
481
-static slab_flags_t slub_debug;
494
+slab_flags_t slub_debug;
482495 #endif
483496
484
-static char *slub_debug_slabs;
497
+static char *slub_debug_string;
485498 static int disable_higher_order_debug;
486499
487500 /*
@@ -528,9 +541,29 @@
528541 unsigned int length)
529542 {
530543 metadata_access_enable();
531
- print_hex_dump(level, text, DUMP_PREFIX_ADDRESS, 16, 1, addr,
532
- length, 1);
544
+ print_hex_dump(level, text, DUMP_PREFIX_ADDRESS,
545
+ 16, 1, kasan_reset_tag((void *)addr), length, 1);
533546 metadata_access_disable();
547
+}
548
+
549
+/*
550
+ * See comment in calculate_sizes().
551
+ */
552
+static inline bool freeptr_outside_object(struct kmem_cache *s)
553
+{
554
+ return s->offset >= s->inuse;
555
+}
556
+
557
+/*
558
+ * Return offset of the end of info block which is inuse + free pointer if
559
+ * not overlapping with object.
560
+ */
561
+static inline unsigned int get_info_end(struct kmem_cache *s)
562
+{
563
+ if (freeptr_outside_object(s))
564
+ return s->inuse + sizeof(void *);
565
+ else
566
+ return s->inuse;
534567 }
535568
536569 static struct track *get_track(struct kmem_cache *s, void *object,
@@ -538,13 +571,45 @@
538571 {
539572 struct track *p;
540573
541
- if (s->offset)
542
- p = object + s->offset + sizeof(void *);
543
- else
544
- p = object + s->inuse;
574
+ p = object + get_info_end(s);
545575
546
- return p + alloc;
576
+ return kasan_reset_tag(p + alloc);
547577 }
578
+
579
+/*
580
+ * This function will be used to loop through all the slab objects in
581
+ * a page to give track structure for each object, the function fn will
582
+ * be using this track structure and extract required info into its private
583
+ * data, the return value will be the number of track structures that are
584
+ * processed.
585
+ */
586
+unsigned long get_each_object_track(struct kmem_cache *s,
587
+ struct page *page, enum track_item alloc,
588
+ int (*fn)(const struct kmem_cache *, const void *,
589
+ const struct track *, void *), void *private)
590
+{
591
+ void *p;
592
+ struct track *t;
593
+ int ret;
594
+ unsigned long num_track = 0;
595
+
596
+ if (!slub_debug || !(s->flags & SLAB_STORE_USER))
597
+ return 0;
598
+
599
+ slab_lock(page);
600
+ for_each_object(p, s, page_address(page), page->objects) {
601
+ t = get_track(s, p, alloc);
602
+ metadata_access_enable();
603
+ ret = fn(s, p, t, private);
604
+ metadata_access_disable();
605
+ if (ret < 0)
606
+ break;
607
+ num_track += 1;
608
+ }
609
+ slab_unlock(page);
610
+ return num_track;
611
+}
612
+EXPORT_SYMBOL_GPL(get_each_object_track);
548613
549614 static void set_track(struct kmem_cache *s, void *object,
550615 enum track_item alloc, unsigned long addr)
@@ -553,31 +618,25 @@
553618
554619 if (addr) {
555620 #ifdef CONFIG_STACKTRACE
556
- struct stack_trace trace;
557
- int i;
621
+ unsigned int nr_entries;
558622
559
- trace.nr_entries = 0;
560
- trace.max_entries = TRACK_ADDRS_COUNT;
561
- trace.entries = p->addrs;
562
- trace.skip = 3;
563623 metadata_access_enable();
564
- save_stack_trace(&trace);
624
+ nr_entries = stack_trace_save(kasan_reset_tag(p->addrs),
625
+ TRACK_ADDRS_COUNT, 3);
565626 metadata_access_disable();
566627
567
- /* See rant in lockdep.c */
568
- if (trace.nr_entries != 0 &&
569
- trace.entries[trace.nr_entries - 1] == ULONG_MAX)
570
- trace.nr_entries--;
571
-
572
- for (i = trace.nr_entries; i < TRACK_ADDRS_COUNT; i++)
573
- p->addrs[i] = 0;
628
+ if (nr_entries < TRACK_ADDRS_COUNT)
629
+ p->addrs[nr_entries] = 0;
630
+ trace_android_vh_save_track_hash(alloc == TRACK_ALLOC,
631
+ (unsigned long)p);
574632 #endif
575633 p->addr = addr;
576634 p->cpu = smp_processor_id();
577635 p->pid = current->pid;
578636 p->when = jiffies;
579
- } else
637
+ } else {
580638 memset(p, 0, sizeof(struct track));
639
+ }
581640 }
582641
583642 static void init_tracking(struct kmem_cache *s, void *object)
@@ -608,7 +667,7 @@
608667 #endif
609668 }
610669
611
-static void print_tracking(struct kmem_cache *s, void *object)
670
+void print_tracking(struct kmem_cache *s, void *object)
612671 {
613672 unsigned long pr_time = jiffies;
614673 if (!(s->flags & SLAB_STORE_USER))
@@ -636,8 +695,6 @@
636695 pr_err("=============================================================================\n");
637696 pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf);
638697 pr_err("-----------------------------------------------------------------------------\n\n");
639
-
640
- add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
641698 va_end(args);
642699 }
643700
@@ -691,10 +748,7 @@
691748 print_section(KERN_ERR, "Redzone ", p + s->object_size,
692749 s->inuse - s->object_size);
693750
694
- if (s->offset)
695
- off = s->offset + sizeof(void *);
696
- else
697
- off = s->inuse;
751
+ off = get_info_end(s);
698752
699753 if (s->flags & SLAB_STORE_USER)
700754 off += 2 * sizeof(struct track);
@@ -714,6 +768,7 @@
714768 {
715769 slab_bug(s, "%s", reason);
716770 print_trailer(s, page, object);
771
+ add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
717772 }
718773
719774 static __printf(3, 4) void slab_err(struct kmem_cache *s, struct page *page,
@@ -728,11 +783,12 @@
728783 slab_bug(s, "%s", buf);
729784 print_page_info(page);
730785 dump_stack();
786
+ add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
731787 }
732788
733789 static void init_object(struct kmem_cache *s, void *object, u8 val)
734790 {
735
- u8 *p = object;
791
+ u8 *p = kasan_reset_tag(object);
736792
737793 if (s->flags & SLAB_RED_ZONE)
738794 memset(p - s->red_left_pad, val, s->red_left_pad);
@@ -759,9 +815,10 @@
759815 {
760816 u8 *fault;
761817 u8 *end;
818
+ u8 *addr = page_address(page);
762819
763820 metadata_access_enable();
764
- fault = memchr_inv(start, value, bytes);
821
+ fault = memchr_inv(kasan_reset_tag(start), value, bytes);
765822 metadata_access_disable();
766823 if (!fault)
767824 return 1;
@@ -771,9 +828,11 @@
771828 end--;
772829
773830 slab_bug(s, "%s overwritten", what);
774
- pr_err("INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n",
775
- fault, end - 1, fault[0], value);
831
+ pr_err("INFO: 0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n",
832
+ fault, end - 1, fault - addr,
833
+ fault[0], value);
776834 print_trailer(s, page, object);
835
+ add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
777836
778837 restore_bytes(s, what, value, fault, end);
779838 return 0;
@@ -785,7 +844,7 @@
785844 * object address
786845 * Bytes of the object to be managed.
787846 * If the freepointer may overlay the object then the free
788
- * pointer is the first word of the object.
847
+ * pointer is at the middle of the object.
789848 *
790849 * Poisoning uses 0x6b (POISON_FREE) and the last byte is
791850 * 0xa5 (POISON_END)
@@ -819,11 +878,7 @@
819878
820879 static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
821880 {
822
- unsigned long off = s->inuse; /* The end of info */
823
-
824
- if (s->offset)
825
- /* Freepointer is placed after the object. */
826
- off += sizeof(void *);
881
+ unsigned long off = get_info_end(s); /* The end of info */
827882
828883 if (s->flags & SLAB_STORE_USER)
829884 /* We also have user information there */
@@ -852,7 +907,7 @@
852907 return 1;
853908
854909 start = page_address(page);
855
- length = PAGE_SIZE << compound_order(page);
910
+ length = page_size(page);
856911 end = start + length;
857912 remainder = length % s->size;
858913 if (!remainder)
@@ -860,14 +915,15 @@
860915
861916 pad = end - remainder;
862917 metadata_access_enable();
863
- fault = memchr_inv(pad, POISON_INUSE, remainder);
918
+ fault = memchr_inv(kasan_reset_tag(pad), POISON_INUSE, remainder);
864919 metadata_access_disable();
865920 if (!fault)
866921 return 1;
867922 while (end > fault && end[-1] == POISON_INUSE)
868923 end--;
869924
870
- slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
925
+ slab_err(s, page, "Padding overwritten. 0x%p-0x%p @offset=%tu",
926
+ fault, end - 1, fault - start);
871927 print_section(KERN_ERR, "Padding ", pad, remainder);
872928
873929 restore_bytes(s, "slab padding", POISON_INUSE, fault, end);
@@ -909,7 +965,7 @@
909965 check_pad_bytes(s, page, p);
910966 }
911967
912
- if (!s->offset && val == SLUB_RED_ACTIVE)
968
+ if (!freeptr_outside_object(s) && val == SLUB_RED_ACTIVE)
913969 /*
914970 * Object and freepointer overlap. Cannot check
915971 * freepointer while object is allocated.
@@ -1038,7 +1094,7 @@
10381094 return;
10391095
10401096 lockdep_assert_held(&n->list_lock);
1041
- list_add(&page->lru, &n->full);
1097
+ list_add(&page->slab_list, &n->full);
10421098 }
10431099
10441100 static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page)
@@ -1047,7 +1103,7 @@
10471103 return;
10481104
10491105 lockdep_assert_held(&n->list_lock);
1050
- list_del(&page->lru);
1106
+ list_del(&page->slab_list);
10511107 }
10521108
10531109 /* Tracking of the number of slabs for debugging purposes */
@@ -1090,26 +1146,26 @@
10901146 static void setup_object_debug(struct kmem_cache *s, struct page *page,
10911147 void *object)
10921148 {
1093
- if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON)))
1149
+ if (!kmem_cache_debug_flags(s, SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON))
10941150 return;
10951151
10961152 init_object(s, object, SLUB_RED_INACTIVE);
10971153 init_tracking(s, object);
10981154 }
10991155
1100
-static void setup_page_debug(struct kmem_cache *s, void *addr, int order)
1156
+static
1157
+void setup_page_debug(struct kmem_cache *s, struct page *page, void *addr)
11011158 {
1102
- if (!(s->flags & SLAB_POISON))
1159
+ if (!kmem_cache_debug_flags(s, SLAB_POISON))
11031160 return;
11041161
11051162 metadata_access_enable();
1106
- memset(addr, POISON_INUSE, PAGE_SIZE << order);
1163
+ memset(kasan_reset_tag(addr), POISON_INUSE, page_size(page));
11071164 metadata_access_disable();
11081165 }
11091166
11101167 static inline int alloc_consistency_checks(struct kmem_cache *s,
1111
- struct page *page,
1112
- void *object, unsigned long addr)
1168
+ struct page *page, void *object)
11131169 {
11141170 if (!check_slab(s, page))
11151171 return 0;
@@ -1130,7 +1186,7 @@
11301186 void *object, unsigned long addr)
11311187 {
11321188 if (s->flags & SLAB_CONSISTENCY_CHECKS) {
1133
- if (!alloc_consistency_checks(s, page, object, addr))
1189
+ if (!alloc_consistency_checks(s, page, object))
11341190 goto bad;
11351191 }
11361192
@@ -1196,7 +1252,7 @@
11961252 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
11971253 void *object = head;
11981254 int cnt = 0;
1199
- unsigned long uninitialized_var(flags);
1255
+ unsigned long flags;
12001256 int ret = 0;
12011257
12021258 spin_lock_irqsave(&n->list_lock, flags);
@@ -1240,69 +1296,138 @@
12401296 return ret;
12411297 }
12421298
1243
-static int __init setup_slub_debug(char *str)
1299
+/*
1300
+ * Parse a block of slub_debug options. Blocks are delimited by ';'
1301
+ *
1302
+ * @str: start of block
1303
+ * @flags: returns parsed flags, or DEBUG_DEFAULT_FLAGS if none specified
1304
+ * @slabs: return start of list of slabs, or NULL when there's no list
1305
+ * @init: assume this is initial parsing and not per-kmem-create parsing
1306
+ *
1307
+ * returns the start of next block if there's any, or NULL
1308
+ */
1309
+static char *
1310
+parse_slub_debug_flags(char *str, slab_flags_t *flags, char **slabs, bool init)
12441311 {
1245
- slub_debug = DEBUG_DEFAULT_FLAGS;
1246
- if (*str++ != '=' || !*str)
1247
- /*
1248
- * No options specified. Switch on full debugging.
1249
- */
1250
- goto out;
1312
+ bool higher_order_disable = false;
12511313
1252
- if (*str == ',')
1314
+ /* Skip any completely empty blocks */
1315
+ while (*str && *str == ';')
1316
+ str++;
1317
+
1318
+ if (*str == ',') {
12531319 /*
12541320 * No options but restriction on slabs. This means full
12551321 * debugging for slabs matching a pattern.
12561322 */
1323
+ *flags = DEBUG_DEFAULT_FLAGS;
12571324 goto check_slabs;
1325
+ }
1326
+ *flags = 0;
12581327
1259
- slub_debug = 0;
1260
- if (*str == '-')
1261
- /*
1262
- * Switch off all debugging measures.
1263
- */
1264
- goto out;
1265
-
1266
- /*
1267
- * Determine which debug features should be switched on
1268
- */
1269
- for (; *str && *str != ','; str++) {
1328
+ /* Determine which debug features should be switched on */
1329
+ for (; *str && *str != ',' && *str != ';'; str++) {
12701330 switch (tolower(*str)) {
1331
+ case '-':
1332
+ *flags = 0;
1333
+ break;
12711334 case 'f':
1272
- slub_debug |= SLAB_CONSISTENCY_CHECKS;
1335
+ *flags |= SLAB_CONSISTENCY_CHECKS;
12731336 break;
12741337 case 'z':
1275
- slub_debug |= SLAB_RED_ZONE;
1338
+ *flags |= SLAB_RED_ZONE;
12761339 break;
12771340 case 'p':
1278
- slub_debug |= SLAB_POISON;
1341
+ *flags |= SLAB_POISON;
12791342 break;
12801343 case 'u':
1281
- slub_debug |= SLAB_STORE_USER;
1344
+ *flags |= SLAB_STORE_USER;
12821345 break;
12831346 case 't':
1284
- slub_debug |= SLAB_TRACE;
1347
+ *flags |= SLAB_TRACE;
12851348 break;
12861349 case 'a':
1287
- slub_debug |= SLAB_FAILSLAB;
1350
+ *flags |= SLAB_FAILSLAB;
12881351 break;
12891352 case 'o':
12901353 /*
12911354 * Avoid enabling debugging on caches if its minimum
12921355 * order would increase as a result.
12931356 */
1294
- disable_higher_order_debug = 1;
1357
+ higher_order_disable = true;
12951358 break;
12961359 default:
1297
- pr_err("slub_debug option '%c' unknown. skipped\n",
1298
- *str);
1360
+ if (init)
1361
+ pr_err("slub_debug option '%c' unknown. skipped\n", *str);
1362
+ }
1363
+ }
1364
+check_slabs:
1365
+ if (*str == ',')
1366
+ *slabs = ++str;
1367
+ else
1368
+ *slabs = NULL;
1369
+
1370
+ /* Skip over the slab list */
1371
+ while (*str && *str != ';')
1372
+ str++;
1373
+
1374
+ /* Skip any completely empty blocks */
1375
+ while (*str && *str == ';')
1376
+ str++;
1377
+
1378
+ if (init && higher_order_disable)
1379
+ disable_higher_order_debug = 1;
1380
+
1381
+ if (*str)
1382
+ return str;
1383
+ else
1384
+ return NULL;
1385
+}
1386
+
1387
+static int __init setup_slub_debug(char *str)
1388
+{
1389
+ slab_flags_t flags;
1390
+ slab_flags_t global_flags;
1391
+ char *saved_str;
1392
+ char *slab_list;
1393
+ bool global_slub_debug_changed = false;
1394
+ bool slab_list_specified = false;
1395
+
1396
+ global_flags = DEBUG_DEFAULT_FLAGS;
1397
+ if (*str++ != '=' || !*str)
1398
+ /*
1399
+ * No options specified. Switch on full debugging.
1400
+ */
1401
+ goto out;
1402
+
1403
+ saved_str = str;
1404
+ while (str) {
1405
+ str = parse_slub_debug_flags(str, &flags, &slab_list, true);
1406
+
1407
+ if (!slab_list) {
1408
+ global_flags = flags;
1409
+ global_slub_debug_changed = true;
1410
+ } else {
1411
+ slab_list_specified = true;
12991412 }
13001413 }
13011414
1302
-check_slabs:
1303
- if (*str == ',')
1304
- slub_debug_slabs = str + 1;
1415
+ /*
1416
+ * For backwards compatibility, a single list of flags with list of
1417
+ * slabs means debugging is only changed for those slabs, so the global
1418
+ * slub_debug should be unchanged (0 or DEBUG_DEFAULT_FLAGS, depending
1419
+ * on CONFIG_SLUB_DEBUG_ON). We can extended that to multiple lists as
1420
+ * long as there is no option specifying flags without a slab list.
1421
+ */
1422
+ if (slab_list_specified) {
1423
+ if (!global_slub_debug_changed)
1424
+ global_flags = slub_debug;
1425
+ slub_debug_string = saved_str;
1426
+ }
13051427 out:
1428
+ slub_debug = global_flags;
1429
+ if (slub_debug != 0 || slub_debug_string)
1430
+ static_branch_enable(&slub_debug_enabled);
13061431 if ((static_branch_unlikely(&init_on_alloc) ||
13071432 static_branch_unlikely(&init_on_free)) &&
13081433 (slub_debug & SLAB_POISON))
@@ -1312,24 +1437,65 @@
13121437
13131438 __setup("slub_debug", setup_slub_debug);
13141439
1440
+/*
1441
+ * kmem_cache_flags - apply debugging options to the cache
1442
+ * @object_size: the size of an object without meta data
1443
+ * @flags: flags to set
1444
+ * @name: name of the cache
1445
+ *
1446
+ * Debug option(s) are applied to @flags. In addition to the debug
1447
+ * option(s), if a slab name (or multiple) is specified i.e.
1448
+ * slub_debug=<Debug-Options>,<slab name1>,<slab name2> ...
1449
+ * then only the select slabs will receive the debug option(s).
1450
+ */
13151451 slab_flags_t kmem_cache_flags(unsigned int object_size,
1316
- slab_flags_t flags, const char *name,
1317
- void (*ctor)(void *))
1452
+ slab_flags_t flags, const char *name)
13181453 {
1319
- /*
1320
- * Enable debugging if selected on the kernel commandline.
1321
- */
1322
- if (slub_debug && (!slub_debug_slabs || (name &&
1323
- !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)))))
1324
- flags |= slub_debug;
1454
+ char *iter;
1455
+ size_t len;
1456
+ char *next_block;
1457
+ slab_flags_t block_flags;
13251458
1326
- return flags;
1459
+ len = strlen(name);
1460
+ next_block = slub_debug_string;
1461
+ /* Go through all blocks of debug options, see if any matches our slab's name */
1462
+ while (next_block) {
1463
+ next_block = parse_slub_debug_flags(next_block, &block_flags, &iter, false);
1464
+ if (!iter)
1465
+ continue;
1466
+ /* Found a block that has a slab list, search it */
1467
+ while (*iter) {
1468
+ char *end, *glob;
1469
+ size_t cmplen;
1470
+
1471
+ end = strchrnul(iter, ',');
1472
+ if (next_block && next_block < end)
1473
+ end = next_block - 1;
1474
+
1475
+ glob = strnchr(iter, end - iter, '*');
1476
+ if (glob)
1477
+ cmplen = glob - iter;
1478
+ else
1479
+ cmplen = max_t(size_t, len, (end - iter));
1480
+
1481
+ if (!strncmp(name, iter, cmplen)) {
1482
+ flags |= block_flags;
1483
+ return flags;
1484
+ }
1485
+
1486
+ if (!*end || *end == ';')
1487
+ break;
1488
+ iter = end + 1;
1489
+ }
1490
+ }
1491
+
1492
+ return flags | slub_debug;
13271493 }
13281494 #else /* !CONFIG_SLUB_DEBUG */
13291495 static inline void setup_object_debug(struct kmem_cache *s,
13301496 struct page *page, void *object) {}
1331
-static inline void setup_page_debug(struct kmem_cache *s,
1332
- void *addr, int order) {}
1497
+static inline
1498
+void setup_page_debug(struct kmem_cache *s, struct page *page, void *addr) {}
13331499
13341500 static inline int alloc_debug_processing(struct kmem_cache *s,
13351501 struct page *page, void *object, unsigned long addr) { return 0; }
@@ -1348,8 +1514,7 @@
13481514 static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
13491515 struct page *page) {}
13501516 slab_flags_t kmem_cache_flags(unsigned int object_size,
1351
- slab_flags_t flags, const char *name,
1352
- void (*ctor)(void *))
1517
+ slab_flags_t flags, const char *name)
13531518 {
13541519 return flags;
13551520 }
@@ -1380,6 +1545,7 @@
13801545 static inline void *kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
13811546 {
13821547 ptr = kasan_kmalloc_large(ptr, size, flags);
1548
+ /* As ptr might get tagged, call kmemleak hook after KASAN. */
13831549 kmemleak_alloc(ptr, size, 1, flags);
13841550 return ptr;
13851551 }
@@ -1387,10 +1553,11 @@
13871553 static __always_inline void kfree_hook(void *x)
13881554 {
13891555 kmemleak_free(x);
1390
- kasan_kfree_large(x, _RET_IP_);
1556
+ kasan_kfree_large(x);
13911557 }
13921558
1393
-static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x)
1559
+static __always_inline bool slab_free_hook(struct kmem_cache *s,
1560
+ void *x, bool init)
13941561 {
13951562 kmemleak_free_recursive(x, s->flags);
13961563
@@ -1411,8 +1578,30 @@
14111578 if (!(s->flags & SLAB_DEBUG_OBJECTS))
14121579 debug_check_no_obj_freed(x, s->object_size);
14131580
1414
- /* KASAN might put x into memory quarantine, delaying its reuse */
1415
- return kasan_slab_free(s, x, _RET_IP_);
1581
+ /* Use KCSAN to help debug racy use-after-free. */
1582
+ if (!(s->flags & SLAB_TYPESAFE_BY_RCU))
1583
+ __kcsan_check_access(x, s->object_size,
1584
+ KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT);
1585
+
1586
+ /*
1587
+ * As memory initialization might be integrated into KASAN,
1588
+ * kasan_slab_free and initialization memset's must be
1589
+ * kept together to avoid discrepancies in behavior.
1590
+ *
1591
+ * The initialization memset's clear the object and the metadata,
1592
+ * but don't touch the SLAB redzone.
1593
+ */
1594
+ if (init) {
1595
+ int rsize;
1596
+
1597
+ if (!kasan_has_integrated_init())
1598
+ memset(kasan_reset_tag(x), 0, s->object_size);
1599
+ rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad : 0;
1600
+ memset((char *)kasan_reset_tag(x) + s->inuse, 0,
1601
+ s->size - s->inuse - rsize);
1602
+ }
1603
+ /* KASAN might put x into memory quarantine, delaying its reuse. */
1604
+ return kasan_slab_free(s, x, init);
14161605 }
14171606
14181607 static inline bool slab_free_freelist_hook(struct kmem_cache *s,
@@ -1423,7 +1612,11 @@
14231612 void *object;
14241613 void *next = *head;
14251614 void *old_tail = *tail ? *tail : *head;
1426
- int rsize;
1615
+
1616
+ if (is_kfence_address(next)) {
1617
+ slab_free_hook(s, next, false);
1618
+ return true;
1619
+ }
14271620
14281621 /* Head and tail of the reconstructed freelist */
14291622 *head = NULL;
@@ -1433,20 +1626,8 @@
14331626 object = next;
14341627 next = get_freepointer(s, object);
14351628
1436
- if (slab_want_init_on_free(s)) {
1437
- /*
1438
- * Clear the object and the metadata, but don't touch
1439
- * the redzone.
1440
- */
1441
- memset(object, 0, s->object_size);
1442
- rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad
1443
- : 0;
1444
- memset((char *)object + s->inuse, 0,
1445
- s->size - s->inuse - rsize);
1446
-
1447
- }
14481629 /* If object's reuse doesn't have to be delayed */
1449
- if (!slab_free_hook(s, object)) {
1630
+ if (!slab_free_hook(s, object, slab_want_init_on_free(s))) {
14501631 /* Move object to the new freelist */
14511632 set_freepointer(s, object, *head);
14521633 *head = object;
@@ -1494,10 +1675,8 @@
14941675 else
14951676 page = __alloc_pages_node(node, flags, order);
14961677
1497
- if (page && memcg_charge_slab(page, flags, order, s)) {
1498
- __free_pages(page, order);
1499
- page = NULL;
1500
- }
1678
+ if (page)
1679
+ account_slab_page(page, order, s);
15011680
15021681 return page;
15031682 }
@@ -1617,7 +1796,7 @@
16171796 struct kmem_cache_order_objects oo = s->oo;
16181797 gfp_t alloc_gfp;
16191798 void *start, *p, *next;
1620
- int idx, order;
1799
+ int idx;
16211800 bool shuffle;
16221801
16231802 flags &= gfp_allowed_mask;
@@ -1651,7 +1830,6 @@
16511830
16521831 page->objects = oo_objects(oo);
16531832
1654
- order = compound_order(page);
16551833 page->slab_cache = s;
16561834 __SetPageSlab(page);
16571835 if (page_is_pfmemalloc(page))
@@ -1661,7 +1839,7 @@
16611839
16621840 start = page_address(page);
16631841
1664
- setup_page_debug(s, start, order);
1842
+ setup_page_debug(s, page, start);
16651843
16661844 shuffle = shuffle_freelist(s, page);
16671845
@@ -1687,11 +1865,6 @@
16871865 if (!page)
16881866 return NULL;
16891867
1690
- mod_lruvec_page_state(page,
1691
- (s->flags & SLAB_RECLAIM_ACCOUNT) ?
1692
- NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1693
- 1 << oo_order(oo));
1694
-
16951868 inc_slabs_node(s, page_to_nid(page), page->objects);
16961869
16971870 return page;
@@ -1699,13 +1872,8 @@
16991872
17001873 static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
17011874 {
1702
- if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
1703
- gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK;
1704
- flags &= ~GFP_SLAB_BUG_MASK;
1705
- pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n",
1706
- invalid_mask, &invalid_mask, flags, &flags);
1707
- dump_stack();
1708
- }
1875
+ if (unlikely(flags & GFP_SLAB_BUG_MASK))
1876
+ flags = kmalloc_fix_flags(flags);
17091877
17101878 return allocate_slab(s,
17111879 flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
@@ -1716,7 +1884,7 @@
17161884 int order = compound_order(page);
17171885 int pages = 1 << order;
17181886
1719
- if (s->flags & SLAB_CONSISTENCY_CHECKS) {
1887
+ if (kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) {
17201888 void *p;
17211889
17221890 slab_pad_check(s, page);
@@ -1725,18 +1893,13 @@
17251893 check_object(s, page, p, SLUB_RED_INACTIVE);
17261894 }
17271895
1728
- mod_lruvec_page_state(page,
1729
- (s->flags & SLAB_RECLAIM_ACCOUNT) ?
1730
- NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1731
- -pages);
1732
-
17331896 __ClearPageSlabPfmemalloc(page);
17341897 __ClearPageSlab(page);
17351898
17361899 page->mapping = NULL;
17371900 if (current->reclaim_state)
17381901 current->reclaim_state->reclaimed_slab += pages;
1739
- memcg_uncharge_slab(page, order, s);
1902
+ unaccount_slab_page(page, order, s);
17401903 __free_pages(page, order);
17411904 }
17421905
....@@ -1769,9 +1932,9 @@
17691932 {
17701933 n->nr_partial++;
17711934 if (tail == DEACTIVATE_TO_TAIL)
1772
- list_add_tail(&page->lru, &n->partial);
1935
+ list_add_tail(&page->slab_list, &n->partial);
17731936 else
1774
- list_add(&page->lru, &n->partial);
1937
+ list_add(&page->slab_list, &n->partial);
17751938 }
17761939
17771940 static inline void add_partial(struct kmem_cache_node *n,
....@@ -1785,7 +1948,7 @@
17851948 struct page *page)
17861949 {
17871950 lockdep_assert_held(&n->list_lock);
1788
- list_del(&page->lru);
1951
+ list_del(&page->slab_list);
17891952 n->nr_partial--;
17901953 }
17911954
....@@ -1852,14 +2015,14 @@
18522015 /*
18532016 * Racy check. If we mistakenly see no partial slabs then we
18542017 * just allocate an empty slab. If we mistakenly try to get a
1855
- * partial slab and there is none available then get_partials()
2018
+ * partial slab and there is none available then get_partial()
18562019 * will return NULL.
18572020 */
18582021 if (!n || !n->nr_partial)
18592022 return NULL;
18602023
18612024 spin_lock(&n->list_lock);
1862
- list_for_each_entry_safe(page, page2, &n->partial, lru) {
2025
+ list_for_each_entry_safe(page, page2, &n->partial, slab_list) {
18632026 void *t;
18642027
18652028 if (!pfmemalloc_match(page, flags))
....@@ -1897,7 +2060,7 @@
18972060 struct zonelist *zonelist;
18982061 struct zoneref *z;
18992062 struct zone *zone;
1900
- enum zone_type high_zoneidx = gfp_zone(flags);
2063
+ enum zone_type highest_zoneidx = gfp_zone(flags);
19012064 void *object;
19022065 unsigned int cpuset_mems_cookie;
19032066
....@@ -1926,7 +2089,7 @@
19262089 do {
19272090 cpuset_mems_cookie = read_mems_allowed_begin();
19282091 zonelist = node_zonelist(mempolicy_slab_node(), flags);
1929
- for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
2092
+ for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) {
19302093 struct kmem_cache_node *n;
19312094
19322095 n = get_node(s, zone_to_nid(zone));
....@@ -1947,7 +2110,7 @@
19472110 }
19482111 }
19492112 } while (read_mems_allowed_retry(cpuset_mems_cookie));
1950
-#endif
2113
+#endif /* CONFIG_NUMA */
19512114 return NULL;
19522115 }
19532116
....@@ -1970,9 +2133,9 @@
19702133 return get_any_partial(s, flags, c);
19712134 }
19722135
1973
-#ifdef CONFIG_PREEMPT
2136
+#ifdef CONFIG_PREEMPTION
19742137 /*
1975
- * Calculate the next globally unique transaction for disambiguiation
2138
+ * Calculate the next globally unique transaction for disambiguation
19762139 * during cmpxchg. The transactions start with the cpu number and are then
19772140 * incremented by CONFIG_NR_CPUS.
19782141 */
....@@ -1990,6 +2153,7 @@
19902153 return tid + TID_STEP;
19912154 }
19922155
2156
+#ifdef SLUB_DEBUG_CMPXCHG
19932157 static inline unsigned int tid_to_cpu(unsigned long tid)
19942158 {
19952159 return tid % TID_STEP;
....@@ -1999,6 +2163,7 @@
19992163 {
20002164 return tid / TID_STEP;
20012165 }
2166
+#endif
20022167
20032168 static inline unsigned int init_tid(int cpu)
20042169 {
....@@ -2013,7 +2178,7 @@
20132178
20142179 pr_info("%s %s: cmpxchg redo ", n, s->name);
20152180
2016
-#ifdef CONFIG_PREEMPT
2181
+#ifdef CONFIG_PREEMPTION
20172182 if (tid_to_cpu(tid) != tid_to_cpu(actual_tid))
20182183 pr_warn("due to cpu change %d -> %d\n",
20192184 tid_to_cpu(tid), tid_to_cpu(actual_tid));
....@@ -2131,7 +2296,7 @@
21312296 if (!lock) {
21322297 lock = 1;
21332298 /*
2134
- * Taking the spinlock removes the possiblity
2299
+ * Taking the spinlock removes the possibility
21352300 * that acquire_slab() will see a slab page that
21362301 * is frozen
21372302 */
....@@ -2139,7 +2304,8 @@
21392304 }
21402305 } else {
21412306 m = M_FULL;
2142
- if (kmem_cache_debug(s) && !lock) {
2307
+#ifdef CONFIG_SLUB_DEBUG
2308
+ if ((s->flags & SLAB_STORE_USER) && !lock) {
21432309 lock = 1;
21442310 /*
21452311 * This also ensures that the scanning of full
....@@ -2148,29 +2314,19 @@
21482314 */
21492315 spin_lock(&n->list_lock);
21502316 }
2317
+#endif
21512318 }
21522319
21532320 if (l != m) {
2154
-
21552321 if (l == M_PARTIAL)
2156
-
21572322 remove_partial(n, page);
2158
-
21592323 else if (l == M_FULL)
2160
-
21612324 remove_full(s, n, page);
21622325
2163
- if (m == M_PARTIAL) {
2164
-
2326
+ if (m == M_PARTIAL)
21652327 add_partial(n, page, tail);
2166
- stat(s, tail);
2167
-
2168
- } else if (m == M_FULL) {
2169
-
2170
- stat(s, DEACTIVATE_FULL);
2328
+ else if (m == M_FULL)
21712329 add_full(s, n, page);
2172
-
2173
- }
21742330 }
21752331
21762332 l = m;
....@@ -2183,7 +2339,11 @@
21832339 if (lock)
21842340 spin_unlock(&n->list_lock);
21852341
2186
- if (m == M_FREE) {
2342
+ if (m == M_PARTIAL)
2343
+ stat(s, tail);
2344
+ else if (m == M_FULL)
2345
+ stat(s, DEACTIVATE_FULL);
2346
+ else if (m == M_FREE) {
21872347 stat(s, DEACTIVATE_EMPTY);
21882348 discard_slab(s, page);
21892349 stat(s, FREE_SLAB);
....@@ -2191,6 +2351,7 @@
21912351
21922352 c->page = NULL;
21932353 c->freelist = NULL;
2354
+ c->tid = next_tid(c->tid);
21942355 }
21952356
21962357 /*
....@@ -2207,11 +2368,11 @@
22072368 struct kmem_cache_node *n = NULL, *n2 = NULL;
22082369 struct page *page, *discard_page = NULL;
22092370
2210
- while ((page = c->partial)) {
2371
+ while ((page = slub_percpu_partial(c))) {
22112372 struct page new;
22122373 struct page old;
22132374
2214
- c->partial = page->next;
2375
+ slub_set_percpu_partial(c, page);
22152376
22162377 n2 = get_node(s, page_to_nid(page));
22172378 if (n != n2) {
....@@ -2258,12 +2419,12 @@
22582419 discard_slab(s, page);
22592420 stat(s, FREE_SLAB);
22602421 }
2261
-#endif
2422
+#endif /* CONFIG_SLUB_CPU_PARTIAL */
22622423 }
22632424
22642425 /*
2265
- * Put a page that was just frozen (in __slab_free) into a partial page
2266
- * slot if available.
2426
+ * Put a page that was just frozen (in __slab_free|get_partial_node) into a
2427
+ * partial page slot if available.
22672428 *
22682429 * If we did not find a slot then simply move all the partials to the
22692430 * per node partial list.
....@@ -2284,7 +2445,7 @@
22842445 if (oldpage) {
22852446 pobjects = oldpage->pobjects;
22862447 pages = oldpage->pages;
2287
- if (drain && pobjects > s->cpu_partial) {
2448
+ if (drain && pobjects > slub_cpu_partial(s)) {
22882449 unsigned long flags;
22892450 /*
22902451 * partial array is full. Move the existing
....@@ -2309,7 +2470,7 @@
23092470
23102471 } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page)
23112472 != oldpage);
2312
- if (unlikely(!s->cpu_partial)) {
2473
+ if (unlikely(!slub_cpu_partial(s))) {
23132474 unsigned long flags;
23142475
23152476 local_irq_save(flags);
....@@ -2317,15 +2478,13 @@
23172478 local_irq_restore(flags);
23182479 }
23192480 preempt_enable();
2320
-#endif
2481
+#endif /* CONFIG_SLUB_CPU_PARTIAL */
23212482 }
23222483
23232484 static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
23242485 {
23252486 stat(s, CPUSLAB_FLUSH);
23262487 deactivate_slab(s, c->page, c->freelist, c);
2327
-
2328
- c->tid = next_tid(c->tid);
23292488 }
23302489
23312490 /*
....@@ -2337,12 +2496,10 @@
23372496 {
23382497 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
23392498
2340
- if (likely(c)) {
2341
- if (c->page)
2342
- flush_slab(s, c);
2499
+ if (c->page)
2500
+ flush_slab(s, c);
23432501
2344
- unfreeze_partials(s, c);
2345
- }
2502
+ unfreeze_partials(s, c);
23462503 }
23472504
23482505 static void flush_cpu_slab(void *d)
....@@ -2362,7 +2519,7 @@
23622519
23632520 static void flush_all(struct kmem_cache *s)
23642521 {
2365
- on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC);
2522
+ on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1);
23662523 }
23672524
23682525 /*
....@@ -2391,7 +2548,7 @@
23912548 static inline int node_match(struct page *page, int node)
23922549 {
23932550 #ifdef CONFIG_NUMA
2394
- if (!page || (node != NUMA_NO_NODE && page_to_nid(page) != node))
2551
+ if (node != NUMA_NO_NODE && page_to_nid(page) != node)
23952552 return 0;
23962553 #endif
23972554 return 1;
....@@ -2418,7 +2575,7 @@
24182575 struct page *page;
24192576
24202577 spin_lock_irqsave(&n->list_lock, flags);
2421
- list_for_each_entry(page, &n->partial, lru)
2578
+ list_for_each_entry(page, &n->partial, slab_list)
24222579 x += get_count(page);
24232580 spin_unlock_irqrestore(&n->list_lock, flags);
24242581 return x;
....@@ -2492,8 +2649,7 @@
24922649 stat(s, ALLOC_SLAB);
24932650 c->page = page;
24942651 *pc = c;
2495
- } else
2496
- freelist = NULL;
2652
+ }
24972653
24982654 return freelist;
24992655 }
....@@ -2565,6 +2721,8 @@
25652721 void *freelist;
25662722 struct page *page;
25672723
2724
+ stat(s, ALLOC_SLOWPATH);
2725
+
25682726 page = c->page;
25692727 if (!page) {
25702728 /*
....@@ -2612,6 +2770,7 @@
26122770
26132771 if (!freelist) {
26142772 c->page = NULL;
2773
+ c->tid = next_tid(c->tid);
26152774 stat(s, DEACTIVATE_BYPASS);
26162775 goto new_slab;
26172776 }
....@@ -2669,7 +2828,7 @@
26692828 unsigned long flags;
26702829
26712830 local_irq_save(flags);
2672
-#ifdef CONFIG_PREEMPT
2831
+#ifdef CONFIG_PREEMPTION
26732832 /*
26742833 * We may have been preempted and rescheduled on a different
26752834 * cpu before disabling interrupts. Need to reload cpu area
....@@ -2691,7 +2850,8 @@
26912850 void *obj)
26922851 {
26932852 if (unlikely(slab_want_init_on_free(s)) && obj)
2694
- memset((void *)((char *)obj + s->offset), 0, sizeof(void *));
2853
+ memset((void *)((char *)kasan_reset_tag(obj) + s->offset),
2854
+ 0, sizeof(void *));
26952855 }
26962856
26972857 /*
....@@ -2705,16 +2865,23 @@
27052865 * Otherwise we can simply pick the next object from the lockless free list.
27062866 */
27072867 static __always_inline void *slab_alloc_node(struct kmem_cache *s,
2708
- gfp_t gfpflags, int node, unsigned long addr)
2868
+ gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
27092869 {
27102870 void *object;
27112871 struct kmem_cache_cpu *c;
27122872 struct page *page;
27132873 unsigned long tid;
2874
+ struct obj_cgroup *objcg = NULL;
2875
+ bool init = false;
27142876
2715
- s = slab_pre_alloc_hook(s, gfpflags);
2877
+ s = slab_pre_alloc_hook(s, &objcg, 1, gfpflags);
27162878 if (!s)
27172879 return NULL;
2880
+
2881
+ object = kfence_alloc(s, orig_size, gfpflags);
2882
+ if (unlikely(object))
2883
+ goto out;
2884
+
27182885 redo:
27192886 /*
27202887 * Must read kmem_cache cpu data via this cpu ptr. Preemption is
....@@ -2723,13 +2890,13 @@
27232890 * as we end up on the original cpu again when doing the cmpxchg.
27242891 *
27252892 * We should guarantee that tid and kmem_cache are retrieved on
2726
- * the same cpu. It could be different if CONFIG_PREEMPT so we need
2893
+ * the same cpu. It could be different if CONFIG_PREEMPTION so we need
27272894 * to check if it is matched or not.
27282895 */
27292896 do {
27302897 tid = this_cpu_read(s->cpu_slab->tid);
27312898 c = raw_cpu_ptr(s->cpu_slab);
2732
- } while (IS_ENABLED(CONFIG_PREEMPT) &&
2899
+ } while (IS_ENABLED(CONFIG_PREEMPTION) &&
27332900 unlikely(tid != READ_ONCE(c->tid)));
27342901
27352902 /*
....@@ -2751,9 +2918,8 @@
27512918
27522919 object = c->freelist;
27532920 page = c->page;
2754
- if (unlikely(!object || !node_match(page, node))) {
2921
+ if (unlikely(!object || !page || !node_match(page, node))) {
27552922 object = __slab_alloc(s, gfpflags, node, addr, c);
2756
- stat(s, ALLOC_SLOWPATH);
27572923 } else {
27582924 void *next_object = get_freepointer_safe(s, object);
27592925
....@@ -2784,24 +2950,23 @@
27842950 }
27852951
27862952 maybe_wipe_obj_freeptr(s, object);
2953
+ init = slab_want_init_on_alloc(gfpflags, s);
27872954
2788
- if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object)
2789
- memset(object, 0, s->object_size);
2790
-
2791
- slab_post_alloc_hook(s, gfpflags, 1, &object);
2955
+out:
2956
+ slab_post_alloc_hook(s, objcg, gfpflags, 1, &object, init);
27922957
27932958 return object;
27942959 }
27952960
27962961 static __always_inline void *slab_alloc(struct kmem_cache *s,
2797
- gfp_t gfpflags, unsigned long addr)
2962
+ gfp_t gfpflags, unsigned long addr, size_t orig_size)
27982963 {
2799
- return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr);
2964
+ return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr, orig_size);
28002965 }
28012966
28022967 void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
28032968 {
2804
- void *ret = slab_alloc(s, gfpflags, _RET_IP_);
2969
+ void *ret = slab_alloc(s, gfpflags, _RET_IP_, s->object_size);
28052970
28062971 trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size,
28072972 s->size, gfpflags);
....@@ -2813,7 +2978,7 @@
28132978 #ifdef CONFIG_TRACING
28142979 void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
28152980 {
2816
- void *ret = slab_alloc(s, gfpflags, _RET_IP_);
2981
+ void *ret = slab_alloc(s, gfpflags, _RET_IP_, size);
28172982 trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
28182983 ret = kasan_kmalloc(s, ret, size, gfpflags);
28192984 return ret;
....@@ -2824,7 +2989,7 @@
28242989 #ifdef CONFIG_NUMA
28252990 void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
28262991 {
2827
- void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_);
2992
+ void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_, s->object_size);
28282993
28292994 trace_kmem_cache_alloc_node(_RET_IP_, ret,
28302995 s->object_size, s->size, gfpflags, node);
....@@ -2838,7 +3003,7 @@
28383003 gfp_t gfpflags,
28393004 int node, size_t size)
28403005 {
2841
- void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_);
3006
+ void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_, size);
28423007
28433008 trace_kmalloc_node(_RET_IP_, ret,
28443009 size, s->size, gfpflags, node);
....@@ -2848,7 +3013,7 @@
28483013 }
28493014 EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
28503015 #endif
2851
-#endif
3016
+#endif /* CONFIG_NUMA */
28523017
28533018 /*
28543019 * Slow path handling. This may still be called frequently since objects
....@@ -2868,9 +3033,12 @@
28683033 struct page new;
28693034 unsigned long counters;
28703035 struct kmem_cache_node *n = NULL;
2871
- unsigned long uninitialized_var(flags);
3036
+ unsigned long flags;
28723037
28733038 stat(s, FREE_SLOWPATH);
3039
+
3040
+ if (kfence_free(head))
3041
+ return;
28743042
28753043 if (kmem_cache_debug(s) &&
28763044 !free_debug_processing(s, page, head, tail, cnt, addr))
....@@ -2922,20 +3090,21 @@
29223090
29233091 if (likely(!n)) {
29243092
2925
- /*
2926
- * If we just froze the page then put it onto the
2927
- * per cpu partial list.
2928
- */
2929
- if (new.frozen && !was_frozen) {
3093
+ if (likely(was_frozen)) {
3094
+ /*
3095
+ * The list lock was not taken therefore no list
3096
+ * activity can be necessary.
3097
+ */
3098
+ stat(s, FREE_FROZEN);
3099
+ } else if (new.frozen) {
3100
+ /*
3101
+ * If we just froze the page then put it onto the
3102
+ * per cpu partial list.
3103
+ */
29303104 put_cpu_partial(s, page, 1);
29313105 stat(s, CPU_PARTIAL_FREE);
29323106 }
2933
- /*
2934
- * The list lock was not taken therefore no list
2935
- * activity can be necessary.
2936
- */
2937
- if (was_frozen)
2938
- stat(s, FREE_FROZEN);
3107
+
29393108 return;
29403109 }
29413110
....@@ -2947,8 +3116,7 @@
29473116 * then add it.
29483117 */
29493118 if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {
2950
- if (kmem_cache_debug(s))
2951
- remove_full(s, n, page);
3119
+ remove_full(s, n, page);
29523120 add_partial(n, page, DEACTIVATE_TO_TAIL);
29533121 stat(s, FREE_ADD_PARTIAL);
29543122 }
....@@ -2994,6 +3162,10 @@
29943162 void *tail_obj = tail ? : head;
29953163 struct kmem_cache_cpu *c;
29963164 unsigned long tid;
3165
+
3166
+ /* memcg_slab_free_hook() is already called for bulk free. */
3167
+ if (!tail)
3168
+ memcg_slab_free_hook(s, &head, 1);
29973169 redo:
29983170 /*
29993171 * Determine the currently cpus per cpu slab.
....@@ -3004,7 +3176,7 @@
30043176 do {
30053177 tid = this_cpu_read(s->cpu_slab->tid);
30063178 c = raw_cpu_ptr(s->cpu_slab);
3007
- } while (IS_ENABLED(CONFIG_PREEMPT) &&
3179
+ } while (IS_ENABLED(CONFIG_PREEMPTION) &&
30083180 unlikely(tid != READ_ONCE(c->tid)));
30093181
30103182 /* Same with comment on barrier() in slab_alloc_node() */
....@@ -3114,6 +3286,13 @@
31143286 df->s = cache_from_obj(s, object); /* Support for memcg */
31153287 }
31163288
3289
+ if (is_kfence_address(object)) {
3290
+ slab_free_hook(df->s, object, false);
3291
+ __kfence_free(object);
3292
+ p[size] = NULL; /* mark object processed */
3293
+ return size;
3294
+ }
3295
+
31173296 /* Start new detached freelist */
31183297 df->page = page;
31193298 set_freepointer(df->s, object, NULL);
....@@ -3155,6 +3334,7 @@
31553334 if (WARN_ON(!size))
31563335 return;
31573336
3337
+ memcg_slab_free_hook(s, p, size);
31583338 do {
31593339 struct detached_freelist df;
31603340
....@@ -3173,9 +3353,10 @@
31733353 {
31743354 struct kmem_cache_cpu *c;
31753355 int i;
3356
+ struct obj_cgroup *objcg = NULL;
31763357
31773358 /* memcg and kmem_cache debug support */
3178
- s = slab_pre_alloc_hook(s, flags);
3359
+ s = slab_pre_alloc_hook(s, &objcg, size, flags);
31793360 if (unlikely(!s))
31803361 return false;
31813362 /*
....@@ -3187,8 +3368,14 @@
31873368 c = this_cpu_ptr(s->cpu_slab);
31883369
31893370 for (i = 0; i < size; i++) {
3190
- void *object = c->freelist;
3371
+ void *object = kfence_alloc(s, s->object_size, flags);
31913372
3373
+ if (unlikely(object)) {
3374
+ p[i] = object;
3375
+ continue;
3376
+ }
3377
+
3378
+ object = c->freelist;
31923379 if (unlikely(!object)) {
31933380 /*
31943381 * We may have removed an object from c->freelist using
....@@ -3220,20 +3407,16 @@
32203407 c->tid = next_tid(c->tid);
32213408 local_irq_enable();
32223409
3223
- /* Clear memory outside IRQ disabled fastpath loop */
3224
- if (unlikely(slab_want_init_on_alloc(flags, s))) {
3225
- int j;
3226
-
3227
- for (j = 0; j < i; j++)
3228
- memset(p[j], 0, s->object_size);
3229
- }
3230
-
3231
- /* memcg and kmem_cache debug support */
3232
- slab_post_alloc_hook(s, flags, size, p);
3410
+ /*
3411
+ * memcg and kmem_cache debug support and memory initialization.
3412
+ * Done outside of the IRQ disabled fastpath loop.
3413
+ */
3414
+ slab_post_alloc_hook(s, objcg, flags, size, p,
3415
+ slab_want_init_on_alloc(flags, s));
32333416 return i;
32343417 error:
32353418 local_irq_enable();
3236
- slab_post_alloc_hook(s, flags, i, p);
3419
+ slab_post_alloc_hook(s, objcg, flags, i, p, false);
32373420 __kmem_cache_free_bulk(s, i, p);
32383421 return 0;
32393422 }
....@@ -3429,8 +3612,7 @@
34293612 init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
34303613 init_tracking(kmem_cache_node, n);
34313614 #endif
3432
- n = kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node),
3433
- GFP_KERNEL);
3615
+ n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false);
34343616 page->freelist = get_freepointer(kmem_cache_node, n);
34353617 page->inuse = 1;
34363618 page->frozen = 0;
....@@ -3518,15 +3700,15 @@
35183700 * 50% to keep some capacity around for frees.
35193701 */
35203702 if (!kmem_cache_has_cpu_partial(s))
3521
- s->cpu_partial = 0;
3703
+ slub_set_cpu_partial(s, 0);
35223704 else if (s->size >= PAGE_SIZE)
3523
- s->cpu_partial = 2;
3705
+ slub_set_cpu_partial(s, 2);
35243706 else if (s->size >= 1024)
3525
- s->cpu_partial = 6;
3707
+ slub_set_cpu_partial(s, 6);
35263708 else if (s->size >= 256)
3527
- s->cpu_partial = 13;
3709
+ slub_set_cpu_partial(s, 13);
35283710 else
3529
- s->cpu_partial = 30;
3711
+ slub_set_cpu_partial(s, 30);
35303712 #endif
35313713 }
35323714
....@@ -3571,22 +3753,36 @@
35713753
35723754 /*
35733755 * With that we have determined the number of bytes in actual use
3574
- * by the object. This is the potential offset to the free pointer.
3756
+ * by the object and redzoning.
35753757 */
35763758 s->inuse = size;
35773759
3578
- if (((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) ||
3579
- s->ctor)) {
3760
+ if ((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) ||
3761
+ ((flags & SLAB_RED_ZONE) && s->object_size < sizeof(void *)) ||
3762
+ s->ctor) {
35803763 /*
35813764 * Relocate free pointer after the object if it is not
35823765 * permitted to overwrite the first word of the object on
35833766 * kmem_cache_free.
35843767 *
35853768 * This is the case if we do RCU, have a constructor or
3586
- * destructor or are poisoning the objects.
3769
+ * destructor, are poisoning the objects, or are
3770
+ * redzoning an object smaller than sizeof(void *).
3771
+ *
3772
+ * The assumption that s->offset >= s->inuse means free
3773
+ * pointer is outside of the object is used in the
3774
+ * freeptr_outside_object() function. If that is no
3775
+ * longer true, the function needs to be modified.
35873776 */
35883777 s->offset = size;
35893778 size += sizeof(void *);
3779
+ } else {
3780
+ /*
3781
+ * Store freelist pointer near middle of object to keep
3782
+ * it away from the edges of the object to avoid small
3783
+ * sized over/underflows from neighboring allocations.
3784
+ */
3785
+ s->offset = ALIGN_DOWN(s->object_size / 2, sizeof(void *));
35903786 }
35913787
35923788 #ifdef CONFIG_SLUB_DEBUG
....@@ -3623,6 +3819,7 @@
36233819 */
36243820 size = ALIGN(size, s->align);
36253821 s->size = size;
3822
+ s->reciprocal_size = reciprocal_value(size);
36263823 if (forced_order >= 0)
36273824 order = forced_order;
36283825 else
....@@ -3657,7 +3854,7 @@
36573854
36583855 static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
36593856 {
3660
- s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor);
3857
+ s->flags = kmem_cache_flags(s->size, flags, s->name);
36613858 #ifdef CONFIG_SLAB_FREELIST_HARDENED
36623859 s->random = get_random_long();
36633860 #endif
....@@ -3708,39 +3905,32 @@
37083905 if (alloc_kmem_cache_cpus(s))
37093906 return 0;
37103907
3711
- free_kmem_cache_nodes(s);
37123908 error:
3713
- if (flags & SLAB_PANIC)
3714
- panic("Cannot create slab %s size=%u realsize=%u order=%u offset=%u flags=%lx\n",
3715
- s->name, s->size, s->size,
3716
- oo_order(s->oo), s->offset, (unsigned long)flags);
3909
+ __kmem_cache_release(s);
37173910 return -EINVAL;
37183911 }
37193912
37203913 static void list_slab_objects(struct kmem_cache *s, struct page *page,
3721
- const char *text)
3914
+ const char *text)
37223915 {
37233916 #ifdef CONFIG_SLUB_DEBUG
37243917 void *addr = page_address(page);
3918
+ unsigned long *map;
37253919 void *p;
3726
- unsigned long *map = kcalloc(BITS_TO_LONGS(page->objects),
3727
- sizeof(long),
3728
- GFP_ATOMIC);
3729
- if (!map)
3730
- return;
3920
+
37313921 slab_err(s, page, text, s->name);
37323922 slab_lock(page);
37333923
3734
- get_map(s, page, map);
3924
+ map = get_map(s, page);
37353925 for_each_object(p, s, addr, page->objects) {
37363926
3737
- if (!test_bit(slab_index(p, s, addr), map)) {
3927
+ if (!test_bit(__obj_to_index(s, addr, p), map)) {
37383928 pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - addr);
37393929 print_tracking(s, p);
37403930 }
37413931 }
3932
+ put_map(map);
37423933 slab_unlock(page);
3743
- kfree(map);
37443934 #endif
37453935 }
37463936
....@@ -3756,18 +3946,18 @@
37563946
37573947 BUG_ON(irqs_disabled());
37583948 spin_lock_irq(&n->list_lock);
3759
- list_for_each_entry_safe(page, h, &n->partial, lru) {
3949
+ list_for_each_entry_safe(page, h, &n->partial, slab_list) {
37603950 if (!page->inuse) {
37613951 remove_partial(n, page);
3762
- list_add(&page->lru, &discard);
3952
+ list_add(&page->slab_list, &discard);
37633953 } else {
37643954 list_slab_objects(s, page,
3765
- "Objects remaining in %s on __kmem_cache_shutdown()");
3955
+ "Objects remaining in %s on __kmem_cache_shutdown()");
37663956 }
37673957 }
37683958 spin_unlock_irq(&n->list_lock);
37693959
3770
- list_for_each_entry_safe(page, h, &discard, lru)
3960
+ list_for_each_entry_safe(page, h, &discard, slab_list)
37713961 discard_slab(s, page);
37723962 }
37733963
....@@ -3797,7 +3987,6 @@
37973987 if (n->nr_partial || slabs_node(s, node))
37983988 return 1;
37993989 }
3800
- sysfs_slab_remove(s);
38013990 return 0;
38023991 }
38033992
....@@ -3846,7 +4035,7 @@
38464035 if (unlikely(ZERO_OR_NULL_PTR(s)))
38474036 return s;
38484037
3849
- ret = slab_alloc(s, flags, _RET_IP_);
4038
+ ret = slab_alloc(s, flags, _RET_IP_, size);
38504039
38514040 trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
38524041
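slab_alloc() here (and slab_alloc_node() below) now carries the original request size down from the kmalloc wrappers. Together with the new <linux/kfence.h> include, the most plausible consumer is a KFENCE hook at the top of the allocation path, which needs the caller's size rather than s->size. kfence_alloc() is the real gate declared in include/linux/kfence.h, but the body below is only an abbreviated sketch of the pattern, not this tree's actual slab_alloc_node():

static __always_inline void *slab_alloc_node(struct kmem_cache *s, gfp_t gfpflags,
					     int node, unsigned long addr, size_t orig_size)
{
	void *object;

	/* Sampled allocations may be served from the KFENCE pool instead. */
	object = kfence_alloc(s, orig_size, gfpflags);
	if (unlikely(object))
		return object;

	/* ... normal cpu-freelist fast path and slow path elided in this sketch ... */
	return NULL;
}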
....@@ -3861,11 +4050,15 @@
38614050 {
38624051 struct page *page;
38634052 void *ptr = NULL;
4053
+ unsigned int order = get_order(size);
38644054
38654055 flags |= __GFP_COMP;
3866
- page = alloc_pages_node(node, flags, get_order(size));
3867
- if (page)
4056
+ page = alloc_pages_node(node, flags, order);
4057
+ if (page) {
38684058 ptr = page_address(page);
4059
+ mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B,
4060
+ PAGE_SIZE << order);
4061
+ }
38694062
38704063 return kmalloc_large_node_hook(ptr, size, flags);
38714064 }
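Large kmalloc requests bypass the slab caches entirely, so the new mod_lruvec_page_state() call charges the whole compound page (PAGE_SIZE << order bytes) to NR_SLAB_UNRECLAIMABLE_B at allocation time; kfree() further down subtracts the same amount. A standalone sketch of the order/byte arithmetic, with a simplified local get_order() stand-in and 4 KiB pages assumed:

#include <stdio.h>

#define PAGE_SHIFT	12			/* assumption: 4 KiB pages */
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

/* Simplified stand-in for the kernel's get_order(). */
static unsigned int get_order(unsigned long size)
{
	unsigned int order = 0;

	while ((PAGE_SIZE << order) < size)
		order++;
	return order;
}

int main(void)
{
	unsigned long size = 20000;		/* hypothetical large kmalloc request */
	unsigned int order = get_order(size);

	printf("request=%lu bytes -> order=%u -> %lu bytes accounted\n",
	       size, order, PAGE_SIZE << order);
	return 0;
}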
....@@ -3890,7 +4083,7 @@
38904083 if (unlikely(ZERO_OR_NULL_PTR(s)))
38914084 return s;
38924085
3893
- ret = slab_alloc_node(s, flags, node, _RET_IP_);
4086
+ ret = slab_alloc_node(s, flags, node, _RET_IP_, size);
38944087
38954088 trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node);
38964089
....@@ -3899,7 +4092,7 @@
38994092 return ret;
39004093 }
39014094 EXPORT_SYMBOL(__kmalloc_node);
3902
-#endif
4095
+#endif /* CONFIG_NUMA */
39034096
39044097 #ifdef CONFIG_HARDENED_USERCOPY
39054098 /*
....@@ -3916,6 +4109,7 @@
39164109 struct kmem_cache *s;
39174110 unsigned int offset;
39184111 size_t object_size;
4112
+ bool is_kfence = is_kfence_address(ptr);
39194113
39204114 ptr = kasan_reset_tag(ptr);
39214115
....@@ -3928,10 +4122,13 @@
39284122 to_user, 0, n);
39294123
39304124 /* Find offset within object. */
3931
- offset = (ptr - page_address(page)) % s->size;
4125
+ if (is_kfence)
4126
+ offset = ptr - kfence_object_start(ptr);
4127
+ else
4128
+ offset = (ptr - page_address(page)) % s->size;
39324129
39334130 /* Adjust for redzone and reject if within the redzone. */
3934
- if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) {
4131
+ if (!is_kfence && kmem_cache_debug_flags(s, SLAB_RED_ZONE)) {
39354132 if (offset < s->red_left_pad)
39364133 usercopy_abort("SLUB object in left red zone",
39374134 s->name, to_user, offset, n);
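For a KFENCE-backed object the usual stride arithmetic is meaningless (each object sits in its own page-sized slot), so the offset is taken from kfence_object_start() and the SLUB red-zone adjustment is skipped. A standalone model of the regular, non-KFENCE check with invented layout numbers:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	unsigned int red_left_pad = 64;		/* hypothetical s->red_left_pad */
	unsigned int s_size = 256;		/* hypothetical s->size (object stride) */
	uintptr_t page_addr = 0x10000;		/* stand-in for page_address(page) */
	uintptr_t ptr = page_addr + 3 * s_size + 16;	/* address handed to usercopy */

	unsigned int offset = (ptr - page_addr) % s_size;

	if (offset < red_left_pad)
		printf("offset %u falls in the left red zone -> usercopy_abort()\n", offset);
	else
		printf("offset %u -> byte %u of the object proper\n",
		       offset, offset - red_left_pad);
	return 0;
}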
....@@ -3961,7 +4158,7 @@
39614158 }
39624159 #endif /* CONFIG_HARDENED_USERCOPY */
39634160
3964
-static size_t __ksize(const void *object)
4161
+size_t __ksize(const void *object)
39654162 {
39664163 struct page *page;
39674164
....@@ -3972,22 +4169,12 @@
39724169
39734170 if (unlikely(!PageSlab(page))) {
39744171 WARN_ON(!PageCompound(page));
3975
- return PAGE_SIZE << compound_order(page);
4172
+ return page_size(page);
39764173 }
39774174
39784175 return slab_ksize(page->slab_cache);
39794176 }
3980
-
3981
-size_t ksize(const void *object)
3982
-{
3983
- size_t size = __ksize(object);
3984
- /* We assume that ksize callers could use whole allocated area,
3985
- * so we need to unpoison this area.
3986
- */
3987
- kasan_unpoison_shadow(object, size);
3988
- return size;
3989
-}
3990
-EXPORT_SYMBOL(ksize);
4177
+EXPORT_SYMBOL(__ksize);
39914178
39924179 void kfree(const void *x)
39934180 {
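With __ksize() exported, the ksize() wrapper deleted above presumably moves into the common slab code so every allocator shares one definition. A hedged sketch of that wrapper, modeled directly on the removed lines; its exact location and any extra sanity checks in this tree are assumptions:

size_t ksize(const void *object)
{
	size_t size = __ksize(object);

	/*
	 * Callers may legitimately use the whole allocated area,
	 * so unpoison it for KASAN before reporting the size.
	 */
	kasan_unpoison_shadow(object, size);
	return size;
}
EXPORT_SYMBOL(ksize);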
....@@ -4001,9 +4188,13 @@
40014188
40024189 page = virt_to_head_page(x);
40034190 if (unlikely(!PageSlab(page))) {
4191
+ unsigned int order = compound_order(page);
4192
+
40044193 BUG_ON(!PageCompound(page));
40054194 kfree_hook(object);
4006
- __free_pages(page, compound_order(page));
4195
+ mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B,
4196
+ -(PAGE_SIZE << order));
4197
+ __free_pages(page, order);
40074198 return;
40084199 }
40094200 slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_);
....@@ -4047,7 +4238,7 @@
40474238 * Note that concurrent frees may occur while we hold the
40484239 * list_lock. page->inuse here is the upper limit.
40494240 */
4050
- list_for_each_entry_safe(page, t, &n->partial, lru) {
4241
+ list_for_each_entry_safe(page, t, &n->partial, slab_list) {
40514242 int free = page->objects - page->inuse;
40524243
40534244 /* Do not reread page->inuse */
....@@ -4057,10 +4248,10 @@
40574248 BUG_ON(free <= 0);
40584249
40594250 if (free == page->objects) {
4060
- list_move(&page->lru, &discard);
4251
+ list_move(&page->slab_list, &discard);
40614252 n->nr_partial--;
40624253 } else if (free <= SHRINK_PROMOTE_MAX)
4063
- list_move(&page->lru, promote + free - 1);
4254
+ list_move(&page->slab_list, promote + free - 1);
40644255 }
40654256
40664257 /*
....@@ -4073,7 +4264,7 @@
40734264 spin_unlock_irqrestore(&n->list_lock, flags);
40744265
40754266 /* Release empty slabs */
4076
- list_for_each_entry_safe(page, t, &discard, lru)
4267
+ list_for_each_entry_safe(page, t, &discard, slab_list)
40774268 discard_slab(s, page);
40784269
40794270 if (slabs_node(s, node))
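The shrink path above sorts each partial slab by its number of free objects: fully free slabs are discarded, slabs with up to SHRINK_PROMOTE_MAX free objects are promoted toward the head of the partial list so they fill up (and empty out) first, and the rest keep their position. A standalone model of that bucketing decision; SHRINK_PROMOTE_MAX is 32 in mainline SLUB and is treated as an assumption here:

#include <stdio.h>

#define SHRINK_PROMOTE_MAX	32	/* assumption: matches mainline SLUB */

int main(void)
{
	unsigned int objects = 64;			/* objects per slab (made up) */
	unsigned int inuse[] = { 63, 40, 20, 0 };	/* hypothetical partial slabs */

	for (unsigned int i = 0; i < sizeof(inuse) / sizeof(inuse[0]); i++) {
		unsigned int free = objects - inuse[i];

		if (free == objects)
			printf("slab %u: fully free -> move to discard list\n", i);
		else if (free <= SHRINK_PROMOTE_MAX)
			printf("slab %u: %2u free -> promote bucket %u\n",
			       i, free, free - 1);
		else
			printf("slab %u: %2u free -> keep list position\n", i, free);
	}
	return 0;
}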
....@@ -4082,42 +4273,6 @@
40824273
40834274 return ret;
40844275 }
4085
-
4086
-#ifdef CONFIG_MEMCG
4087
-static void kmemcg_cache_deact_after_rcu(struct kmem_cache *s)
4088
-{
4089
- /*
4090
- * Called with all the locks held after a sched RCU grace period.
4091
- * Even if @s becomes empty after shrinking, we can't know that @s
4092
- * doesn't have allocations already in-flight and thus can't
4093
- * destroy @s until the associated memcg is released.
4094
- *
4095
- * However, let's remove the sysfs files for empty caches here.
4096
- * Each cache has a lot of interface files which aren't
4097
- * particularly useful for empty draining caches; otherwise, we can
4098
- * easily end up with millions of unnecessary sysfs files on
4099
- * systems which have a lot of memory and transient cgroups.
4100
- */
4101
- if (!__kmem_cache_shrink(s))
4102
- sysfs_slab_remove(s);
4103
-}
4104
-
4105
-void __kmemcg_cache_deactivate(struct kmem_cache *s)
4106
-{
4107
- /*
4108
- * Disable empty slabs caching. Used to avoid pinning offline
4109
- * memory cgroups by kmem pages that can be freed.
4110
- */
4111
- slub_set_cpu_partial(s, 0);
4112
- s->min_partial = 0;
4113
-
4114
- /*
4115
- * s->cpu_partial is checked locklessly (see put_cpu_partial), so
4116
- * we have to make sure the change is visible before shrinking.
4117
- */
4118
- slab_deactivate_memcg_cache_rcu_sched(s, kmemcg_cache_deact_after_rcu);
4119
-}
4120
-#endif
41214276
41224277 static int slab_mem_going_offline_callback(void *arg)
41234278 {
....@@ -4265,17 +4420,15 @@
42654420 for_each_kmem_cache_node(s, node, n) {
42664421 struct page *p;
42674422
4268
- list_for_each_entry(p, &n->partial, lru)
4423
+ list_for_each_entry(p, &n->partial, slab_list)
42694424 p->slab_cache = s;
42704425
42714426 #ifdef CONFIG_SLUB_DEBUG
4272
- list_for_each_entry(p, &n->full, lru)
4427
+ list_for_each_entry(p, &n->full, slab_list)
42734428 p->slab_cache = s;
42744429 #endif
42754430 }
4276
- slab_init_memcg_params(s);
42774431 list_add(&s->list, &slab_caches);
4278
- memcg_link_cache(s);
42794432 return s;
42804433 }
42814434
....@@ -4316,7 +4469,7 @@
43164469 cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL,
43174470 slub_cpu_dead);
43184471
4319
- pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%d\n",
4472
+ pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%u\n",
43204473 cache_line_size(),
43214474 slub_min_order, slub_max_order, slub_min_objects,
43224475 nr_cpu_ids, nr_node_ids);
....@@ -4330,7 +4483,7 @@
43304483 __kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
43314484 slab_flags_t flags, void (*ctor)(void *))
43324485 {
4333
- struct kmem_cache *s, *c;
4486
+ struct kmem_cache *s;
43344487
43354488 s = find_mergeable(size, align, flags, name, ctor);
43364489 if (s) {
....@@ -4342,11 +4495,6 @@
43424495 */
43434496 s->object_size = max(s->object_size, size);
43444497 s->inuse = max(s->inuse, ALIGN(size, sizeof(void *)));
4345
-
4346
- for_each_memcg_cache(c, s) {
4347
- c->object_size = s->object_size;
4348
- c->inuse = max(c->inuse, ALIGN(size, sizeof(void *)));
4349
- }
43504498
43514499 if (sysfs_slab_alias(s, name)) {
43524500 s->refcount--;
....@@ -4369,12 +4517,16 @@
43694517 if (slab_state <= UP)
43704518 return 0;
43714519
4372
- memcg_propagate_slab_attrs(s);
43734520 err = sysfs_slab_add(s);
4374
- if (err)
4521
+ if (err) {
43754522 __kmem_cache_release(s);
4523
+ return err;
4524
+ }
43764525
4377
- return err;
4526
+ if (s->flags & SLAB_STORE_USER)
4527
+ debugfs_slab_add(s);
4528
+
4529
+ return 0;
43784530 }
43794531
43804532 void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
....@@ -4390,7 +4542,7 @@
43904542 if (unlikely(ZERO_OR_NULL_PTR(s)))
43914543 return s;
43924544
4393
- ret = slab_alloc(s, gfpflags, caller);
4545
+ ret = slab_alloc(s, gfpflags, caller, size);
43944546
43954547 /* Honor the call site pointer we received. */
43964548 trace_kmalloc(caller, ret, size, s->size, gfpflags);
....@@ -4421,7 +4573,7 @@
44214573 if (unlikely(ZERO_OR_NULL_PTR(s)))
44224574 return s;
44234575
4424
- ret = slab_alloc_node(s, gfpflags, node, caller);
4576
+ ret = slab_alloc_node(s, gfpflags, node, caller, size);
44254577
44264578 /* Honor the call site pointer we received. */
44274579 trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node);
....@@ -4444,43 +4596,33 @@
44444596 #endif
44454597
44464598 #ifdef CONFIG_SLUB_DEBUG
4447
-static int validate_slab(struct kmem_cache *s, struct page *page,
4448
- unsigned long *map)
4599
+static void validate_slab(struct kmem_cache *s, struct page *page)
44494600 {
44504601 void *p;
44514602 void *addr = page_address(page);
4603
+ unsigned long *map;
44524604
4453
- if (!check_slab(s, page) ||
4454
- !on_freelist(s, page, NULL))
4455
- return 0;
4605
+ slab_lock(page);
4606
+
4607
+ if (!check_slab(s, page) || !on_freelist(s, page, NULL))
4608
+ goto unlock;
44564609
44574610 /* Now we know that a valid freelist exists */
4458
- bitmap_zero(map, page->objects);
4459
-
4460
- get_map(s, page, map);
4611
+ map = get_map(s, page);
44614612 for_each_object(p, s, addr, page->objects) {
4462
- if (test_bit(slab_index(p, s, addr), map))
4463
- if (!check_object(s, page, p, SLUB_RED_INACTIVE))
4464
- return 0;
4613
+ u8 val = test_bit(__obj_to_index(s, addr, p), map) ?
4614
+ SLUB_RED_INACTIVE : SLUB_RED_ACTIVE;
4615
+
4616
+ if (!check_object(s, page, p, val))
4617
+ break;
44654618 }
4466
-
4467
- for_each_object(p, s, addr, page->objects)
4468
- if (!test_bit(slab_index(p, s, addr), map))
4469
- if (!check_object(s, page, p, SLUB_RED_ACTIVE))
4470
- return 0;
4471
- return 1;
4472
-}
4473
-
4474
-static void validate_slab_slab(struct kmem_cache *s, struct page *page,
4475
- unsigned long *map)
4476
-{
4477
- slab_lock(page);
4478
- validate_slab(s, page, map);
4619
+ put_map(map);
4620
+unlock:
44794621 slab_unlock(page);
44804622 }
44814623
44824624 static int validate_slab_node(struct kmem_cache *s,
4483
- struct kmem_cache_node *n, unsigned long *map)
4625
+ struct kmem_cache_node *n)
44844626 {
44854627 unsigned long count = 0;
44864628 struct page *page;
....@@ -4488,8 +4630,8 @@
44884630
44894631 spin_lock_irqsave(&n->list_lock, flags);
44904632
4491
- list_for_each_entry(page, &n->partial, lru) {
4492
- validate_slab_slab(s, page, map);
4633
+ list_for_each_entry(page, &n->partial, slab_list) {
4634
+ validate_slab(s, page);
44934635 count++;
44944636 }
44954637 if (count != n->nr_partial)
....@@ -4499,8 +4641,8 @@
44994641 if (!(s->flags & SLAB_STORE_USER))
45004642 goto out;
45014643
4502
- list_for_each_entry(page, &n->full, lru) {
4503
- validate_slab_slab(s, page, map);
4644
+ list_for_each_entry(page, &n->full, slab_list) {
4645
+ validate_slab(s, page);
45044646 count++;
45054647 }
45064648 if (count != atomic_long_read(&n->nr_slabs))
....@@ -4516,20 +4658,16 @@
45164658 {
45174659 int node;
45184660 unsigned long count = 0;
4519
- unsigned long *map = kmalloc_array(BITS_TO_LONGS(oo_objects(s->max)),
4520
- sizeof(unsigned long),
4521
- GFP_KERNEL);
45224661 struct kmem_cache_node *n;
4523
-
4524
- if (!map)
4525
- return -ENOMEM;
45264662
45274663 flush_all(s);
45284664 for_each_kmem_cache_node(s, node, n)
4529
- count += validate_slab_node(s, n, map);
4530
- kfree(map);
4665
+ count += validate_slab_node(s, n);
4666
+
45314667 return count;
45324668 }
4669
+
4670
+#ifdef CONFIG_DEBUG_FS
45334671 /*
45344672 * Generate lists of code addresses where slabcache objects are allocated
45354673 * and freed.
....@@ -4551,7 +4689,10 @@
45514689 unsigned long max;
45524690 unsigned long count;
45534691 struct location *loc;
4692
+ loff_t idx;
45544693 };
4694
+
4695
+static struct dentry *slab_debugfs_root;
45554696
45564697 static void free_loc_track(struct loc_track *t)
45574698 {
....@@ -4658,105 +4799,19 @@
46584799
46594800 static void process_slab(struct loc_track *t, struct kmem_cache *s,
46604801 struct page *page, enum track_item alloc,
4661
- unsigned long *map)
4802
+ unsigned long *obj_map)
46624803 {
46634804 void *addr = page_address(page);
46644805 void *p;
46654806
4666
- bitmap_zero(map, page->objects);
4667
- get_map(s, page, map);
4807
+ __fill_map(obj_map, s, page);
46684808
46694809 for_each_object(p, s, addr, page->objects)
4670
- if (!test_bit(slab_index(p, s, addr), map))
4810
+ if (!test_bit(__obj_to_index(s, addr, p), obj_map))
46714811 add_location(t, s, get_track(s, p, alloc));
46724812 }
4673
-
4674
-static int list_locations(struct kmem_cache *s, char *buf,
4675
- enum track_item alloc)
4676
-{
4677
- int len = 0;
4678
- unsigned long i;
4679
- struct loc_track t = { 0, 0, NULL };
4680
- int node;
4681
- unsigned long *map = kmalloc_array(BITS_TO_LONGS(oo_objects(s->max)),
4682
- sizeof(unsigned long),
4683
- GFP_KERNEL);
4684
- struct kmem_cache_node *n;
4685
-
4686
- if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
4687
- GFP_KERNEL)) {
4688
- kfree(map);
4689
- return sprintf(buf, "Out of memory\n");
4690
- }
4691
- /* Push back cpu slabs */
4692
- flush_all(s);
4693
-
4694
- for_each_kmem_cache_node(s, node, n) {
4695
- unsigned long flags;
4696
- struct page *page;
4697
-
4698
- if (!atomic_long_read(&n->nr_slabs))
4699
- continue;
4700
-
4701
- spin_lock_irqsave(&n->list_lock, flags);
4702
- list_for_each_entry(page, &n->partial, lru)
4703
- process_slab(&t, s, page, alloc, map);
4704
- list_for_each_entry(page, &n->full, lru)
4705
- process_slab(&t, s, page, alloc, map);
4706
- spin_unlock_irqrestore(&n->list_lock, flags);
4707
- }
4708
-
4709
- for (i = 0; i < t.count; i++) {
4710
- struct location *l = &t.loc[i];
4711
-
4712
- if (len > PAGE_SIZE - KSYM_SYMBOL_LEN - 100)
4713
- break;
4714
- len += sprintf(buf + len, "%7ld ", l->count);
4715
-
4716
- if (l->addr)
4717
- len += sprintf(buf + len, "%pS", (void *)l->addr);
4718
- else
4719
- len += sprintf(buf + len, "<not-available>");
4720
-
4721
- if (l->sum_time != l->min_time) {
4722
- len += sprintf(buf + len, " age=%ld/%ld/%ld",
4723
- l->min_time,
4724
- (long)div_u64(l->sum_time, l->count),
4725
- l->max_time);
4726
- } else
4727
- len += sprintf(buf + len, " age=%ld",
4728
- l->min_time);
4729
-
4730
- if (l->min_pid != l->max_pid)
4731
- len += sprintf(buf + len, " pid=%ld-%ld",
4732
- l->min_pid, l->max_pid);
4733
- else
4734
- len += sprintf(buf + len, " pid=%ld",
4735
- l->min_pid);
4736
-
4737
- if (num_online_cpus() > 1 &&
4738
- !cpumask_empty(to_cpumask(l->cpus)) &&
4739
- len < PAGE_SIZE - 60)
4740
- len += scnprintf(buf + len, PAGE_SIZE - len - 50,
4741
- " cpus=%*pbl",
4742
- cpumask_pr_args(to_cpumask(l->cpus)));
4743
-
4744
- if (nr_online_nodes > 1 && !nodes_empty(l->nodes) &&
4745
- len < PAGE_SIZE - 60)
4746
- len += scnprintf(buf + len, PAGE_SIZE - len - 50,
4747
- " nodes=%*pbl",
4748
- nodemask_pr_args(&l->nodes));
4749
-
4750
- len += sprintf(buf + len, "\n");
4751
- }
4752
-
4753
- free_loc_track(&t);
4754
- kfree(map);
4755
- if (!t.count)
4756
- len += sprintf(buf, "No data\n");
4757
- return len;
4758
-}
4759
-#endif
4813
+#endif /* CONFIG_DEBUG_FS */
4814
+#endif /* CONFIG_SLUB_DEBUG */
47604815
47614816 #ifdef SLUB_RESILIENCY_TEST
47624817 static void __init resiliency_test(void)
....@@ -4816,7 +4871,7 @@
48164871 #ifdef CONFIG_SLUB_SYSFS
48174872 static void resiliency_test(void) {};
48184873 #endif
4819
-#endif
4874
+#endif /* SLUB_RESILIENCY_TEST */
48204875
48214876 #ifdef CONFIG_SLUB_SYSFS
48224877 enum slab_stat_type {
....@@ -4955,20 +5010,6 @@
49555010 return x + sprintf(buf + x, "\n");
49565011 }
49575012
4958
-#ifdef CONFIG_SLUB_DEBUG
4959
-static int any_slab_objects(struct kmem_cache *s)
4960
-{
4961
- int node;
4962
- struct kmem_cache_node *n;
4963
-
4964
- for_each_kmem_cache_node(s, node, n)
4965
- if (atomic_long_read(&n->total_objects))
4966
- return 1;
4967
-
4968
- return 0;
4969
-}
4970
-#endif
4971
-
49725013 #define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
49735014 #define to_slab(n) container_of(n, struct kmem_cache, kobj)
49745015
....@@ -5010,28 +5051,11 @@
50105051 }
50115052 SLAB_ATTR_RO(objs_per_slab);
50125053
5013
-static ssize_t order_store(struct kmem_cache *s,
5014
- const char *buf, size_t length)
5015
-{
5016
- unsigned int order;
5017
- int err;
5018
-
5019
- err = kstrtouint(buf, 10, &order);
5020
- if (err)
5021
- return err;
5022
-
5023
- if (order > slub_max_order || order < slub_min_order)
5024
- return -EINVAL;
5025
-
5026
- calculate_sizes(s, order);
5027
- return length;
5028
-}
5029
-
50305054 static ssize_t order_show(struct kmem_cache *s, char *buf)
50315055 {
50325056 return sprintf(buf, "%u\n", oo_order(s->oo));
50335057 }
5034
-SLAB_ATTR(order);
5058
+SLAB_ATTR_RO(order);
50355059
50365060 static ssize_t min_partial_show(struct kmem_cache *s, char *buf)
50375061 {
....@@ -5153,16 +5177,7 @@
51535177 {
51545178 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
51555179 }
5156
-
5157
-static ssize_t reclaim_account_store(struct kmem_cache *s,
5158
- const char *buf, size_t length)
5159
-{
5160
- s->flags &= ~SLAB_RECLAIM_ACCOUNT;
5161
- if (buf[0] == '1')
5162
- s->flags |= SLAB_RECLAIM_ACCOUNT;
5163
- return length;
5164
-}
5165
-SLAB_ATTR(reclaim_account);
5180
+SLAB_ATTR_RO(reclaim_account);
51665181
51675182 static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
51685183 {
....@@ -5207,104 +5222,34 @@
52075222 {
52085223 return sprintf(buf, "%d\n", !!(s->flags & SLAB_CONSISTENCY_CHECKS));
52095224 }
5210
-
5211
-static ssize_t sanity_checks_store(struct kmem_cache *s,
5212
- const char *buf, size_t length)
5213
-{
5214
- s->flags &= ~SLAB_CONSISTENCY_CHECKS;
5215
- if (buf[0] == '1') {
5216
- s->flags &= ~__CMPXCHG_DOUBLE;
5217
- s->flags |= SLAB_CONSISTENCY_CHECKS;
5218
- }
5219
- return length;
5220
-}
5221
-SLAB_ATTR(sanity_checks);
5225
+SLAB_ATTR_RO(sanity_checks);
52225226
52235227 static ssize_t trace_show(struct kmem_cache *s, char *buf)
52245228 {
52255229 return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE));
52265230 }
5227
-
5228
-static ssize_t trace_store(struct kmem_cache *s, const char *buf,
5229
- size_t length)
5230
-{
5231
- /*
5232
- * Tracing a merged cache is going to give confusing results
5233
- * as well as cause other issues like converting a mergeable
5234
- * cache into an umergeable one.
5235
- */
5236
- if (s->refcount > 1)
5237
- return -EINVAL;
5238
-
5239
- s->flags &= ~SLAB_TRACE;
5240
- if (buf[0] == '1') {
5241
- s->flags &= ~__CMPXCHG_DOUBLE;
5242
- s->flags |= SLAB_TRACE;
5243
- }
5244
- return length;
5245
-}
5246
-SLAB_ATTR(trace);
5231
+SLAB_ATTR_RO(trace);
52475232
52485233 static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
52495234 {
52505235 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
52515236 }
52525237
5253
-static ssize_t red_zone_store(struct kmem_cache *s,
5254
- const char *buf, size_t length)
5255
-{
5256
- if (any_slab_objects(s))
5257
- return -EBUSY;
5258
-
5259
- s->flags &= ~SLAB_RED_ZONE;
5260
- if (buf[0] == '1') {
5261
- s->flags |= SLAB_RED_ZONE;
5262
- }
5263
- calculate_sizes(s, -1);
5264
- return length;
5265
-}
5266
-SLAB_ATTR(red_zone);
5238
+SLAB_ATTR_RO(red_zone);
52675239
52685240 static ssize_t poison_show(struct kmem_cache *s, char *buf)
52695241 {
52705242 return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON));
52715243 }
52725244
5273
-static ssize_t poison_store(struct kmem_cache *s,
5274
- const char *buf, size_t length)
5275
-{
5276
- if (any_slab_objects(s))
5277
- return -EBUSY;
5278
-
5279
- s->flags &= ~SLAB_POISON;
5280
- if (buf[0] == '1') {
5281
- s->flags |= SLAB_POISON;
5282
- }
5283
- calculate_sizes(s, -1);
5284
- return length;
5285
-}
5286
-SLAB_ATTR(poison);
5245
+SLAB_ATTR_RO(poison);
52875246
52885247 static ssize_t store_user_show(struct kmem_cache *s, char *buf)
52895248 {
52905249 return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
52915250 }
52925251
5293
-static ssize_t store_user_store(struct kmem_cache *s,
5294
- const char *buf, size_t length)
5295
-{
5296
- if (any_slab_objects(s))
5297
- return -EBUSY;
5298
-
5299
- s->flags &= ~SLAB_STORE_USER;
5300
- if (buf[0] == '1') {
5301
- s->flags &= ~__CMPXCHG_DOUBLE;
5302
- s->flags |= SLAB_STORE_USER;
5303
- }
5304
- calculate_sizes(s, -1);
5305
- return length;
5306
-}
5307
-SLAB_ATTR(store_user);
5252
+SLAB_ATTR_RO(store_user);
53085253
53095254 static ssize_t validate_show(struct kmem_cache *s, char *buf)
53105255 {
....@@ -5325,21 +5270,6 @@
53255270 }
53265271 SLAB_ATTR(validate);
53275272
5328
-static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf)
5329
-{
5330
- if (!(s->flags & SLAB_STORE_USER))
5331
- return -ENOSYS;
5332
- return list_locations(s, buf, TRACK_ALLOC);
5333
-}
5334
-SLAB_ATTR_RO(alloc_calls);
5335
-
5336
-static ssize_t free_calls_show(struct kmem_cache *s, char *buf)
5337
-{
5338
- if (!(s->flags & SLAB_STORE_USER))
5339
- return -ENOSYS;
5340
- return list_locations(s, buf, TRACK_FREE);
5341
-}
5342
-SLAB_ATTR_RO(free_calls);
53435273 #endif /* CONFIG_SLUB_DEBUG */
53445274
53455275 #ifdef CONFIG_FAILSLAB
....@@ -5347,19 +5277,7 @@
53475277 {
53485278 return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
53495279 }
5350
-
5351
-static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
5352
- size_t length)
5353
-{
5354
- if (s->refcount > 1)
5355
- return -EINVAL;
5356
-
5357
- s->flags &= ~SLAB_FAILSLAB;
5358
- if (buf[0] == '1')
5359
- s->flags |= SLAB_FAILSLAB;
5360
- return length;
5361
-}
5362
-SLAB_ATTR(failslab);
5280
+SLAB_ATTR_RO(failslab);
53635281 #endif
53645282
53655283 static ssize_t shrink_show(struct kmem_cache *s, char *buf)
....@@ -5482,7 +5400,7 @@
54825400 STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free);
54835401 STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node);
54845402 STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain);
5485
-#endif
5403
+#endif /* CONFIG_SLUB_STATS */
54865404
54875405 static struct attribute *slab_attrs[] = {
54885406 &slab_size_attr.attr,
....@@ -5512,8 +5430,6 @@
55125430 &poison_attr.attr,
55135431 &store_user_attr.attr,
55145432 &validate_attr.attr,
5515
- &alloc_calls_attr.attr,
5516
- &free_calls_attr.attr,
55175433 #endif
55185434 #ifdef CONFIG_ZONE_DMA
55195435 &cache_dma_attr.attr,
....@@ -5595,96 +5511,7 @@
55955511 return -EIO;
55965512
55975513 err = attribute->store(s, buf, len);
5598
-#ifdef CONFIG_MEMCG
5599
- if (slab_state >= FULL && err >= 0 && is_root_cache(s)) {
5600
- struct kmem_cache *c;
5601
-
5602
- mutex_lock(&slab_mutex);
5603
- if (s->max_attr_size < len)
5604
- s->max_attr_size = len;
5605
-
5606
- /*
5607
- * This is a best effort propagation, so this function's return
5608
- * value will be determined by the parent cache only. This is
5609
- * basically because not all attributes will have a well
5610
- * defined semantics for rollbacks - most of the actions will
5611
- * have permanent effects.
5612
- *
5613
- * Returning the error value of any of the children that fail
5614
- * is not 100 % defined, in the sense that users seeing the
5615
- * error code won't be able to know anything about the state of
5616
- * the cache.
5617
- *
5618
- * Only returning the error code for the parent cache at least
5619
- * has well defined semantics. The cache being written to
5620
- * directly either failed or succeeded, in which case we loop
5621
- * through the descendants with best-effort propagation.
5622
- */
5623
- for_each_memcg_cache(c, s)
5624
- attribute->store(c, buf, len);
5625
- mutex_unlock(&slab_mutex);
5626
- }
5627
-#endif
56285514 return err;
5629
-}
5630
-
5631
-static void memcg_propagate_slab_attrs(struct kmem_cache *s)
5632
-{
5633
-#ifdef CONFIG_MEMCG
5634
- int i;
5635
- char *buffer = NULL;
5636
- struct kmem_cache *root_cache;
5637
-
5638
- if (is_root_cache(s))
5639
- return;
5640
-
5641
- root_cache = s->memcg_params.root_cache;
5642
-
5643
- /*
5644
- * This mean this cache had no attribute written. Therefore, no point
5645
- * in copying default values around
5646
- */
5647
- if (!root_cache->max_attr_size)
5648
- return;
5649
-
5650
- for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) {
5651
- char mbuf[64];
5652
- char *buf;
5653
- struct slab_attribute *attr = to_slab_attr(slab_attrs[i]);
5654
- ssize_t len;
5655
-
5656
- if (!attr || !attr->store || !attr->show)
5657
- continue;
5658
-
5659
- /*
5660
- * It is really bad that we have to allocate here, so we will
5661
- * do it only as a fallback. If we actually allocate, though,
5662
- * we can just use the allocated buffer until the end.
5663
- *
5664
- * Most of the slub attributes will tend to be very small in
5665
- * size, but sysfs allows buffers up to a page, so they can
5666
- * theoretically happen.
5667
- */
5668
- if (buffer)
5669
- buf = buffer;
5670
- else if (root_cache->max_attr_size < ARRAY_SIZE(mbuf) &&
5671
- !IS_ENABLED(CONFIG_SLUB_STATS))
5672
- buf = mbuf;
5673
- else {
5674
- buffer = (char *) get_zeroed_page(GFP_KERNEL);
5675
- if (WARN_ON(!buffer))
5676
- continue;
5677
- buf = buffer;
5678
- }
5679
-
5680
- len = attr->show(root_cache, buf);
5681
- if (len > 0)
5682
- attr->store(s, buf, len);
5683
- }
5684
-
5685
- if (buffer)
5686
- free_page((unsigned long)buffer);
5687
-#endif
56885515 }
56895516
56905517 static void kmem_cache_release(struct kobject *k)
....@@ -5702,27 +5529,10 @@
57025529 .release = kmem_cache_release,
57035530 };
57045531
5705
-static int uevent_filter(struct kset *kset, struct kobject *kobj)
5706
-{
5707
- struct kobj_type *ktype = get_ktype(kobj);
5708
-
5709
- if (ktype == &slab_ktype)
5710
- return 1;
5711
- return 0;
5712
-}
5713
-
5714
-static const struct kset_uevent_ops slab_uevent_ops = {
5715
- .filter = uevent_filter,
5716
-};
5717
-
57185532 static struct kset *slab_kset;
57195533
57205534 static inline struct kset *cache_kset(struct kmem_cache *s)
57215535 {
5722
-#ifdef CONFIG_MEMCG
5723
- if (!is_root_cache(s))
5724
- return s->memcg_params.root_cache->memcg_kset;
5725
-#endif
57265536 return slab_kset;
57275537 }
57285538
....@@ -5737,7 +5547,8 @@
57375547 char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL);
57385548 char *p = name;
57395549
5740
- BUG_ON(!name);
5550
+ if (!name)
5551
+ return ERR_PTR(-ENOMEM);
57415552
57425553 *p++ = ':';
57435554 /*
....@@ -5765,36 +5576,12 @@
57655576 return name;
57665577 }
57675578
5768
-static void sysfs_slab_remove_workfn(struct work_struct *work)
5769
-{
5770
- struct kmem_cache *s =
5771
- container_of(work, struct kmem_cache, kobj_remove_work);
5772
-
5773
- if (!s->kobj.state_in_sysfs)
5774
- /*
5775
- * For a memcg cache, this may be called during
5776
- * deactivation and again on shutdown. Remove only once.
5777
- * A cache is never shut down before deactivation is
5778
- * complete, so no need to worry about synchronization.
5779
- */
5780
- goto out;
5781
-
5782
-#ifdef CONFIG_MEMCG
5783
- kset_unregister(s->memcg_kset);
5784
-#endif
5785
- kobject_uevent(&s->kobj, KOBJ_REMOVE);
5786
-out:
5787
- kobject_put(&s->kobj);
5788
-}
5789
-
57905579 static int sysfs_slab_add(struct kmem_cache *s)
57915580 {
57925581 int err;
57935582 const char *name;
57945583 struct kset *kset = cache_kset(s);
57955584 int unmergeable = slab_unmergeable(s);
5796
-
5797
- INIT_WORK(&s->kobj_remove_work, sysfs_slab_remove_workfn);
57985585
57995586 if (!kset) {
58005587 kobject_init(&s->kobj, &slab_ktype);
....@@ -5819,6 +5606,8 @@
58195606 * for the symlinks.
58205607 */
58215608 name = create_unique_id(s);
5609
+ if (IS_ERR(name))
5610
+ return PTR_ERR(name);
58225611 }
58235612
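create_unique_id() now reports allocation failure with an error pointer instead of BUG_ON(), and sysfs_slab_add() propagates it via IS_ERR()/PTR_ERR() as shown above. A standalone model of the error-pointer convention, with the kernel helpers redefined locally for the demo:

#include <stdio.h>
#include <errno.h>

/* Local stand-ins for the kernel's ERR_PTR()/IS_ERR()/PTR_ERR() helpers. */
#define MAX_ERRNO	4095
#define ERR_PTR(err)	((void *)(long)(err))
#define IS_ERR(ptr)	((unsigned long)(ptr) >= (unsigned long)-MAX_ERRNO)
#define PTR_ERR(ptr)	((long)(ptr))

int main(void)
{
	void *name = ERR_PTR(-ENOMEM);	/* what create_unique_id() returns on failure */

	if (IS_ERR(name))
		printf("create_unique_id failed: errno %ld\n", -PTR_ERR(name));
	return 0;
}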
58245613 s->kobj.kset = kset;
....@@ -5830,17 +5619,6 @@
58305619 if (err)
58315620 goto out_del_kobj;
58325621
5833
-#ifdef CONFIG_MEMCG
5834
- if (is_root_cache(s) && memcg_sysfs_enabled) {
5835
- s->memcg_kset = kset_create_and_add("cgroup", NULL, &s->kobj);
5836
- if (!s->memcg_kset) {
5837
- err = -ENOMEM;
5838
- goto out_del_kobj;
5839
- }
5840
- }
5841
-#endif
5842
-
5843
- kobject_uevent(&s->kobj, KOBJ_ADD);
58445622 if (!unmergeable) {
58455623 /* Setup first alias */
58465624 sysfs_slab_alias(s, s->name);
....@@ -5852,19 +5630,6 @@
58525630 out_del_kobj:
58535631 kobject_del(&s->kobj);
58545632 goto out;
5855
-}
5856
-
5857
-static void sysfs_slab_remove(struct kmem_cache *s)
5858
-{
5859
- if (slab_state < FULL)
5860
- /*
5861
- * Sysfs has not been setup yet so no need to remove the
5862
- * cache from sysfs.
5863
- */
5864
- return;
5865
-
5866
- kobject_get(&s->kobj);
5867
- schedule_work(&s->kobj_remove_work);
58685633 }
58695634
58705635 void sysfs_slab_unlink(struct kmem_cache *s)
....@@ -5921,7 +5686,7 @@
59215686
59225687 mutex_lock(&slab_mutex);
59235688
5924
- slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj);
5689
+ slab_kset = kset_create_and_add("slab", NULL, kernel_kobj);
59255690 if (!slab_kset) {
59265691 mutex_unlock(&slab_mutex);
59275692 pr_err("Cannot register slab subsystem.\n");
....@@ -5956,6 +5721,189 @@
59565721 __initcall(slab_sysfs_init);
59575722 #endif /* CONFIG_SLUB_SYSFS */
59585723
5724
+#if defined(CONFIG_SLUB_DEBUG) && defined(CONFIG_DEBUG_FS)
5725
+static int slab_debugfs_show(struct seq_file *seq, void *v)
5726
+{
5727
+ struct loc_track *t = seq->private;
5728
+ struct location *l;
5729
+ unsigned long idx;
5730
+
5731
+ idx = (unsigned long) t->idx;
5732
+ if (idx < t->count) {
5733
+ l = &t->loc[idx];
5734
+
5735
+ seq_printf(seq, "%7ld ", l->count);
5736
+
5737
+ if (l->addr)
5738
+ seq_printf(seq, "%pS", (void *)l->addr);
5739
+ else
5740
+ seq_puts(seq, "<not-available>");
5741
+
5742
+ if (l->sum_time != l->min_time) {
5743
+ seq_printf(seq, " age=%ld/%llu/%ld",
5744
+ l->min_time, div_u64(l->sum_time, l->count),
5745
+ l->max_time);
5746
+ } else
5747
+ seq_printf(seq, " age=%ld", l->min_time);
5748
+
5749
+ if (l->min_pid != l->max_pid)
5750
+ seq_printf(seq, " pid=%ld-%ld", l->min_pid, l->max_pid);
5751
+ else
5752
+ seq_printf(seq, " pid=%ld",
5753
+ l->min_pid);
5754
+
5755
+ if (num_online_cpus() > 1 && !cpumask_empty(to_cpumask(l->cpus)))
5756
+ seq_printf(seq, " cpus=%*pbl",
5757
+ cpumask_pr_args(to_cpumask(l->cpus)));
5758
+
5759
+ if (nr_online_nodes > 1 && !nodes_empty(l->nodes))
5760
+ seq_printf(seq, " nodes=%*pbl",
5761
+ nodemask_pr_args(&l->nodes));
5762
+
5763
+ seq_puts(seq, "\n");
5764
+ }
5765
+
5766
+ if (!idx && !t->count)
5767
+ seq_puts(seq, "No data\n");
5768
+
5769
+ return 0;
5770
+}
5771
+
5772
+static void slab_debugfs_stop(struct seq_file *seq, void *v)
5773
+{
5774
+}
5775
+
5776
+static void *slab_debugfs_next(struct seq_file *seq, void *v, loff_t *ppos)
5777
+{
5778
+ struct loc_track *t = seq->private;
5779
+
5780
+ t->idx = ++(*ppos);
5781
+ if (*ppos <= t->count)
5782
+ return ppos;
5783
+
5784
+ return NULL;
5785
+}
5786
+
5787
+static void *slab_debugfs_start(struct seq_file *seq, loff_t *ppos)
5788
+{
5789
+ struct loc_track *t = seq->private;
5790
+
5791
+ t->idx = *ppos;
5792
+ return ppos;
5793
+}
5794
+
5795
+static const struct seq_operations slab_debugfs_sops = {
5796
+ .start = slab_debugfs_start,
5797
+ .next = slab_debugfs_next,
5798
+ .stop = slab_debugfs_stop,
5799
+ .show = slab_debugfs_show,
5800
+};
5801
+
5802
+static int slab_debug_trace_open(struct inode *inode, struct file *filep)
5803
+{
5804
+
5805
+ struct kmem_cache_node *n;
5806
+ enum track_item alloc;
5807
+ int node;
5808
+ struct loc_track *t = __seq_open_private(filep, &slab_debugfs_sops,
5809
+ sizeof(struct loc_track));
5810
+ struct kmem_cache *s = file_inode(filep)->i_private;
5811
+ unsigned long *obj_map;
5812
+
5813
+ if (!t)
5814
+ return -ENOMEM;
5815
+
5816
+ obj_map = bitmap_alloc(oo_objects(s->oo), GFP_KERNEL);
5817
+ if (!obj_map) {
5818
+ seq_release_private(inode, filep);
5819
+ return -ENOMEM;
5820
+ }
5821
+
5822
+ if (strcmp(filep->f_path.dentry->d_name.name, "alloc_traces") == 0)
5823
+ alloc = TRACK_ALLOC;
5824
+ else
5825
+ alloc = TRACK_FREE;
5826
+
5827
+ if (!alloc_loc_track(t, PAGE_SIZE / sizeof(struct location), GFP_KERNEL)) {
5828
+ bitmap_free(obj_map);
5829
+ seq_release_private(inode, filep);
5830
+ return -ENOMEM;
5831
+ }
5832
+
5833
+ /* Push back cpu slabs */
5834
+ flush_all(s);
5835
+
5836
+ for_each_kmem_cache_node(s, node, n) {
5837
+ unsigned long flags;
5838
+ struct page *page;
5839
+
5840
+ if (!atomic_long_read(&n->nr_slabs))
5841
+ continue;
5842
+
5843
+ spin_lock_irqsave(&n->list_lock, flags);
5844
+ list_for_each_entry(page, &n->partial, slab_list)
5845
+ process_slab(t, s, page, alloc, obj_map);
5846
+ list_for_each_entry(page, &n->full, slab_list)
5847
+ process_slab(t, s, page, alloc, obj_map);
5848
+ spin_unlock_irqrestore(&n->list_lock, flags);
5849
+ }
5850
+
5851
+ bitmap_free(obj_map);
5852
+ return 0;
5853
+}
5854
+
5855
+static int slab_debug_trace_release(struct inode *inode, struct file *file)
5856
+{
5857
+ struct seq_file *seq = file->private_data;
5858
+ struct loc_track *t = seq->private;
5859
+
5860
+ free_loc_track(t);
5861
+ return seq_release_private(inode, file);
5862
+}
5863
+
5864
+static const struct file_operations slab_debugfs_fops = {
5865
+ .open = slab_debug_trace_open,
5866
+ .read = seq_read,
5867
+ .llseek = seq_lseek,
5868
+ .release = slab_debug_trace_release,
5869
+};
5870
+
5871
+static void debugfs_slab_add(struct kmem_cache *s)
5872
+{
5873
+ struct dentry *slab_cache_dir;
5874
+
5875
+ if (unlikely(!slab_debugfs_root))
5876
+ return;
5877
+
5878
+ slab_cache_dir = debugfs_create_dir(s->name, slab_debugfs_root);
5879
+
5880
+ debugfs_create_file("alloc_traces", 0400,
5881
+ slab_cache_dir, s, &slab_debugfs_fops);
5882
+
5883
+ debugfs_create_file("free_traces", 0400,
5884
+ slab_cache_dir, s, &slab_debugfs_fops);
5885
+}
5886
+
5887
+void debugfs_slab_release(struct kmem_cache *s)
5888
+{
5889
+ debugfs_remove_recursive(debugfs_lookup(s->name, slab_debugfs_root));
5890
+}
5891
+
5892
+static int __init slab_debugfs_init(void)
5893
+{
5894
+ struct kmem_cache *s;
5895
+
5896
+ slab_debugfs_root = debugfs_create_dir("slab", NULL);
5897
+
5898
+ list_for_each_entry(s, &slab_caches, list)
5899
+ if (s->flags & SLAB_STORE_USER)
5900
+ debugfs_slab_add(s);
5901
+
5902
+ return 0;
5903
+
5904
+}
5905
+__initcall(slab_debugfs_init);
5906
+#endif
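The block above publishes per-cache alloc_traces and free_traces files under a "slab" directory in debugfs, replacing the old alloc_calls/free_calls sysfs attributes. A minimal user-space sketch of reading one of them; the mount point, the example cache name, and the need for SLAB_STORE_USER (e.g. booting with slub_debug=U) are assumptions, and the files are root-readable only (mode 0400):

#include <stdio.h>

int main(void)
{
	/* Assumes debugfs at /sys/kernel/debug and a cache named "kmalloc-64". */
	const char *path = "/sys/kernel/debug/slab/kmalloc-64/alloc_traces";
	char line[512];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}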
59595907 /*
59605908 * The /proc/slabinfo ABI
59615909 */
....@@ -5981,6 +5929,7 @@
59815929 sinfo->objects_per_slab = oo_objects(s->oo);
59825930 sinfo->cache_order = oo_order(s->oo);
59835931 }
5932
+EXPORT_SYMBOL_GPL(get_slabinfo);
59845933
59855934 void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s)
59865935 {