hc
2023-12-06 08f87f769b595151be1afeff53e144f543faa614
kernel/mm/zsmalloc.c
@@ -39,8 +39,8 @@
 #include <linux/highmem.h>
 #include <linux/string.h>
 #include <linux/slab.h>
+#include <linux/pgtable.h>
 #include <asm/tlbflush.h>
-#include <asm/pgtable.h>
 #include <linux/cpumask.h>
 #include <linux/cpu.h>
 #include <linux/vmalloc.h>
@@ -52,10 +52,12 @@
 #include <linux/zsmalloc.h>
 #include <linux/zpool.h>
 #include <linux/mount.h>
+#include <linux/pseudo_fs.h>
 #include <linux/migrate.h>
 #include <linux/wait.h>
 #include <linux/pagemap.h>
 #include <linux/fs.h>
+#include <linux/local_lock.h>
 
 #define ZSPAGE_MAGIC	0x58
 
@@ -76,9 +78,23 @@
 
 #define ZS_HANDLE_SIZE (sizeof(unsigned long))
 
+#ifdef CONFIG_PREEMPT_RT
+
+struct zsmalloc_handle {
+	unsigned long addr;
+	spinlock_t lock;
+};
+
+#define ZS_HANDLE_ALLOC_SIZE (sizeof(struct zsmalloc_handle))
+
+#else
+
+#define ZS_HANDLE_ALLOC_SIZE (sizeof(unsigned long))
+#endif
+
 /*
  * Object location (<PFN>, <obj_idx>) is encoded as
- * as single (unsigned long) handle value.
+ * a single (unsigned long) handle value.
  *
  * Note that object index <obj_idx> starts from 0.
 *
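
A note on the two size macros, not part of the patch but useful context for the create_cache() hunk below: my reading is that ZS_HANDLE_SIZE keeps describing the one-word slot reserved inside each zsmalloc object that links back to its handle, while the new ZS_HANDLE_ALLOC_SIZE only sizes the "zs_handle" slab cache, which on PREEMPT_RT must additionally carry a spinlock. A minimal sketch of that relationship:

/* Sketch only, not from the patch: the slab object may grow on PREEMPT_RT,
 * the in-zspage handle slot stays a single word. */
static_assert(ZS_HANDLE_ALLOC_SIZE >= ZS_HANDLE_SIZE);
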
@@ -292,11 +308,8 @@
 };
 
 struct mapping_area {
-#ifdef CONFIG_PGTABLE_MAPPING
-	struct vm_struct *vm; /* vm area for mapping object that span pages */
-#else
+	local_lock_t lock;
 	char *vm_buf; /* copy buffer for objects that span pages */
-#endif
 	char *vm_addr; /* address of kmap_atomic()'ed pages */
 	enum zs_mapmode vm_mm; /* mapping mode */
 };
@@ -325,7 +338,7 @@
 
 static int create_cache(struct zs_pool *pool)
 {
-	pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
+	pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_ALLOC_SIZE,
 					0, 0, NULL);
 	if (!pool->handle_cachep)
 		return 1;
@@ -349,9 +362,26 @@
 
 static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp)
 {
-	return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
-			gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
+	void *p;
+
+	p = kmem_cache_alloc(pool->handle_cachep,
+			     gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
+#ifdef CONFIG_PREEMPT_RT
+	if (p) {
+		struct zsmalloc_handle *zh = p;
+
+		spin_lock_init(&zh->lock);
+	}
+#endif
+	return (unsigned long)p;
 }
+
+#ifdef CONFIG_PREEMPT_RT
+static struct zsmalloc_handle *zs_get_pure_handle(unsigned long handle)
+{
+	return (void *)(handle &~((1 << OBJ_TAG_BITS) - 1));
+}
+#endif
 
 static void cache_free_handle(struct zs_pool *pool, unsigned long handle)
 {
@@ -361,7 +391,7 @@
 static struct zspage *cache_alloc_zspage(struct zs_pool *pool, gfp_t flags)
 {
 	return kmem_cache_alloc(pool->zspage_cachep,
-			flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
+			flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE|__GFP_CMA));
 }
 
 static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage)
@@ -371,12 +401,18 @@
 
 static void record_obj(unsigned long handle, unsigned long obj)
 {
+#ifdef CONFIG_PREEMPT_RT
+	struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
+
+	WRITE_ONCE(zh->addr, obj);
+#else
 	/*
 	 * lsb of @obj represents handle lock while other bits
 	 * represent object value the handle is pointing so
 	 * updating shouldn't do store tearing.
 	 */
 	WRITE_ONCE(*(unsigned long *)handle, obj);
+#endif
 }
 
 /* zpool driver */
@@ -423,7 +459,7 @@
 	case ZPOOL_MM_WO:
 		zs_mm = ZS_MM_WO;
 		break;
-	case ZPOOL_MM_RW: /* fallthru */
+	case ZPOOL_MM_RW:
 	default:
 		zs_mm = ZS_MM_RW;
 		break;
@@ -442,22 +478,26 @@
 }
 
 static struct zpool_driver zs_zpool_driver = {
-	.type =		"zsmalloc",
-	.owner =	THIS_MODULE,
-	.create =	zs_zpool_create,
-	.destroy =	zs_zpool_destroy,
-	.malloc =	zs_zpool_malloc,
-	.free =		zs_zpool_free,
-	.map =		zs_zpool_map,
-	.unmap =	zs_zpool_unmap,
-	.total_size =	zs_zpool_total_size,
+	.type =			  "zsmalloc",
+	.owner =		  THIS_MODULE,
+	.create =		  zs_zpool_create,
+	.destroy =		  zs_zpool_destroy,
+	.malloc_support_movable = true,
+	.malloc =		  zs_zpool_malloc,
+	.free =			  zs_zpool_free,
+	.map =			  zs_zpool_map,
+	.unmap =		  zs_zpool_unmap,
+	.total_size =		  zs_zpool_total_size,
 };
 
 MODULE_ALIAS("zpool-zsmalloc");
 #endif /* CONFIG_ZPOOL */
 
 /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
-static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
+static DEFINE_PER_CPU(struct mapping_area, zs_map_area) = {
+	/* XXX remove this and use a spin_lock_t in pin_tag() */
+	.lock	= INIT_LOCAL_LOCK(lock),
+};
 
 static bool is_zspage_isolated(struct zspage *zspage)
 {
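
Context for the new .malloc_support_movable flag (my summary, not part of the patch): it tells the zpool layer that zs_malloc() can cope with highmem/movable pages, so a zpool user such as zswap may add __GFP_HIGHMEM|__GFP_MOVABLE to its allocation flags. A sketch of the expected caller-side pattern, with illustrative names and gfp bits:

/* Sketch of a zpool user checking the flag before choosing gfp bits;
 * paraphrased, not copied from zswap. */
static int demo_zpool_store(struct zpool *pool, size_t len, unsigned long *handle)
{
	gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN;

	if (zpool_malloc_support_movable(pool))
		gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;

	return zpool_malloc(pool, len, gfp, handle);
}
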
@@ -475,10 +515,6 @@
 	return zspage->inuse;
 }
 
-static inline void set_zspage_inuse(struct zspage *zspage, int val)
-{
-	zspage->inuse = val;
-}
 
 static inline void mod_zspage_inuse(struct zspage *zspage, int val)
 {
@@ -580,8 +616,6 @@
 	}
 
 	zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
-	if (!zs_stat_root)
-		pr_warn("debugfs 'zsmalloc' stat dir creation failed\n");
 }
 
 static void __exit zs_stat_exit(void)
@@ -652,29 +686,15 @@
 
 static void zs_pool_stat_create(struct zs_pool *pool, const char *name)
 {
-	struct dentry *entry;
-
 	if (!zs_stat_root) {
 		pr_warn("no root stat dir, not creating <%s> stat dir\n", name);
 		return;
 	}
 
-	entry = debugfs_create_dir(name, zs_stat_root);
-	if (!entry) {
-		pr_warn("debugfs dir <%s> creation failed\n", name);
-		return;
-	}
-	pool->stat_dentry = entry;
+	pool->stat_dentry = debugfs_create_dir(name, zs_stat_root);
 
-	entry = debugfs_create_file("classes", S_IFREG | 0444,
-			pool->stat_dentry, pool,
-			&zs_stats_size_fops);
-	if (!entry) {
-		pr_warn("%s: debugfs file entry <%s> creation failed\n",
-			name, "classes");
-		debugfs_remove_recursive(pool->stat_dentry);
-		pool->stat_dentry = NULL;
-	}
+	debugfs_create_file("classes", S_IFREG | 0444, pool->stat_dentry, pool,
+			&zs_stats_size_fops);
 }
 
 static void zs_pool_stat_destroy(struct zs_pool *pool)
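
Why the error handling can simply go away (background, not part of the patch): debugfs_create_dir() and debugfs_create_file() report failure with an ERR_PTR rather than NULL, and the debugfs API is designed so that passing such a value as the parent makes later calls fail gracefully, so callers are expected not to check. A small sketch of the resulting pattern, with illustrative names:

static void demo_debugfs_setup(struct dentry *root, void *data,
			       const struct file_operations *fops)
{
	struct dentry *dir;

	/* may return an ERR_PTR; deliberately not checked */
	dir = debugfs_create_dir("demo", root);

	/* an ERR_PTR parent turns this call into a no-op, so no check either */
	debugfs_create_file("stats", 0444, dir, data, fops);
}
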
@@ -887,7 +907,13 @@
 
 static unsigned long handle_to_obj(unsigned long handle)
 {
+#ifdef CONFIG_PREEMPT_RT
+	struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
+
+	return zh->addr;
+#else
 	return *(unsigned long *)handle;
+#endif
 }
 
 static unsigned long obj_to_head(struct page *page, void *obj)
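
Putting the handle helpers together (a sketch of my own, not code from the patch): callers keep treating the handle as an opaque unsigned long; only the accessors above know whether it points at a bare word or at a struct zsmalloc_handle.

/* Illustrative only; uses the helpers defined in the hunks above. */
static unsigned long demo_handle_roundtrip(struct zs_pool *pool,
					   unsigned long obj, gfp_t gfp)
{
	unsigned long handle = cache_alloc_handle(pool, gfp);

	if (!handle)
		return 0;

	record_obj(handle, obj);		/* *handle = obj, or zh->addr = obj on RT */
	WARN_ON(handle_to_obj(handle) != obj);	/* read back through the same layout */
	cache_free_handle(pool, handle);
	return obj;
}
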
@@ -901,22 +927,46 @@
 
 static inline int testpin_tag(unsigned long handle)
 {
+#ifdef CONFIG_PREEMPT_RT
+	struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
+
+	return spin_is_locked(&zh->lock);
+#else
 	return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle);
+#endif
 }
 
 static inline int trypin_tag(unsigned long handle)
 {
+#ifdef CONFIG_PREEMPT_RT
+	struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
+
+	return spin_trylock(&zh->lock);
+#else
 	return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle);
+#endif
 }
 
-static void pin_tag(unsigned long handle)
+static void pin_tag(unsigned long handle) __acquires(bitlock)
 {
+#ifdef CONFIG_PREEMPT_RT
+	struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
+
+	return spin_lock(&zh->lock);
+#else
 	bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle);
+#endif
 }
 
-static void unpin_tag(unsigned long handle)
+static void unpin_tag(unsigned long handle) __releases(bitlock)
 {
+#ifdef CONFIG_PREEMPT_RT
+	struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
+
+	return spin_unlock(&zh->lock);
+#else
 	bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle);
+#endif
 }
 
 static void reset_page(struct page *page)
@@ -1131,46 +1181,6 @@
 	return zspage;
 }
 
-#ifdef CONFIG_PGTABLE_MAPPING
-static inline int __zs_cpu_up(struct mapping_area *area)
-{
-	/*
-	 * Make sure we don't leak memory if a cpu UP notification
-	 * and zs_init() race and both call zs_cpu_up() on the same cpu
-	 */
-	if (area->vm)
-		return 0;
-	area->vm = alloc_vm_area(PAGE_SIZE * 2, NULL);
-	if (!area->vm)
-		return -ENOMEM;
-	return 0;
-}
-
-static inline void __zs_cpu_down(struct mapping_area *area)
-{
-	if (area->vm)
-		free_vm_area(area->vm);
-	area->vm = NULL;
-}
-
-static inline void *__zs_map_object(struct mapping_area *area,
-				struct page *pages[2], int off, int size)
-{
-	BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, pages));
-	area->vm_addr = area->vm->addr;
-	return area->vm_addr + off;
-}
-
-static inline void __zs_unmap_object(struct mapping_area *area,
-				struct page *pages[2], int off, int size)
-{
-	unsigned long addr = (unsigned long)area->vm_addr;
-
-	unmap_kernel_range(addr, PAGE_SIZE * 2);
-}
-
-#else /* CONFIG_PGTABLE_MAPPING */
-
 static inline int __zs_cpu_up(struct mapping_area *area)
 {
 	/*
@@ -1250,8 +1260,6 @@
 	/* enable page faults to match kunmap_atomic() return conditions */
 	pagefault_enable();
 }
-
-#endif /* CONFIG_PGTABLE_MAPPING */
 
 static int zs_cpu_prepare(unsigned int cpu)
 {
@@ -1342,7 +1350,8 @@
 	class = pool->size_class[class_idx];
 	off = (class->size * obj_idx) & ~PAGE_MASK;
 
-	area = &get_cpu_var(zs_map_area);
+	local_lock(&zs_map_area.lock);
+	area = this_cpu_ptr(&zs_map_area);
 	area->vm_mm = mm;
 	if (off + class->size <= PAGE_SIZE) {
 		/* this object is contained entirely within a page */
@@ -1396,7 +1405,7 @@
 
 		__zs_unmap_object(area, pages, off, class->size);
 	}
-	put_cpu_var(zs_map_area);
+	local_unlock(&zs_map_area.lock);
 
 	migrate_read_unlock(zspage);
 	unpin_tag(handle);
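
The two hunks above replace get_cpu_var()/put_cpu_var(), which protected the per-CPU mapping_area only by implicitly disabling preemption, with the local_lock_t embedded in zs_map_area (initialized in the earlier DEFINE_PER_CPU hunk). On !PREEMPT_RT a local_lock still just disables preemption; on PREEMPT_RT it becomes a per-CPU sleeping lock, which is the point of the conversion. A minimal, self-contained sketch of the pattern, with hypothetical names:

#include <linux/local_lock.h>
#include <linux/percpu.h>

struct demo_area {			/* illustrative, not from zsmalloc */
	local_lock_t lock;
	char *buf;
};

static DEFINE_PER_CPU(struct demo_area, demo_area) = {
	.lock = INIT_LOCAL_LOCK(lock),
};

static void demo_use(void)
{
	struct demo_area *area;

	local_lock(&demo_area.lock);	/* preemption off, or a per-CPU lock on RT */
	area = this_cpu_ptr(&demo_area);
	/* ... operate on area->buf; we are pinned to this CPU ... */
	local_unlock(&demo_area.lock);
}
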
@@ -1812,26 +1821,50 @@
  */
 static void lock_zspage(struct zspage *zspage)
 {
-	struct page *page = get_first_page(zspage);
+	struct page *curr_page, *page;
 
-	do {
-		lock_page(page);
-	} while ((page = get_next_page(page)) != NULL);
+	/*
+	 * Pages we haven't locked yet can be migrated off the list while we're
+	 * trying to lock them, so we need to be careful and only attempt to
+	 * lock each page under migrate_read_lock(). Otherwise, the page we lock
+	 * may no longer belong to the zspage. This means that we may wait for
+	 * the wrong page to unlock, so we must take a reference to the page
+	 * prior to waiting for it to unlock outside migrate_read_lock().
+	 */
+	while (1) {
+		migrate_read_lock(zspage);
+		page = get_first_page(zspage);
+		if (trylock_page(page))
+			break;
+		get_page(page);
+		migrate_read_unlock(zspage);
+		wait_on_page_locked(page);
+		put_page(page);
+	}
+
+	curr_page = page;
+	while ((page = get_next_page(curr_page))) {
+		if (trylock_page(page)) {
+			curr_page = page;
+		} else {
+			get_page(page);
+			migrate_read_unlock(zspage);
+			wait_on_page_locked(page);
+			put_page(page);
+			migrate_read_lock(zspage);
+		}
+	}
+	migrate_read_unlock(zspage);
 }
 
-static struct dentry *zs_mount(struct file_system_type *fs_type,
-				int flags, const char *dev_name, void *data)
+static int zs_init_fs_context(struct fs_context *fc)
 {
-	static const struct dentry_operations ops = {
-		.d_dname = simple_dname,
-	};
-
-	return mount_pseudo(fs_type, "zsmalloc:", NULL, &ops, ZSMALLOC_MAGIC);
+	return init_pseudo(fc, ZSMALLOC_MAGIC) ? 0 : -ENOMEM;
 }
 
 static struct file_system_type zsmalloc_fs = {
 	.name		= "zsmalloc",
-	.mount		= zs_mount,
+	.init_fs_context = zs_init_fs_context,
 	.kill_sb	= kill_anon_super,
 };
 
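
On the filesystem conversion at the end of that hunk (background, not part of the patch): the legacy .mount/mount_pseudo() path is replaced by the fs_context API, where init_pseudo() allocates a pseudo_fs_context (returning NULL on failure) that could be customized further. zsmalloc only uses this filesystem internally, presumably via kern_mount() elsewhere in the file, to obtain inodes whose mappings back page migration. A sketch of the pattern with illustrative names:

#include <linux/fs_context.h>
#include <linux/pseudo_fs.h>

#define DEMO_FS_MAGIC	0x64656d6f	/* hypothetical magic number */

static int demo_init_fs_context(struct fs_context *fc)
{
	struct pseudo_fs_context *ctx = init_pseudo(fc, DEMO_FS_MAGIC);

	if (!ctx)
		return -ENOMEM;
	/* ctx->ops / ctx->dops could be overridden here if needed */
	return 0;
}

static struct file_system_type demo_fs = {
	.name			= "demo_fs",
	.init_fs_context	= demo_init_fs_context,
	.kill_sb		= kill_anon_super,
};
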
@@ -1856,12 +1889,12 @@
 	rwlock_init(&zspage->lock);
 }
 
-static void migrate_read_lock(struct zspage *zspage)
+static void migrate_read_lock(struct zspage *zspage) __acquires(&zspage->lock)
 {
 	read_lock(&zspage->lock);
 }
 
-static void migrate_read_unlock(struct zspage *zspage)
+static void migrate_read_unlock(struct zspage *zspage) __releases(&zspage->lock)
 {
 	read_unlock(&zspage->lock);
 }