  .. |   ..
  39 |   39 | #include <linux/highmem.h>
  40 |   40 | #include <linux/string.h>
  41 |   41 | #include <linux/slab.h>
     |   42 | +#include <linux/pgtable.h>
  42 |   43 | #include <asm/tlbflush.h>
  43 |      | -#include <asm/pgtable.h>
  44 |   44 | #include <linux/cpumask.h>
  45 |   45 | #include <linux/cpu.h>
  46 |   46 | #include <linux/vmalloc.h>
  .. |   ..
  52 |   52 | #include <linux/zsmalloc.h>
  53 |   53 | #include <linux/zpool.h>
  54 |   54 | #include <linux/mount.h>
     |   55 | +#include <linux/pseudo_fs.h>
  55 |   56 | #include <linux/migrate.h>
  56 |   57 | #include <linux/wait.h>
  57 |   58 | #include <linux/pagemap.h>
  58 |   59 | #include <linux/fs.h>
     |   60 | +#include <linux/local_lock.h>
  59 |   61 |
  60 |   62 | #define ZSPAGE_MAGIC	0x58
  61 |   63 |
  .. |   ..
  76 |   78 |
  77 |   79 | #define ZS_HANDLE_SIZE (sizeof(unsigned long))
  78 |   80 |
     |   81 | +#ifdef CONFIG_PREEMPT_RT
     |   82 | +
     |   83 | +struct zsmalloc_handle {
     |   84 | +	unsigned long addr;
     |   85 | +	spinlock_t lock;
     |   86 | +};
     |   87 | +
     |   88 | +#define ZS_HANDLE_ALLOC_SIZE (sizeof(struct zsmalloc_handle))
     |   89 | +
     |   90 | +#else
     |   91 | +
     |   92 | +#define ZS_HANDLE_ALLOC_SIZE (sizeof(unsigned long))
     |   93 | +#endif
     |   94 | +
  79 |   95 | /*
  80 |   96 |  * Object location (<PFN>, <obj_idx>) is encoded as
  81 |      | - * as single (unsigned long) handle value.
     |   97 | + * a single (unsigned long) handle value.
  82 |   98 |  *
  83 |   99 |  * Note that object index <obj_idx> starts from 0.
  84 |  100 |  *
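
The RT hunk above changes what a zsmalloc handle physically is: with CONFIG_PREEMPT_RT it is no longer a bare unsigned long whose low bit doubles as a bit spinlock, but a small struct pairing the value with a real spinlock_t (which is a sleeping, rtmutex-backed lock on RT). A runnable userspace sketch of the resulting size difference, using pthread_spinlock_t as a stand-in for the kernel's spinlock_t (sizes will differ from the kernel's):

#include <stdio.h>
#include <pthread.h>

/* stand-in for the RT layout; kernel spinlock_t is not this type */
struct zsmalloc_handle {
	unsigned long addr;
	pthread_spinlock_t lock;
};

int main(void)
{
	printf("!RT slab object: %zu bytes (bare unsigned long)\n",
	       sizeof(unsigned long));
	printf(" RT slab object: %zu bytes (addr + embedded lock)\n",
	       sizeof(struct zsmalloc_handle));
	return 0;
}

This is why ZS_HANDLE_ALLOC_SIZE exists separately from ZS_HANDLE_SIZE: the slab cache must allocate room for the lock on RT, while the handle value seen by callers stays an unsigned long.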
---|
  .. |   ..
 292 |  308 | };
 293 |  309 |
 294 |  310 | struct mapping_area {
 295 |      | -#ifdef CONFIG_PGTABLE_MAPPING
 296 |      | -	struct vm_struct *vm; /* vm area for mapping object that span pages */
 297 |      | -#else
     |  311 | +	local_lock_t lock;
 298 |  312 | 	char *vm_buf; /* copy buffer for objects that span pages */
 299 |      | -#endif
 300 |  313 | 	char *vm_addr; /* address of kmap_atomic()'ed pages */
 301 |  314 | 	enum zs_mapmode vm_mm; /* mapping mode */
 302 |  315 | };
  .. |   ..
 325 |  338 |
 326 |  339 | static int create_cache(struct zs_pool *pool)
 327 |  340 | {
 328 |      | -	pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
     |  341 | +	pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_ALLOC_SIZE,
 329 |  342 | 					0, 0, NULL);
 330 |  343 | 	if (!pool->handle_cachep)
 331 |  344 | 		return 1;
  .. |   ..
 349 |  362 |
 350 |  363 | static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp)
 351 |  364 | {
 352 |      | -	return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
 353 |      | -			gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
     |  365 | +	void *p;
     |  366 | +
     |  367 | +	p = kmem_cache_alloc(pool->handle_cachep,
     |  368 | +			     gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
     |  369 | +#ifdef CONFIG_PREEMPT_RT
     |  370 | +	if (p) {
     |  371 | +		struct zsmalloc_handle *zh = p;
     |  372 | +
     |  373 | +		spin_lock_init(&zh->lock);
     |  374 | +	}
     |  375 | +#endif
     |  376 | +	return (unsigned long)p;
 354 |  377 | }
     |  378 | +
     |  379 | +#ifdef CONFIG_PREEMPT_RT
     |  380 | +static struct zsmalloc_handle *zs_get_pure_handle(unsigned long handle)
     |  381 | +{
     |  382 | +	return (void *)(handle & ~((1 << OBJ_TAG_BITS) - 1));
     |  383 | +}
     |  384 | +#endif
 355 |  385 |
 356 |  386 | static void cache_free_handle(struct zs_pool *pool, unsigned long handle)
 357 |  387 | {
  .. |   ..
 361 |  391 | static struct zspage *cache_alloc_zspage(struct zs_pool *pool, gfp_t flags)
 362 |  392 | {
 363 |  393 | 	return kmem_cache_alloc(pool->zspage_cachep,
 364 |      | -			flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
     |  394 | +			flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE|__GFP_CMA));
 365 |  395 | }
 366 |  396 |
 367 |  397 | static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage)
  .. |   ..
 371 |  401 |
 372 |  402 | static void record_obj(unsigned long handle, unsigned long obj)
 373 |  403 | {
     |  404 | +#ifdef CONFIG_PREEMPT_RT
     |  405 | +	struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
     |  406 | +
     |  407 | +	WRITE_ONCE(zh->addr, obj);
     |  408 | +#else
 374 |  409 | 	/*
 375 |  410 | 	 * lsb of @obj represents handle lock while other bits
 376 |  411 | 	 * represent object value the handle is pointing so
 377 |  412 | 	 * updating shouldn't do store tearing.
 378 |  413 | 	 */
 379 |  414 | 	WRITE_ONCE(*(unsigned long *)handle, obj);
     |  415 | +#endif
 380 |  416 | }
 381 |  417 |
 382 |  418 | /* zpool driver */
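
zs_get_pure_handle() strips the low OBJ_TAG_BITS tag bits off the handle before it is dereferenced as a struct zsmalloc_handle pointer. That works because slab objects are at least word-aligned, leaving the low bit(s) free for tagging. A runnable sketch of the arithmetic (assuming OBJ_TAG_BITS is 1, its mainline value; the address is a made-up example):

#include <assert.h>
#include <stdio.h>

#define OBJ_TAG_BITS 1

int main(void)
{
	/* hypothetical aligned slab address; bit 0 is free for a tag */
	unsigned long addr = 0x12345670UL;
	unsigned long tagged = addr | 1UL;	/* low tag bit set */

	unsigned long pure = tagged & ~((1UL << OBJ_TAG_BITS) - 1);
	assert(pure == addr);
	printf("mask clears %d low bit(s): %#lx -> %#lx\n",
	       OBJ_TAG_BITS, tagged, pure);
	return 0;
}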
---|
  .. |   ..
 423 |  459 | 	case ZPOOL_MM_WO:
 424 |  460 | 		zs_mm = ZS_MM_WO;
 425 |  461 | 		break;
 426 |      | -	case ZPOOL_MM_RW: /* fallthru */
     |  462 | +	case ZPOOL_MM_RW:
 427 |  463 | 	default:
 428 |  464 | 		zs_mm = ZS_MM_RW;
 429 |  465 | 		break;
  .. |   ..
 442 |  478 | }
 443 |  479 |
 444 |  480 | static struct zpool_driver zs_zpool_driver = {
 445 |      | -	.type =		"zsmalloc",
 446 |      | -	.owner =	THIS_MODULE,
 447 |      | -	.create =	zs_zpool_create,
 448 |      | -	.destroy =	zs_zpool_destroy,
 449 |      | -	.malloc =	zs_zpool_malloc,
 450 |      | -	.free =		zs_zpool_free,
 451 |      | -	.map =		zs_zpool_map,
 452 |      | -	.unmap =	zs_zpool_unmap,
 453 |      | -	.total_size =	zs_zpool_total_size,
     |  481 | +	.type =			  "zsmalloc",
     |  482 | +	.owner =		  THIS_MODULE,
     |  483 | +	.create =		  zs_zpool_create,
     |  484 | +	.destroy =		  zs_zpool_destroy,
     |  485 | +	.malloc_support_movable = true,
     |  486 | +	.malloc =		  zs_zpool_malloc,
     |  487 | +	.free =			  zs_zpool_free,
     |  488 | +	.map =			  zs_zpool_map,
     |  489 | +	.unmap =		  zs_zpool_unmap,
     |  490 | +	.total_size =		  zs_zpool_total_size,
 454 |  491 | };
 455 |  492 |
 456 |  493 | MODULE_ALIAS("zpool-zsmalloc");
 457 |  494 | #endif /* CONFIG_ZPOOL */
 458 |  495 |
 459 |  496 | /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
 460 |      | -static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
     |  497 | +static DEFINE_PER_CPU(struct mapping_area, zs_map_area) = {
     |  498 | +	/* XXX remove this and use a spin_lock_t in pin_tag() */
     |  499 | +	.lock	= INIT_LOCAL_LOCK(lock),
     |  500 | +};
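
zs_map_area now carries a statically initialized local_lock_t instead of relying on get_cpu_var()'s implicit preempt_disable(). The general pattern, as a minimal kernel-style sketch (my_pcpu_state and use_scratch are hypothetical names; the APIs are the real local_lock ones used in this patch):

#include <linux/local_lock.h>
#include <linux/percpu.h>

struct my_pcpu_state {			/* hypothetical per-CPU scratch */
	local_lock_t lock;
	char *buf;
};

static DEFINE_PER_CPU(struct my_pcpu_state, my_state) = {
	.lock = INIT_LOCAL_LOCK(lock),
};

static void use_scratch(void)
{
	struct my_pcpu_state *st;

	local_lock(&my_state.lock);	/* preempt_disable() on !RT,
					 * a per-CPU sleeping lock on RT */
	st = this_cpu_ptr(&my_state);
	/* ... operate on st->buf ... */
	local_unlock(&my_state.lock);
}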
---|
 461 |  501 |
 462 |  502 | static bool is_zspage_isolated(struct zspage *zspage)
 463 |  503 | {
  .. |   ..
 475 |  515 | 	return zspage->inuse;
 476 |  516 | }
 477 |  517 |
 478 |      | -static inline void set_zspage_inuse(struct zspage *zspage, int val)
 479 |      | -{
 480 |      | -	zspage->inuse = val;
 481 |      | -}
 482 |  518 |
 483 |  519 | static inline void mod_zspage_inuse(struct zspage *zspage, int val)
 484 |  520 | {
  .. |   ..
 580 |  616 | 	}
 581 |  617 |
 582 |  618 | 	zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
 583 |      | -	if (!zs_stat_root)
 584 |      | -		pr_warn("debugfs 'zsmalloc' stat dir creation failed\n");
 585 |  619 | }
 586 |  620 |
 587 |  621 | static void __exit zs_stat_exit(void)
  .. |   ..
 652 |  686 |
 653 |  687 | static void zs_pool_stat_create(struct zs_pool *pool, const char *name)
 654 |  688 | {
 655 |      | -	struct dentry *entry;
 656 |      | -
 657 |  689 | 	if (!zs_stat_root) {
 658 |  690 | 		pr_warn("no root stat dir, not creating <%s> stat dir\n", name);
 659 |  691 | 		return;
 660 |  692 | 	}
 661 |  693 |
 662 |      | -	entry = debugfs_create_dir(name, zs_stat_root);
 663 |      | -	if (!entry) {
 664 |      | -		pr_warn("debugfs dir <%s> creation failed\n", name);
 665 |      | -		return;
 666 |      | -	}
 667 |      | -	pool->stat_dentry = entry;
     |  694 | +	pool->stat_dentry = debugfs_create_dir(name, zs_stat_root);
 668 |  695 |
 669 |      | -	entry = debugfs_create_file("classes", S_IFREG | 0444,
 670 |      | -			pool->stat_dentry, pool,
 671 |      | -			&zs_stats_size_fops);
 672 |      | -	if (!entry) {
 673 |      | -		pr_warn("%s: debugfs file entry <%s> creation failed\n",
 674 |      | -				name, "classes");
 675 |      | -		debugfs_remove_recursive(pool->stat_dentry);
 676 |      | -		pool->stat_dentry = NULL;
 677 |      | -	}
     |  696 | +	debugfs_create_file("classes", S_IFREG | 0444, pool->stat_dentry, pool,
     |  697 | +			    &zs_stats_size_fops);
 678 |  698 | }
 679 |  699 |
 680 |  700 | static void zs_pool_stat_destroy(struct zs_pool *pool)
  .. |   ..
 887 |  907 |
 888 |  908 | static unsigned long handle_to_obj(unsigned long handle)
 889 |  909 | {
     |  910 | +#ifdef CONFIG_PREEMPT_RT
     |  911 | +	struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
     |  912 | +
     |  913 | +	return zh->addr;
     |  914 | +#else
 890 |  915 | 	return *(unsigned long *)handle;
     |  916 | +#endif
 891 |  917 | }
 892 |  918 |
 893 |  919 | static unsigned long obj_to_head(struct page *page, void *obj)
  .. |   ..
 901 |  927 |
 902 |  928 | static inline int testpin_tag(unsigned long handle)
 903 |  929 | {
     |  930 | +#ifdef CONFIG_PREEMPT_RT
     |  931 | +	struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
     |  932 | +
     |  933 | +	return spin_is_locked(&zh->lock);
     |  934 | +#else
 904 |  935 | 	return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle);
     |  936 | +#endif
 905 |  937 | }
 906 |  938 |
 907 |  939 | static inline int trypin_tag(unsigned long handle)
 908 |  940 | {
     |  941 | +#ifdef CONFIG_PREEMPT_RT
     |  942 | +	struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
     |  943 | +
     |  944 | +	return spin_trylock(&zh->lock);
     |  945 | +#else
 909 |  946 | 	return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle);
     |  947 | +#endif
 910 |  948 | }
 911 |  949 |
 912 |      | -static void pin_tag(unsigned long handle)
     |  950 | +static void pin_tag(unsigned long handle) __acquires(bitlock)
 913 |  951 | {
     |  952 | +#ifdef CONFIG_PREEMPT_RT
     |  953 | +	struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
     |  954 | +
     |  955 | +	return spin_lock(&zh->lock);
     |  956 | +#else
 914 |  957 | 	bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle);
     |  958 | +#endif
 915 |  959 | }
 916 |  960 |
 917 |      | -static void unpin_tag(unsigned long handle)
     |  961 | +static void unpin_tag(unsigned long handle) __releases(bitlock)
 918 |  962 | {
     |  963 | +#ifdef CONFIG_PREEMPT_RT
     |  964 | +	struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
     |  965 | +
     |  966 | +	return spin_unlock(&zh->lock);
     |  967 | +#else
 919 |  968 | 	bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle);
     |  969 | +#endif
 920 |  970 | }
 921 |  971 |
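
Both configurations keep the same pin/unpin contract; what changes is where the lock state lives. On !RT it is bit 0 of the handle word itself (HANDLE_PIN_BIT), which is why record_obj() can update value and lock bit with a single WRITE_ONCE(). A runnable userspace sketch of that bit-lock scheme (a hypothetical demo, not kernel code; the encoded value keeps bit 0 clear):

#include <assert.h>
#include <stdatomic.h>
#include <stdio.h>

/* bit 0 = pin lock, remaining bits = object value */
static _Atomic unsigned long handle_word = 0xabcd00UL;

static void pin(void)
{
	unsigned long old;

	do {	/* spin until bit 0 was observed clear and is now set */
		old = atomic_fetch_or(&handle_word, 1UL);
	} while (old & 1UL);
}

static void unpin(void)
{
	atomic_fetch_and(&handle_word, ~1UL);
}

int main(void)
{
	pin();
	assert((atomic_load(&handle_word) & ~1UL) == 0xabcd00UL);
	unpin();
	printf("value survives pin/unpin: %#lx\n",
	       atomic_load(&handle_word));
	return 0;
}

Bit spinlocks like this busy-wait with preemption disabled, which RT cannot tolerate; that is why the RT half substitutes the spinlock_t embedded in the handle object, which becomes a sleeping lock there.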
---|
 922 |  972 | static void reset_page(struct page *page)
  .. |   ..
1131 | 1181 | 	return zspage;
1132 | 1182 | }
1133 | 1183 |
1134 |      | -#ifdef CONFIG_PGTABLE_MAPPING
1135 |      | -static inline int __zs_cpu_up(struct mapping_area *area)
1136 |      | -{
1137 |      | -	/*
1138 |      | -	 * Make sure we don't leak memory if a cpu UP notification
1139 |      | -	 * and zs_init() race and both call zs_cpu_up() on the same cpu
1140 |      | -	 */
1141 |      | -	if (area->vm)
1142 |      | -		return 0;
1143 |      | -	area->vm = alloc_vm_area(PAGE_SIZE * 2, NULL);
1144 |      | -	if (!area->vm)
1145 |      | -		return -ENOMEM;
1146 |      | -	return 0;
1147 |      | -}
1148 |      | -
1149 |      | -static inline void __zs_cpu_down(struct mapping_area *area)
1150 |      | -{
1151 |      | -	if (area->vm)
1152 |      | -		free_vm_area(area->vm);
1153 |      | -	area->vm = NULL;
1154 |      | -}
1155 |      | -
1156 |      | -static inline void *__zs_map_object(struct mapping_area *area,
1157 |      | -				struct page *pages[2], int off, int size)
1158 |      | -{
1159 |      | -	BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, pages));
1160 |      | -	area->vm_addr = area->vm->addr;
1161 |      | -	return area->vm_addr + off;
1162 |      | -}
1163 |      | -
1164 |      | -static inline void __zs_unmap_object(struct mapping_area *area,
1165 |      | -				struct page *pages[2], int off, int size)
1166 |      | -{
1167 |      | -	unsigned long addr = (unsigned long)area->vm_addr;
1168 |      | -
1169 |      | -	unmap_kernel_range(addr, PAGE_SIZE * 2);
1170 |      | -}
1171 |      | -
1172 |      | -#else /* CONFIG_PGTABLE_MAPPING */
1173 |      | -
1174 | 1184 | static inline int __zs_cpu_up(struct mapping_area *area)
1175 | 1185 | {
1176 | 1186 | 	/*
  .. |   ..
1250 | 1260 | 	/* enable page faults to match kunmap_atomic() return conditions */
1251 | 1261 | 	pagefault_enable();
1252 | 1262 | }
1253 |      | -
1254 |      | -#endif /* CONFIG_PGTABLE_MAPPING */
1255 | 1263 |
1256 | 1264 | static int zs_cpu_prepare(unsigned int cpu)
1257 | 1265 | {
  .. |   ..
1342 | 1350 | 	class = pool->size_class[class_idx];
1343 | 1351 | 	off = (class->size * obj_idx) & ~PAGE_MASK;
1344 | 1352 |
1345 |      | -	area = &get_cpu_var(zs_map_area);
     | 1353 | +	local_lock(&zs_map_area.lock);
     | 1354 | +	area = this_cpu_ptr(&zs_map_area);
1346 | 1355 | 	area->vm_mm = mm;
1347 | 1356 | 	if (off + class->size <= PAGE_SIZE) {
1348 | 1357 | 		/* this object is contained entirely within a page */
  .. |   ..
1396 | 1405 |
1397 | 1406 | 		__zs_unmap_object(area, pages, off, class->size);
1398 | 1407 | 	}
1399 |      | -	put_cpu_var(zs_map_area);
     | 1408 | +	local_unlock(&zs_map_area.lock);
1400 | 1409 |
1401 | 1410 | 	migrate_read_unlock(zspage);
1402 | 1411 | 	unpin_tag(handle);
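
The map path swaps get_cpu_var()/put_cpu_var() for the explicit local_lock. The critical section is functionally the same; the difference is that the per-CPU exclusion is now a named lock that lockdep can track and that RT can turn into a sleeping lock, instead of a bare preempt_disable(). Side by side (a sketch of the conversion, not a third variant in the source):

	/* before: exclusion is an implicit preempt_disable() window */
	area = &get_cpu_var(zs_map_area);	/* preempt_disable() + this-CPU ptr */
	/* ... map/copy ... */
	put_cpu_var(zs_map_area);		/* preempt_enable() */

	/* after: exclusion is an explicit, annotatable per-CPU lock */
	local_lock(&zs_map_area.lock);
	area = this_cpu_ptr(&zs_map_area);
	/* ... map/copy ... */
	local_unlock(&zs_map_area.lock);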
---|
  .. |   ..
1812 | 1821 |  */
1813 | 1822 | static void lock_zspage(struct zspage *zspage)
1814 | 1823 | {
1815 |      | -	struct page *page = get_first_page(zspage);
     | 1824 | +	struct page *curr_page, *page;
1816 | 1825 |
1817 |      | -	do {
1818 |      | -		lock_page(page);
1819 |      | -	} while ((page = get_next_page(page)) != NULL);
     | 1826 | +	/*
     | 1827 | +	 * Pages we haven't locked yet can be migrated off the list while we're
     | 1828 | +	 * trying to lock them, so we need to be careful and only attempt to
     | 1829 | +	 * lock each page under migrate_read_lock(). Otherwise, the page we lock
     | 1830 | +	 * may no longer belong to the zspage. This means that we may wait for
     | 1831 | +	 * the wrong page to unlock, so we must take a reference to the page
     | 1832 | +	 * prior to waiting for it to unlock outside migrate_read_lock().
     | 1833 | +	 */
     | 1834 | +	while (1) {
     | 1835 | +		migrate_read_lock(zspage);
     | 1836 | +		page = get_first_page(zspage);
     | 1837 | +		if (trylock_page(page))
     | 1838 | +			break;
     | 1839 | +		get_page(page);
     | 1840 | +		migrate_read_unlock(zspage);
     | 1841 | +		wait_on_page_locked(page);
     | 1842 | +		put_page(page);
     | 1843 | +	}
     | 1844 | +
     | 1845 | +	curr_page = page;
     | 1846 | +	while ((page = get_next_page(curr_page))) {
     | 1847 | +		if (trylock_page(page)) {
     | 1848 | +			curr_page = page;
     | 1849 | +		} else {
     | 1850 | +			get_page(page);
     | 1851 | +			migrate_read_unlock(zspage);
     | 1852 | +			wait_on_page_locked(page);
     | 1853 | +			put_page(page);
     | 1854 | +			migrate_read_lock(zspage);
     | 1855 | +		}
     | 1856 | +	}
     | 1857 | +	migrate_read_unlock(zspage);
1820 | 1858 | }
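
The new lock_zspage() is an instance of a general idiom: never sleep waiting on an object unless you hold a reference that keeps it valid, and revalidate after every sleep because list membership may have changed while you slept. A hedged sketch of the shape, with hypothetical helpers standing in for the page and migrate-lock APIs:

struct obj;
struct obj_list;

/* all hypothetical stand-ins, not kernel API */
void list_read_lock(struct obj_list *l);
void list_read_unlock(struct obj_list *l);
struct obj *list_first(struct obj_list *l);
int obj_trylock(struct obj *o);
void obj_get(struct obj *o);
void obj_put(struct obj *o);
void obj_wait_unlocked(struct obj *o);

void lock_first(struct obj_list *list)
{
	struct obj *o;

	for (;;) {
		list_read_lock(list);	/* membership is stable here */
		o = list_first(list);
		if (obj_trylock(o)) {	/* cheap, never sleeps */
			list_read_unlock(list);
			return;		/* o is locked */
		}
		obj_get(o);		/* pin o across the sleep */
		list_read_unlock(list);
		obj_wait_unlocked(o);	/* may sleep; o may leave the list */
		obj_put(o);		/* drop the pin, then re-read */
	}
}

The second loop in lock_zspage() applies the same rule to every subsequent page, dropping and re-taking the migrate lock around each sleep.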
---|
1821 | 1859 |
1822 |      | -static struct dentry *zs_mount(struct file_system_type *fs_type,
1823 |      | -				int flags, const char *dev_name, void *data)
     | 1860 | +static int zs_init_fs_context(struct fs_context *fc)
1824 | 1861 | {
1825 |      | -	static const struct dentry_operations ops = {
1826 |      | -		.d_dname = simple_dname,
1827 |      | -	};
1828 |      | -
1829 |      | -	return mount_pseudo(fs_type, "zsmalloc:", NULL, &ops, ZSMALLOC_MAGIC);
     | 1862 | +	return init_pseudo(fc, ZSMALLOC_MAGIC) ? 0 : -ENOMEM;
1830 | 1863 | }
1831 | 1864 |
1832 | 1865 | static struct file_system_type zsmalloc_fs = {
1833 | 1866 | 	.name		= "zsmalloc",
1834 |      | -	.mount		= zs_mount,
     | 1867 | +	.init_fs_context = zs_init_fs_context,
1835 | 1868 | 	.kill_sb	= kill_anon_super,
1836 | 1869 | };
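
This hunk is the new-mount-API conversion: mount_pseudo() is replaced by an init_fs_context hook built on init_pseudo(). For reference, a minimal pseudo filesystem in the new style (the myfs_* names and magic are hypothetical; init_pseudo() returns the pseudo_fs_context, and custom dentry_operations, such as the simple_dname hook the old code set, can be attached through its dops field if needed):

#include <linux/fs_context.h>
#include <linux/pseudo_fs.h>

#define MYFS_MAGIC 0x4d594653	/* hypothetical */

static int myfs_init_fs_context(struct fs_context *fc)
{
	struct pseudo_fs_context *ctx = init_pseudo(fc, MYFS_MAGIC);

	if (!ctx)
		return -ENOMEM;
	/* optional: ctx->dops = &my_dentry_ops; */
	return 0;
}

static struct file_system_type myfs_type = {
	.name		  = "myfs",
	.init_fs_context  = myfs_init_fs_context,
	.kill_sb	  = kill_anon_super,
};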
---|
1837 | 1870 |
  .. |   ..
1856 | 1889 | 	rwlock_init(&zspage->lock);
1857 | 1890 | }
1858 | 1891 |
1859 |      | -static void migrate_read_lock(struct zspage *zspage)
     | 1892 | +static void migrate_read_lock(struct zspage *zspage) __acquires(&zspage->lock)
1860 | 1893 | {
1861 | 1894 | 	read_lock(&zspage->lock);
1862 | 1895 | }
1863 | 1896 |
1864 |      | -static void migrate_read_unlock(struct zspage *zspage)
     | 1897 | +static void migrate_read_unlock(struct zspage *zspage) __releases(&zspage->lock)
1865 | 1898 | {
1866 | 1899 | 	read_unlock(&zspage->lock);
1867 | 1900 | }
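
The __acquires()/__releases() markers added here (and on pin_tag()/unpin_tag() above) are sparse context annotations: they expand to nothing in a normal build, but a `make C=1` run uses them to verify that lock acquire/release pairs balance on every path. Roughly how the kernel defines them when __CHECKER__ is set (from compiler_types.h; exact spelling may vary by kernel version):

#ifdef __CHECKER__
# define __acquires(x)	__attribute__((context(x, 0, 1)))
# define __releases(x)	__attribute__((context(x, 1, 0)))
#else
# define __acquires(x)
# define __releases(x)
#endif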
---|