2024-05-10 ee930fffee469d076998274a2ca55e13dc1efb67
kernel/mm/z3fold.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * z3fold.c
  *
@@ -24,60 +25,25 @@
 
 #include <linux/atomic.h>
 #include <linux/sched.h>
+#include <linux/cpumask.h>
 #include <linux/list.h>
 #include <linux/mm.h>
 #include <linux/module.h>
+#include <linux/page-flags.h>
+#include <linux/migrate.h>
+#include <linux/node.h>
+#include <linux/compaction.h>
 #include <linux/percpu.h>
+#include <linux/mount.h>
+#include <linux/pseudo_fs.h>
+#include <linux/fs.h>
 #include <linux/preempt.h>
 #include <linux/workqueue.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/zpool.h>
-
-/*****************
- * Structures
-*****************/
-struct z3fold_pool;
-struct z3fold_ops {
-	int (*evict)(struct z3fold_pool *pool, unsigned long handle);
-};
-
-enum buddy {
-	HEADLESS = 0,
-	FIRST,
-	MIDDLE,
-	LAST,
-	BUDDIES_MAX
-};
-
-/*
- * struct z3fold_header - z3fold page metadata occupying first chunks of each
- * z3fold page, except for HEADLESS pages
- * @buddy: links the z3fold page into the relevant list in the
- * pool
- * @page_lock: per-page lock
- * @refcount: reference count for the z3fold page
- * @work: work_struct for page layout optimization
- * @pool: pointer to the pool which this page belongs to
- * @cpu: CPU which this page "belongs" to
- * @first_chunks: the size of the first buddy in chunks, 0 if free
- * @middle_chunks: the size of the middle buddy in chunks, 0 if free
- * @last_chunks: the size of the last buddy in chunks, 0 if free
- * @first_num: the starting number (for the first handle)
- */
-struct z3fold_header {
-	struct list_head buddy;
-	spinlock_t page_lock;
-	struct kref refcount;
-	struct work_struct work;
-	struct z3fold_pool *pool;
-	short cpu;
-	unsigned short first_chunks;
-	unsigned short middle_chunks;
-	unsigned short last_chunks;
-	unsigned short start_middle;
-	unsigned short first_num:2;
-};
+#include <linux/magic.h>
+#include <linux/kmemleak.h>
 
 /*
  * NCHUNKS_ORDER determines the internal allocation granularity, effectively
@@ -100,6 +66,68 @@
 
 #define BUDDY_MASK (0x3)
 #define BUDDY_SHIFT 2
+#define SLOTS_ALIGN (0x40)
+
+/*****************
+ * Structures
+*****************/
+struct z3fold_pool;
+struct z3fold_ops {
+	int (*evict)(struct z3fold_pool *pool, unsigned long handle);
+};
+
+enum buddy {
+	HEADLESS = 0,
+	FIRST,
+	MIDDLE,
+	LAST,
+	BUDDIES_MAX = LAST
+};
+
+struct z3fold_buddy_slots {
+	/*
+	 * we are using BUDDY_MASK in handle_to_buddy etc. so there should
+	 * be enough slots to hold all possible variants
+	 */
+	unsigned long slot[BUDDY_MASK + 1];
+	unsigned long pool; /* back link */
+	rwlock_t lock;
+};
+#define HANDLE_FLAG_MASK (0x03)
+
+/*
+ * struct z3fold_header - z3fold page metadata occupying first chunks of each
+ * z3fold page, except for HEADLESS pages
+ * @buddy: links the z3fold page into the relevant list in the
+ * pool
+ * @page_lock: per-page lock
+ * @refcount: reference count for the z3fold page
+ * @work: work_struct for page layout optimization
+ * @slots: pointer to the structure holding buddy slots
+ * @pool: pointer to the containing pool
+ * @cpu: CPU which this page "belongs" to
+ * @first_chunks: the size of the first buddy in chunks, 0 if free
+ * @middle_chunks: the size of the middle buddy in chunks, 0 if free
+ * @last_chunks: the size of the last buddy in chunks, 0 if free
+ * @first_num: the starting number (for the first handle)
+ * @mapped_count: the number of objects currently mapped
+ */
+struct z3fold_header {
+	struct list_head buddy;
+	spinlock_t page_lock;
+	struct kref refcount;
+	struct work_struct work;
+	struct z3fold_buddy_slots *slots;
+	struct z3fold_pool *pool;
+	short cpu;
+	unsigned short first_chunks;
+	unsigned short middle_chunks;
+	unsigned short last_chunks;
+	unsigned short start_middle;
+	unsigned short first_num:2;
+	unsigned short mapped_count:2;
+	unsigned short foreign_handles:2;
+};
 
 /**
  * struct z3fold_pool - stores metadata for each z3fold pool
@@ -113,11 +141,13 @@
  *	added buddy.
  * @stale: list of pages marked for freeing
  * @pages_nr: number of z3fold pages in the pool.
+ * @c_handle: cache for z3fold_buddy_slots allocation
  * @ops: pointer to a structure of user defined operations specified at
  *	pool creation time.
  * @compact_wq: workqueue for page layout background optimization
  * @release_wq: workqueue for safe page release
  * @work: work_struct for safe page release
+ * @inode: inode for z3fold pseudo filesystem
  *
  * This structure is allocated at pool creation time and maintains metadata
  * pertaining to a particular z3fold pool.
@@ -130,12 +160,14 @@
 	struct list_head lru;
 	struct list_head stale;
 	atomic64_t pages_nr;
+	struct kmem_cache *c_handle;
 	const struct z3fold_ops *ops;
 	struct zpool *zpool;
 	const struct zpool_ops *zpool_ops;
 	struct workqueue_struct *compact_wq;
 	struct workqueue_struct *release_wq;
 	struct work_struct work;
+	struct inode *inode;
 };
 
 /*
@@ -148,6 +180,19 @@
 	PAGE_STALE,
 	PAGE_CLAIMED, /* by either reclaim or free */
 };
+
+/*
+ * handle flags, go under HANDLE_FLAG_MASK
+ */
+enum z3fold_handle_flags {
+	HANDLES_NOFREE = 0,
+};
+
+/*
+ * Forward declarations
+ */
+static struct z3fold_header *__z3fold_alloc(struct z3fold_pool *, size_t, bool);
+static void compact_page_work(struct work_struct *w);
 
 /*****************
  * Helpers
@@ -162,39 +207,32 @@
 #define for_each_unbuddied_list(_iter, _begin) \
 	for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++)
 
-static void compact_page_work(struct work_struct *w);
-
-/* Initializes the z3fold header of a newly allocated z3fold page */
-static struct z3fold_header *init_z3fold_page(struct page *page,
-					struct z3fold_pool *pool)
+static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool,
+							gfp_t gfp)
 {
-	struct z3fold_header *zhdr = page_address(page);
+	struct z3fold_buddy_slots *slots;
 
-	INIT_LIST_HEAD(&page->lru);
-	clear_bit(PAGE_HEADLESS, &page->private);
-	clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
-	clear_bit(NEEDS_COMPACTING, &page->private);
-	clear_bit(PAGE_STALE, &page->private);
-	clear_bit(PAGE_CLAIMED, &page->private);
+	slots = kmem_cache_zalloc(pool->c_handle,
+				(gfp & ~(__GFP_HIGHMEM | __GFP_MOVABLE)));
 
-	spin_lock_init(&zhdr->page_lock);
-	kref_init(&zhdr->refcount);
-	zhdr->first_chunks = 0;
-	zhdr->middle_chunks = 0;
-	zhdr->last_chunks = 0;
-	zhdr->first_num = 0;
-	zhdr->start_middle = 0;
-	zhdr->cpu = -1;
-	zhdr->pool = pool;
-	INIT_LIST_HEAD(&zhdr->buddy);
-	INIT_WORK(&zhdr->work, compact_page_work);
-	return zhdr;
+	if (slots) {
+		/* It will be freed separately in free_handle(). */
+		kmemleak_not_leak(slots);
+		slots->pool = (unsigned long)pool;
+		rwlock_init(&slots->lock);
+	}
+
+	return slots;
 }
 
-/* Resets the struct page fields and frees the page */
-static void free_z3fold_page(struct page *page)
+static inline struct z3fold_pool *slots_to_pool(struct z3fold_buddy_slots *s)
 {
-	__free_page(page);
+	return (struct z3fold_pool *)(s->pool & ~HANDLE_FLAG_MASK);
+}
+
+static inline struct z3fold_buddy_slots *handle_to_slots(unsigned long handle)
+{
+	return (struct z3fold_buddy_slots *)(handle & ~(SLOTS_ALIGN - 1));
 }
 
 /* Lock a z3fold page */
@@ -215,33 +253,246 @@
 	spin_unlock(&zhdr->page_lock);
 }
 
+
+static inline struct z3fold_header *__get_z3fold_header(unsigned long handle,
+							bool lock)
+{
+	struct z3fold_buddy_slots *slots;
+	struct z3fold_header *zhdr;
+	int locked = 0;
+
+	if (!(handle & (1 << PAGE_HEADLESS))) {
+		slots = handle_to_slots(handle);
+		do {
+			unsigned long addr;
+
+			read_lock(&slots->lock);
+			addr = *(unsigned long *)handle;
+			zhdr = (struct z3fold_header *)(addr & PAGE_MASK);
+			if (lock)
+				locked = z3fold_page_trylock(zhdr);
+			read_unlock(&slots->lock);
+			if (locked)
+				break;
+			cpu_relax();
+		} while (lock);
+	} else {
+		zhdr = (struct z3fold_header *)(handle & PAGE_MASK);
+	}
+
+	return zhdr;
+}
+
+/* Returns the z3fold page where a given handle is stored */
+static inline struct z3fold_header *handle_to_z3fold_header(unsigned long h)
+{
+	return __get_z3fold_header(h, false);
+}
+
+/* return locked z3fold page if it's not headless */
+static inline struct z3fold_header *get_z3fold_header(unsigned long h)
+{
+	return __get_z3fold_header(h, true);
+}
+
+static inline void put_z3fold_header(struct z3fold_header *zhdr)
+{
+	struct page *page = virt_to_page(zhdr);
+
+	if (!test_bit(PAGE_HEADLESS, &page->private))
+		z3fold_page_unlock(zhdr);
+}
+
+static inline void free_handle(unsigned long handle, struct z3fold_header *zhdr)
+{
+	struct z3fold_buddy_slots *slots;
+	int i;
+	bool is_free;
+
+	if (handle & (1 << PAGE_HEADLESS))
+		return;
+
+	if (WARN_ON(*(unsigned long *)handle == 0))
+		return;
+
+	slots = handle_to_slots(handle);
+	write_lock(&slots->lock);
+	*(unsigned long *)handle = 0;
+
+	if (test_bit(HANDLES_NOFREE, &slots->pool)) {
+		write_unlock(&slots->lock);
+		return; /* simple case, nothing else to do */
+	}
+
+	if (zhdr->slots != slots)
+		zhdr->foreign_handles--;
+
+	is_free = true;
+	for (i = 0; i <= BUDDY_MASK; i++) {
+		if (slots->slot[i]) {
+			is_free = false;
+			break;
+		}
+	}
+	write_unlock(&slots->lock);
+
+	if (is_free) {
+		struct z3fold_pool *pool = slots_to_pool(slots);
+
+		if (zhdr->slots == slots)
+			zhdr->slots = NULL;
+		kmem_cache_free(pool->c_handle, slots);
+	}
+}
+
+static int z3fold_init_fs_context(struct fs_context *fc)
+{
+	return init_pseudo(fc, Z3FOLD_MAGIC) ? 0 : -ENOMEM;
+}
+
+static struct file_system_type z3fold_fs = {
+	.name = "z3fold",
+	.init_fs_context = z3fold_init_fs_context,
+	.kill_sb = kill_anon_super,
+};
+
+static struct vfsmount *z3fold_mnt;
+static int z3fold_mount(void)
+{
+	int ret = 0;
+
+	z3fold_mnt = kern_mount(&z3fold_fs);
+	if (IS_ERR(z3fold_mnt))
+		ret = PTR_ERR(z3fold_mnt);
+
+	return ret;
+}
+
+static void z3fold_unmount(void)
+{
+	kern_unmount(z3fold_mnt);
+}
+
+static const struct address_space_operations z3fold_aops;
+static int z3fold_register_migration(struct z3fold_pool *pool)
+{
+	pool->inode = alloc_anon_inode(z3fold_mnt->mnt_sb);
+	if (IS_ERR(pool->inode)) {
+		pool->inode = NULL;
+		return 1;
+	}
+
+	pool->inode->i_mapping->private_data = pool;
+	pool->inode->i_mapping->a_ops = &z3fold_aops;
+	return 0;
+}
+
+static void z3fold_unregister_migration(struct z3fold_pool *pool)
+{
+	if (pool->inode)
+		iput(pool->inode);
+ }
+
+/* Initializes the z3fold header of a newly allocated z3fold page */
+static struct z3fold_header *init_z3fold_page(struct page *page, bool headless,
+					struct z3fold_pool *pool, gfp_t gfp)
+{
+	struct z3fold_header *zhdr = page_address(page);
+	struct z3fold_buddy_slots *slots;
+
+	INIT_LIST_HEAD(&page->lru);
+	clear_bit(PAGE_HEADLESS, &page->private);
+	clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
+	clear_bit(NEEDS_COMPACTING, &page->private);
+	clear_bit(PAGE_STALE, &page->private);
+	clear_bit(PAGE_CLAIMED, &page->private);
+	if (headless)
+		return zhdr;
+
+	slots = alloc_slots(pool, gfp);
+	if (!slots)
+		return NULL;
+
+	spin_lock_init(&zhdr->page_lock);
+	kref_init(&zhdr->refcount);
+	zhdr->first_chunks = 0;
+	zhdr->middle_chunks = 0;
+	zhdr->last_chunks = 0;
+	zhdr->first_num = 0;
+	zhdr->start_middle = 0;
+	zhdr->cpu = -1;
+	zhdr->foreign_handles = 0;
+	zhdr->mapped_count = 0;
+	zhdr->slots = slots;
+	zhdr->pool = pool;
+	INIT_LIST_HEAD(&zhdr->buddy);
+	INIT_WORK(&zhdr->work, compact_page_work);
+	return zhdr;
+}
+
+/* Resets the struct page fields and frees the page */
+static void free_z3fold_page(struct page *page, bool headless)
+{
+	if (!headless) {
+		lock_page(page);
+		__ClearPageMovable(page);
+		unlock_page(page);
+	}
+	ClearPagePrivate(page);
+	__free_page(page);
+}
+
+/* Helper function to build the index */
+static inline int __idx(struct z3fold_header *zhdr, enum buddy bud)
+{
+	return (bud + zhdr->first_num) & BUDDY_MASK;
+}
+
 /*
  * Encodes the handle of a particular buddy within a z3fold page
  * Pool lock should be held as this function accesses first_num
  */
-static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud)
+static unsigned long __encode_handle(struct z3fold_header *zhdr,
+				struct z3fold_buddy_slots *slots,
+				enum buddy bud)
 {
-	unsigned long handle;
+	unsigned long h = (unsigned long)zhdr;
+	int idx = 0;
 
-	handle = (unsigned long)zhdr;
-	if (bud != HEADLESS) {
-		handle |= (bud + zhdr->first_num) & BUDDY_MASK;
-		if (bud == LAST)
-			handle |= (zhdr->last_chunks << BUDDY_SHIFT);
-	}
-	return handle;
+	/*
+	 * For a headless page, its handle is its pointer with the extra
+	 * PAGE_HEADLESS bit set
+	 */
+	if (bud == HEADLESS)
+		return h | (1 << PAGE_HEADLESS);
+
+	/* otherwise, return pointer to encoded handle */
+	idx = __idx(zhdr, bud);
+	h += idx;
+	if (bud == LAST)
+		h |= (zhdr->last_chunks << BUDDY_SHIFT);
+
+	write_lock(&slots->lock);
+	slots->slot[idx] = h;
+	write_unlock(&slots->lock);
+	return (unsigned long)&slots->slot[idx];
 }
 
-/* Returns the z3fold page where a given handle is stored */
-static struct z3fold_header *handle_to_z3fold_header(unsigned long handle)
+static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud)
 {
-	return (struct z3fold_header *)(handle & PAGE_MASK);
+	return __encode_handle(zhdr, zhdr->slots, bud);
 }
 
 /* only for LAST bud, returns zero otherwise */
 static unsigned short handle_to_chunks(unsigned long handle)
 {
-	return (handle & ~PAGE_MASK) >> BUDDY_SHIFT;
+	struct z3fold_buddy_slots *slots = handle_to_slots(handle);
+	unsigned long addr;
+
+	read_lock(&slots->lock);
+	addr = *(unsigned long *)handle;
+	read_unlock(&slots->lock);
+	return (addr & ~PAGE_MASK) >> BUDDY_SHIFT;
 }
 
 /*
....@@ -251,24 +502,39 @@
251502 */
252503 static enum buddy handle_to_buddy(unsigned long handle)
253504 {
254
- struct z3fold_header *zhdr = handle_to_z3fold_header(handle);
255
- return (handle - zhdr->first_num) & BUDDY_MASK;
505
+ struct z3fold_header *zhdr;
506
+ struct z3fold_buddy_slots *slots = handle_to_slots(handle);
507
+ unsigned long addr;
508
+
509
+ read_lock(&slots->lock);
510
+ WARN_ON(handle & (1 << PAGE_HEADLESS));
511
+ addr = *(unsigned long *)handle;
512
+ read_unlock(&slots->lock);
513
+ zhdr = (struct z3fold_header *)(addr & PAGE_MASK);
514
+ return (addr - zhdr->first_num) & BUDDY_MASK;
515
+}
516
+
517
+static inline struct z3fold_pool *zhdr_to_pool(struct z3fold_header *zhdr)
518
+{
519
+ return zhdr->pool;
256520 }
257521
258522 static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
259523 {
260524 struct page *page = virt_to_page(zhdr);
261
- struct z3fold_pool *pool = zhdr->pool;
525
+ struct z3fold_pool *pool = zhdr_to_pool(zhdr);
262526
263527 WARN_ON(!list_empty(&zhdr->buddy));
264528 set_bit(PAGE_STALE, &page->private);
265529 clear_bit(NEEDS_COMPACTING, &page->private);
266530 spin_lock(&pool->lock);
267531 if (!list_empty(&page->lru))
268
- list_del(&page->lru);
532
+ list_del_init(&page->lru);
269533 spin_unlock(&pool->lock);
534
+
270535 if (locked)
271536 z3fold_page_unlock(zhdr);
537
+
272538 spin_lock(&pool->stale_lock);
273539 list_add(&zhdr->buddy, &pool->stale);
274540 queue_work(pool->release_wq, &pool->work);
....@@ -295,9 +561,11 @@
295561 {
296562 struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
297563 refcount);
298
- spin_lock(&zhdr->pool->lock);
564
+ struct z3fold_pool *pool = zhdr_to_pool(zhdr);
565
+
566
+ spin_lock(&pool->lock);
299567 list_del_init(&zhdr->buddy);
300
- spin_unlock(&zhdr->pool->lock);
568
+ spin_unlock(&pool->lock);
301569
302570 WARN_ON(z3fold_page_trylock(zhdr));
303571 __release_z3fold_page(zhdr, true);
....@@ -318,7 +586,7 @@
318586 continue;
319587 spin_unlock(&pool->stale_lock);
320588 cancel_work_sync(&zhdr->work);
321
- free_z3fold_page(page);
589
+ free_z3fold_page(page, false);
322590 cond_resched();
323591 spin_lock(&pool->stale_lock);
324592 }
....@@ -349,6 +617,45 @@
349617 return nfree;
350618 }
351619
620
+/* Add to the appropriate unbuddied list */
621
+static inline void add_to_unbuddied(struct z3fold_pool *pool,
622
+ struct z3fold_header *zhdr)
623
+{
624
+ if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 ||
625
+ zhdr->middle_chunks == 0) {
626
+ struct list_head *unbuddied = get_cpu_ptr(pool->unbuddied);
627
+
628
+ int freechunks = num_free_chunks(zhdr);
629
+ spin_lock(&pool->lock);
630
+ list_add(&zhdr->buddy, &unbuddied[freechunks]);
631
+ spin_unlock(&pool->lock);
632
+ zhdr->cpu = smp_processor_id();
633
+ put_cpu_ptr(pool->unbuddied);
634
+ }
635
+}
636
+
637
+static inline enum buddy get_free_buddy(struct z3fold_header *zhdr, int chunks)
638
+{
639
+ enum buddy bud = HEADLESS;
640
+
641
+ if (zhdr->middle_chunks) {
642
+ if (!zhdr->first_chunks &&
643
+ chunks <= zhdr->start_middle - ZHDR_CHUNKS)
644
+ bud = FIRST;
645
+ else if (!zhdr->last_chunks)
646
+ bud = LAST;
647
+ } else {
648
+ if (!zhdr->first_chunks)
649
+ bud = FIRST;
650
+ else if (!zhdr->last_chunks)
651
+ bud = LAST;
652
+ else
653
+ bud = MIDDLE;
654
+ }
655
+
656
+ return bud;
657
+}
658
+
352659 static inline void *mchunk_memmove(struct z3fold_header *zhdr,
353660 unsigned short dst_chunk)
354661 {
....@@ -356,6 +663,108 @@
356663 return memmove(beg + (dst_chunk << CHUNK_SHIFT),
357664 beg + (zhdr->start_middle << CHUNK_SHIFT),
358665 zhdr->middle_chunks << CHUNK_SHIFT);
666
+}
667
+
668
+static inline bool buddy_single(struct z3fold_header *zhdr)
669
+{
670
+ return !((zhdr->first_chunks && zhdr->middle_chunks) ||
671
+ (zhdr->first_chunks && zhdr->last_chunks) ||
672
+ (zhdr->middle_chunks && zhdr->last_chunks));
673
+}
674
+
675
+static struct z3fold_header *compact_single_buddy(struct z3fold_header *zhdr)
676
+{
677
+ struct z3fold_pool *pool = zhdr_to_pool(zhdr);
678
+ void *p = zhdr;
679
+ unsigned long old_handle = 0;
680
+ size_t sz = 0;
681
+ struct z3fold_header *new_zhdr = NULL;
682
+ int first_idx = __idx(zhdr, FIRST);
683
+ int middle_idx = __idx(zhdr, MIDDLE);
684
+ int last_idx = __idx(zhdr, LAST);
685
+ unsigned short *moved_chunks = NULL;
686
+
687
+ /*
688
+ * No need to protect slots here -- all the slots are "local" and
689
+ * the page lock is already taken
690
+ */
691
+ if (zhdr->first_chunks && zhdr->slots->slot[first_idx]) {
692
+ p += ZHDR_SIZE_ALIGNED;
693
+ sz = zhdr->first_chunks << CHUNK_SHIFT;
694
+ old_handle = (unsigned long)&zhdr->slots->slot[first_idx];
695
+ moved_chunks = &zhdr->first_chunks;
696
+ } else if (zhdr->middle_chunks && zhdr->slots->slot[middle_idx]) {
697
+ p += zhdr->start_middle << CHUNK_SHIFT;
698
+ sz = zhdr->middle_chunks << CHUNK_SHIFT;
699
+ old_handle = (unsigned long)&zhdr->slots->slot[middle_idx];
700
+ moved_chunks = &zhdr->middle_chunks;
701
+ } else if (zhdr->last_chunks && zhdr->slots->slot[last_idx]) {
702
+ p += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT);
703
+ sz = zhdr->last_chunks << CHUNK_SHIFT;
704
+ old_handle = (unsigned long)&zhdr->slots->slot[last_idx];
705
+ moved_chunks = &zhdr->last_chunks;
706
+ }
707
+
708
+ if (sz > 0) {
709
+ enum buddy new_bud = HEADLESS;
710
+ short chunks = size_to_chunks(sz);
711
+ void *q;
712
+
713
+ new_zhdr = __z3fold_alloc(pool, sz, false);
714
+ if (!new_zhdr)
715
+ return NULL;
716
+
717
+ if (WARN_ON(new_zhdr == zhdr))
718
+ goto out_fail;
719
+
720
+ new_bud = get_free_buddy(new_zhdr, chunks);
721
+ q = new_zhdr;
722
+ switch (new_bud) {
723
+ case FIRST:
724
+ new_zhdr->first_chunks = chunks;
725
+ q += ZHDR_SIZE_ALIGNED;
726
+ break;
727
+ case MIDDLE:
728
+ new_zhdr->middle_chunks = chunks;
729
+ new_zhdr->start_middle =
730
+ new_zhdr->first_chunks + ZHDR_CHUNKS;
731
+ q += new_zhdr->start_middle << CHUNK_SHIFT;
732
+ break;
733
+ case LAST:
734
+ new_zhdr->last_chunks = chunks;
735
+ q += PAGE_SIZE - (new_zhdr->last_chunks << CHUNK_SHIFT);
736
+ break;
737
+ default:
738
+ goto out_fail;
739
+ }
740
+ new_zhdr->foreign_handles++;
741
+ memcpy(q, p, sz);
742
+ write_lock(&zhdr->slots->lock);
743
+ *(unsigned long *)old_handle = (unsigned long)new_zhdr +
744
+ __idx(new_zhdr, new_bud);
745
+ if (new_bud == LAST)
746
+ *(unsigned long *)old_handle |=
747
+ (new_zhdr->last_chunks << BUDDY_SHIFT);
748
+ write_unlock(&zhdr->slots->lock);
749
+ add_to_unbuddied(pool, new_zhdr);
750
+ z3fold_page_unlock(new_zhdr);
751
+
752
+ *moved_chunks = 0;
753
+ }
754
+
755
+ return new_zhdr;
756
+
757
+out_fail:
758
+ if (new_zhdr) {
759
+ if (kref_put(&new_zhdr->refcount, release_z3fold_page_locked))
760
+ atomic64_dec(&pool->pages_nr);
761
+ else {
762
+ add_to_unbuddied(pool, new_zhdr);
763
+ z3fold_page_unlock(new_zhdr);
764
+ }
765
+ }
766
+ return NULL;
767
+
359768 }
360769
361770 #define BIG_CHUNK_GAP 3
....@@ -366,6 +775,9 @@
366775
367776 if (test_bit(MIDDLE_CHUNK_MAPPED, &page->private))
368777 return 0; /* can't move middle chunk, it's used */
778
+
779
+ if (unlikely(PageIsolated(page)))
780
+ return 0;
369781
370782 if (zhdr->middle_chunks == 0)
371783 return 0; /* nothing to compact */
....@@ -406,10 +818,8 @@
406818
407819 static void do_compact_page(struct z3fold_header *zhdr, bool locked)
408820 {
409
- struct z3fold_pool *pool = zhdr->pool;
821
+ struct z3fold_pool *pool = zhdr_to_pool(zhdr);
410822 struct page *page;
411
- struct list_head *unbuddied;
412
- int fchunks;
413823
414824 page = virt_to_page(zhdr);
415825 if (locked)
....@@ -429,19 +839,26 @@
429839 return;
430840 }
431841
432
- z3fold_compact_page(zhdr);
433
- unbuddied = get_cpu_ptr(pool->unbuddied);
434
- fchunks = num_free_chunks(zhdr);
435
- if (fchunks < NCHUNKS &&
436
- (!zhdr->first_chunks || !zhdr->middle_chunks ||
437
- !zhdr->last_chunks)) {
438
- /* the page's not completely free and it's unbuddied */
439
- spin_lock(&pool->lock);
440
- list_add(&zhdr->buddy, &unbuddied[fchunks]);
441
- spin_unlock(&pool->lock);
442
- zhdr->cpu = smp_processor_id();
842
+ if (test_bit(PAGE_STALE, &page->private) ||
843
+ test_and_set_bit(PAGE_CLAIMED, &page->private)) {
844
+ z3fold_page_unlock(zhdr);
845
+ return;
443846 }
444
- put_cpu_ptr(pool->unbuddied);
847
+
848
+ if (!zhdr->foreign_handles && buddy_single(zhdr) &&
849
+ zhdr->mapped_count == 0 && compact_single_buddy(zhdr)) {
850
+ if (kref_put(&zhdr->refcount, release_z3fold_page_locked))
851
+ atomic64_dec(&pool->pages_nr);
852
+ else {
853
+ clear_bit(PAGE_CLAIMED, &page->private);
854
+ z3fold_page_unlock(zhdr);
855
+ }
856
+ return;
857
+ }
858
+
859
+ z3fold_compact_page(zhdr);
860
+ add_to_unbuddied(pool, zhdr);
861
+ clear_bit(PAGE_CLAIMED, &page->private);
445862 z3fold_page_unlock(zhdr);
446863 }
447864
....@@ -453,6 +870,108 @@
453870 do_compact_page(zhdr, false);
454871 }
455872
873
+/* returns _locked_ z3fold page header or NULL */
874
+static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool,
875
+ size_t size, bool can_sleep)
876
+{
877
+ struct z3fold_header *zhdr = NULL;
878
+ struct page *page;
879
+ struct list_head *unbuddied;
880
+ int chunks = size_to_chunks(size), i;
881
+
882
+lookup:
883
+ /* First, try to find an unbuddied z3fold page. */
884
+ unbuddied = get_cpu_ptr(pool->unbuddied);
885
+ for_each_unbuddied_list(i, chunks) {
886
+ struct list_head *l = &unbuddied[i];
887
+
888
+ zhdr = list_first_entry_or_null(READ_ONCE(l),
889
+ struct z3fold_header, buddy);
890
+
891
+ if (!zhdr)
892
+ continue;
893
+
894
+ /* Re-check under lock. */
895
+ spin_lock(&pool->lock);
896
+ l = &unbuddied[i];
897
+ if (unlikely(zhdr != list_first_entry(READ_ONCE(l),
898
+ struct z3fold_header, buddy)) ||
899
+ !z3fold_page_trylock(zhdr)) {
900
+ spin_unlock(&pool->lock);
901
+ zhdr = NULL;
902
+ put_cpu_ptr(pool->unbuddied);
903
+ if (can_sleep)
904
+ cond_resched();
905
+ goto lookup;
906
+ }
907
+ list_del_init(&zhdr->buddy);
908
+ zhdr->cpu = -1;
909
+ spin_unlock(&pool->lock);
910
+
911
+ page = virt_to_page(zhdr);
912
+ if (test_bit(NEEDS_COMPACTING, &page->private) ||
913
+ test_bit(PAGE_CLAIMED, &page->private)) {
914
+ z3fold_page_unlock(zhdr);
915
+ zhdr = NULL;
916
+ put_cpu_ptr(pool->unbuddied);
917
+ if (can_sleep)
918
+ cond_resched();
919
+ goto lookup;
920
+ }
921
+
922
+ /*
923
+ * this page could not be removed from its unbuddied
924
+ * list while pool lock was held, and then we've taken
925
+ * page lock so kref_put could not be called before
926
+ * we got here, so it's safe to just call kref_get()
927
+ */
928
+ kref_get(&zhdr->refcount);
929
+ break;
930
+ }
931
+ put_cpu_ptr(pool->unbuddied);
932
+
933
+ if (!zhdr) {
934
+ int cpu;
935
+
936
+ /* look for _exact_ match on other cpus' lists */
937
+ for_each_online_cpu(cpu) {
938
+ struct list_head *l;
939
+
940
+ unbuddied = per_cpu_ptr(pool->unbuddied, cpu);
941
+ spin_lock(&pool->lock);
942
+ l = &unbuddied[chunks];
943
+
944
+ zhdr = list_first_entry_or_null(READ_ONCE(l),
945
+ struct z3fold_header, buddy);
946
+
947
+ if (!zhdr || !z3fold_page_trylock(zhdr)) {
948
+ spin_unlock(&pool->lock);
949
+ zhdr = NULL;
950
+ continue;
951
+ }
952
+ list_del_init(&zhdr->buddy);
953
+ zhdr->cpu = -1;
954
+ spin_unlock(&pool->lock);
955
+
956
+ page = virt_to_page(zhdr);
957
+ if (test_bit(NEEDS_COMPACTING, &page->private) ||
958
+ test_bit(PAGE_CLAIMED, &page->private)) {
959
+ z3fold_page_unlock(zhdr);
960
+ zhdr = NULL;
961
+ if (can_sleep)
962
+ cond_resched();
963
+ continue;
964
+ }
965
+ kref_get(&zhdr->refcount);
966
+ break;
967
+ }
968
+ }
969
+
970
+ if (zhdr && !zhdr->slots)
971
+ zhdr->slots = alloc_slots(pool,
972
+ can_sleep ? GFP_NOIO : GFP_ATOMIC);
973
+ return zhdr;
974
+}
456975
457976 /*
458977 * API Functions
....@@ -476,6 +995,11 @@
476995 pool = kzalloc(sizeof(struct z3fold_pool), gfp);
477996 if (!pool)
478997 goto out;
998
+ pool->c_handle = kmem_cache_create("z3fold_handle",
999
+ sizeof(struct z3fold_buddy_slots),
1000
+ SLOTS_ALIGN, 0, NULL);
1001
+ if (!pool->c_handle)
1002
+ goto out_c;
4791003 spin_lock_init(&pool->lock);
4801004 spin_lock_init(&pool->stale_lock);
4811005 pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2);
....@@ -497,15 +1021,21 @@
4971021 pool->release_wq = create_singlethread_workqueue(pool->name);
4981022 if (!pool->release_wq)
4991023 goto out_wq;
1024
+ if (z3fold_register_migration(pool))
1025
+ goto out_rwq;
5001026 INIT_WORK(&pool->work, free_pages_work);
5011027 pool->ops = ops;
5021028 return pool;
5031029
1030
+out_rwq:
1031
+ destroy_workqueue(pool->release_wq);
5041032 out_wq:
5051033 destroy_workqueue(pool->compact_wq);
5061034 out_unbuddied:
5071035 free_percpu(pool->unbuddied);
5081036 out_pool:
1037
+ kmem_cache_destroy(pool->c_handle);
1038
+out_c:
5091039 kfree(pool);
5101040 out:
5111041 return NULL;
....@@ -519,8 +1049,21 @@
5191049 */
5201050 static void z3fold_destroy_pool(struct z3fold_pool *pool)
5211051 {
522
- destroy_workqueue(pool->release_wq);
1052
+ kmem_cache_destroy(pool->c_handle);
1053
+
1054
+ /*
1055
+ * We need to destroy pool->compact_wq before pool->release_wq,
1056
+ * as any pending work on pool->compact_wq will call
1057
+ * queue_work(pool->release_wq, &pool->work).
1058
+ *
1059
+ * There are still outstanding pages until both workqueues are drained,
1060
+ * so we cannot unregister migration until then.
1061
+ */
1062
+
5231063 destroy_workqueue(pool->compact_wq);
1064
+ destroy_workqueue(pool->release_wq);
1065
+ z3fold_unregister_migration(pool);
1066
+ free_percpu(pool->unbuddied);
5241067 kfree(pool);
5251068 }
5261069
....@@ -546,13 +1089,13 @@
5461089 static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
5471090 unsigned long *handle)
5481091 {
549
- int chunks = 0, i, freechunks;
1092
+ int chunks = size_to_chunks(size);
5501093 struct z3fold_header *zhdr = NULL;
5511094 struct page *page = NULL;
5521095 enum buddy bud;
5531096 bool can_sleep = gfpflags_allow_blocking(gfp);
5541097
555
- if (!size || (gfp & __GFP_HIGHMEM))
1098
+ if (!size)
5561099 return -EINVAL;
5571100
5581101 if (size > PAGE_SIZE)
....@@ -561,68 +1104,11 @@
5611104 if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE)
5621105 bud = HEADLESS;
5631106 else {
564
- struct list_head *unbuddied;
565
- chunks = size_to_chunks(size);
566
-
567
-lookup:
568
- /* First, try to find an unbuddied z3fold page. */
569
- unbuddied = get_cpu_ptr(pool->unbuddied);
570
- for_each_unbuddied_list(i, chunks) {
571
- struct list_head *l = &unbuddied[i];
572
-
573
- zhdr = list_first_entry_or_null(READ_ONCE(l),
574
- struct z3fold_header, buddy);
575
-
576
- if (!zhdr)
577
- continue;
578
-
579
- /* Re-check under lock. */
580
- spin_lock(&pool->lock);
581
- l = &unbuddied[i];
582
- if (unlikely(zhdr != list_first_entry(READ_ONCE(l),
583
- struct z3fold_header, buddy)) ||
584
- !z3fold_page_trylock(zhdr)) {
585
- spin_unlock(&pool->lock);
586
- put_cpu_ptr(pool->unbuddied);
587
- goto lookup;
588
- }
589
- list_del_init(&zhdr->buddy);
590
- zhdr->cpu = -1;
591
- spin_unlock(&pool->lock);
592
-
593
- page = virt_to_page(zhdr);
594
- if (test_bit(NEEDS_COMPACTING, &page->private)) {
595
- z3fold_page_unlock(zhdr);
596
- zhdr = NULL;
597
- put_cpu_ptr(pool->unbuddied);
598
- if (can_sleep)
599
- cond_resched();
600
- goto lookup;
601
- }
602
-
603
- /*
604
- * this page could not be removed from its unbuddied
605
- * list while pool lock was held, and then we've taken
606
- * page lock so kref_put could not be called before
607
- * we got here, so it's safe to just call kref_get()
608
- */
609
- kref_get(&zhdr->refcount);
610
- break;
611
- }
612
- put_cpu_ptr(pool->unbuddied);
613
-
1107
+retry:
1108
+ zhdr = __z3fold_alloc(pool, size, can_sleep);
6141109 if (zhdr) {
615
- if (zhdr->first_chunks == 0) {
616
- if (zhdr->middle_chunks != 0 &&
617
- chunks >= zhdr->start_middle)
618
- bud = LAST;
619
- else
620
- bud = FIRST;
621
- } else if (zhdr->last_chunks == 0)
622
- bud = LAST;
623
- else if (zhdr->middle_chunks == 0)
624
- bud = MIDDLE;
625
- else {
1110
+ bud = get_free_buddy(zhdr, chunks);
1111
+ if (bud == HEADLESS) {
6261112 if (kref_put(&zhdr->refcount,
6271113 release_z3fold_page_locked))
6281114 atomic64_dec(&pool->pages_nr);
....@@ -630,8 +1116,9 @@
6301116 z3fold_page_unlock(zhdr);
6311117 pr_err("No free chunks in unbuddied\n");
6321118 WARN_ON(1);
633
- goto lookup;
1119
+ goto retry;
6341120 }
1121
+ page = virt_to_page(zhdr);
6351122 goto found;
6361123 }
6371124 bud = FIRST;
....@@ -662,12 +1149,26 @@
6621149 if (!page)
6631150 return -ENOMEM;
6641151
1152
+ zhdr = init_z3fold_page(page, bud == HEADLESS, pool, gfp);
1153
+ if (!zhdr) {
1154
+ __free_page(page);
1155
+ return -ENOMEM;
1156
+ }
6651157 atomic64_inc(&pool->pages_nr);
666
- zhdr = init_z3fold_page(page, pool);
6671158
6681159 if (bud == HEADLESS) {
6691160 set_bit(PAGE_HEADLESS, &page->private);
6701161 goto headless;
1162
+ }
1163
+ if (can_sleep) {
1164
+ lock_page(page);
1165
+ __SetPageMovable(page, pool->inode->i_mapping);
1166
+ unlock_page(page);
1167
+ } else {
1168
+ if (trylock_page(page)) {
1169
+ __SetPageMovable(page, pool->inode->i_mapping);
1170
+ unlock_page(page);
1171
+ }
6711172 }
6721173 z3fold_page_lock(zhdr);
6731174
....@@ -680,19 +1181,7 @@
6801181 zhdr->middle_chunks = chunks;
6811182 zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS;
6821183 }
683
-
684
- if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 ||
685
- zhdr->middle_chunks == 0) {
686
- struct list_head *unbuddied = get_cpu_ptr(pool->unbuddied);
687
-
688
- /* Add to unbuddied list */
689
- freechunks = num_free_chunks(zhdr);
690
- spin_lock(&pool->lock);
691
- list_add(&zhdr->buddy, &unbuddied[freechunks]);
692
- spin_unlock(&pool->lock);
693
- zhdr->cpu = smp_processor_id();
694
- put_cpu_ptr(pool->unbuddied);
695
- }
1184
+ add_to_unbuddied(pool, zhdr);
6961185
6971186 headless:
6981187 spin_lock(&pool->lock);
....@@ -725,9 +1214,11 @@
7251214 struct z3fold_header *zhdr;
7261215 struct page *page;
7271216 enum buddy bud;
1217
+ bool page_claimed;
7281218
729
- zhdr = handle_to_z3fold_header(handle);
1219
+ zhdr = get_z3fold_header(handle);
7301220 page = virt_to_page(zhdr);
1221
+ page_claimed = test_and_set_bit(PAGE_CLAIMED, &page->private);
7311222
7321223 if (test_bit(PAGE_HEADLESS, &page->private)) {
7331224 /* if a headless page is under reclaim, just leave.
....@@ -735,18 +1226,18 @@
7351226 * has not been set before, we release this page
7361227 * immediately so we don't care about its value any more.
7371228 */
738
- if (!test_and_set_bit(PAGE_CLAIMED, &page->private)) {
1229
+ if (!page_claimed) {
7391230 spin_lock(&pool->lock);
7401231 list_del(&page->lru);
7411232 spin_unlock(&pool->lock);
742
- free_z3fold_page(page);
1233
+ put_z3fold_header(zhdr);
1234
+ free_z3fold_page(page, true);
7431235 atomic64_dec(&pool->pages_nr);
7441236 }
7451237 return;
7461238 }
7471239
7481240 /* Non-headless case */
749
- z3fold_page_lock(zhdr);
7501241 bud = handle_to_buddy(handle);
7511242
7521243 switch (bud) {
....@@ -762,20 +1253,24 @@
7621253 default:
7631254 pr_err("%s: unknown bud %d\n", __func__, bud);
7641255 WARN_ON(1);
765
- z3fold_page_unlock(zhdr);
1256
+ put_z3fold_header(zhdr);
7661257 return;
7671258 }
7681259
1260
+ if (!page_claimed)
1261
+ free_handle(handle, zhdr);
7691262 if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list)) {
7701263 atomic64_dec(&pool->pages_nr);
7711264 return;
7721265 }
773
- if (test_bit(PAGE_CLAIMED, &page->private)) {
1266
+ if (page_claimed) {
1267
+ /* the page has not been claimed by us */
7741268 z3fold_page_unlock(zhdr);
7751269 return;
7761270 }
7771271 if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
778
- z3fold_page_unlock(zhdr);
1272
+ put_z3fold_header(zhdr);
1273
+ clear_bit(PAGE_CLAIMED, &page->private);
7791274 return;
7801275 }
7811276 if (zhdr->cpu < 0 || !cpu_online(zhdr->cpu)) {
....@@ -784,12 +1279,14 @@
7841279 spin_unlock(&pool->lock);
7851280 zhdr->cpu = -1;
7861281 kref_get(&zhdr->refcount);
1282
+ clear_bit(PAGE_CLAIMED, &page->private);
7871283 do_compact_page(zhdr, true);
7881284 return;
7891285 }
7901286 kref_get(&zhdr->refcount);
1287
+ clear_bit(PAGE_CLAIMED, &page->private);
7911288 queue_work_on(zhdr->cpu, pool->compact_wq, &zhdr->work);
792
- z3fold_page_unlock(zhdr);
1289
+ put_z3fold_header(zhdr);
7931290 }
7941291
7951292 /**
....@@ -830,11 +1327,15 @@
8301327 */
8311328 static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
8321329 {
833
- int i, ret = 0;
1330
+ int i, ret = -1;
8341331 struct z3fold_header *zhdr = NULL;
8351332 struct page *page = NULL;
8361333 struct list_head *pos;
8371334 unsigned long first_handle = 0, middle_handle = 0, last_handle = 0;
1335
+ struct z3fold_buddy_slots slots __attribute__((aligned(SLOTS_ALIGN)));
1336
+
1337
+ rwlock_init(&slots.lock);
1338
+ slots.pool = (unsigned long)pool | (1 << HANDLES_NOFREE);
8381339
8391340 spin_lock(&pool->lock);
8401341 if (!pool->ops || !pool->ops->evict || retries == 0) {
....@@ -849,21 +1350,50 @@
8491350 list_for_each_prev(pos, &pool->lru) {
8501351 page = list_entry(pos, struct page, lru);
8511352
852
- /* this bit could have been set by free, in which case
853
- * we pass over to the next page in the pool.
854
- */
855
- if (test_and_set_bit(PAGE_CLAIMED, &page->private))
856
- continue;
857
-
8581353 zhdr = page_address(page);
859
- if (test_bit(PAGE_HEADLESS, &page->private))
860
- break;
1354
+ if (test_bit(PAGE_HEADLESS, &page->private)) {
1355
+ /*
1356
+ * For non-headless pages, we wait to do this
1357
+ * until we have the page lock to avoid racing
1358
+ * with __z3fold_alloc(). Headless pages don't
1359
+ * have a lock (and __z3fold_alloc() will never
1360
+ * see them), but we still need to test and set
1361
+ * PAGE_CLAIMED to avoid racing with
1362
+ * z3fold_free(), so just do it now before
1363
+ * leaving the loop.
1364
+ */
1365
+ if (test_and_set_bit(PAGE_CLAIMED, &page->private))
1366
+ continue;
8611367
1368
+ break;
1369
+ }
1370
+
1371
+ if (kref_get_unless_zero(&zhdr->refcount) == 0) {
1372
+ zhdr = NULL;
1373
+ break;
1374
+ }
8621375 if (!z3fold_page_trylock(zhdr)) {
1376
+ if (kref_put(&zhdr->refcount,
1377
+ release_z3fold_page))
1378
+ atomic64_dec(&pool->pages_nr);
8631379 zhdr = NULL;
8641380 continue; /* can't evict at this point */
8651381 }
866
- kref_get(&zhdr->refcount);
1382
+
1383
+ /* test_and_set_bit is of course atomic, but we still
1384
+ * need to do it under page lock, otherwise checking
1385
+ * that bit in __z3fold_alloc wouldn't make sense
1386
+ */
1387
+ if (zhdr->foreign_handles ||
1388
+ test_and_set_bit(PAGE_CLAIMED, &page->private)) {
1389
+ if (kref_put(&zhdr->refcount,
1390
+ release_z3fold_page_locked))
1391
+ atomic64_dec(&pool->pages_nr);
1392
+ else
1393
+ z3fold_page_unlock(zhdr);
1394
+ zhdr = NULL;
1395
+ continue; /* can't evict such page */
1396
+ }
8671397 list_del_init(&zhdr->buddy);
8681398 zhdr->cpu = -1;
8691399 break;
....@@ -877,19 +1407,24 @@
8771407
8781408 if (!test_bit(PAGE_HEADLESS, &page->private)) {
8791409 /*
880
- * We need encode the handles before unlocking, since
881
- * we can race with free that will set
882
- * (first|last)_chunks to 0
1410
+ * We need encode the handles before unlocking, and
1411
+ * use our local slots structure because z3fold_free
1412
+ * can zero out zhdr->slots and we can't do much
1413
+ * about that
8831414 */
8841415 first_handle = 0;
8851416 last_handle = 0;
8861417 middle_handle = 0;
1418
+ memset(slots.slot, 0, sizeof(slots.slot));
8871419 if (zhdr->first_chunks)
888
- first_handle = encode_handle(zhdr, FIRST);
1420
+ first_handle = __encode_handle(zhdr, &slots,
1421
+ FIRST);
8891422 if (zhdr->middle_chunks)
890
- middle_handle = encode_handle(zhdr, MIDDLE);
1423
+ middle_handle = __encode_handle(zhdr, &slots,
1424
+ MIDDLE);
8911425 if (zhdr->last_chunks)
892
- last_handle = encode_handle(zhdr, LAST);
1426
+ last_handle = __encode_handle(zhdr, &slots,
1427
+ LAST);
8931428 /*
8941429 * it's safe to unlock here because we hold a
8951430 * reference to this page
....@@ -899,7 +1434,6 @@
8991434 first_handle = encode_handle(zhdr, HEADLESS);
9001435 last_handle = middle_handle = 0;
9011436 }
902
-
9031437 /* Issue the eviction callback(s) */
9041438 if (middle_handle) {
9051439 ret = pool->ops->evict(pool, middle_handle);
....@@ -919,18 +1453,20 @@
9191453 next:
9201454 if (test_bit(PAGE_HEADLESS, &page->private)) {
9211455 if (ret == 0) {
922
- free_z3fold_page(page);
1456
+ free_z3fold_page(page, true);
9231457 atomic64_dec(&pool->pages_nr);
9241458 return 0;
9251459 }
9261460 spin_lock(&pool->lock);
9271461 list_add(&page->lru, &pool->lru);
9281462 spin_unlock(&pool->lock);
929
- } else {
930
- z3fold_page_lock(zhdr);
9311463 clear_bit(PAGE_CLAIMED, &page->private);
1464
+ } else {
1465
+ struct z3fold_buddy_slots *slots = zhdr->slots;
1466
+ z3fold_page_lock(zhdr);
9321467 if (kref_put(&zhdr->refcount,
9331468 release_z3fold_page_locked)) {
1469
+ kmem_cache_free(pool->c_handle, slots);
9341470 atomic64_dec(&pool->pages_nr);
9351471 return 0;
9361472 }
....@@ -943,6 +1479,7 @@
9431479 list_add(&page->lru, &pool->lru);
9441480 spin_unlock(&pool->lock);
9451481 z3fold_page_unlock(zhdr);
1482
+ clear_bit(PAGE_CLAIMED, &page->private);
9461483 }
9471484
9481485 /* We started off locked to we need to lock the pool back */
....@@ -969,14 +1506,13 @@
9691506 void *addr;
9701507 enum buddy buddy;
9711508
972
- zhdr = handle_to_z3fold_header(handle);
1509
+ zhdr = get_z3fold_header(handle);
9731510 addr = zhdr;
9741511 page = virt_to_page(zhdr);
9751512
9761513 if (test_bit(PAGE_HEADLESS, &page->private))
9771514 goto out;
9781515
979
- z3fold_page_lock(zhdr);
9801516 buddy = handle_to_buddy(handle);
9811517 switch (buddy) {
9821518 case FIRST:
....@@ -996,8 +1532,10 @@
9961532 break;
9971533 }
9981534
999
- z3fold_page_unlock(zhdr);
1535
+ if (addr)
1536
+ zhdr->mapped_count++;
10001537 out:
1538
+ put_z3fold_header(zhdr);
10011539 return addr;
10021540 }
10031541
....@@ -1012,17 +1550,17 @@
10121550 struct page *page;
10131551 enum buddy buddy;
10141552
1015
- zhdr = handle_to_z3fold_header(handle);
1553
+ zhdr = get_z3fold_header(handle);
10161554 page = virt_to_page(zhdr);
10171555
10181556 if (test_bit(PAGE_HEADLESS, &page->private))
10191557 return;
10201558
1021
- z3fold_page_lock(zhdr);
10221559 buddy = handle_to_buddy(handle);
10231560 if (buddy == MIDDLE)
10241561 clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
1025
- z3fold_page_unlock(zhdr);
1562
+ zhdr->mapped_count--;
1563
+ put_z3fold_header(zhdr);
10261564 }
10271565
10281566 /**
....@@ -1035,6 +1573,140 @@
10351573 {
10361574 return atomic64_read(&pool->pages_nr);
10371575 }
1576
+
1577
+static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode)
1578
+{
1579
+ struct z3fold_header *zhdr;
1580
+ struct z3fold_pool *pool;
1581
+
1582
+ VM_BUG_ON_PAGE(!PageMovable(page), page);
1583
+ VM_BUG_ON_PAGE(PageIsolated(page), page);
1584
+
1585
+ if (test_bit(PAGE_HEADLESS, &page->private))
1586
+ return false;
1587
+
1588
+ zhdr = page_address(page);
1589
+ z3fold_page_lock(zhdr);
1590
+ if (test_bit(NEEDS_COMPACTING, &page->private) ||
1591
+ test_bit(PAGE_STALE, &page->private))
1592
+ goto out;
1593
+
1594
+ if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0)
1595
+ goto out;
1596
+
1597
+ if (test_and_set_bit(PAGE_CLAIMED, &page->private))
1598
+ goto out;
1599
+ pool = zhdr_to_pool(zhdr);
1600
+ spin_lock(&pool->lock);
1601
+ if (!list_empty(&zhdr->buddy))
1602
+ list_del_init(&zhdr->buddy);
1603
+ if (!list_empty(&page->lru))
1604
+ list_del_init(&page->lru);
1605
+ spin_unlock(&pool->lock);
1606
+
1607
+ kref_get(&zhdr->refcount);
1608
+ z3fold_page_unlock(zhdr);
1609
+ return true;
1610
+
1611
+out:
1612
+ z3fold_page_unlock(zhdr);
1613
+ return false;
1614
+}
1615
+
1616
+static int z3fold_page_migrate(struct address_space *mapping, struct page *newpage,
1617
+ struct page *page, enum migrate_mode mode)
1618
+{
1619
+ struct z3fold_header *zhdr, *new_zhdr;
1620
+ struct z3fold_pool *pool;
1621
+ struct address_space *new_mapping;
1622
+
1623
+ VM_BUG_ON_PAGE(!PageMovable(page), page);
1624
+ VM_BUG_ON_PAGE(!PageIsolated(page), page);
1625
+ VM_BUG_ON_PAGE(!test_bit(PAGE_CLAIMED, &page->private), page);
1626
+ VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
1627
+
1628
+ zhdr = page_address(page);
1629
+ pool = zhdr_to_pool(zhdr);
1630
+
1631
+ if (!z3fold_page_trylock(zhdr))
1632
+ return -EAGAIN;
1633
+ if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0) {
1634
+ z3fold_page_unlock(zhdr);
1635
+ clear_bit(PAGE_CLAIMED, &page->private);
1636
+ return -EBUSY;
1637
+ }
1638
+ if (work_pending(&zhdr->work)) {
1639
+ z3fold_page_unlock(zhdr);
1640
+ return -EAGAIN;
1641
+ }
1642
+ new_zhdr = page_address(newpage);
1643
+ memcpy(new_zhdr, zhdr, PAGE_SIZE);
1644
+ newpage->private = page->private;
1645
+ page->private = 0;
1646
+ z3fold_page_unlock(zhdr);
1647
+ spin_lock_init(&new_zhdr->page_lock);
1648
+ INIT_WORK(&new_zhdr->work, compact_page_work);
1649
+ /*
1650
+ * z3fold_page_isolate() ensures that new_zhdr->buddy is empty,
1651
+ * so we only have to reinitialize it.
1652
+ */
1653
+ INIT_LIST_HEAD(&new_zhdr->buddy);
1654
+ new_mapping = page_mapping(page);
1655
+ __ClearPageMovable(page);
1656
+ ClearPagePrivate(page);
1657
+
1658
+ get_page(newpage);
1659
+ z3fold_page_lock(new_zhdr);
1660
+ if (new_zhdr->first_chunks)
1661
+ encode_handle(new_zhdr, FIRST);
1662
+ if (new_zhdr->last_chunks)
1663
+ encode_handle(new_zhdr, LAST);
1664
+ if (new_zhdr->middle_chunks)
1665
+ encode_handle(new_zhdr, MIDDLE);
1666
+ set_bit(NEEDS_COMPACTING, &newpage->private);
1667
+ new_zhdr->cpu = smp_processor_id();
1668
+ spin_lock(&pool->lock);
1669
+ list_add(&newpage->lru, &pool->lru);
1670
+ spin_unlock(&pool->lock);
1671
+ __SetPageMovable(newpage, new_mapping);
1672
+ z3fold_page_unlock(new_zhdr);
1673
+
1674
+ queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work);
1675
+
1676
+ page_mapcount_reset(page);
1677
+ clear_bit(PAGE_CLAIMED, &page->private);
1678
+ put_page(page);
1679
+ return 0;
1680
+}
1681
+
1682
+static void z3fold_page_putback(struct page *page)
1683
+{
1684
+ struct z3fold_header *zhdr;
1685
+ struct z3fold_pool *pool;
1686
+
1687
+ zhdr = page_address(page);
1688
+ pool = zhdr_to_pool(zhdr);
1689
+
1690
+ z3fold_page_lock(zhdr);
1691
+ if (!list_empty(&zhdr->buddy))
1692
+ list_del_init(&zhdr->buddy);
1693
+ INIT_LIST_HEAD(&page->lru);
1694
+ if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) {
1695
+ atomic64_dec(&pool->pages_nr);
1696
+ return;
1697
+ }
1698
+ spin_lock(&pool->lock);
1699
+ list_add(&page->lru, &pool->lru);
1700
+ spin_unlock(&pool->lock);
1701
+ clear_bit(PAGE_CLAIMED, &page->private);
1702
+ z3fold_page_unlock(zhdr);
1703
+}
1704
+
1705
+static const struct address_space_operations z3fold_aops = {
1706
+ .isolate_page = z3fold_page_isolate,
1707
+ .migratepage = z3fold_page_migrate,
1708
+ .putback_page = z3fold_page_putback,
1709
+};
10381710
10391711 /*****************
10401712 * zpool
@@ -1133,8 +1805,14 @@
 
 static int __init init_z3fold(void)
 {
+	int ret;
+
 	/* Make sure the z3fold header is not larger than the page size */
 	BUILD_BUG_ON(ZHDR_SIZE_ALIGNED > PAGE_SIZE);
+	ret = z3fold_mount();
+	if (ret)
+		return ret;
+
 	zpool_register_driver(&z3fold_zpool_driver);
 
 	return 0;
@@ -1142,6 +1820,7 @@
 
 static void __exit exit_z3fold(void)
 {
+	z3fold_unmount();
 	zpool_unregister_driver(&z3fold_zpool_driver);
 }
 
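
Note on the handle scheme introduced above: the patch replaces the old direct encoding, where a handle was the z3fold page address with the buddy index in its low bits, with an indirection through a 64-byte-aligned struct z3fold_buddy_slots. Each handle handed out to users is now the address of one slot; the slot itself holds the page address plus the buddy index (and, for LAST, the size in chunks), so an object can later be relocated by rewriting the slot while every outstanding handle stays valid. The following stand-alone sketch is illustrative only, not kernel code: SLOTS_ALIGN, BUDDY_MASK and the slot layout are taken from the patch, but the locking, the pool back link and the chunk encoding are omitted, and the "page addresses" are made-up constants.

	#include <assert.h>
	#include <stdio.h>
	#include <stdlib.h>

	#define SLOTS_ALIGN	0x40UL	/* slots object is 64-byte aligned */
	#define BUDDY_MASK	0x3UL	/* low bits of a slot value = buddy index */

	struct buddy_slots {
		unsigned long slot[BUDDY_MASK + 1];	/* one encoded value per buddy */
	} __attribute__((aligned(SLOTS_ALIGN)));

	/* encode: store "page address + buddy index" in a slot, hand out &slot */
	static unsigned long encode(struct buddy_slots *s, unsigned long page_addr,
				    unsigned int bud)
	{
		unsigned int idx = bud & BUDDY_MASK;

		s->slot[idx] = page_addr + idx;
		return (unsigned long)&s->slot[idx];
	}

	/* handle -> slots object, recoverable thanks to the 64-byte alignment */
	static struct buddy_slots *handle_to_slots(unsigned long handle)
	{
		return (struct buddy_slots *)(handle & ~(SLOTS_ALIGN - 1));
	}

	/* handle -> current page address: one extra dereference vs. the old code */
	static unsigned long handle_to_page(unsigned long handle)
	{
		return *(unsigned long *)handle & ~BUDDY_MASK;
	}

	int main(void)
	{
		struct buddy_slots *s = aligned_alloc(SLOTS_ALIGN, sizeof(*s));
		unsigned long old_page = 0x1000, new_page = 0x2000;
		unsigned long handle = encode(s, old_page, 2 /* MIDDLE */);
		unsigned int idx;

		assert(handle_to_slots(handle) == s);
		assert(handle_to_page(handle) == old_page);

		/* "migration": rewrite the slot; the handle itself never changes */
		idx = *(unsigned long *)handle & BUDDY_MASK;
		*(unsigned long *)handle = new_page + idx;
		assert(handle_to_page(handle) == new_page);

		printf("handle %#lx now resolves to page %#lx\n", handle, new_page);
		free(s);
		return 0;
	}

This indirection is what makes z3fold_page_migrate() in the diff above possible: after copying the page contents, it re-runs encode_handle() against the new page, so the slots that outstanding handles point at are updated in place.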