hc
2023-12-08 01573e231f18eb2d99162747186f59511f56b64d
kernel/fs/dax.c
....@@ -1,17 +1,9 @@
1
+// SPDX-License-Identifier: GPL-2.0-only
12 /*
23 * fs/dax.c - Direct Access filesystem code
34 * Copyright (c) 2013-2014 Intel Corporation
45 * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
56 * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
6
- *
7
- * This program is free software; you can redistribute it and/or modify it
8
- * under the terms and conditions of the GNU General Public License,
9
- * version 2, as published by the Free Software Foundation.
10
- *
11
- * This program is distributed in the hope it will be useful, but WITHOUT
12
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14
- * more details.
157 */
168
179 #include <linux/atomic.h>
....@@ -33,10 +25,21 @@
3325 #include <linux/sizes.h>
3426 #include <linux/mmu_notifier.h>
3527 #include <linux/iomap.h>
36
-#include "internal.h"
28
+#include <asm/pgalloc.h>
3729
3830 #define CREATE_TRACE_POINTS
3931 #include <trace/events/fs_dax.h>
32
+
33
+static inline unsigned int pe_order(enum page_entry_size pe_size)
34
+{
35
+ if (pe_size == PE_SIZE_PTE)
36
+ return PAGE_SHIFT - PAGE_SHIFT;
37
+ if (pe_size == PE_SIZE_PMD)
38
+ return PMD_SHIFT - PAGE_SHIFT;
39
+ if (pe_size == PE_SIZE_PUD)
40
+ return PUD_SHIFT - PAGE_SHIFT;
41
+ return ~0;
42
+}
4043
4144 /* We choose 4096 entries - same as per-zone page wait tables */
4245 #define DAX_WAIT_TABLE_BITS 12
....@@ -45,6 +48,9 @@
4548 /* The 'colour' (ie low bits) within a PMD of a page offset. */
4649 #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
4750 #define PG_PMD_NR (PMD_SIZE >> PAGE_SHIFT)
51
+
52
+/* The order of a PMD entry */
53
+#define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT)
4854
4955 static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
5056
....@@ -59,63 +65,77 @@
5965 fs_initcall(init_dax_wait_table);
6066
6167 /*
62
- * We use lowest available bit in exceptional entry for locking, one bit for
63
- * the entry size (PMD) and two more to tell us if the entry is a zero page or
64
- * an empty entry that is just used for locking. In total four special bits.
68
+ * DAX pagecache entries use XArray value entries so they can't be mistaken
69
+ * for pages. We use one bit for locking, one bit for the entry size (PMD)
70
+ * and two more to tell us if the entry is a zero page or an empty entry that
71
+ * is just used for locking. In total four special bits.
6572 *
6673 * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE
6774 * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
6875 * block allocation.
6976 */
70
-#define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 4)
71
-#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
72
-#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
73
-#define RADIX_DAX_ZERO_PAGE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
74
-#define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))
77
+#define DAX_SHIFT (4)
78
+#define DAX_LOCKED (1UL << 0)
79
+#define DAX_PMD (1UL << 1)
80
+#define DAX_ZERO_PAGE (1UL << 2)
81
+#define DAX_EMPTY (1UL << 3)
7582
76
-static unsigned long dax_radix_pfn(void *entry)
83
+static unsigned long dax_to_pfn(void *entry)
7784 {
78
- return (unsigned long)entry >> RADIX_DAX_SHIFT;
85
+ return xa_to_value(entry) >> DAX_SHIFT;
7986 }
8087
81
-static void *dax_radix_locked_entry(unsigned long pfn, unsigned long flags)
88
+static void *dax_make_entry(pfn_t pfn, unsigned long flags)
8289 {
83
- return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags |
84
- (pfn << RADIX_DAX_SHIFT) | RADIX_DAX_ENTRY_LOCK);
90
+ return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT));
8591 }
8692
87
-static unsigned int dax_radix_order(void *entry)
93
+static bool dax_is_locked(void *entry)
8894 {
89
- if ((unsigned long)entry & RADIX_DAX_PMD)
90
- return PMD_SHIFT - PAGE_SHIFT;
95
+ return xa_to_value(entry) & DAX_LOCKED;
96
+}
97
+
98
+static unsigned int dax_entry_order(void *entry)
99
+{
100
+ if (xa_to_value(entry) & DAX_PMD)
101
+ return PMD_ORDER;
91102 return 0;
92103 }
93104
94
-static int dax_is_pmd_entry(void *entry)
105
+static unsigned long dax_is_pmd_entry(void *entry)
95106 {
96
- return (unsigned long)entry & RADIX_DAX_PMD;
107
+ return xa_to_value(entry) & DAX_PMD;
97108 }
98109
99
-static int dax_is_pte_entry(void *entry)
110
+static bool dax_is_pte_entry(void *entry)
100111 {
101
- return !((unsigned long)entry & RADIX_DAX_PMD);
112
+ return !(xa_to_value(entry) & DAX_PMD);
102113 }
103114
104115 static int dax_is_zero_entry(void *entry)
105116 {
106
- return (unsigned long)entry & RADIX_DAX_ZERO_PAGE;
117
+ return xa_to_value(entry) & DAX_ZERO_PAGE;
107118 }
108119
109120 static int dax_is_empty_entry(void *entry)
110121 {
111
- return (unsigned long)entry & RADIX_DAX_EMPTY;
122
+ return xa_to_value(entry) & DAX_EMPTY;
112123 }
113124
114125 /*
115
- * DAX radix tree locking
126
+ * true if the entry that was found is of a smaller order than the entry
127
+ * we were looking for
128
+ */
129
+static bool dax_is_conflict(void *entry)
130
+{
131
+ return entry == XA_RETRY_ENTRY;
132
+}
133
+
134
+/*
135
+ * DAX page cache entry locking
116136 */
117137 struct exceptional_entry_key {
118
- struct address_space *mapping;
138
+ struct xarray *xa;
119139 pgoff_t entry_start;
120140 };
121141
....@@ -124,10 +144,21 @@
124144 struct exceptional_entry_key key;
125145 };
126146
127
-static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
128
- pgoff_t index, void *entry, struct exceptional_entry_key *key)
147
+/**
148
+ * enum dax_wake_mode: waitqueue wakeup behaviour
149
+ * @WAKE_ALL: wake all waiters in the waitqueue
150
+ * @WAKE_NEXT: wake only the first waiter in the waitqueue
151
+ */
152
+enum dax_wake_mode {
153
+ WAKE_ALL,
154
+ WAKE_NEXT,
155
+};
156
+
157
+static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas,
158
+ void *entry, struct exceptional_entry_key *key)
129159 {
130160 unsigned long hash;
161
+ unsigned long index = xas->xa_index;
131162
132163 /*
133164 * If 'entry' is a PMD, align the 'index' that we use for the wait
....@@ -136,22 +167,21 @@
136167 */
137168 if (dax_is_pmd_entry(entry))
138169 index &= ~PG_PMD_COLOUR;
139
-
140
- key->mapping = mapping;
170
+ key->xa = xas->xa;
141171 key->entry_start = index;
142172
143
- hash = hash_long((unsigned long)mapping ^ index, DAX_WAIT_TABLE_BITS);
173
+ hash = hash_long((unsigned long)xas->xa ^ index, DAX_WAIT_TABLE_BITS);
144174 return wait_table + hash;
145175 }
146176
147
-static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mode,
148
- int sync, void *keyp)
177
+static int wake_exceptional_entry_func(wait_queue_entry_t *wait,
178
+ unsigned int mode, int sync, void *keyp)
149179 {
150180 struct exceptional_entry_key *key = keyp;
151181 struct wait_exceptional_entry_queue *ewait =
152182 container_of(wait, struct wait_exceptional_entry_queue, wait);
153183
154
- if (key->mapping != ewait->key.mapping ||
184
+ if (key->xa != ewait->key.xa ||
155185 key->entry_start != ewait->key.entry_start)
156186 return 0;
157187 return autoremove_wake_function(wait, mode, sync, NULL);
....@@ -162,13 +192,13 @@
162192 * The important information it's conveying is whether the entry at
163193 * this index used to be a PMD entry.
164194 */
165
-static void dax_wake_mapping_entry_waiter(struct address_space *mapping,
166
- pgoff_t index, void *entry, bool wake_all)
195
+static void dax_wake_entry(struct xa_state *xas, void *entry,
196
+ enum dax_wake_mode mode)
167197 {
168198 struct exceptional_entry_key key;
169199 wait_queue_head_t *wq;
170200
171
- wq = dax_entry_waitqueue(mapping, index, entry, &key);
201
+ wq = dax_entry_waitqueue(xas, entry, &key);
172202
173203 /*
174204 * Checking for locked entry and prepare_to_wait_exclusive() happens
....@@ -177,62 +207,22 @@
177207 * must be in the waitqueue and the following check will see them.
178208 */
179209 if (waitqueue_active(wq))
180
- __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
210
+ __wake_up(wq, TASK_NORMAL, mode == WAKE_ALL ? 0 : 1, &key);
181211 }
182212
183213 /*
184
- * Check whether the given slot is locked. Must be called with the i_pages
185
- * lock held.
186
- */
187
-static inline int slot_locked(struct address_space *mapping, void **slot)
188
-{
189
- unsigned long entry = (unsigned long)
190
- radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock);
191
- return entry & RADIX_DAX_ENTRY_LOCK;
192
-}
193
-
194
-/*
195
- * Mark the given slot as locked. Must be called with the i_pages lock held.
196
- */
197
-static inline void *lock_slot(struct address_space *mapping, void **slot)
198
-{
199
- unsigned long entry = (unsigned long)
200
- radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock);
201
-
202
- entry |= RADIX_DAX_ENTRY_LOCK;
203
- radix_tree_replace_slot(&mapping->i_pages, slot, (void *)entry);
204
- return (void *)entry;
205
-}
206
-
207
-/*
208
- * Mark the given slot as unlocked. Must be called with the i_pages lock held.
209
- */
210
-static inline void *unlock_slot(struct address_space *mapping, void **slot)
211
-{
212
- unsigned long entry = (unsigned long)
213
- radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock);
214
-
215
- entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK;
216
- radix_tree_replace_slot(&mapping->i_pages, slot, (void *)entry);
217
- return (void *)entry;
218
-}
219
-
220
-static void put_unlocked_mapping_entry(struct address_space *mapping,
221
- pgoff_t index, void *entry);
222
-
223
-/*
224
- * Lookup entry in radix tree, wait for it to become unlocked if it is
225
- * exceptional entry and return it. The caller must call
226
- * put_unlocked_mapping_entry() when he decided not to lock the entry or
227
- * put_locked_mapping_entry() when he locked the entry and now wants to
228
- * unlock it.
214
+ * Look up entry in page cache, wait for it to become unlocked if it
215
+ * is a DAX entry and return it. The caller must subsequently call
216
+ * put_unlocked_entry() if it did not lock the entry or dax_unlock_entry()
217
+ * if it did. The entry returned may have a larger order than @order.
218
+ * If @order is larger than the order of the entry found in i_pages, this
219
+ * function returns a dax_is_conflict entry.
229220 *
230221 * Must be called with the i_pages lock held.
231222 */
232
-static void *get_unlocked_mapping_entry(struct address_space *mapping,
233
- pgoff_t index, void ***slotp)
223
+static void *get_unlocked_entry(struct xa_state *xas, unsigned int order)
234224 {
235
- void *entry, **slot;
225
+ void *entry;
236226 struct wait_exceptional_entry_queue ewait;
237227 wait_queue_head_t *wq;
238228
....@@ -240,23 +230,22 @@
240230 ewait.wait.func = wake_exceptional_entry_func;
241231
242232 for (;;) {
243
- entry = __radix_tree_lookup(&mapping->i_pages, index, NULL,
244
- &slot);
245
- if (!entry ||
246
- WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)) ||
247
- !slot_locked(mapping, slot)) {
248
- if (slotp)
249
- *slotp = slot;
233
+ entry = xas_find_conflict(xas);
234
+ if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
250235 return entry;
251
- }
236
+ if (dax_entry_order(entry) < order)
237
+ return XA_RETRY_ENTRY;
238
+ if (!dax_is_locked(entry))
239
+ return entry;
252240
253
- wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key);
241
+ wq = dax_entry_waitqueue(xas, entry, &ewait.key);
254242 prepare_to_wait_exclusive(wq, &ewait.wait,
255243 TASK_UNINTERRUPTIBLE);
256
- xa_unlock_irq(&mapping->i_pages);
244
+ xas_unlock_irq(xas);
245
+ xas_reset(xas);
257246 schedule();
258247 finish_wait(wq, &ewait.wait);
259
- xa_lock_irq(&mapping->i_pages);
248
+ xas_lock_irq(xas);
260249 }
261250 }
262251
....@@ -265,8 +254,7 @@
265254 * (it's cycled in clear_inode() after removing the entries from i_pages)
266255 * After we call xas_unlock_irq(), we cannot touch xas->xa.
267256 */
268
-static void wait_entry_unlocked(struct address_space *mapping, pgoff_t index,
269
- void ***slotp, void *entry)
257
+static void wait_entry_unlocked(struct xa_state *xas, void *entry)
270258 {
271259 struct wait_exceptional_entry_queue ewait;
272260 wait_queue_head_t *wq;
....@@ -274,7 +262,7 @@
274262 init_wait(&ewait.wait);
275263 ewait.wait.func = wake_exceptional_entry_func;
276264
277
- wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key);
265
+ wq = dax_entry_waitqueue(xas, entry, &ewait.key);
278266 /*
279267 * Unlike get_unlocked_entry() there is no guarantee that this
280268 * path ever successfully retrieves an unlocked entry before an
....@@ -282,45 +270,43 @@
282270 * never successfully performs its own wake up.
283271 */
284272 prepare_to_wait(wq, &ewait.wait, TASK_UNINTERRUPTIBLE);
285
- xa_unlock_irq(&mapping->i_pages);
273
+ xas_unlock_irq(xas);
286274 schedule();
287275 finish_wait(wq, &ewait.wait);
288276 }
289277
290
-static void unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
278
+static void put_unlocked_entry(struct xa_state *xas, void *entry,
279
+ enum dax_wake_mode mode)
291280 {
292
- void *entry, **slot;
293
-
294
- xa_lock_irq(&mapping->i_pages);
295
- entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, &slot);
296
- if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) ||
297
- !slot_locked(mapping, slot))) {
298
- xa_unlock_irq(&mapping->i_pages);
299
- return;
300
- }
301
- unlock_slot(mapping, slot);
302
- xa_unlock_irq(&mapping->i_pages);
303
- dax_wake_mapping_entry_waiter(mapping, index, entry, false);
304
-}
305
-
306
-static void put_locked_mapping_entry(struct address_space *mapping,
307
- pgoff_t index)
308
-{
309
- unlock_mapping_entry(mapping, index);
281
+ if (entry && !dax_is_conflict(entry))
282
+ dax_wake_entry(xas, entry, mode);
310283 }
311284
312285 /*
313
- * Called when we are done with radix tree entry we looked up via
314
- * get_unlocked_mapping_entry() and which we didn't lock in the end.
286
+ * We used the xa_state to get the entry, but then we locked the entry and
287
+ * dropped the xa_lock, so we know the xa_state is stale and must be reset
288
+ * before use.
315289 */
316
-static void put_unlocked_mapping_entry(struct address_space *mapping,
317
- pgoff_t index, void *entry)
290
+static void dax_unlock_entry(struct xa_state *xas, void *entry)
318291 {
319
- if (!entry)
320
- return;
292
+ void *old;
321293
322
- /* We have to wake up next waiter for the radix tree entry lock */
323
- dax_wake_mapping_entry_waiter(mapping, index, entry, false);
294
+ BUG_ON(dax_is_locked(entry));
295
+ xas_reset(xas);
296
+ xas_lock_irq(xas);
297
+ old = xas_store(xas, entry);
298
+ xas_unlock_irq(xas);
299
+ BUG_ON(!dax_is_locked(old));
300
+ dax_wake_entry(xas, entry, WAKE_NEXT);
301
+}
302
+
303
+/*
304
+ * Return: The entry stored at this location before it was locked.
305
+ */
306
+static void *dax_lock_entry(struct xa_state *xas, void *entry)
307
+{
308
+ unsigned long v = xa_to_value(entry);
309
+ return xas_store(xas, xa_mk_value(v | DAX_LOCKED));
324310 }
325311
326312 static unsigned long dax_entry_size(void *entry)
....@@ -335,9 +321,9 @@
335321 return PAGE_SIZE;
336322 }
337323
338
-static unsigned long dax_radix_end_pfn(void *entry)
324
+static unsigned long dax_end_pfn(void *entry)
339325 {
340
- return dax_radix_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE;
326
+ return dax_to_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE;
341327 }
342328
343329 /*
....@@ -345,8 +331,8 @@
345331 * 'empty' and 'zero' entries.
346332 */
347333 #define for_each_mapped_pfn(entry, pfn) \
348
- for (pfn = dax_radix_pfn(entry); \
349
- pfn < dax_radix_end_pfn(entry); pfn++)
334
+ for (pfn = dax_to_pfn(entry); \
335
+ pfn < dax_end_pfn(entry); pfn++)
350336
351337 /*
352338 * TODO: for reflink+dax we need a way to associate a single page with
....@@ -403,18 +389,25 @@
403389 return NULL;
404390 }
405391
406
-bool dax_lock_mapping_entry(struct page *page)
392
+/*
393
+ * dax_lock_mapping_entry - Lock the DAX entry corresponding to a page
394
+ * @page: The page whose entry we want to lock
395
+ *
396
+ * Context: Process context.
397
+ * Return: A cookie to pass to dax_unlock_page() or 0 if the entry could
398
+ * not be locked.
399
+ */
400
+dax_entry_t dax_lock_page(struct page *page)
407401 {
408
- pgoff_t index;
409
- struct inode *inode;
410
- bool did_lock = false;
411
- void *entry = NULL, **slot;
412
- struct address_space *mapping;
402
+ XA_STATE(xas, NULL, 0);
403
+ void *entry;
413404
405
+ /* Ensure page->mapping isn't freed while we look at it */
414406 rcu_read_lock();
415407 for (;;) {
416
- mapping = READ_ONCE(page->mapping);
408
+ struct address_space *mapping = READ_ONCE(page->mapping);
417409
410
+ entry = NULL;
418411 if (!mapping || !dax_mapping(mapping))
419412 break;
420413
....@@ -425,101 +418,93 @@
425418 * otherwise we would not have a valid pfn_to_page()
426419 * translation.
427420 */
428
- inode = mapping->host;
429
- if (S_ISCHR(inode->i_mode)) {
430
- did_lock = true;
421
+ entry = (void *)~0UL;
422
+ if (S_ISCHR(mapping->host->i_mode))
431423 break;
432
- }
433424
434
- xa_lock_irq(&mapping->i_pages);
425
+ xas.xa = &mapping->i_pages;
426
+ xas_lock_irq(&xas);
435427 if (mapping != page->mapping) {
436
- xa_unlock_irq(&mapping->i_pages);
428
+ xas_unlock_irq(&xas);
437429 continue;
438430 }
439
- index = page->index;
440
-
441
- entry = __radix_tree_lookup(&mapping->i_pages, index,
442
- NULL, &slot);
443
- if (!entry) {
444
- xa_unlock_irq(&mapping->i_pages);
445
- break;
446
- } else if (slot_locked(mapping, slot)) {
431
+ xas_set(&xas, page->index);
432
+ entry = xas_load(&xas);
433
+ if (dax_is_locked(entry)) {
447434 rcu_read_unlock();
448
- wait_entry_unlocked(mapping, index, &slot, entry);
435
+ wait_entry_unlocked(&xas, entry);
449436 rcu_read_lock();
450437 continue;
451438 }
452
- lock_slot(mapping, slot);
453
- did_lock = true;
454
- xa_unlock_irq(&mapping->i_pages);
439
+ dax_lock_entry(&xas, entry);
440
+ xas_unlock_irq(&xas);
455441 break;
456442 }
457443 rcu_read_unlock();
458
-
459
- return did_lock;
444
+ return (dax_entry_t)entry;
460445 }
461446
462
-void dax_unlock_mapping_entry(struct page *page)
447
+void dax_unlock_page(struct page *page, dax_entry_t cookie)
463448 {
464449 struct address_space *mapping = page->mapping;
465
- struct inode *inode = mapping->host;
450
+ XA_STATE(xas, &mapping->i_pages, page->index);
466451
467
- if (S_ISCHR(inode->i_mode))
452
+ if (S_ISCHR(mapping->host->i_mode))
468453 return;
469454
470
- unlock_mapping_entry(mapping, page->index);
455
+ dax_unlock_entry(&xas, (void *)cookie);
471456 }
472457
473458 /*
474
- * Find radix tree entry at given index. If it points to an exceptional entry,
475
- * return it with the radix tree entry locked. If the radix tree doesn't
476
- * contain given index, create an empty exceptional entry for the index and
477
- * return with it locked.
459
+ * Find page cache entry at given index. If it is a DAX entry, return it
460
+ * with the entry locked. If the page cache doesn't contain an entry at
461
+ * that index, add a locked empty entry.
478462 *
479
- * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will
480
- * either return that locked entry or will return an error. This error will
481
- * happen if there are any 4k entries within the 2MiB range that we are
482
- * requesting.
463
+ * When requesting an entry with size DAX_PMD, grab_mapping_entry() will
464
+ * either return that locked entry or will return VM_FAULT_FALLBACK.
465
+ * This will happen if there are any PTE entries within the PMD range
466
+ * that we are requesting.
483467 *
484
- * We always favor 4k entries over 2MiB entries. There isn't a flow where we
485
- * evict 4k entries in order to 'upgrade' them to a 2MiB entry. A 2MiB
486
- * insertion will fail if it finds any 4k entries already in the tree, and a
487
- * 4k insertion will cause an existing 2MiB entry to be unmapped and
488
- * downgraded to 4k entries. This happens for both 2MiB huge zero pages as
489
- * well as 2MiB empty entries.
468
+ * We always favor PTE entries over PMD entries. There isn't a flow where we
469
+ * evict PTE entries in order to 'upgrade' them to a PMD entry. A PMD
470
+ * insertion will fail if it finds any PTE entries already in the tree, and a
471
+ * PTE insertion will cause an existing PMD entry to be unmapped and
472
+ * downgraded to PTE entries. This happens for both PMD zero pages as
473
+ * well as PMD empty entries.
490474 *
491
- * The exception to this downgrade path is for 2MiB DAX PMD entries that have
492
- * real storage backing them. We will leave these real 2MiB DAX entries in
493
- * the tree, and PTE writes will simply dirty the entire 2MiB DAX entry.
475
+ * The exception to this downgrade path is for PMD entries that have
476
+ * real storage backing them. We will leave these real PMD entries in
477
+ * the tree, and PTE writes will simply dirty the entire PMD entry.
494478 *
495479 * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
496480 * persistent memory the benefit is doubtful. We can add that later if we can
497481 * show it helps.
482
+ *
483
+ * On error, this function does not return an ERR_PTR. Instead it returns
484
+ * a VM_FAULT code, encoded as an xarray internal entry. The ERR_PTR values
485
+ * overlap with xarray value entries.
498486 */
499
-static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index,
500
- unsigned long size_flag)
487
+static void *grab_mapping_entry(struct xa_state *xas,
488
+ struct address_space *mapping, unsigned int order)
501489 {
502
- bool pmd_downgrade = false; /* splitting 2MiB entry into 4k entries? */
503
- void *entry, **slot;
490
+ unsigned long index = xas->xa_index;
491
+ bool pmd_downgrade; /* splitting PMD entry into PTE entries? */
492
+ void *entry;
504493
505
-restart:
506
- xa_lock_irq(&mapping->i_pages);
507
- entry = get_unlocked_mapping_entry(mapping, index, &slot);
508
-
509
- if (WARN_ON_ONCE(entry && !radix_tree_exceptional_entry(entry))) {
510
- entry = ERR_PTR(-EIO);
511
- goto out_unlock;
512
- }
494
+retry:
495
+ pmd_downgrade = false;
496
+ xas_lock_irq(xas);
497
+ entry = get_unlocked_entry(xas, order);
513498
514499 if (entry) {
515
- if (size_flag & RADIX_DAX_PMD) {
516
- if (dax_is_pte_entry(entry)) {
517
- put_unlocked_mapping_entry(mapping, index,
518
- entry);
519
- entry = ERR_PTR(-EEXIST);
520
- goto out_unlock;
521
- }
522
- } else { /* trying to grab a PTE entry */
500
+ if (dax_is_conflict(entry))
501
+ goto fallback;
502
+ if (!xa_is_value(entry)) {
503
+ xas_set_err(xas, -EIO);
504
+ goto out_unlock;
505
+ }
506
+
507
+ if (order == 0) {
523508 if (dax_is_pmd_entry(entry) &&
524509 (dax_is_zero_entry(entry) ||
525510 dax_is_empty_entry(entry))) {
....@@ -528,92 +513,69 @@
528513 }
529514 }
530515
531
- /* No entry for given index? Make sure radix tree is big enough. */
532
- if (!entry || pmd_downgrade) {
533
- int err;
516
+ if (pmd_downgrade) {
517
+ /*
518
+ * Make sure 'entry' remains valid while we drop
519
+ * the i_pages lock.
520
+ */
521
+ dax_lock_entry(xas, entry);
534522
535
- if (pmd_downgrade) {
536
- /*
537
- * Make sure 'entry' remains valid while we drop
538
- * the i_pages lock.
539
- */
540
- entry = lock_slot(mapping, slot);
541
- }
542
-
543
- xa_unlock_irq(&mapping->i_pages);
544523 /*
545524 * Besides huge zero pages the only other thing that gets
546525 * downgraded are empty entries which don't need to be
547526 * unmapped.
548527 */
549
- if (pmd_downgrade && dax_is_zero_entry(entry))
550
- unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
551
- PG_PMD_NR, false);
552
-
553
- err = radix_tree_preload(
554
- mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
555
- if (err) {
556
- if (pmd_downgrade)
557
- put_locked_mapping_entry(mapping, index);
558
- return ERR_PTR(err);
559
- }
560
- xa_lock_irq(&mapping->i_pages);
561
-
562
- if (!entry) {
563
- /*
564
- * We needed to drop the i_pages lock while calling
565
- * radix_tree_preload() and we didn't have an entry to
566
- * lock. See if another thread inserted an entry at
567
- * our index during this time.
568
- */
569
- entry = __radix_tree_lookup(&mapping->i_pages, index,
570
- NULL, &slot);
571
- if (entry) {
572
- radix_tree_preload_end();
573
- xa_unlock_irq(&mapping->i_pages);
574
- goto restart;
575
- }
528
+ if (dax_is_zero_entry(entry)) {
529
+ xas_unlock_irq(xas);
530
+ unmap_mapping_pages(mapping,
531
+ xas->xa_index & ~PG_PMD_COLOUR,
532
+ PG_PMD_NR, false);
533
+ xas_reset(xas);
534
+ xas_lock_irq(xas);
576535 }
577536
578
- if (pmd_downgrade) {
579
- dax_disassociate_entry(entry, mapping, false);
580
- radix_tree_delete(&mapping->i_pages, index);
581
- mapping->nrexceptional--;
582
- dax_wake_mapping_entry_waiter(mapping, index, entry,
583
- true);
584
- }
585
-
586
- entry = dax_radix_locked_entry(0, size_flag | RADIX_DAX_EMPTY);
587
-
588
- err = __radix_tree_insert(&mapping->i_pages, index,
589
- dax_radix_order(entry), entry);
590
- radix_tree_preload_end();
591
- if (err) {
592
- xa_unlock_irq(&mapping->i_pages);
593
- /*
594
- * Our insertion of a DAX entry failed, most likely
595
- * because we were inserting a PMD entry and it
596
- * collided with a PTE sized entry at a different
597
- * index in the PMD range. We haven't inserted
598
- * anything into the radix tree and have no waiters to
599
- * wake.
600
- */
601
- return ERR_PTR(err);
602
- }
603
- /* Good, we have inserted empty locked entry into the tree. */
604
- mapping->nrexceptional++;
605
- xa_unlock_irq(&mapping->i_pages);
606
- return entry;
537
+ dax_disassociate_entry(entry, mapping, false);
538
+ xas_store(xas, NULL); /* undo the PMD join */
539
+ dax_wake_entry(xas, entry, WAKE_ALL);
540
+ mapping->nrexceptional--;
541
+ entry = NULL;
542
+ xas_set(xas, index);
607543 }
608
- entry = lock_slot(mapping, slot);
609
- out_unlock:
610
- xa_unlock_irq(&mapping->i_pages);
544
+
545
+ if (entry) {
546
+ dax_lock_entry(xas, entry);
547
+ } else {
548
+ unsigned long flags = DAX_EMPTY;
549
+
550
+ if (order > 0)
551
+ flags |= DAX_PMD;
552
+ entry = dax_make_entry(pfn_to_pfn_t(0), flags);
553
+ dax_lock_entry(xas, entry);
554
+ if (xas_error(xas))
555
+ goto out_unlock;
556
+ mapping->nrexceptional++;
557
+ }
558
+
559
+out_unlock:
560
+ xas_unlock_irq(xas);
561
+ if (xas_nomem(xas, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM))
562
+ goto retry;
563
+ if (xas->xa_node == XA_ERROR(-ENOMEM))
564
+ return xa_mk_internal(VM_FAULT_OOM);
565
+ if (xas_error(xas))
566
+ return xa_mk_internal(VM_FAULT_SIGBUS);
611567 return entry;
568
+fallback:
569
+ xas_unlock_irq(xas);
570
+ return xa_mk_internal(VM_FAULT_FALLBACK);
612571 }
613572
614573 /**
615
- * dax_layout_busy_page - find first pinned page in @mapping
574
+ * dax_layout_busy_page_range - find first pinned page in @mapping
616575 * @mapping: address space to scan for a page with ref count > 1
576
+ * @start: Starting offset. Page containing 'start' is included.
577
+ * @end: End offset. Page containing 'end' is included. If 'end' is LLONG_MAX,
578
+ * pages from 'start' till the end of file are included.
617579 *
618580 * DAX requires ZONE_DEVICE mapped pages. These pages are never
619581 * 'onlined' to the page allocator so they are considered idle when
....@@ -626,13 +588,15 @@
626588 * to be able to run unmap_mapping_range() and subsequently not race
627589 * mapping_mapped() becoming true.
628590 */
629
-struct page *dax_layout_busy_page(struct address_space *mapping)
591
+struct page *dax_layout_busy_page_range(struct address_space *mapping,
592
+ loff_t start, loff_t end)
630593 {
631
- pgoff_t indices[PAGEVEC_SIZE];
594
+ void *entry;
595
+ unsigned int scanned = 0;
632596 struct page *page = NULL;
633
- struct pagevec pvec;
634
- pgoff_t index, end;
635
- unsigned i;
597
+ pgoff_t start_idx = start >> PAGE_SHIFT;
598
+ pgoff_t end_idx;
599
+ XA_STATE(xas, &mapping->i_pages, start_idx);
636600
637601 /*
638602 * In the 'limited' case get_user_pages() for dax is disabled.
....@@ -643,112 +607,93 @@
643607 if (!dax_mapping(mapping) || !mapping_mapped(mapping))
644608 return NULL;
645609
646
- pagevec_init(&pvec);
647
- index = 0;
648
- end = -1;
649
-
610
+ /* If end == LLONG_MAX, all pages from start to till end of file */
611
+ if (end == LLONG_MAX)
612
+ end_idx = ULONG_MAX;
613
+ else
614
+ end_idx = end >> PAGE_SHIFT;
650615 /*
651616 * If we race get_user_pages_fast() here either we'll see the
652
- * elevated page count in the pagevec_lookup and wait, or
617
+ * elevated page count in the iteration and wait, or
653618 * get_user_pages_fast() will see that the page it took a reference
654619 * against is no longer mapped in the page tables and bail to the
655620 * get_user_pages() slow path. The slow path is protected by
656621 * pte_lock() and pmd_lock(). New references are not taken without
657
- * holding those locks, and unmap_mapping_range() will not zero the
622
+ * holding those locks, and unmap_mapping_pages() will not zero the
658623 * pte or pmd without holding the respective lock, so we are
659624 * guaranteed to either see new references or prevent new
660625 * references from being established.
661626 */
662
- unmap_mapping_range(mapping, 0, 0, 0);
627
+ unmap_mapping_pages(mapping, start_idx, end_idx - start_idx + 1, 0);
663628
664
- while (index < end && pagevec_lookup_entries(&pvec, mapping, index,
665
- min(end - index, (pgoff_t)PAGEVEC_SIZE),
666
- indices)) {
667
- pgoff_t nr_pages = 1;
668
-
669
- for (i = 0; i < pagevec_count(&pvec); i++) {
670
- struct page *pvec_ent = pvec.pages[i];
671
- void *entry;
672
-
673
- index = indices[i];
674
- if (index >= end)
675
- break;
676
-
677
- if (WARN_ON_ONCE(
678
- !radix_tree_exceptional_entry(pvec_ent)))
679
- continue;
680
-
681
- xa_lock_irq(&mapping->i_pages);
682
- entry = get_unlocked_mapping_entry(mapping, index, NULL);
683
- if (entry) {
684
- page = dax_busy_page(entry);
685
- /*
686
- * Account for multi-order entries at
687
- * the end of the pagevec.
688
- */
689
- if (i + 1 >= pagevec_count(&pvec))
690
- nr_pages = 1UL << dax_radix_order(entry);
691
- }
692
- put_unlocked_mapping_entry(mapping, index, entry);
693
- xa_unlock_irq(&mapping->i_pages);
694
- if (page)
695
- break;
696
- }
697
-
698
- /*
699
- * We don't expect normal struct page entries to exist in our
700
- * tree, but we keep these pagevec calls so that this code is
701
- * consistent with the common pattern for handling pagevecs
702
- * throughout the kernel.
703
- */
704
- pagevec_remove_exceptionals(&pvec);
705
- pagevec_release(&pvec);
706
- index += nr_pages;
707
-
629
+ xas_lock_irq(&xas);
630
+ xas_for_each(&xas, entry, end_idx) {
631
+ if (WARN_ON_ONCE(!xa_is_value(entry)))
632
+ continue;
633
+ if (unlikely(dax_is_locked(entry)))
634
+ entry = get_unlocked_entry(&xas, 0);
635
+ if (entry)
636
+ page = dax_busy_page(entry);
637
+ put_unlocked_entry(&xas, entry, WAKE_NEXT);
708638 if (page)
709639 break;
640
+ if (++scanned % XA_CHECK_SCHED)
641
+ continue;
642
+
643
+ xas_pause(&xas);
644
+ xas_unlock_irq(&xas);
645
+ cond_resched();
646
+ xas_lock_irq(&xas);
710647 }
648
+ xas_unlock_irq(&xas);
711649 return page;
650
+}
651
+EXPORT_SYMBOL_GPL(dax_layout_busy_page_range);
652
+
653
+struct page *dax_layout_busy_page(struct address_space *mapping)
654
+{
655
+ return dax_layout_busy_page_range(mapping, 0, LLONG_MAX);
712656 }
713657 EXPORT_SYMBOL_GPL(dax_layout_busy_page);
714658
715
-static int __dax_invalidate_mapping_entry(struct address_space *mapping,
659
+static int __dax_invalidate_entry(struct address_space *mapping,
716660 pgoff_t index, bool trunc)
717661 {
662
+ XA_STATE(xas, &mapping->i_pages, index);
718663 int ret = 0;
719664 void *entry;
720
- struct radix_tree_root *pages = &mapping->i_pages;
721665
722
- xa_lock_irq(pages);
723
- entry = get_unlocked_mapping_entry(mapping, index, NULL);
724
- if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)))
666
+ xas_lock_irq(&xas);
667
+ entry = get_unlocked_entry(&xas, 0);
668
+ if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
725669 goto out;
726670 if (!trunc &&
727
- (radix_tree_tag_get(pages, index, PAGECACHE_TAG_DIRTY) ||
728
- radix_tree_tag_get(pages, index, PAGECACHE_TAG_TOWRITE)))
671
+ (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) ||
672
+ xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE)))
729673 goto out;
730674 dax_disassociate_entry(entry, mapping, trunc);
731
- radix_tree_delete(pages, index);
675
+ xas_store(&xas, NULL);
732676 mapping->nrexceptional--;
733677 ret = 1;
734678 out:
735
- put_unlocked_mapping_entry(mapping, index, entry);
736
- xa_unlock_irq(pages);
679
+ put_unlocked_entry(&xas, entry, WAKE_ALL);
680
+ xas_unlock_irq(&xas);
737681 return ret;
738682 }
683
+
739684 /*
740
- * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
741
- * entry to get unlocked before deleting it.
685
+ * Delete DAX entry at @index from @mapping. Wait for it
686
+ * to be unlocked before deleting it.
742687 */
743688 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
744689 {
745
- int ret = __dax_invalidate_mapping_entry(mapping, index, true);
690
+ int ret = __dax_invalidate_entry(mapping, index, true);
746691
747692 /*
748693 * This gets called from truncate / punch_hole path. As such, the caller
749694 * must hold locks protecting against concurrent modifications of the
750
- * radix tree (usually fs-private i_mmap_sem for writing). Since the
751
- * caller has seen exceptional entry for this index, we better find it
695
+ * page cache (usually fs-private i_mmap_sem for writing). Since the
696
+ * caller has seen a DAX entry for this index, we better find it
752697 * at that index as well...
753698 */
754699 WARN_ON_ONCE(!ret);
....@@ -756,34 +701,38 @@
756701 }
757702
758703 /*
759
- * Invalidate exceptional DAX entry if it is clean.
704
+ * Invalidate DAX entry if it is clean.
760705 */
761706 int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
762707 pgoff_t index)
763708 {
764
- return __dax_invalidate_mapping_entry(mapping, index, false);
709
+ return __dax_invalidate_entry(mapping, index, false);
765710 }
766711
767
-static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
768
- sector_t sector, size_t size, struct page *to,
769
- unsigned long vaddr)
712
+static int copy_cow_page_dax(struct block_device *bdev, struct dax_device *dax_dev,
713
+ sector_t sector, struct page *to, unsigned long vaddr)
770714 {
771715 void *vto, *kaddr;
772716 pgoff_t pgoff;
773717 long rc;
774718 int id;
775719
776
- rc = bdev_dax_pgoff(bdev, sector, size, &pgoff);
720
+ rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff);
777721 if (rc)
778722 return rc;
779723
780724 id = dax_read_lock();
781
- rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, NULL);
725
+ rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(PAGE_SIZE), &kaddr, NULL);
782726 if (rc < 0) {
783727 dax_read_unlock(id);
784728 return rc;
785729 }
786730 vto = kmap_atomic(to);
731
+#ifdef CONFIG_ARM
732
+#ifndef copy_user_page
733
+#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)
734
+#endif
735
+#endif
787736 copy_user_page(vto, (void __force *)kaddr, vaddr, to);
788737 kunmap_atomic(vto);
789738 dax_read_unlock(id);
....@@ -797,64 +746,57 @@
797746 * already in the tree, we will skip the insertion and just dirty the PMD as
798747 * appropriate.
799748 */
800
-static void *dax_insert_mapping_entry(struct address_space *mapping,
801
- struct vm_fault *vmf,
802
- void *entry, pfn_t pfn_t,
803
- unsigned long flags, bool dirty)
749
+static void *dax_insert_entry(struct xa_state *xas,
750
+ struct address_space *mapping, struct vm_fault *vmf,
751
+ void *entry, pfn_t pfn, unsigned long flags, bool dirty)
804752 {
805
- struct radix_tree_root *pages = &mapping->i_pages;
806
- unsigned long pfn = pfn_t_to_pfn(pfn_t);
807
- pgoff_t index = vmf->pgoff;
808
- void *new_entry;
753
+ void *new_entry = dax_make_entry(pfn, flags);
809754
810755 if (dirty)
811756 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
812757
813
- if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) {
758
+ if (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE)) {
759
+ unsigned long index = xas->xa_index;
814760 /* we are replacing a zero page with block mapping */
815761 if (dax_is_pmd_entry(entry))
816762 unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
817
- PG_PMD_NR, false);
763
+ PG_PMD_NR, false);
818764 else /* pte entry */
819
- unmap_mapping_pages(mapping, vmf->pgoff, 1, false);
765
+ unmap_mapping_pages(mapping, index, 1, false);
820766 }
821767
822
- xa_lock_irq(pages);
823
- new_entry = dax_radix_locked_entry(pfn, flags);
824
- if (dax_entry_size(entry) != dax_entry_size(new_entry)) {
768
+ xas_reset(xas);
769
+ xas_lock_irq(xas);
770
+ if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
771
+ void *old;
772
+
825773 dax_disassociate_entry(entry, mapping, false);
826774 dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address);
827
- }
828
-
829
- if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
830775 /*
831
- * Only swap our new entry into the radix tree if the current
776
+ * Only swap our new entry into the page cache if the current
832777 * entry is a zero page or an empty entry. If a normal PTE or
833
- * PMD entry is already in the tree, we leave it alone. This
778
+ * PMD entry is already in the cache, we leave it alone. This
834779 * means that if we are trying to insert a PTE and the
835780 * existing entry is a PMD, we will just leave the PMD in the
836781 * tree and dirty it if necessary.
837782 */
838
- struct radix_tree_node *node;
839
- void **slot;
840
- void *ret;
841
-
842
- ret = __radix_tree_lookup(pages, index, &node, &slot);
843
- WARN_ON_ONCE(ret != entry);
844
- __radix_tree_replace(pages, node, slot,
845
- new_entry, NULL);
783
+ old = dax_lock_entry(xas, new_entry);
784
+ WARN_ON_ONCE(old != xa_mk_value(xa_to_value(entry) |
785
+ DAX_LOCKED));
846786 entry = new_entry;
787
+ } else {
788
+ xas_load(xas); /* Walk the xa_state */
847789 }
848790
849791 if (dirty)
850
- radix_tree_tag_set(pages, index, PAGECACHE_TAG_DIRTY);
792
+ xas_set_mark(xas, PAGECACHE_TAG_DIRTY);
851793
852
- xa_unlock_irq(pages);
794
+ xas_unlock_irq(xas);
853795 return entry;
854796 }
855797
856
-static inline unsigned long
857
-pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
798
+static inline
799
+unsigned long pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
858800 {
859801 unsigned long address;
860802
....@@ -864,8 +806,8 @@
864806 }
865807
866808 /* Walk all mappings of a given index of a file and writeprotect them */
867
-static void dax_mapping_entry_mkclean(struct address_space *mapping,
868
- pgoff_t index, unsigned long pfn)
809
+static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index,
810
+ unsigned long pfn)
869811 {
870812 struct vm_area_struct *vma;
871813 pte_t pte, *ptep = NULL;
....@@ -874,7 +816,8 @@
874816
875817 i_mmap_lock_read(mapping);
876818 vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) {
877
- unsigned long address, start, end;
819
+ struct mmu_notifier_range range;
820
+ unsigned long address;
878821
879822 cond_resched();
880823
....@@ -884,11 +827,12 @@
884827 address = pgoff_address(index, vma);
885828
886829 /*
887
- * Note because we provide start/end to follow_pte_pmd it will
888
- * call mmu_notifier_invalidate_range_start() on our behalf
889
- * before taking any lock.
830
+ * follow_invalidate_pte() will use the range to call
831
+ * mmu_notifier_invalidate_range_start() on our behalf before
832
+ * taking any lock.
890833 */
891
- if (follow_pte_pmd(vma->vm_mm, address, &start, &end, &ptep, &pmdp, &ptl))
834
+ if (follow_invalidate_pte(vma->vm_mm, address, &range, &ptep,
835
+ &pmdp, &ptl))
892836 continue;
893837
894838 /*
....@@ -907,7 +851,8 @@
907851 if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp))
908852 goto unlock_pmd;
909853
910
- flush_cache_page(vma, address, pfn);
854
+ flush_cache_range(vma, address,
855
+ address + HPAGE_PMD_SIZE);
911856 pmd = pmdp_invalidate(vma, address, pmdp);
912857 pmd = pmd_wrprotect(pmd);
913858 pmd = pmd_mkclean(pmd);
....@@ -930,50 +875,53 @@
930875 pte_unmap_unlock(ptep, ptl);
931876 }
932877
933
- mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);
878
+ mmu_notifier_invalidate_range_end(&range);
934879 }
935880 i_mmap_unlock_read(mapping);
936881 }
937882
938
-static int dax_writeback_one(struct dax_device *dax_dev,
939
- struct address_space *mapping, pgoff_t index, void *entry)
883
+static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
884
+ struct address_space *mapping, void *entry)
940885 {
941
- struct radix_tree_root *pages = &mapping->i_pages;
942
- void *entry2, **slot;
943
- unsigned long pfn;
886
+ unsigned long pfn, index, count;
944887 long ret = 0;
945
- size_t size;
946888
947889 /*
948890 * A page got tagged dirty in DAX mapping? Something is seriously
949891 * wrong.
950892 */
951
- if (WARN_ON(!radix_tree_exceptional_entry(entry)))
893
+ if (WARN_ON(!xa_is_value(entry)))
952894 return -EIO;
953895
954
- xa_lock_irq(pages);
955
- entry2 = get_unlocked_mapping_entry(mapping, index, &slot);
956
- /* Entry got punched out / reallocated? */
957
- if (!entry2 || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry2)))
958
- goto put_unlocked;
959
- /*
960
- * Entry got reallocated elsewhere? No need to writeback. We have to
961
- * compare pfns as we must not bail out due to difference in lockbit
962
- * or entry type.
963
- */
964
- if (dax_radix_pfn(entry2) != dax_radix_pfn(entry))
965
- goto put_unlocked;
966
- if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
967
- dax_is_zero_entry(entry))) {
968
- ret = -EIO;
969
- goto put_unlocked;
896
+ if (unlikely(dax_is_locked(entry))) {
897
+ void *old_entry = entry;
898
+
899
+ entry = get_unlocked_entry(xas, 0);
900
+
901
+ /* Entry got punched out / reallocated? */
902
+ if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
903
+ goto put_unlocked;
904
+ /*
905
+ * Entry got reallocated elsewhere? No need to writeback.
906
+ * We have to compare pfns as we must not bail out due to
907
+ * difference in lockbit or entry type.
908
+ */
909
+ if (dax_to_pfn(old_entry) != dax_to_pfn(entry))
910
+ goto put_unlocked;
911
+ if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
912
+ dax_is_zero_entry(entry))) {
913
+ ret = -EIO;
914
+ goto put_unlocked;
915
+ }
916
+
917
+ /* Another fsync thread may have already done this entry */
918
+ if (!xas_get_mark(xas, PAGECACHE_TAG_TOWRITE))
919
+ goto put_unlocked;
970920 }
971921
972
- /* Another fsync thread may have already written back this entry */
973
- if (!radix_tree_tag_get(pages, index, PAGECACHE_TAG_TOWRITE))
974
- goto put_unlocked;
975922 /* Lock the entry to serialize with page faults */
976
- entry = lock_slot(mapping, slot);
923
+ dax_lock_entry(xas, entry);
924
+
977925 /*
978926 * We can clear the tag now but we have to be careful so that concurrent
979927 * dax_writeback_one() calls for the same index cannot finish before we
....@@ -981,37 +929,39 @@
981929 * at the entry only under the i_pages lock and once they do that
982930 * they will see the entry locked and wait for it to unlock.
983931 */
984
- radix_tree_tag_clear(pages, index, PAGECACHE_TAG_TOWRITE);
985
- xa_unlock_irq(pages);
932
+ xas_clear_mark(xas, PAGECACHE_TAG_TOWRITE);
933
+ xas_unlock_irq(xas);
986934
987935 /*
988
- * Even if dax_writeback_mapping_range() was given a wbc->range_start
989
- * in the middle of a PMD, the 'index' we are given will be aligned to
990
- * the start index of the PMD, as will the pfn we pull from 'entry'.
936
+ * If dax_writeback_mapping_range() was given a wbc->range_start
937
+ * in the middle of a PMD, the 'index' we use needs to be
938
+ * aligned to the start of the PMD.
991939 * This allows us to flush for PMD_SIZE and not have to worry about
992940 * partial PMD writebacks.
993941 */
994
- pfn = dax_radix_pfn(entry);
995
- size = PAGE_SIZE << dax_radix_order(entry);
942
+ pfn = dax_to_pfn(entry);
943
+ count = 1UL << dax_entry_order(entry);
944
+ index = xas->xa_index & ~(count - 1);
996945
997
- dax_mapping_entry_mkclean(mapping, index, pfn);
998
- dax_flush(dax_dev, page_address(pfn_to_page(pfn)), size);
946
+ dax_entry_mkclean(mapping, index, pfn);
947
+ dax_flush(dax_dev, page_address(pfn_to_page(pfn)), count * PAGE_SIZE);
999948 /*
1000949 * After we have flushed the cache, we can clear the dirty tag. There
1001950 * cannot be new dirty data in the pfn after the flush has completed as
1002951 * the pfn mappings are writeprotected and fault waits for mapping
1003952 * entry lock.
1004953 */
1005
- xa_lock_irq(pages);
1006
- radix_tree_tag_clear(pages, index, PAGECACHE_TAG_DIRTY);
1007
- xa_unlock_irq(pages);
1008
- trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT);
1009
- put_locked_mapping_entry(mapping, index);
954
+ xas_reset(xas);
955
+ xas_lock_irq(xas);
956
+ xas_store(xas, entry);
957
+ xas_clear_mark(xas, PAGECACHE_TAG_DIRTY);
958
+ dax_wake_entry(xas, entry, WAKE_NEXT);
959
+
960
+ trace_dax_writeback_one(mapping->host, index, count);
1010961 return ret;
1011962
1012963 put_unlocked:
1013
- put_unlocked_mapping_entry(mapping, index, entry2);
1014
- xa_unlock_irq(pages);
964
+ put_unlocked_entry(xas, entry, WAKE_NEXT);
1015965 return ret;
1016966 }
1017967
....@@ -1021,15 +971,14 @@
1021971 * on persistent storage prior to completion of the operation.
1022972 */
1023973 int dax_writeback_mapping_range(struct address_space *mapping,
1024
- struct block_device *bdev, struct writeback_control *wbc)
974
+ struct dax_device *dax_dev, struct writeback_control *wbc)
1025975 {
976
+ XA_STATE(xas, &mapping->i_pages, wbc->range_start >> PAGE_SHIFT);
1026977 struct inode *inode = mapping->host;
1027
- pgoff_t start_index, end_index;
1028
- pgoff_t indices[PAGEVEC_SIZE];
1029
- struct dax_device *dax_dev;
1030
- struct pagevec pvec;
1031
- bool done = false;
1032
- int i, ret = 0;
978
+ pgoff_t end_index = wbc->range_end >> PAGE_SHIFT;
979
+ void *entry;
980
+ int ret = 0;
981
+ unsigned int scanned = 0;
1033982
1034983 if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
1035984 return -EIO;
....@@ -1037,45 +986,28 @@
1037986 if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
1038987 return 0;
1039988
1040
- dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
1041
- if (!dax_dev)
1042
- return -EIO;
989
+ trace_dax_writeback_range(inode, xas.xa_index, end_index);
1043990
1044
- start_index = wbc->range_start >> PAGE_SHIFT;
1045
- end_index = wbc->range_end >> PAGE_SHIFT;
991
+ tag_pages_for_writeback(mapping, xas.xa_index, end_index);
1046992
1047
- trace_dax_writeback_range(inode, start_index, end_index);
1048
-
1049
- tag_pages_for_writeback(mapping, start_index, end_index);
1050
-
1051
- pagevec_init(&pvec);
1052
- while (!done) {
1053
- pvec.nr = find_get_entries_tag(mapping, start_index,
1054
- PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
1055
- pvec.pages, indices);
1056
-
1057
- if (pvec.nr == 0)
993
+ xas_lock_irq(&xas);
994
+ xas_for_each_marked(&xas, entry, end_index, PAGECACHE_TAG_TOWRITE) {
995
+ ret = dax_writeback_one(&xas, dax_dev, mapping, entry);
996
+ if (ret < 0) {
997
+ mapping_set_error(mapping, ret);
1058998 break;
1059
-
1060
- for (i = 0; i < pvec.nr; i++) {
1061
- if (indices[i] > end_index) {
1062
- done = true;
1063
- break;
1064
- }
1065
-
1066
- ret = dax_writeback_one(dax_dev, mapping, indices[i],
1067
- pvec.pages[i]);
1068
- if (ret < 0) {
1069
- mapping_set_error(mapping, ret);
1070
- goto out;
1071
- }
1072999 }
1073
- start_index = indices[pvec.nr - 1] + 1;
1000
+ if (++scanned % XA_CHECK_SCHED)
1001
+ continue;
1002
+
1003
+ xas_pause(&xas);
1004
+ xas_unlock_irq(&xas);
1005
+ cond_resched();
1006
+ xas_lock_irq(&xas);
10741007 }
1075
-out:
1076
- put_dax(dax_dev);
1077
- trace_dax_writeback_range_done(inode, start_index, end_index);
1078
- return (ret < 0 ? ret : 0);
1008
+ xas_unlock_irq(&xas);
1009
+ trace_dax_writeback_range_done(inode, xas.xa_index, end_index);
1010
+ return ret;
10791011 }
10801012 EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
10811013
....@@ -1123,69 +1055,63 @@
11231055 * If this page is ever written to we will re-fault and change the mapping to
11241056 * point to real DAX storage instead.
11251057 */
1126
-static vm_fault_t dax_load_hole(struct address_space *mapping, void *entry,
1127
- struct vm_fault *vmf)
1058
+static vm_fault_t dax_load_hole(struct xa_state *xas,
1059
+ struct address_space *mapping, void **entry,
1060
+ struct vm_fault *vmf)
11281061 {
11291062 struct inode *inode = mapping->host;
11301063 unsigned long vaddr = vmf->address;
11311064 pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
11321065 vm_fault_t ret;
11331066
1134
- dax_insert_mapping_entry(mapping, vmf, entry, pfn, RADIX_DAX_ZERO_PAGE,
1135
- false);
1067
+ *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
1068
+ DAX_ZERO_PAGE, false);
1069
+
11361070 ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
11371071 trace_dax_load_hole(inode, vmf, ret);
11381072 return ret;
11391073 }
11401074
1141
-static bool dax_range_is_aligned(struct block_device *bdev,
1142
- unsigned int offset, unsigned int length)
1075
+s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap)
11431076 {
1144
- unsigned short sector_size = bdev_logical_block_size(bdev);
1077
+ sector_t sector = iomap_sector(iomap, pos & PAGE_MASK);
1078
+ pgoff_t pgoff;
1079
+ long rc, id;
1080
+ void *kaddr;
1081
+ bool page_aligned = false;
1082
+ unsigned offset = offset_in_page(pos);
1083
+ unsigned size = min_t(u64, PAGE_SIZE - offset, length);
11451084
1146
- if (!IS_ALIGNED(offset, sector_size))
1147
- return false;
1148
- if (!IS_ALIGNED(length, sector_size))
1149
- return false;
1085
+ if (IS_ALIGNED(sector << SECTOR_SHIFT, PAGE_SIZE) &&
1086
+ (size == PAGE_SIZE))
1087
+ page_aligned = true;
11501088
1151
- return true;
1152
-}
1089
+ rc = bdev_dax_pgoff(iomap->bdev, sector, PAGE_SIZE, &pgoff);
1090
+ if (rc)
1091
+ return rc;
11531092
1154
-int __dax_zero_page_range(struct block_device *bdev,
1155
- struct dax_device *dax_dev, sector_t sector,
1156
- unsigned int offset, unsigned int size)
1157
-{
1158
- if (dax_range_is_aligned(bdev, offset, size)) {
1159
- sector_t start_sector = sector + (offset >> 9);
1093
+ id = dax_read_lock();
11601094
1161
- return blkdev_issue_zeroout(bdev, start_sector,
1162
- size >> 9, GFP_NOFS, 0);
1163
- } else {
1164
- pgoff_t pgoff;
1165
- long rc, id;
1166
- void *kaddr;
1167
-
1168
- rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff);
1169
- if (rc)
1170
- return rc;
1171
-
1172
- id = dax_read_lock();
1173
- rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL);
1174
- if (rc < 0) {
1175
- dax_read_unlock(id);
1176
- return rc;
1177
- }
1178
- memset(kaddr + offset, 0, size);
1179
- dax_flush(dax_dev, kaddr + offset, size);
1095
+ if (page_aligned)
1096
+ rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
1097
+ else
1098
+ rc = dax_direct_access(iomap->dax_dev, pgoff, 1, &kaddr, NULL);
1099
+ if (rc < 0) {
11801100 dax_read_unlock(id);
1101
+ return rc;
11811102 }
1182
- return 0;
1103
+
1104
+ if (!page_aligned) {
1105
+ memset(kaddr + offset, 0, size);
1106
+ dax_flush(iomap->dax_dev, kaddr + offset, size);
1107
+ }
1108
+ dax_read_unlock(id);
1109
+ return size;
11831110 }
1184
-EXPORT_SYMBOL_GPL(__dax_zero_page_range);
11851111
11861112 static loff_t
11871113 dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
1188
- struct iomap *iomap)
1114
+ struct iomap *iomap, struct iomap *srcmap)
11891115 {
11901116 struct block_device *bdev = iomap->bdev;
11911117 struct dax_device *dax_dev = iomap->dax_dev;
....@@ -1295,7 +1221,7 @@
12951221 unsigned flags = 0;
12961222
12971223 if (iov_iter_rw(iter) == WRITE) {
1298
- lockdep_assert_held_exclusive(&inode->i_rwsem);
1224
+ lockdep_assert_held_write(&inode->i_rwsem);
12991225 flags |= IOMAP_WRITE;
13001226 } else {
13011227 lockdep_assert_held(&inode->i_rwsem);
....@@ -1322,9 +1248,7 @@
13221248 {
13231249 if (error == 0)
13241250 return VM_FAULT_NOPAGE;
1325
- if (error == -ENOMEM)
1326
- return VM_FAULT_OOM;
1327
- return VM_FAULT_SIGBUS;
1251
+ return vmf_error(error);
13281252 }
13291253
13301254 /*
....@@ -1343,10 +1267,12 @@
13431267 {
13441268 struct vm_area_struct *vma = vmf->vma;
13451269 struct address_space *mapping = vma->vm_file->f_mapping;
1270
+ XA_STATE(xas, &mapping->i_pages, vmf->pgoff);
13461271 struct inode *inode = mapping->host;
13471272 unsigned long vaddr = vmf->address;
13481273 loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
1349
- struct iomap iomap = { 0 };
1274
+ struct iomap iomap = { .type = IOMAP_HOLE };
1275
+ struct iomap srcmap = { .type = IOMAP_HOLE };
13501276 unsigned flags = IOMAP_FAULT;
13511277 int error, major = 0;
13521278 bool write = vmf->flags & FAULT_FLAG_WRITE;
....@@ -1369,9 +1295,9 @@
13691295 if (write && !vmf->cow_page)
13701296 flags |= IOMAP_WRITE;
13711297
1372
- entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
1373
- if (IS_ERR(entry)) {
1374
- ret = dax_fault_return(PTR_ERR(entry));
1298
+ entry = grab_mapping_entry(&xas, mapping, 0);
1299
+ if (xa_is_internal(entry)) {
1300
+ ret = xa_to_internal(entry);
13751301 goto out;
13761302 }
13771303
....@@ -1391,7 +1317,7 @@
13911317 * the file system block size to be equal the page size, which means
13921318 * that we never have to deal with more than a single extent here.
13931319 */
1394
- error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
1320
+ error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap, &srcmap);
13951321 if (iomap_errp)
13961322 *iomap_errp = error;
13971323 if (error) {
....@@ -1412,8 +1338,8 @@
14121338 clear_user_highpage(vmf->cow_page, vaddr);
14131339 break;
14141340 case IOMAP_MAPPED:
1415
- error = copy_user_dax(iomap.bdev, iomap.dax_dev,
1416
- sector, PAGE_SIZE, vmf->cow_page, vaddr);
1341
+ error = copy_cow_page_dax(iomap.bdev, iomap.dax_dev,
1342
+ sector, vmf->cow_page, vaddr);
14171343 break;
14181344 default:
14191345 WARN_ON_ONCE(1);
....@@ -1444,7 +1370,7 @@
14441370 if (error < 0)
14451371 goto error_finish_iomap;
14461372
1447
- entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
1373
+ entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn,
14481374 0, write && !sync);
14491375
14501376 /*
....@@ -1472,10 +1398,10 @@
14721398 case IOMAP_UNWRITTEN:
14731399 case IOMAP_HOLE:
14741400 if (!write) {
1475
- ret = dax_load_hole(mapping, entry, vmf);
1401
+ ret = dax_load_hole(&xas, mapping, &entry, vmf);
14761402 goto finish_iomap;
14771403 }
1478
- /*FALLTHRU*/
1404
+ fallthrough;
14791405 default:
14801406 WARN_ON_ONCE(1);
14811407 error = -EIO;
....@@ -1499,21 +1425,22 @@
14991425 ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
15001426 }
15011427 unlock_entry:
1502
- put_locked_mapping_entry(mapping, vmf->pgoff);
1428
+ dax_unlock_entry(&xas, entry);
15031429 out:
15041430 trace_dax_pte_fault_done(inode, vmf, ret);
15051431 return ret | major;
15061432 }
15071433
15081434 #ifdef CONFIG_FS_DAX_PMD
1509
-static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
1510
- void *entry)
1435
+static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
1436
+ struct iomap *iomap, void **entry)
15111437 {
15121438 struct address_space *mapping = vmf->vma->vm_file->f_mapping;
15131439 unsigned long pmd_addr = vmf->address & PMD_MASK;
1440
+ struct vm_area_struct *vma = vmf->vma;
15141441 struct inode *inode = mapping->host;
1442
+ pgtable_t pgtable = NULL;
15151443 struct page *zero_page;
1516
- void *ret = NULL;
15171444 spinlock_t *ptl;
15181445 pmd_t pmd_entry;
15191446 pfn_t pfn;
....@@ -1524,8 +1451,14 @@
15241451 goto fallback;
15251452
15261453 pfn = page_to_pfn_t(zero_page);
1527
- ret = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
1528
- RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false);
1454
+ *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
1455
+ DAX_PMD | DAX_ZERO_PAGE, false);
1456
+
1457
+ if (arch_needs_pgtable_deposit()) {
1458
+ pgtable = pte_alloc_one(vma->vm_mm);
1459
+ if (!pgtable)
1460
+ return VM_FAULT_OOM;
1461
+ }
15291462
15301463 ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
15311464 if (!pmd_none(*(vmf->pmd))) {
....@@ -1533,15 +1466,21 @@
15331466 goto fallback;
15341467 }
15351468
1469
+ if (pgtable) {
1470
+ pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
1471
+ mm_inc_nr_ptes(vma->vm_mm);
1472
+ }
15361473 pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot);
15371474 pmd_entry = pmd_mkhuge(pmd_entry);
15381475 set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
15391476 spin_unlock(ptl);
1540
- trace_dax_pmd_load_hole(inode, vmf, zero_page, ret);
1477
+ trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry);
15411478 return VM_FAULT_NOPAGE;
15421479
15431480 fallback:
1544
- trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, ret);
1481
+ if (pgtable)
1482
+ pte_free(vma->vm_mm, pgtable);
1483
+ trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry);
15451484 return VM_FAULT_FALLBACK;
15461485 }
15471486
....@@ -1550,14 +1489,16 @@
15501489 {
15511490 struct vm_area_struct *vma = vmf->vma;
15521491 struct address_space *mapping = vma->vm_file->f_mapping;
1492
+ XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER);
15531493 unsigned long pmd_addr = vmf->address & PMD_MASK;
15541494 bool write = vmf->flags & FAULT_FLAG_WRITE;
15551495 bool sync;
15561496 unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
15571497 struct inode *inode = mapping->host;
15581498 vm_fault_t result = VM_FAULT_FALLBACK;
1559
- struct iomap iomap = { 0 };
1560
- pgoff_t max_pgoff, pgoff;
1499
+ struct iomap iomap = { .type = IOMAP_HOLE };
1500
+ struct iomap srcmap = { .type = IOMAP_HOLE };
1501
+ pgoff_t max_pgoff;
15611502 void *entry;
15621503 loff_t pos;
15631504 int error;
....@@ -1568,7 +1509,6 @@
15681509 * supposed to hold locks serializing us with truncate / punch hole so
15691510 * this is a reliable test.
15701511 */
1571
- pgoff = linear_page_index(vma, pmd_addr);
15721512 max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
15731513
15741514 trace_dax_pmd_fault(inode, vmf, max_pgoff, 0);
....@@ -1577,7 +1517,7 @@
15771517 * Make sure that the faulting address's PMD offset (color) matches
15781518 * the PMD offset from the start of the file. This is necessary so
15791519 * that a PMD range in the page table overlaps exactly with a PMD
1580
- * range in the radix tree.
1520
+ * range in the page cache.
15811521 */
15821522 if ((vmf->pgoff & PG_PMD_COLOUR) !=
15831523 ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR))
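The colour check above only permits a PMD mapping when the faulting address and the file offset are displaced from a 2MiB boundary by the same number of 4KiB pages. A standalone illustration, assuming x86-64-style PAGE_SHIFT=12 and PMD_SHIFT=21 (both values are assumptions for the demo, not taken from the patch):

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SHIFT	12			/* assumed 4KiB pages */
#define PMD_SHIFT	21			/* assumed 2MiB PMDs */
#define PG_PMD_COLOUR	((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1)

static bool pmd_colour_matches(unsigned long pgoff, unsigned long address)
{
	return (pgoff & PG_PMD_COLOUR) ==
	       ((address >> PAGE_SHIFT) & PG_PMD_COLOUR);
}

int main(void)
{
	/* File offset 0 mapped at a 2MiB-aligned address: PMD is possible. */
	printf("%d\n", pmd_colour_matches(0, 0x200000));
	/* File offset 4KiB at the same address: colours differ, fall back. */
	printf("%d\n", pmd_colour_matches(1, 0x200000));
	return 0;
}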
....@@ -1593,24 +1533,26 @@
15931533 if ((pmd_addr + PMD_SIZE) > vma->vm_end)
15941534 goto fallback;
15951535
1596
- if (pgoff >= max_pgoff) {
1536
+ if (xas.xa_index >= max_pgoff) {
15971537 result = VM_FAULT_SIGBUS;
15981538 goto out;
15991539 }
16001540
16011541 /* If the PMD would extend beyond the file size */
1602
- if ((pgoff | PG_PMD_COLOUR) >= max_pgoff)
1542
+ if ((xas.xa_index | PG_PMD_COLOUR) >= max_pgoff)
16031543 goto fallback;
16041544
16051545 /*
1606
- * grab_mapping_entry() will make sure we get a 2MiB empty entry, a
1607
- * 2MiB zero page entry or a DAX PMD. If it can't (because a 4k page
1608
- * is already in the tree, for instance), it will return -EEXIST and
1609
- * we just fall back to 4k entries.
1546
+ * grab_mapping_entry() will make sure we get an empty PMD entry,
1547
+ * a zero PMD entry or a DAX PMD. If it can't (because a PTE
1548
+ * entry is already in the array, for instance), it will return
1549
+ * VM_FAULT_FALLBACK.
16101550 */
1611
- entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
1612
- if (IS_ERR(entry))
1551
+ entry = grab_mapping_entry(&xas, mapping, PMD_ORDER);
1552
+ if (xa_is_internal(entry)) {
1553
+ result = xa_to_internal(entry);
16131554 goto fallback;
1555
+ }
16141556
16151557 /*
16161558 * It is possible, particularly with mixed reads & writes to private
....@@ -1629,8 +1571,9 @@
16291571 * setting up a mapping, so really we're using iomap_begin() as a way
16301572 * to look up our filesystem block.
16311573 */
1632
- pos = (loff_t)pgoff << PAGE_SHIFT;
1633
- error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
1574
+ pos = (loff_t)xas.xa_index << PAGE_SHIFT;
1575
+ error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap,
1576
+ &srcmap);
16341577 if (error)
16351578 goto unlock_entry;
16361579
....@@ -1645,8 +1588,8 @@
16451588 if (error < 0)
16461589 goto finish_iomap;
16471590
1648
- entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
1649
- RADIX_DAX_PMD, write && !sync);
1591
+ entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn,
1592
+ DAX_PMD, write && !sync);
16501593
16511594 /*
16521595 * If we are doing synchronous page fault and inode needs fsync,
....@@ -1669,7 +1612,7 @@
16691612 case IOMAP_HOLE:
16701613 if (WARN_ON_ONCE(write))
16711614 break;
1672
- result = dax_pmd_load_hole(vmf, &iomap, entry);
1615
+ result = dax_pmd_load_hole(&xas, vmf, &iomap, &entry);
16731616 break;
16741617 default:
16751618 WARN_ON_ONCE(1);
....@@ -1692,7 +1635,7 @@
16921635 &iomap);
16931636 }
16941637 unlock_entry:
1695
- put_locked_mapping_entry(mapping, pgoff);
1638
+ dax_unlock_entry(&xas, entry);
16961639 fallback:
16971640 if (result == VM_FAULT_FALLBACK) {
16981641 split_huge_pmd(vma, vmf->pmd, vmf->address);
....@@ -1737,53 +1680,46 @@
17371680 }
17381681 EXPORT_SYMBOL_GPL(dax_iomap_fault);
17391682
1740
-/**
1683
+/*
17411684 * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables
17421685 * @vmf: The description of the fault
1743
- * @pe_size: Size of entry to be inserted
17441686 * @pfn: PFN to insert
1687
+ * @order: Order of entry to insert.
17451688 *
1746
- * This function inserts writeable PTE or PMD entry into page tables for mmaped
1747
- * DAX file. It takes care of marking corresponding radix tree entry as dirty
1748
- * as well.
1689
+ * This function inserts a writeable PTE or PMD entry into the page tables
1690
+ * for an mmapped DAX file. It also marks the page cache entry as dirty.
17491691 */
1750
-static vm_fault_t dax_insert_pfn_mkwrite(struct vm_fault *vmf,
1751
- enum page_entry_size pe_size,
1752
- pfn_t pfn)
1692
+static vm_fault_t
1693
+dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
17531694 {
17541695 struct address_space *mapping = vmf->vma->vm_file->f_mapping;
1755
- void *entry, **slot;
1756
- pgoff_t index = vmf->pgoff;
1696
+ XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order);
1697
+ void *entry;
17571698 vm_fault_t ret;
17581699
1759
- xa_lock_irq(&mapping->i_pages);
1760
- entry = get_unlocked_mapping_entry(mapping, index, &slot);
1700
+ xas_lock_irq(&xas);
1701
+ entry = get_unlocked_entry(&xas, order);
17611702 /* Did we race with someone splitting entry or so? */
1762
- if (!entry ||
1763
- (pe_size == PE_SIZE_PTE && !dax_is_pte_entry(entry)) ||
1764
- (pe_size == PE_SIZE_PMD && !dax_is_pmd_entry(entry))) {
1765
- put_unlocked_mapping_entry(mapping, index, entry);
1766
- xa_unlock_irq(&mapping->i_pages);
1703
+ if (!entry || dax_is_conflict(entry) ||
1704
+ (order == 0 && !dax_is_pte_entry(entry))) {
1705
+ put_unlocked_entry(&xas, entry, WAKE_NEXT);
1706
+ xas_unlock_irq(&xas);
17671707 trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
17681708 VM_FAULT_NOPAGE);
17691709 return VM_FAULT_NOPAGE;
17701710 }
1771
- radix_tree_tag_set(&mapping->i_pages, index, PAGECACHE_TAG_DIRTY);
1772
- entry = lock_slot(mapping, slot);
1773
- xa_unlock_irq(&mapping->i_pages);
1774
- switch (pe_size) {
1775
- case PE_SIZE_PTE:
1711
+ xas_set_mark(&xas, PAGECACHE_TAG_DIRTY);
1712
+ dax_lock_entry(&xas, entry);
1713
+ xas_unlock_irq(&xas);
1714
+ if (order == 0)
17761715 ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
1777
- break;
17781716 #ifdef CONFIG_FS_DAX_PMD
1779
- case PE_SIZE_PMD:
1717
+ else if (order == PMD_ORDER)
17801718 ret = vmf_insert_pfn_pmd(vmf, pfn, FAULT_FLAG_WRITE);
1781
- break;
17821719 #endif
1783
- default:
1720
+ else
17841721 ret = VM_FAULT_FALLBACK;
1785
- }
1786
- put_locked_mapping_entry(mapping, index);
1722
+ dax_unlock_entry(&xas, entry);
17871723 trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret);
17881724 return ret;
17891725 }
....@@ -1803,17 +1739,12 @@
18031739 {
18041740 int err;
18051741 loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
1806
- size_t len = 0;
1742
+ unsigned int order = pe_order(pe_size);
1743
+ size_t len = PAGE_SIZE << order;
18071744
1808
- if (pe_size == PE_SIZE_PTE)
1809
- len = PAGE_SIZE;
1810
- else if (pe_size == PE_SIZE_PMD)
1811
- len = PMD_SIZE;
1812
- else
1813
- WARN_ON_ONCE(1);
18141745 err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1);
18151746 if (err)
18161747 return VM_FAULT_SIGBUS;
1817
- return dax_insert_pfn_mkwrite(vmf, pe_size, pfn);
1748
+ return dax_insert_pfn_mkwrite(vmf, pfn, order);
18181749 }
18191750 EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
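For context, a hedged sketch of how a filesystem fault handler typically ties dax_iomap_fault() and dax_finish_sync_fault() together, loosely modelled on the ext4/xfs pattern; the "myfs_" names and iomap ops are hypothetical, and real handlers also take filesystem locks around the call:

static vm_fault_t myfs_dax_huge_fault(struct vm_fault *vmf,
		enum page_entry_size pe_size)
{
	pfn_t pfn;
	vm_fault_t ret;

	/* dax_iomap_fault() hands back the mapped pfn for sync faults. */
	ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, &myfs_iomap_ops);
	if (ret & VM_FAULT_NEEDDSYNC) {
		/*
		 * Synchronous fault: metadata must be durable before the
		 * writeable PTE/PMD is installed, which is what
		 * dax_finish_sync_fault() arranges via vfs_fsync_range().
		 */
		ret = dax_finish_sync_fault(vmf, pe_size, pfn);
	}
	return ret;
}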