.. | .. |
---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-only |
---|
1 | 2 | /* |
---|
2 | 3 | * fs/dax.c - Direct Access filesystem code |
---|
3 | 4 | * Copyright (c) 2013-2014 Intel Corporation |
---|
4 | 5 | * Author: Matthew Wilcox <matthew.r.wilcox@intel.com> |
---|
5 | 6 | * Author: Ross Zwisler <ross.zwisler@linux.intel.com> |
---|
6 | | - * |
---|
7 | | - * This program is free software; you can redistribute it and/or modify it |
---|
8 | | - * under the terms and conditions of the GNU General Public License, |
---|
9 | | - * version 2, as published by the Free Software Foundation. |
---|
10 | | - * |
---|
11 | | - * This program is distributed in the hope it will be useful, but WITHOUT |
---|
12 | | - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
---|
13 | | - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for |
---|
14 | | - * more details. |
---|
15 | 7 | */ |
---|
16 | 8 | |
---|
17 | 9 | #include <linux/atomic.h> |
---|
.. | .. |
---|
33 | 25 | #include <linux/sizes.h> |
---|
34 | 26 | #include <linux/mmu_notifier.h> |
---|
35 | 27 | #include <linux/iomap.h> |
---|
36 | | -#include "internal.h" |
---|
| 28 | +#include <asm/pgalloc.h> |
---|
37 | 29 | |
---|
38 | 30 | #define CREATE_TRACE_POINTS |
---|
39 | 31 | #include <trace/events/fs_dax.h> |
---|
| 32 | + |
---|
| 33 | +static inline unsigned int pe_order(enum page_entry_size pe_size) |
---|
| 34 | +{ |
---|
| 35 | + if (pe_size == PE_SIZE_PTE) |
---|
| 36 | + return PAGE_SHIFT - PAGE_SHIFT; |
---|
| 37 | + if (pe_size == PE_SIZE_PMD) |
---|
| 38 | + return PMD_SHIFT - PAGE_SHIFT; |
---|
| 39 | + if (pe_size == PE_SIZE_PUD) |
---|
| 40 | + return PUD_SHIFT - PAGE_SHIFT; |
---|
| 41 | + return ~0; |
---|
| 42 | +} |
---|
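
The hunk above adds pe_order(), which maps a fault granularity to an XArray entry order. A minimal userspace sketch of the same arithmetic (the shift values below are x86-64/4K-page assumptions, not taken from this patch):

```c
/* Standalone model of pe_order(); shift values assume x86-64 with 4K pages. */
#include <stdio.h>

#define PAGE_SHIFT 12   /* 4 KiB base pages (assumption) */
#define PMD_SHIFT  21   /* 2 MiB huge pages (assumption) */
#define PUD_SHIFT  30   /* 1 GiB huge pages (assumption) */

enum page_entry_size { PE_SIZE_PTE, PE_SIZE_PMD, PE_SIZE_PUD };

static unsigned int pe_order(enum page_entry_size pe_size)
{
	if (pe_size == PE_SIZE_PTE)
		return PAGE_SHIFT - PAGE_SHIFT;	/* order 0: one page */
	if (pe_size == PE_SIZE_PMD)
		return PMD_SHIFT - PAGE_SHIFT;	/* order 9: 512 pages */
	if (pe_size == PE_SIZE_PUD)
		return PUD_SHIFT - PAGE_SHIFT;	/* order 18: 262144 pages */
	return ~0;
}

int main(void)
{
	printf("PTE order %u, PMD order %u, PUD order %u\n",
	       pe_order(PE_SIZE_PTE), pe_order(PE_SIZE_PMD),
	       pe_order(PE_SIZE_PUD));
	return 0;
}
```

On such a configuration a PMD entry therefore spans 512 base pages and a PUD entry 262144.
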
40 | 43 | |
---|
41 | 44 | /* We choose 4096 entries - same as per-zone page wait tables */ |
---|
42 | 45 | #define DAX_WAIT_TABLE_BITS 12 |
---|
.. | .. |
---|
45 | 48 | /* The 'colour' (ie low bits) within a PMD of a page offset. */ |
---|
46 | 49 | #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) |
---|
47 | 50 | #define PG_PMD_NR (PMD_SIZE >> PAGE_SHIFT) |
---|
| 51 | + |
---|
| 52 | +/* The order of a PMD entry */ |
---|
| 53 | +#define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT) |
---|
48 | 54 | |
---|
49 | 55 | static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; |
---|
50 | 56 | |
---|
.. | .. |
---|
59 | 65 | fs_initcall(init_dax_wait_table); |
---|
60 | 66 | |
---|
61 | 67 | /* |
---|
62 | | - * We use lowest available bit in exceptional entry for locking, one bit for |
---|
63 | | - * the entry size (PMD) and two more to tell us if the entry is a zero page or |
---|
64 | | - * an empty entry that is just used for locking. In total four special bits. |
---|
| 68 | + * DAX pagecache entries use XArray value entries so they can't be mistaken |
---|
| 69 | + * for pages. We use one bit for locking, one bit for the entry size (PMD) |
---|
| 70 | + * and two more to tell us if the entry is a zero page or an empty entry that |
---|
| 71 | + * is just used for locking. In total four special bits. |
---|
65 | 72 | * |
---|
66 | 73 | * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE |
---|
67 | 74 | * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem |
---|
68 | 75 | * block allocation. |
---|
69 | 76 | */ |
---|
70 | | -#define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 4) |
---|
71 | | -#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT) |
---|
72 | | -#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1)) |
---|
73 | | -#define RADIX_DAX_ZERO_PAGE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2)) |
---|
74 | | -#define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3)) |
---|
| 77 | +#define DAX_SHIFT (4) |
---|
| 78 | +#define DAX_LOCKED (1UL << 0) |
---|
| 79 | +#define DAX_PMD (1UL << 1) |
---|
| 80 | +#define DAX_ZERO_PAGE (1UL << 2) |
---|
| 81 | +#define DAX_EMPTY (1UL << 3) |
---|
75 | 82 | |
---|
76 | | -static unsigned long dax_radix_pfn(void *entry) |
---|
| 83 | +static unsigned long dax_to_pfn(void *entry) |
---|
77 | 84 | { |
---|
78 | | - return (unsigned long)entry >> RADIX_DAX_SHIFT; |
---|
| 85 | + return xa_to_value(entry) >> DAX_SHIFT; |
---|
79 | 86 | } |
---|
80 | 87 | |
---|
81 | | -static void *dax_radix_locked_entry(unsigned long pfn, unsigned long flags) |
---|
| 88 | +static void *dax_make_entry(pfn_t pfn, unsigned long flags) |
---|
82 | 89 | { |
---|
83 | | - return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags | |
---|
84 | | - (pfn << RADIX_DAX_SHIFT) | RADIX_DAX_ENTRY_LOCK); |
---|
| 90 | + return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT)); |
---|
85 | 91 | } |
---|
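
The DAX_* flags now live in the low bits of an XArray value entry instead of sitting above RADIX_TREE_EXCEPTIONAL_SHIFT. A standalone model of the encoding used by dax_make_entry()/dax_to_pfn(); the shift-by-one tagging mirrors xa_mk_value(), and pfn_t is simplified to a plain unsigned long here, so treat it as an illustration rather than the kernel implementation:

```c
/* Userspace model: pack a pfn plus four flag bits into a tagged value entry. */
#include <assert.h>
#include <stdio.h>

#define DAX_SHIFT	(4)
#define DAX_LOCKED	(1UL << 0)
#define DAX_PMD		(1UL << 1)
#define DAX_ZERO_PAGE	(1UL << 2)
#define DAX_EMPTY	(1UL << 3)

/* Value entries are odd pointers: (v << 1) | 1, as xa_mk_value() does. */
static void *xa_mk_value(unsigned long v)     { return (void *)((v << 1) | 1); }
static unsigned long xa_to_value(void *entry) { return (unsigned long)entry >> 1; }

static void *dax_make_entry(unsigned long pfn, unsigned long flags)
{
	return xa_mk_value(flags | (pfn << DAX_SHIFT));
}

static unsigned long dax_to_pfn(void *entry)
{
	return xa_to_value(entry) >> DAX_SHIFT;
}

int main(void)
{
	void *entry = dax_make_entry(0x12345, DAX_PMD);

	assert(dax_to_pfn(entry) == 0x12345);
	assert(xa_to_value(entry) & DAX_PMD);
	printf("pfn %#lx round-trips through the value entry\n", dax_to_pfn(entry));
	return 0;
}
```
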
86 | 92 | |
---|
87 | | -static unsigned int dax_radix_order(void *entry) |
---|
| 93 | +static bool dax_is_locked(void *entry) |
---|
88 | 94 | { |
---|
89 | | - if ((unsigned long)entry & RADIX_DAX_PMD) |
---|
90 | | - return PMD_SHIFT - PAGE_SHIFT; |
---|
| 95 | + return xa_to_value(entry) & DAX_LOCKED; |
---|
| 96 | +} |
---|
| 97 | + |
---|
| 98 | +static unsigned int dax_entry_order(void *entry) |
---|
| 99 | +{ |
---|
| 100 | + if (xa_to_value(entry) & DAX_PMD) |
---|
| 101 | + return PMD_ORDER; |
---|
91 | 102 | return 0; |
---|
92 | 103 | } |
---|
93 | 104 | |
---|
94 | | -static int dax_is_pmd_entry(void *entry) |
---|
| 105 | +static unsigned long dax_is_pmd_entry(void *entry) |
---|
95 | 106 | { |
---|
96 | | - return (unsigned long)entry & RADIX_DAX_PMD; |
---|
| 107 | + return xa_to_value(entry) & DAX_PMD; |
---|
97 | 108 | } |
---|
98 | 109 | |
---|
99 | | -static int dax_is_pte_entry(void *entry) |
---|
| 110 | +static bool dax_is_pte_entry(void *entry) |
---|
100 | 111 | { |
---|
101 | | - return !((unsigned long)entry & RADIX_DAX_PMD); |
---|
| 112 | + return !(xa_to_value(entry) & DAX_PMD); |
---|
102 | 113 | } |
---|
103 | 114 | |
---|
104 | 115 | static int dax_is_zero_entry(void *entry) |
---|
105 | 116 | { |
---|
106 | | - return (unsigned long)entry & RADIX_DAX_ZERO_PAGE; |
---|
| 117 | + return xa_to_value(entry) & DAX_ZERO_PAGE; |
---|
107 | 118 | } |
---|
108 | 119 | |
---|
109 | 120 | static int dax_is_empty_entry(void *entry) |
---|
110 | 121 | { |
---|
111 | | - return (unsigned long)entry & RADIX_DAX_EMPTY; |
---|
| 122 | + return xa_to_value(entry) & DAX_EMPTY; |
---|
112 | 123 | } |
---|
113 | 124 | |
---|
114 | 125 | /* |
---|
115 | | - * DAX radix tree locking |
---|
| 126 | + * true if the entry that was found is of a smaller order than the entry |
---|
| 127 | + * we were looking for |
---|
| 128 | + */ |
---|
| 129 | +static bool dax_is_conflict(void *entry) |
---|
| 130 | +{ |
---|
| 131 | + return entry == XA_RETRY_ENTRY; |
---|
| 132 | +} |
---|
| 133 | + |
---|
| 134 | +/* |
---|
| 135 | + * DAX page cache entry locking |
---|
116 | 136 | */ |
---|
117 | 137 | struct exceptional_entry_key { |
---|
118 | | - struct address_space *mapping; |
---|
| 138 | + struct xarray *xa; |
---|
119 | 139 | pgoff_t entry_start; |
---|
120 | 140 | }; |
---|
121 | 141 | |
---|
.. | .. |
---|
124 | 144 | struct exceptional_entry_key key; |
---|
125 | 145 | }; |
---|
126 | 146 | |
---|
127 | | -static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping, |
---|
128 | | - pgoff_t index, void *entry, struct exceptional_entry_key *key) |
---|
| 147 | +/** |
---|
| 148 | + * enum dax_wake_mode: waitqueue wakeup behaviour |
---|
| 149 | + * @WAKE_ALL: wake all waiters in the waitqueue |
---|
| 150 | + * @WAKE_NEXT: wake only the first waiter in the waitqueue |
---|
| 151 | + */ |
---|
| 152 | +enum dax_wake_mode { |
---|
| 153 | + WAKE_ALL, |
---|
| 154 | + WAKE_NEXT, |
---|
| 155 | +}; |
---|
| 156 | + |
---|
| 157 | +static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas, |
---|
| 158 | + void *entry, struct exceptional_entry_key *key) |
---|
129 | 159 | { |
---|
130 | 160 | unsigned long hash; |
---|
| 161 | + unsigned long index = xas->xa_index; |
---|
131 | 162 | |
---|
132 | 163 | /* |
---|
133 | 164 | * If 'entry' is a PMD, align the 'index' that we use for the wait |
---|
.. | .. |
---|
136 | 167 | */ |
---|
137 | 168 | if (dax_is_pmd_entry(entry)) |
---|
138 | 169 | index &= ~PG_PMD_COLOUR; |
---|
139 | | - |
---|
140 | | - key->mapping = mapping; |
---|
| 170 | + key->xa = xas->xa; |
---|
141 | 171 | key->entry_start = index; |
---|
142 | 172 | |
---|
143 | | - hash = hash_long((unsigned long)mapping ^ index, DAX_WAIT_TABLE_BITS); |
---|
| 173 | + hash = hash_long((unsigned long)xas->xa ^ index, DAX_WAIT_TABLE_BITS); |
---|
144 | 174 | return wait_table + hash; |
---|
145 | 175 | } |
---|
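
dax_entry_waitqueue() now keys the hash on the struct xarray pointer rather than the mapping, but the bucketing idea is unchanged: PMD entries fold their index down to the PMD boundary so all waiters on the same entry share a queue. A toy model (hash_long() is replaced by a simple multiplicative mix, and the 512-page PMD colour mask assumes 4K pages):

```c
#include <stdint.h>
#include <stdio.h>

#define DAX_WAIT_TABLE_BITS	12
#define PG_PMD_COLOUR		511UL	/* 2 MiB / 4 KiB - 1 (assumption) */

/* hash_long() stand-in: any decent mix into DAX_WAIT_TABLE_BITS bits will do */
static unsigned long toy_hash(unsigned long long v)
{
	return (unsigned long)((v * 0x9E3779B97F4A7C15ULL) >>
			       (64 - DAX_WAIT_TABLE_BITS));
}

static unsigned long bucket(const void *xa, unsigned long index, int pmd_entry)
{
	if (pmd_entry)		/* fold PMD entries onto their first page */
		index &= ~PG_PMD_COLOUR;
	return toy_hash((uintptr_t)xa ^ index);
}

int main(void)
{
	int xa;	/* stands in for &mapping->i_pages */

	/*
	 * A waiter that looked up index 0x205 but found a PMD entry uses the
	 * PMD start, so it lands in the same bucket as a waiter at 0x200.
	 */
	printf("pmd entry at 0x205 -> bucket %lu\n", bucket(&xa, 0x205, 1));
	printf("entry at 0x200     -> bucket %lu\n", bucket(&xa, 0x200, 0));
	return 0;
}
```
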
146 | 176 | |
---|
147 | | -static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mode, |
---|
148 | | - int sync, void *keyp) |
---|
| 177 | +static int wake_exceptional_entry_func(wait_queue_entry_t *wait, |
---|
| 178 | + unsigned int mode, int sync, void *keyp) |
---|
149 | 179 | { |
---|
150 | 180 | struct exceptional_entry_key *key = keyp; |
---|
151 | 181 | struct wait_exceptional_entry_queue *ewait = |
---|
152 | 182 | container_of(wait, struct wait_exceptional_entry_queue, wait); |
---|
153 | 183 | |
---|
154 | | - if (key->mapping != ewait->key.mapping || |
---|
| 184 | + if (key->xa != ewait->key.xa || |
---|
155 | 185 | key->entry_start != ewait->key.entry_start) |
---|
156 | 186 | return 0; |
---|
157 | 187 | return autoremove_wake_function(wait, mode, sync, NULL); |
---|
.. | .. |
---|
162 | 192 | * The important information it's conveying is whether the entry at |
---|
163 | 193 | * this index used to be a PMD entry. |
---|
164 | 194 | */ |
---|
165 | | -static void dax_wake_mapping_entry_waiter(struct address_space *mapping, |
---|
166 | | - pgoff_t index, void *entry, bool wake_all) |
---|
| 195 | +static void dax_wake_entry(struct xa_state *xas, void *entry, |
---|
| 196 | + enum dax_wake_mode mode) |
---|
167 | 197 | { |
---|
168 | 198 | struct exceptional_entry_key key; |
---|
169 | 199 | wait_queue_head_t *wq; |
---|
170 | 200 | |
---|
171 | | - wq = dax_entry_waitqueue(mapping, index, entry, &key); |
---|
| 201 | + wq = dax_entry_waitqueue(xas, entry, &key); |
---|
172 | 202 | |
---|
173 | 203 | /* |
---|
174 | 204 | * Checking for locked entry and prepare_to_wait_exclusive() happens |
---|
.. | .. |
---|
177 | 207 | * must be in the waitqueue and the following check will see them. |
---|
178 | 208 | */ |
---|
179 | 209 | if (waitqueue_active(wq)) |
---|
180 | | - __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key); |
---|
| 210 | + __wake_up(wq, TASK_NORMAL, mode == WAKE_ALL ? 0 : 1, &key); |
---|
181 | 211 | } |
---|
182 | 212 | |
---|
183 | 213 | /* |
---|
184 | | - * Check whether the given slot is locked. Must be called with the i_pages |
---|
185 | | - * lock held. |
---|
186 | | - */ |
---|
187 | | -static inline int slot_locked(struct address_space *mapping, void **slot) |
---|
188 | | -{ |
---|
189 | | - unsigned long entry = (unsigned long) |
---|
190 | | - radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock); |
---|
191 | | - return entry & RADIX_DAX_ENTRY_LOCK; |
---|
192 | | -} |
---|
193 | | - |
---|
194 | | -/* |
---|
195 | | - * Mark the given slot as locked. Must be called with the i_pages lock held. |
---|
196 | | - */ |
---|
197 | | -static inline void *lock_slot(struct address_space *mapping, void **slot) |
---|
198 | | -{ |
---|
199 | | - unsigned long entry = (unsigned long) |
---|
200 | | - radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock); |
---|
201 | | - |
---|
202 | | - entry |= RADIX_DAX_ENTRY_LOCK; |
---|
203 | | - radix_tree_replace_slot(&mapping->i_pages, slot, (void *)entry); |
---|
204 | | - return (void *)entry; |
---|
205 | | -} |
---|
206 | | - |
---|
207 | | -/* |
---|
208 | | - * Mark the given slot as unlocked. Must be called with the i_pages lock held. |
---|
209 | | - */ |
---|
210 | | -static inline void *unlock_slot(struct address_space *mapping, void **slot) |
---|
211 | | -{ |
---|
212 | | - unsigned long entry = (unsigned long) |
---|
213 | | - radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock); |
---|
214 | | - |
---|
215 | | - entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK; |
---|
216 | | - radix_tree_replace_slot(&mapping->i_pages, slot, (void *)entry); |
---|
217 | | - return (void *)entry; |
---|
218 | | -} |
---|
219 | | - |
---|
220 | | -static void put_unlocked_mapping_entry(struct address_space *mapping, |
---|
221 | | - pgoff_t index, void *entry); |
---|
222 | | - |
---|
223 | | -/* |
---|
224 | | - * Lookup entry in radix tree, wait for it to become unlocked if it is |
---|
225 | | - * exceptional entry and return it. The caller must call |
---|
226 | | - * put_unlocked_mapping_entry() when he decided not to lock the entry or |
---|
227 | | - * put_locked_mapping_entry() when he locked the entry and now wants to |
---|
228 | | - * unlock it. |
---|
| 214 | + * Look up entry in page cache, wait for it to become unlocked if it |
---|
| 215 | + * is a DAX entry and return it. The caller must subsequently call |
---|
| 216 | + * put_unlocked_entry() if it did not lock the entry or dax_unlock_entry() |
---|
| 217 | + * if it did. The entry returned may have a larger order than @order. |
---|
| 218 | + * If @order is larger than the order of the entry found in i_pages, this |
---|
| 219 | + * function returns a dax_is_conflict entry. |
---|
229 | 220 | * |
---|
230 | 221 | * Must be called with the i_pages lock held. |
---|
231 | 222 | */ |
---|
232 | | -static void *get_unlocked_mapping_entry(struct address_space *mapping, |
---|
233 | | - pgoff_t index, void ***slotp) |
---|
| 223 | +static void *get_unlocked_entry(struct xa_state *xas, unsigned int order) |
---|
234 | 224 | { |
---|
235 | | - void *entry, **slot; |
---|
| 225 | + void *entry; |
---|
236 | 226 | struct wait_exceptional_entry_queue ewait; |
---|
237 | 227 | wait_queue_head_t *wq; |
---|
238 | 228 | |
---|
.. | .. |
---|
240 | 230 | ewait.wait.func = wake_exceptional_entry_func; |
---|
241 | 231 | |
---|
242 | 232 | for (;;) { |
---|
243 | | - entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, |
---|
244 | | - &slot); |
---|
245 | | - if (!entry || |
---|
246 | | - WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)) || |
---|
247 | | - !slot_locked(mapping, slot)) { |
---|
248 | | - if (slotp) |
---|
249 | | - *slotp = slot; |
---|
| 233 | + entry = xas_find_conflict(xas); |
---|
| 234 | + if (!entry || WARN_ON_ONCE(!xa_is_value(entry))) |
---|
250 | 235 | return entry; |
---|
251 | | - } |
---|
| 236 | + if (dax_entry_order(entry) < order) |
---|
| 237 | + return XA_RETRY_ENTRY; |
---|
| 238 | + if (!dax_is_locked(entry)) |
---|
| 239 | + return entry; |
---|
252 | 240 | |
---|
253 | | - wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key); |
---|
| 241 | + wq = dax_entry_waitqueue(xas, entry, &ewait.key); |
---|
254 | 242 | prepare_to_wait_exclusive(wq, &ewait.wait, |
---|
255 | 243 | TASK_UNINTERRUPTIBLE); |
---|
256 | | - xa_unlock_irq(&mapping->i_pages); |
---|
| 244 | + xas_unlock_irq(xas); |
---|
| 245 | + xas_reset(xas); |
---|
257 | 246 | schedule(); |
---|
258 | 247 | finish_wait(wq, &ewait.wait); |
---|
259 | | - xa_lock_irq(&mapping->i_pages); |
---|
| 248 | + xas_lock_irq(xas); |
---|
260 | 249 | } |
---|
261 | 250 | } |
---|
262 | 251 | |
---|
.. | .. |
---|
265 | 254 | * (it's cycled in clear_inode() after removing the entries from i_pages) |
---|
266 | 255 | * After we call xas_unlock_irq(), we cannot touch xas->xa. |
---|
267 | 256 | */ |
---|
268 | | -static void wait_entry_unlocked(struct address_space *mapping, pgoff_t index, |
---|
269 | | - void ***slotp, void *entry) |
---|
| 257 | +static void wait_entry_unlocked(struct xa_state *xas, void *entry) |
---|
270 | 258 | { |
---|
271 | 259 | struct wait_exceptional_entry_queue ewait; |
---|
272 | 260 | wait_queue_head_t *wq; |
---|
.. | .. |
---|
274 | 262 | init_wait(&ewait.wait); |
---|
275 | 263 | ewait.wait.func = wake_exceptional_entry_func; |
---|
276 | 264 | |
---|
277 | | - wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key); |
---|
| 265 | + wq = dax_entry_waitqueue(xas, entry, &ewait.key); |
---|
278 | 266 | /* |
---|
279 | 267 | * Unlike get_unlocked_entry() there is no guarantee that this |
---|
280 | 268 | * path ever successfully retrieves an unlocked entry before an |
---|
.. | .. |
---|
282 | 270 | * never successfully performs its own wake up. |
---|
283 | 271 | */ |
---|
284 | 272 | prepare_to_wait(wq, &ewait.wait, TASK_UNINTERRUPTIBLE); |
---|
285 | | - xa_unlock_irq(&mapping->i_pages); |
---|
| 273 | + xas_unlock_irq(xas); |
---|
286 | 274 | schedule(); |
---|
287 | 275 | finish_wait(wq, &ewait.wait); |
---|
288 | 276 | } |
---|
289 | 277 | |
---|
290 | | -static void unlock_mapping_entry(struct address_space *mapping, pgoff_t index) |
---|
| 278 | +static void put_unlocked_entry(struct xa_state *xas, void *entry, |
---|
| 279 | + enum dax_wake_mode mode) |
---|
291 | 280 | { |
---|
292 | | - void *entry, **slot; |
---|
293 | | - |
---|
294 | | - xa_lock_irq(&mapping->i_pages); |
---|
295 | | - entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, &slot); |
---|
296 | | - if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) || |
---|
297 | | - !slot_locked(mapping, slot))) { |
---|
298 | | - xa_unlock_irq(&mapping->i_pages); |
---|
299 | | - return; |
---|
300 | | - } |
---|
301 | | - unlock_slot(mapping, slot); |
---|
302 | | - xa_unlock_irq(&mapping->i_pages); |
---|
303 | | - dax_wake_mapping_entry_waiter(mapping, index, entry, false); |
---|
304 | | -} |
---|
305 | | - |
---|
306 | | -static void put_locked_mapping_entry(struct address_space *mapping, |
---|
307 | | - pgoff_t index) |
---|
308 | | -{ |
---|
309 | | - unlock_mapping_entry(mapping, index); |
---|
| 281 | + if (entry && !dax_is_conflict(entry)) |
---|
| 282 | + dax_wake_entry(xas, entry, mode); |
---|
310 | 283 | } |
---|
311 | 284 | |
---|
312 | 285 | /* |
---|
313 | | - * Called when we are done with radix tree entry we looked up via |
---|
314 | | - * get_unlocked_mapping_entry() and which we didn't lock in the end. |
---|
| 286 | + * We used the xa_state to get the entry, but then we locked the entry and |
---|
| 287 | + * dropped the xa_lock, so we know the xa_state is stale and must be reset |
---|
| 288 | + * before use. |
---|
315 | 289 | */ |
---|
316 | | -static void put_unlocked_mapping_entry(struct address_space *mapping, |
---|
317 | | - pgoff_t index, void *entry) |
---|
| 290 | +static void dax_unlock_entry(struct xa_state *xas, void *entry) |
---|
318 | 291 | { |
---|
319 | | - if (!entry) |
---|
320 | | - return; |
---|
| 292 | + void *old; |
---|
321 | 293 | |
---|
322 | | - /* We have to wake up next waiter for the radix tree entry lock */ |
---|
323 | | - dax_wake_mapping_entry_waiter(mapping, index, entry, false); |
---|
| 294 | + BUG_ON(dax_is_locked(entry)); |
---|
| 295 | + xas_reset(xas); |
---|
| 296 | + xas_lock_irq(xas); |
---|
| 297 | + old = xas_store(xas, entry); |
---|
| 298 | + xas_unlock_irq(xas); |
---|
| 299 | + BUG_ON(!dax_is_locked(old)); |
---|
| 300 | + dax_wake_entry(xas, entry, WAKE_NEXT); |
---|
| 301 | +} |
---|
| 302 | + |
---|
| 303 | +/* |
---|
| 304 | + * Return: The entry stored at this location before it was locked. |
---|
| 305 | + */ |
---|
| 306 | +static void *dax_lock_entry(struct xa_state *xas, void *entry) |
---|
| 307 | +{ |
---|
| 308 | + unsigned long v = xa_to_value(entry); |
---|
| 309 | + return xas_store(xas, xa_mk_value(v | DAX_LOCKED)); |
---|
324 | 310 | } |
---|
325 | 311 | |
---|
326 | 312 | static unsigned long dax_entry_size(void *entry) |
---|
.. | .. |
---|
335 | 321 | return PAGE_SIZE; |
---|
336 | 322 | } |
---|
337 | 323 | |
---|
338 | | -static unsigned long dax_radix_end_pfn(void *entry) |
---|
| 324 | +static unsigned long dax_end_pfn(void *entry) |
---|
339 | 325 | { |
---|
340 | | - return dax_radix_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE; |
---|
| 326 | + return dax_to_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE; |
---|
341 | 327 | } |
---|
342 | 328 | |
---|
343 | 329 | /* |
---|
.. | .. |
---|
345 | 331 | * 'empty' and 'zero' entries. |
---|
346 | 332 | */ |
---|
347 | 333 | #define for_each_mapped_pfn(entry, pfn) \ |
---|
348 | | - for (pfn = dax_radix_pfn(entry); \ |
---|
349 | | - pfn < dax_radix_end_pfn(entry); pfn++) |
---|
| 334 | + for (pfn = dax_to_pfn(entry); \ |
---|
| 335 | + pfn < dax_end_pfn(entry); pfn++) |
---|
350 | 336 | |
---|
351 | 337 | /* |
---|
352 | 338 | * TODO: for reflink+dax we need a way to associate a single page with |
---|
.. | .. |
---|
403 | 389 | return NULL; |
---|
404 | 390 | } |
---|
405 | 391 | |
---|
406 | | -bool dax_lock_mapping_entry(struct page *page) |
---|
| 392 | +/* |
---|
| 393 | + * dax_lock_mapping_entry - Lock the DAX entry corresponding to a page |
---|
| 394 | + * @page: The page whose entry we want to lock |
---|
| 395 | + * |
---|
| 396 | + * Context: Process context. |
---|
| 397 | + * Return: A cookie to pass to dax_unlock_page() or 0 if the entry could |
---|
| 398 | + * not be locked. |
---|
| 399 | + */ |
---|
| 400 | +dax_entry_t dax_lock_page(struct page *page) |
---|
407 | 401 | { |
---|
408 | | - pgoff_t index; |
---|
409 | | - struct inode *inode; |
---|
410 | | - bool did_lock = false; |
---|
411 | | - void *entry = NULL, **slot; |
---|
412 | | - struct address_space *mapping; |
---|
| 402 | + XA_STATE(xas, NULL, 0); |
---|
| 403 | + void *entry; |
---|
413 | 404 | |
---|
| 405 | + /* Ensure page->mapping isn't freed while we look at it */ |
---|
414 | 406 | rcu_read_lock(); |
---|
415 | 407 | for (;;) { |
---|
416 | | - mapping = READ_ONCE(page->mapping); |
---|
| 408 | + struct address_space *mapping = READ_ONCE(page->mapping); |
---|
417 | 409 | |
---|
| 410 | + entry = NULL; |
---|
418 | 411 | if (!mapping || !dax_mapping(mapping)) |
---|
419 | 412 | break; |
---|
420 | 413 | |
---|
.. | .. |
---|
425 | 418 | * otherwise we would not have a valid pfn_to_page() |
---|
426 | 419 | * translation. |
---|
427 | 420 | */ |
---|
428 | | - inode = mapping->host; |
---|
429 | | - if (S_ISCHR(inode->i_mode)) { |
---|
430 | | - did_lock = true; |
---|
| 421 | + entry = (void *)~0UL; |
---|
| 422 | + if (S_ISCHR(mapping->host->i_mode)) |
---|
431 | 423 | break; |
---|
432 | | - } |
---|
433 | 424 | |
---|
434 | | - xa_lock_irq(&mapping->i_pages); |
---|
| 425 | + xas.xa = &mapping->i_pages; |
---|
| 426 | + xas_lock_irq(&xas); |
---|
435 | 427 | if (mapping != page->mapping) { |
---|
436 | | - xa_unlock_irq(&mapping->i_pages); |
---|
| 428 | + xas_unlock_irq(&xas); |
---|
437 | 429 | continue; |
---|
438 | 430 | } |
---|
439 | | - index = page->index; |
---|
440 | | - |
---|
441 | | - entry = __radix_tree_lookup(&mapping->i_pages, index, |
---|
442 | | - NULL, &slot); |
---|
443 | | - if (!entry) { |
---|
444 | | - xa_unlock_irq(&mapping->i_pages); |
---|
445 | | - break; |
---|
446 | | - } else if (slot_locked(mapping, slot)) { |
---|
| 431 | + xas_set(&xas, page->index); |
---|
| 432 | + entry = xas_load(&xas); |
---|
| 433 | + if (dax_is_locked(entry)) { |
---|
447 | 434 | rcu_read_unlock(); |
---|
448 | | - wait_entry_unlocked(mapping, index, &slot, entry); |
---|
| 435 | + wait_entry_unlocked(&xas, entry); |
---|
449 | 436 | rcu_read_lock(); |
---|
450 | 437 | continue; |
---|
451 | 438 | } |
---|
452 | | - lock_slot(mapping, slot); |
---|
453 | | - did_lock = true; |
---|
454 | | - xa_unlock_irq(&mapping->i_pages); |
---|
| 439 | + dax_lock_entry(&xas, entry); |
---|
| 440 | + xas_unlock_irq(&xas); |
---|
455 | 441 | break; |
---|
456 | 442 | } |
---|
457 | 443 | rcu_read_unlock(); |
---|
458 | | - |
---|
459 | | - return did_lock; |
---|
| 444 | + return (dax_entry_t)entry; |
---|
460 | 445 | } |
---|
461 | 446 | |
---|
462 | | -void dax_unlock_mapping_entry(struct page *page) |
---|
| 447 | +void dax_unlock_page(struct page *page, dax_entry_t cookie) |
---|
463 | 448 | { |
---|
464 | 449 | struct address_space *mapping = page->mapping; |
---|
465 | | - struct inode *inode = mapping->host; |
---|
| 450 | + XA_STATE(xas, &mapping->i_pages, page->index); |
---|
466 | 451 | |
---|
467 | | - if (S_ISCHR(inode->i_mode)) |
---|
| 452 | + if (S_ISCHR(mapping->host->i_mode)) |
---|
468 | 453 | return; |
---|
469 | 454 | |
---|
470 | | - unlock_mapping_entry(mapping, page->index); |
---|
| 455 | + dax_unlock_entry(&xas, (void *)cookie); |
---|
471 | 456 | } |
---|
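
dax_lock_page() now hands back the entry it locked as an opaque dax_entry_t cookie (0 when the entry could not be locked, ~0UL for the device-DAX S_ISCHR case) instead of a bool, and dax_unlock_page() takes that cookie back. A rough sketch of how a caller distinguishes the cases; the numeric cookie in the example is made up:

```c
#include <stdio.h>

typedef unsigned long dax_entry_t;

static void handle(dax_entry_t cookie)
{
	if (!cookie) {
		printf("entry went away, retry or give up\n");
	} else if (cookie == ~0UL) {
		printf("device DAX: no page cache entry to unlock\n");
	} else {
		printf("locked entry %#lx, pass it back to dax_unlock_page()\n",
		       cookie);
	}
}

int main(void)
{
	handle(0);
	handle(~0UL);
	handle((dax_entry_t)((0x12345UL << 5) | 1));	/* illustrative cookie only */
	return 0;
}
```
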
472 | 457 | |
---|
473 | 458 | /* |
---|
474 | | - * Find radix tree entry at given index. If it points to an exceptional entry, |
---|
475 | | - * return it with the radix tree entry locked. If the radix tree doesn't |
---|
476 | | - * contain given index, create an empty exceptional entry for the index and |
---|
477 | | - * return with it locked. |
---|
| 459 | + * Find page cache entry at given index. If it is a DAX entry, return it |
---|
| 460 | + * with the entry locked. If the page cache doesn't contain an entry at |
---|
| 461 | + * that index, add a locked empty entry. |
---|
478 | 462 | * |
---|
479 | | - * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will |
---|
480 | | - * either return that locked entry or will return an error. This error will |
---|
481 | | - * happen if there are any 4k entries within the 2MiB range that we are |
---|
482 | | - * requesting. |
---|
| 463 | + * When requesting an entry with size DAX_PMD, grab_mapping_entry() will |
---|
| 464 | + * either return that locked entry or will return VM_FAULT_FALLBACK. |
---|
| 465 | + * This will happen if there are any PTE entries within the PMD range |
---|
| 466 | + * that we are requesting. |
---|
483 | 467 | * |
---|
484 | | - * We always favor 4k entries over 2MiB entries. There isn't a flow where we |
---|
485 | | - * evict 4k entries in order to 'upgrade' them to a 2MiB entry. A 2MiB |
---|
486 | | - * insertion will fail if it finds any 4k entries already in the tree, and a |
---|
487 | | - * 4k insertion will cause an existing 2MiB entry to be unmapped and |
---|
488 | | - * downgraded to 4k entries. This happens for both 2MiB huge zero pages as |
---|
489 | | - * well as 2MiB empty entries. |
---|
| 468 | + * We always favor PTE entries over PMD entries. There isn't a flow where we |
---|
| 469 | + * evict PTE entries in order to 'upgrade' them to a PMD entry. A PMD |
---|
| 470 | + * insertion will fail if it finds any PTE entries already in the tree, and a |
---|
| 471 | + * PTE insertion will cause an existing PMD entry to be unmapped and |
---|
| 472 | + * downgraded to PTE entries. This happens for both PMD zero pages as |
---|
| 473 | + * well as PMD empty entries. |
---|
490 | 474 | * |
---|
491 | | - * The exception to this downgrade path is for 2MiB DAX PMD entries that have |
---|
492 | | - * real storage backing them. We will leave these real 2MiB DAX entries in |
---|
493 | | - * the tree, and PTE writes will simply dirty the entire 2MiB DAX entry. |
---|
| 475 | + * The exception to this downgrade path is for PMD entries that have |
---|
| 476 | + * real storage backing them. We will leave these real PMD entries in |
---|
| 477 | + * the tree, and PTE writes will simply dirty the entire PMD entry. |
---|
494 | 478 | * |
---|
495 | 479 | * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For |
---|
496 | 480 | * persistent memory the benefit is doubtful. We can add that later if we can |
---|
497 | 481 | * show it helps. |
---|
| 482 | + * |
---|
| 483 | + * On error, this function does not return an ERR_PTR. Instead it returns |
---|
| 484 | + * a VM_FAULT code, encoded as an xarray internal entry. The ERR_PTR values |
---|
| 485 | + * overlap with xarray value entries. |
---|
498 | 486 | */ |
---|
499 | | -static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index, |
---|
500 | | - unsigned long size_flag) |
---|
| 487 | +static void *grab_mapping_entry(struct xa_state *xas, |
---|
| 488 | + struct address_space *mapping, unsigned int order) |
---|
501 | 489 | { |
---|
502 | | - bool pmd_downgrade = false; /* splitting 2MiB entry into 4k entries? */ |
---|
503 | | - void *entry, **slot; |
---|
| 490 | + unsigned long index = xas->xa_index; |
---|
| 491 | + bool pmd_downgrade; /* splitting PMD entry into PTE entries? */ |
---|
| 492 | + void *entry; |
---|
504 | 493 | |
---|
505 | | -restart: |
---|
506 | | - xa_lock_irq(&mapping->i_pages); |
---|
507 | | - entry = get_unlocked_mapping_entry(mapping, index, &slot); |
---|
508 | | - |
---|
509 | | - if (WARN_ON_ONCE(entry && !radix_tree_exceptional_entry(entry))) { |
---|
510 | | - entry = ERR_PTR(-EIO); |
---|
511 | | - goto out_unlock; |
---|
512 | | - } |
---|
| 494 | +retry: |
---|
| 495 | + pmd_downgrade = false; |
---|
| 496 | + xas_lock_irq(xas); |
---|
| 497 | + entry = get_unlocked_entry(xas, order); |
---|
513 | 498 | |
---|
514 | 499 | if (entry) { |
---|
515 | | - if (size_flag & RADIX_DAX_PMD) { |
---|
516 | | - if (dax_is_pte_entry(entry)) { |
---|
517 | | - put_unlocked_mapping_entry(mapping, index, |
---|
518 | | - entry); |
---|
519 | | - entry = ERR_PTR(-EEXIST); |
---|
520 | | - goto out_unlock; |
---|
521 | | - } |
---|
522 | | - } else { /* trying to grab a PTE entry */ |
---|
| 500 | + if (dax_is_conflict(entry)) |
---|
| 501 | + goto fallback; |
---|
| 502 | + if (!xa_is_value(entry)) { |
---|
| 503 | + xas_set_err(xas, -EIO); |
---|
| 504 | + goto out_unlock; |
---|
| 505 | + } |
---|
| 506 | + |
---|
| 507 | + if (order == 0) { |
---|
523 | 508 | if (dax_is_pmd_entry(entry) && |
---|
524 | 509 | (dax_is_zero_entry(entry) || |
---|
525 | 510 | dax_is_empty_entry(entry))) { |
---|
.. | .. |
---|
528 | 513 | } |
---|
529 | 514 | } |
---|
530 | 515 | |
---|
531 | | - /* No entry for given index? Make sure radix tree is big enough. */ |
---|
532 | | - if (!entry || pmd_downgrade) { |
---|
533 | | - int err; |
---|
| 516 | + if (pmd_downgrade) { |
---|
| 517 | + /* |
---|
| 518 | + * Make sure 'entry' remains valid while we drop |
---|
| 519 | + * the i_pages lock. |
---|
| 520 | + */ |
---|
| 521 | + dax_lock_entry(xas, entry); |
---|
534 | 522 | |
---|
535 | | - if (pmd_downgrade) { |
---|
536 | | - /* |
---|
537 | | - * Make sure 'entry' remains valid while we drop |
---|
538 | | - * the i_pages lock. |
---|
539 | | - */ |
---|
540 | | - entry = lock_slot(mapping, slot); |
---|
541 | | - } |
---|
542 | | - |
---|
543 | | - xa_unlock_irq(&mapping->i_pages); |
---|
544 | 523 | /* |
---|
545 | 524 | * Besides huge zero pages the only other thing that gets |
---|
546 | 525 | * downgraded are empty entries which don't need to be |
---|
547 | 526 | * unmapped. |
---|
548 | 527 | */ |
---|
549 | | - if (pmd_downgrade && dax_is_zero_entry(entry)) |
---|
550 | | - unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR, |
---|
551 | | - PG_PMD_NR, false); |
---|
552 | | - |
---|
553 | | - err = radix_tree_preload( |
---|
554 | | - mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM); |
---|
555 | | - if (err) { |
---|
556 | | - if (pmd_downgrade) |
---|
557 | | - put_locked_mapping_entry(mapping, index); |
---|
558 | | - return ERR_PTR(err); |
---|
559 | | - } |
---|
560 | | - xa_lock_irq(&mapping->i_pages); |
---|
561 | | - |
---|
562 | | - if (!entry) { |
---|
563 | | - /* |
---|
564 | | - * We needed to drop the i_pages lock while calling |
---|
565 | | - * radix_tree_preload() and we didn't have an entry to |
---|
566 | | - * lock. See if another thread inserted an entry at |
---|
567 | | - * our index during this time. |
---|
568 | | - */ |
---|
569 | | - entry = __radix_tree_lookup(&mapping->i_pages, index, |
---|
570 | | - NULL, &slot); |
---|
571 | | - if (entry) { |
---|
572 | | - radix_tree_preload_end(); |
---|
573 | | - xa_unlock_irq(&mapping->i_pages); |
---|
574 | | - goto restart; |
---|
575 | | - } |
---|
| 528 | + if (dax_is_zero_entry(entry)) { |
---|
| 529 | + xas_unlock_irq(xas); |
---|
| 530 | + unmap_mapping_pages(mapping, |
---|
| 531 | + xas->xa_index & ~PG_PMD_COLOUR, |
---|
| 532 | + PG_PMD_NR, false); |
---|
| 533 | + xas_reset(xas); |
---|
| 534 | + xas_lock_irq(xas); |
---|
576 | 535 | } |
---|
577 | 536 | |
---|
578 | | - if (pmd_downgrade) { |
---|
579 | | - dax_disassociate_entry(entry, mapping, false); |
---|
580 | | - radix_tree_delete(&mapping->i_pages, index); |
---|
581 | | - mapping->nrexceptional--; |
---|
582 | | - dax_wake_mapping_entry_waiter(mapping, index, entry, |
---|
583 | | - true); |
---|
584 | | - } |
---|
585 | | - |
---|
586 | | - entry = dax_radix_locked_entry(0, size_flag | RADIX_DAX_EMPTY); |
---|
587 | | - |
---|
588 | | - err = __radix_tree_insert(&mapping->i_pages, index, |
---|
589 | | - dax_radix_order(entry), entry); |
---|
590 | | - radix_tree_preload_end(); |
---|
591 | | - if (err) { |
---|
592 | | - xa_unlock_irq(&mapping->i_pages); |
---|
593 | | - /* |
---|
594 | | - * Our insertion of a DAX entry failed, most likely |
---|
595 | | - * because we were inserting a PMD entry and it |
---|
596 | | - * collided with a PTE sized entry at a different |
---|
597 | | - * index in the PMD range. We haven't inserted |
---|
598 | | - * anything into the radix tree and have no waiters to |
---|
599 | | - * wake. |
---|
600 | | - */ |
---|
601 | | - return ERR_PTR(err); |
---|
602 | | - } |
---|
603 | | - /* Good, we have inserted empty locked entry into the tree. */ |
---|
604 | | - mapping->nrexceptional++; |
---|
605 | | - xa_unlock_irq(&mapping->i_pages); |
---|
606 | | - return entry; |
---|
| 537 | + dax_disassociate_entry(entry, mapping, false); |
---|
| 538 | + xas_store(xas, NULL); /* undo the PMD join */ |
---|
| 539 | + dax_wake_entry(xas, entry, WAKE_ALL); |
---|
| 540 | + mapping->nrexceptional--; |
---|
| 541 | + entry = NULL; |
---|
| 542 | + xas_set(xas, index); |
---|
607 | 543 | } |
---|
608 | | - entry = lock_slot(mapping, slot); |
---|
609 | | - out_unlock: |
---|
610 | | - xa_unlock_irq(&mapping->i_pages); |
---|
| 544 | + |
---|
| 545 | + if (entry) { |
---|
| 546 | + dax_lock_entry(xas, entry); |
---|
| 547 | + } else { |
---|
| 548 | + unsigned long flags = DAX_EMPTY; |
---|
| 549 | + |
---|
| 550 | + if (order > 0) |
---|
| 551 | + flags |= DAX_PMD; |
---|
| 552 | + entry = dax_make_entry(pfn_to_pfn_t(0), flags); |
---|
| 553 | + dax_lock_entry(xas, entry); |
---|
| 554 | + if (xas_error(xas)) |
---|
| 555 | + goto out_unlock; |
---|
| 556 | + mapping->nrexceptional++; |
---|
| 557 | + } |
---|
| 558 | + |
---|
| 559 | +out_unlock: |
---|
| 560 | + xas_unlock_irq(xas); |
---|
| 561 | + if (xas_nomem(xas, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM)) |
---|
| 562 | + goto retry; |
---|
| 563 | + if (xas->xa_node == XA_ERROR(-ENOMEM)) |
---|
| 564 | + return xa_mk_internal(VM_FAULT_OOM); |
---|
| 565 | + if (xas_error(xas)) |
---|
| 566 | + return xa_mk_internal(VM_FAULT_SIGBUS); |
---|
611 | 567 | return entry; |
---|
| 568 | +fallback: |
---|
| 569 | + xas_unlock_irq(xas); |
---|
| 570 | + return xa_mk_internal(VM_FAULT_FALLBACK); |
---|
612 | 571 | } |
---|
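
As the rewritten comment above notes, grab_mapping_entry() now reports failure by returning a VM_FAULT code wrapped in an XArray internal entry rather than an ERR_PTR. A small demonstration of why the two kinds of tagged pointers cannot collide; the tagging mirrors xa_mk_internal()/xa_mk_value(), and the VM_FAULT_* numbers are placeholders, not the kernel's:

```c
#include <stdint.h>
#include <stdio.h>

#define VM_FAULT_OOM		0x0001	/* placeholder value */
#define VM_FAULT_FALLBACK	0x0800	/* placeholder value */

/* Internal entries end in binary 10, value entries in binary 1. */
static void *xa_mk_internal(unsigned long v)       { return (void *)((v << 2) | 2); }
static int xa_is_internal(const void *e)           { return ((uintptr_t)e & 3) == 2; }
static unsigned long xa_to_internal(const void *e) { return (uintptr_t)e >> 2; }
static int xa_is_value(const void *e)              { return (uintptr_t)e & 1; }

int main(void)
{
	void *err = xa_mk_internal(VM_FAULT_FALLBACK);
	void *entry = (void *)((0x1234UL << 1) | 1);	/* a locked DAX value entry */

	printf("err:   internal=%d value=%d code=%#lx\n",
	       xa_is_internal(err), xa_is_value(err), xa_to_internal(err));
	printf("entry: internal=%d value=%d\n",
	       xa_is_internal(entry), xa_is_value(entry));
	return 0;
}
```
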
613 | 572 | |
---|
614 | 573 | /** |
---|
615 | | - * dax_layout_busy_page - find first pinned page in @mapping |
---|
| 574 | + * dax_layout_busy_page_range - find first pinned page in @mapping |
---|
616 | 575 | * @mapping: address space to scan for a page with ref count > 1 |
---|
| 576 | + * @start: Starting offset. Page containing 'start' is included. |
---|
| 577 | + * @end: End offset. Page containing 'end' is included. If 'end' is LLONG_MAX, |
---|
| 578 | + * pages from 'start' till the end of file are included. |
---|
617 | 579 | * |
---|
618 | 580 | * DAX requires ZONE_DEVICE mapped pages. These pages are never |
---|
619 | 581 | * 'onlined' to the page allocator so they are considered idle when |
---|
.. | .. |
---|
626 | 588 | * to be able to run unmap_mapping_range() and subsequently not race |
---|
627 | 589 | * mapping_mapped() becoming true. |
---|
628 | 590 | */ |
---|
629 | | -struct page *dax_layout_busy_page(struct address_space *mapping) |
---|
| 591 | +struct page *dax_layout_busy_page_range(struct address_space *mapping, |
---|
| 592 | + loff_t start, loff_t end) |
---|
630 | 593 | { |
---|
631 | | - pgoff_t indices[PAGEVEC_SIZE]; |
---|
| 594 | + void *entry; |
---|
| 595 | + unsigned int scanned = 0; |
---|
632 | 596 | struct page *page = NULL; |
---|
633 | | - struct pagevec pvec; |
---|
634 | | - pgoff_t index, end; |
---|
635 | | - unsigned i; |
---|
| 597 | + pgoff_t start_idx = start >> PAGE_SHIFT; |
---|
| 598 | + pgoff_t end_idx; |
---|
| 599 | + XA_STATE(xas, &mapping->i_pages, start_idx); |
---|
636 | 600 | |
---|
637 | 601 | /* |
---|
638 | 602 | * In the 'limited' case get_user_pages() for dax is disabled. |
---|
.. | .. |
---|
643 | 607 | if (!dax_mapping(mapping) || !mapping_mapped(mapping)) |
---|
644 | 608 | return NULL; |
---|
645 | 609 | |
---|
646 | | - pagevec_init(&pvec); |
---|
647 | | - index = 0; |
---|
648 | | - end = -1; |
---|
649 | | - |
---|
| 610 | + /* If end == LLONG_MAX, all pages from start to till end of file */ |
---|
| 611 | + if (end == LLONG_MAX) |
---|
| 612 | + end_idx = ULONG_MAX; |
---|
| 613 | + else |
---|
| 614 | + end_idx = end >> PAGE_SHIFT; |
---|
650 | 615 | /* |
---|
651 | 616 | * If we race get_user_pages_fast() here either we'll see the |
---|
652 | | - * elevated page count in the pagevec_lookup and wait, or |
---|
| 617 | + * elevated page count in the iteration and wait, or |
---|
653 | 618 | * get_user_pages_fast() will see that the page it took a reference |
---|
654 | 619 | * against is no longer mapped in the page tables and bail to the |
---|
655 | 620 | * get_user_pages() slow path. The slow path is protected by |
---|
656 | 621 | * pte_lock() and pmd_lock(). New references are not taken without |
---|
657 | | - * holding those locks, and unmap_mapping_range() will not zero the |
---|
| 622 | + * holding those locks, and unmap_mapping_pages() will not zero the |
---|
658 | 623 | * pte or pmd without holding the respective lock, so we are |
---|
659 | 624 | * guaranteed to either see new references or prevent new |
---|
660 | 625 | * references from being established. |
---|
661 | 626 | */ |
---|
662 | | - unmap_mapping_range(mapping, 0, 0, 0); |
---|
| 627 | + unmap_mapping_pages(mapping, start_idx, end_idx - start_idx + 1, 0); |
---|
663 | 628 | |
---|
664 | | - while (index < end && pagevec_lookup_entries(&pvec, mapping, index, |
---|
665 | | - min(end - index, (pgoff_t)PAGEVEC_SIZE), |
---|
666 | | - indices)) { |
---|
667 | | - pgoff_t nr_pages = 1; |
---|
668 | | - |
---|
669 | | - for (i = 0; i < pagevec_count(&pvec); i++) { |
---|
670 | | - struct page *pvec_ent = pvec.pages[i]; |
---|
671 | | - void *entry; |
---|
672 | | - |
---|
673 | | - index = indices[i]; |
---|
674 | | - if (index >= end) |
---|
675 | | - break; |
---|
676 | | - |
---|
677 | | - if (WARN_ON_ONCE( |
---|
678 | | - !radix_tree_exceptional_entry(pvec_ent))) |
---|
679 | | - continue; |
---|
680 | | - |
---|
681 | | - xa_lock_irq(&mapping->i_pages); |
---|
682 | | - entry = get_unlocked_mapping_entry(mapping, index, NULL); |
---|
683 | | - if (entry) { |
---|
684 | | - page = dax_busy_page(entry); |
---|
685 | | - /* |
---|
686 | | - * Account for multi-order entries at |
---|
687 | | - * the end of the pagevec. |
---|
688 | | - */ |
---|
689 | | - if (i + 1 >= pagevec_count(&pvec)) |
---|
690 | | - nr_pages = 1UL << dax_radix_order(entry); |
---|
691 | | - } |
---|
692 | | - put_unlocked_mapping_entry(mapping, index, entry); |
---|
693 | | - xa_unlock_irq(&mapping->i_pages); |
---|
694 | | - if (page) |
---|
695 | | - break; |
---|
696 | | - } |
---|
697 | | - |
---|
698 | | - /* |
---|
699 | | - * We don't expect normal struct page entries to exist in our |
---|
700 | | - * tree, but we keep these pagevec calls so that this code is |
---|
701 | | - * consistent with the common pattern for handling pagevecs |
---|
702 | | - * throughout the kernel. |
---|
703 | | - */ |
---|
704 | | - pagevec_remove_exceptionals(&pvec); |
---|
705 | | - pagevec_release(&pvec); |
---|
706 | | - index += nr_pages; |
---|
707 | | - |
---|
| 629 | + xas_lock_irq(&xas); |
---|
| 630 | + xas_for_each(&xas, entry, end_idx) { |
---|
| 631 | + if (WARN_ON_ONCE(!xa_is_value(entry))) |
---|
| 632 | + continue; |
---|
| 633 | + if (unlikely(dax_is_locked(entry))) |
---|
| 634 | + entry = get_unlocked_entry(&xas, 0); |
---|
| 635 | + if (entry) |
---|
| 636 | + page = dax_busy_page(entry); |
---|
| 637 | + put_unlocked_entry(&xas, entry, WAKE_NEXT); |
---|
708 | 638 | if (page) |
---|
709 | 639 | break; |
---|
| 640 | + if (++scanned % XA_CHECK_SCHED) |
---|
| 641 | + continue; |
---|
| 642 | + |
---|
| 643 | + xas_pause(&xas); |
---|
| 644 | + xas_unlock_irq(&xas); |
---|
| 645 | + cond_resched(); |
---|
| 646 | + xas_lock_irq(&xas); |
---|
710 | 647 | } |
---|
| 648 | + xas_unlock_irq(&xas); |
---|
711 | 649 | return page; |
---|
| 650 | +} |
---|
| 651 | +EXPORT_SYMBOL_GPL(dax_layout_busy_page_range); |
---|
| 652 | + |
---|
| 653 | +struct page *dax_layout_busy_page(struct address_space *mapping) |
---|
| 654 | +{ |
---|
| 655 | + return dax_layout_busy_page_range(mapping, 0, LLONG_MAX); |
---|
712 | 656 | } |
---|
713 | 657 | EXPORT_SYMBOL_GPL(dax_layout_busy_page); |
---|
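
dax_layout_busy_page_range() replaces the pagevec loop with a straight xas_for_each() walk that pauses every XA_CHECK_SCHED entries to drop the lock and reschedule. The pattern in isolation, with the XArray walk replaced by a plain counter loop and XA_CHECK_SCHED's value assumed rather than taken from this patch:

```c
#include <stdio.h>

#define XA_CHECK_SCHED 4096	/* assumed batch size between breathers */

int main(void)
{
	unsigned int scanned = 0;

	for (unsigned long idx = 0; idx < 3 * XA_CHECK_SCHED; idx++) {
		/* ... examine the entry at idx while holding the lock ... */
		if (++scanned % XA_CHECK_SCHED)
			continue;
		/* xas_pause() + unlock + cond_resched() + relock would go here */
		printf("breather after %u entries (idx %lu)\n", scanned, idx);
	}
	return 0;
}
```
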
714 | 658 | |
---|
715 | | -static int __dax_invalidate_mapping_entry(struct address_space *mapping, |
---|
| 659 | +static int __dax_invalidate_entry(struct address_space *mapping, |
---|
716 | 660 | pgoff_t index, bool trunc) |
---|
717 | 661 | { |
---|
| 662 | + XA_STATE(xas, &mapping->i_pages, index); |
---|
718 | 663 | int ret = 0; |
---|
719 | 664 | void *entry; |
---|
720 | | - struct radix_tree_root *pages = &mapping->i_pages; |
---|
721 | 665 | |
---|
722 | | - xa_lock_irq(pages); |
---|
723 | | - entry = get_unlocked_mapping_entry(mapping, index, NULL); |
---|
724 | | - if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry))) |
---|
| 666 | + xas_lock_irq(&xas); |
---|
| 667 | + entry = get_unlocked_entry(&xas, 0); |
---|
| 668 | + if (!entry || WARN_ON_ONCE(!xa_is_value(entry))) |
---|
725 | 669 | goto out; |
---|
726 | 670 | if (!trunc && |
---|
727 | | - (radix_tree_tag_get(pages, index, PAGECACHE_TAG_DIRTY) || |
---|
728 | | - radix_tree_tag_get(pages, index, PAGECACHE_TAG_TOWRITE))) |
---|
| 671 | + (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) || |
---|
| 672 | + xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE))) |
---|
729 | 673 | goto out; |
---|
730 | 674 | dax_disassociate_entry(entry, mapping, trunc); |
---|
731 | | - radix_tree_delete(pages, index); |
---|
| 675 | + xas_store(&xas, NULL); |
---|
732 | 676 | mapping->nrexceptional--; |
---|
733 | 677 | ret = 1; |
---|
734 | 678 | out: |
---|
735 | | - put_unlocked_mapping_entry(mapping, index, entry); |
---|
736 | | - xa_unlock_irq(pages); |
---|
| 679 | + put_unlocked_entry(&xas, entry, WAKE_ALL); |
---|
| 680 | + xas_unlock_irq(&xas); |
---|
737 | 681 | return ret; |
---|
738 | 682 | } |
---|
| 683 | + |
---|
739 | 684 | /* |
---|
740 | | - * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree |
---|
741 | | - * entry to get unlocked before deleting it. |
---|
| 685 | + * Delete DAX entry at @index from @mapping. Wait for it |
---|
| 686 | + * to be unlocked before deleting it. |
---|
742 | 687 | */ |
---|
743 | 688 | int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index) |
---|
744 | 689 | { |
---|
745 | | - int ret = __dax_invalidate_mapping_entry(mapping, index, true); |
---|
| 690 | + int ret = __dax_invalidate_entry(mapping, index, true); |
---|
746 | 691 | |
---|
747 | 692 | /* |
---|
748 | 693 | * This gets called from truncate / punch_hole path. As such, the caller |
---|
749 | 694 | * must hold locks protecting against concurrent modifications of the |
---|
750 | | - * radix tree (usually fs-private i_mmap_sem for writing). Since the |
---|
751 | | - * caller has seen exceptional entry for this index, we better find it |
---|
| 695 | + * page cache (usually fs-private i_mmap_sem for writing). Since the |
---|
| 696 | + * caller has seen a DAX entry for this index, we better find it |
---|
752 | 697 | * at that index as well... |
---|
753 | 698 | */ |
---|
754 | 699 | WARN_ON_ONCE(!ret); |
---|
.. | .. |
---|
756 | 701 | } |
---|
757 | 702 | |
---|
758 | 703 | /* |
---|
759 | | - * Invalidate exceptional DAX entry if it is clean. |
---|
| 704 | + * Invalidate DAX entry if it is clean. |
---|
760 | 705 | */ |
---|
761 | 706 | int dax_invalidate_mapping_entry_sync(struct address_space *mapping, |
---|
762 | 707 | pgoff_t index) |
---|
763 | 708 | { |
---|
764 | | - return __dax_invalidate_mapping_entry(mapping, index, false); |
---|
| 709 | + return __dax_invalidate_entry(mapping, index, false); |
---|
765 | 710 | } |
---|
766 | 711 | |
---|
767 | | -static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev, |
---|
768 | | - sector_t sector, size_t size, struct page *to, |
---|
769 | | - unsigned long vaddr) |
---|
| 712 | +static int copy_cow_page_dax(struct block_device *bdev, struct dax_device *dax_dev, |
---|
| 713 | + sector_t sector, struct page *to, unsigned long vaddr) |
---|
770 | 714 | { |
---|
771 | 715 | void *vto, *kaddr; |
---|
772 | 716 | pgoff_t pgoff; |
---|
773 | 717 | long rc; |
---|
774 | 718 | int id; |
---|
775 | 719 | |
---|
776 | | - rc = bdev_dax_pgoff(bdev, sector, size, &pgoff); |
---|
| 720 | + rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff); |
---|
777 | 721 | if (rc) |
---|
778 | 722 | return rc; |
---|
779 | 723 | |
---|
780 | 724 | id = dax_read_lock(); |
---|
781 | | - rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, NULL); |
---|
| 725 | + rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(PAGE_SIZE), &kaddr, NULL); |
---|
782 | 726 | if (rc < 0) { |
---|
783 | 727 | dax_read_unlock(id); |
---|
784 | 728 | return rc; |
---|
785 | 729 | } |
---|
786 | 730 | vto = kmap_atomic(to); |
---|
| 731 | +#ifdef CONFIG_ARM |
---|
| 732 | +#ifndef copy_user_page |
---|
| 733 | +#define copy_user_page(to, from, vaddr, pg) copy_page(to, from) |
---|
| 734 | +#endif |
---|
| 735 | +#endif |
---|
787 | 736 | copy_user_page(vto, (void __force *)kaddr, vaddr, to); |
---|
788 | 737 | kunmap_atomic(vto); |
---|
789 | 738 | dax_read_unlock(id); |
---|
.. | .. |
---|
797 | 746 | * already in the tree, we will skip the insertion and just dirty the PMD as |
---|
798 | 747 | * appropriate. |
---|
799 | 748 | */ |
---|
800 | | -static void *dax_insert_mapping_entry(struct address_space *mapping, |
---|
801 | | - struct vm_fault *vmf, |
---|
802 | | - void *entry, pfn_t pfn_t, |
---|
803 | | - unsigned long flags, bool dirty) |
---|
| 749 | +static void *dax_insert_entry(struct xa_state *xas, |
---|
| 750 | + struct address_space *mapping, struct vm_fault *vmf, |
---|
| 751 | + void *entry, pfn_t pfn, unsigned long flags, bool dirty) |
---|
804 | 752 | { |
---|
805 | | - struct radix_tree_root *pages = &mapping->i_pages; |
---|
806 | | - unsigned long pfn = pfn_t_to_pfn(pfn_t); |
---|
807 | | - pgoff_t index = vmf->pgoff; |
---|
808 | | - void *new_entry; |
---|
| 753 | + void *new_entry = dax_make_entry(pfn, flags); |
---|
809 | 754 | |
---|
810 | 755 | if (dirty) |
---|
811 | 756 | __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); |
---|
812 | 757 | |
---|
813 | | - if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) { |
---|
| 758 | + if (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE)) { |
---|
| 759 | + unsigned long index = xas->xa_index; |
---|
814 | 760 | /* we are replacing a zero page with block mapping */ |
---|
815 | 761 | if (dax_is_pmd_entry(entry)) |
---|
816 | 762 | unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR, |
---|
817 | | - PG_PMD_NR, false); |
---|
| 763 | + PG_PMD_NR, false); |
---|
818 | 764 | else /* pte entry */ |
---|
819 | | - unmap_mapping_pages(mapping, vmf->pgoff, 1, false); |
---|
| 765 | + unmap_mapping_pages(mapping, index, 1, false); |
---|
820 | 766 | } |
---|
821 | 767 | |
---|
822 | | - xa_lock_irq(pages); |
---|
823 | | - new_entry = dax_radix_locked_entry(pfn, flags); |
---|
824 | | - if (dax_entry_size(entry) != dax_entry_size(new_entry)) { |
---|
| 768 | + xas_reset(xas); |
---|
| 769 | + xas_lock_irq(xas); |
---|
| 770 | + if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { |
---|
| 771 | + void *old; |
---|
| 772 | + |
---|
825 | 773 | dax_disassociate_entry(entry, mapping, false); |
---|
826 | 774 | dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address); |
---|
827 | | - } |
---|
828 | | - |
---|
829 | | - if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { |
---|
830 | 775 | /* |
---|
831 | | - * Only swap our new entry into the radix tree if the current |
---|
| 776 | + * Only swap our new entry into the page cache if the current |
---|
832 | 777 | * entry is a zero page or an empty entry. If a normal PTE or |
---|
833 | | - * PMD entry is already in the tree, we leave it alone. This |
---|
| 778 | + * PMD entry is already in the cache, we leave it alone. This |
---|
834 | 779 | * means that if we are trying to insert a PTE and the |
---|
835 | 780 | * existing entry is a PMD, we will just leave the PMD in the |
---|
836 | 781 | * tree and dirty it if necessary. |
---|
837 | 782 | */ |
---|
838 | | - struct radix_tree_node *node; |
---|
839 | | - void **slot; |
---|
840 | | - void *ret; |
---|
841 | | - |
---|
842 | | - ret = __radix_tree_lookup(pages, index, &node, &slot); |
---|
843 | | - WARN_ON_ONCE(ret != entry); |
---|
844 | | - __radix_tree_replace(pages, node, slot, |
---|
845 | | - new_entry, NULL); |
---|
| 783 | + old = dax_lock_entry(xas, new_entry); |
---|
| 784 | + WARN_ON_ONCE(old != xa_mk_value(xa_to_value(entry) | |
---|
| 785 | + DAX_LOCKED)); |
---|
846 | 786 | entry = new_entry; |
---|
| 787 | + } else { |
---|
| 788 | + xas_load(xas); /* Walk the xa_state */ |
---|
847 | 789 | } |
---|
848 | 790 | |
---|
849 | 791 | if (dirty) |
---|
850 | | - radix_tree_tag_set(pages, index, PAGECACHE_TAG_DIRTY); |
---|
| 792 | + xas_set_mark(xas, PAGECACHE_TAG_DIRTY); |
---|
851 | 793 | |
---|
852 | | - xa_unlock_irq(pages); |
---|
| 794 | + xas_unlock_irq(xas); |
---|
853 | 795 | return entry; |
---|
854 | 796 | } |
---|
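
dax_insert_entry() keeps the old replacement policy: only zero-page and empty placeholder entries are swapped for the new mapping, while a real PTE/PMD entry already in the cache is left in place and merely dirtied. A compact model of that decision using the DAX_* flag bits defined earlier in this patch; should_replace() is a hypothetical helper, not a kernel function:

```c
#include <stdio.h>

#define DAX_PMD		(1UL << 1)
#define DAX_ZERO_PAGE	(1UL << 2)
#define DAX_EMPTY	(1UL << 3)

/* Replace only placeholders; a real block-backed entry stays put. */
static int should_replace(unsigned long old_flags)
{
	return (old_flags & (DAX_ZERO_PAGE | DAX_EMPTY)) != 0;
}

int main(void)
{
	printf("empty PMD placeholder  -> replace? %d\n",
	       should_replace(DAX_PMD | DAX_EMPTY));
	printf("real PMD block mapping -> replace? %d\n",
	       should_replace(DAX_PMD));
	return 0;
}
```
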
855 | 797 | |
---|
856 | | -static inline unsigned long |
---|
857 | | -pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma) |
---|
| 798 | +static inline |
---|
| 799 | +unsigned long pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma) |
---|
858 | 800 | { |
---|
859 | 801 | unsigned long address; |
---|
860 | 802 | |
---|
.. | .. |
---|
864 | 806 | } |
---|
865 | 807 | |
---|
866 | 808 | /* Walk all mappings of a given index of a file and writeprotect them */ |
---|
867 | | -static void dax_mapping_entry_mkclean(struct address_space *mapping, |
---|
868 | | - pgoff_t index, unsigned long pfn) |
---|
| 809 | +static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index, |
---|
| 810 | + unsigned long pfn) |
---|
869 | 811 | { |
---|
870 | 812 | struct vm_area_struct *vma; |
---|
871 | 813 | pte_t pte, *ptep = NULL; |
---|
.. | .. |
---|
874 | 816 | |
---|
875 | 817 | i_mmap_lock_read(mapping); |
---|
876 | 818 | vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) { |
---|
877 | | - unsigned long address, start, end; |
---|
| 819 | + struct mmu_notifier_range range; |
---|
| 820 | + unsigned long address; |
---|
878 | 821 | |
---|
879 | 822 | cond_resched(); |
---|
880 | 823 | |
---|
.. | .. |
---|
884 | 827 | address = pgoff_address(index, vma); |
---|
885 | 828 | |
---|
886 | 829 | /* |
---|
887 | | - * Note because we provide start/end to follow_pte_pmd it will |
---|
888 | | - * call mmu_notifier_invalidate_range_start() on our behalf |
---|
889 | | - * before taking any lock. |
---|
| 830 | + * follow_invalidate_pte() will use the range to call |
---|
| 831 | + * mmu_notifier_invalidate_range_start() on our behalf before |
---|
| 832 | + * taking any lock. |
---|
890 | 833 | */ |
---|
891 | | - if (follow_pte_pmd(vma->vm_mm, address, &start, &end, &ptep, &pmdp, &ptl)) |
---|
| 834 | + if (follow_invalidate_pte(vma->vm_mm, address, &range, &ptep, |
---|
| 835 | + &pmdp, &ptl)) |
---|
892 | 836 | continue; |
---|
893 | 837 | |
---|
894 | 838 | /* |
---|
.. | .. |
---|
907 | 851 | if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp)) |
---|
908 | 852 | goto unlock_pmd; |
---|
909 | 853 | |
---|
910 | | - flush_cache_page(vma, address, pfn); |
---|
| 854 | + flush_cache_range(vma, address, |
---|
| 855 | + address + HPAGE_PMD_SIZE); |
---|
911 | 856 | pmd = pmdp_invalidate(vma, address, pmdp); |
---|
912 | 857 | pmd = pmd_wrprotect(pmd); |
---|
913 | 858 | pmd = pmd_mkclean(pmd); |
---|
.. | .. |
---|
930 | 875 | pte_unmap_unlock(ptep, ptl); |
---|
931 | 876 | } |
---|
932 | 877 | |
---|
933 | | - mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); |
---|
| 878 | + mmu_notifier_invalidate_range_end(&range); |
---|
934 | 879 | } |
---|
935 | 880 | i_mmap_unlock_read(mapping); |
---|
936 | 881 | } |
---|
937 | 882 | |
---|
938 | | -static int dax_writeback_one(struct dax_device *dax_dev, |
---|
939 | | - struct address_space *mapping, pgoff_t index, void *entry) |
---|
| 883 | +static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev, |
---|
| 884 | + struct address_space *mapping, void *entry) |
---|
940 | 885 | { |
---|
941 | | - struct radix_tree_root *pages = &mapping->i_pages; |
---|
942 | | - void *entry2, **slot; |
---|
943 | | - unsigned long pfn; |
---|
| 886 | + unsigned long pfn, index, count; |
---|
944 | 887 | long ret = 0; |
---|
945 | | - size_t size; |
---|
946 | 888 | |
---|
947 | 889 | /* |
---|
948 | 890 | * A page got tagged dirty in DAX mapping? Something is seriously |
---|
949 | 891 | * wrong. |
---|
950 | 892 | */ |
---|
951 | | - if (WARN_ON(!radix_tree_exceptional_entry(entry))) |
---|
| 893 | + if (WARN_ON(!xa_is_value(entry))) |
---|
952 | 894 | return -EIO; |
---|
953 | 895 | |
---|
954 | | - xa_lock_irq(pages); |
---|
955 | | - entry2 = get_unlocked_mapping_entry(mapping, index, &slot); |
---|
956 | | - /* Entry got punched out / reallocated? */ |
---|
957 | | - if (!entry2 || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry2))) |
---|
958 | | - goto put_unlocked; |
---|
959 | | - /* |
---|
960 | | - * Entry got reallocated elsewhere? No need to writeback. We have to |
---|
961 | | - * compare pfns as we must not bail out due to difference in lockbit |
---|
962 | | - * or entry type. |
---|
963 | | - */ |
---|
964 | | - if (dax_radix_pfn(entry2) != dax_radix_pfn(entry)) |
---|
965 | | - goto put_unlocked; |
---|
966 | | - if (WARN_ON_ONCE(dax_is_empty_entry(entry) || |
---|
967 | | - dax_is_zero_entry(entry))) { |
---|
968 | | - ret = -EIO; |
---|
969 | | - goto put_unlocked; |
---|
| 896 | + if (unlikely(dax_is_locked(entry))) { |
---|
| 897 | + void *old_entry = entry; |
---|
| 898 | + |
---|
| 899 | + entry = get_unlocked_entry(xas, 0); |
---|
| 900 | + |
---|
| 901 | + /* Entry got punched out / reallocated? */ |
---|
| 902 | + if (!entry || WARN_ON_ONCE(!xa_is_value(entry))) |
---|
| 903 | + goto put_unlocked; |
---|
| 904 | + /* |
---|
| 905 | + * Entry got reallocated elsewhere? No need to writeback. |
---|
| 906 | + * We have to compare pfns as we must not bail out due to |
---|
| 907 | + * difference in lockbit or entry type. |
---|
| 908 | + */ |
---|
| 909 | + if (dax_to_pfn(old_entry) != dax_to_pfn(entry)) |
---|
| 910 | + goto put_unlocked; |
---|
| 911 | + if (WARN_ON_ONCE(dax_is_empty_entry(entry) || |
---|
| 912 | + dax_is_zero_entry(entry))) { |
---|
| 913 | + ret = -EIO; |
---|
| 914 | + goto put_unlocked; |
---|
| 915 | + } |
---|
| 916 | + |
---|
| 917 | + /* Another fsync thread may have already done this entry */ |
---|
| 918 | + if (!xas_get_mark(xas, PAGECACHE_TAG_TOWRITE)) |
---|
| 919 | + goto put_unlocked; |
---|
970 | 920 | } |
---|
971 | 921 | |
---|
972 | | - /* Another fsync thread may have already written back this entry */ |
---|
973 | | - if (!radix_tree_tag_get(pages, index, PAGECACHE_TAG_TOWRITE)) |
---|
974 | | - goto put_unlocked; |
---|
975 | 922 | /* Lock the entry to serialize with page faults */ |
---|
976 | | - entry = lock_slot(mapping, slot); |
---|
| 923 | + dax_lock_entry(xas, entry); |
---|
| 924 | + |
---|
977 | 925 | /* |
---|
978 | 926 | * We can clear the tag now but we have to be careful so that concurrent |
---|
979 | 927 | * dax_writeback_one() calls for the same index cannot finish before we |
---|
.. | .. |
---|
981 | 929 | * at the entry only under the i_pages lock and once they do that |
---|
982 | 930 | * they will see the entry locked and wait for it to unlock. |
---|
983 | 931 | */ |
---|
984 | | - radix_tree_tag_clear(pages, index, PAGECACHE_TAG_TOWRITE); |
---|
985 | | - xa_unlock_irq(pages); |
---|
| 932 | + xas_clear_mark(xas, PAGECACHE_TAG_TOWRITE); |
---|
| 933 | + xas_unlock_irq(xas); |
---|
986 | 934 | |
---|
987 | 935 | /* |
---|
988 | | - * Even if dax_writeback_mapping_range() was given a wbc->range_start |
---|
989 | | - * in the middle of a PMD, the 'index' we are given will be aligned to |
---|
990 | | - * the start index of the PMD, as will the pfn we pull from 'entry'. |
---|
| 936 | + * If dax_writeback_mapping_range() was given a wbc->range_start |
---|
| 937 | + * in the middle of a PMD, the 'index' we use needs to be |
---|
| 938 | + * aligned to the start of the PMD. |
---|
991 | 939 | * This allows us to flush for PMD_SIZE and not have to worry about |
---|
992 | 940 | * partial PMD writebacks. |
---|
993 | 941 | */ |
---|
994 | | - pfn = dax_radix_pfn(entry); |
---|
995 | | - size = PAGE_SIZE << dax_radix_order(entry); |
---|
| 942 | + pfn = dax_to_pfn(entry); |
---|
| 943 | + count = 1UL << dax_entry_order(entry); |
---|
| 944 | + index = xas->xa_index & ~(count - 1); |
---|
996 | 945 | |
---|
997 | | - dax_mapping_entry_mkclean(mapping, index, pfn); |
---|
998 | | - dax_flush(dax_dev, page_address(pfn_to_page(pfn)), size); |
---|
| 946 | + dax_entry_mkclean(mapping, index, pfn); |
---|
| 947 | + dax_flush(dax_dev, page_address(pfn_to_page(pfn)), count * PAGE_SIZE); |
---|
999 | 948 | /* |
---|
1000 | 949 | * After we have flushed the cache, we can clear the dirty tag. There |
---|
1001 | 950 | * cannot be new dirty data in the pfn after the flush has completed as |
---|
1002 | 951 | * the pfn mappings are writeprotected and fault waits for mapping |
---|
1003 | 952 | * entry lock. |
---|
1004 | 953 | */ |
---|
1005 | | - xa_lock_irq(pages); |
---|
1006 | | - radix_tree_tag_clear(pages, index, PAGECACHE_TAG_DIRTY); |
---|
1007 | | - xa_unlock_irq(pages); |
---|
1008 | | - trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT); |
---|
1009 | | - put_locked_mapping_entry(mapping, index); |
---|
| 954 | + xas_reset(xas); |
---|
| 955 | + xas_lock_irq(xas); |
---|
| 956 | + xas_store(xas, entry); |
---|
| 957 | + xas_clear_mark(xas, PAGECACHE_TAG_DIRTY); |
---|
| 958 | + dax_wake_entry(xas, entry, WAKE_NEXT); |
---|
| 959 | + |
---|
| 960 | + trace_dax_writeback_one(mapping->host, index, count); |
---|
1010 | 961 | return ret; |
---|
1011 | 962 | |
---|
1012 | 963 | put_unlocked: |
---|
1013 | | - put_unlocked_mapping_entry(mapping, index, entry2); |
---|
1014 | | - xa_unlock_irq(pages); |
---|
| 964 | + put_unlocked_entry(xas, entry, WAKE_NEXT); |
---|
1015 | 965 | return ret; |
---|
1016 | 966 | } |
---|
1017 | 967 | |
---|
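The index/count computation near the end of dax_writeback_one() above is easier to see with concrete numbers. A minimal userspace sketch of the same arithmetic follows; the 4 KiB page size and order-9 (2 MiB) PMD are architecture assumptions, and the indices are invented for illustration.

```c
/*
 * Worked example of the rounding above, assuming x86-64 with 4 KiB pages,
 * so a PMD entry has order 9 and covers 512 pages.  Any page index inside
 * the PMD rounds down to the first page of that PMD, so the flush always
 * covers a whole, naturally aligned 2 MiB range.
 */
#include <stdio.h>

int main(void)
{
	unsigned long count = 1UL << 9;			/* pages covered by a PMD entry */
	unsigned long xa_index = 0x212;			/* some page inside that PMD */
	unsigned long index = xa_index & ~(count - 1);	/* 0x200: PMD-aligned start */

	printf("flush %lu pages starting at page index %#lx\n", count, index);
	return 0;
}
```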
.. | .. |
---|
1021 | 971 | * on persistent storage prior to completion of the operation. |
---|
1022 | 972 | */ |
---|
1023 | 973 | int dax_writeback_mapping_range(struct address_space *mapping, |
---|
1024 | | - struct block_device *bdev, struct writeback_control *wbc) |
---|
| 974 | + struct dax_device *dax_dev, struct writeback_control *wbc) |
---|
1025 | 975 | { |
---|
| 976 | + XA_STATE(xas, &mapping->i_pages, wbc->range_start >> PAGE_SHIFT); |
---|
1026 | 977 | struct inode *inode = mapping->host; |
---|
1027 | | - pgoff_t start_index, end_index; |
---|
1028 | | - pgoff_t indices[PAGEVEC_SIZE]; |
---|
1029 | | - struct dax_device *dax_dev; |
---|
1030 | | - struct pagevec pvec; |
---|
1031 | | - bool done = false; |
---|
1032 | | - int i, ret = 0; |
---|
| 978 | + pgoff_t end_index = wbc->range_end >> PAGE_SHIFT; |
---|
| 979 | + void *entry; |
---|
| 980 | + int ret = 0; |
---|
| 981 | + unsigned int scanned = 0; |
---|
1033 | 982 | |
---|
1034 | 983 | if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT)) |
---|
1035 | 984 | return -EIO; |
---|
.. | .. |
---|
1037 | 986 | if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL) |
---|
1038 | 987 | return 0; |
---|
1039 | 988 | |
---|
1040 | | - dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); |
---|
1041 | | - if (!dax_dev) |
---|
1042 | | - return -EIO; |
---|
| 989 | + trace_dax_writeback_range(inode, xas.xa_index, end_index); |
---|
1043 | 990 | |
---|
1044 | | - start_index = wbc->range_start >> PAGE_SHIFT; |
---|
1045 | | - end_index = wbc->range_end >> PAGE_SHIFT; |
---|
| 991 | + tag_pages_for_writeback(mapping, xas.xa_index, end_index); |
---|
1046 | 992 | |
---|
1047 | | - trace_dax_writeback_range(inode, start_index, end_index); |
---|
1048 | | - |
---|
1049 | | - tag_pages_for_writeback(mapping, start_index, end_index); |
---|
1050 | | - |
---|
1051 | | - pagevec_init(&pvec); |
---|
1052 | | - while (!done) { |
---|
1053 | | - pvec.nr = find_get_entries_tag(mapping, start_index, |
---|
1054 | | - PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE, |
---|
1055 | | - pvec.pages, indices); |
---|
1056 | | - |
---|
1057 | | - if (pvec.nr == 0) |
---|
| 993 | + xas_lock_irq(&xas); |
---|
| 994 | + xas_for_each_marked(&xas, entry, end_index, PAGECACHE_TAG_TOWRITE) { |
---|
| 995 | + ret = dax_writeback_one(&xas, dax_dev, mapping, entry); |
---|
| 996 | + if (ret < 0) { |
---|
| 997 | + mapping_set_error(mapping, ret); |
---|
1058 | 998 | break; |
---|
1059 | | - |
---|
1060 | | - for (i = 0; i < pvec.nr; i++) { |
---|
1061 | | - if (indices[i] > end_index) { |
---|
1062 | | - done = true; |
---|
1063 | | - break; |
---|
1064 | | - } |
---|
1065 | | - |
---|
1066 | | - ret = dax_writeback_one(dax_dev, mapping, indices[i], |
---|
1067 | | - pvec.pages[i]); |
---|
1068 | | - if (ret < 0) { |
---|
1069 | | - mapping_set_error(mapping, ret); |
---|
1070 | | - goto out; |
---|
1071 | | - } |
---|
1072 | 999 | } |
---|
1073 | | - start_index = indices[pvec.nr - 1] + 1; |
---|
| 1000 | + if (++scanned % XA_CHECK_SCHED) |
---|
| 1001 | + continue; |
---|
| 1002 | + |
---|
| 1003 | + xas_pause(&xas); |
---|
| 1004 | + xas_unlock_irq(&xas); |
---|
| 1005 | + cond_resched(); |
---|
| 1006 | + xas_lock_irq(&xas); |
---|
1074 | 1007 | } |
---|
1075 | | -out: |
---|
1076 | | - put_dax(dax_dev); |
---|
1077 | | - trace_dax_writeback_range_done(inode, start_index, end_index); |
---|
1078 | | - return (ret < 0 ? ret : 0); |
---|
| 1008 | + xas_unlock_irq(&xas); |
---|
| 1009 | + trace_dax_writeback_range_done(inode, xas.xa_index, end_index); |
---|
| 1010 | + return ret; |
---|
1079 | 1011 | } |
---|
1080 | 1012 | EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); |
---|
1081 | 1013 | |
---|
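Since dax_writeback_mapping_range() now takes the dax_device directly instead of looking it up from a block_device, callers are expected to pass in a device they already hold. A hedged sketch of such a caller is below; the "myfs_*" names are hypothetical, but real filesystems cache the dax_device at mount time in much the same way.

```c
/*
 * Minimal sketch of wiring the new dax_writeback_mapping_range() signature
 * into a filesystem's ->writepages hook.  "myfs_sb_info" and the mount-time
 * lookup of s_daxdev are assumptions, not taken from this patch.
 */
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/writeback.h>

struct myfs_sb_info {
	struct dax_device *s_daxdev;	/* looked up once at mount */
};

static int myfs_dax_writepages(struct address_space *mapping,
			       struct writeback_control *wbc)
{
	struct myfs_sb_info *sbi = mapping->host->i_sb->s_fs_info;

	/* Walks the TOWRITE-tagged DAX entries and flushes them, as above. */
	return dax_writeback_mapping_range(mapping, sbi->s_daxdev, wbc);
}
```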
.. | .. |
---|
1123 | 1055 | * If this page is ever written to we will re-fault and change the mapping to |
---|
1124 | 1056 | * point to real DAX storage instead. |
---|
1125 | 1057 | */ |
---|
1126 | | -static vm_fault_t dax_load_hole(struct address_space *mapping, void *entry, |
---|
1127 | | - struct vm_fault *vmf) |
---|
| 1058 | +static vm_fault_t dax_load_hole(struct xa_state *xas, |
---|
| 1059 | + struct address_space *mapping, void **entry, |
---|
| 1060 | + struct vm_fault *vmf) |
---|
1128 | 1061 | { |
---|
1129 | 1062 | struct inode *inode = mapping->host; |
---|
1130 | 1063 | unsigned long vaddr = vmf->address; |
---|
1131 | 1064 | pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr)); |
---|
1132 | 1065 | vm_fault_t ret; |
---|
1133 | 1066 | |
---|
1134 | | - dax_insert_mapping_entry(mapping, vmf, entry, pfn, RADIX_DAX_ZERO_PAGE, |
---|
1135 | | - false); |
---|
| 1067 | + *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn, |
---|
| 1068 | + DAX_ZERO_PAGE, false); |
---|
| 1069 | + |
---|
1136 | 1070 | ret = vmf_insert_mixed(vmf->vma, vaddr, pfn); |
---|
1137 | 1071 | trace_dax_load_hole(inode, vmf, ret); |
---|
1138 | 1072 | return ret; |
---|
1139 | 1073 | } |
---|
1140 | 1074 | |
---|
1141 | | -static bool dax_range_is_aligned(struct block_device *bdev, |
---|
1142 | | - unsigned int offset, unsigned int length) |
---|
| 1075 | +s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap) |
---|
1143 | 1076 | { |
---|
1144 | | - unsigned short sector_size = bdev_logical_block_size(bdev); |
---|
| 1077 | + sector_t sector = iomap_sector(iomap, pos & PAGE_MASK); |
---|
| 1078 | + pgoff_t pgoff; |
---|
| 1079 | + long rc, id; |
---|
| 1080 | + void *kaddr; |
---|
| 1081 | + bool page_aligned = false; |
---|
| 1082 | + unsigned offset = offset_in_page(pos); |
---|
| 1083 | + unsigned size = min_t(u64, PAGE_SIZE - offset, length); |
---|
1145 | 1084 | |
---|
1146 | | - if (!IS_ALIGNED(offset, sector_size)) |
---|
1147 | | - return false; |
---|
1148 | | - if (!IS_ALIGNED(length, sector_size)) |
---|
1149 | | - return false; |
---|
| 1085 | + if (IS_ALIGNED(sector << SECTOR_SHIFT, PAGE_SIZE) && |
---|
| 1086 | + (size == PAGE_SIZE)) |
---|
| 1087 | + page_aligned = true; |
---|
1150 | 1088 | |
---|
1151 | | - return true; |
---|
1152 | | -} |
---|
| 1089 | + rc = bdev_dax_pgoff(iomap->bdev, sector, PAGE_SIZE, &pgoff); |
---|
| 1090 | + if (rc) |
---|
| 1091 | + return rc; |
---|
1153 | 1092 | |
---|
1154 | | -int __dax_zero_page_range(struct block_device *bdev, |
---|
1155 | | - struct dax_device *dax_dev, sector_t sector, |
---|
1156 | | - unsigned int offset, unsigned int size) |
---|
1157 | | -{ |
---|
1158 | | - if (dax_range_is_aligned(bdev, offset, size)) { |
---|
1159 | | - sector_t start_sector = sector + (offset >> 9); |
---|
| 1093 | + id = dax_read_lock(); |
---|
1160 | 1094 | |
---|
1161 | | - return blkdev_issue_zeroout(bdev, start_sector, |
---|
1162 | | - size >> 9, GFP_NOFS, 0); |
---|
1163 | | - } else { |
---|
1164 | | - pgoff_t pgoff; |
---|
1165 | | - long rc, id; |
---|
1166 | | - void *kaddr; |
---|
1167 | | - |
---|
1168 | | - rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff); |
---|
1169 | | - if (rc) |
---|
1170 | | - return rc; |
---|
1171 | | - |
---|
1172 | | - id = dax_read_lock(); |
---|
1173 | | - rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL); |
---|
1174 | | - if (rc < 0) { |
---|
1175 | | - dax_read_unlock(id); |
---|
1176 | | - return rc; |
---|
1177 | | - } |
---|
1178 | | - memset(kaddr + offset, 0, size); |
---|
1179 | | - dax_flush(dax_dev, kaddr + offset, size); |
---|
| 1095 | + if (page_aligned) |
---|
| 1096 | + rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1); |
---|
| 1097 | + else |
---|
| 1098 | + rc = dax_direct_access(iomap->dax_dev, pgoff, 1, &kaddr, NULL); |
---|
| 1099 | + if (rc < 0) { |
---|
1180 | 1100 | dax_read_unlock(id); |
---|
| 1101 | + return rc; |
---|
1181 | 1102 | } |
---|
1182 | | - return 0; |
---|
| 1103 | + |
---|
| 1104 | + if (!page_aligned) { |
---|
| 1105 | + memset(kaddr + offset, 0, size); |
---|
| 1106 | + dax_flush(iomap->dax_dev, kaddr + offset, size); |
---|
| 1107 | + } |
---|
| 1108 | + dax_read_unlock(id); |
---|
| 1109 | + return size; |
---|
1183 | 1110 | } |
---|
1184 | | -EXPORT_SYMBOL_GPL(__dax_zero_page_range); |
---|
1185 | 1111 | |
---|
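The new dax_iomap_zero() zeroes at most one page worth of bytes per call and returns how many bytes it handled (or a negative errno), so a range spanning several pages is consumed in a loop. A hedged sketch of such a caller is below; "myfs_zero_extent" is a made-up helper, and in mainline the equivalent loop lives in the generic iomap zeroing path rather than in filesystem code.

```c
/* Sketch only: looping over dax_iomap_zero()'s per-page return value. */
#include <linux/dax.h>
#include <linux/iomap.h>

static int myfs_zero_extent(loff_t pos, u64 length, struct iomap *iomap)
{
	while (length > 0) {
		s64 done = dax_iomap_zero(pos, length, iomap);

		if (done < 0)
			return done;	/* negative errno from the DAX layer */
		pos += done;
		length -= done;
	}
	return 0;
}
```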
1186 | 1112 | static loff_t |
---|
1187 | 1113 | dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, |
---|
1188 | | - struct iomap *iomap) |
---|
| 1114 | + struct iomap *iomap, struct iomap *srcmap) |
---|
1189 | 1115 | { |
---|
1190 | 1116 | struct block_device *bdev = iomap->bdev; |
---|
1191 | 1117 | struct dax_device *dax_dev = iomap->dax_dev; |
---|
.. | .. |
---|
1295 | 1221 | unsigned flags = 0; |
---|
1296 | 1222 | |
---|
1297 | 1223 | if (iov_iter_rw(iter) == WRITE) { |
---|
1298 | | - lockdep_assert_held_exclusive(&inode->i_rwsem); |
---|
| 1224 | + lockdep_assert_held_write(&inode->i_rwsem); |
---|
1299 | 1225 | flags |= IOMAP_WRITE; |
---|
1300 | 1226 | } else { |
---|
1301 | 1227 | lockdep_assert_held(&inode->i_rwsem); |
---|
.. | .. |
---|
1322 | 1248 | { |
---|
1323 | 1249 | if (error == 0) |
---|
1324 | 1250 | return VM_FAULT_NOPAGE; |
---|
1325 | | - if (error == -ENOMEM) |
---|
1326 | | - return VM_FAULT_OOM; |
---|
1327 | | - return VM_FAULT_SIGBUS; |
---|
| 1251 | + return vmf_error(error); |
---|
1328 | 1252 | } |
---|
1329 | 1253 | |
---|
1330 | 1254 | /* |
---|
.. | .. |
---|
1343 | 1267 | { |
---|
1344 | 1268 | struct vm_area_struct *vma = vmf->vma; |
---|
1345 | 1269 | struct address_space *mapping = vma->vm_file->f_mapping; |
---|
| 1270 | + XA_STATE(xas, &mapping->i_pages, vmf->pgoff); |
---|
1346 | 1271 | struct inode *inode = mapping->host; |
---|
1347 | 1272 | unsigned long vaddr = vmf->address; |
---|
1348 | 1273 | loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; |
---|
1349 | | - struct iomap iomap = { 0 }; |
---|
| 1274 | + struct iomap iomap = { .type = IOMAP_HOLE }; |
---|
| 1275 | + struct iomap srcmap = { .type = IOMAP_HOLE }; |
---|
1350 | 1276 | unsigned flags = IOMAP_FAULT; |
---|
1351 | 1277 | int error, major = 0; |
---|
1352 | 1278 | bool write = vmf->flags & FAULT_FLAG_WRITE; |
---|
.. | .. |
---|
1369 | 1295 | if (write && !vmf->cow_page) |
---|
1370 | 1296 | flags |= IOMAP_WRITE; |
---|
1371 | 1297 | |
---|
1372 | | - entry = grab_mapping_entry(mapping, vmf->pgoff, 0); |
---|
1373 | | - if (IS_ERR(entry)) { |
---|
1374 | | - ret = dax_fault_return(PTR_ERR(entry)); |
---|
| 1298 | + entry = grab_mapping_entry(&xas, mapping, 0); |
---|
| 1299 | + if (xa_is_internal(entry)) { |
---|
| 1300 | + ret = xa_to_internal(entry); |
---|
1375 | 1301 | goto out; |
---|
1376 | 1302 | } |
---|
1377 | 1303 | |
---|
.. | .. |
---|
1391 | 1317 | * the file system block size to be equal the page size, which means |
---|
1392 | 1318 | * that we never have to deal with more than a single extent here. |
---|
1393 | 1319 | */ |
---|
1394 | | - error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap); |
---|
| 1320 | + error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap, &srcmap); |
---|
1395 | 1321 | if (iomap_errp) |
---|
1396 | 1322 | *iomap_errp = error; |
---|
1397 | 1323 | if (error) { |
---|
.. | .. |
---|
1412 | 1338 | clear_user_highpage(vmf->cow_page, vaddr); |
---|
1413 | 1339 | break; |
---|
1414 | 1340 | case IOMAP_MAPPED: |
---|
1415 | | - error = copy_user_dax(iomap.bdev, iomap.dax_dev, |
---|
1416 | | - sector, PAGE_SIZE, vmf->cow_page, vaddr); |
---|
| 1341 | + error = copy_cow_page_dax(iomap.bdev, iomap.dax_dev, |
---|
| 1342 | + sector, vmf->cow_page, vaddr); |
---|
1417 | 1343 | break; |
---|
1418 | 1344 | default: |
---|
1419 | 1345 | WARN_ON_ONCE(1); |
---|
.. | .. |
---|
1444 | 1370 | if (error < 0) |
---|
1445 | 1371 | goto error_finish_iomap; |
---|
1446 | 1372 | |
---|
1447 | | - entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn, |
---|
| 1373 | + entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn, |
---|
1448 | 1374 | 0, write && !sync); |
---|
1449 | 1375 | |
---|
1450 | 1376 | /* |
---|
.. | .. |
---|
1472 | 1398 | case IOMAP_UNWRITTEN: |
---|
1473 | 1399 | case IOMAP_HOLE: |
---|
1474 | 1400 | if (!write) { |
---|
1475 | | - ret = dax_load_hole(mapping, entry, vmf); |
---|
| 1401 | + ret = dax_load_hole(&xas, mapping, &entry, vmf); |
---|
1476 | 1402 | goto finish_iomap; |
---|
1477 | 1403 | } |
---|
1478 | | - /*FALLTHRU*/ |
---|
| 1404 | + fallthrough; |
---|
1479 | 1405 | default: |
---|
1480 | 1406 | WARN_ON_ONCE(1); |
---|
1481 | 1407 | error = -EIO; |
---|
.. | .. |
---|
1499 | 1425 | ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap); |
---|
1500 | 1426 | } |
---|
1501 | 1427 | unlock_entry: |
---|
1502 | | - put_locked_mapping_entry(mapping, vmf->pgoff); |
---|
| 1428 | + dax_unlock_entry(&xas, entry); |
---|
1503 | 1429 | out: |
---|
1504 | 1430 | trace_dax_pte_fault_done(inode, vmf, ret); |
---|
1505 | 1431 | return ret | major; |
---|
1506 | 1432 | } |
---|
1507 | 1433 | |
---|
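For context on how the PTE path above gets invoked: a filesystem's fault handler simply delegates to dax_iomap_fault() with its iomap_ops. The sketch below shows only that calling convention; "myfs_iomap_ops" is a placeholder, and the locking a real filesystem takes around the call (sb_start_pagefault(), timestamp updates, and so on) is omitted.

```c
/* Sketch of a fault handler built on dax_iomap_fault(); names are hypothetical. */
#include <linux/dax.h>
#include <linux/iomap.h>
#include <linux/mm.h>

extern const struct iomap_ops myfs_iomap_ops;	/* assumed to exist */

static vm_fault_t myfs_dax_fault(struct vm_fault *vmf)
{
	pfn_t pfn;

	/* PTE-sized fault; PMD faults arrive via ->huge_fault instead. */
	return dax_iomap_fault(vmf, PE_SIZE_PTE, &pfn, NULL, &myfs_iomap_ops);
}

static const struct vm_operations_struct myfs_dax_vm_ops = {
	.fault		= myfs_dax_fault,
	.page_mkwrite	= myfs_dax_fault,
	.pfn_mkwrite	= myfs_dax_fault,
};
```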
1508 | 1434 | #ifdef CONFIG_FS_DAX_PMD |
---|
1509 | | -static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, |
---|
1510 | | - void *entry) |
---|
| 1435 | +static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, |
---|
| 1436 | + struct iomap *iomap, void **entry) |
---|
1511 | 1437 | { |
---|
1512 | 1438 | struct address_space *mapping = vmf->vma->vm_file->f_mapping; |
---|
1513 | 1439 | unsigned long pmd_addr = vmf->address & PMD_MASK; |
---|
| 1440 | + struct vm_area_struct *vma = vmf->vma; |
---|
1514 | 1441 | struct inode *inode = mapping->host; |
---|
| 1442 | + pgtable_t pgtable = NULL; |
---|
1515 | 1443 | struct page *zero_page; |
---|
1516 | | - void *ret = NULL; |
---|
1517 | 1444 | spinlock_t *ptl; |
---|
1518 | 1445 | pmd_t pmd_entry; |
---|
1519 | 1446 | pfn_t pfn; |
---|
.. | .. |
---|
1524 | 1451 | goto fallback; |
---|
1525 | 1452 | |
---|
1526 | 1453 | pfn = page_to_pfn_t(zero_page); |
---|
1527 | | - ret = dax_insert_mapping_entry(mapping, vmf, entry, pfn, |
---|
1528 | | - RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false); |
---|
| 1454 | + *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn, |
---|
| 1455 | + DAX_PMD | DAX_ZERO_PAGE, false); |
---|
| 1456 | + |
---|
| 1457 | + if (arch_needs_pgtable_deposit()) { |
---|
| 1458 | + pgtable = pte_alloc_one(vma->vm_mm); |
---|
| 1459 | + if (!pgtable) |
---|
| 1460 | + return VM_FAULT_OOM; |
---|
| 1461 | + } |
---|
1529 | 1462 | |
---|
1530 | 1463 | ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); |
---|
1531 | 1464 | if (!pmd_none(*(vmf->pmd))) { |
---|
.. | .. |
---|
1533 | 1466 | goto fallback; |
---|
1534 | 1467 | } |
---|
1535 | 1468 | |
---|
| 1469 | + if (pgtable) { |
---|
| 1470 | + pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); |
---|
| 1471 | + mm_inc_nr_ptes(vma->vm_mm); |
---|
| 1472 | + } |
---|
1536 | 1473 | pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot); |
---|
1537 | 1474 | pmd_entry = pmd_mkhuge(pmd_entry); |
---|
1538 | 1475 | set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry); |
---|
1539 | 1476 | spin_unlock(ptl); |
---|
1540 | | - trace_dax_pmd_load_hole(inode, vmf, zero_page, ret); |
---|
| 1477 | + trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry); |
---|
1541 | 1478 | return VM_FAULT_NOPAGE; |
---|
1542 | 1479 | |
---|
1543 | 1480 | fallback: |
---|
1544 | | - trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, ret); |
---|
| 1481 | + if (pgtable) |
---|
| 1482 | + pte_free(vma->vm_mm, pgtable); |
---|
| 1483 | + trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry); |
---|
1545 | 1484 | return VM_FAULT_FALLBACK; |
---|
1546 | 1485 | } |
---|
1547 | 1486 | |
---|
.. | .. |
---|
1550 | 1489 | { |
---|
1551 | 1490 | struct vm_area_struct *vma = vmf->vma; |
---|
1552 | 1491 | struct address_space *mapping = vma->vm_file->f_mapping; |
---|
| 1492 | + XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER); |
---|
1553 | 1493 | unsigned long pmd_addr = vmf->address & PMD_MASK; |
---|
1554 | 1494 | bool write = vmf->flags & FAULT_FLAG_WRITE; |
---|
1555 | 1495 | bool sync; |
---|
1556 | 1496 | unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT; |
---|
1557 | 1497 | struct inode *inode = mapping->host; |
---|
1558 | 1498 | vm_fault_t result = VM_FAULT_FALLBACK; |
---|
1559 | | - struct iomap iomap = { 0 }; |
---|
1560 | | - pgoff_t max_pgoff, pgoff; |
---|
| 1499 | + struct iomap iomap = { .type = IOMAP_HOLE }; |
---|
| 1500 | + struct iomap srcmap = { .type = IOMAP_HOLE }; |
---|
| 1501 | + pgoff_t max_pgoff; |
---|
1561 | 1502 | void *entry; |
---|
1562 | 1503 | loff_t pos; |
---|
1563 | 1504 | int error; |
---|
.. | .. |
---|
1568 | 1509 | * supposed to hold locks serializing us with truncate / punch hole so |
---|
1569 | 1510 | * this is a reliable test. |
---|
1570 | 1511 | */ |
---|
1571 | | - pgoff = linear_page_index(vma, pmd_addr); |
---|
1572 | 1512 | max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); |
---|
1573 | 1513 | |
---|
1574 | 1514 | trace_dax_pmd_fault(inode, vmf, max_pgoff, 0); |
---|
.. | .. |
---|
1577 | 1517 | * Make sure that the faulting address's PMD offset (color) matches |
---|
1578 | 1518 | * the PMD offset from the start of the file. This is necessary so |
---|
1579 | 1519 | * that a PMD range in the page table overlaps exactly with a PMD |
---|
1580 | | - * range in the radix tree. |
---|
| 1520 | + * range in the page cache. |
---|
1581 | 1521 | */ |
---|
1582 | 1522 | if ((vmf->pgoff & PG_PMD_COLOUR) != |
---|
1583 | 1523 | ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR)) |
---|
.. | .. |
---|
1593 | 1533 | if ((pmd_addr + PMD_SIZE) > vma->vm_end) |
---|
1594 | 1534 | goto fallback; |
---|
1595 | 1535 | |
---|
1596 | | - if (pgoff >= max_pgoff) { |
---|
| 1536 | + if (xas.xa_index >= max_pgoff) { |
---|
1597 | 1537 | result = VM_FAULT_SIGBUS; |
---|
1598 | 1538 | goto out; |
---|
1599 | 1539 | } |
---|
1600 | 1540 | |
---|
1601 | 1541 | /* If the PMD would extend beyond the file size */ |
---|
1602 | | - if ((pgoff | PG_PMD_COLOUR) >= max_pgoff) |
---|
| 1542 | + if ((xas.xa_index | PG_PMD_COLOUR) >= max_pgoff) |
---|
1603 | 1543 | goto fallback; |
---|
1604 | 1544 | |
---|
1605 | 1545 | /* |
---|
1606 | | - * grab_mapping_entry() will make sure we get a 2MiB empty entry, a |
---|
1607 | | - * 2MiB zero page entry or a DAX PMD. If it can't (because a 4k page |
---|
1608 | | - * is already in the tree, for instance), it will return -EEXIST and |
---|
1609 | | - * we just fall back to 4k entries. |
---|
| 1546 | + * grab_mapping_entry() will make sure we get an empty PMD entry, |
---|
| 1547 | + * a zero PMD entry or a DAX PMD. If it can't (because a PTE |
---|
| 1548 | + * entry is already in the array, for instance), it will return |
---|
| 1549 | + * VM_FAULT_FALLBACK. |
---|
1610 | 1550 | */ |
---|
1611 | | - entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD); |
---|
1612 | | - if (IS_ERR(entry)) |
---|
| 1551 | + entry = grab_mapping_entry(&xas, mapping, PMD_ORDER); |
---|
| 1552 | + if (xa_is_internal(entry)) { |
---|
| 1553 | + result = xa_to_internal(entry); |
---|
1613 | 1554 | goto fallback; |
---|
| 1555 | + } |
---|
1614 | 1556 | |
---|
1615 | 1557 | /* |
---|
1616 | 1558 | * It is possible, particularly with mixed reads & writes to private |
---|
.. | .. |
---|
1629 | 1571 | * setting up a mapping, so really we're using iomap_begin() as a way |
---|
1630 | 1572 | * to look up our filesystem block. |
---|
1631 | 1573 | */ |
---|
1632 | | - pos = (loff_t)pgoff << PAGE_SHIFT; |
---|
1633 | | - error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap); |
---|
| 1574 | + pos = (loff_t)xas.xa_index << PAGE_SHIFT; |
---|
| 1575 | + error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap, |
---|
| 1576 | + &srcmap); |
---|
1634 | 1577 | if (error) |
---|
1635 | 1578 | goto unlock_entry; |
---|
1636 | 1579 | |
---|
.. | .. |
---|
1645 | 1588 | if (error < 0) |
---|
1646 | 1589 | goto finish_iomap; |
---|
1647 | 1590 | |
---|
1648 | | - entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn, |
---|
1649 | | - RADIX_DAX_PMD, write && !sync); |
---|
| 1591 | + entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn, |
---|
| 1592 | + DAX_PMD, write && !sync); |
---|
1650 | 1593 | |
---|
1651 | 1594 | /* |
---|
1652 | 1595 | * If we are doing synchronous page fault and inode needs fsync, |
---|
.. | .. |
---|
1669 | 1612 | case IOMAP_HOLE: |
---|
1670 | 1613 | if (WARN_ON_ONCE(write)) |
---|
1671 | 1614 | break; |
---|
1672 | | - result = dax_pmd_load_hole(vmf, &iomap, entry); |
---|
| 1615 | + result = dax_pmd_load_hole(&xas, vmf, &iomap, &entry); |
---|
1673 | 1616 | break; |
---|
1674 | 1617 | default: |
---|
1675 | 1618 | WARN_ON_ONCE(1); |
---|
.. | .. |
---|
1692 | 1635 | &iomap); |
---|
1693 | 1636 | } |
---|
1694 | 1637 | unlock_entry: |
---|
1695 | | - put_locked_mapping_entry(mapping, pgoff); |
---|
| 1638 | + dax_unlock_entry(&xas, entry); |
---|
1696 | 1639 | fallback: |
---|
1697 | 1640 | if (result == VM_FAULT_FALLBACK) { |
---|
1698 | 1641 | split_huge_pmd(vma, vmf->pmd, vmf->address); |
---|
.. | .. |
---|
1737 | 1680 | } |
---|
1738 | 1681 | EXPORT_SYMBOL_GPL(dax_iomap_fault); |
---|
1739 | 1682 | |
---|
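The PMD "colour" test in dax_iomap_pmd_fault() above (vmf->pgoff and the faulting address must sit at the same offset within their PMD) is easier to see with numbers. A small userspace illustration follows; the 4 KiB page and 2 MiB PMD sizes are the usual x86-64 assumptions, and the offsets are invented.

```c
/* Illustration of the PMD colour check (4 KiB pages, 2 MiB PMDs assumed). */
#include <stdio.h>

#define PAGE_SHIFT	12
#define PG_PMD_COLOUR	(((1UL << 21) >> PAGE_SHIFT) - 1)	/* 511 */

int main(void)
{
	unsigned long pgoff = 0x3200;			/* file offset, in pages */
	unsigned long address = 0x7f0040200000UL;	/* faulting virtual address */

	/* Both sit at colour 0 inside their PMD, so a PMD mapping lines up. */
	if ((pgoff & PG_PMD_COLOUR) ==
	    ((address >> PAGE_SHIFT) & PG_PMD_COLOUR))
		printf("file and virtual PMD ranges are congruent\n");
	else
		printf("misaligned: the fault falls back to PTEs\n");
	return 0;
}
```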
1740 | | -/** |
---|
| 1683 | +/* |
---|
1741 | 1684 | * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables |
---|
1742 | 1685 | * @vmf: The description of the fault |
---|
1743 | | - * @pe_size: Size of entry to be inserted |
---|
1744 | 1686 | * @pfn: PFN to insert |
---|
| 1687 | + * @order: Order of entry to insert. |
---|
1745 | 1688 | * |
---|
1746 | | - * This function inserts writeable PTE or PMD entry into page tables for mmaped |
---|
1747 | | - * DAX file. It takes care of marking corresponding radix tree entry as dirty |
---|
1748 | | - * as well. |
---|
| 1689 | + * This function inserts a writeable PTE or PMD entry into the page tables |
---|
| 1690 | + * for an mmaped DAX file. It also marks the page cache entry as dirty. |
---|
1749 | 1691 | */ |
---|
1750 | | -static vm_fault_t dax_insert_pfn_mkwrite(struct vm_fault *vmf, |
---|
1751 | | - enum page_entry_size pe_size, |
---|
1752 | | - pfn_t pfn) |
---|
| 1692 | +static vm_fault_t |
---|
| 1693 | +dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order) |
---|
1753 | 1694 | { |
---|
1754 | 1695 | struct address_space *mapping = vmf->vma->vm_file->f_mapping; |
---|
1755 | | - void *entry, **slot; |
---|
1756 | | - pgoff_t index = vmf->pgoff; |
---|
| 1696 | + XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order); |
---|
| 1697 | + void *entry; |
---|
1757 | 1698 | vm_fault_t ret; |
---|
1758 | 1699 | |
---|
1759 | | - xa_lock_irq(&mapping->i_pages); |
---|
1760 | | - entry = get_unlocked_mapping_entry(mapping, index, &slot); |
---|
| 1700 | + xas_lock_irq(&xas); |
---|
| 1701 | + entry = get_unlocked_entry(&xas, order); |
---|
1761 | 1702 | /* Did we race with someone splitting entry or so? */ |
---|
1762 | | - if (!entry || |
---|
1763 | | - (pe_size == PE_SIZE_PTE && !dax_is_pte_entry(entry)) || |
---|
1764 | | - (pe_size == PE_SIZE_PMD && !dax_is_pmd_entry(entry))) { |
---|
1765 | | - put_unlocked_mapping_entry(mapping, index, entry); |
---|
1766 | | - xa_unlock_irq(&mapping->i_pages); |
---|
| 1703 | + if (!entry || dax_is_conflict(entry) || |
---|
| 1704 | + (order == 0 && !dax_is_pte_entry(entry))) { |
---|
| 1705 | + put_unlocked_entry(&xas, entry, WAKE_NEXT); |
---|
| 1706 | + xas_unlock_irq(&xas); |
---|
1767 | 1707 | trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf, |
---|
1768 | 1708 | VM_FAULT_NOPAGE); |
---|
1769 | 1709 | return VM_FAULT_NOPAGE; |
---|
1770 | 1710 | } |
---|
1771 | | - radix_tree_tag_set(&mapping->i_pages, index, PAGECACHE_TAG_DIRTY); |
---|
1772 | | - entry = lock_slot(mapping, slot); |
---|
1773 | | - xa_unlock_irq(&mapping->i_pages); |
---|
1774 | | - switch (pe_size) { |
---|
1775 | | - case PE_SIZE_PTE: |
---|
| 1711 | + xas_set_mark(&xas, PAGECACHE_TAG_DIRTY); |
---|
| 1712 | + dax_lock_entry(&xas, entry); |
---|
| 1713 | + xas_unlock_irq(&xas); |
---|
| 1714 | + if (order == 0) |
---|
1776 | 1715 | ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn); |
---|
1777 | | - break; |
---|
1778 | 1716 | #ifdef CONFIG_FS_DAX_PMD |
---|
1779 | | - case PE_SIZE_PMD: |
---|
| 1717 | + else if (order == PMD_ORDER) |
---|
1780 | 1718 | ret = vmf_insert_pfn_pmd(vmf, pfn, FAULT_FLAG_WRITE); |
---|
1781 | | - break; |
---|
1782 | 1719 | #endif |
---|
1783 | | - default: |
---|
| 1720 | + else |
---|
1784 | 1721 | ret = VM_FAULT_FALLBACK; |
---|
1785 | | - } |
---|
1786 | | - put_locked_mapping_entry(mapping, index); |
---|
| 1722 | + dax_unlock_entry(&xas, entry); |
---|
1787 | 1723 | trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret); |
---|
1788 | 1724 | return ret; |
---|
1789 | 1725 | } |
---|
.. | .. |
---|
1803 | 1739 | { |
---|
1804 | 1740 | int err; |
---|
1805 | 1741 | loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT; |
---|
1806 | | - size_t len = 0; |
---|
| 1742 | + unsigned int order = pe_order(pe_size); |
---|
| 1743 | + size_t len = PAGE_SIZE << order; |
---|
1807 | 1744 | |
---|
1808 | | - if (pe_size == PE_SIZE_PTE) |
---|
1809 | | - len = PAGE_SIZE; |
---|
1810 | | - else if (pe_size == PE_SIZE_PMD) |
---|
1811 | | - len = PMD_SIZE; |
---|
1812 | | - else |
---|
1813 | | - WARN_ON_ONCE(1); |
---|
1814 | 1745 | err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1); |
---|
1815 | 1746 | if (err) |
---|
1816 | 1747 | return VM_FAULT_SIGBUS; |
---|
1817 | | - return dax_insert_pfn_mkwrite(vmf, pe_size, pfn); |
---|
| 1748 | + return dax_insert_pfn_mkwrite(vmf, pfn, order); |
---|
1818 | 1749 | } |
---|
1819 | 1750 | EXPORT_SYMBOL_GPL(dax_finish_sync_fault); |
---|
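To close the loop on the synchronous-fault path: a hedged sketch of how a filesystem ties dax_iomap_fault() and dax_finish_sync_fault() together for MAP_SYNC mappings. As in the PTE sketch earlier, "myfs_iomap_ops" is a placeholder and error handling and locking are elided.

```c
/*
 * Sketch only: a MAP_SYNC-aware ->huge_fault handler.  dax_iomap_fault()
 * returns VM_FAULT_NEEDDSYNC for a write fault on an inode that still needs
 * fsync; the filesystem then calls dax_finish_sync_fault(), which flushes
 * the range and finally inserts the writeable PTE/PMD for the reported pfn.
 */
#include <linux/dax.h>
#include <linux/iomap.h>
#include <linux/mm.h>

extern const struct iomap_ops myfs_iomap_ops;	/* hypothetical */

static vm_fault_t myfs_dax_huge_fault(struct vm_fault *vmf,
				      enum page_entry_size pe_size)
{
	pfn_t pfn;
	vm_fault_t ret;

	ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, &myfs_iomap_ops);
	if (ret & VM_FAULT_NEEDDSYNC)
		ret = dax_finish_sync_fault(vmf, pe_size, pfn);

	return ret;
}
```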