hc
2023-12-08 01573e231f18eb2d99162747186f59511f56b64d
kernel/fs/dax.c
....@@ -1,17 +1,9 @@
1
+// SPDX-License-Identifier: GPL-2.0-only
12 /*
23 * fs/dax.c - Direct Access filesystem code
34 * Copyright (c) 2013-2014 Intel Corporation
45 * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
56 * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
6
- *
7
- * This program is free software; you can redistribute it and/or modify it
8
- * under the terms and conditions of the GNU General Public License,
9
- * version 2, as published by the Free Software Foundation.
10
- *
11
- * This program is distributed in the hope it will be useful, but WITHOUT
12
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14
- * more details.
157 */
168
179 #include <linux/atomic.h>
....@@ -33,10 +25,21 @@
3325 #include <linux/sizes.h>
3426 #include <linux/mmu_notifier.h>
3527 #include <linux/iomap.h>
36
-#include "internal.h"
28
+#include <asm/pgalloc.h>
3729
3830 #define CREATE_TRACE_POINTS
3931 #include <trace/events/fs_dax.h>
32
+
33
+static inline unsigned int pe_order(enum page_entry_size pe_size)
34
+{
35
+ if (pe_size == PE_SIZE_PTE)
36
+ return PAGE_SHIFT - PAGE_SHIFT;
37
+ if (pe_size == PE_SIZE_PMD)
38
+ return PMD_SHIFT - PAGE_SHIFT;
39
+ if (pe_size == PE_SIZE_PUD)
40
+ return PUD_SHIFT - PAGE_SHIFT;
41
+ return ~0;
42
+}
4043
4144 /* We choose 4096 entries - same as per-zone page wait tables */
4245 #define DAX_WAIT_TABLE_BITS 12
....@@ -45,6 +48,9 @@
4548 /* The 'colour' (ie low bits) within a PMD of a page offset. */
4649 #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
4750 #define PG_PMD_NR (PMD_SIZE >> PAGE_SHIFT)
51
+
52
+/* The order of a PMD entry */
53
+#define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT)
4854
4955 static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
5056
....@@ -59,63 +65,77 @@
5965 fs_initcall(init_dax_wait_table);
6066
6167 /*
62
- * We use lowest available bit in exceptional entry for locking, one bit for
63
- * the entry size (PMD) and two more to tell us if the entry is a zero page or
64
- * an empty entry that is just used for locking. In total four special bits.
68
+ * DAX pagecache entries use XArray value entries so they can't be mistaken
69
+ * for pages. We use one bit for locking, one bit for the entry size (PMD)
70
+ * and two more to tell us if the entry is a zero page or an empty entry that
71
+ * is just used for locking. In total four special bits.
6572 *
6673 * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE
6774 * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
6875 * block allocation.
6976 */
70
-#define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 4)
71
-#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
72
-#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
73
-#define RADIX_DAX_ZERO_PAGE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
74
-#define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))
77
+#define DAX_SHIFT (4)
78
+#define DAX_LOCKED (1UL << 0)
79
+#define DAX_PMD (1UL << 1)
80
+#define DAX_ZERO_PAGE (1UL << 2)
81
+#define DAX_EMPTY (1UL << 3)
7582
76
-static unsigned long dax_radix_pfn(void *entry)
83
+static unsigned long dax_to_pfn(void *entry)
7784 {
78
- return (unsigned long)entry >> RADIX_DAX_SHIFT;
85
+ return xa_to_value(entry) >> DAX_SHIFT;
7986 }
8087
81
-static void *dax_radix_locked_entry(unsigned long pfn, unsigned long flags)
88
+static void *dax_make_entry(pfn_t pfn, unsigned long flags)
8289 {
83
- return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags |
84
- (pfn << RADIX_DAX_SHIFT) | RADIX_DAX_ENTRY_LOCK);
90
+ return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT));
8591 }
8692
87
-static unsigned int dax_radix_order(void *entry)
93
+static bool dax_is_locked(void *entry)
8894 {
89
- if ((unsigned long)entry & RADIX_DAX_PMD)
90
- return PMD_SHIFT - PAGE_SHIFT;
95
+ return xa_to_value(entry) & DAX_LOCKED;
96
+}
97
+
98
+static unsigned int dax_entry_order(void *entry)
99
+{
100
+ if (xa_to_value(entry) & DAX_PMD)
101
+ return PMD_ORDER;
91102 return 0;
92103 }
93104
94
-static int dax_is_pmd_entry(void *entry)
105
+static unsigned long dax_is_pmd_entry(void *entry)
95106 {
96
- return (unsigned long)entry & RADIX_DAX_PMD;
107
+ return xa_to_value(entry) & DAX_PMD;
97108 }
98109
99
-static int dax_is_pte_entry(void *entry)
110
+static bool dax_is_pte_entry(void *entry)
100111 {
101
- return !((unsigned long)entry & RADIX_DAX_PMD);
112
+ return !(xa_to_value(entry) & DAX_PMD);
102113 }
103114
104115 static int dax_is_zero_entry(void *entry)
105116 {
106
- return (unsigned long)entry & RADIX_DAX_ZERO_PAGE;
117
+ return xa_to_value(entry) & DAX_ZERO_PAGE;
107118 }
108119
109120 static int dax_is_empty_entry(void *entry)
110121 {
111
- return (unsigned long)entry & RADIX_DAX_EMPTY;
122
+ return xa_to_value(entry) & DAX_EMPTY;
112123 }
113124
114125 /*
115
- * DAX radix tree locking
126
+ * true if the entry that was found is of a smaller order than the entry
127
+ * we were looking for
128
+ */
129
+static bool dax_is_conflict(void *entry)
130
+{
131
+ return entry == XA_RETRY_ENTRY;
132
+}
133
+
134
+/*
135
+ * DAX page cache entry locking
116136 */
117137 struct exceptional_entry_key {
118
- struct address_space *mapping;
138
+ struct xarray *xa;
119139 pgoff_t entry_start;
120140 };
121141
....@@ -124,10 +144,21 @@
124144 struct exceptional_entry_key key;
125145 };
126146
127
-static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
128
- pgoff_t index, void *entry, struct exceptional_entry_key *key)
147
+/**
148
+ * enum dax_wake_mode: waitqueue wakeup behaviour
149
+ * @WAKE_ALL: wake all waiters in the waitqueue
150
+ * @WAKE_NEXT: wake only the first waiter in the waitqueue
151
+ */
152
+enum dax_wake_mode {
153
+ WAKE_ALL,
154
+ WAKE_NEXT,
155
+};
156
+
157
+static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas,
158
+ void *entry, struct exceptional_entry_key *key)
129159 {
130160 unsigned long hash;
161
+ unsigned long index = xas->xa_index;
131162
132163 /*
133164 * If 'entry' is a PMD, align the 'index' that we use for the wait
....@@ -136,22 +167,21 @@
136167 */
137168 if (dax_is_pmd_entry(entry))
138169 index &= ~PG_PMD_COLOUR;
139
-
140
- key->mapping = mapping;
170
+ key->xa = xas->xa;
141171 key->entry_start = index;
142172
143
- hash = hash_long((unsigned long)mapping ^ index, DAX_WAIT_TABLE_BITS);
173
+ hash = hash_long((unsigned long)xas->xa ^ index, DAX_WAIT_TABLE_BITS);
144174 return wait_table + hash;
145175 }
146176
147
-static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mode,
148
- int sync, void *keyp)
177
+static int wake_exceptional_entry_func(wait_queue_entry_t *wait,
178
+ unsigned int mode, int sync, void *keyp)
149179 {
150180 struct exceptional_entry_key *key = keyp;
151181 struct wait_exceptional_entry_queue *ewait =
152182 container_of(wait, struct wait_exceptional_entry_queue, wait);
153183
154
- if (key->mapping != ewait->key.mapping ||
184
+ if (key->xa != ewait->key.xa ||
155185 key->entry_start != ewait->key.entry_start)
156186 return 0;
157187 return autoremove_wake_function(wait, mode, sync, NULL);
....@@ -162,13 +192,13 @@
162192 * The important information it's conveying is whether the entry at
163193 * this index used to be a PMD entry.
164194 */
165
-static void dax_wake_mapping_entry_waiter(struct address_space *mapping,
166
- pgoff_t index, void *entry, bool wake_all)
195
+static void dax_wake_entry(struct xa_state *xas, void *entry,
196
+ enum dax_wake_mode mode)
167197 {
168198 struct exceptional_entry_key key;
169199 wait_queue_head_t *wq;
170200
171
- wq = dax_entry_waitqueue(mapping, index, entry, &key);
201
+ wq = dax_entry_waitqueue(xas, entry, &key);
172202
173203 /*
174204 * Checking for locked entry and prepare_to_wait_exclusive() happens
....@@ -177,62 +207,22 @@
177207 * must be in the waitqueue and the following check will see them.
178208 */
179209 if (waitqueue_active(wq))
180
- __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
210
+ __wake_up(wq, TASK_NORMAL, mode == WAKE_ALL ? 0 : 1, &key);
181211 }
182212
183213 /*
184
- * Check whether the given slot is locked. Must be called with the i_pages
185
- * lock held.
186
- */
187
-static inline int slot_locked(struct address_space *mapping, void **slot)
188
-{
189
- unsigned long entry = (unsigned long)
190
- radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock);
191
- return entry & RADIX_DAX_ENTRY_LOCK;
192
-}
193
-
194
-/*
195
- * Mark the given slot as locked. Must be called with the i_pages lock held.
196
- */
197
-static inline void *lock_slot(struct address_space *mapping, void **slot)
198
-{
199
- unsigned long entry = (unsigned long)
200
- radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock);
201
-
202
- entry |= RADIX_DAX_ENTRY_LOCK;
203
- radix_tree_replace_slot(&mapping->i_pages, slot, (void *)entry);
204
- return (void *)entry;
205
-}
206
-
207
-/*
208
- * Mark the given slot as unlocked. Must be called with the i_pages lock held.
209
- */
210
-static inline void *unlock_slot(struct address_space *mapping, void **slot)
211
-{
212
- unsigned long entry = (unsigned long)
213
- radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock);
214
-
215
- entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK;
216
- radix_tree_replace_slot(&mapping->i_pages, slot, (void *)entry);
217
- return (void *)entry;
218
-}
219
-
220
-static void put_unlocked_mapping_entry(struct address_space *mapping,
221
- pgoff_t index, void *entry);
222
-
223
-/*
224
- * Lookup entry in radix tree, wait for it to become unlocked if it is
225
- * exceptional entry and return it. The caller must call
226
- * put_unlocked_mapping_entry() when he decided not to lock the entry or
227
- * put_locked_mapping_entry() when he locked the entry and now wants to
228
- * unlock it.
214
+ * Look up entry in page cache, wait for it to become unlocked if it
215
+ * is a DAX entry and return it. The caller must subsequently call
216
+ * put_unlocked_entry() if it did not lock the entry or dax_unlock_entry()
217
+ * if it did. The entry returned may have a larger order than @order.
218
+ * If @order is larger than the order of the entry found in i_pages, this
219
+ * function returns a dax_is_conflict entry.
229220 *
230221 * Must be called with the i_pages lock held.
231222 */
232
-static void *get_unlocked_mapping_entry(struct address_space *mapping,
233
- pgoff_t index, void ***slotp)
223
+static void *get_unlocked_entry(struct xa_state *xas, unsigned int order)
234224 {
235
- void *entry, **slot;
225
+ void *entry;
236226 struct wait_exceptional_entry_queue ewait;
237227 wait_queue_head_t *wq;
238228
....@@ -240,23 +230,22 @@
240230 ewait.wait.func = wake_exceptional_entry_func;
241231
242232 for (;;) {
243
- entry = __radix_tree_lookup(&mapping->i_pages, index, NULL,
244
- &slot);
245
- if (!entry ||
246
- WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)) ||
247
- !slot_locked(mapping, slot)) {
248
- if (slotp)
249
- *slotp = slot;
233
+ entry = xas_find_conflict(xas);
234
+ if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
250235 return entry;
251
- }
236
+ if (dax_entry_order(entry) < order)
237
+ return XA_RETRY_ENTRY;
238
+ if (!dax_is_locked(entry))
239
+ return entry;
252240
253
- wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key);
241
+ wq = dax_entry_waitqueue(xas, entry, &ewait.key);
254242 prepare_to_wait_exclusive(wq, &ewait.wait,
255243 TASK_UNINTERRUPTIBLE);
256
- xa_unlock_irq(&mapping->i_pages);
244
+ xas_unlock_irq(xas);
245
+ xas_reset(xas);
257246 schedule();
258247 finish_wait(wq, &ewait.wait);
259
- xa_lock_irq(&mapping->i_pages);
248
+ xas_lock_irq(xas);
260249 }
261250 }
262251
....@@ -265,8 +254,7 @@
265254 * (it's cycled in clear_inode() after removing the entries from i_pages)
266255 * After we call xas_unlock_irq(), we cannot touch xas->xa.
267256 */
268
-static void wait_entry_unlocked(struct address_space *mapping, pgoff_t index,
269
- void ***slotp, void *entry)
257
+static void wait_entry_unlocked(struct xa_state *xas, void *entry)
270258 {
271259 struct wait_exceptional_entry_queue ewait;
272260 wait_queue_head_t *wq;
....@@ -274,7 +262,7 @@
274262 init_wait(&ewait.wait);
275263 ewait.wait.func = wake_exceptional_entry_func;
276264
277
- wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key);
265
+ wq = dax_entry_waitqueue(xas, entry, &ewait.key);
278266 /*
279267 * Unlike get_unlocked_entry() there is no guarantee that this
280268 * path ever successfully retrieves an unlocked entry before an
....@@ -282,45 +270,43 @@
282270 * never successfully performs its own wake up.
283271 */
284272 prepare_to_wait(wq, &ewait.wait, TASK_UNINTERRUPTIBLE);
285
- xa_unlock_irq(&mapping->i_pages);
273
+ xas_unlock_irq(xas);
286274 schedule();
287275 finish_wait(wq, &ewait.wait);
288276 }
289277
290
-static void unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
278
+static void put_unlocked_entry(struct xa_state *xas, void *entry,
279
+ enum dax_wake_mode mode)
291280 {
292
- void *entry, **slot;
293
-
294
- xa_lock_irq(&mapping->i_pages);
295
- entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, &slot);
296
- if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) ||
297
- !slot_locked(mapping, slot))) {
298
- xa_unlock_irq(&mapping->i_pages);
299
- return;
300
- }
301
- unlock_slot(mapping, slot);
302
- xa_unlock_irq(&mapping->i_pages);
303
- dax_wake_mapping_entry_waiter(mapping, index, entry, false);
304
-}
305
-
306
-static void put_locked_mapping_entry(struct address_space *mapping,
307
- pgoff_t index)
308
-{
309
- unlock_mapping_entry(mapping, index);
281
+ if (entry && !dax_is_conflict(entry))
282
+ dax_wake_entry(xas, entry, mode);
310283 }
311284
312285 /*
313
- * Called when we are done with radix tree entry we looked up via
314
- * get_unlocked_mapping_entry() and which we didn't lock in the end.
286
+ * We used the xa_state to get the entry, but then we locked the entry and
287
+ * dropped the xa_lock, so we know the xa_state is stale and must be reset
288
+ * before use.
315289 */
316
-static void put_unlocked_mapping_entry(struct address_space *mapping,
317
- pgoff_t index, void *entry)
290
+static void dax_unlock_entry(struct xa_state *xas, void *entry)
318291 {
319
- if (!entry)
320
- return;
292
+ void *old;
321293
322
- /* We have to wake up next waiter for the radix tree entry lock */
323
- dax_wake_mapping_entry_waiter(mapping, index, entry, false);
294
+ BUG_ON(dax_is_locked(entry));
295
+ xas_reset(xas);
296
+ xas_lock_irq(xas);
297
+ old = xas_store(xas, entry);
298
+ xas_unlock_irq(xas);
299
+ BUG_ON(!dax_is_locked(old));
300
+ dax_wake_entry(xas, entry, WAKE_NEXT);
301
+}
302
+
303
+/*
304
+ * Return: The entry stored at this location before it was locked.
305
+ */
306
+static void *dax_lock_entry(struct xa_state *xas, void *entry)
307
+{
308
+ unsigned long v = xa_to_value(entry);
309
+ return xas_store(xas, xa_mk_value(v | DAX_LOCKED));
324310 }
325311
326312 static unsigned long dax_entry_size(void *entry)
....@@ -335,9 +321,9 @@
335321 return PAGE_SIZE;
336322 }
337323
338
-static unsigned long dax_radix_end_pfn(void *entry)
324
+static unsigned long dax_end_pfn(void *entry)
339325 {
340
- return dax_radix_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE;
326
+ return dax_to_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE;
341327 }
342328
343329 /*
....@@ -345,8 +331,8 @@
345331 * 'empty' and 'zero' entries.
346332 */
347333 #define for_each_mapped_pfn(entry, pfn) \
348
- for (pfn = dax_radix_pfn(entry); \
349
- pfn < dax_radix_end_pfn(entry); pfn++)
334
+ for (pfn = dax_to_pfn(entry); \
335
+ pfn < dax_end_pfn(entry); pfn++)
350336
351337 /*
352338 * TODO: for reflink+dax we need a way to associate a single page with
....@@ -403,18 +389,25 @@
403389 return NULL;
404390 }
405391
406
-bool dax_lock_mapping_entry(struct page *page)
392
+/*
393
+ * dax_lock_mapping_entry - Lock the DAX entry corresponding to a page
394
+ * @page: The page whose entry we want to lock
395
+ *
396
+ * Context: Process context.
397
+ * Return: A cookie to pass to dax_unlock_page() or 0 if the entry could
398
+ * not be locked.
399
+ */
400
+dax_entry_t dax_lock_page(struct page *page)
407401 {
408
- pgoff_t index;
409
- struct inode *inode;
410
- bool did_lock = false;
411
- void *entry = NULL, **slot;
412
- struct address_space *mapping;
402
+ XA_STATE(xas, NULL, 0);
403
+ void *entry;
413404
405
+ /* Ensure page->mapping isn't freed while we look at it */
414406 rcu_read_lock();
415407 for (;;) {
416
- mapping = READ_ONCE(page->mapping);
408
+ struct address_space *mapping = READ_ONCE(page->mapping);
417409
410
+ entry = NULL;
418411 if (!mapping || !dax_mapping(mapping))
419412 break;
420413
....@@ -425,101 +418,93 @@
425418 * otherwise we would not have a valid pfn_to_page()
426419 * translation.
427420 */
428
- inode = mapping->host;
429
- if (S_ISCHR(inode->i_mode)) {
430
- did_lock = true;
421
+ entry = (void *)~0UL;
422
+ if (S_ISCHR(mapping->host->i_mode))
431423 break;
432
- }
433424
434
- xa_lock_irq(&mapping->i_pages);
425
+ xas.xa = &mapping->i_pages;
426
+ xas_lock_irq(&xas);
435427 if (mapping != page->mapping) {
436
- xa_unlock_irq(&mapping->i_pages);
428
+ xas_unlock_irq(&xas);
437429 continue;
438430 }
439
- index = page->index;
440
-
441
- entry = __radix_tree_lookup(&mapping->i_pages, index,
442
- NULL, &slot);
443
- if (!entry) {
444
- xa_unlock_irq(&mapping->i_pages);
445
- break;
446
- } else if (slot_locked(mapping, slot)) {
431
+ xas_set(&xas, page->index);
432
+ entry = xas_load(&xas);
433
+ if (dax_is_locked(entry)) {
447434 rcu_read_unlock();
448
- wait_entry_unlocked(mapping, index, &slot, entry);
435
+ wait_entry_unlocked(&xas, entry);
449436 rcu_read_lock();
450437 continue;
451438 }
452
- lock_slot(mapping, slot);
453
- did_lock = true;
454
- xa_unlock_irq(&mapping->i_pages);
439
+ dax_lock_entry(&xas, entry);
440
+ xas_unlock_irq(&xas);
455441 break;
456442 }
457443 rcu_read_unlock();
458
-
459
- return did_lock;
444
+ return (dax_entry_t)entry;
460445 }
461446
462
-void dax_unlock_mapping_entry(struct page *page)
447
+void dax_unlock_page(struct page *page, dax_entry_t cookie)
463448 {
464449 struct address_space *mapping = page->mapping;
465
- struct inode *inode = mapping->host;
450
+ XA_STATE(xas, &mapping->i_pages, page->index);
466451
467
- if (S_ISCHR(inode->i_mode))
452
+ if (S_ISCHR(mapping->host->i_mode))
468453 return;
469454
470
- unlock_mapping_entry(mapping, page->index);
455
+ dax_unlock_entry(&xas, (void *)cookie);
471456 }
472457
473458 /*
474
- * Find radix tree entry at given index. If it points to an exceptional entry,
475
- * return it with the radix tree entry locked. If the radix tree doesn't
476
- * contain given index, create an empty exceptional entry for the index and
477
- * return with it locked.
459
+ * Find page cache entry at given index. If it is a DAX entry, return it
460
+ * with the entry locked. If the page cache doesn't contain an entry at
461
+ * that index, add a locked empty entry.
478462 *
479
- * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will
480
- * either return that locked entry or will return an error. This error will
481
- * happen if there are any 4k entries within the 2MiB range that we are
482
- * requesting.
463
+ * When requesting an entry with size DAX_PMD, grab_mapping_entry() will
464
+ * either return that locked entry or will return VM_FAULT_FALLBACK.
465
+ * This will happen if there are any PTE entries within the PMD range
466
+ * that we are requesting.
483467 *
484
- * We always favor 4k entries over 2MiB entries. There isn't a flow where we
485
- * evict 4k entries in order to 'upgrade' them to a 2MiB entry. A 2MiB
486
- * insertion will fail if it finds any 4k entries already in the tree, and a
487
- * 4k insertion will cause an existing 2MiB entry to be unmapped and
488
- * downgraded to 4k entries. This happens for both 2MiB huge zero pages as
489
- * well as 2MiB empty entries.
468
+ * We always favor PTE entries over PMD entries. There isn't a flow where we
469
+ * evict PTE entries in order to 'upgrade' them to a PMD entry. A PMD
470
+ * insertion will fail if it finds any PTE entries already in the tree, and a
471
+ * PTE insertion will cause an existing PMD entry to be unmapped and
472
+ * downgraded to PTE entries. This happens for both PMD zero pages as
473
+ * well as PMD empty entries.
490474 *
491
- * The exception to this downgrade path is for 2MiB DAX PMD entries that have
492
- * real storage backing them. We will leave these real 2MiB DAX entries in
493
- * the tree, and PTE writes will simply dirty the entire 2MiB DAX entry.
475
+ * The exception to this downgrade path is for PMD entries that have
476
+ * real storage backing them. We will leave these real PMD entries in
477
+ * the tree, and PTE writes will simply dirty the entire PMD entry.
494478 *
495479 * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
496480 * persistent memory the benefit is doubtful. We can add that later if we can
497481 * show it helps.
482
+ *
483
+ * On error, this function does not return an ERR_PTR. Instead it returns
484
+ * a VM_FAULT code, encoded as an xarray internal entry. The ERR_PTR values
485
+ * overlap with xarray value entries.
498486 */
499
-static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index,
500
- unsigned long size_flag)
487
+static void *grab_mapping_entry(struct xa_state *xas,
488
+ struct address_space *mapping, unsigned int order)
501489 {
502
- bool pmd_downgrade = false; /* splitting 2MiB entry into 4k entries? */
503
- void *entry, **slot;
490
+ unsigned long index = xas->xa_index;
491
+ bool pmd_downgrade; /* splitting PMD entry into PTE entries? */
492
+ void *entry;
504493
505
-restart:
506
- xa_lock_irq(&mapping->i_pages);
507
- entry = get_unlocked_mapping_entry(mapping, index, &slot);
508
-
509
- if (WARN_ON_ONCE(entry && !radix_tree_exceptional_entry(entry))) {
510
- entry = ERR_PTR(-EIO);
511
- goto out_unlock;
512
- }
494
+retry:
495
+ pmd_downgrade = false;
496
+ xas_lock_irq(xas);
497
+ entry = get_unlocked_entry(xas, order);
513498
514499 if (entry) {
515
- if (size_flag & RADIX_DAX_PMD) {
516
- if (dax_is_pte_entry(entry)) {
517
- put_unlocked_mapping_entry(mapping, index,
518
- entry);
519
- entry = ERR_PTR(-EEXIST);
520
- goto out_unlock;
521
- }
522
- } else { /* trying to grab a PTE entry */
500
+ if (dax_is_conflict(entry))
501
+ goto fallback;
502
+ if (!xa_is_value(entry)) {
503
+ xas_set_err(xas, -EIO);
504
+ goto out_unlock;
505
+ }
506
+
507
+ if (order == 0) {
523508 if (dax_is_pmd_entry(entry) &&
524509 (dax_is_zero_entry(entry) ||
525510 dax_is_empty_entry(entry))) {
....@@ -528,92 +513,69 @@
528513 }
529514 }
530515
531
- /* No entry for given index? Make sure radix tree is big enough. */
532
- if (!entry || pmd_downgrade) {
533
- int err;
516
+ if (pmd_downgrade) {
517
+ /*
518
+ * Make sure 'entry' remains valid while we drop
519
+ * the i_pages lock.
520
+ */
521
+ dax_lock_entry(xas, entry);
534522
535
- if (pmd_downgrade) {
536
- /*
537
- * Make sure 'entry' remains valid while we drop
538
- * the i_pages lock.
539
- */
540
- entry = lock_slot(mapping, slot);
541
- }
542
-
543
- xa_unlock_irq(&mapping->i_pages);
544523 /*
545524 * Besides huge zero pages the only other thing that gets
546525 * downgraded are empty entries which don't need to be
547526 * unmapped.
548527 */
549
- if (pmd_downgrade && dax_is_zero_entry(entry))
550
- unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
551
- PG_PMD_NR, false);
552
-
553
- err = radix_tree_preload(
554
- mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
555
- if (err) {
556
- if (pmd_downgrade)
557
- put_locked_mapping_entry(mapping, index);
558
- return ERR_PTR(err);
559
- }
560
- xa_lock_irq(&mapping->i_pages);
561
-
562
- if (!entry) {
563
- /*
564
- * We needed to drop the i_pages lock while calling
565
- * radix_tree_preload() and we didn't have an entry to
566
- * lock. See if another thread inserted an entry at
567
- * our index during this time.
568
- */
569
- entry = __radix_tree_lookup(&mapping->i_pages, index,
570
- NULL, &slot);
571
- if (entry) {
572
- radix_tree_preload_end();
573
- xa_unlock_irq(&mapping->i_pages);
574
- goto restart;
575
- }
528
+ if (dax_is_zero_entry(entry)) {
529
+ xas_unlock_irq(xas);
530
+ unmap_mapping_pages(mapping,
531
+ xas->xa_index & ~PG_PMD_COLOUR,
532
+ PG_PMD_NR, false);
533
+ xas_reset(xas);
534
+ xas_lock_irq(xas);
576535 }
577536
578
- if (pmd_downgrade) {
579
- dax_disassociate_entry(entry, mapping, false);
580
- radix_tree_delete(&mapping->i_pages, index);
581
- mapping->nrexceptional--;
582
- dax_wake_mapping_entry_waiter(mapping, index, entry,
583
- true);
584
- }
585
-
586
- entry = dax_radix_locked_entry(0, size_flag | RADIX_DAX_EMPTY);
587
-
588
- err = __radix_tree_insert(&mapping->i_pages, index,
589
- dax_radix_order(entry), entry);
590
- radix_tree_preload_end();
591
- if (err) {
592
- xa_unlock_irq(&mapping->i_pages);
593
- /*
594
- * Our insertion of a DAX entry failed, most likely
595
- * because we were inserting a PMD entry and it
596
- * collided with a PTE sized entry at a different
597
- * index in the PMD range. We haven't inserted
598
- * anything into the radix tree and have no waiters to
599
- * wake.
600
- */
601
- return ERR_PTR(err);
602
- }
603
- /* Good, we have inserted empty locked entry into the tree. */
604
- mapping->nrexceptional++;
605
- xa_unlock_irq(&mapping->i_pages);
606
- return entry;
537
+ dax_disassociate_entry(entry, mapping, false);
538
+ xas_store(xas, NULL); /* undo the PMD join */
539
+ dax_wake_entry(xas, entry, WAKE_ALL);
540
+ mapping->nrexceptional--;
541
+ entry = NULL;
542
+ xas_set(xas, index);
607543 }
608
- entry = lock_slot(mapping, slot);
609
- out_unlock:
610
- xa_unlock_irq(&mapping->i_pages);
544
+
545
+ if (entry) {
546
+ dax_lock_entry(xas, entry);
547
+ } else {
548
+ unsigned long flags = DAX_EMPTY;
549
+
550
+ if (order > 0)
551
+ flags |= DAX_PMD;
552
+ entry = dax_make_entry(pfn_to_pfn_t(0), flags);
553
+ dax_lock_entry(xas, entry);
554
+ if (xas_error(xas))
555
+ goto out_unlock;
556
+ mapping->nrexceptional++;
557
+ }
558
+
559
+out_unlock:
560
+ xas_unlock_irq(xas);
561
+ if (xas_nomem(xas, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM))
562
+ goto retry;
563
+ if (xas->xa_node == XA_ERROR(-ENOMEM))
564
+ return xa_mk_internal(VM_FAULT_OOM);
565
+ if (xas_error(xas))
566
+ return xa_mk_internal(VM_FAULT_SIGBUS);
611567 return entry;
568
+fallback:
569
+ xas_unlock_irq(xas);
570
+ return xa_mk_internal(VM_FAULT_FALLBACK);
612571 }
613572
614573 /**
615
- * dax_layout_busy_page - find first pinned page in @mapping
574
+ * dax_layout_busy_page_range - find first pinned page in @mapping
616575 * @mapping: address space to scan for a page with ref count > 1
576
+ * @start: Starting offset. Page containing 'start' is included.
577
+ * @end: End offset. Page containing 'end' is included. If 'end' is LLONG_MAX,
578
+ * pages from 'start' till the end of file are included.
617579 *
618580 * DAX requires ZONE_DEVICE mapped pages. These pages are never
619581 * 'onlined' to the page allocator so they are considered idle when
....@@ -626,13 +588,15 @@
626588 * to be able to run unmap_mapping_range() and subsequently not race
627589 * mapping_mapped() becoming true.
628590 */
629
-struct page *dax_layout_busy_page(struct address_space *mapping)
591
+struct page *dax_layout_busy_page_range(struct address_space *mapping,
592
+ loff_t start, loff_t end)
630593 {
631
- pgoff_t indices[PAGEVEC_SIZE];
594
+ void *entry;
595
+ unsigned int scanned = 0;
632596 struct page *page = NULL;
633
- struct pagevec pvec;
634
- pgoff_t index, end;
635
- unsigned i;
597
+ pgoff_t start_idx = start >> PAGE_SHIFT;
598
+ pgoff_t end_idx;
599
+ XA_STATE(xas, &mapping->i_pages, start_idx);
636600
637601 /*
638602 * In the 'limited' case get_user_pages() for dax is disabled.
....@@ -643,112 +607,93 @@
643607 if (!dax_mapping(mapping) || !mapping_mapped(mapping))
644608 return NULL;
645609
646
- pagevec_init(&pvec);
647
- index = 0;
648
- end = -1;
649
-
610
+ /* If end == LLONG_MAX, all pages from start to till end of file */
611
+ if (end == LLONG_MAX)
612
+ end_idx = ULONG_MAX;
613
+ else
614
+ end_idx = end >> PAGE_SHIFT;
650615 /*
651616 * If we race get_user_pages_fast() here either we'll see the
652
- * elevated page count in the pagevec_lookup and wait, or
617
+ * elevated page count in the iteration and wait, or
653618 * get_user_pages_fast() will see that the page it took a reference
654619 * against is no longer mapped in the page tables and bail to the
655620 * get_user_pages() slow path. The slow path is protected by
656621 * pte_lock() and pmd_lock(). New references are not taken without
657
- * holding those locks, and unmap_mapping_range() will not zero the
622
+ * holding those locks, and unmap_mapping_pages() will not zero the
658623 * pte or pmd without holding the respective lock, so we are
659624 * guaranteed to either see new references or prevent new
660625 * references from being established.
661626 */
662
- unmap_mapping_range(mapping, 0, 0, 0);
627
+ unmap_mapping_pages(mapping, start_idx, end_idx - start_idx + 1, 0);
663628
664
- while (index < end && pagevec_lookup_entries(&pvec, mapping, index,
665
- min(end - index, (pgoff_t)PAGEVEC_SIZE),
666
- indices)) {
667
- pgoff_t nr_pages = 1;
668
-
669
- for (i = 0; i < pagevec_count(&pvec); i++) {
670
- struct page *pvec_ent = pvec.pages[i];
671
- void *entry;
672
-
673
- index = indices[i];
674
- if (index >= end)
675
- break;
676
-
677
- if (WARN_ON_ONCE(
678
- !radix_tree_exceptional_entry(pvec_ent)))
679
- continue;
680
-
681
- xa_lock_irq(&mapping->i_pages);
682
- entry = get_unlocked_mapping_entry(mapping, index, NULL);
683
- if (entry) {
684
- page = dax_busy_page(entry);
685
- /*
686
- * Account for multi-order entries at
687
- * the end of the pagevec.
688
- */
689
- if (i + 1 >= pagevec_count(&pvec))
690
- nr_pages = 1UL << dax_radix_order(entry);
691
- }
692
- put_unlocked_mapping_entry(mapping, index, entry);
693
- xa_unlock_irq(&mapping->i_pages);
694
- if (page)
695
- break;
696
- }
697
-
698
- /*
699
- * We don't expect normal struct page entries to exist in our
700
- * tree, but we keep these pagevec calls so that this code is
701
- * consistent with the common pattern for handling pagevecs
702
- * throughout the kernel.
703
- */
704
- pagevec_remove_exceptionals(&pvec);
705
- pagevec_release(&pvec);
706
- index += nr_pages;
707
-
629
+ xas_lock_irq(&xas);
630
+ xas_for_each(&xas, entry, end_idx) {
631
+ if (WARN_ON_ONCE(!xa_is_value(entry)))
632
+ continue;
633
+ if (unlikely(dax_is_locked(entry)))
634
+ entry = get_unlocked_entry(&xas, 0);
635
+ if (entry)
636
+ page = dax_busy_page(entry);
637
+ put_unlocked_entry(&xas, entry, WAKE_NEXT);
708638 if (page)
709639 break;
640
+ if (++scanned % XA_CHECK_SCHED)
641
+ continue;
642
+
643
+ xas_pause(&xas);
644
+ xas_unlock_irq(&xas);
645
+ cond_resched();
646
+ xas_lock_irq(&xas);
710647 }
648
+ xas_unlock_irq(&xas);
711649 return page;
650
+}
651
+EXPORT_SYMBOL_GPL(dax_layout_busy_page_range);
652
+
653
+struct page *dax_layout_busy_page(struct address_space *mapping)
654
+{
655
+ return dax_layout_busy_page_range(mapping, 0, LLONG_MAX);
712656 }
713657 EXPORT_SYMBOL_GPL(dax_layout_busy_page);
714658
715
-static int __dax_invalidate_mapping_entry(struct address_space *mapping,
659
+static int __dax_invalidate_entry(struct address_space *mapping,
716660 pgoff_t index, bool trunc)
717661 {
662
+ XA_STATE(xas, &mapping->i_pages, index);
718663 int ret = 0;
719664 void *entry;
720
- struct radix_tree_root *pages = &mapping->i_pages;
721665
722
- xa_lock_irq(pages);
723
- entry = get_unlocked_mapping_entry(mapping, index, NULL);
724
- if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)))
666
+ xas_lock_irq(&xas);
667
+ entry = get_unlocked_entry(&xas, 0);
668
+ if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
725669 goto out;
726670 if (!trunc &&
727
- (radix_tree_tag_get(pages, index, PAGECACHE_TAG_DIRTY) ||
728
- radix_tree_tag_get(pages, index, PAGECACHE_TAG_TOWRITE)))
671
+ (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) ||
672
+ xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE)))
729673 goto out;
730674 dax_disassociate_entry(entry, mapping, trunc);
731
- radix_tree_delete(pages, index);
675
+ xas_store(&xas, NULL);
732676 mapping->nrexceptional--;
733677 ret = 1;
734678 out:
735
- put_unlocked_mapping_entry(mapping, index, entry);
736
- xa_unlock_irq(pages);
679
+ put_unlocked_entry(&xas, entry, WAKE_ALL);
680
+ xas_unlock_irq(&xas);
737681 return ret;
738682 }
683
+
739684 /*
740
- * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
741
- * entry to get unlocked before deleting it.
685
+ * Delete DAX entry at @index from @mapping. Wait for it
686
+ * to be unlocked before deleting it.
742687 */
743688 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
744689 {
745
- int ret = __dax_invalidate_mapping_entry(mapping, index, true);
690
+ int ret = __dax_invalidate_entry(mapping, index, true);
746691
747692 /*
748693 * This gets called from truncate / punch_hole path. As such, the caller
749694 * must hold locks protecting against concurrent modifications of the
750
- * radix tree (usually fs-private i_mmap_sem for writing). Since the
751
- * caller has seen exceptional entry for this index, we better find it
695
+ * page cache (usually fs-private i_mmap_sem for writing). Since the
696
+ * caller has seen a DAX entry for this index, we better find it
752697 * at that index as well...
753698 */
754699 WARN_ON_ONCE(!ret);
....@@ -756,34 +701,38 @@
756701 }
757702
758703 /*
759
- * Invalidate exceptional DAX entry if it is clean.
704
+ * Invalidate DAX entry if it is clean.
760705 */
761706 int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
762707 pgoff_t index)
763708 {
764
- return __dax_invalidate_mapping_entry(mapping, index, false);
709
+ return __dax_invalidate_entry(mapping, index, false);
765710 }
766711
767
-static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
768
- sector_t sector, size_t size, struct page *to,
769
- unsigned long vaddr)
712
+static int copy_cow_page_dax(struct block_device *bdev, struct dax_device *dax_dev,
713
+ sector_t sector, struct page *to, unsigned long vaddr)
770714 {
771715 void *vto, *kaddr;
772716 pgoff_t pgoff;
773717 long rc;
774718 int id;
775719
776
- rc = bdev_dax_pgoff(bdev, sector, size, &pgoff);
720
+ rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff);
777721 if (rc)
778722 return rc;
779723
780724 id = dax_read_lock();
781
- rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, NULL);
725
+ rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(PAGE_SIZE), &kaddr, NULL);
782726 if (rc < 0) {
783727 dax_read_unlock(id);
784728 return rc;
785729 }
786730 vto = kmap_atomic(to);
731
+#ifdef CONFIG_ARM
732
+#ifndef copy_user_page
733
+#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)
734
+#endif
735
+#endif
787736 copy_user_page(vto, (void __force *)kaddr, vaddr, to);
788737 kunmap_atomic(vto);
789738 dax_read_unlock(id);
....@@ -797,64 +746,57 @@
797746 * already in the tree, we will skip the insertion and just dirty the PMD as
798747 * appropriate.
799748 */
800
-static void *dax_insert_mapping_entry(struct address_space *mapping,
801
- struct vm_fault *vmf,
802
- void *entry, pfn_t pfn_t,
803
- unsigned long flags, bool dirty)
749
+static void *dax_insert_entry(struct xa_state *xas,
750
+ struct address_space *mapping, struct vm_fault *vmf,
751
+ void *entry, pfn_t pfn, unsigned long flags, bool dirty)
804752 {
805
- struct radix_tree_root *pages = &mapping->i_pages;
806
- unsigned long pfn = pfn_t_to_pfn(pfn_t);
807
- pgoff_t index = vmf->pgoff;
808
- void *new_entry;
753
+ void *new_entry = dax_make_entry(pfn, flags);
809754
810755 if (dirty)
811756 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
812757
813
- if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) {
758
+ if (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE)) {
759
+ unsigned long index = xas->xa_index;
814760 /* we are replacing a zero page with block mapping */
815761 if (dax_is_pmd_entry(entry))
816762 unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
817
- PG_PMD_NR, false);
763
+ PG_PMD_NR, false);
818764 else /* pte entry */
819
- unmap_mapping_pages(mapping, vmf->pgoff, 1, false);
765
+ unmap_mapping_pages(mapping, index, 1, false);
820766 }
821767
822
- xa_lock_irq(pages);
823
- new_entry = dax_radix_locked_entry(pfn, flags);
824
- if (dax_entry_size(entry) != dax_entry_size(new_entry)) {
768
+ xas_reset(xas);
769
+ xas_lock_irq(xas);
770
+ if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
771
+ void *old;
772
+
825773 dax_disassociate_entry(entry, mapping, false);
826774 dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address);
827
- }
828
-
829
- if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
830775 /*
831
- * Only swap our new entry into the radix tree if the current
776
+ * Only swap our new entry into the page cache if the current
832777 * entry is a zero page or an empty entry. If a normal PTE or
833
- * PMD entry is already in the tree, we leave it alone. This
778
+ * PMD entry is already in the cache, we leave it alone. This
834779 * means that if we are trying to insert a PTE and the
835780 * existing entry is a PMD, we will just leave the PMD in the
836781 * tree and dirty it if necessary.
837782 */
838
- struct radix_tree_node *node;
839
- void **slot;
840
- void *ret;
841
-
842
- ret = __radix_tree_lookup(pages, index, &node, &slot);
843
- WARN_ON_ONCE(ret != entry);
844
- __radix_tree_replace(pages, node, slot,
845
- new_entry, NULL);
783
+ old = dax_lock_entry(xas, new_entry);
784
+ WARN_ON_ONCE(old != xa_mk_value(xa_to_value(entry) |
785
+ DAX_LOCKED));
846786 entry = new_entry;
787
+ } else {
788
+ xas_load(xas); /* Walk the xa_state */
847789 }
848790
849791 if (dirty)
850
- radix_tree_tag_set(pages, index, PAGECACHE_TAG_DIRTY);
792
+ xas_set_mark(xas, PAGECACHE_TAG_DIRTY);
851793
852
- xa_unlock_irq(pages);
794
+ xas_unlock_irq(xas);
853795 return entry;
854796 }
855797
856
-static inline unsigned long
857
-pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
798
+static inline
799
+unsigned long pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
858800 {
859801 unsigned long address;
860802
....@@ -864,8 +806,8 @@
864806 }
865807
866808 /* Walk all mappings of a given index of a file and writeprotect them */
867
-static void dax_mapping_entry_mkclean(struct address_space *mapping,
868
- pgoff_t index, unsigned long pfn)
809
+static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index,
810
+ unsigned long pfn)
869811 {
870812 struct vm_area_struct *vma;
871813 pte_t pte, *ptep = NULL;
....@@ -874,7 +816,8 @@
874816
875817 i_mmap_lock_read(mapping);
876818 vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) {
877
- unsigned long address, start, end;
819
+ struct mmu_notifier_range range;
820
+ unsigned long address;
878821
879822 cond_resched();
880823
....@@ -884,11 +827,12 @@
884827 address = pgoff_address(index, vma);
885828
886829 /*
887
- * Note because we provide start/end to follow_pte_pmd it will
888
- * call mmu_notifier_invalidate_range_start() on our behalf
889
- * before taking any lock.
830
+ * follow_invalidate_pte() will use the range to call
831
+ * mmu_notifier_invalidate_range_start() on our behalf before
832
+ * taking any lock.
890833 */
891
- if (follow_pte_pmd(vma->vm_mm, address, &start, &end, &ptep, &pmdp, &ptl))
834
+ if (follow_invalidate_pte(vma->vm_mm, address, &range, &ptep,
835
+ &pmdp, &ptl))
892836 continue;
893837
894838 /*
....@@ -907,7 +851,8 @@
907851 if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp))
908852 goto unlock_pmd;
909853
910
- flush_cache_page(vma, address, pfn);
854
+ flush_cache_range(vma, address,
855
+ address + HPAGE_PMD_SIZE);
911856 pmd = pmdp_invalidate(vma, address, pmdp);
912857 pmd = pmd_wrprotect(pmd);
913858 pmd = pmd_mkclean(pmd);
....@@ -930,50 +875,53 @@
930875 pte_unmap_unlock(ptep, ptl);
931876 }
932877
933
- mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);
878
+ mmu_notifier_invalidate_range_end(&range);
934879 }
935880 i_mmap_unlock_read(mapping);
936881 }
937882
938
-static int dax_writeback_one(struct dax_device *dax_dev,
939
- struct address_space *mapping, pgoff_t index, void *entry)
883
+static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
884
+ struct address_space *mapping, void *entry)
940885 {
941
- struct radix_tree_root *pages = &mapping->i_pages;
942
- void *entry2, **slot;
943
- unsigned long pfn;
886
+ unsigned long pfn, index, count;
944887 long ret = 0;
945
- size_t size;
946888
947889 /*
948890 * A page got tagged dirty in DAX mapping? Something is seriously
949891 * wrong.
950892 */
951
- if (WARN_ON(!radix_tree_exceptional_entry(entry)))
893
+ if (WARN_ON(!xa_is_value(entry)))
952894 return -EIO;
953895
954
- xa_lock_irq(pages);
955
- entry2 = get_unlocked_mapping_entry(mapping, index, &slot);
956
- /* Entry got punched out / reallocated? */
957
- if (!entry2 || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry2)))
958
- goto put_unlocked;
959
- /*
960
- * Entry got reallocated elsewhere? No need to writeback. We have to
961
- * compare pfns as we must not bail out due to difference in lockbit
962
- * or entry type.
963
- */
964
- if (dax_radix_pfn(entry2) != dax_radix_pfn(entry))
965
- goto put_unlocked;
966
- if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
967
- dax_is_zero_entry(entry))) {
968
- ret = -EIO;
969
- goto put_unlocked;
896
+ if (unlikely(dax_is_locked(entry))) {
897
+ void *old_entry = entry;
898
+
899
+ entry = get_unlocked_entry(xas, 0);
900
+
901
+ /* Entry got punched out / reallocated? */
902
+ if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
903
+ goto put_unlocked;
904
+ /*
905
+ * Entry got reallocated elsewhere? No need to writeback.
906
+ * We have to compare pfns as we must not bail out due to
907
+ * difference in lockbit or entry type.
908
+ */
909
+ if (dax_to_pfn(old_entry) != dax_to_pfn(entry))
910
+ goto put_unlocked;
911
+ if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
912
+ dax_is_zero_entry(entry))) {
913
+ ret = -EIO;
914
+ goto put_unlocked;
915
+ }
916
+
917
+ /* Another fsync thread may have already done this entry */
918
+ if (!xas_get_mark(xas, PAGECACHE_TAG_TOWRITE))
919
+ goto put_unlocked;
970920 }
971921
972
- /* Another fsync thread may have already written back this entry */
973
- if (!radix_tree_tag_get(pages, index, PAGECACHE_TAG_TOWRITE))
974
- goto put_unlocked;
975922 /* Lock the entry to serialize with page faults */
976
- entry = lock_slot(mapping, slot);
923
+ dax_lock_entry(xas, entry);
924
+
977925 /*
978926 * We can clear the tag now but we have to be careful so that concurrent
979927 * dax_writeback_one() calls for the same index cannot finish before we
....@@ -981,37 +929,39 @@
981929 * at the entry only under the i_pages lock and once they do that
982930 * they will see the entry locked and wait for it to unlock.
983931 */
984
- radix_tree_tag_clear(pages, index, PAGECACHE_TAG_TOWRITE);
985
- xa_unlock_irq(pages);
932
+ xas_clear_mark(xas, PAGECACHE_TAG_TOWRITE);
933
+ xas_unlock_irq(xas);
986934
987935 /*
988
- * Even if dax_writeback_mapping_range() was given a wbc->range_start
989
- * in the middle of a PMD, the 'index' we are given will be aligned to
990
- * the start index of the PMD, as will the pfn we pull from 'entry'.
936
+ * If dax_writeback_mapping_range() was given a wbc->range_start
937
+ * in the middle of a PMD, the 'index' we use needs to be
938
+ * aligned to the start of the PMD.
991939 * This allows us to flush for PMD_SIZE and not have to worry about
992940 * partial PMD writebacks.
993941 */
994
- pfn = dax_radix_pfn(entry);
995
- size = PAGE_SIZE << dax_radix_order(entry);
942
+ pfn = dax_to_pfn(entry);
943
+ count = 1UL << dax_entry_order(entry);
944
+ index = xas->xa_index & ~(count - 1);
996945
997
- dax_mapping_entry_mkclean(mapping, index, pfn);
998
- dax_flush(dax_dev, page_address(pfn_to_page(pfn)), size);
946
+ dax_entry_mkclean(mapping, index, pfn);
947
+ dax_flush(dax_dev, page_address(pfn_to_page(pfn)), count * PAGE_SIZE);
999948 /*
1000949 * After we have flushed the cache, we can clear the dirty tag. There
1001950 * cannot be new dirty data in the pfn after the flush has completed as
1002951 * the pfn mappings are writeprotected and fault waits for mapping
1003952 * entry lock.
1004953 */
1005
- xa_lock_irq(pages);
1006
- radix_tree_tag_clear(pages, index, PAGECACHE_TAG_DIRTY);
1007
- xa_unlock_irq(pages);
1008
- trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT);
1009
- put_locked_mapping_entry(mapping, index);
954
+ xas_reset(xas);
955
+ xas_lock_irq(xas);
956
+ xas_store(xas, entry);
957
+ xas_clear_mark(xas, PAGECACHE_TAG_DIRTY);
958
+ dax_wake_entry(xas, entry, WAKE_NEXT);
959
+
960
+ trace_dax_writeback_one(mapping->host, index, count);
1010961 return ret;
1011962
1012963 put_unlocked:
1013
- put_unlocked_mapping_entry(mapping, index, entry2);
1014
- xa_unlock_irq(pages);
964
+ put_unlocked_entry(xas, entry, WAKE_NEXT);
1015965 return ret;
1016966 }
1017967
....@@ -1021,15 +971,14 @@
1021971 * on persistent storage prior to completion of the operation.
1022972 */
1023973 int dax_writeback_mapping_range(struct address_space *mapping,
1024
- struct block_device *bdev, struct writeback_control *wbc)
974
+ struct dax_device *dax_dev, struct writeback_control *wbc)
1025975 {
976
+ XA_STATE(xas, &mapping->i_pages, wbc->range_start >> PAGE_SHIFT);
1026977 struct inode *inode = mapping->host;
1027
- pgoff_t start_index, end_index;
1028
- pgoff_t indices[PAGEVEC_SIZE];
1029
- struct dax_device *dax_dev;
1030
- struct pagevec pvec;
1031
- bool done = false;
1032
- int i, ret = 0;
978
+ pgoff_t end_index = wbc->range_end >> PAGE_SHIFT;
979
+ void *entry;
980
+ int ret = 0;
981
+ unsigned int scanned = 0;
1033982
1034983 if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
1035984 return -EIO;
....@@ -1037,45 +986,28 @@
1037986 if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
1038987 return 0;
1039988
1040
- dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
1041
- if (!dax_dev)
1042
- return -EIO;
989
+ trace_dax_writeback_range(inode, xas.xa_index, end_index);
1043990
1044
- start_index = wbc->range_start >> PAGE_SHIFT;
1045
- end_index = wbc->range_end >> PAGE_SHIFT;
991
+ tag_pages_for_writeback(mapping, xas.xa_index, end_index);
1046992
1047
- trace_dax_writeback_range(inode, start_index, end_index);
1048
-
1049
- tag_pages_for_writeback(mapping, start_index, end_index);
1050
-
1051
- pagevec_init(&pvec);
1052
- while (!done) {
1053
- pvec.nr = find_get_entries_tag(mapping, start_index,
1054
- PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
1055
- pvec.pages, indices);
1056
-
1057
- if (pvec.nr == 0)
993
+ xas_lock_irq(&xas);
994
+ xas_for_each_marked(&xas, entry, end_index, PAGECACHE_TAG_TOWRITE) {
995
+ ret = dax_writeback_one(&xas, dax_dev, mapping, entry);
996
+ if (ret < 0) {
997
+ mapping_set_error(mapping, ret);
1058998 break;
1059
-
1060
- for (i = 0; i < pvec.nr; i++) {
1061
- if (indices[i] > end_index) {
1062
- done = true;
1063
- break;
1064
- }
1065
-
1066
- ret = dax_writeback_one(dax_dev, mapping, indices[i],
1067
- pvec.pages[i]);
1068
- if (ret < 0) {
1069
- mapping_set_error(mapping, ret);
1070
- goto out;
1071
- }
1072999 }
1073
- start_index = indices[pvec.nr - 1] + 1;
1000
+ if (++scanned % XA_CHECK_SCHED)
1001
+ continue;
1002
+
1003
+ xas_pause(&xas);
1004
+ xas_unlock_irq(&xas);
1005
+ cond_resched();
1006
+ xas_lock_irq(&xas);
10741007 }
1075
-out:
1076
- put_dax(dax_dev);
1077
- trace_dax_writeback_range_done(inode, start_index, end_index);
1078
- return (ret < 0 ? ret : 0);
1008
+ xas_unlock_irq(&xas);
1009
+ trace_dax_writeback_range_done(inode, xas.xa_index, end_index);
1010
+ return ret;
10791011 }
10801012 EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
10811013
....@@ -1123,69 +1055,63 @@
11231055 * If this page is ever written to we will re-fault and change the mapping to
11241056 * point to real DAX storage instead.
11251057 */
1126
-static vm_fault_t dax_load_hole(struct address_space *mapping, void *entry,
1127
- struct vm_fault *vmf)
1058
+static vm_fault_t dax_load_hole(struct xa_state *xas,
1059
+ struct address_space *mapping, void **entry,
1060
+ struct vm_fault *vmf)
11281061 {
11291062 struct inode *inode = mapping->host;
11301063 unsigned long vaddr = vmf->address;
11311064 pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
11321065 vm_fault_t ret;
11331066
1134
- dax_insert_mapping_entry(mapping, vmf, entry, pfn, RADIX_DAX_ZERO_PAGE,
1135
- false);
1067
+ *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
1068
+ DAX_ZERO_PAGE, false);
1069
+
11361070 ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
11371071 trace_dax_load_hole(inode, vmf, ret);
11381072 return ret;
11391073 }
11401074
1141
-static bool dax_range_is_aligned(struct block_device *bdev,
1142
- unsigned int offset, unsigned int length)
1075
+s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap)
11431076 {
1144
- unsigned short sector_size = bdev_logical_block_size(bdev);
1077
+ sector_t sector = iomap_sector(iomap, pos & PAGE_MASK);
1078
+ pgoff_t pgoff;
1079
+ long rc, id;
1080
+ void *kaddr;
1081
+ bool page_aligned = false;
1082
+ unsigned offset = offset_in_page(pos);
1083
+ unsigned size = min_t(u64, PAGE_SIZE - offset, length);
11451084
1146
- if (!IS_ALIGNED(offset, sector_size))
1147
- return false;
1148
- if (!IS_ALIGNED(length, sector_size))
1149
- return false;
1085
+ if (IS_ALIGNED(sector << SECTOR_SHIFT, PAGE_SIZE) &&
1086
+ (size == PAGE_SIZE))
1087
+ page_aligned = true;
11501088
1151
- return true;
1152
-}
1089
+ rc = bdev_dax_pgoff(iomap->bdev, sector, PAGE_SIZE, &pgoff);
1090
+ if (rc)
1091
+ return rc;
11531092
1154
-int __dax_zero_page_range(struct block_device *bdev,
1155
- struct dax_device *dax_dev, sector_t sector,
1156
- unsigned int offset, unsigned int size)
1157
-{
1158
- if (dax_range_is_aligned(bdev, offset, size)) {
1159
- sector_t start_sector = sector + (offset >> 9);
1093
+ id = dax_read_lock();
11601094
1161
- return blkdev_issue_zeroout(bdev, start_sector,
1162
- size >> 9, GFP_NOFS, 0);
1163
- } else {
1164
- pgoff_t pgoff;
1165
- long rc, id;
1166
- void *kaddr;
1167
-
1168
- rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff);
1169
- if (rc)
1170
- return rc;
1171
-
1172
- id = dax_read_lock();
1173
- rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL);
1174
- if (rc < 0) {
1175
- dax_read_unlock(id);
1176
- return rc;
1177
- }
1178
- memset(kaddr + offset, 0, size);
1179
- dax_flush(dax_dev, kaddr + offset, size);
1095
+ if (page_aligned)
1096
+ rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
1097
+ else
1098
+ rc = dax_direct_access(iomap->dax_dev, pgoff, 1, &kaddr, NULL);
1099
+ if (rc < 0) {
11801100 dax_read_unlock(id);
1101
+ return rc;
11811102 }
1182
- return 0;
1103
+
1104
+ if (!page_aligned) {
1105
+ memset(kaddr + offset, 0, size);
1106
+ dax_flush(iomap->dax_dev, kaddr + offset, size);
1107
+ }
1108
+ dax_read_unlock(id);
1109
+ return size;
11831110 }
1184
-EXPORT_SYMBOL_GPL(__dax_zero_page_range);
11851111
11861112 static loff_t
11871113 dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
1188
- struct iomap *iomap)
1114
+ struct iomap *iomap, struct iomap *srcmap)
11891115 {
11901116 struct block_device *bdev = iomap->bdev;
11911117 struct dax_device *dax_dev = iomap->dax_dev;
....@@ -1295,7 +1221,7 @@
12951221 unsigned flags = 0;
12961222
12971223 if (iov_iter_rw(iter) == WRITE) {
1298
- lockdep_assert_held_exclusive(&inode->i_rwsem);
1224
+ lockdep_assert_held_write(&inode->i_rwsem);
12991225 flags |= IOMAP_WRITE;
13001226 } else {
13011227 lockdep_assert_held(&inode->i_rwsem);
....@@ -1322,9 +1248,7 @@
13221248 {
13231249 if (error == 0)
13241250 return VM_FAULT_NOPAGE;
1325
- if (error == -ENOMEM)
1326
- return VM_FAULT_OOM;
1327
- return VM_FAULT_SIGBUS;
1251
+ return vmf_error(error);
13281252 }
13291253
13301254 /*
....@@ -1343,10 +1267,12 @@
13431267 {
13441268 struct vm_area_struct *vma = vmf->vma;
13451269 struct address_space *mapping = vma->vm_file->f_mapping;
1270
+ XA_STATE(xas, &mapping->i_pages, vmf->pgoff);
13461271 struct inode *inode = mapping->host;
13471272 unsigned long vaddr = vmf->address;
13481273 loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
1349
- struct iomap iomap = { 0 };
1274
+ struct iomap iomap = { .type = IOMAP_HOLE };
1275
+ struct iomap srcmap = { .type = IOMAP_HOLE };
13501276 unsigned flags = IOMAP_FAULT;
13511277 int error, major = 0;
13521278 bool write = vmf->flags & FAULT_FLAG_WRITE;
....@@ -1369,9 +1295,9 @@
13691295 if (write && !vmf->cow_page)
13701296 flags |= IOMAP_WRITE;
13711297
1372
- entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
1373
- if (IS_ERR(entry)) {
1374
- ret = dax_fault_return(PTR_ERR(entry));
1298
+ entry = grab_mapping_entry(&xas, mapping, 0);
1299
+ if (xa_is_internal(entry)) {
1300
+ ret = xa_to_internal(entry);
13751301 goto out;
13761302 }
13771303
....@@ -1391,7 +1317,7 @@
13911317 * the file system block size to be equal the page size, which means
13921318 * that we never have to deal with more than a single extent here.
13931319 */
1394
- error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
1320
+ error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap, &srcmap);
13951321 if (iomap_errp)
13961322 *iomap_errp = error;
13971323 if (error) {
....@@ -1412,8 +1338,8 @@
14121338 clear_user_highpage(vmf->cow_page, vaddr);
14131339 break;
14141340 case IOMAP_MAPPED:
1415
- error = copy_user_dax(iomap.bdev, iomap.dax_dev,
1416
- sector, PAGE_SIZE, vmf->cow_page, vaddr);
1341
+ error = copy_cow_page_dax(iomap.bdev, iomap.dax_dev,
1342
+ sector, vmf->cow_page, vaddr);
14171343 break;
14181344 default:
14191345 WARN_ON_ONCE(1);
....@@ -1444,7 +1370,7 @@
14441370 if (error < 0)
14451371 goto error_finish_iomap;
14461372
1447
- entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
1373
+ entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn,
14481374 0, write && !sync);
14491375
14501376 /*
....@@ -1472,10 +1398,10 @@
14721398 case IOMAP_UNWRITTEN:
14731399 case IOMAP_HOLE:
14741400 if (!write) {
1475
- ret = dax_load_hole(mapping, entry, vmf);
1401
+ ret = dax_load_hole(&xas, mapping, &entry, vmf);
14761402 goto finish_iomap;
14771403 }
1478
- /*FALLTHRU*/
1404
+ fallthrough;
14791405 default:
14801406 WARN_ON_ONCE(1);
14811407 error = -EIO;
....@@ -1499,21 +1425,22 @@
14991425 ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
15001426 }
15011427 unlock_entry:
1502
- put_locked_mapping_entry(mapping, vmf->pgoff);
1428
+ dax_unlock_entry(&xas, entry);
15031429 out:
15041430 trace_dax_pte_fault_done(inode, vmf, ret);
15051431 return ret | major;
15061432 }
15071433
15081434 #ifdef CONFIG_FS_DAX_PMD
1509
-static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
1510
- void *entry)
1435
+static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
1436
+ struct iomap *iomap, void **entry)
15111437 {
15121438 struct address_space *mapping = vmf->vma->vm_file->f_mapping;
15131439 unsigned long pmd_addr = vmf->address & PMD_MASK;
1440
+ struct vm_area_struct *vma = vmf->vma;
15141441 struct inode *inode = mapping->host;
1442
+ pgtable_t pgtable = NULL;
15151443 struct page *zero_page;
1516
- void *ret = NULL;
15171444 spinlock_t *ptl;
15181445 pmd_t pmd_entry;
15191446 pfn_t pfn;
....@@ -1524,8 +1451,14 @@
15241451 goto fallback;
15251452
15261453 pfn = page_to_pfn_t(zero_page);
1527
- ret = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
1528
- RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false);
1454
+ *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
1455
+ DAX_PMD | DAX_ZERO_PAGE, false);
1456
+
1457
+ if (arch_needs_pgtable_deposit()) {
1458
+ pgtable = pte_alloc_one(vma->vm_mm);
1459
+ if (!pgtable)
1460
+ return VM_FAULT_OOM;
1461
+ }
15291462
15301463 ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
15311464 if (!pmd_none(*(vmf->pmd))) {
....@@ -1533,15 +1466,21 @@
15331466 goto fallback;
15341467 }
15351468
1469
+ if (pgtable) {
1470
+ pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
1471
+ mm_inc_nr_ptes(vma->vm_mm);
1472
+ }
15361473 pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot);
15371474 pmd_entry = pmd_mkhuge(pmd_entry);
15381475 set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
15391476 spin_unlock(ptl);
1540
- trace_dax_pmd_load_hole(inode, vmf, zero_page, ret);
1477
+ trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry);
15411478 return VM_FAULT_NOPAGE;
15421479
15431480 fallback:
1544
- trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, ret);
1481
+ if (pgtable)
1482
+ pte_free(vma->vm_mm, pgtable);
1483
+ trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry);
15451484 return VM_FAULT_FALLBACK;
15461485 }
15471486
....@@ -1550,14 +1489,16 @@
15501489 {
15511490 struct vm_area_struct *vma = vmf->vma;
15521491 struct address_space *mapping = vma->vm_file->f_mapping;
1492
+ XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER);
15531493 unsigned long pmd_addr = vmf->address & PMD_MASK;
15541494 bool write = vmf->flags & FAULT_FLAG_WRITE;
15551495 bool sync;
15561496 unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
15571497 struct inode *inode = mapping->host;
15581498 vm_fault_t result = VM_FAULT_FALLBACK;
1559
- struct iomap iomap = { 0 };
1560
- pgoff_t max_pgoff, pgoff;
1499
+ struct iomap iomap = { .type = IOMAP_HOLE };
1500
+ struct iomap srcmap = { .type = IOMAP_HOLE };
1501
+ pgoff_t max_pgoff;
15611502 void *entry;
15621503 loff_t pos;
15631504 int error;
....@@ -1568,7 +1509,6 @@
15681509 * supposed to hold locks serializing us with truncate / punch hole so
15691510 * this is a reliable test.
15701511 */
1571
- pgoff = linear_page_index(vma, pmd_addr);
15721512 max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
15731513
15741514 trace_dax_pmd_fault(inode, vmf, max_pgoff, 0);
....@@ -1577,7 +1517,7 @@
15771517 * Make sure that the faulting address's PMD offset (color) matches
15781518 * the PMD offset from the start of the file. This is necessary so
15791519 * that a PMD range in the page table overlaps exactly with a PMD
1580
- * range in the radix tree.
1520
+ * range in the page cache.
15811521 */
15821522 if ((vmf->pgoff & PG_PMD_COLOUR) !=
15831523 ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR))
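The colour check above only permits a PMD mapping when the faulting address and the file offset are displaced from a 2MiB boundary by the same number of 4KiB pages. A standalone illustration, assuming x86-64-style PAGE_SHIFT=12 and PMD_SHIFT=21 (both values are assumptions for the demo, not taken from the patch):

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SHIFT	12			/* assumed 4KiB pages */
#define PMD_SHIFT	21			/* assumed 2MiB PMDs */
#define PG_PMD_COLOUR	((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1)

static bool pmd_colour_matches(unsigned long pgoff, unsigned long address)
{
	return (pgoff & PG_PMD_COLOUR) ==
	       ((address >> PAGE_SHIFT) & PG_PMD_COLOUR);
}

int main(void)
{
	/* File offset 0 mapped at a 2MiB-aligned address: PMD is possible. */
	printf("%d\n", pmd_colour_matches(0, 0x200000));
	/* File offset 4KiB at the same address: colours differ, fall back. */
	printf("%d\n", pmd_colour_matches(1, 0x200000));
	return 0;
}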
....@@ -1593,24 +1533,26 @@
15931533 if ((pmd_addr + PMD_SIZE) > vma->vm_end)
15941534 goto fallback;
15951535
1596
- if (pgoff >= max_pgoff) {
1536
+ if (xas.xa_index >= max_pgoff) {
15971537 result = VM_FAULT_SIGBUS;
15981538 goto out;
15991539 }
16001540
16011541 /* If the PMD would extend beyond the file size */
1602
- if ((pgoff | PG_PMD_COLOUR) >= max_pgoff)
1542
+ if ((xas.xa_index | PG_PMD_COLOUR) >= max_pgoff)
16031543 goto fallback;
16041544
16051545 /*
1606
- * grab_mapping_entry() will make sure we get a 2MiB empty entry, a
1607
- * 2MiB zero page entry or a DAX PMD. If it can't (because a 4k page
1608
- * is already in the tree, for instance), it will return -EEXIST and
1609
- * we just fall back to 4k entries.
1546
+ * grab_mapping_entry() will make sure we get an empty PMD entry,
1547
+ * a zero PMD entry or a DAX PMD. If it can't (because a PTE
1548
+ * entry is already in the array, for instance), it will return
1549
+ * VM_FAULT_FALLBACK.
16101550 */
1611
- entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
1612
- if (IS_ERR(entry))
1551
+ entry = grab_mapping_entry(&xas, mapping, PMD_ORDER);
1552
+ if (xa_is_internal(entry)) {
1553
+ result = xa_to_internal(entry);
16131554 goto fallback;
1555
+ }
16141556
16151557 /*
16161558 * It is possible, particularly with mixed reads & writes to private
....@@ -1629,8 +1571,9 @@
16291571 * setting up a mapping, so really we're using iomap_begin() as a way
16301572 * to look up our filesystem block.
16311573 */
1632
- pos = (loff_t)pgoff << PAGE_SHIFT;
1633
- error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
1574
+ pos = (loff_t)xas.xa_index << PAGE_SHIFT;
1575
+ error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap,
1576
+ &srcmap);
16341577 if (error)
16351578 goto unlock_entry;
16361579
....@@ -1645,8 +1588,8 @@
16451588 if (error < 0)
16461589 goto finish_iomap;
16471590
1648
- entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
1649
- RADIX_DAX_PMD, write && !sync);
1591
+ entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn,
1592
+ DAX_PMD, write && !sync);
16501593
16511594 /*
16521595 * If we are doing synchronous page fault and inode needs fsync,
....@@ -1669,7 +1612,7 @@
16691612 case IOMAP_HOLE:
16701613 if (WARN_ON_ONCE(write))
16711614 break;
1672
- result = dax_pmd_load_hole(vmf, &iomap, entry);
1615
+ result = dax_pmd_load_hole(&xas, vmf, &iomap, &entry);
16731616 break;
16741617 default:
16751618 WARN_ON_ONCE(1);
....@@ -1692,7 +1635,7 @@
16921635 &iomap);
16931636 }
16941637 unlock_entry:
1695
- put_locked_mapping_entry(mapping, pgoff);
1638
+ dax_unlock_entry(&xas, entry);
16961639 fallback:
16971640 if (result == VM_FAULT_FALLBACK) {
16981641 split_huge_pmd(vma, vmf->pmd, vmf->address);
....@@ -1737,53 +1680,46 @@
17371680 }
17381681 EXPORT_SYMBOL_GPL(dax_iomap_fault);
17391682
1740
-/**
1683
+/*
17411684 * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables
17421685 * @vmf: The description of the fault
1743
- * @pe_size: Size of entry to be inserted
17441686 * @pfn: PFN to insert
1687
+ * @order: Order of entry to insert.
17451688 *
1746
- * This function inserts writeable PTE or PMD entry into page tables for mmaped
1747
- * DAX file. It takes care of marking corresponding radix tree entry as dirty
1748
- * as well.
1689
+ * This function inserts a writeable PTE or PMD entry into the page tables
1690
+ * for an mmapped DAX file. It also marks the page cache entry as dirty.
17491691 */
1750
-static vm_fault_t dax_insert_pfn_mkwrite(struct vm_fault *vmf,
1751
- enum page_entry_size pe_size,
1752
- pfn_t pfn)
1692
+static vm_fault_t
1693
+dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
17531694 {
17541695 struct address_space *mapping = vmf->vma->vm_file->f_mapping;
1755
- void *entry, **slot;
1756
- pgoff_t index = vmf->pgoff;
1696
+ XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order);
1697
+ void *entry;
17571698 vm_fault_t ret;
17581699
1759
- xa_lock_irq(&mapping->i_pages);
1760
- entry = get_unlocked_mapping_entry(mapping, index, &slot);
1700
+ xas_lock_irq(&xas);
1701
+ entry = get_unlocked_entry(&xas, order);
17611702 /* Did we race with someone splitting entry or so? */
1762
- if (!entry ||
1763
- (pe_size == PE_SIZE_PTE && !dax_is_pte_entry(entry)) ||
1764
- (pe_size == PE_SIZE_PMD && !dax_is_pmd_entry(entry))) {
1765
- put_unlocked_mapping_entry(mapping, index, entry);
1766
- xa_unlock_irq(&mapping->i_pages);
1703
+ if (!entry || dax_is_conflict(entry) ||
1704
+ (order == 0 && !dax_is_pte_entry(entry))) {
1705
+ put_unlocked_entry(&xas, entry, WAKE_NEXT);
1706
+ xas_unlock_irq(&xas);
17671707 trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
17681708 VM_FAULT_NOPAGE);
17691709 return VM_FAULT_NOPAGE;
17701710 }
1771
- radix_tree_tag_set(&mapping->i_pages, index, PAGECACHE_TAG_DIRTY);
1772
- entry = lock_slot(mapping, slot);
1773
- xa_unlock_irq(&mapping->i_pages);
1774
- switch (pe_size) {
1775
- case PE_SIZE_PTE:
1711
+ xas_set_mark(&xas, PAGECACHE_TAG_DIRTY);
1712
+ dax_lock_entry(&xas, entry);
1713
+ xas_unlock_irq(&xas);
1714
+ if (order == 0)
17761715 ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
1777
- break;
17781716 #ifdef CONFIG_FS_DAX_PMD
1779
- case PE_SIZE_PMD:
1717
+ else if (order == PMD_ORDER)
17801718 ret = vmf_insert_pfn_pmd(vmf, pfn, FAULT_FLAG_WRITE);
1781
- break;
17821719 #endif
1783
- default:
1720
+ else
17841721 ret = VM_FAULT_FALLBACK;
1785
- }
1786
- put_locked_mapping_entry(mapping, index);
1722
+ dax_unlock_entry(&xas, entry);
17871723 trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret);
17881724 return ret;
17891725 }
....@@ -1803,17 +1739,12 @@
18031739 {
18041740 int err;
18051741 loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
1806
- size_t len = 0;
1742
+ unsigned int order = pe_order(pe_size);
1743
+ size_t len = PAGE_SIZE << order;
18071744
1808
- if (pe_size == PE_SIZE_PTE)
1809
- len = PAGE_SIZE;
1810
- else if (pe_size == PE_SIZE_PMD)
1811
- len = PMD_SIZE;
1812
- else
1813
- WARN_ON_ONCE(1);
18141745 err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1);
18151746 if (err)
18161747 return VM_FAULT_SIGBUS;
1817
- return dax_insert_pfn_mkwrite(vmf, pe_size, pfn);
1748
+ return dax_insert_pfn_mkwrite(vmf, pfn, order);
18181749 }
18191750 EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
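For context, a hedged sketch of how a filesystem fault handler typically ties dax_iomap_fault() and dax_finish_sync_fault() together, loosely modelled on the ext4/xfs pattern; the "myfs_" names and iomap ops are hypothetical, and real handlers also take filesystem locks around the call:

static vm_fault_t myfs_dax_huge_fault(struct vm_fault *vmf,
		enum page_entry_size pe_size)
{
	pfn_t pfn;
	vm_fault_t ret;

	/* dax_iomap_fault() hands back the mapped pfn for sync faults. */
	ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, &myfs_iomap_ops);
	if (ret & VM_FAULT_NEEDDSYNC) {
		/*
		 * Synchronous fault: metadata must be durable before the
		 * writeable PTE/PMD is installed, which is what
		 * dax_finish_sync_fault() arranges via vfs_fsync_range().
		 */
		ret = dax_finish_sync_fault(vmf, pe_size, pfn);
	}
	return ret;
}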