| .. | .. | 
|---|
|  | 1 | +// SPDX-License-Identifier: GPL-2.0-only | 
|---|
| 1 | 2 | /* | 
|---|
| 2 | 3 | * fs/dax.c - Direct Access filesystem code | 
|---|
| 3 | 4 | * Copyright (c) 2013-2014 Intel Corporation | 
|---|
| 4 | 5 | * Author: Matthew Wilcox <matthew.r.wilcox@intel.com> | 
|---|
| 5 | 6 | * Author: Ross Zwisler <ross.zwisler@linux.intel.com> | 
|---|
| 6 |  | - * | 
|---|
| 7 |  | - * This program is free software; you can redistribute it and/or modify it | 
|---|
| 8 |  | - * under the terms and conditions of the GNU General Public License, | 
|---|
| 9 |  | - * version 2, as published by the Free Software Foundation. | 
|---|
| 10 |  | - * | 
|---|
| 11 |  | - * This program is distributed in the hope it will be useful, but WITHOUT | 
|---|
| 12 |  | - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | 
|---|
| 13 |  | - * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for | 
|---|
| 14 |  | - * more details. | 
|---|
| 15 | 7 | */ | 
|---|
| 16 | 8 |  | 
|---|
| 17 | 9 | #include <linux/atomic.h> | 
|---|
| .. | .. | 
|---|
| 33 | 25 | #include <linux/sizes.h> | 
|---|
| 34 | 26 | #include <linux/mmu_notifier.h> | 
|---|
| 35 | 27 | #include <linux/iomap.h> | 
|---|
| 36 |  | -#include "internal.h" | 
|---|
|  | 28 | +#include <asm/pgalloc.h> | 
|---|
| 37 | 29 |  | 
|---|
| 38 | 30 | #define CREATE_TRACE_POINTS | 
|---|
| 39 | 31 | #include <trace/events/fs_dax.h> | 
|---|
|  | 32 | + | 
|---|
|  | 33 | +static inline unsigned int pe_order(enum page_entry_size pe_size) | 
|---|
|  | 34 | +{ | 
|---|
|  | 35 | +	if (pe_size == PE_SIZE_PTE) | 
|---|
|  | 36 | +		return PAGE_SHIFT - PAGE_SHIFT; | 
|---|
|  | 37 | +	if (pe_size == PE_SIZE_PMD) | 
|---|
|  | 38 | +		return PMD_SHIFT - PAGE_SHIFT; | 
|---|
|  | 39 | +	if (pe_size == PE_SIZE_PUD) | 
|---|
|  | 40 | +		return PUD_SHIFT - PAGE_SHIFT; | 
|---|
|  | 41 | +	return ~0; | 
|---|
|  | 42 | +} | 
|---|
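The new pe_order() helper converts the fault granularity (PTE/PMD/PUD) into an entry order, i.e. log2 of the number of pages covered. As a quick sanity check, here is a minimal user-space sketch of the same arithmetic using assumed x86-64 constants (4 KiB pages, 2 MiB PMDs, 1 GiB PUDs); in the kernel these shifts come from the architecture headers.

```c
#include <stdio.h>

/* Assumed x86-64 defaults; the kernel gets these from asm headers. */
#define PAGE_SHIFT 12
#define PMD_SHIFT  21
#define PUD_SHIFT  30

int main(void)
{
	printf("PTE order: %d\n", PAGE_SHIFT - PAGE_SHIFT); /* 0  -> 1 page        */
	printf("PMD order: %d\n", PMD_SHIFT - PAGE_SHIFT);  /* 9  -> 512 pages     */
	printf("PUD order: %d\n", PUD_SHIFT - PAGE_SHIFT);  /* 18 -> 262144 pages  */
	return 0;
}
```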
| 40 | 43 |  | 
|---|
| 41 | 44 | /* We choose 4096 entries - same as per-zone page wait tables */ | 
|---|
| 42 | 45 | #define DAX_WAIT_TABLE_BITS 12 | 
|---|
| .. | .. | 
|---|
| 45 | 48 | /* The 'colour' (ie low bits) within a PMD of a page offset.  */ | 
|---|
| 46 | 49 | #define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1) | 
|---|
| 47 | 50 | #define PG_PMD_NR	(PMD_SIZE >> PAGE_SHIFT) | 
|---|
|  | 51 | + | 
|---|
|  | 52 | +/* The order of a PMD entry */ | 
|---|
|  | 53 | +#define PMD_ORDER	(PMD_SHIFT - PAGE_SHIFT) | 
|---|
| 48 | 54 |  | 
|---|
| 49 | 55 | static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; | 
|---|
| 50 | 56 |  | 
|---|
| .. | .. | 
|---|
| 59 | 65 | fs_initcall(init_dax_wait_table); | 
|---|
| 60 | 66 |  | 
|---|
| 61 | 67 | /* | 
|---|
| 62 |  | - * We use lowest available bit in exceptional entry for locking, one bit for | 
|---|
| 63 |  | - * the entry size (PMD) and two more to tell us if the entry is a zero page or | 
|---|
| 64 |  | - * an empty entry that is just used for locking.  In total four special bits. | 
|---|
|  | 68 | + * DAX pagecache entries use XArray value entries so they can't be mistaken | 
|---|
|  | 69 | + * for pages.  We use one bit for locking, one bit for the entry size (PMD) | 
|---|
|  | 70 | + * and two more to tell us if the entry is a zero page or an empty entry that | 
|---|
|  | 71 | + * is just used for locking.  In total four special bits. | 
|---|
| 65 | 72 | * | 
|---|
| 66 | 73 | * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE | 
|---|
| 67 | 74 | * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem | 
|---|
| 68 | 75 | * block allocation. | 
|---|
| 69 | 76 | */ | 
|---|
| 70 |  | -#define RADIX_DAX_SHIFT		(RADIX_TREE_EXCEPTIONAL_SHIFT + 4) | 
|---|
| 71 |  | -#define RADIX_DAX_ENTRY_LOCK	(1 << RADIX_TREE_EXCEPTIONAL_SHIFT) | 
|---|
| 72 |  | -#define RADIX_DAX_PMD		(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1)) | 
|---|
| 73 |  | -#define RADIX_DAX_ZERO_PAGE	(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2)) | 
|---|
| 74 |  | -#define RADIX_DAX_EMPTY		(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3)) | 
|---|
|  | 77 | +#define DAX_SHIFT	(4) | 
|---|
|  | 78 | +#define DAX_LOCKED	(1UL << 0) | 
|---|
|  | 79 | +#define DAX_PMD		(1UL << 1) | 
|---|
|  | 80 | +#define DAX_ZERO_PAGE	(1UL << 2) | 
|---|
|  | 81 | +#define DAX_EMPTY	(1UL << 3) | 
|---|
| 75 | 82 |  | 
|---|
| 76 |  | -static unsigned long dax_radix_pfn(void *entry) | 
|---|
|  | 83 | +static unsigned long dax_to_pfn(void *entry) | 
|---|
| 77 | 84 | { | 
|---|
| 78 |  | -	return (unsigned long)entry >> RADIX_DAX_SHIFT; | 
|---|
|  | 85 | +	return xa_to_value(entry) >> DAX_SHIFT; | 
|---|
| 79 | 86 | } | 
|---|
| 80 | 87 |  | 
|---|
| 81 |  | -static void *dax_radix_locked_entry(unsigned long pfn, unsigned long flags) | 
|---|
|  | 88 | +static void *dax_make_entry(pfn_t pfn, unsigned long flags) | 
|---|
| 82 | 89 | { | 
|---|
| 83 |  | -	return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags | | 
|---|
| 84 |  | -			(pfn << RADIX_DAX_SHIFT) | RADIX_DAX_ENTRY_LOCK); | 
|---|
|  | 90 | +	return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT)); | 
|---|
| 85 | 91 | } | 
|---|
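The comment above documents the layout of a DAX XArray value entry: four flag bits (lock, PMD, zero page, empty) in the low bits and the pfn shifted up by DAX_SHIFT. The stand-alone sketch below models dax_make_entry()/dax_to_pfn() with plain integers to show the round trip; xa_mk_value()/xa_to_value() and pfn_t are deliberately left out, so treat it only as an illustration of the bit layout.

```c
#include <stdio.h>

#define DAX_SHIFT	4
#define DAX_LOCKED	(1UL << 0)
#define DAX_PMD		(1UL << 1)
#define DAX_ZERO_PAGE	(1UL << 2)
#define DAX_EMPTY	(1UL << 3)

/* Simplified model: the real code wraps this in xa_mk_value()/xa_to_value(). */
static unsigned long make_entry(unsigned long pfn, unsigned long flags)
{
	return flags | (pfn << DAX_SHIFT);
}

static unsigned long entry_to_pfn(unsigned long entry)
{
	return entry >> DAX_SHIFT;
}

int main(void)
{
	unsigned long e = make_entry(0x12345, DAX_PMD);

	printf("pfn=%#lx pmd=%lu locked=%lu\n",
	       entry_to_pfn(e),
	       e & DAX_PMD ? 1UL : 0UL,
	       e & DAX_LOCKED ? 1UL : 0UL);
	return 0;
}
```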
| 86 | 92 |  | 
|---|
| 87 |  | -static unsigned int dax_radix_order(void *entry) | 
|---|
|  | 93 | +static bool dax_is_locked(void *entry) | 
|---|
| 88 | 94 | { | 
|---|
| 89 |  | -	if ((unsigned long)entry & RADIX_DAX_PMD) | 
|---|
| 90 |  | -		return PMD_SHIFT - PAGE_SHIFT; | 
|---|
|  | 95 | +	return xa_to_value(entry) & DAX_LOCKED; | 
|---|
|  | 96 | +} | 
|---|
|  | 97 | + | 
|---|
|  | 98 | +static unsigned int dax_entry_order(void *entry) | 
|---|
|  | 99 | +{ | 
|---|
|  | 100 | +	if (xa_to_value(entry) & DAX_PMD) | 
|---|
|  | 101 | +		return PMD_ORDER; | 
|---|
| 91 | 102 | return 0; | 
|---|
| 92 | 103 | } | 
|---|
| 93 | 104 |  | 
|---|
| 94 |  | -static int dax_is_pmd_entry(void *entry) | 
|---|
|  | 105 | +static unsigned long dax_is_pmd_entry(void *entry) | 
|---|
| 95 | 106 | { | 
|---|
| 96 |  | -	return (unsigned long)entry & RADIX_DAX_PMD; | 
|---|
|  | 107 | +	return xa_to_value(entry) & DAX_PMD; | 
|---|
| 97 | 108 | } | 
|---|
| 98 | 109 |  | 
|---|
| 99 |  | -static int dax_is_pte_entry(void *entry) | 
|---|
|  | 110 | +static bool dax_is_pte_entry(void *entry) | 
|---|
| 100 | 111 | { | 
|---|
| 101 |  | -	return !((unsigned long)entry & RADIX_DAX_PMD); | 
|---|
|  | 112 | +	return !(xa_to_value(entry) & DAX_PMD); | 
|---|
| 102 | 113 | } | 
|---|
| 103 | 114 |  | 
|---|
| 104 | 115 | static int dax_is_zero_entry(void *entry) | 
|---|
| 105 | 116 | { | 
|---|
| 106 |  | -	return (unsigned long)entry & RADIX_DAX_ZERO_PAGE; | 
|---|
|  | 117 | +	return xa_to_value(entry) & DAX_ZERO_PAGE; | 
|---|
| 107 | 118 | } | 
|---|
| 108 | 119 |  | 
|---|
| 109 | 120 | static int dax_is_empty_entry(void *entry) | 
|---|
| 110 | 121 | { | 
|---|
| 111 |  | -	return (unsigned long)entry & RADIX_DAX_EMPTY; | 
|---|
|  | 122 | +	return xa_to_value(entry) & DAX_EMPTY; | 
|---|
| 112 | 123 | } | 
|---|
| 113 | 124 |  | 
|---|
| 114 | 125 | /* | 
|---|
| 115 |  | - * DAX radix tree locking | 
|---|
|  | 126 | + * true if the entry that was found is of a smaller order than the entry | 
|---|
|  | 127 | + * we were looking for | 
|---|
|  | 128 | + */ | 
|---|
|  | 129 | +static bool dax_is_conflict(void *entry) | 
|---|
|  | 130 | +{ | 
|---|
|  | 131 | +	return entry == XA_RETRY_ENTRY; | 
|---|
|  | 132 | +} | 
|---|
|  | 133 | + | 
|---|
|  | 134 | +/* | 
|---|
|  | 135 | + * DAX page cache entry locking | 
|---|
| 116 | 136 | */ | 
|---|
| 117 | 137 | struct exceptional_entry_key { | 
|---|
| 118 |  | -	struct address_space *mapping; | 
|---|
|  | 138 | +	struct xarray *xa; | 
|---|
| 119 | 139 | pgoff_t entry_start; | 
|---|
| 120 | 140 | }; | 
|---|
| 121 | 141 |  | 
|---|
| .. | .. | 
|---|
| 124 | 144 | struct exceptional_entry_key key; | 
|---|
| 125 | 145 | }; | 
|---|
| 126 | 146 |  | 
|---|
| 127 |  | -static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping, | 
|---|
| 128 |  | -		pgoff_t index, void *entry, struct exceptional_entry_key *key) | 
|---|
|  | 147 | +/** | 
|---|
|  | 148 | + * enum dax_wake_mode: waitqueue wakeup behaviour | 
|---|
|  | 149 | + * @WAKE_ALL: wake all waiters in the waitqueue | 
|---|
|  | 150 | + * @WAKE_NEXT: wake only the first waiter in the waitqueue | 
|---|
|  | 151 | + */ | 
|---|
|  | 152 | +enum dax_wake_mode { | 
|---|
|  | 153 | +	WAKE_ALL, | 
|---|
|  | 154 | +	WAKE_NEXT, | 
|---|
|  | 155 | +}; | 
|---|
|  | 156 | + | 
|---|
|  | 157 | +static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas, | 
|---|
|  | 158 | +		void *entry, struct exceptional_entry_key *key) | 
|---|
| 129 | 159 | { | 
|---|
| 130 | 160 | unsigned long hash; | 
|---|
|  | 161 | +	unsigned long index = xas->xa_index; | 
|---|
| 131 | 162 |  | 
|---|
| 132 | 163 | /* | 
|---|
| 133 | 164 | * If 'entry' is a PMD, align the 'index' that we use for the wait | 
|---|
| .. | .. | 
|---|
| 136 | 167 | */ | 
|---|
| 137 | 168 | if (dax_is_pmd_entry(entry)) | 
|---|
| 138 | 169 | index &= ~PG_PMD_COLOUR; | 
|---|
| 139 |  | - | 
|---|
| 140 |  | -	key->mapping = mapping; | 
|---|
|  | 170 | +	key->xa = xas->xa; | 
|---|
| 141 | 171 | key->entry_start = index; | 
|---|
| 142 | 172 |  | 
|---|
| 143 |  | -	hash = hash_long((unsigned long)mapping ^ index, DAX_WAIT_TABLE_BITS); | 
|---|
|  | 173 | +	hash = hash_long((unsigned long)xas->xa ^ index, DAX_WAIT_TABLE_BITS); | 
|---|
| 144 | 174 | return wait_table + hash; | 
|---|
| 145 | 175 | } | 
|---|
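dax_entry_waitqueue() now derives its key from the xa_state, and for PMD entries it strips the low "colour" bits of the index first, so every page offset inside one PMD-sized entry keys onto the same wait queue and wake key. A stand-alone sketch of that masking, assuming 4 KiB pages and 2 MiB PMDs:

```c
#include <stdio.h>

#define PAGE_SHIFT	12
#define PMD_SIZE	(1UL << 21)
#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)	/* 0x1ff for 512 pages */

int main(void)
{
	/* Two page offsets inside the same 512-page (2 MiB) PMD range. */
	unsigned long a = 0x12205, b = 0x123fe;

	printf("a -> key %#lx\n", a & ~PG_PMD_COLOUR);	/* 0x12200 */
	printf("b -> key %#lx\n", b & ~PG_PMD_COLOUR);	/* 0x12200, same key */
	return 0;
}
```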
| 146 | 176 |  | 
|---|
| 147 |  | -static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mode, | 
|---|
| 148 |  | -				       int sync, void *keyp) | 
|---|
|  | 177 | +static int wake_exceptional_entry_func(wait_queue_entry_t *wait, | 
|---|
|  | 178 | +		unsigned int mode, int sync, void *keyp) | 
|---|
| 149 | 179 | { | 
|---|
| 150 | 180 | struct exceptional_entry_key *key = keyp; | 
|---|
| 151 | 181 | struct wait_exceptional_entry_queue *ewait = | 
|---|
| 152 | 182 | container_of(wait, struct wait_exceptional_entry_queue, wait); | 
|---|
| 153 | 183 |  | 
|---|
| 154 |  | -	if (key->mapping != ewait->key.mapping || | 
|---|
|  | 184 | +	if (key->xa != ewait->key.xa || | 
|---|
| 155 | 185 | key->entry_start != ewait->key.entry_start) | 
|---|
| 156 | 186 | return 0; | 
|---|
| 157 | 187 | return autoremove_wake_function(wait, mode, sync, NULL); | 
|---|
| .. | .. | 
|---|
| 162 | 192 | * The important information it's conveying is whether the entry at | 
|---|
| 163 | 193 | * this index used to be a PMD entry. | 
|---|
| 164 | 194 | */ | 
|---|
| 165 |  | -static void dax_wake_mapping_entry_waiter(struct address_space *mapping, | 
|---|
| 166 |  | -		pgoff_t index, void *entry, bool wake_all) | 
|---|
|  | 195 | +static void dax_wake_entry(struct xa_state *xas, void *entry, | 
|---|
|  | 196 | +			   enum dax_wake_mode mode) | 
|---|
| 167 | 197 | { | 
|---|
| 168 | 198 | struct exceptional_entry_key key; | 
|---|
| 169 | 199 | wait_queue_head_t *wq; | 
|---|
| 170 | 200 |  | 
|---|
| 171 |  | -	wq = dax_entry_waitqueue(mapping, index, entry, &key); | 
|---|
|  | 201 | +	wq = dax_entry_waitqueue(xas, entry, &key); | 
|---|
| 172 | 202 |  | 
|---|
| 173 | 203 | /* | 
|---|
| 174 | 204 | * Checking for locked entry and prepare_to_wait_exclusive() happens | 
|---|
| .. | .. | 
|---|
| 177 | 207 | * must be in the waitqueue and the following check will see them. | 
|---|
| 178 | 208 | */ | 
|---|
| 179 | 209 | if (waitqueue_active(wq)) | 
|---|
| 180 |  | -		__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key); | 
|---|
|  | 210 | +		__wake_up(wq, TASK_NORMAL, mode == WAKE_ALL ? 0 : 1, &key); | 
|---|
| 181 | 211 | } | 
|---|
| 182 | 212 |  | 
|---|
| 183 | 213 | /* | 
|---|
| 184 |  | - * Check whether the given slot is locked.  Must be called with the i_pages | 
|---|
| 185 |  | - * lock held. | 
|---|
| 186 |  | - */ | 
|---|
| 187 |  | -static inline int slot_locked(struct address_space *mapping, void **slot) | 
|---|
| 188 |  | -{ | 
|---|
| 189 |  | -	unsigned long entry = (unsigned long) | 
|---|
| 190 |  | -		radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock); | 
|---|
| 191 |  | -	return entry & RADIX_DAX_ENTRY_LOCK; | 
|---|
| 192 |  | -} | 
|---|
| 193 |  | - | 
|---|
| 194 |  | -/* | 
|---|
| 195 |  | - * Mark the given slot as locked.  Must be called with the i_pages lock held. | 
|---|
| 196 |  | - */ | 
|---|
| 197 |  | -static inline void *lock_slot(struct address_space *mapping, void **slot) | 
|---|
| 198 |  | -{ | 
|---|
| 199 |  | -	unsigned long entry = (unsigned long) | 
|---|
| 200 |  | -		radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock); | 
|---|
| 201 |  | - | 
|---|
| 202 |  | -	entry |= RADIX_DAX_ENTRY_LOCK; | 
|---|
| 203 |  | -	radix_tree_replace_slot(&mapping->i_pages, slot, (void *)entry); | 
|---|
| 204 |  | -	return (void *)entry; | 
|---|
| 205 |  | -} | 
|---|
| 206 |  | - | 
|---|
| 207 |  | -/* | 
|---|
| 208 |  | - * Mark the given slot as unlocked.  Must be called with the i_pages lock held. | 
|---|
| 209 |  | - */ | 
|---|
| 210 |  | -static inline void *unlock_slot(struct address_space *mapping, void **slot) | 
|---|
| 211 |  | -{ | 
|---|
| 212 |  | -	unsigned long entry = (unsigned long) | 
|---|
| 213 |  | -		radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock); | 
|---|
| 214 |  | - | 
|---|
| 215 |  | -	entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK; | 
|---|
| 216 |  | -	radix_tree_replace_slot(&mapping->i_pages, slot, (void *)entry); | 
|---|
| 217 |  | -	return (void *)entry; | 
|---|
| 218 |  | -} | 
|---|
| 219 |  | - | 
|---|
| 220 |  | -static void put_unlocked_mapping_entry(struct address_space *mapping, | 
|---|
| 221 |  | -				       pgoff_t index, void *entry); | 
|---|
| 222 |  | - | 
|---|
| 223 |  | -/* | 
|---|
| 224 |  | - * Lookup entry in radix tree, wait for it to become unlocked if it is | 
|---|
| 225 |  | - * exceptional entry and return it. The caller must call | 
|---|
| 226 |  | - * put_unlocked_mapping_entry() when he decided not to lock the entry or | 
|---|
| 227 |  | - * put_locked_mapping_entry() when he locked the entry and now wants to | 
|---|
| 228 |  | - * unlock it. | 
|---|
|  | 214 | + * Look up entry in page cache, wait for it to become unlocked if it | 
|---|
|  | 215 | + * is a DAX entry and return it.  The caller must subsequently call | 
|---|
|  | 216 | + * put_unlocked_entry() if it did not lock the entry or dax_unlock_entry() | 
|---|
|  | 217 | + * if it did.  The entry returned may have a larger order than @order. | 
|---|
|  | 218 | + * If @order is larger than the order of the entry found in i_pages, this | 
|---|
|  | 219 | + * function returns a dax_is_conflict entry. | 
|---|
| 229 | 220 | * | 
|---|
| 230 | 221 | * Must be called with the i_pages lock held. | 
|---|
| 231 | 222 | */ | 
|---|
| 232 |  | -static void *get_unlocked_mapping_entry(struct address_space *mapping, | 
|---|
| 233 |  | -		pgoff_t index, void ***slotp) | 
|---|
|  | 223 | +static void *get_unlocked_entry(struct xa_state *xas, unsigned int order) | 
|---|
| 234 | 224 | { | 
|---|
| 235 |  | -	void *entry, **slot; | 
|---|
|  | 225 | +	void *entry; | 
|---|
| 236 | 226 | struct wait_exceptional_entry_queue ewait; | 
|---|
| 237 | 227 | wait_queue_head_t *wq; | 
|---|
| 238 | 228 |  | 
|---|
| .. | .. | 
|---|
| 240 | 230 | ewait.wait.func = wake_exceptional_entry_func; | 
|---|
| 241 | 231 |  | 
|---|
| 242 | 232 | for (;;) { | 
|---|
| 243 |  | -		entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, | 
|---|
| 244 |  | -					  &slot); | 
|---|
| 245 |  | -		if (!entry || | 
|---|
| 246 |  | -		    WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)) || | 
|---|
| 247 |  | -		    !slot_locked(mapping, slot)) { | 
|---|
| 248 |  | -			if (slotp) | 
|---|
| 249 |  | -				*slotp = slot; | 
|---|
|  | 233 | +		entry = xas_find_conflict(xas); | 
|---|
|  | 234 | +		if (!entry || WARN_ON_ONCE(!xa_is_value(entry))) | 
|---|
| 250 | 235 | return entry; | 
|---|
| 251 |  | -		} | 
|---|
|  | 236 | +		if (dax_entry_order(entry) < order) | 
|---|
|  | 237 | +			return XA_RETRY_ENTRY; | 
|---|
|  | 238 | +		if (!dax_is_locked(entry)) | 
|---|
|  | 239 | +			return entry; | 
|---|
| 252 | 240 |  | 
|---|
| 253 |  | -		wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key); | 
|---|
|  | 241 | +		wq = dax_entry_waitqueue(xas, entry, &ewait.key); | 
|---|
| 254 | 242 | prepare_to_wait_exclusive(wq, &ewait.wait, | 
|---|
| 255 | 243 | TASK_UNINTERRUPTIBLE); | 
|---|
| 256 |  | -		xa_unlock_irq(&mapping->i_pages); | 
|---|
|  | 244 | +		xas_unlock_irq(xas); | 
|---|
|  | 245 | +		xas_reset(xas); | 
|---|
| 257 | 246 | schedule(); | 
|---|
| 258 | 247 | finish_wait(wq, &ewait.wait); | 
|---|
| 259 |  | -		xa_lock_irq(&mapping->i_pages); | 
|---|
|  | 248 | +		xas_lock_irq(xas); | 
|---|
| 260 | 249 | } | 
|---|
| 261 | 250 | } | 
|---|
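get_unlocked_entry() now has three outcomes: NULL when nothing is stored at the index, the XA_RETRY_ENTRY conflict marker when the entry found has a smaller order than requested, and an unlocked DAX entry that the caller must later hand back via put_unlocked_entry() or dax_unlock_entry(). A condensed, non-compilable sketch of that caller contract, modelled on __dax_invalidate_entry() later in this patch:

```c
/* Sketch only: mirrors the pattern used by __dax_invalidate_entry(). */
xas_lock_irq(&xas);
entry = get_unlocked_entry(&xas, 0);
if (entry && !dax_is_conflict(entry) && xa_is_value(entry)) {
	/* ... inspect or modify the entry while still holding xa_lock ... */
}
put_unlocked_entry(&xas, entry, WAKE_NEXT);	/* safe for NULL/conflict too */
xas_unlock_irq(&xas);
```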
| 262 | 251 |  | 
|---|
| .. | .. | 
|---|
| 265 | 254 | * (it's cycled in clear_inode() after removing the entries from i_pages) | 
|---|
| 266 | 255 | * After we call xas_unlock_irq(), we cannot touch xas->xa. | 
|---|
| 267 | 256 | */ | 
|---|
| 268 |  | -static void wait_entry_unlocked(struct address_space *mapping, pgoff_t index, | 
|---|
| 269 |  | -		void ***slotp, void *entry) | 
|---|
|  | 257 | +static void wait_entry_unlocked(struct xa_state *xas, void *entry) | 
|---|
| 270 | 258 | { | 
|---|
| 271 | 259 | struct wait_exceptional_entry_queue ewait; | 
|---|
| 272 | 260 | wait_queue_head_t *wq; | 
|---|
| .. | .. | 
|---|
| 274 | 262 | init_wait(&ewait.wait); | 
|---|
| 275 | 263 | ewait.wait.func = wake_exceptional_entry_func; | 
|---|
| 276 | 264 |  | 
|---|
| 277 |  | -	wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key); | 
|---|
|  | 265 | +	wq = dax_entry_waitqueue(xas, entry, &ewait.key); | 
|---|
| 278 | 266 | /* | 
|---|
| 279 | 267 | * Unlike get_unlocked_entry() there is no guarantee that this | 
|---|
| 280 | 268 | * path ever successfully retrieves an unlocked entry before an | 
|---|
| .. | .. | 
|---|
| 282 | 270 | * never successfully performs its own wake up. | 
|---|
| 283 | 271 | */ | 
|---|
| 284 | 272 | prepare_to_wait(wq, &ewait.wait, TASK_UNINTERRUPTIBLE); | 
|---|
| 285 |  | -	xa_unlock_irq(&mapping->i_pages); | 
|---|
|  | 273 | +	xas_unlock_irq(xas); | 
|---|
| 286 | 274 | schedule(); | 
|---|
| 287 | 275 | finish_wait(wq, &ewait.wait); | 
|---|
| 288 | 276 | } | 
|---|
| 289 | 277 |  | 
|---|
| 290 |  | -static void unlock_mapping_entry(struct address_space *mapping, pgoff_t index) | 
|---|
|  | 278 | +static void put_unlocked_entry(struct xa_state *xas, void *entry, | 
|---|
|  | 279 | +			       enum dax_wake_mode mode) | 
|---|
| 291 | 280 | { | 
|---|
| 292 |  | -	void *entry, **slot; | 
|---|
| 293 |  | - | 
|---|
| 294 |  | -	xa_lock_irq(&mapping->i_pages); | 
|---|
| 295 |  | -	entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, &slot); | 
|---|
| 296 |  | -	if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) || | 
|---|
| 297 |  | -			 !slot_locked(mapping, slot))) { | 
|---|
| 298 |  | -		xa_unlock_irq(&mapping->i_pages); | 
|---|
| 299 |  | -		return; | 
|---|
| 300 |  | -	} | 
|---|
| 301 |  | -	unlock_slot(mapping, slot); | 
|---|
| 302 |  | -	xa_unlock_irq(&mapping->i_pages); | 
|---|
| 303 |  | -	dax_wake_mapping_entry_waiter(mapping, index, entry, false); | 
|---|
| 304 |  | -} | 
|---|
| 305 |  | - | 
|---|
| 306 |  | -static void put_locked_mapping_entry(struct address_space *mapping, | 
|---|
| 307 |  | -		pgoff_t index) | 
|---|
| 308 |  | -{ | 
|---|
| 309 |  | -	unlock_mapping_entry(mapping, index); | 
|---|
|  | 281 | +	if (entry && !dax_is_conflict(entry)) | 
|---|
|  | 282 | +		dax_wake_entry(xas, entry, mode); | 
|---|
| 310 | 283 | } | 
|---|
| 311 | 284 |  | 
|---|
| 312 | 285 | /* | 
|---|
| 313 |  | - * Called when we are done with radix tree entry we looked up via | 
|---|
| 314 |  | - * get_unlocked_mapping_entry() and which we didn't lock in the end. | 
|---|
|  | 286 | + * We used the xa_state to get the entry, but then we locked the entry and | 
|---|
|  | 287 | + * dropped the xa_lock, so we know the xa_state is stale and must be reset | 
|---|
|  | 288 | + * before use. | 
|---|
| 315 | 289 | */ | 
|---|
| 316 |  | -static void put_unlocked_mapping_entry(struct address_space *mapping, | 
|---|
| 317 |  | -				       pgoff_t index, void *entry) | 
|---|
|  | 290 | +static void dax_unlock_entry(struct xa_state *xas, void *entry) | 
|---|
| 318 | 291 | { | 
|---|
| 319 |  | -	if (!entry) | 
|---|
| 320 |  | -		return; | 
|---|
|  | 292 | +	void *old; | 
|---|
| 321 | 293 |  | 
|---|
| 322 |  | -	/* We have to wake up next waiter for the radix tree entry lock */ | 
|---|
| 323 |  | -	dax_wake_mapping_entry_waiter(mapping, index, entry, false); | 
|---|
|  | 294 | +	BUG_ON(dax_is_locked(entry)); | 
|---|
|  | 295 | +	xas_reset(xas); | 
|---|
|  | 296 | +	xas_lock_irq(xas); | 
|---|
|  | 297 | +	old = xas_store(xas, entry); | 
|---|
|  | 298 | +	xas_unlock_irq(xas); | 
|---|
|  | 299 | +	BUG_ON(!dax_is_locked(old)); | 
|---|
|  | 300 | +	dax_wake_entry(xas, entry, WAKE_NEXT); | 
|---|
|  | 301 | +} | 
|---|
|  | 302 | + | 
|---|
|  | 303 | +/* | 
|---|
|  | 304 | + * Return: The entry stored at this location before it was locked. | 
|---|
|  | 305 | + */ | 
|---|
|  | 306 | +static void *dax_lock_entry(struct xa_state *xas, void *entry) | 
|---|
|  | 307 | +{ | 
|---|
|  | 308 | +	unsigned long v = xa_to_value(entry); | 
|---|
|  | 309 | +	return xas_store(xas, xa_mk_value(v | DAX_LOCKED)); | 
|---|
| 324 | 310 | } | 
|---|
| 325 | 311 |  | 
|---|
| 326 | 312 | static unsigned long dax_entry_size(void *entry) | 
|---|
| .. | .. | 
|---|
| 335 | 321 | return PAGE_SIZE; | 
|---|
| 336 | 322 | } | 
|---|
| 337 | 323 |  | 
|---|
| 338 |  | -static unsigned long dax_radix_end_pfn(void *entry) | 
|---|
|  | 324 | +static unsigned long dax_end_pfn(void *entry) | 
|---|
| 339 | 325 | { | 
|---|
| 340 |  | -	return dax_radix_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE; | 
|---|
|  | 326 | +	return dax_to_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE; | 
|---|
| 341 | 327 | } | 
|---|
| 342 | 328 |  | 
|---|
| 343 | 329 | /* | 
|---|
| .. | .. | 
|---|
| 345 | 331 | * 'empty' and 'zero' entries. | 
|---|
| 346 | 332 | */ | 
|---|
| 347 | 333 | #define for_each_mapped_pfn(entry, pfn) \ | 
|---|
| 348 |  | -	for (pfn = dax_radix_pfn(entry); \ | 
|---|
| 349 |  | -			pfn < dax_radix_end_pfn(entry); pfn++) | 
|---|
|  | 334 | +	for (pfn = dax_to_pfn(entry); \ | 
|---|
|  | 335 | +			pfn < dax_end_pfn(entry); pfn++) | 
|---|
| 350 | 336 |  | 
|---|
| 351 | 337 | /* | 
|---|
| 352 | 338 | * TODO: for reflink+dax we need a way to associate a single page with | 
|---|
| .. | .. | 
|---|
| 403 | 389 | return NULL; | 
|---|
| 404 | 390 | } | 
|---|
| 405 | 391 |  | 
|---|
| 406 |  | -bool dax_lock_mapping_entry(struct page *page) | 
|---|
|  | 392 | +/* | 
|---|
|  | 393 | + * dax_lock_page - Lock the DAX entry corresponding to a page | 
|---|
|  | 394 | + * @page: The page whose entry we want to lock | 
|---|
|  | 395 | + * | 
|---|
|  | 396 | + * Context: Process context. | 
|---|
|  | 397 | + * Return: A cookie to pass to dax_unlock_page() or 0 if the entry could | 
|---|
|  | 398 | + * not be locked. | 
|---|
|  | 399 | + */ | 
|---|
|  | 400 | +dax_entry_t dax_lock_page(struct page *page) | 
|---|
| 407 | 401 | { | 
|---|
| 408 |  | -	pgoff_t index; | 
|---|
| 409 |  | -	struct inode *inode; | 
|---|
| 410 |  | -	bool did_lock = false; | 
|---|
| 411 |  | -	void *entry = NULL, **slot; | 
|---|
| 412 |  | -	struct address_space *mapping; | 
|---|
|  | 402 | +	XA_STATE(xas, NULL, 0); | 
|---|
|  | 403 | +	void *entry; | 
|---|
| 413 | 404 |  | 
|---|
|  | 405 | +	/* Ensure page->mapping isn't freed while we look at it */ | 
|---|
| 414 | 406 | rcu_read_lock(); | 
|---|
| 415 | 407 | for (;;) { | 
|---|
| 416 |  | -		mapping = READ_ONCE(page->mapping); | 
|---|
|  | 408 | +		struct address_space *mapping = READ_ONCE(page->mapping); | 
|---|
| 417 | 409 |  | 
|---|
|  | 410 | +		entry = NULL; | 
|---|
| 418 | 411 | if (!mapping || !dax_mapping(mapping)) | 
|---|
| 419 | 412 | break; | 
|---|
| 420 | 413 |  | 
|---|
| .. | .. | 
|---|
| 425 | 418 | * otherwise we would not have a valid pfn_to_page() | 
|---|
| 426 | 419 | * translation. | 
|---|
| 427 | 420 | */ | 
|---|
| 428 |  | -		inode = mapping->host; | 
|---|
| 429 |  | -		if (S_ISCHR(inode->i_mode)) { | 
|---|
| 430 |  | -			did_lock = true; | 
|---|
|  | 421 | +		entry = (void *)~0UL; | 
|---|
|  | 422 | +		if (S_ISCHR(mapping->host->i_mode)) | 
|---|
| 431 | 423 | break; | 
|---|
| 432 |  | -		} | 
|---|
| 433 | 424 |  | 
|---|
| 434 |  | -		xa_lock_irq(&mapping->i_pages); | 
|---|
|  | 425 | +		xas.xa = &mapping->i_pages; | 
|---|
|  | 426 | +		xas_lock_irq(&xas); | 
|---|
| 435 | 427 | if (mapping != page->mapping) { | 
|---|
| 436 |  | -			xa_unlock_irq(&mapping->i_pages); | 
|---|
|  | 428 | +			xas_unlock_irq(&xas); | 
|---|
| 437 | 429 | continue; | 
|---|
| 438 | 430 | } | 
|---|
| 439 |  | -		index = page->index; | 
|---|
| 440 |  | - | 
|---|
| 441 |  | -		entry = __radix_tree_lookup(&mapping->i_pages, index, | 
|---|
| 442 |  | -						NULL, &slot); | 
|---|
| 443 |  | -		if (!entry) { | 
|---|
| 444 |  | -			xa_unlock_irq(&mapping->i_pages); | 
|---|
| 445 |  | -			break; | 
|---|
| 446 |  | -		} else if (slot_locked(mapping, slot)) { | 
|---|
|  | 431 | +		xas_set(&xas, page->index); | 
|---|
|  | 432 | +		entry = xas_load(&xas); | 
|---|
|  | 433 | +		if (dax_is_locked(entry)) { | 
|---|
| 447 | 434 | rcu_read_unlock(); | 
|---|
| 448 |  | -			wait_entry_unlocked(mapping, index, &slot, entry); | 
|---|
|  | 435 | +			wait_entry_unlocked(&xas, entry); | 
|---|
| 449 | 436 | rcu_read_lock(); | 
|---|
| 450 | 437 | continue; | 
|---|
| 451 | 438 | } | 
|---|
| 452 |  | -		lock_slot(mapping, slot); | 
|---|
| 453 |  | -		did_lock = true; | 
|---|
| 454 |  | -		xa_unlock_irq(&mapping->i_pages); | 
|---|
|  | 439 | +		dax_lock_entry(&xas, entry); | 
|---|
|  | 440 | +		xas_unlock_irq(&xas); | 
|---|
| 455 | 441 | break; | 
|---|
| 456 | 442 | } | 
|---|
| 457 | 443 | rcu_read_unlock(); | 
|---|
| 458 |  | - | 
|---|
| 459 |  | -	return did_lock; | 
|---|
|  | 444 | +	return (dax_entry_t)entry; | 
|---|
| 460 | 445 | } | 
|---|
| 461 | 446 |  | 
|---|
| 462 |  | -void dax_unlock_mapping_entry(struct page *page) | 
|---|
|  | 447 | +void dax_unlock_page(struct page *page, dax_entry_t cookie) | 
|---|
| 463 | 448 | { | 
|---|
| 464 | 449 | struct address_space *mapping = page->mapping; | 
|---|
| 465 |  | -	struct inode *inode = mapping->host; | 
|---|
|  | 450 | +	XA_STATE(xas, &mapping->i_pages, page->index); | 
|---|
| 466 | 451 |  | 
|---|
| 467 |  | -	if (S_ISCHR(inode->i_mode)) | 
|---|
|  | 452 | +	if (S_ISCHR(mapping->host->i_mode)) | 
|---|
| 468 | 453 | return; | 
|---|
| 469 | 454 |  | 
|---|
| 470 |  | -	unlock_mapping_entry(mapping, page->index); | 
|---|
|  | 455 | +	dax_unlock_entry(&xas, (void *)cookie); | 
|---|
| 471 | 456 | } | 
|---|
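dax_lock_page() now returns an opaque dax_entry_t cookie instead of a bool, and dax_unlock_page() takes that cookie back. A hedged sketch of the calling convention (the function name below is made up for illustration; the in-tree caller is the memory-failure path):

```c
/* Illustrative only: how a caller pairs the new cookie API. */
static bool inspect_poisoned_dax_page(struct page *page)
{
	dax_entry_t cookie = dax_lock_page(page);

	if (!cookie)			/* entry could not be locked */
		return false;
	/* ... page->mapping and the DAX entry are stable here ... */
	dax_unlock_page(page, cookie);
	return true;
}
```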
| 472 | 457 |  | 
|---|
| 473 | 458 | /* | 
|---|
| 474 |  | - * Find radix tree entry at given index. If it points to an exceptional entry, | 
|---|
| 475 |  | - * return it with the radix tree entry locked. If the radix tree doesn't | 
|---|
| 476 |  | - * contain given index, create an empty exceptional entry for the index and | 
|---|
| 477 |  | - * return with it locked. | 
|---|
|  | 459 | + * Find page cache entry at given index. If it is a DAX entry, return it | 
|---|
|  | 460 | + * with the entry locked. If the page cache doesn't contain an entry at | 
|---|
|  | 461 | + * that index, add a locked empty entry. | 
|---|
| 478 | 462 | * | 
|---|
| 479 |  | - * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will | 
|---|
| 480 |  | - * either return that locked entry or will return an error.  This error will | 
|---|
| 481 |  | - * happen if there are any 4k entries within the 2MiB range that we are | 
|---|
| 482 |  | - * requesting. | 
|---|
|  | 463 | + * When requesting an entry with size DAX_PMD, grab_mapping_entry() will | 
|---|
|  | 464 | + * either return that locked entry or will return VM_FAULT_FALLBACK. | 
|---|
|  | 465 | + * This will happen if there are any PTE entries within the PMD range | 
|---|
|  | 466 | + * that we are requesting. | 
|---|
| 483 | 467 | * | 
|---|
| 484 |  | - * We always favor 4k entries over 2MiB entries. There isn't a flow where we | 
|---|
| 485 |  | - * evict 4k entries in order to 'upgrade' them to a 2MiB entry.  A 2MiB | 
|---|
| 486 |  | - * insertion will fail if it finds any 4k entries already in the tree, and a | 
|---|
| 487 |  | - * 4k insertion will cause an existing 2MiB entry to be unmapped and | 
|---|
| 488 |  | - * downgraded to 4k entries.  This happens for both 2MiB huge zero pages as | 
|---|
| 489 |  | - * well as 2MiB empty entries. | 
|---|
|  | 468 | + * We always favor PTE entries over PMD entries. There isn't a flow where we | 
|---|
|  | 469 | + * evict PTE entries in order to 'upgrade' them to a PMD entry.  A PMD | 
|---|
|  | 470 | + * insertion will fail if it finds any PTE entries already in the tree, and a | 
|---|
|  | 471 | + * PTE insertion will cause an existing PMD entry to be unmapped and | 
|---|
|  | 472 | + * downgraded to PTE entries.  This happens for both PMD zero pages as | 
|---|
|  | 473 | + * well as PMD empty entries. | 
|---|
| 490 | 474 | * | 
|---|
| 491 |  | - * The exception to this downgrade path is for 2MiB DAX PMD entries that have | 
|---|
| 492 |  | - * real storage backing them.  We will leave these real 2MiB DAX entries in | 
|---|
| 493 |  | - * the tree, and PTE writes will simply dirty the entire 2MiB DAX entry. | 
|---|
|  | 475 | + * The exception to this downgrade path is for PMD entries that have | 
|---|
|  | 476 | + * real storage backing them.  We will leave these real PMD entries in | 
|---|
|  | 477 | + * the tree, and PTE writes will simply dirty the entire PMD entry. | 
|---|
| 494 | 478 | * | 
|---|
| 495 | 479 | * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For | 
|---|
| 496 | 480 | * persistent memory the benefit is doubtful. We can add that later if we can | 
|---|
| 497 | 481 | * show it helps. | 
|---|
|  | 482 | + * | 
|---|
|  | 483 | + * On error, this function does not return an ERR_PTR.  Instead it returns | 
|---|
|  | 484 | + * a VM_FAULT code, encoded as an xarray internal entry.  The ERR_PTR values | 
|---|
|  | 485 | + * overlap with xarray value entries. | 
|---|
| 498 | 486 | */ | 
|---|
| 499 |  | -static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index, | 
|---|
| 500 |  | -		unsigned long size_flag) | 
|---|
|  | 487 | +static void *grab_mapping_entry(struct xa_state *xas, | 
|---|
|  | 488 | +		struct address_space *mapping, unsigned int order) | 
|---|
| 501 | 489 | { | 
|---|
| 502 |  | -	bool pmd_downgrade = false; /* splitting 2MiB entry into 4k entries? */ | 
|---|
| 503 |  | -	void *entry, **slot; | 
|---|
|  | 490 | +	unsigned long index = xas->xa_index; | 
|---|
|  | 491 | +	bool pmd_downgrade;	/* splitting PMD entry into PTE entries? */ | 
|---|
|  | 492 | +	void *entry; | 
|---|
| 504 | 493 |  | 
|---|
| 505 |  | -restart: | 
|---|
| 506 |  | -	xa_lock_irq(&mapping->i_pages); | 
|---|
| 507 |  | -	entry = get_unlocked_mapping_entry(mapping, index, &slot); | 
|---|
| 508 |  | - | 
|---|
| 509 |  | -	if (WARN_ON_ONCE(entry && !radix_tree_exceptional_entry(entry))) { | 
|---|
| 510 |  | -		entry = ERR_PTR(-EIO); | 
|---|
| 511 |  | -		goto out_unlock; | 
|---|
| 512 |  | -	} | 
|---|
|  | 494 | +retry: | 
|---|
|  | 495 | +	pmd_downgrade = false; | 
|---|
|  | 496 | +	xas_lock_irq(xas); | 
|---|
|  | 497 | +	entry = get_unlocked_entry(xas, order); | 
|---|
| 513 | 498 |  | 
|---|
| 514 | 499 | if (entry) { | 
|---|
| 515 |  | -		if (size_flag & RADIX_DAX_PMD) { | 
|---|
| 516 |  | -			if (dax_is_pte_entry(entry)) { | 
|---|
| 517 |  | -				put_unlocked_mapping_entry(mapping, index, | 
|---|
| 518 |  | -						entry); | 
|---|
| 519 |  | -				entry = ERR_PTR(-EEXIST); | 
|---|
| 520 |  | -				goto out_unlock; | 
|---|
| 521 |  | -			} | 
|---|
| 522 |  | -		} else { /* trying to grab a PTE entry */ | 
|---|
|  | 500 | +		if (dax_is_conflict(entry)) | 
|---|
|  | 501 | +			goto fallback; | 
|---|
|  | 502 | +		if (!xa_is_value(entry)) { | 
|---|
|  | 503 | +			xas_set_err(xas, -EIO); | 
|---|
|  | 504 | +			goto out_unlock; | 
|---|
|  | 505 | +		} | 
|---|
|  | 506 | + | 
|---|
|  | 507 | +		if (order == 0) { | 
|---|
| 523 | 508 | if (dax_is_pmd_entry(entry) && | 
|---|
| 524 | 509 | (dax_is_zero_entry(entry) || | 
|---|
| 525 | 510 | dax_is_empty_entry(entry))) { | 
|---|
| .. | .. | 
|---|
| 528 | 513 | } | 
|---|
| 529 | 514 | } | 
|---|
| 530 | 515 |  | 
|---|
| 531 |  | -	/* No entry for given index? Make sure radix tree is big enough. */ | 
|---|
| 532 |  | -	if (!entry || pmd_downgrade) { | 
|---|
| 533 |  | -		int err; | 
|---|
|  | 516 | +	if (pmd_downgrade) { | 
|---|
|  | 517 | +		/* | 
|---|
|  | 518 | +		 * Make sure 'entry' remains valid while we drop | 
|---|
|  | 519 | +		 * the i_pages lock. | 
|---|
|  | 520 | +		 */ | 
|---|
|  | 521 | +		dax_lock_entry(xas, entry); | 
|---|
| 534 | 522 |  | 
|---|
| 535 |  | -		if (pmd_downgrade) { | 
|---|
| 536 |  | -			/* | 
|---|
| 537 |  | -			 * Make sure 'entry' remains valid while we drop | 
|---|
| 538 |  | -			 * the i_pages lock. | 
|---|
| 539 |  | -			 */ | 
|---|
| 540 |  | -			entry = lock_slot(mapping, slot); | 
|---|
| 541 |  | -		} | 
|---|
| 542 |  | - | 
|---|
| 543 |  | -		xa_unlock_irq(&mapping->i_pages); | 
|---|
| 544 | 523 | /* | 
|---|
| 545 | 524 | * Besides huge zero pages the only other thing that gets | 
|---|
| 546 | 525 | * downgraded are empty entries which don't need to be | 
|---|
| 547 | 526 | * unmapped. | 
|---|
| 548 | 527 | */ | 
|---|
| 549 |  | -		if (pmd_downgrade && dax_is_zero_entry(entry)) | 
|---|
| 550 |  | -			unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR, | 
|---|
| 551 |  | -							PG_PMD_NR, false); | 
|---|
| 552 |  | - | 
|---|
| 553 |  | -		err = radix_tree_preload( | 
|---|
| 554 |  | -				mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM); | 
|---|
| 555 |  | -		if (err) { | 
|---|
| 556 |  | -			if (pmd_downgrade) | 
|---|
| 557 |  | -				put_locked_mapping_entry(mapping, index); | 
|---|
| 558 |  | -			return ERR_PTR(err); | 
|---|
| 559 |  | -		} | 
|---|
| 560 |  | -		xa_lock_irq(&mapping->i_pages); | 
|---|
| 561 |  | - | 
|---|
| 562 |  | -		if (!entry) { | 
|---|
| 563 |  | -			/* | 
|---|
| 564 |  | -			 * We needed to drop the i_pages lock while calling | 
|---|
| 565 |  | -			 * radix_tree_preload() and we didn't have an entry to | 
|---|
| 566 |  | -			 * lock.  See if another thread inserted an entry at | 
|---|
| 567 |  | -			 * our index during this time. | 
|---|
| 568 |  | -			 */ | 
|---|
| 569 |  | -			entry = __radix_tree_lookup(&mapping->i_pages, index, | 
|---|
| 570 |  | -					NULL, &slot); | 
|---|
| 571 |  | -			if (entry) { | 
|---|
| 572 |  | -				radix_tree_preload_end(); | 
|---|
| 573 |  | -				xa_unlock_irq(&mapping->i_pages); | 
|---|
| 574 |  | -				goto restart; | 
|---|
| 575 |  | -			} | 
|---|
|  | 528 | +		if (dax_is_zero_entry(entry)) { | 
|---|
|  | 529 | +			xas_unlock_irq(xas); | 
|---|
|  | 530 | +			unmap_mapping_pages(mapping, | 
|---|
|  | 531 | +					xas->xa_index & ~PG_PMD_COLOUR, | 
|---|
|  | 532 | +					PG_PMD_NR, false); | 
|---|
|  | 533 | +			xas_reset(xas); | 
|---|
|  | 534 | +			xas_lock_irq(xas); | 
|---|
| 576 | 535 | } | 
|---|
| 577 | 536 |  | 
|---|
| 578 |  | -		if (pmd_downgrade) { | 
|---|
| 579 |  | -			dax_disassociate_entry(entry, mapping, false); | 
|---|
| 580 |  | -			radix_tree_delete(&mapping->i_pages, index); | 
|---|
| 581 |  | -			mapping->nrexceptional--; | 
|---|
| 582 |  | -			dax_wake_mapping_entry_waiter(mapping, index, entry, | 
|---|
| 583 |  | -					true); | 
|---|
| 584 |  | -		} | 
|---|
| 585 |  | - | 
|---|
| 586 |  | -		entry = dax_radix_locked_entry(0, size_flag | RADIX_DAX_EMPTY); | 
|---|
| 587 |  | - | 
|---|
| 588 |  | -		err = __radix_tree_insert(&mapping->i_pages, index, | 
|---|
| 589 |  | -				dax_radix_order(entry), entry); | 
|---|
| 590 |  | -		radix_tree_preload_end(); | 
|---|
| 591 |  | -		if (err) { | 
|---|
| 592 |  | -			xa_unlock_irq(&mapping->i_pages); | 
|---|
| 593 |  | -			/* | 
|---|
| 594 |  | -			 * Our insertion of a DAX entry failed, most likely | 
|---|
| 595 |  | -			 * because we were inserting a PMD entry and it | 
|---|
| 596 |  | -			 * collided with a PTE sized entry at a different | 
|---|
| 597 |  | -			 * index in the PMD range.  We haven't inserted | 
|---|
| 598 |  | -			 * anything into the radix tree and have no waiters to | 
|---|
| 599 |  | -			 * wake. | 
|---|
| 600 |  | -			 */ | 
|---|
| 601 |  | -			return ERR_PTR(err); | 
|---|
| 602 |  | -		} | 
|---|
| 603 |  | -		/* Good, we have inserted empty locked entry into the tree. */ | 
|---|
| 604 |  | -		mapping->nrexceptional++; | 
|---|
| 605 |  | -		xa_unlock_irq(&mapping->i_pages); | 
|---|
| 606 |  | -		return entry; | 
|---|
|  | 537 | +		dax_disassociate_entry(entry, mapping, false); | 
|---|
|  | 538 | +		xas_store(xas, NULL);	/* undo the PMD join */ | 
|---|
|  | 539 | +		dax_wake_entry(xas, entry, WAKE_ALL); | 
|---|
|  | 540 | +		mapping->nrexceptional--; | 
|---|
|  | 541 | +		entry = NULL; | 
|---|
|  | 542 | +		xas_set(xas, index); | 
|---|
| 607 | 543 | } | 
|---|
| 608 |  | -	entry = lock_slot(mapping, slot); | 
|---|
| 609 |  | - out_unlock: | 
|---|
| 610 |  | -	xa_unlock_irq(&mapping->i_pages); | 
|---|
|  | 544 | + | 
|---|
|  | 545 | +	if (entry) { | 
|---|
|  | 546 | +		dax_lock_entry(xas, entry); | 
|---|
|  | 547 | +	} else { | 
|---|
|  | 548 | +		unsigned long flags = DAX_EMPTY; | 
|---|
|  | 549 | + | 
|---|
|  | 550 | +		if (order > 0) | 
|---|
|  | 551 | +			flags |= DAX_PMD; | 
|---|
|  | 552 | +		entry = dax_make_entry(pfn_to_pfn_t(0), flags); | 
|---|
|  | 553 | +		dax_lock_entry(xas, entry); | 
|---|
|  | 554 | +		if (xas_error(xas)) | 
|---|
|  | 555 | +			goto out_unlock; | 
|---|
|  | 556 | +		mapping->nrexceptional++; | 
|---|
|  | 557 | +	} | 
|---|
|  | 558 | + | 
|---|
|  | 559 | +out_unlock: | 
|---|
|  | 560 | +	xas_unlock_irq(xas); | 
|---|
|  | 561 | +	if (xas_nomem(xas, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM)) | 
|---|
|  | 562 | +		goto retry; | 
|---|
|  | 563 | +	if (xas->xa_node == XA_ERROR(-ENOMEM)) | 
|---|
|  | 564 | +		return xa_mk_internal(VM_FAULT_OOM); | 
|---|
|  | 565 | +	if (xas_error(xas)) | 
|---|
|  | 566 | +		return xa_mk_internal(VM_FAULT_SIGBUS); | 
|---|
| 611 | 567 | return entry; | 
|---|
|  | 568 | +fallback: | 
|---|
|  | 569 | +	xas_unlock_irq(xas); | 
|---|
|  | 570 | +	return xa_mk_internal(VM_FAULT_FALLBACK); | 
|---|
| 612 | 571 | } | 
|---|
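As the comment above notes, grab_mapping_entry() no longer returns an ERR_PTR: failures come back as a vm_fault_t (VM_FAULT_FALLBACK, VM_FAULT_OOM or VM_FAULT_SIGBUS) encoded in an XArray internal entry, so a fault-path caller can decode it with the generic XArray helpers. A minimal, non-compilable sketch assuming the surrounding fault-handler context:

```c
/* Illustrative caller-side check in a fault handler. */
entry = grab_mapping_entry(&xas, mapping, 0);
if (xa_is_internal(entry)) {
	ret = xa_to_internal(entry);	/* already a VM_FAULT_* code */
	goto out;
}
```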
| 613 | 572 |  | 
|---|
| 614 | 573 | /** | 
|---|
| 615 |  | - * dax_layout_busy_page - find first pinned page in @mapping | 
|---|
|  | 574 | + * dax_layout_busy_page_range - find first pinned page in @mapping | 
|---|
| 616 | 575 | * @mapping: address space to scan for a page with ref count > 1 | 
|---|
|  | 576 | + * @start: Starting offset. Page containing 'start' is included. | 
|---|
|  | 577 | + * @end: End offset. Page containing 'end' is included. If 'end' is LLONG_MAX, | 
|---|
|  | 578 | + *       pages from 'start' till the end of file are included. | 
|---|
| 617 | 579 | * | 
|---|
| 618 | 580 | * DAX requires ZONE_DEVICE mapped pages. These pages are never | 
|---|
| 619 | 581 | * 'onlined' to the page allocator so they are considered idle when | 
|---|
| .. | .. | 
|---|
| 626 | 588 | * to be able to run unmap_mapping_range() and subsequently not race | 
|---|
| 627 | 589 | * mapping_mapped() becoming true. | 
|---|
| 628 | 590 | */ | 
|---|
| 629 |  | -struct page *dax_layout_busy_page(struct address_space *mapping) | 
|---|
|  | 591 | +struct page *dax_layout_busy_page_range(struct address_space *mapping, | 
|---|
|  | 592 | +					loff_t start, loff_t end) | 
|---|
| 630 | 593 | { | 
|---|
| 631 |  | -	pgoff_t	indices[PAGEVEC_SIZE]; | 
|---|
|  | 594 | +	void *entry; | 
|---|
|  | 595 | +	unsigned int scanned = 0; | 
|---|
| 632 | 596 | struct page *page = NULL; | 
|---|
| 633 |  | -	struct pagevec pvec; | 
|---|
| 634 |  | -	pgoff_t	index, end; | 
|---|
| 635 |  | -	unsigned i; | 
|---|
|  | 597 | +	pgoff_t start_idx = start >> PAGE_SHIFT; | 
|---|
|  | 598 | +	pgoff_t end_idx; | 
|---|
|  | 599 | +	XA_STATE(xas, &mapping->i_pages, start_idx); | 
|---|
| 636 | 600 |  | 
|---|
| 637 | 601 | /* | 
|---|
| 638 | 602 | * In the 'limited' case get_user_pages() for dax is disabled. | 
|---|
| .. | .. | 
|---|
| 643 | 607 | if (!dax_mapping(mapping) || !mapping_mapped(mapping)) | 
|---|
| 644 | 608 | return NULL; | 
|---|
| 645 | 609 |  | 
|---|
| 646 |  | -	pagevec_init(&pvec); | 
|---|
| 647 |  | -	index = 0; | 
|---|
| 648 |  | -	end = -1; | 
|---|
| 649 |  | - | 
|---|
|  | 610 | +	/* If end == LLONG_MAX, all pages from start to till end of file */ | 
|---|
|  | 611 | +	if (end == LLONG_MAX) | 
|---|
|  | 612 | +		end_idx = ULONG_MAX; | 
|---|
|  | 613 | +	else | 
|---|
|  | 614 | +		end_idx = end >> PAGE_SHIFT; | 
|---|
| 650 | 615 | /* | 
|---|
| 651 | 616 | * If we race get_user_pages_fast() here either we'll see the | 
|---|
| 652 |  | -	 * elevated page count in the pagevec_lookup and wait, or | 
|---|
|  | 617 | +	 * elevated page count in the iteration and wait, or | 
|---|
| 653 | 618 | * get_user_pages_fast() will see that the page it took a reference | 
|---|
| 654 | 619 | * against is no longer mapped in the page tables and bail to the | 
|---|
| 655 | 620 | * get_user_pages() slow path.  The slow path is protected by | 
|---|
| 656 | 621 | * pte_lock() and pmd_lock(). New references are not taken without | 
|---|
| 657 |  | -	 * holding those locks, and unmap_mapping_range() will not zero the | 
|---|
|  | 622 | +	 * holding those locks, and unmap_mapping_pages() will not zero the | 
|---|
| 658 | 623 | * pte or pmd without holding the respective lock, so we are | 
|---|
| 659 | 624 | * guaranteed to either see new references or prevent new | 
|---|
| 660 | 625 | * references from being established. | 
|---|
| 661 | 626 | */ | 
|---|
| 662 |  | -	unmap_mapping_range(mapping, 0, 0, 0); | 
|---|
|  | 627 | +	unmap_mapping_pages(mapping, start_idx, end_idx - start_idx + 1, 0); | 
|---|
| 663 | 628 |  | 
|---|
| 664 |  | -	while (index < end && pagevec_lookup_entries(&pvec, mapping, index, | 
|---|
| 665 |  | -				min(end - index, (pgoff_t)PAGEVEC_SIZE), | 
|---|
| 666 |  | -				indices)) { | 
|---|
| 667 |  | -		pgoff_t nr_pages = 1; | 
|---|
| 668 |  | - | 
|---|
| 669 |  | -		for (i = 0; i < pagevec_count(&pvec); i++) { | 
|---|
| 670 |  | -			struct page *pvec_ent = pvec.pages[i]; | 
|---|
| 671 |  | -			void *entry; | 
|---|
| 672 |  | - | 
|---|
| 673 |  | -			index = indices[i]; | 
|---|
| 674 |  | -			if (index >= end) | 
|---|
| 675 |  | -				break; | 
|---|
| 676 |  | - | 
|---|
| 677 |  | -			if (WARN_ON_ONCE( | 
|---|
| 678 |  | -			     !radix_tree_exceptional_entry(pvec_ent))) | 
|---|
| 679 |  | -				continue; | 
|---|
| 680 |  | - | 
|---|
| 681 |  | -			xa_lock_irq(&mapping->i_pages); | 
|---|
| 682 |  | -			entry = get_unlocked_mapping_entry(mapping, index, NULL); | 
|---|
| 683 |  | -			if (entry) { | 
|---|
| 684 |  | -				page = dax_busy_page(entry); | 
|---|
| 685 |  | -				/* | 
|---|
| 686 |  | -				 * Account for multi-order entries at | 
|---|
| 687 |  | -				 * the end of the pagevec. | 
|---|
| 688 |  | -				 */ | 
|---|
| 689 |  | -				if (i + 1 >= pagevec_count(&pvec)) | 
|---|
| 690 |  | -					nr_pages = 1UL << dax_radix_order(entry); | 
|---|
| 691 |  | -			} | 
|---|
| 692 |  | -			put_unlocked_mapping_entry(mapping, index, entry); | 
|---|
| 693 |  | -			xa_unlock_irq(&mapping->i_pages); | 
|---|
| 694 |  | -			if (page) | 
|---|
| 695 |  | -				break; | 
|---|
| 696 |  | -		} | 
|---|
| 697 |  | - | 
|---|
| 698 |  | -		/* | 
|---|
| 699 |  | -		 * We don't expect normal struct page entries to exist in our | 
|---|
| 700 |  | -		 * tree, but we keep these pagevec calls so that this code is | 
|---|
| 701 |  | -		 * consistent with the common pattern for handling pagevecs | 
|---|
| 702 |  | -		 * throughout the kernel. | 
|---|
| 703 |  | -		 */ | 
|---|
| 704 |  | -		pagevec_remove_exceptionals(&pvec); | 
|---|
| 705 |  | -		pagevec_release(&pvec); | 
|---|
| 706 |  | -		index += nr_pages; | 
|---|
| 707 |  | - | 
|---|
|  | 629 | +	xas_lock_irq(&xas); | 
|---|
|  | 630 | +	xas_for_each(&xas, entry, end_idx) { | 
|---|
|  | 631 | +		if (WARN_ON_ONCE(!xa_is_value(entry))) | 
|---|
|  | 632 | +			continue; | 
|---|
|  | 633 | +		if (unlikely(dax_is_locked(entry))) | 
|---|
|  | 634 | +			entry = get_unlocked_entry(&xas, 0); | 
|---|
|  | 635 | +		if (entry) | 
|---|
|  | 636 | +			page = dax_busy_page(entry); | 
|---|
|  | 637 | +		put_unlocked_entry(&xas, entry, WAKE_NEXT); | 
|---|
| 708 | 638 | if (page) | 
|---|
| 709 | 639 | break; | 
|---|
|  | 640 | +		if (++scanned % XA_CHECK_SCHED) | 
|---|
|  | 641 | +			continue; | 
|---|
|  | 642 | + | 
|---|
|  | 643 | +		xas_pause(&xas); | 
|---|
|  | 644 | +		xas_unlock_irq(&xas); | 
|---|
|  | 645 | +		cond_resched(); | 
|---|
|  | 646 | +		xas_lock_irq(&xas); | 
|---|
| 710 | 647 | } | 
|---|
|  | 648 | +	xas_unlock_irq(&xas); | 
|---|
| 711 | 649 | return page; | 
|---|
|  | 650 | +} | 
|---|
|  | 651 | +EXPORT_SYMBOL_GPL(dax_layout_busy_page_range); | 
|---|
|  | 652 | + | 
|---|
|  | 653 | +struct page *dax_layout_busy_page(struct address_space *mapping) | 
|---|
|  | 654 | +{ | 
|---|
|  | 655 | +	return dax_layout_busy_page_range(mapping, 0, LLONG_MAX); | 
|---|
| 712 | 656 | } | 
|---|
| 713 | 657 | EXPORT_SYMBOL_GPL(dax_layout_busy_page); | 
|---|
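dax_layout_busy_page_range() now walks the XArray directly instead of batching through a pagevec, pausing every XA_CHECK_SCHED entries to reschedule, and dax_layout_busy_page() becomes a whole-file wrapper. Filesystems call these before truncating or punching a hole so that no pinned DAX page has its block mapping pulled out from under get_user_pages(). A hedged sketch of a caller (helper name and error handling are illustrative, not from this patch; real callers sleep until the page refcount drops back to 1):

```c
/* Illustrative caller: refuse layout changes while any DAX page in the
 * range is still pinned by get_user_pages(). */
static int dax_break_layouts(struct inode *inode, loff_t start, loff_t end)
{
	struct page *page;

	page = dax_layout_busy_page_range(inode->i_mapping, start, end);
	if (page)
		return -EBUSY;	/* real callers wait on the refcount instead */
	return 0;
}
```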
| 714 | 658 |  | 
|---|
| 715 |  | -static int __dax_invalidate_mapping_entry(struct address_space *mapping, | 
|---|
|  | 659 | +static int __dax_invalidate_entry(struct address_space *mapping, | 
|---|
| 716 | 660 | pgoff_t index, bool trunc) | 
|---|
| 717 | 661 | { | 
|---|
|  | 662 | +	XA_STATE(xas, &mapping->i_pages, index); | 
|---|
| 718 | 663 | int ret = 0; | 
|---|
| 719 | 664 | void *entry; | 
|---|
| 720 |  | -	struct radix_tree_root *pages = &mapping->i_pages; | 
|---|
| 721 | 665 |  | 
|---|
| 722 |  | -	xa_lock_irq(pages); | 
|---|
| 723 |  | -	entry = get_unlocked_mapping_entry(mapping, index, NULL); | 
|---|
| 724 |  | -	if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry))) | 
|---|
|  | 666 | +	xas_lock_irq(&xas); | 
|---|
|  | 667 | +	entry = get_unlocked_entry(&xas, 0); | 
|---|
|  | 668 | +	if (!entry || WARN_ON_ONCE(!xa_is_value(entry))) | 
|---|
| 725 | 669 | goto out; | 
|---|
| 726 | 670 | if (!trunc && | 
|---|
| 727 |  | -	    (radix_tree_tag_get(pages, index, PAGECACHE_TAG_DIRTY) || | 
|---|
| 728 |  | -	     radix_tree_tag_get(pages, index, PAGECACHE_TAG_TOWRITE))) | 
|---|
|  | 671 | +	    (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) || | 
|---|
|  | 672 | +	     xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE))) | 
|---|
| 729 | 673 | goto out; | 
|---|
| 730 | 674 | dax_disassociate_entry(entry, mapping, trunc); | 
|---|
| 731 |  | -	radix_tree_delete(pages, index); | 
|---|
|  | 675 | +	xas_store(&xas, NULL); | 
|---|
| 732 | 676 | mapping->nrexceptional--; | 
|---|
| 733 | 677 | ret = 1; | 
|---|
| 734 | 678 | out: | 
|---|
| 735 |  | -	put_unlocked_mapping_entry(mapping, index, entry); | 
|---|
| 736 |  | -	xa_unlock_irq(pages); | 
|---|
|  | 679 | +	put_unlocked_entry(&xas, entry, WAKE_ALL); | 
|---|
|  | 680 | +	xas_unlock_irq(&xas); | 
|---|
| 737 | 681 | return ret; | 
|---|
| 738 | 682 | } | 
|---|
|  | 683 | + | 
|---|
| 739 | 684 | /* | 
|---|
| 740 |  | - * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree | 
|---|
| 741 |  | - * entry to get unlocked before deleting it. | 
|---|
|  | 685 | + * Delete DAX entry at @index from @mapping.  Wait for it | 
|---|
|  | 686 | + * to be unlocked before deleting it. | 
|---|
| 742 | 687 | */ | 
|---|
| 743 | 688 | int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index) | 
|---|
| 744 | 689 | { | 
|---|
| 745 |  | -	int ret = __dax_invalidate_mapping_entry(mapping, index, true); | 
|---|
|  | 690 | +	int ret = __dax_invalidate_entry(mapping, index, true); | 
|---|
| 746 | 691 |  | 
|---|
| 747 | 692 | /* | 
|---|
| 748 | 693 | * This gets called from truncate / punch_hole path. As such, the caller | 
|---|
| 749 | 694 | * must hold locks protecting against concurrent modifications of the | 
|---|
| 750 |  | -	 * radix tree (usually fs-private i_mmap_sem for writing). Since the | 
|---|
| 751 |  | -	 * caller has seen exceptional entry for this index, we better find it | 
|---|
|  | 695 | +	 * page cache (usually fs-private i_mmap_sem for writing). Since the | 
|---|
|  | 696 | +	 * caller has seen a DAX entry for this index, we better find it | 
|---|
| 752 | 697 | * at that index as well... | 
|---|
| 753 | 698 | */ | 
|---|
| 754 | 699 | WARN_ON_ONCE(!ret); | 
|---|
| .. | .. | 
|---|
| 756 | 701 | } | 
|---|
| 757 | 702 |  | 
|---|
| 758 | 703 | /* | 
|---|
| 759 |  | - * Invalidate exceptional DAX entry if it is clean. | 
|---|
|  | 704 | + * Invalidate DAX entry if it is clean. | 
|---|
| 760 | 705 | */ | 
|---|
| 761 | 706 | int dax_invalidate_mapping_entry_sync(struct address_space *mapping, | 
|---|
| 762 | 707 | pgoff_t index) | 
|---|
| 763 | 708 | { | 
|---|
| 764 |  | -	return __dax_invalidate_mapping_entry(mapping, index, false); | 
|---|
|  | 709 | +	return __dax_invalidate_entry(mapping, index, false); | 
|---|
| 765 | 710 | } | 
|---|
| 766 | 711 |  | 
|---|
| 767 |  | -static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev, | 
|---|
| 768 |  | -		sector_t sector, size_t size, struct page *to, | 
|---|
| 769 |  | -		unsigned long vaddr) | 
|---|
|  | 712 | +static int copy_cow_page_dax(struct block_device *bdev, struct dax_device *dax_dev, | 
|---|
|  | 713 | +			     sector_t sector, struct page *to, unsigned long vaddr) | 
|---|
| 770 | 714 | { | 
|---|
| 771 | 715 | void *vto, *kaddr; | 
|---|
| 772 | 716 | pgoff_t pgoff; | 
|---|
| 773 | 717 | long rc; | 
|---|
| 774 | 718 | int id; | 
|---|
| 775 | 719 |  | 
|---|
| 776 |  | -	rc = bdev_dax_pgoff(bdev, sector, size, &pgoff); | 
|---|
|  | 720 | +	rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff); | 
|---|
| 777 | 721 | if (rc) | 
|---|
| 778 | 722 | return rc; | 
|---|
| 779 | 723 |  | 
|---|
| 780 | 724 | id = dax_read_lock(); | 
|---|
| 781 |  | -	rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, NULL); | 
|---|
|  | 725 | +	rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(PAGE_SIZE), &kaddr, NULL); | 
|---|
| 782 | 726 | if (rc < 0) { | 
|---|
| 783 | 727 | dax_read_unlock(id); | 
|---|
| 784 | 728 | return rc; | 
|---|
| 785 | 729 | } | 
|---|
| 786 | 730 | vto = kmap_atomic(to); | 
|---|
|  | 731 | +#ifdef CONFIG_ARM | 
|---|
|  | 732 | +#ifndef copy_user_page | 
|---|
|  | 733 | +#define copy_user_page(to, from, vaddr, pg)	copy_page(to, from) | 
|---|
|  | 734 | +#endif | 
|---|
|  | 735 | +#endif | 
|---|
| 787 | 736 | copy_user_page(vto, (void __force *)kaddr, vaddr, to); | 
|---|
| 788 | 737 | kunmap_atomic(vto); | 
|---|
| 789 | 738 | dax_read_unlock(id); | 
|---|
| .. | .. | 
|---|
| 797 | 746 | * already in the tree, we will skip the insertion and just dirty the PMD as | 
|---|
| 798 | 747 | * appropriate. | 
|---|
| 799 | 748 | */ | 
|---|
| 800 |  | -static void *dax_insert_mapping_entry(struct address_space *mapping, | 
|---|
| 801 |  | -				      struct vm_fault *vmf, | 
|---|
| 802 |  | -				      void *entry, pfn_t pfn_t, | 
|---|
| 803 |  | -				      unsigned long flags, bool dirty) | 
|---|
|  | 749 | +static void *dax_insert_entry(struct xa_state *xas, | 
|---|
|  | 750 | +		struct address_space *mapping, struct vm_fault *vmf, | 
|---|
|  | 751 | +		void *entry, pfn_t pfn, unsigned long flags, bool dirty) | 
|---|
| 804 | 752 | { | 
|---|
| 805 |  | -	struct radix_tree_root *pages = &mapping->i_pages; | 
|---|
| 806 |  | -	unsigned long pfn = pfn_t_to_pfn(pfn_t); | 
|---|
| 807 |  | -	pgoff_t index = vmf->pgoff; | 
|---|
| 808 |  | -	void *new_entry; | 
|---|
|  | 753 | +	void *new_entry = dax_make_entry(pfn, flags); | 
|---|
| 809 | 754 |  | 
|---|
| 810 | 755 | if (dirty) | 
|---|
| 811 | 756 | __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); | 
|---|
| 812 | 757 |  | 
|---|
| 813 |  | -	if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) { | 
|---|
|  | 758 | +	if (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE)) { | 
|---|
|  | 759 | +		unsigned long index = xas->xa_index; | 
|---|
| 814 | 760 | /* we are replacing a zero page with block mapping */ | 
|---|
| 815 | 761 | if (dax_is_pmd_entry(entry)) | 
|---|
| 816 | 762 | unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR, | 
|---|
| 817 |  | -							PG_PMD_NR, false); | 
|---|
|  | 763 | +					PG_PMD_NR, false); | 
|---|
| 818 | 764 | else /* pte entry */ | 
|---|
| 819 |  | -			unmap_mapping_pages(mapping, vmf->pgoff, 1, false); | 
|---|
|  | 765 | +			unmap_mapping_pages(mapping, index, 1, false); | 
|---|
| 820 | 766 | } | 
|---|
| 821 | 767 |  | 
|---|
| 822 |  | -	xa_lock_irq(pages); | 
|---|
| 823 |  | -	new_entry = dax_radix_locked_entry(pfn, flags); | 
|---|
| 824 |  | -	if (dax_entry_size(entry) != dax_entry_size(new_entry)) { | 
|---|
|  | 768 | +	xas_reset(xas); | 
|---|
|  | 769 | +	xas_lock_irq(xas); | 
|---|
|  | 770 | +	if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { | 
|---|
|  | 771 | +		void *old; | 
|---|
|  | 772 | + | 
|---|
| 825 | 773 | dax_disassociate_entry(entry, mapping, false); | 
|---|
| 826 | 774 | dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address); | 
|---|
| 827 |  | -	} | 
|---|
| 828 |  | - | 
|---|
| 829 |  | -	if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { | 
|---|
| 830 | 775 | /* | 
|---|
| 831 |  | -		 * Only swap our new entry into the radix tree if the current | 
|---|
|  | 776 | +		 * Only swap our new entry into the page cache if the current | 
|---|
| 832 | 777 | * entry is a zero page or an empty entry.  If a normal PTE or | 
|---|
| 833 |  | -		 * PMD entry is already in the tree, we leave it alone.  This | 
|---|
|  | 778 | +		 * PMD entry is already in the cache, we leave it alone.  This | 
|---|
| 834 | 779 | * means that if we are trying to insert a PTE and the | 
|---|
| 835 | 780 | * existing entry is a PMD, we will just leave the PMD in the | 
|---|
| 836 | 781 | * tree and dirty it if necessary. | 
|---|
| 837 | 782 | */ | 
|---|
| 838 |  | -		struct radix_tree_node *node; | 
|---|
| 839 |  | -		void **slot; | 
|---|
| 840 |  | -		void *ret; | 
|---|
| 841 |  | - | 
|---|
| 842 |  | -		ret = __radix_tree_lookup(pages, index, &node, &slot); | 
|---|
| 843 |  | -		WARN_ON_ONCE(ret != entry); | 
|---|
| 844 |  | -		__radix_tree_replace(pages, node, slot, | 
|---|
| 845 |  | -				     new_entry, NULL); | 
|---|
|  | 783 | +		old = dax_lock_entry(xas, new_entry); | 
|---|
|  | 784 | +		WARN_ON_ONCE(old != xa_mk_value(xa_to_value(entry) | | 
|---|
|  | 785 | +					DAX_LOCKED)); | 
|---|
| 846 | 786 | entry = new_entry; | 
|---|
|  | 787 | +	} else { | 
|---|
|  | 788 | +		xas_load(xas);	/* Walk the xa_state */ | 
|---|
| 847 | 789 | } | 
|---|
| 848 | 790 |  | 
|---|
| 849 | 791 | if (dirty) | 
|---|
| 850 |  | -		radix_tree_tag_set(pages, index, PAGECACHE_TAG_DIRTY); | 
|---|
|  | 792 | +		xas_set_mark(xas, PAGECACHE_TAG_DIRTY); | 
|---|
| 851 | 793 |  | 
|---|
| 852 |  | -	xa_unlock_irq(pages); | 
|---|
|  | 794 | +	xas_unlock_irq(xas); | 
|---|
| 853 | 795 | return entry; | 
|---|
| 854 | 796 | } | 
|---|
| 855 | 797 |  | 
|---|
| 856 |  | -static inline unsigned long | 
|---|
| 857 |  | -pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma) | 
|---|
|  | 798 | +static inline | 
|---|
|  | 799 | +unsigned long pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma) | 
|---|
| 858 | 800 | { | 
|---|
| 859 | 801 | unsigned long address; | 
|---|
| 860 | 802 |  | 
|---|
| .. | .. | 
|---|
| 864 | 806 | } | 
|---|
| 865 | 807 |  | 
|---|
| 866 | 808 | /* Walk all mappings of a given index of a file and writeprotect them */ | 
|---|
| 867 |  | -static void dax_mapping_entry_mkclean(struct address_space *mapping, | 
|---|
| 868 |  | -				      pgoff_t index, unsigned long pfn) | 
|---|
|  | 809 | +static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index, | 
|---|
|  | 810 | +		unsigned long pfn) | 
|---|
| 869 | 811 | { | 
|---|
| 870 | 812 | struct vm_area_struct *vma; | 
|---|
| 871 | 813 | pte_t pte, *ptep = NULL; | 
|---|
| .. | .. | 
|---|
| 874 | 816 |  | 
|---|
| 875 | 817 | i_mmap_lock_read(mapping); | 
|---|
| 876 | 818 | vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) { | 
|---|
| 877 |  | -		unsigned long address, start, end; | 
|---|
|  | 819 | +		struct mmu_notifier_range range; | 
|---|
|  | 820 | +		unsigned long address; | 
|---|
| 878 | 821 |  | 
|---|
| 879 | 822 | cond_resched(); | 
|---|
| 880 | 823 |  | 
|---|
| .. | .. | 
|---|
| 884 | 827 | address = pgoff_address(index, vma); | 
|---|
| 885 | 828 |  | 
|---|
| 886 | 829 | /* | 
|---|
| 887 |  | -		 * Note because we provide start/end to follow_pte_pmd it will | 
|---|
| 888 |  | -		 * call mmu_notifier_invalidate_range_start() on our behalf | 
|---|
| 889 |  | -		 * before taking any lock. | 
|---|
|  | 830 | +		 * follow_invalidate_pte() will use the range to call | 
|---|
|  | 831 | +		 * mmu_notifier_invalidate_range_start() on our behalf before | 
|---|
|  | 832 | +		 * taking any lock. | 
|---|
| 890 | 833 | */ | 
|---|
| 891 |  | -		if (follow_pte_pmd(vma->vm_mm, address, &start, &end, &ptep, &pmdp, &ptl)) | 
|---|
|  | 834 | +		if (follow_invalidate_pte(vma->vm_mm, address, &range, &ptep, | 
|---|
|  | 835 | +					  &pmdp, &ptl)) | 
|---|
| 892 | 836 | continue; | 
|---|
| 893 | 837 |  | 
|---|
| 894 | 838 | /* | 
|---|
| .. | .. | 
|---|
| 907 | 851 | if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp)) | 
|---|
| 908 | 852 | goto unlock_pmd; | 
|---|
| 909 | 853 |  | 
|---|
| 910 |  | -			flush_cache_page(vma, address, pfn); | 
|---|
|  | 854 | +			flush_cache_range(vma, address, | 
|---|
|  | 855 | +					  address + HPAGE_PMD_SIZE); | 
|---|
| 911 | 856 | pmd = pmdp_invalidate(vma, address, pmdp); | 
|---|
| 912 | 857 | pmd = pmd_wrprotect(pmd); | 
|---|
| 913 | 858 | pmd = pmd_mkclean(pmd); | 
|---|
| .. | .. | 
|---|
| 930 | 875 | pte_unmap_unlock(ptep, ptl); | 
|---|
| 931 | 876 | } | 
|---|
| 932 | 877 |  | 
|---|
| 933 |  | -		mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); | 
|---|
|  | 878 | +		mmu_notifier_invalidate_range_end(&range); | 
|---|
| 934 | 879 | } | 
|---|
| 935 | 880 | i_mmap_unlock_read(mapping); | 
|---|
| 936 | 881 | } | 
|---|
| 937 | 882 |  | 
|---|
| 938 |  | -static int dax_writeback_one(struct dax_device *dax_dev, | 
|---|
| 939 |  | -		struct address_space *mapping, pgoff_t index, void *entry) | 
|---|
|  | 883 | +static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev, | 
|---|
|  | 884 | +		struct address_space *mapping, void *entry) | 
|---|
| 940 | 885 | { | 
|---|
| 941 |  | -	struct radix_tree_root *pages = &mapping->i_pages; | 
|---|
| 942 |  | -	void *entry2, **slot; | 
|---|
| 943 |  | -	unsigned long pfn; | 
|---|
|  | 886 | +	unsigned long pfn, index, count; | 
|---|
| 944 | 887 | long ret = 0; | 
|---|
| 945 |  | -	size_t size; | 
|---|
| 946 | 888 |  | 
|---|
| 947 | 889 | /* | 
|---|
| 948 | 890 | * A page got tagged dirty in DAX mapping? Something is seriously | 
|---|
| 949 | 891 | * wrong. | 
|---|
| 950 | 892 | */ | 
|---|
| 951 |  | -	if (WARN_ON(!radix_tree_exceptional_entry(entry))) | 
|---|
|  | 893 | +	if (WARN_ON(!xa_is_value(entry))) | 
|---|
| 952 | 894 | return -EIO; | 
|---|
| 953 | 895 |  | 
|---|
| 954 |  | -	xa_lock_irq(pages); | 
|---|
| 955 |  | -	entry2 = get_unlocked_mapping_entry(mapping, index, &slot); | 
|---|
| 956 |  | -	/* Entry got punched out / reallocated? */ | 
|---|
| 957 |  | -	if (!entry2 || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry2))) | 
|---|
| 958 |  | -		goto put_unlocked; | 
|---|
| 959 |  | -	/* | 
|---|
| 960 |  | -	 * Entry got reallocated elsewhere? No need to writeback. We have to | 
|---|
| 961 |  | -	 * compare pfns as we must not bail out due to difference in lockbit | 
|---|
| 962 |  | -	 * or entry type. | 
|---|
| 963 |  | -	 */ | 
|---|
| 964 |  | -	if (dax_radix_pfn(entry2) != dax_radix_pfn(entry)) | 
|---|
| 965 |  | -		goto put_unlocked; | 
|---|
| 966 |  | -	if (WARN_ON_ONCE(dax_is_empty_entry(entry) || | 
|---|
| 967 |  | -				dax_is_zero_entry(entry))) { | 
|---|
| 968 |  | -		ret = -EIO; | 
|---|
| 969 |  | -		goto put_unlocked; | 
|---|
|  | 896 | +	if (unlikely(dax_is_locked(entry))) { | 
|---|
|  | 897 | +		void *old_entry = entry; | 
|---|
|  | 898 | + | 
|---|
|  | 899 | +		entry = get_unlocked_entry(xas, 0); | 
|---|
|  | 900 | + | 
|---|
|  | 901 | +		/* Entry got punched out / reallocated? */ | 
|---|
|  | 902 | +		if (!entry || WARN_ON_ONCE(!xa_is_value(entry))) | 
|---|
|  | 903 | +			goto put_unlocked; | 
|---|
|  | 904 | +		/* | 
|---|
|  | 905 | +		 * Entry got reallocated elsewhere? No need to writeback. | 
|---|
|  | 906 | +		 * We have to compare pfns as we must not bail out due to | 
|---|
|  | 907 | +		 * difference in lockbit or entry type. | 
|---|
|  | 908 | +		 */ | 
|---|
|  | 909 | +		if (dax_to_pfn(old_entry) != dax_to_pfn(entry)) | 
|---|
|  | 910 | +			goto put_unlocked; | 
|---|
|  | 911 | +		if (WARN_ON_ONCE(dax_is_empty_entry(entry) || | 
|---|
|  | 912 | +					dax_is_zero_entry(entry))) { | 
|---|
|  | 913 | +			ret = -EIO; | 
|---|
|  | 914 | +			goto put_unlocked; | 
|---|
|  | 915 | +		} | 
|---|
|  | 916 | + | 
|---|
|  | 917 | +		/* Another fsync thread may have already done this entry */ | 
|---|
|  | 918 | +		if (!xas_get_mark(xas, PAGECACHE_TAG_TOWRITE)) | 
|---|
|  | 919 | +			goto put_unlocked; | 
|---|
| 970 | 920 | } | 
|---|
| 971 | 921 |  | 
|---|
| 972 |  | -	/* Another fsync thread may have already written back this entry */ | 
|---|
| 973 |  | -	if (!radix_tree_tag_get(pages, index, PAGECACHE_TAG_TOWRITE)) | 
|---|
| 974 |  | -		goto put_unlocked; | 
|---|
| 975 | 922 | /* Lock the entry to serialize with page faults */ | 
|---|
| 976 |  | -	entry = lock_slot(mapping, slot); | 
|---|
|  | 923 | +	dax_lock_entry(xas, entry); | 
|---|
|  | 924 | + | 
|---|
| 977 | 925 | /* | 
|---|
| 978 | 926 | * We can clear the tag now but we have to be careful so that concurrent | 
|---|
| 979 | 927 | * dax_writeback_one() calls for the same index cannot finish before we | 
|---|
| .. | .. | 
|---|
| 981 | 929 | * at the entry only under the i_pages lock and once they do that | 
|---|
| 982 | 930 | * they will see the entry locked and wait for it to unlock. | 
|---|
| 983 | 931 | */ | 
|---|
| 984 |  | -	radix_tree_tag_clear(pages, index, PAGECACHE_TAG_TOWRITE); | 
|---|
| 985 |  | -	xa_unlock_irq(pages); | 
|---|
|  | 932 | +	xas_clear_mark(xas, PAGECACHE_TAG_TOWRITE); | 
|---|
|  | 933 | +	xas_unlock_irq(xas); | 
|---|
| 986 | 934 |  | 
|---|
| 987 | 935 | /* | 
|---|
| 988 |  | -	 * Even if dax_writeback_mapping_range() was given a wbc->range_start | 
|---|
| 989 |  | -	 * in the middle of a PMD, the 'index' we are given will be aligned to | 
|---|
| 990 |  | -	 * the start index of the PMD, as will the pfn we pull from 'entry'. | 
|---|
|  | 936 | +	 * If dax_writeback_mapping_range() was given a wbc->range_start | 
|---|
|  | 937 | +	 * in the middle of a PMD, the 'index' we use needs to be | 
|---|
|  | 938 | +	 * aligned to the start of the PMD. | 
|---|
| 991 | 939 | * This allows us to flush for PMD_SIZE and not have to worry about | 
|---|
| 992 | 940 | * partial PMD writebacks. | 
|---|
| 993 | 941 | */ | 
|---|
| 994 |  | -	pfn = dax_radix_pfn(entry); | 
|---|
| 995 |  | -	size = PAGE_SIZE << dax_radix_order(entry); | 
|---|
|  | 942 | +	pfn = dax_to_pfn(entry); | 
|---|
|  | 943 | +	count = 1UL << dax_entry_order(entry); | 
|---|
|  | 944 | +	index = xas->xa_index & ~(count - 1); | 
|---|
| 996 | 945 |  | 
|---|
| 997 |  | -	dax_mapping_entry_mkclean(mapping, index, pfn); | 
|---|
| 998 |  | -	dax_flush(dax_dev, page_address(pfn_to_page(pfn)), size); | 
|---|
|  | 946 | +	dax_entry_mkclean(mapping, index, pfn); | 
|---|
|  | 947 | +	dax_flush(dax_dev, page_address(pfn_to_page(pfn)), count * PAGE_SIZE); | 
|---|
| 999 | 948 | /* | 
|---|
| 1000 | 949 | * After we have flushed the cache, we can clear the dirty tag. There | 
|---|
| 1001 | 950 | * cannot be new dirty data in the pfn after the flush has completed as | 
|---|
| 1002 | 951 | * the pfn mappings are writeprotected and fault waits for mapping | 
|---|
| 1003 | 952 | * entry lock. | 
|---|
| 1004 | 953 | */ | 
|---|
| 1005 |  | -	xa_lock_irq(pages); | 
|---|
| 1006 |  | -	radix_tree_tag_clear(pages, index, PAGECACHE_TAG_DIRTY); | 
|---|
| 1007 |  | -	xa_unlock_irq(pages); | 
|---|
| 1008 |  | -	trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT); | 
|---|
| 1009 |  | -	put_locked_mapping_entry(mapping, index); | 
|---|
|  | 954 | +	xas_reset(xas); | 
|---|
|  | 955 | +	xas_lock_irq(xas); | 
|---|
|  | 956 | +	xas_store(xas, entry); | 
|---|
|  | 957 | +	xas_clear_mark(xas, PAGECACHE_TAG_DIRTY); | 
|---|
|  | 958 | +	dax_wake_entry(xas, entry, WAKE_NEXT); | 
|---|
|  | 959 | + | 
|---|
|  | 960 | +	trace_dax_writeback_one(mapping->host, index, count); | 
|---|
| 1010 | 961 | return ret; | 
|---|
| 1011 | 962 |  | 
|---|
| 1012 | 963 | put_unlocked: | 
|---|
| 1013 |  | -	put_unlocked_mapping_entry(mapping, index, entry2); | 
|---|
| 1014 |  | -	xa_unlock_irq(pages); | 
|---|
|  | 964 | +	put_unlocked_entry(xas, entry, WAKE_NEXT); | 
|---|
| 1015 | 965 | return ret; | 
|---|
| 1016 | 966 | } | 
|---|
| 1017 | 967 |  | 
|---|
| .. | .. | 
|---|
| 1021 | 971 | * on persistent storage prior to completion of the operation. | 
|---|
| 1022 | 972 | */ | 
|---|
| 1023 | 973 | int dax_writeback_mapping_range(struct address_space *mapping, | 
|---|
| 1024 |  | -		struct block_device *bdev, struct writeback_control *wbc) | 
|---|
|  | 974 | +		struct dax_device *dax_dev, struct writeback_control *wbc) | 
|---|
| 1025 | 975 | { | 
|---|
|  | 976 | +	XA_STATE(xas, &mapping->i_pages, wbc->range_start >> PAGE_SHIFT); | 
|---|
| 1026 | 977 | struct inode *inode = mapping->host; | 
|---|
| 1027 |  | -	pgoff_t start_index, end_index; | 
|---|
| 1028 |  | -	pgoff_t indices[PAGEVEC_SIZE]; | 
|---|
| 1029 |  | -	struct dax_device *dax_dev; | 
|---|
| 1030 |  | -	struct pagevec pvec; | 
|---|
| 1031 |  | -	bool done = false; | 
|---|
| 1032 |  | -	int i, ret = 0; | 
|---|
|  | 978 | +	pgoff_t end_index = wbc->range_end >> PAGE_SHIFT; | 
|---|
|  | 979 | +	void *entry; | 
|---|
|  | 980 | +	int ret = 0; | 
|---|
|  | 981 | +	unsigned int scanned = 0; | 
|---|
| 1033 | 982 |  | 
|---|
| 1034 | 983 | if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT)) | 
|---|
| 1035 | 984 | return -EIO; | 
|---|
| .. | .. | 
|---|
| 1037 | 986 | if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL) | 
|---|
| 1038 | 987 | return 0; | 
|---|
| 1039 | 988 |  | 
|---|
| 1040 |  | -	dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); | 
|---|
| 1041 |  | -	if (!dax_dev) | 
|---|
| 1042 |  | -		return -EIO; | 
|---|
|  | 989 | +	trace_dax_writeback_range(inode, xas.xa_index, end_index); | 
|---|
| 1043 | 990 |  | 
|---|
| 1044 |  | -	start_index = wbc->range_start >> PAGE_SHIFT; | 
|---|
| 1045 |  | -	end_index = wbc->range_end >> PAGE_SHIFT; | 
|---|
|  | 991 | +	tag_pages_for_writeback(mapping, xas.xa_index, end_index); | 
|---|
| 1046 | 992 |  | 
|---|
| 1047 |  | -	trace_dax_writeback_range(inode, start_index, end_index); | 
|---|
| 1048 |  | - | 
|---|
| 1049 |  | -	tag_pages_for_writeback(mapping, start_index, end_index); | 
|---|
| 1050 |  | - | 
|---|
| 1051 |  | -	pagevec_init(&pvec); | 
|---|
| 1052 |  | -	while (!done) { | 
|---|
| 1053 |  | -		pvec.nr = find_get_entries_tag(mapping, start_index, | 
|---|
| 1054 |  | -				PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE, | 
|---|
| 1055 |  | -				pvec.pages, indices); | 
|---|
| 1056 |  | - | 
|---|
| 1057 |  | -		if (pvec.nr == 0) | 
|---|
|  | 993 | +	xas_lock_irq(&xas); | 
|---|
|  | 994 | +	xas_for_each_marked(&xas, entry, end_index, PAGECACHE_TAG_TOWRITE) { | 
|---|
|  | 995 | +		ret = dax_writeback_one(&xas, dax_dev, mapping, entry); | 
|---|
|  | 996 | +		if (ret < 0) { | 
|---|
|  | 997 | +			mapping_set_error(mapping, ret); | 
|---|
| 1058 | 998 | break; | 
|---|
| 1059 |  | - | 
|---|
| 1060 |  | -		for (i = 0; i < pvec.nr; i++) { | 
|---|
| 1061 |  | -			if (indices[i] > end_index) { | 
|---|
| 1062 |  | -				done = true; | 
|---|
| 1063 |  | -				break; | 
|---|
| 1064 |  | -			} | 
|---|
| 1065 |  | - | 
|---|
| 1066 |  | -			ret = dax_writeback_one(dax_dev, mapping, indices[i], | 
|---|
| 1067 |  | -					pvec.pages[i]); | 
|---|
| 1068 |  | -			if (ret < 0) { | 
|---|
| 1069 |  | -				mapping_set_error(mapping, ret); | 
|---|
| 1070 |  | -				goto out; | 
|---|
| 1071 |  | -			} | 
|---|
| 1072 | 999 | } | 
|---|
| 1073 |  | -		start_index = indices[pvec.nr - 1] + 1; | 
|---|
|  | 1000 | +		if (++scanned % XA_CHECK_SCHED) | 
|---|
|  | 1001 | +			continue; | 
|---|
|  | 1002 | + | 
|---|
|  | 1003 | +		xas_pause(&xas); | 
|---|
|  | 1004 | +		xas_unlock_irq(&xas); | 
|---|
|  | 1005 | +		cond_resched(); | 
|---|
|  | 1006 | +		xas_lock_irq(&xas); | 
|---|
| 1074 | 1007 | } | 
|---|
| 1075 |  | -out: | 
|---|
| 1076 |  | -	put_dax(dax_dev); | 
|---|
| 1077 |  | -	trace_dax_writeback_range_done(inode, start_index, end_index); | 
|---|
| 1078 |  | -	return (ret < 0 ? ret : 0); | 
|---|
|  | 1008 | +	xas_unlock_irq(&xas); | 
|---|
|  | 1009 | +	trace_dax_writeback_range_done(inode, xas.xa_index, end_index); | 
|---|
|  | 1010 | +	return ret; | 
|---|
| 1079 | 1011 | } | 
|---|
| 1080 | 1012 | EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); | 
|---|
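With the new signature the caller hands in the dax_device directly, so the dax_get_by_host()/put_dax() lookup that was removed above drops out of the writeback path entirely. A minimal sketch of how a filesystem's ->writepages method might call it; the myfs_* names and the s_daxdev field are hypothetical, only dax_writeback_mapping_range() is taken from the code above:

```c
static int myfs_dax_writepages(struct address_space *mapping,
			       struct writeback_control *wbc)
{
	/* s_daxdev: the struct dax_device * this (hypothetical) fs saved at mount */
	struct myfs_sb_info *sbi = MYFS_SB(mapping->host->i_sb);

	return dax_writeback_mapping_range(mapping, sbi->s_daxdev, wbc);
}
```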
| 1081 | 1013 |  | 
|---|
| .. | .. | 
|---|
| 1123 | 1055 | * If this page is ever written to we will re-fault and change the mapping to | 
|---|
| 1124 | 1056 | * point to real DAX storage instead. | 
|---|
| 1125 | 1057 | */ | 
|---|
| 1126 |  | -static vm_fault_t dax_load_hole(struct address_space *mapping, void *entry, | 
|---|
| 1127 |  | -			 struct vm_fault *vmf) | 
|---|
|  | 1058 | +static vm_fault_t dax_load_hole(struct xa_state *xas, | 
|---|
|  | 1059 | +		struct address_space *mapping, void **entry, | 
|---|
|  | 1060 | +		struct vm_fault *vmf) | 
|---|
| 1128 | 1061 | { | 
|---|
| 1129 | 1062 | struct inode *inode = mapping->host; | 
|---|
| 1130 | 1063 | unsigned long vaddr = vmf->address; | 
|---|
| 1131 | 1064 | pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr)); | 
|---|
| 1132 | 1065 | vm_fault_t ret; | 
|---|
| 1133 | 1066 |  | 
|---|
| 1134 |  | -	dax_insert_mapping_entry(mapping, vmf, entry, pfn, RADIX_DAX_ZERO_PAGE, | 
|---|
| 1135 |  | -			false); | 
|---|
|  | 1067 | +	*entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn, | 
|---|
|  | 1068 | +			DAX_ZERO_PAGE, false); | 
|---|
|  | 1069 | + | 
|---|
| 1136 | 1070 | ret = vmf_insert_mixed(vmf->vma, vaddr, pfn); | 
|---|
| 1137 | 1071 | trace_dax_load_hole(inode, vmf, ret); | 
|---|
| 1138 | 1072 | return ret; | 
|---|
| 1139 | 1073 | } | 
|---|
| 1140 | 1074 |  | 
|---|
| 1141 |  | -static bool dax_range_is_aligned(struct block_device *bdev, | 
|---|
| 1142 |  | -				 unsigned int offset, unsigned int length) | 
|---|
|  | 1075 | +s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap) | 
|---|
| 1143 | 1076 | { | 
|---|
| 1144 |  | -	unsigned short sector_size = bdev_logical_block_size(bdev); | 
|---|
|  | 1077 | +	sector_t sector = iomap_sector(iomap, pos & PAGE_MASK); | 
|---|
|  | 1078 | +	pgoff_t pgoff; | 
|---|
|  | 1079 | +	long rc, id; | 
|---|
|  | 1080 | +	void *kaddr; | 
|---|
|  | 1081 | +	bool page_aligned = false; | 
|---|
|  | 1082 | +	unsigned offset = offset_in_page(pos); | 
|---|
|  | 1083 | +	unsigned size = min_t(u64, PAGE_SIZE - offset, length); | 
|---|
| 1145 | 1084 |  | 
|---|
| 1146 |  | -	if (!IS_ALIGNED(offset, sector_size)) | 
|---|
| 1147 |  | -		return false; | 
|---|
| 1148 |  | -	if (!IS_ALIGNED(length, sector_size)) | 
|---|
| 1149 |  | -		return false; | 
|---|
|  | 1085 | +	if (IS_ALIGNED(sector << SECTOR_SHIFT, PAGE_SIZE) && | 
|---|
|  | 1086 | +	    (size == PAGE_SIZE)) | 
|---|
|  | 1087 | +		page_aligned = true; | 
|---|
| 1150 | 1088 |  | 
|---|
| 1151 |  | -	return true; | 
|---|
| 1152 |  | -} | 
|---|
|  | 1089 | +	rc = bdev_dax_pgoff(iomap->bdev, sector, PAGE_SIZE, &pgoff); | 
|---|
|  | 1090 | +	if (rc) | 
|---|
|  | 1091 | +		return rc; | 
|---|
| 1153 | 1092 |  | 
|---|
| 1154 |  | -int __dax_zero_page_range(struct block_device *bdev, | 
|---|
| 1155 |  | -		struct dax_device *dax_dev, sector_t sector, | 
|---|
| 1156 |  | -		unsigned int offset, unsigned int size) | 
|---|
| 1157 |  | -{ | 
|---|
| 1158 |  | -	if (dax_range_is_aligned(bdev, offset, size)) { | 
|---|
| 1159 |  | -		sector_t start_sector = sector + (offset >> 9); | 
|---|
|  | 1093 | +	id = dax_read_lock(); | 
|---|
| 1160 | 1094 |  | 
|---|
| 1161 |  | -		return blkdev_issue_zeroout(bdev, start_sector, | 
|---|
| 1162 |  | -				size >> 9, GFP_NOFS, 0); | 
|---|
| 1163 |  | -	} else { | 
|---|
| 1164 |  | -		pgoff_t pgoff; | 
|---|
| 1165 |  | -		long rc, id; | 
|---|
| 1166 |  | -		void *kaddr; | 
|---|
| 1167 |  | - | 
|---|
| 1168 |  | -		rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff); | 
|---|
| 1169 |  | -		if (rc) | 
|---|
| 1170 |  | -			return rc; | 
|---|
| 1171 |  | - | 
|---|
| 1172 |  | -		id = dax_read_lock(); | 
|---|
| 1173 |  | -		rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL); | 
|---|
| 1174 |  | -		if (rc < 0) { | 
|---|
| 1175 |  | -			dax_read_unlock(id); | 
|---|
| 1176 |  | -			return rc; | 
|---|
| 1177 |  | -		} | 
|---|
| 1178 |  | -		memset(kaddr + offset, 0, size); | 
|---|
| 1179 |  | -		dax_flush(dax_dev, kaddr + offset, size); | 
|---|
|  | 1095 | +	if (page_aligned) | 
|---|
|  | 1096 | +		rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1); | 
|---|
|  | 1097 | +	else | 
|---|
|  | 1098 | +		rc = dax_direct_access(iomap->dax_dev, pgoff, 1, &kaddr, NULL); | 
|---|
|  | 1099 | +	if (rc < 0) { | 
|---|
| 1180 | 1100 | dax_read_unlock(id); | 
|---|
|  | 1101 | +		return rc; | 
|---|
| 1181 | 1102 | } | 
|---|
| 1182 |  | -	return 0; | 
|---|
|  | 1103 | + | 
|---|
|  | 1104 | +	if (!page_aligned) { | 
|---|
|  | 1105 | +		memset(kaddr + offset, 0, size); | 
|---|
|  | 1106 | +		dax_flush(iomap->dax_dev, kaddr + offset, size); | 
|---|
|  | 1107 | +	} | 
|---|
|  | 1108 | +	dax_read_unlock(id); | 
|---|
|  | 1109 | +	return size; | 
|---|
| 1183 | 1110 | } | 
|---|
| 1184 |  | -EXPORT_SYMBOL_GPL(__dax_zero_page_range); | 
|---|
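Unlike __dax_zero_page_range(), which returned 0 or a negative errno, dax_iomap_zero() reports how many bytes it zeroed, and it zeroes at most up to the next page boundary, so a caller is expected to loop over a longer range. A rough sketch of such a loop; the wrapper name is hypothetical, only dax_iomap_zero() comes from the code above:

```c
static s64 myfs_zero_dax_range(loff_t pos, u64 length, struct iomap *iomap)
{
	s64 zeroed = 0;

	while (length > 0) {
		s64 bytes = dax_iomap_zero(pos, length, iomap);

		if (bytes < 0)
			return bytes;	/* negative errno from the helper */
		pos += bytes;
		length -= bytes;
		zeroed += bytes;
	}
	return zeroed;
}
```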
| 1185 | 1111 |  | 
|---|
| 1186 | 1112 | static loff_t | 
|---|
| 1187 | 1113 | dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, | 
|---|
| 1188 |  | -		struct iomap *iomap) | 
|---|
|  | 1114 | +		struct iomap *iomap, struct iomap *srcmap) | 
|---|
| 1189 | 1115 | { | 
|---|
| 1190 | 1116 | struct block_device *bdev = iomap->bdev; | 
|---|
| 1191 | 1117 | struct dax_device *dax_dev = iomap->dax_dev; | 
|---|
| .. | .. | 
|---|
| 1295 | 1221 | unsigned flags = 0; | 
|---|
| 1296 | 1222 |  | 
|---|
| 1297 | 1223 | if (iov_iter_rw(iter) == WRITE) { | 
|---|
| 1298 |  | -		lockdep_assert_held_exclusive(&inode->i_rwsem); | 
|---|
|  | 1224 | +		lockdep_assert_held_write(&inode->i_rwsem); | 
|---|
| 1299 | 1225 | flags |= IOMAP_WRITE; | 
|---|
| 1300 | 1226 | } else { | 
|---|
| 1301 | 1227 | lockdep_assert_held(&inode->i_rwsem); | 
|---|
| .. | .. | 
|---|
| 1322 | 1248 | { | 
|---|
| 1323 | 1249 | if (error == 0) | 
|---|
| 1324 | 1250 | return VM_FAULT_NOPAGE; | 
|---|
| 1325 |  | -	if (error == -ENOMEM) | 
|---|
| 1326 |  | -		return VM_FAULT_OOM; | 
|---|
| 1327 |  | -	return VM_FAULT_SIGBUS; | 
|---|
|  | 1251 | +	return vmf_error(error); | 
|---|
| 1328 | 1252 | } | 
|---|
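The open-coded -ENOMEM/SIGBUS mapping removed above is behavior-preserving: vmf_error() in include/linux/mm.h is essentially the same two-way test, so dax_fault_return() still turns 0 into VM_FAULT_NOPAGE, -ENOMEM into VM_FAULT_OOM and everything else into VM_FAULT_SIGBUS:

```c
static inline vm_fault_t vmf_error(int err)
{
	if (err == -ENOMEM)
		return VM_FAULT_OOM;
	return VM_FAULT_SIGBUS;
}
```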
| 1329 | 1253 |  | 
|---|
| 1330 | 1254 | /* | 
|---|
| .. | .. | 
|---|
| 1343 | 1267 | { | 
|---|
| 1344 | 1268 | struct vm_area_struct *vma = vmf->vma; | 
|---|
| 1345 | 1269 | struct address_space *mapping = vma->vm_file->f_mapping; | 
|---|
|  | 1270 | +	XA_STATE(xas, &mapping->i_pages, vmf->pgoff); | 
|---|
| 1346 | 1271 | struct inode *inode = mapping->host; | 
|---|
| 1347 | 1272 | unsigned long vaddr = vmf->address; | 
|---|
| 1348 | 1273 | loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; | 
|---|
| 1349 |  | -	struct iomap iomap = { 0 }; | 
|---|
|  | 1274 | +	struct iomap iomap = { .type = IOMAP_HOLE }; | 
|---|
|  | 1275 | +	struct iomap srcmap = { .type = IOMAP_HOLE }; | 
|---|
| 1350 | 1276 | unsigned flags = IOMAP_FAULT; | 
|---|
| 1351 | 1277 | int error, major = 0; | 
|---|
| 1352 | 1278 | bool write = vmf->flags & FAULT_FLAG_WRITE; | 
|---|
| .. | .. | 
|---|
| 1369 | 1295 | if (write && !vmf->cow_page) | 
|---|
| 1370 | 1296 | flags |= IOMAP_WRITE; | 
|---|
| 1371 | 1297 |  | 
|---|
| 1372 |  | -	entry = grab_mapping_entry(mapping, vmf->pgoff, 0); | 
|---|
| 1373 |  | -	if (IS_ERR(entry)) { | 
|---|
| 1374 |  | -		ret = dax_fault_return(PTR_ERR(entry)); | 
|---|
|  | 1298 | +	entry = grab_mapping_entry(&xas, mapping, 0); | 
|---|
|  | 1299 | +	if (xa_is_internal(entry)) { | 
|---|
|  | 1300 | +		ret = xa_to_internal(entry); | 
|---|
| 1375 | 1301 | goto out; | 
|---|
| 1376 | 1302 | } | 
|---|
| 1377 | 1303 |  | 
|---|
| .. | .. | 
|---|
| 1391 | 1317 | * the file system block size to be equal the page size, which means | 
|---|
| 1392 | 1318 | * that we never have to deal with more than a single extent here. | 
|---|
| 1393 | 1319 | */ | 
|---|
| 1394 |  | -	error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap); | 
|---|
|  | 1320 | +	error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap, &srcmap); | 
|---|
| 1395 | 1321 | if (iomap_errp) | 
|---|
| 1396 | 1322 | *iomap_errp = error; | 
|---|
| 1397 | 1323 | if (error) { | 
|---|
| .. | .. | 
|---|
| 1412 | 1338 | clear_user_highpage(vmf->cow_page, vaddr); | 
|---|
| 1413 | 1339 | break; | 
|---|
| 1414 | 1340 | case IOMAP_MAPPED: | 
|---|
| 1415 |  | -			error = copy_user_dax(iomap.bdev, iomap.dax_dev, | 
|---|
| 1416 |  | -					sector, PAGE_SIZE, vmf->cow_page, vaddr); | 
|---|
|  | 1341 | +			error = copy_cow_page_dax(iomap.bdev, iomap.dax_dev, | 
|---|
|  | 1342 | +						  sector, vmf->cow_page, vaddr); | 
|---|
| 1417 | 1343 | break; | 
|---|
| 1418 | 1344 | default: | 
|---|
| 1419 | 1345 | WARN_ON_ONCE(1); | 
|---|
| .. | .. | 
|---|
| 1444 | 1370 | if (error < 0) | 
|---|
| 1445 | 1371 | goto error_finish_iomap; | 
|---|
| 1446 | 1372 |  | 
|---|
| 1447 |  | -		entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn, | 
|---|
|  | 1373 | +		entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn, | 
|---|
| 1448 | 1374 | 0, write && !sync); | 
|---|
| 1449 | 1375 |  | 
|---|
| 1450 | 1376 | /* | 
|---|
| .. | .. | 
|---|
| 1472 | 1398 | case IOMAP_UNWRITTEN: | 
|---|
| 1473 | 1399 | case IOMAP_HOLE: | 
|---|
| 1474 | 1400 | if (!write) { | 
|---|
| 1475 |  | -			ret = dax_load_hole(mapping, entry, vmf); | 
|---|
|  | 1401 | +			ret = dax_load_hole(&xas, mapping, &entry, vmf); | 
|---|
| 1476 | 1402 | goto finish_iomap; | 
|---|
| 1477 | 1403 | } | 
|---|
| 1478 |  | -		/*FALLTHRU*/ | 
|---|
|  | 1404 | +		fallthrough; | 
|---|
| 1479 | 1405 | default: | 
|---|
| 1480 | 1406 | WARN_ON_ONCE(1); | 
|---|
| 1481 | 1407 | error = -EIO; | 
|---|
| .. | .. | 
|---|
| 1499 | 1425 | ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap); | 
|---|
| 1500 | 1426 | } | 
|---|
| 1501 | 1427 | unlock_entry: | 
|---|
| 1502 |  | -	put_locked_mapping_entry(mapping, vmf->pgoff); | 
|---|
|  | 1428 | +	dax_unlock_entry(&xas, entry); | 
|---|
| 1503 | 1429 | out: | 
|---|
| 1504 | 1430 | trace_dax_pte_fault_done(inode, vmf, ret); | 
|---|
| 1505 | 1431 | return ret | major; | 
|---|
| 1506 | 1432 | } | 
|---|
| 1507 | 1433 |  | 
|---|
| 1508 | 1434 | #ifdef CONFIG_FS_DAX_PMD | 
|---|
| 1509 |  | -static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, | 
|---|
| 1510 |  | -		void *entry) | 
|---|
|  | 1435 | +static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, | 
|---|
|  | 1436 | +		struct iomap *iomap, void **entry) | 
|---|
| 1511 | 1437 | { | 
|---|
| 1512 | 1438 | struct address_space *mapping = vmf->vma->vm_file->f_mapping; | 
|---|
| 1513 | 1439 | unsigned long pmd_addr = vmf->address & PMD_MASK; | 
|---|
|  | 1440 | +	struct vm_area_struct *vma = vmf->vma; | 
|---|
| 1514 | 1441 | struct inode *inode = mapping->host; | 
|---|
|  | 1442 | +	pgtable_t pgtable = NULL; | 
|---|
| 1515 | 1443 | struct page *zero_page; | 
|---|
| 1516 |  | -	void *ret = NULL; | 
|---|
| 1517 | 1444 | spinlock_t *ptl; | 
|---|
| 1518 | 1445 | pmd_t pmd_entry; | 
|---|
| 1519 | 1446 | pfn_t pfn; | 
|---|
| .. | .. | 
|---|
| 1524 | 1451 | goto fallback; | 
|---|
| 1525 | 1452 |  | 
|---|
| 1526 | 1453 | pfn = page_to_pfn_t(zero_page); | 
|---|
| 1527 |  | -	ret = dax_insert_mapping_entry(mapping, vmf, entry, pfn, | 
|---|
| 1528 |  | -			RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false); | 
|---|
|  | 1454 | +	*entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn, | 
|---|
|  | 1455 | +			DAX_PMD | DAX_ZERO_PAGE, false); | 
|---|
|  | 1456 | + | 
|---|
|  | 1457 | +	if (arch_needs_pgtable_deposit()) { | 
|---|
|  | 1458 | +		pgtable = pte_alloc_one(vma->vm_mm); | 
|---|
|  | 1459 | +		if (!pgtable) | 
|---|
|  | 1460 | +			return VM_FAULT_OOM; | 
|---|
|  | 1461 | +	} | 
|---|
| 1529 | 1462 |  | 
|---|
| 1530 | 1463 | ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); | 
|---|
| 1531 | 1464 | if (!pmd_none(*(vmf->pmd))) { | 
|---|
| .. | .. | 
|---|
| 1533 | 1466 | goto fallback; | 
|---|
| 1534 | 1467 | } | 
|---|
| 1535 | 1468 |  | 
|---|
|  | 1469 | +	if (pgtable) { | 
|---|
|  | 1470 | +		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); | 
|---|
|  | 1471 | +		mm_inc_nr_ptes(vma->vm_mm); | 
|---|
|  | 1472 | +	} | 
|---|
| 1536 | 1473 | pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot); | 
|---|
| 1537 | 1474 | pmd_entry = pmd_mkhuge(pmd_entry); | 
|---|
| 1538 | 1475 | set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry); | 
|---|
| 1539 | 1476 | spin_unlock(ptl); | 
|---|
| 1540 |  | -	trace_dax_pmd_load_hole(inode, vmf, zero_page, ret); | 
|---|
|  | 1477 | +	trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry); | 
|---|
| 1541 | 1478 | return VM_FAULT_NOPAGE; | 
|---|
| 1542 | 1479 |  | 
|---|
| 1543 | 1480 | fallback: | 
|---|
| 1544 |  | -	trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, ret); | 
|---|
|  | 1481 | +	if (pgtable) | 
|---|
|  | 1482 | +		pte_free(vma->vm_mm, pgtable); | 
|---|
|  | 1483 | +	trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry); | 
|---|
| 1545 | 1484 | return VM_FAULT_FALLBACK; | 
|---|
| 1546 | 1485 | } | 
|---|
| 1547 | 1486 |  | 
|---|
| .. | .. | 
|---|
| 1550 | 1489 | { | 
|---|
| 1551 | 1490 | struct vm_area_struct *vma = vmf->vma; | 
|---|
| 1552 | 1491 | struct address_space *mapping = vma->vm_file->f_mapping; | 
|---|
|  | 1492 | +	XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER); | 
|---|
| 1553 | 1493 | unsigned long pmd_addr = vmf->address & PMD_MASK; | 
|---|
| 1554 | 1494 | bool write = vmf->flags & FAULT_FLAG_WRITE; | 
|---|
| 1555 | 1495 | bool sync; | 
|---|
| 1556 | 1496 | unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT; | 
|---|
| 1557 | 1497 | struct inode *inode = mapping->host; | 
|---|
| 1558 | 1498 | vm_fault_t result = VM_FAULT_FALLBACK; | 
|---|
| 1559 |  | -	struct iomap iomap = { 0 }; | 
|---|
| 1560 |  | -	pgoff_t max_pgoff, pgoff; | 
|---|
|  | 1499 | +	struct iomap iomap = { .type = IOMAP_HOLE }; | 
|---|
|  | 1500 | +	struct iomap srcmap = { .type = IOMAP_HOLE }; | 
|---|
|  | 1501 | +	pgoff_t max_pgoff; | 
|---|
| 1561 | 1502 | void *entry; | 
|---|
| 1562 | 1503 | loff_t pos; | 
|---|
| 1563 | 1504 | int error; | 
|---|
| .. | .. | 
|---|
| 1568 | 1509 | * supposed to hold locks serializing us with truncate / punch hole so | 
|---|
| 1569 | 1510 | * this is a reliable test. | 
|---|
| 1570 | 1511 | */ | 
|---|
| 1571 |  | -	pgoff = linear_page_index(vma, pmd_addr); | 
|---|
| 1572 | 1512 | max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); | 
|---|
| 1573 | 1513 |  | 
|---|
| 1574 | 1514 | trace_dax_pmd_fault(inode, vmf, max_pgoff, 0); | 
|---|
| .. | .. | 
|---|
| 1577 | 1517 | * Make sure that the faulting address's PMD offset (color) matches | 
|---|
| 1578 | 1518 | * the PMD offset from the start of the file.  This is necessary so | 
|---|
| 1579 | 1519 | * that a PMD range in the page table overlaps exactly with a PMD | 
|---|
| 1580 |  | -	 * range in the radix tree. | 
|---|
|  | 1520 | +	 * range in the page cache. | 
|---|
| 1581 | 1521 | */ | 
|---|
| 1582 | 1522 | if ((vmf->pgoff & PG_PMD_COLOUR) != | 
|---|
| 1583 | 1523 | ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR)) | 
|---|
| .. | .. | 
|---|
| 1593 | 1533 | if ((pmd_addr + PMD_SIZE) > vma->vm_end) | 
|---|
| 1594 | 1534 | goto fallback; | 
|---|
| 1595 | 1535 |  | 
|---|
| 1596 |  | -	if (pgoff >= max_pgoff) { | 
|---|
|  | 1536 | +	if (xas.xa_index >= max_pgoff) { | 
|---|
| 1597 | 1537 | result = VM_FAULT_SIGBUS; | 
|---|
| 1598 | 1538 | goto out; | 
|---|
| 1599 | 1539 | } | 
|---|
| 1600 | 1540 |  | 
|---|
| 1601 | 1541 | /* If the PMD would extend beyond the file size */ | 
|---|
| 1602 |  | -	if ((pgoff | PG_PMD_COLOUR) >= max_pgoff) | 
|---|
|  | 1542 | +	if ((xas.xa_index | PG_PMD_COLOUR) >= max_pgoff) | 
|---|
| 1603 | 1543 | goto fallback; | 
|---|
| 1604 | 1544 |  | 
|---|
| 1605 | 1545 | /* | 
|---|
| 1606 |  | -	 * grab_mapping_entry() will make sure we get a 2MiB empty entry, a | 
|---|
| 1607 |  | -	 * 2MiB zero page entry or a DAX PMD.  If it can't (because a 4k page | 
|---|
| 1608 |  | -	 * is already in the tree, for instance), it will return -EEXIST and | 
|---|
| 1609 |  | -	 * we just fall back to 4k entries. | 
|---|
|  | 1546 | +	 * grab_mapping_entry() will make sure we get an empty PMD entry, | 
|---|
|  | 1547 | +	 * a zero PMD entry or a DAX PMD.  If it can't (because a PTE | 
|---|
|  | 1548 | +	 * entry is already in the array, for instance), it will return | 
|---|
|  | 1549 | +	 * VM_FAULT_FALLBACK. | 
|---|
| 1610 | 1550 | */ | 
|---|
| 1611 |  | -	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD); | 
|---|
| 1612 |  | -	if (IS_ERR(entry)) | 
|---|
|  | 1551 | +	entry = grab_mapping_entry(&xas, mapping, PMD_ORDER); | 
|---|
|  | 1552 | +	if (xa_is_internal(entry)) { | 
|---|
|  | 1553 | +		result = xa_to_internal(entry); | 
|---|
| 1613 | 1554 | goto fallback; | 
|---|
|  | 1555 | +	} | 
|---|
| 1614 | 1556 |  | 
|---|
| 1615 | 1557 | /* | 
|---|
| 1616 | 1558 | * It is possible, particularly with mixed reads & writes to private | 
|---|
| .. | .. | 
|---|
| 1629 | 1571 | * setting up a mapping, so really we're using iomap_begin() as a way | 
|---|
| 1630 | 1572 | * to look up our filesystem block. | 
|---|
| 1631 | 1573 | */ | 
|---|
| 1632 |  | -	pos = (loff_t)pgoff << PAGE_SHIFT; | 
|---|
| 1633 |  | -	error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap); | 
|---|
|  | 1574 | +	pos = (loff_t)xas.xa_index << PAGE_SHIFT; | 
|---|
|  | 1575 | +	error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap, | 
|---|
|  | 1576 | +			&srcmap); | 
|---|
| 1634 | 1577 | if (error) | 
|---|
| 1635 | 1578 | goto unlock_entry; | 
|---|
| 1636 | 1579 |  | 
|---|
| .. | .. | 
|---|
| 1645 | 1588 | if (error < 0) | 
|---|
| 1646 | 1589 | goto finish_iomap; | 
|---|
| 1647 | 1590 |  | 
|---|
| 1648 |  | -		entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn, | 
|---|
| 1649 |  | -						RADIX_DAX_PMD, write && !sync); | 
|---|
|  | 1591 | +		entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn, | 
|---|
|  | 1592 | +						DAX_PMD, write && !sync); | 
|---|
| 1650 | 1593 |  | 
|---|
| 1651 | 1594 | /* | 
|---|
| 1652 | 1595 | * If we are doing synchronous page fault and inode needs fsync, | 
|---|
| .. | .. | 
|---|
| 1669 | 1612 | case IOMAP_HOLE: | 
|---|
| 1670 | 1613 | if (WARN_ON_ONCE(write)) | 
|---|
| 1671 | 1614 | break; | 
|---|
| 1672 |  | -		result = dax_pmd_load_hole(vmf, &iomap, entry); | 
|---|
|  | 1615 | +		result = dax_pmd_load_hole(&xas, vmf, &iomap, &entry); | 
|---|
| 1673 | 1616 | break; | 
|---|
| 1674 | 1617 | default: | 
|---|
| 1675 | 1618 | WARN_ON_ONCE(1); | 
|---|
| .. | .. | 
|---|
| 1692 | 1635 | &iomap); | 
|---|
| 1693 | 1636 | } | 
|---|
| 1694 | 1637 | unlock_entry: | 
|---|
| 1695 |  | -	put_locked_mapping_entry(mapping, pgoff); | 
|---|
|  | 1638 | +	dax_unlock_entry(&xas, entry); | 
|---|
| 1696 | 1639 | fallback: | 
|---|
| 1697 | 1640 | if (result == VM_FAULT_FALLBACK) { | 
|---|
| 1698 | 1641 | split_huge_pmd(vma, vmf->pmd, vmf->address); | 
|---|
| .. | .. | 
|---|
| 1737 | 1680 | } | 
|---|
| 1738 | 1681 | EXPORT_SYMBOL_GPL(dax_iomap_fault); | 
|---|
| 1739 | 1682 |  | 
|---|
| 1740 |  | -/** | 
|---|
|  | 1683 | +/* | 
|---|
| 1741 | 1684 | * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables | 
|---|
| 1742 | 1685 | * @vmf: The description of the fault | 
|---|
| 1743 |  | - * @pe_size: Size of entry to be inserted | 
|---|
| 1744 | 1686 | * @pfn: PFN to insert | 
|---|
|  | 1687 | + * @order: Order of entry to insert. | 
|---|
| 1745 | 1688 | * | 
|---|
| 1746 |  | - * This function inserts writeable PTE or PMD entry into page tables for mmaped | 
|---|
| 1747 |  | - * DAX file.  It takes care of marking corresponding radix tree entry as dirty | 
|---|
| 1748 |  | - * as well. | 
|---|
|  | 1689 | + * This function inserts a writeable PTE or PMD entry into the page tables | 
|---|
|  | 1690 | + * for an mmaped DAX file.  It also marks the page cache entry as dirty. | 
|---|
| 1749 | 1691 | */ | 
|---|
| 1750 |  | -static vm_fault_t dax_insert_pfn_mkwrite(struct vm_fault *vmf, | 
|---|
| 1751 |  | -				  enum page_entry_size pe_size, | 
|---|
| 1752 |  | -				  pfn_t pfn) | 
|---|
|  | 1692 | +static vm_fault_t | 
|---|
|  | 1693 | +dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order) | 
|---|
| 1753 | 1694 | { | 
|---|
| 1754 | 1695 | struct address_space *mapping = vmf->vma->vm_file->f_mapping; | 
|---|
| 1755 |  | -	void *entry, **slot; | 
|---|
| 1756 |  | -	pgoff_t index = vmf->pgoff; | 
|---|
|  | 1696 | +	XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order); | 
|---|
|  | 1697 | +	void *entry; | 
|---|
| 1757 | 1698 | vm_fault_t ret; | 
|---|
| 1758 | 1699 |  | 
|---|
| 1759 |  | -	xa_lock_irq(&mapping->i_pages); | 
|---|
| 1760 |  | -	entry = get_unlocked_mapping_entry(mapping, index, &slot); | 
|---|
|  | 1700 | +	xas_lock_irq(&xas); | 
|---|
|  | 1701 | +	entry = get_unlocked_entry(&xas, order); | 
|---|
| 1761 | 1702 | /* Did we race with someone splitting entry or so? */ | 
|---|
| 1762 |  | -	if (!entry || | 
|---|
| 1763 |  | -	    (pe_size == PE_SIZE_PTE && !dax_is_pte_entry(entry)) || | 
|---|
| 1764 |  | -	    (pe_size == PE_SIZE_PMD && !dax_is_pmd_entry(entry))) { | 
|---|
| 1765 |  | -		put_unlocked_mapping_entry(mapping, index, entry); | 
|---|
| 1766 |  | -		xa_unlock_irq(&mapping->i_pages); | 
|---|
|  | 1703 | +	if (!entry || dax_is_conflict(entry) || | 
|---|
|  | 1704 | +	    (order == 0 && !dax_is_pte_entry(entry))) { | 
|---|
|  | 1705 | +		put_unlocked_entry(&xas, entry, WAKE_NEXT); | 
|---|
|  | 1706 | +		xas_unlock_irq(&xas); | 
|---|
| 1767 | 1707 | trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf, | 
|---|
| 1768 | 1708 | VM_FAULT_NOPAGE); | 
|---|
| 1769 | 1709 | return VM_FAULT_NOPAGE; | 
|---|
| 1770 | 1710 | } | 
|---|
| 1771 |  | -	radix_tree_tag_set(&mapping->i_pages, index, PAGECACHE_TAG_DIRTY); | 
|---|
| 1772 |  | -	entry = lock_slot(mapping, slot); | 
|---|
| 1773 |  | -	xa_unlock_irq(&mapping->i_pages); | 
|---|
| 1774 |  | -	switch (pe_size) { | 
|---|
| 1775 |  | -	case PE_SIZE_PTE: | 
|---|
|  | 1711 | +	xas_set_mark(&xas, PAGECACHE_TAG_DIRTY); | 
|---|
|  | 1712 | +	dax_lock_entry(&xas, entry); | 
|---|
|  | 1713 | +	xas_unlock_irq(&xas); | 
|---|
|  | 1714 | +	if (order == 0) | 
|---|
| 1776 | 1715 | ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn); | 
|---|
| 1777 |  | -		break; | 
|---|
| 1778 | 1716 | #ifdef CONFIG_FS_DAX_PMD | 
|---|
| 1779 |  | -	case PE_SIZE_PMD: | 
|---|
|  | 1717 | +	else if (order == PMD_ORDER) | 
|---|
| 1780 | 1718 | ret = vmf_insert_pfn_pmd(vmf, pfn, FAULT_FLAG_WRITE); | 
|---|
| 1781 |  | -		break; | 
|---|
| 1782 | 1719 | #endif | 
|---|
| 1783 |  | -	default: | 
|---|
|  | 1720 | +	else | 
|---|
| 1784 | 1721 | ret = VM_FAULT_FALLBACK; | 
|---|
| 1785 |  | -	} | 
|---|
| 1786 |  | -	put_locked_mapping_entry(mapping, index); | 
|---|
|  | 1722 | +	dax_unlock_entry(&xas, entry); | 
|---|
| 1787 | 1723 | trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret); | 
|---|
| 1788 | 1724 | return ret; | 
|---|
| 1789 | 1725 | } | 
|---|
| .. | .. | 
|---|
| 1803 | 1739 | { | 
|---|
| 1804 | 1740 | int err; | 
|---|
| 1805 | 1741 | loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT; | 
|---|
| 1806 |  | -	size_t len = 0; | 
|---|
|  | 1742 | +	unsigned int order = pe_order(pe_size); | 
|---|
|  | 1743 | +	size_t len = PAGE_SIZE << order; | 
|---|
| 1807 | 1744 |  | 
|---|
| 1808 |  | -	if (pe_size == PE_SIZE_PTE) | 
|---|
| 1809 |  | -		len = PAGE_SIZE; | 
|---|
| 1810 |  | -	else if (pe_size == PE_SIZE_PMD) | 
|---|
| 1811 |  | -		len = PMD_SIZE; | 
|---|
| 1812 |  | -	else | 
|---|
| 1813 |  | -		WARN_ON_ONCE(1); | 
|---|
| 1814 | 1745 | err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1); | 
|---|
| 1815 | 1746 | if (err) | 
|---|
| 1816 | 1747 | return VM_FAULT_SIGBUS; | 
|---|
| 1817 |  | -	return dax_insert_pfn_mkwrite(vmf, pe_size, pfn); | 
|---|
|  | 1748 | +	return dax_insert_pfn_mkwrite(vmf, pfn, order); | 
|---|
| 1818 | 1749 | } | 
|---|
| 1819 | 1750 | EXPORT_SYMBOL_GPL(dax_finish_sync_fault); | 
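For context on how this export is consumed: a filesystem that supports MAP_SYNC typically lets dax_iomap_fault() handle the fault first and only calls dax_finish_sync_fault() when the handler returned VM_FAULT_NEEDDSYNC for a write fault. A hedged sketch with hypothetical myfs_* names and iomap ops; the dax_* calls and flags are the real API:

```c
static vm_fault_t myfs_dax_huge_fault(struct vm_fault *vmf,
				      enum page_entry_size pe_size)
{
	struct super_block *sb = file_inode(vmf->vma->vm_file)->i_sb;
	bool write = vmf->flags & FAULT_FLAG_WRITE;
	vm_fault_t ret;
	pfn_t pfn;

	if (write)
		sb_start_pagefault(sb);
	/* myfs_iomap_ops: this (hypothetical) fs's const struct iomap_ops */
	ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, &myfs_iomap_ops);
	if (write) {
		/* MAP_SYNC: persist the metadata, then map the pfn writeable */
		if (ret & VM_FAULT_NEEDDSYNC)
			ret = dax_finish_sync_fault(vmf, pe_size, pfn);
		sb_end_pagefault(sb);
	}
	return ret;
}
```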
|---|