| .. | .. |
|---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-only |
|---|
| 1 | 2 | /* |
|---|
| 2 | 3 | * fs/dax.c - Direct Access filesystem code |
|---|
| 3 | 4 | * Copyright (c) 2013-2014 Intel Corporation |
|---|
| 4 | 5 | * Author: Matthew Wilcox <matthew.r.wilcox@intel.com> |
|---|
| 5 | 6 | * Author: Ross Zwisler <ross.zwisler@linux.intel.com> |
|---|
| 6 | | - * |
|---|
| 7 | | - * This program is free software; you can redistribute it and/or modify it |
|---|
| 8 | | - * under the terms and conditions of the GNU General Public License, |
|---|
| 9 | | - * version 2, as published by the Free Software Foundation. |
|---|
| 10 | | - * |
|---|
| 11 | | - * This program is distributed in the hope it will be useful, but WITHOUT |
|---|
| 12 | | - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
|---|
| 13 | | - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for |
|---|
| 14 | | - * more details. |
|---|
| 15 | 7 | */ |
|---|
| 16 | 8 | |
|---|
| 17 | 9 | #include <linux/atomic.h> |
|---|
| .. | .. |
|---|
| 33 | 25 | #include <linux/sizes.h> |
|---|
| 34 | 26 | #include <linux/mmu_notifier.h> |
|---|
| 35 | 27 | #include <linux/iomap.h> |
|---|
| 36 | | -#include "internal.h" |
|---|
| 28 | +#include <asm/pgalloc.h> |
|---|
| 37 | 29 | |
|---|
| 38 | 30 | #define CREATE_TRACE_POINTS |
|---|
| 39 | 31 | #include <trace/events/fs_dax.h> |
|---|
| 32 | + |
|---|
| 33 | +static inline unsigned int pe_order(enum page_entry_size pe_size) |
|---|
| 34 | +{ |
|---|
| 35 | + if (pe_size == PE_SIZE_PTE) |
|---|
| 36 | + return PAGE_SHIFT - PAGE_SHIFT; |
|---|
| 37 | + if (pe_size == PE_SIZE_PMD) |
|---|
| 38 | + return PMD_SHIFT - PAGE_SHIFT; |
|---|
| 39 | + if (pe_size == PE_SIZE_PUD) |
|---|
| 40 | + return PUD_SHIFT - PAGE_SHIFT; |
|---|
| 41 | + return ~0; |
|---|
| 42 | +} |
|---|
| 40 | 43 | |
|---|
| 41 | 44 | /* We choose 4096 entries - same as per-zone page wait tables */ |
|---|
| 42 | 45 | #define DAX_WAIT_TABLE_BITS 12 |
|---|
| .. | .. |
|---|
| 45 | 48 | /* The 'colour' (ie low bits) within a PMD of a page offset. */ |
|---|
| 46 | 49 | #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) |
|---|
| 47 | 50 | #define PG_PMD_NR (PMD_SIZE >> PAGE_SHIFT) |
|---|
| 51 | + |
|---|
| 52 | +/* The order of a PMD entry */ |
|---|
| 53 | +#define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT) |
|---|
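For concreteness, with the usual x86-64 shifts (PAGE_SHIFT = 12, PMD_SHIFT = 21, PUD_SHIFT = 30 — assumed values, not part of the patch) the new PMD_ORDER works out to 9, PG_PMD_NR to 512 and PG_PMD_COLOUR to 511, and pe_order() above yields 0/9/18 for PTE/PMD/PUD faults. A standalone sketch of that arithmetic, including the colour masking the patch relies on later:

```c
#include <stdio.h>

/* Assumed x86-64 values; the kernel gets these from its own headers. */
#define PAGE_SHIFT	12
#define PMD_SHIFT	21
#define PUD_SHIFT	30
#define PMD_SIZE	(1UL << PMD_SHIFT)

/* Mirrors the macros introduced by the patch. */
#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)
#define PG_PMD_NR	(PMD_SIZE >> PAGE_SHIFT)
#define PMD_ORDER	(PMD_SHIFT - PAGE_SHIFT)

int main(void)
{
	unsigned long index = 0x12345;	/* arbitrary page offset in a file */

	printf("PMD_ORDER=%d PG_PMD_NR=%lu PG_PMD_COLOUR=%#lx\n",
	       PMD_ORDER, PG_PMD_NR, PG_PMD_COLOUR);
	/* The three orders as computed by pe_order() above. */
	printf("orders: pte=%d pmd=%d pud=%d\n",
	       PAGE_SHIFT - PAGE_SHIFT, PMD_SHIFT - PAGE_SHIFT,
	       PUD_SHIFT - PAGE_SHIFT);
	/* Aligning an index down to its PMD, as dax_entry_waitqueue() does. */
	printf("index %#lx aligns to %#lx within its PMD\n",
	       index, index & ~PG_PMD_COLOUR);
	return 0;
}
```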
| 48 | 54 | |
|---|
| 49 | 55 | static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; |
|---|
| 50 | 56 | |
|---|
| .. | .. |
|---|
| 59 | 65 | fs_initcall(init_dax_wait_table); |
|---|
| 60 | 66 | |
|---|
| 61 | 67 | /* |
|---|
| 62 | | - * We use lowest available bit in exceptional entry for locking, one bit for |
|---|
| 63 | | - * the entry size (PMD) and two more to tell us if the entry is a zero page or |
|---|
| 64 | | - * an empty entry that is just used for locking. In total four special bits. |
|---|
| 68 | + * DAX pagecache entries use XArray value entries so they can't be mistaken |
|---|
| 69 | + * for pages. We use one bit for locking, one bit for the entry size (PMD) |
|---|
| 70 | + * and two more to tell us if the entry is a zero page or an empty entry that |
|---|
| 71 | + * is just used for locking. In total four special bits. |
|---|
| 65 | 72 | * |
|---|
| 66 | 73 | * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE |
|---|
| 67 | 74 | * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem |
|---|
| 68 | 75 | * block allocation. |
|---|
| 69 | 76 | */ |
|---|
| 70 | | -#define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 4) |
|---|
| 71 | | -#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT) |
|---|
| 72 | | -#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1)) |
|---|
| 73 | | -#define RADIX_DAX_ZERO_PAGE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2)) |
|---|
| 74 | | -#define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3)) |
|---|
| 77 | +#define DAX_SHIFT (4) |
|---|
| 78 | +#define DAX_LOCKED (1UL << 0) |
|---|
| 79 | +#define DAX_PMD (1UL << 1) |
|---|
| 80 | +#define DAX_ZERO_PAGE (1UL << 2) |
|---|
| 81 | +#define DAX_EMPTY (1UL << 3) |
|---|
| 75 | 82 | |
|---|
| 76 | | -static unsigned long dax_radix_pfn(void *entry) |
|---|
| 83 | +static unsigned long dax_to_pfn(void *entry) |
|---|
| 77 | 84 | { |
|---|
| 78 | | - return (unsigned long)entry >> RADIX_DAX_SHIFT; |
|---|
| 85 | + return xa_to_value(entry) >> DAX_SHIFT; |
|---|
| 79 | 86 | } |
|---|
| 80 | 87 | |
|---|
| 81 | | -static void *dax_radix_locked_entry(unsigned long pfn, unsigned long flags) |
|---|
| 88 | +static void *dax_make_entry(pfn_t pfn, unsigned long flags) |
|---|
| 82 | 89 | { |
|---|
| 83 | | - return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags | |
|---|
| 84 | | - (pfn << RADIX_DAX_SHIFT) | RADIX_DAX_ENTRY_LOCK); |
|---|
| 90 | + return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT)); |
|---|
| 85 | 91 | } |
|---|
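The two helpers above are the heart of the conversion: a DAX entry is now just an XArray value entry whose payload is the pfn shifted up by DAX_SHIFT with the four flag bits below it, and the DAX_LOCKED bit used later by dax_is_locked()/dax_lock_entry() lives in the same value. The round trip can be demonstrated in userspace — the sketch below mocks xa_mk_value()/xa_to_value() with the same tag-bit-0 convention and uses a plain unsigned long in place of pfn_t, so it is an illustration of the encoding, not the kernel implementation:

```c
#include <assert.h>
#include <stdio.h>

#define DAX_SHIFT	(4)
#define DAX_LOCKED	(1UL << 0)
#define DAX_PMD		(1UL << 1)
#define DAX_ZERO_PAGE	(1UL << 2)
#define DAX_EMPTY	(1UL << 3)

/*
 * Simplified stand-ins for the XArray helpers: a value entry is the
 * payload shifted up one bit with bit 0 set, so it can never be
 * mistaken for a kernel pointer.
 */
static void *xa_mk_value(unsigned long v)	{ return (void *)((v << 1) | 1); }
static unsigned long xa_to_value(const void *e)	{ return (unsigned long)e >> 1; }

static void *dax_make_entry(unsigned long pfn, unsigned long flags)
{
	return xa_mk_value(flags | (pfn << DAX_SHIFT));
}

static unsigned long dax_to_pfn(void *entry)
{
	return xa_to_value(entry) >> DAX_SHIFT;
}

int main(void)
{
	unsigned long pfn = 0xabcd;
	void *entry = dax_make_entry(pfn, DAX_PMD);
	/* Locking is just setting DAX_LOCKED inside the value payload. */
	void *locked = xa_mk_value(xa_to_value(entry) | DAX_LOCKED);

	assert(dax_to_pfn(entry) == pfn);
	assert(dax_to_pfn(locked) == pfn);	/* lock bit doesn't disturb the pfn */
	printf("entry=%p locked=%p pfn=%#lx\n", entry, locked, dax_to_pfn(locked));
	return 0;
}
```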
| 86 | 92 | |
|---|
| 87 | | -static unsigned int dax_radix_order(void *entry) |
|---|
| 93 | +static bool dax_is_locked(void *entry) |
|---|
| 88 | 94 | { |
|---|
| 89 | | - if ((unsigned long)entry & RADIX_DAX_PMD) |
|---|
| 90 | | - return PMD_SHIFT - PAGE_SHIFT; |
|---|
| 95 | + return xa_to_value(entry) & DAX_LOCKED; |
|---|
| 96 | +} |
|---|
| 97 | + |
|---|
| 98 | +static unsigned int dax_entry_order(void *entry) |
|---|
| 99 | +{ |
|---|
| 100 | + if (xa_to_value(entry) & DAX_PMD) |
|---|
| 101 | + return PMD_ORDER; |
|---|
| 91 | 102 | return 0; |
|---|
| 92 | 103 | } |
|---|
| 93 | 104 | |
|---|
| 94 | | -static int dax_is_pmd_entry(void *entry) |
|---|
| 105 | +static unsigned long dax_is_pmd_entry(void *entry) |
|---|
| 95 | 106 | { |
|---|
| 96 | | - return (unsigned long)entry & RADIX_DAX_PMD; |
|---|
| 107 | + return xa_to_value(entry) & DAX_PMD; |
|---|
| 97 | 108 | } |
|---|
| 98 | 109 | |
|---|
| 99 | | -static int dax_is_pte_entry(void *entry) |
|---|
| 110 | +static bool dax_is_pte_entry(void *entry) |
|---|
| 100 | 111 | { |
|---|
| 101 | | - return !((unsigned long)entry & RADIX_DAX_PMD); |
|---|
| 112 | + return !(xa_to_value(entry) & DAX_PMD); |
|---|
| 102 | 113 | } |
|---|
| 103 | 114 | |
|---|
| 104 | 115 | static int dax_is_zero_entry(void *entry) |
|---|
| 105 | 116 | { |
|---|
| 106 | | - return (unsigned long)entry & RADIX_DAX_ZERO_PAGE; |
|---|
| 117 | + return xa_to_value(entry) & DAX_ZERO_PAGE; |
|---|
| 107 | 118 | } |
|---|
| 108 | 119 | |
|---|
| 109 | 120 | static int dax_is_empty_entry(void *entry) |
|---|
| 110 | 121 | { |
|---|
| 111 | | - return (unsigned long)entry & RADIX_DAX_EMPTY; |
|---|
| 122 | + return xa_to_value(entry) & DAX_EMPTY; |
|---|
| 112 | 123 | } |
|---|
| 113 | 124 | |
|---|
| 114 | 125 | /* |
|---|
| 115 | | - * DAX radix tree locking |
|---|
| 126 | + * true if the entry that was found is of a smaller order than the entry |
|---|
| 127 | + * we were looking for |
|---|
| 128 | + */ |
|---|
| 129 | +static bool dax_is_conflict(void *entry) |
|---|
| 130 | +{ |
|---|
| 131 | + return entry == XA_RETRY_ENTRY; |
|---|
| 132 | +} |
|---|
| 133 | + |
|---|
| 134 | +/* |
|---|
| 135 | + * DAX page cache entry locking |
|---|
| 116 | 136 | */ |
|---|
| 117 | 137 | struct exceptional_entry_key { |
|---|
| 118 | | - struct address_space *mapping; |
|---|
| 138 | + struct xarray *xa; |
|---|
| 119 | 139 | pgoff_t entry_start; |
|---|
| 120 | 140 | }; |
|---|
| 121 | 141 | |
|---|
| .. | .. |
|---|
| 124 | 144 | struct exceptional_entry_key key; |
|---|
| 125 | 145 | }; |
|---|
| 126 | 146 | |
|---|
| 127 | | -static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping, |
|---|
| 128 | | - pgoff_t index, void *entry, struct exceptional_entry_key *key) |
|---|
| 147 | +/** |
|---|
| 148 | + * enum dax_wake_mode: waitqueue wakeup behaviour |
|---|
| 149 | + * @WAKE_ALL: wake all waiters in the waitqueue |
|---|
| 150 | + * @WAKE_NEXT: wake only the first waiter in the waitqueue |
|---|
| 151 | + */ |
|---|
| 152 | +enum dax_wake_mode { |
|---|
| 153 | + WAKE_ALL, |
|---|
| 154 | + WAKE_NEXT, |
|---|
| 155 | +}; |
|---|
| 156 | + |
|---|
| 157 | +static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas, |
|---|
| 158 | + void *entry, struct exceptional_entry_key *key) |
|---|
| 129 | 159 | { |
|---|
| 130 | 160 | unsigned long hash; |
|---|
| 161 | + unsigned long index = xas->xa_index; |
|---|
| 131 | 162 | |
|---|
| 132 | 163 | /* |
|---|
| 133 | 164 | * If 'entry' is a PMD, align the 'index' that we use for the wait |
|---|
| .. | .. |
|---|
| 136 | 167 | */ |
|---|
| 137 | 168 | if (dax_is_pmd_entry(entry)) |
|---|
| 138 | 169 | index &= ~PG_PMD_COLOUR; |
|---|
| 139 | | - |
|---|
| 140 | | - key->mapping = mapping; |
|---|
| 170 | + key->xa = xas->xa; |
|---|
| 141 | 171 | key->entry_start = index; |
|---|
| 142 | 172 | |
|---|
| 143 | | - hash = hash_long((unsigned long)mapping ^ index, DAX_WAIT_TABLE_BITS); |
|---|
| 173 | + hash = hash_long((unsigned long)xas->xa ^ index, DAX_WAIT_TABLE_BITS); |
|---|
| 144 | 174 | return wait_table + hash; |
|---|
| 145 | 175 | } |
|---|
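dax_entry_waitqueue() now keys the hash on the struct xarray pointer instead of the address_space, and PMD-sized entries are normalised to their first index so that every waiter on the same 2MiB entry lands on the same bucket of the 4096-entry wait table. A standalone sketch of that bucketing — hash_long() is stood in for by a simple 64-bit multiplicative hash, and the constant is an assumption, not the kernel's definition:

```c
#include <stdio.h>
#include <stdint.h>

#define DAX_WAIT_TABLE_BITS	12
#define PG_PMD_COLOUR		511UL	/* 2MiB PMD / 4KiB pages - 1 */

/* Stand-in for the kernel's hash_long(). */
static unsigned long hash_long(uint64_t val, unsigned int bits)
{
	return (unsigned long)((val * 0x61C8864680B583EBULL) >> (64 - bits));
}

static unsigned long bucket(const void *xa, unsigned long index, int is_pmd)
{
	if (is_pmd)
		index &= ~PG_PMD_COLOUR;	/* normalise to the PMD start */
	return hash_long((unsigned long)xa ^ index, DAX_WAIT_TABLE_BITS);
}

int main(void)
{
	int xa;		/* any address will do as the "xarray" identity */

	/* Two waiters that found the same PMD entry at different offsets
	 * within it (0x205 and 0x200) hash to the same wait queue, so a
	 * wakeup on the entry reaches both of them. */
	printf("bucket(0x205)=%lu bucket(0x200)=%lu\n",
	       bucket(&xa, 0x205, 1), bucket(&xa, 0x200, 1));
	return 0;
}
```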
| 146 | 176 | |
|---|
| 147 | | -static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mode, |
|---|
| 148 | | - int sync, void *keyp) |
|---|
| 177 | +static int wake_exceptional_entry_func(wait_queue_entry_t *wait, |
|---|
| 178 | + unsigned int mode, int sync, void *keyp) |
|---|
| 149 | 179 | { |
|---|
| 150 | 180 | struct exceptional_entry_key *key = keyp; |
|---|
| 151 | 181 | struct wait_exceptional_entry_queue *ewait = |
|---|
| 152 | 182 | container_of(wait, struct wait_exceptional_entry_queue, wait); |
|---|
| 153 | 183 | |
|---|
| 154 | | - if (key->mapping != ewait->key.mapping || |
|---|
| 184 | + if (key->xa != ewait->key.xa || |
|---|
| 155 | 185 | key->entry_start != ewait->key.entry_start) |
|---|
| 156 | 186 | return 0; |
|---|
| 157 | 187 | return autoremove_wake_function(wait, mode, sync, NULL); |
|---|
| .. | .. |
|---|
| 162 | 192 | * The important information it's conveying is whether the entry at |
|---|
| 163 | 193 | * this index used to be a PMD entry. |
|---|
| 164 | 194 | */ |
|---|
| 165 | | -static void dax_wake_mapping_entry_waiter(struct address_space *mapping, |
|---|
| 166 | | - pgoff_t index, void *entry, bool wake_all) |
|---|
| 195 | +static void dax_wake_entry(struct xa_state *xas, void *entry, |
|---|
| 196 | + enum dax_wake_mode mode) |
|---|
| 167 | 197 | { |
|---|
| 168 | 198 | struct exceptional_entry_key key; |
|---|
| 169 | 199 | wait_queue_head_t *wq; |
|---|
| 170 | 200 | |
|---|
| 171 | | - wq = dax_entry_waitqueue(mapping, index, entry, &key); |
|---|
| 201 | + wq = dax_entry_waitqueue(xas, entry, &key); |
|---|
| 172 | 202 | |
|---|
| 173 | 203 | /* |
|---|
| 174 | 204 | * Checking for locked entry and prepare_to_wait_exclusive() happens |
|---|
| .. | .. |
|---|
| 177 | 207 | * must be in the waitqueue and the following check will see them. |
|---|
| 178 | 208 | */ |
|---|
| 179 | 209 | if (waitqueue_active(wq)) |
|---|
| 180 | | - __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key); |
|---|
| 210 | + __wake_up(wq, TASK_NORMAL, mode == WAKE_ALL ? 0 : 1, &key); |
|---|
| 181 | 211 | } |
|---|
| 182 | 212 | |
|---|
| 183 | 213 | /* |
|---|
| 184 | | - * Check whether the given slot is locked. Must be called with the i_pages |
|---|
| 185 | | - * lock held. |
|---|
| 186 | | - */ |
|---|
| 187 | | -static inline int slot_locked(struct address_space *mapping, void **slot) |
|---|
| 188 | | -{ |
|---|
| 189 | | - unsigned long entry = (unsigned long) |
|---|
| 190 | | - radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock); |
|---|
| 191 | | - return entry & RADIX_DAX_ENTRY_LOCK; |
|---|
| 192 | | -} |
|---|
| 193 | | - |
|---|
| 194 | | -/* |
|---|
| 195 | | - * Mark the given slot as locked. Must be called with the i_pages lock held. |
|---|
| 196 | | - */ |
|---|
| 197 | | -static inline void *lock_slot(struct address_space *mapping, void **slot) |
|---|
| 198 | | -{ |
|---|
| 199 | | - unsigned long entry = (unsigned long) |
|---|
| 200 | | - radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock); |
|---|
| 201 | | - |
|---|
| 202 | | - entry |= RADIX_DAX_ENTRY_LOCK; |
|---|
| 203 | | - radix_tree_replace_slot(&mapping->i_pages, slot, (void *)entry); |
|---|
| 204 | | - return (void *)entry; |
|---|
| 205 | | -} |
|---|
| 206 | | - |
|---|
| 207 | | -/* |
|---|
| 208 | | - * Mark the given slot as unlocked. Must be called with the i_pages lock held. |
|---|
| 209 | | - */ |
|---|
| 210 | | -static inline void *unlock_slot(struct address_space *mapping, void **slot) |
|---|
| 211 | | -{ |
|---|
| 212 | | - unsigned long entry = (unsigned long) |
|---|
| 213 | | - radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock); |
|---|
| 214 | | - |
|---|
| 215 | | - entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK; |
|---|
| 216 | | - radix_tree_replace_slot(&mapping->i_pages, slot, (void *)entry); |
|---|
| 217 | | - return (void *)entry; |
|---|
| 218 | | -} |
|---|
| 219 | | - |
|---|
| 220 | | -static void put_unlocked_mapping_entry(struct address_space *mapping, |
|---|
| 221 | | - pgoff_t index, void *entry); |
|---|
| 222 | | - |
|---|
| 223 | | -/* |
|---|
| 224 | | - * Lookup entry in radix tree, wait for it to become unlocked if it is |
|---|
| 225 | | - * exceptional entry and return it. The caller must call |
|---|
| 226 | | - * put_unlocked_mapping_entry() when he decided not to lock the entry or |
|---|
| 227 | | - * put_locked_mapping_entry() when he locked the entry and now wants to |
|---|
| 228 | | - * unlock it. |
|---|
| 214 | + * Look up entry in page cache, wait for it to become unlocked if it |
|---|
| 215 | + * is a DAX entry and return it. The caller must subsequently call |
|---|
| 216 | + * put_unlocked_entry() if it did not lock the entry or dax_unlock_entry() |
|---|
| 217 | + * if it did. The entry returned may have a larger order than @order. |
|---|
| 218 | + * If @order is larger than the order of the entry found in i_pages, this |
|---|
| 219 | + * function returns a dax_is_conflict entry. |
|---|
| 229 | 220 | * |
|---|
| 230 | 221 | * Must be called with the i_pages lock held. |
|---|
| 231 | 222 | */ |
|---|
| 232 | | -static void *get_unlocked_mapping_entry(struct address_space *mapping, |
|---|
| 233 | | - pgoff_t index, void ***slotp) |
|---|
| 223 | +static void *get_unlocked_entry(struct xa_state *xas, unsigned int order) |
|---|
| 234 | 224 | { |
|---|
| 235 | | - void *entry, **slot; |
|---|
| 225 | + void *entry; |
|---|
| 236 | 226 | struct wait_exceptional_entry_queue ewait; |
|---|
| 237 | 227 | wait_queue_head_t *wq; |
|---|
| 238 | 228 | |
|---|
| .. | .. |
|---|
| 240 | 230 | ewait.wait.func = wake_exceptional_entry_func; |
|---|
| 241 | 231 | |
|---|
| 242 | 232 | for (;;) { |
|---|
| 243 | | - entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, |
|---|
| 244 | | - &slot); |
|---|
| 245 | | - if (!entry || |
|---|
| 246 | | - WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)) || |
|---|
| 247 | | - !slot_locked(mapping, slot)) { |
|---|
| 248 | | - if (slotp) |
|---|
| 249 | | - *slotp = slot; |
|---|
| 233 | + entry = xas_find_conflict(xas); |
|---|
| 234 | + if (!entry || WARN_ON_ONCE(!xa_is_value(entry))) |
|---|
| 250 | 235 | return entry; |
|---|
| 251 | | - } |
|---|
| 236 | + if (dax_entry_order(entry) < order) |
|---|
| 237 | + return XA_RETRY_ENTRY; |
|---|
| 238 | + if (!dax_is_locked(entry)) |
|---|
| 239 | + return entry; |
|---|
| 252 | 240 | |
|---|
| 253 | | - wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key); |
|---|
| 241 | + wq = dax_entry_waitqueue(xas, entry, &ewait.key); |
|---|
| 254 | 242 | prepare_to_wait_exclusive(wq, &ewait.wait, |
|---|
| 255 | 243 | TASK_UNINTERRUPTIBLE); |
|---|
| 256 | | - xa_unlock_irq(&mapping->i_pages); |
|---|
| 244 | + xas_unlock_irq(xas); |
|---|
| 245 | + xas_reset(xas); |
|---|
| 257 | 246 | schedule(); |
|---|
| 258 | 247 | finish_wait(wq, &ewait.wait); |
|---|
| 259 | | - xa_lock_irq(&mapping->i_pages); |
|---|
| 248 | + xas_lock_irq(xas); |
|---|
| 260 | 249 | } |
|---|
| 261 | 250 | } |
|---|
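get_unlocked_entry() implements a classic "sleep until the lock bit clears, then retry" protocol: it queues exclusively on the hashed wait queue, drops the xa_lock, sleeps, and re-checks the entry after waking. A rough userspace analogue using pthreads — purely illustrative, since the kernel uses its own waitqueues and wake functions rather than condition variables — where WAKE_NEXT corresponds to pthread_cond_signal() and WAKE_ALL to pthread_cond_broadcast():

```c
#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t xa_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t waitq = PTHREAD_COND_INITIALIZER;
static bool entry_locked;		/* stands in for DAX_LOCKED */

/* Analogue of get_unlocked_entry(): returns with xa_lock held, entry unlocked */
static void get_unlocked(void)
{
	pthread_mutex_lock(&xa_lock);
	while (entry_locked)
		/* drops xa_lock while sleeping, re-acquires before returning */
		pthread_cond_wait(&waitq, &xa_lock);
}

/* Analogue of dax_unlock_entry() followed by dax_wake_entry() */
static void unlock_and_wake(bool wake_all)
{
	pthread_mutex_lock(&xa_lock);
	entry_locked = false;
	pthread_mutex_unlock(&xa_lock);
	if (wake_all)
		pthread_cond_broadcast(&waitq);	/* WAKE_ALL */
	else
		pthread_cond_signal(&waitq);	/* WAKE_NEXT */
}

int main(void)
{
	get_unlocked();			/* returns with xa_lock held */
	entry_locked = true;		/* analogue of dax_lock_entry() */
	pthread_mutex_unlock(&xa_lock);
	unlock_and_wake(false);		/* later: unlock and wake one waiter */
	return 0;
}
```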
| 262 | 251 | |
|---|
| .. | .. |
|---|
| 265 | 254 | * (it's cycled in clear_inode() after removing the entries from i_pages) |
|---|
| 266 | 255 | * After we call xas_unlock_irq(), we cannot touch xas->xa. |
|---|
| 267 | 256 | */ |
|---|
| 268 | | -static void wait_entry_unlocked(struct address_space *mapping, pgoff_t index, |
|---|
| 269 | | - void ***slotp, void *entry) |
|---|
| 257 | +static void wait_entry_unlocked(struct xa_state *xas, void *entry) |
|---|
| 270 | 258 | { |
|---|
| 271 | 259 | struct wait_exceptional_entry_queue ewait; |
|---|
| 272 | 260 | wait_queue_head_t *wq; |
|---|
| .. | .. |
|---|
| 274 | 262 | init_wait(&ewait.wait); |
|---|
| 275 | 263 | ewait.wait.func = wake_exceptional_entry_func; |
|---|
| 276 | 264 | |
|---|
| 277 | | - wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key); |
|---|
| 265 | + wq = dax_entry_waitqueue(xas, entry, &ewait.key); |
|---|
| 278 | 266 | /* |
|---|
| 279 | 267 | * Unlike get_unlocked_entry() there is no guarantee that this |
|---|
| 280 | 268 | * path ever successfully retrieves an unlocked entry before an |
|---|
| .. | .. |
|---|
| 282 | 270 | * never successfully performs its own wake up. |
|---|
| 283 | 271 | */ |
|---|
| 284 | 272 | prepare_to_wait(wq, &ewait.wait, TASK_UNINTERRUPTIBLE); |
|---|
| 285 | | - xa_unlock_irq(&mapping->i_pages); |
|---|
| 273 | + xas_unlock_irq(xas); |
|---|
| 286 | 274 | schedule(); |
|---|
| 287 | 275 | finish_wait(wq, &ewait.wait); |
|---|
| 288 | 276 | } |
|---|
| 289 | 277 | |
|---|
| 290 | | -static void unlock_mapping_entry(struct address_space *mapping, pgoff_t index) |
|---|
| 278 | +static void put_unlocked_entry(struct xa_state *xas, void *entry, |
|---|
| 279 | + enum dax_wake_mode mode) |
|---|
| 291 | 280 | { |
|---|
| 292 | | - void *entry, **slot; |
|---|
| 293 | | - |
|---|
| 294 | | - xa_lock_irq(&mapping->i_pages); |
|---|
| 295 | | - entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, &slot); |
|---|
| 296 | | - if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) || |
|---|
| 297 | | - !slot_locked(mapping, slot))) { |
|---|
| 298 | | - xa_unlock_irq(&mapping->i_pages); |
|---|
| 299 | | - return; |
|---|
| 300 | | - } |
|---|
| 301 | | - unlock_slot(mapping, slot); |
|---|
| 302 | | - xa_unlock_irq(&mapping->i_pages); |
|---|
| 303 | | - dax_wake_mapping_entry_waiter(mapping, index, entry, false); |
|---|
| 304 | | -} |
|---|
| 305 | | - |
|---|
| 306 | | -static void put_locked_mapping_entry(struct address_space *mapping, |
|---|
| 307 | | - pgoff_t index) |
|---|
| 308 | | -{ |
|---|
| 309 | | - unlock_mapping_entry(mapping, index); |
|---|
| 281 | + if (entry && !dax_is_conflict(entry)) |
|---|
| 282 | + dax_wake_entry(xas, entry, mode); |
|---|
| 310 | 283 | } |
|---|
| 311 | 284 | |
|---|
| 312 | 285 | /* |
|---|
| 313 | | - * Called when we are done with radix tree entry we looked up via |
|---|
| 314 | | - * get_unlocked_mapping_entry() and which we didn't lock in the end. |
|---|
| 286 | + * We used the xa_state to get the entry, but then we locked the entry and |
|---|
| 287 | + * dropped the xa_lock, so we know the xa_state is stale and must be reset |
|---|
| 288 | + * before use. |
|---|
| 315 | 289 | */ |
|---|
| 316 | | -static void put_unlocked_mapping_entry(struct address_space *mapping, |
|---|
| 317 | | - pgoff_t index, void *entry) |
|---|
| 290 | +static void dax_unlock_entry(struct xa_state *xas, void *entry) |
|---|
| 318 | 291 | { |
|---|
| 319 | | - if (!entry) |
|---|
| 320 | | - return; |
|---|
| 292 | + void *old; |
|---|
| 321 | 293 | |
|---|
| 322 | | - /* We have to wake up next waiter for the radix tree entry lock */ |
|---|
| 323 | | - dax_wake_mapping_entry_waiter(mapping, index, entry, false); |
|---|
| 294 | + BUG_ON(dax_is_locked(entry)); |
|---|
| 295 | + xas_reset(xas); |
|---|
| 296 | + xas_lock_irq(xas); |
|---|
| 297 | + old = xas_store(xas, entry); |
|---|
| 298 | + xas_unlock_irq(xas); |
|---|
| 299 | + BUG_ON(!dax_is_locked(old)); |
|---|
| 300 | + dax_wake_entry(xas, entry, WAKE_NEXT); |
|---|
| 301 | +} |
|---|
| 302 | + |
|---|
| 303 | +/* |
|---|
| 304 | + * Return: The entry stored at this location before it was locked. |
|---|
| 305 | + */ |
|---|
| 306 | +static void *dax_lock_entry(struct xa_state *xas, void *entry) |
|---|
| 307 | +{ |
|---|
| 308 | + unsigned long v = xa_to_value(entry); |
|---|
| 309 | + return xas_store(xas, xa_mk_value(v | DAX_LOCKED)); |
|---|
| 324 | 310 | } |
|---|
| 325 | 311 | |
|---|
| 326 | 312 | static unsigned long dax_entry_size(void *entry) |
|---|
| .. | .. |
|---|
| 335 | 321 | return PAGE_SIZE; |
|---|
| 336 | 322 | } |
|---|
| 337 | 323 | |
|---|
| 338 | | -static unsigned long dax_radix_end_pfn(void *entry) |
|---|
| 324 | +static unsigned long dax_end_pfn(void *entry) |
|---|
| 339 | 325 | { |
|---|
| 340 | | - return dax_radix_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE; |
|---|
| 326 | + return dax_to_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE; |
|---|
| 341 | 327 | } |
|---|
| 342 | 328 | |
|---|
| 343 | 329 | /* |
|---|
| .. | .. |
|---|
| 345 | 331 | * 'empty' and 'zero' entries. |
|---|
| 346 | 332 | */ |
|---|
| 347 | 333 | #define for_each_mapped_pfn(entry, pfn) \ |
|---|
| 348 | | - for (pfn = dax_radix_pfn(entry); \ |
|---|
| 349 | | - pfn < dax_radix_end_pfn(entry); pfn++) |
|---|
| 334 | + for (pfn = dax_to_pfn(entry); \ |
|---|
| 335 | + pfn < dax_end_pfn(entry); pfn++) |
|---|
| 350 | 336 | |
|---|
| 351 | 337 | /* |
|---|
| 352 | 338 | * TODO: for reflink+dax we need a way to associate a single page with |
|---|
| .. | .. |
|---|
| 403 | 389 | return NULL; |
|---|
| 404 | 390 | } |
|---|
| 405 | 391 | |
|---|
| 406 | | -bool dax_lock_mapping_entry(struct page *page) |
|---|
| 392 | +/* |
|---|
| 393 | + * dax_lock_page - Lock the DAX entry corresponding to a page |
|---|
| 394 | + * @page: The page whose entry we want to lock |
|---|
| 395 | + * |
|---|
| 396 | + * Context: Process context. |
|---|
| 397 | + * Return: A cookie to pass to dax_unlock_page() or 0 if the entry could |
|---|
| 398 | + * not be locked. |
|---|
| 399 | + */ |
|---|
| 400 | +dax_entry_t dax_lock_page(struct page *page) |
|---|
| 407 | 401 | { |
|---|
| 408 | | - pgoff_t index; |
|---|
| 409 | | - struct inode *inode; |
|---|
| 410 | | - bool did_lock = false; |
|---|
| 411 | | - void *entry = NULL, **slot; |
|---|
| 412 | | - struct address_space *mapping; |
|---|
| 402 | + XA_STATE(xas, NULL, 0); |
|---|
| 403 | + void *entry; |
|---|
| 413 | 404 | |
|---|
| 405 | + /* Ensure page->mapping isn't freed while we look at it */ |
|---|
| 414 | 406 | rcu_read_lock(); |
|---|
| 415 | 407 | for (;;) { |
|---|
| 416 | | - mapping = READ_ONCE(page->mapping); |
|---|
| 408 | + struct address_space *mapping = READ_ONCE(page->mapping); |
|---|
| 417 | 409 | |
|---|
| 410 | + entry = NULL; |
|---|
| 418 | 411 | if (!mapping || !dax_mapping(mapping)) |
|---|
| 419 | 412 | break; |
|---|
| 420 | 413 | |
|---|
| .. | .. |
|---|
| 425 | 418 | * otherwise we would not have a valid pfn_to_page() |
|---|
| 426 | 419 | * translation. |
|---|
| 427 | 420 | */ |
|---|
| 428 | | - inode = mapping->host; |
|---|
| 429 | | - if (S_ISCHR(inode->i_mode)) { |
|---|
| 430 | | - did_lock = true; |
|---|
| 421 | + entry = (void *)~0UL; |
|---|
| 422 | + if (S_ISCHR(mapping->host->i_mode)) |
|---|
| 431 | 423 | break; |
|---|
| 432 | | - } |
|---|
| 433 | 424 | |
|---|
| 434 | | - xa_lock_irq(&mapping->i_pages); |
|---|
| 425 | + xas.xa = &mapping->i_pages; |
|---|
| 426 | + xas_lock_irq(&xas); |
|---|
| 435 | 427 | if (mapping != page->mapping) { |
|---|
| 436 | | - xa_unlock_irq(&mapping->i_pages); |
|---|
| 428 | + xas_unlock_irq(&xas); |
|---|
| 437 | 429 | continue; |
|---|
| 438 | 430 | } |
|---|
| 439 | | - index = page->index; |
|---|
| 440 | | - |
|---|
| 441 | | - entry = __radix_tree_lookup(&mapping->i_pages, index, |
|---|
| 442 | | - NULL, &slot); |
|---|
| 443 | | - if (!entry) { |
|---|
| 444 | | - xa_unlock_irq(&mapping->i_pages); |
|---|
| 445 | | - break; |
|---|
| 446 | | - } else if (slot_locked(mapping, slot)) { |
|---|
| 431 | + xas_set(&xas, page->index); |
|---|
| 432 | + entry = xas_load(&xas); |
|---|
| 433 | + if (dax_is_locked(entry)) { |
|---|
| 447 | 434 | rcu_read_unlock(); |
|---|
| 448 | | - wait_entry_unlocked(mapping, index, &slot, entry); |
|---|
| 435 | + wait_entry_unlocked(&xas, entry); |
|---|
| 449 | 436 | rcu_read_lock(); |
|---|
| 450 | 437 | continue; |
|---|
| 451 | 438 | } |
|---|
| 452 | | - lock_slot(mapping, slot); |
|---|
| 453 | | - did_lock = true; |
|---|
| 454 | | - xa_unlock_irq(&mapping->i_pages); |
|---|
| 439 | + dax_lock_entry(&xas, entry); |
|---|
| 440 | + xas_unlock_irq(&xas); |
|---|
| 455 | 441 | break; |
|---|
| 456 | 442 | } |
|---|
| 457 | 443 | rcu_read_unlock(); |
|---|
| 458 | | - |
|---|
| 459 | | - return did_lock; |
|---|
| 444 | + return (dax_entry_t)entry; |
|---|
| 460 | 445 | } |
|---|
| 461 | 446 | |
|---|
| 462 | | -void dax_unlock_mapping_entry(struct page *page) |
|---|
| 447 | +void dax_unlock_page(struct page *page, dax_entry_t cookie) |
|---|
| 463 | 448 | { |
|---|
| 464 | 449 | struct address_space *mapping = page->mapping; |
|---|
| 465 | | - struct inode *inode = mapping->host; |
|---|
| 450 | + XA_STATE(xas, &mapping->i_pages, page->index); |
|---|
| 466 | 451 | |
|---|
| 467 | | - if (S_ISCHR(inode->i_mode)) |
|---|
| 452 | + if (S_ISCHR(mapping->host->i_mode)) |
|---|
| 468 | 453 | return; |
|---|
| 469 | 454 | |
|---|
| 470 | | - unlock_mapping_entry(mapping, page->index); |
|---|
| 455 | + dax_unlock_entry(&xas, (void *)cookie); |
|---|
| 471 | 456 | } |
|---|
| 472 | 457 | |
|---|
| 473 | 458 | /* |
|---|
| 474 | | - * Find radix tree entry at given index. If it points to an exceptional entry, |
|---|
| 475 | | - * return it with the radix tree entry locked. If the radix tree doesn't |
|---|
| 476 | | - * contain given index, create an empty exceptional entry for the index and |
|---|
| 477 | | - * return with it locked. |
|---|
| 459 | + * Find page cache entry at given index. If it is a DAX entry, return it |
|---|
| 460 | + * with the entry locked. If the page cache doesn't contain an entry at |
|---|
| 461 | + * that index, add a locked empty entry. |
|---|
| 478 | 462 | * |
|---|
| 479 | | - * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will |
|---|
| 480 | | - * either return that locked entry or will return an error. This error will |
|---|
| 481 | | - * happen if there are any 4k entries within the 2MiB range that we are |
|---|
| 482 | | - * requesting. |
|---|
| 463 | + * When requesting an entry with size DAX_PMD, grab_mapping_entry() will |
|---|
| 464 | + * either return that locked entry or will return VM_FAULT_FALLBACK. |
|---|
| 465 | + * This will happen if there are any PTE entries within the PMD range |
|---|
| 466 | + * that we are requesting. |
|---|
| 483 | 467 | * |
|---|
| 484 | | - * We always favor 4k entries over 2MiB entries. There isn't a flow where we |
|---|
| 485 | | - * evict 4k entries in order to 'upgrade' them to a 2MiB entry. A 2MiB |
|---|
| 486 | | - * insertion will fail if it finds any 4k entries already in the tree, and a |
|---|
| 487 | | - * 4k insertion will cause an existing 2MiB entry to be unmapped and |
|---|
| 488 | | - * downgraded to 4k entries. This happens for both 2MiB huge zero pages as |
|---|
| 489 | | - * well as 2MiB empty entries. |
|---|
| 468 | + * We always favor PTE entries over PMD entries. There isn't a flow where we |
|---|
| 469 | + * evict PTE entries in order to 'upgrade' them to a PMD entry. A PMD |
|---|
| 470 | + * insertion will fail if it finds any PTE entries already in the tree, and a |
|---|
| 471 | + * PTE insertion will cause an existing PMD entry to be unmapped and |
|---|
| 472 | + * downgraded to PTE entries. This happens for both PMD zero pages as |
|---|
| 473 | + * well as PMD empty entries. |
|---|
| 490 | 474 | * |
|---|
| 491 | | - * The exception to this downgrade path is for 2MiB DAX PMD entries that have |
|---|
| 492 | | - * real storage backing them. We will leave these real 2MiB DAX entries in |
|---|
| 493 | | - * the tree, and PTE writes will simply dirty the entire 2MiB DAX entry. |
|---|
| 475 | + * The exception to this downgrade path is for PMD entries that have |
|---|
| 476 | + * real storage backing them. We will leave these real PMD entries in |
|---|
| 477 | + * the tree, and PTE writes will simply dirty the entire PMD entry. |
|---|
| 494 | 478 | * |
|---|
| 495 | 479 | * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For |
|---|
| 496 | 480 | * persistent memory the benefit is doubtful. We can add that later if we can |
|---|
| 497 | 481 | * show it helps. |
|---|
| 482 | + * |
|---|
| 483 | + * On error, this function does not return an ERR_PTR. Instead it returns |
|---|
| 484 | + * a VM_FAULT code, encoded as an xarray internal entry. The ERR_PTR values |
|---|
| 485 | + * overlap with xarray value entries. |
|---|
| 498 | 486 | */ |
|---|
| 499 | | -static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index, |
|---|
| 500 | | - unsigned long size_flag) |
|---|
| 487 | +static void *grab_mapping_entry(struct xa_state *xas, |
|---|
| 488 | + struct address_space *mapping, unsigned int order) |
|---|
| 501 | 489 | { |
|---|
| 502 | | - bool pmd_downgrade = false; /* splitting 2MiB entry into 4k entries? */ |
|---|
| 503 | | - void *entry, **slot; |
|---|
| 490 | + unsigned long index = xas->xa_index; |
|---|
| 491 | + bool pmd_downgrade; /* splitting PMD entry into PTE entries? */ |
|---|
| 492 | + void *entry; |
|---|
| 504 | 493 | |
|---|
| 505 | | -restart: |
|---|
| 506 | | - xa_lock_irq(&mapping->i_pages); |
|---|
| 507 | | - entry = get_unlocked_mapping_entry(mapping, index, &slot); |
|---|
| 508 | | - |
|---|
| 509 | | - if (WARN_ON_ONCE(entry && !radix_tree_exceptional_entry(entry))) { |
|---|
| 510 | | - entry = ERR_PTR(-EIO); |
|---|
| 511 | | - goto out_unlock; |
|---|
| 512 | | - } |
|---|
| 494 | +retry: |
|---|
| 495 | + pmd_downgrade = false; |
|---|
| 496 | + xas_lock_irq(xas); |
|---|
| 497 | + entry = get_unlocked_entry(xas, order); |
|---|
| 513 | 498 | |
|---|
| 514 | 499 | if (entry) { |
|---|
| 515 | | - if (size_flag & RADIX_DAX_PMD) { |
|---|
| 516 | | - if (dax_is_pte_entry(entry)) { |
|---|
| 517 | | - put_unlocked_mapping_entry(mapping, index, |
|---|
| 518 | | - entry); |
|---|
| 519 | | - entry = ERR_PTR(-EEXIST); |
|---|
| 520 | | - goto out_unlock; |
|---|
| 521 | | - } |
|---|
| 522 | | - } else { /* trying to grab a PTE entry */ |
|---|
| 500 | + if (dax_is_conflict(entry)) |
|---|
| 501 | + goto fallback; |
|---|
| 502 | + if (!xa_is_value(entry)) { |
|---|
| 503 | + xas_set_err(xas, -EIO); |
|---|
| 504 | + goto out_unlock; |
|---|
| 505 | + } |
|---|
| 506 | + |
|---|
| 507 | + if (order == 0) { |
|---|
| 523 | 508 | if (dax_is_pmd_entry(entry) && |
|---|
| 524 | 509 | (dax_is_zero_entry(entry) || |
|---|
| 525 | 510 | dax_is_empty_entry(entry))) { |
|---|
| .. | .. |
|---|
| 528 | 513 | } |
|---|
| 529 | 514 | } |
|---|
| 530 | 515 | |
|---|
| 531 | | - /* No entry for given index? Make sure radix tree is big enough. */ |
|---|
| 532 | | - if (!entry || pmd_downgrade) { |
|---|
| 533 | | - int err; |
|---|
| 516 | + if (pmd_downgrade) { |
|---|
| 517 | + /* |
|---|
| 518 | + * Make sure 'entry' remains valid while we drop |
|---|
| 519 | + * the i_pages lock. |
|---|
| 520 | + */ |
|---|
| 521 | + dax_lock_entry(xas, entry); |
|---|
| 534 | 522 | |
|---|
| 535 | | - if (pmd_downgrade) { |
|---|
| 536 | | - /* |
|---|
| 537 | | - * Make sure 'entry' remains valid while we drop |
|---|
| 538 | | - * the i_pages lock. |
|---|
| 539 | | - */ |
|---|
| 540 | | - entry = lock_slot(mapping, slot); |
|---|
| 541 | | - } |
|---|
| 542 | | - |
|---|
| 543 | | - xa_unlock_irq(&mapping->i_pages); |
|---|
| 544 | 523 | /* |
|---|
| 545 | 524 | * Besides huge zero pages the only other thing that gets |
|---|
| 546 | 525 | * downgraded are empty entries which don't need to be |
|---|
| 547 | 526 | * unmapped. |
|---|
| 548 | 527 | */ |
|---|
| 549 | | - if (pmd_downgrade && dax_is_zero_entry(entry)) |
|---|
| 550 | | - unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR, |
|---|
| 551 | | - PG_PMD_NR, false); |
|---|
| 552 | | - |
|---|
| 553 | | - err = radix_tree_preload( |
|---|
| 554 | | - mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM); |
|---|
| 555 | | - if (err) { |
|---|
| 556 | | - if (pmd_downgrade) |
|---|
| 557 | | - put_locked_mapping_entry(mapping, index); |
|---|
| 558 | | - return ERR_PTR(err); |
|---|
| 559 | | - } |
|---|
| 560 | | - xa_lock_irq(&mapping->i_pages); |
|---|
| 561 | | - |
|---|
| 562 | | - if (!entry) { |
|---|
| 563 | | - /* |
|---|
| 564 | | - * We needed to drop the i_pages lock while calling |
|---|
| 565 | | - * radix_tree_preload() and we didn't have an entry to |
|---|
| 566 | | - * lock. See if another thread inserted an entry at |
|---|
| 567 | | - * our index during this time. |
|---|
| 568 | | - */ |
|---|
| 569 | | - entry = __radix_tree_lookup(&mapping->i_pages, index, |
|---|
| 570 | | - NULL, &slot); |
|---|
| 571 | | - if (entry) { |
|---|
| 572 | | - radix_tree_preload_end(); |
|---|
| 573 | | - xa_unlock_irq(&mapping->i_pages); |
|---|
| 574 | | - goto restart; |
|---|
| 575 | | - } |
|---|
| 528 | + if (dax_is_zero_entry(entry)) { |
|---|
| 529 | + xas_unlock_irq(xas); |
|---|
| 530 | + unmap_mapping_pages(mapping, |
|---|
| 531 | + xas->xa_index & ~PG_PMD_COLOUR, |
|---|
| 532 | + PG_PMD_NR, false); |
|---|
| 533 | + xas_reset(xas); |
|---|
| 534 | + xas_lock_irq(xas); |
|---|
| 576 | 535 | } |
|---|
| 577 | 536 | |
|---|
| 578 | | - if (pmd_downgrade) { |
|---|
| 579 | | - dax_disassociate_entry(entry, mapping, false); |
|---|
| 580 | | - radix_tree_delete(&mapping->i_pages, index); |
|---|
| 581 | | - mapping->nrexceptional--; |
|---|
| 582 | | - dax_wake_mapping_entry_waiter(mapping, index, entry, |
|---|
| 583 | | - true); |
|---|
| 584 | | - } |
|---|
| 585 | | - |
|---|
| 586 | | - entry = dax_radix_locked_entry(0, size_flag | RADIX_DAX_EMPTY); |
|---|
| 587 | | - |
|---|
| 588 | | - err = __radix_tree_insert(&mapping->i_pages, index, |
|---|
| 589 | | - dax_radix_order(entry), entry); |
|---|
| 590 | | - radix_tree_preload_end(); |
|---|
| 591 | | - if (err) { |
|---|
| 592 | | - xa_unlock_irq(&mapping->i_pages); |
|---|
| 593 | | - /* |
|---|
| 594 | | - * Our insertion of a DAX entry failed, most likely |
|---|
| 595 | | - * because we were inserting a PMD entry and it |
|---|
| 596 | | - * collided with a PTE sized entry at a different |
|---|
| 597 | | - * index in the PMD range. We haven't inserted |
|---|
| 598 | | - * anything into the radix tree and have no waiters to |
|---|
| 599 | | - * wake. |
|---|
| 600 | | - */ |
|---|
| 601 | | - return ERR_PTR(err); |
|---|
| 602 | | - } |
|---|
| 603 | | - /* Good, we have inserted empty locked entry into the tree. */ |
|---|
| 604 | | - mapping->nrexceptional++; |
|---|
| 605 | | - xa_unlock_irq(&mapping->i_pages); |
|---|
| 606 | | - return entry; |
|---|
| 537 | + dax_disassociate_entry(entry, mapping, false); |
|---|
| 538 | + xas_store(xas, NULL); /* undo the PMD join */ |
|---|
| 539 | + dax_wake_entry(xas, entry, WAKE_ALL); |
|---|
| 540 | + mapping->nrexceptional--; |
|---|
| 541 | + entry = NULL; |
|---|
| 542 | + xas_set(xas, index); |
|---|
| 607 | 543 | } |
|---|
| 608 | | - entry = lock_slot(mapping, slot); |
|---|
| 609 | | - out_unlock: |
|---|
| 610 | | - xa_unlock_irq(&mapping->i_pages); |
|---|
| 544 | + |
|---|
| 545 | + if (entry) { |
|---|
| 546 | + dax_lock_entry(xas, entry); |
|---|
| 547 | + } else { |
|---|
| 548 | + unsigned long flags = DAX_EMPTY; |
|---|
| 549 | + |
|---|
| 550 | + if (order > 0) |
|---|
| 551 | + flags |= DAX_PMD; |
|---|
| 552 | + entry = dax_make_entry(pfn_to_pfn_t(0), flags); |
|---|
| 553 | + dax_lock_entry(xas, entry); |
|---|
| 554 | + if (xas_error(xas)) |
|---|
| 555 | + goto out_unlock; |
|---|
| 556 | + mapping->nrexceptional++; |
|---|
| 557 | + } |
|---|
| 558 | + |
|---|
| 559 | +out_unlock: |
|---|
| 560 | + xas_unlock_irq(xas); |
|---|
| 561 | + if (xas_nomem(xas, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM)) |
|---|
| 562 | + goto retry; |
|---|
| 563 | + if (xas->xa_node == XA_ERROR(-ENOMEM)) |
|---|
| 564 | + return xa_mk_internal(VM_FAULT_OOM); |
|---|
| 565 | + if (xas_error(xas)) |
|---|
| 566 | + return xa_mk_internal(VM_FAULT_SIGBUS); |
|---|
| 611 | 567 | return entry; |
|---|
| 568 | +fallback: |
|---|
| 569 | + xas_unlock_irq(xas); |
|---|
| 570 | + return xa_mk_internal(VM_FAULT_FALLBACK); |
|---|
| 612 | 571 | } |
|---|
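The policy spelled out in the comment above — favour PTE entries, fall back rather than upgrade, and only downgrade PMD entries that carry no storage — boils down to a small decision table. The sketch below captures just that decision; the names and the enum are illustrative, and the real grab_mapping_entry() additionally handles locking, allocation of the empty entry and the VM_FAULT encoding:

```c
#include <stdio.h>
#include <stdbool.h>

enum action { USE_EXISTING, DOWNGRADE_PMD, FALLBACK, CREATE_EMPTY };

/*
 * order: 0 for a PTE fault, PMD_ORDER for a PMD fault.
 * found_order: order of the entry already in the tree, or -1 if none.
 * zero_or_empty: the found entry is a zero page or empty (no storage).
 */
static enum action grab_policy(unsigned int order, int found_order,
			       bool zero_or_empty)
{
	if (found_order < 0)
		return CREATE_EMPTY;		/* insert a locked empty entry */
	if ((int)order > found_order)
		return FALLBACK;		/* PMD fault over PTE entries */
	if (order == 0 && found_order > 0 && zero_or_empty)
		return DOWNGRADE_PMD;		/* split huge zero/empty entry */
	return USE_EXISTING;			/* reuse (and lock) what's there */
}

int main(void)
{
	printf("%d %d %d %d\n",
	       grab_policy(9, 0, false),	/* FALLBACK */
	       grab_policy(0, 9, true),		/* DOWNGRADE_PMD */
	       grab_policy(0, 9, false),	/* USE_EXISTING: real PMD reused */
	       grab_policy(0, -1, false));	/* CREATE_EMPTY */
	return 0;
}
```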
| 613 | 572 | |
|---|
| 614 | 573 | /** |
|---|
| 615 | | - * dax_layout_busy_page - find first pinned page in @mapping |
|---|
| 574 | + * dax_layout_busy_page_range - find first pinned page in @mapping |
|---|
| 616 | 575 | * @mapping: address space to scan for a page with ref count > 1 |
|---|
| 576 | + * @start: Starting offset. Page containing 'start' is included. |
|---|
| 577 | + * @end: End offset. Page containing 'end' is included. If 'end' is LLONG_MAX, |
|---|
| 578 | + * pages from 'start' till the end of file are included. |
|---|
| 617 | 579 | * |
|---|
| 618 | 580 | * DAX requires ZONE_DEVICE mapped pages. These pages are never |
|---|
| 619 | 581 | * 'onlined' to the page allocator so they are considered idle when |
|---|
| .. | .. |
|---|
| 626 | 588 | * to be able to run unmap_mapping_range() and subsequently not race |
|---|
| 627 | 589 | * mapping_mapped() becoming true. |
|---|
| 628 | 590 | */ |
|---|
| 629 | | -struct page *dax_layout_busy_page(struct address_space *mapping) |
|---|
| 591 | +struct page *dax_layout_busy_page_range(struct address_space *mapping, |
|---|
| 592 | + loff_t start, loff_t end) |
|---|
| 630 | 593 | { |
|---|
| 631 | | - pgoff_t indices[PAGEVEC_SIZE]; |
|---|
| 594 | + void *entry; |
|---|
| 595 | + unsigned int scanned = 0; |
|---|
| 632 | 596 | struct page *page = NULL; |
|---|
| 633 | | - struct pagevec pvec; |
|---|
| 634 | | - pgoff_t index, end; |
|---|
| 635 | | - unsigned i; |
|---|
| 597 | + pgoff_t start_idx = start >> PAGE_SHIFT; |
|---|
| 598 | + pgoff_t end_idx; |
|---|
| 599 | + XA_STATE(xas, &mapping->i_pages, start_idx); |
|---|
| 636 | 600 | |
|---|
| 637 | 601 | /* |
|---|
| 638 | 602 | * In the 'limited' case get_user_pages() for dax is disabled. |
|---|
| .. | .. |
|---|
| 643 | 607 | if (!dax_mapping(mapping) || !mapping_mapped(mapping)) |
|---|
| 644 | 608 | return NULL; |
|---|
| 645 | 609 | |
|---|
| 646 | | - pagevec_init(&pvec); |
|---|
| 647 | | - index = 0; |
|---|
| 648 | | - end = -1; |
|---|
| 649 | | - |
|---|
| 610 | + /* If end == LLONG_MAX, all pages from start to the end of the file */ |
|---|
| 611 | + if (end == LLONG_MAX) |
|---|
| 612 | + end_idx = ULONG_MAX; |
|---|
| 613 | + else |
|---|
| 614 | + end_idx = end >> PAGE_SHIFT; |
|---|
| 650 | 615 | /* |
|---|
| 651 | 616 | * If we race get_user_pages_fast() here either we'll see the |
|---|
| 652 | | - * elevated page count in the pagevec_lookup and wait, or |
|---|
| 617 | + * elevated page count in the iteration and wait, or |
|---|
| 653 | 618 | * get_user_pages_fast() will see that the page it took a reference |
|---|
| 654 | 619 | * against is no longer mapped in the page tables and bail to the |
|---|
| 655 | 620 | * get_user_pages() slow path. The slow path is protected by |
|---|
| 656 | 621 | * pte_lock() and pmd_lock(). New references are not taken without |
|---|
| 657 | | - * holding those locks, and unmap_mapping_range() will not zero the |
|---|
| 622 | + * holding those locks, and unmap_mapping_pages() will not zero the |
|---|
| 658 | 623 | * pte or pmd without holding the respective lock, so we are |
|---|
| 659 | 624 | * guaranteed to either see new references or prevent new |
|---|
| 660 | 625 | * references from being established. |
|---|
| 661 | 626 | */ |
|---|
| 662 | | - unmap_mapping_range(mapping, 0, 0, 0); |
|---|
| 627 | + unmap_mapping_pages(mapping, start_idx, end_idx - start_idx + 1, 0); |
|---|
| 663 | 628 | |
|---|
| 664 | | - while (index < end && pagevec_lookup_entries(&pvec, mapping, index, |
|---|
| 665 | | - min(end - index, (pgoff_t)PAGEVEC_SIZE), |
|---|
| 666 | | - indices)) { |
|---|
| 667 | | - pgoff_t nr_pages = 1; |
|---|
| 668 | | - |
|---|
| 669 | | - for (i = 0; i < pagevec_count(&pvec); i++) { |
|---|
| 670 | | - struct page *pvec_ent = pvec.pages[i]; |
|---|
| 671 | | - void *entry; |
|---|
| 672 | | - |
|---|
| 673 | | - index = indices[i]; |
|---|
| 674 | | - if (index >= end) |
|---|
| 675 | | - break; |
|---|
| 676 | | - |
|---|
| 677 | | - if (WARN_ON_ONCE( |
|---|
| 678 | | - !radix_tree_exceptional_entry(pvec_ent))) |
|---|
| 679 | | - continue; |
|---|
| 680 | | - |
|---|
| 681 | | - xa_lock_irq(&mapping->i_pages); |
|---|
| 682 | | - entry = get_unlocked_mapping_entry(mapping, index, NULL); |
|---|
| 683 | | - if (entry) { |
|---|
| 684 | | - page = dax_busy_page(entry); |
|---|
| 685 | | - /* |
|---|
| 686 | | - * Account for multi-order entries at |
|---|
| 687 | | - * the end of the pagevec. |
|---|
| 688 | | - */ |
|---|
| 689 | | - if (i + 1 >= pagevec_count(&pvec)) |
|---|
| 690 | | - nr_pages = 1UL << dax_radix_order(entry); |
|---|
| 691 | | - } |
|---|
| 692 | | - put_unlocked_mapping_entry(mapping, index, entry); |
|---|
| 693 | | - xa_unlock_irq(&mapping->i_pages); |
|---|
| 694 | | - if (page) |
|---|
| 695 | | - break; |
|---|
| 696 | | - } |
|---|
| 697 | | - |
|---|
| 698 | | - /* |
|---|
| 699 | | - * We don't expect normal struct page entries to exist in our |
|---|
| 700 | | - * tree, but we keep these pagevec calls so that this code is |
|---|
| 701 | | - * consistent with the common pattern for handling pagevecs |
|---|
| 702 | | - * throughout the kernel. |
|---|
| 703 | | - */ |
|---|
| 704 | | - pagevec_remove_exceptionals(&pvec); |
|---|
| 705 | | - pagevec_release(&pvec); |
|---|
| 706 | | - index += nr_pages; |
|---|
| 707 | | - |
|---|
| 629 | + xas_lock_irq(&xas); |
|---|
| 630 | + xas_for_each(&xas, entry, end_idx) { |
|---|
| 631 | + if (WARN_ON_ONCE(!xa_is_value(entry))) |
|---|
| 632 | + continue; |
|---|
| 633 | + if (unlikely(dax_is_locked(entry))) |
|---|
| 634 | + entry = get_unlocked_entry(&xas, 0); |
|---|
| 635 | + if (entry) |
|---|
| 636 | + page = dax_busy_page(entry); |
|---|
| 637 | + put_unlocked_entry(&xas, entry, WAKE_NEXT); |
|---|
| 708 | 638 | if (page) |
|---|
| 709 | 639 | break; |
|---|
| 640 | + if (++scanned % XA_CHECK_SCHED) |
|---|
| 641 | + continue; |
|---|
| 642 | + |
|---|
| 643 | + xas_pause(&xas); |
|---|
| 644 | + xas_unlock_irq(&xas); |
|---|
| 645 | + cond_resched(); |
|---|
| 646 | + xas_lock_irq(&xas); |
|---|
| 710 | 647 | } |
|---|
| 648 | + xas_unlock_irq(&xas); |
|---|
| 711 | 649 | return page; |
|---|
| 650 | +} |
|---|
| 651 | +EXPORT_SYMBOL_GPL(dax_layout_busy_page_range); |
|---|
| 652 | + |
|---|
| 653 | +struct page *dax_layout_busy_page(struct address_space *mapping) |
|---|
| 654 | +{ |
|---|
| 655 | + return dax_layout_busy_page_range(mapping, 0, LLONG_MAX); |
|---|
| 712 | 656 | } |
|---|
| 713 | 657 | EXPORT_SYMBOL_GPL(dax_layout_busy_page); |
|---|
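dax_layout_busy_page_range() takes byte offsets and converts them to page indices, treating end == LLONG_MAX as "to end of file", so the original dax_layout_busy_page() becomes the (0, LLONG_MAX) special case. A quick sketch of that conversion with example values (the helper name and figures are illustrative only):

```c
#include <stdio.h>
#include <limits.h>

#define PAGE_SHIFT 12	/* assumed 4KiB pages */

static void range_to_indices(long long start, long long end,
			     unsigned long *start_idx, unsigned long *end_idx)
{
	*start_idx = (unsigned long long)start >> PAGE_SHIFT;
	/* LLONG_MAX means "everything from start to the end of the file" */
	*end_idx = (end == LLONG_MAX) ? ULONG_MAX :
		   (unsigned long long)end >> PAGE_SHIFT;
}

int main(void)
{
	unsigned long s, e;

	range_to_indices(4096, 3 * 4096 - 1, &s, &e);
	printf("bytes [4096, 12287] -> pages [%lu, %lu]\n", s, e);	/* 1..2 */

	range_to_indices(0, LLONG_MAX, &s, &e);
	printf("whole file -> pages [%lu, %lu]\n", s, e);		/* 0..ULONG_MAX */
	return 0;
}
```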
| 714 | 658 | |
|---|
| 715 | | -static int __dax_invalidate_mapping_entry(struct address_space *mapping, |
|---|
| 659 | +static int __dax_invalidate_entry(struct address_space *mapping, |
|---|
| 716 | 660 | pgoff_t index, bool trunc) |
|---|
| 717 | 661 | { |
|---|
| 662 | + XA_STATE(xas, &mapping->i_pages, index); |
|---|
| 718 | 663 | int ret = 0; |
|---|
| 719 | 664 | void *entry; |
|---|
| 720 | | - struct radix_tree_root *pages = &mapping->i_pages; |
|---|
| 721 | 665 | |
|---|
| 722 | | - xa_lock_irq(pages); |
|---|
| 723 | | - entry = get_unlocked_mapping_entry(mapping, index, NULL); |
|---|
| 724 | | - if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry))) |
|---|
| 666 | + xas_lock_irq(&xas); |
|---|
| 667 | + entry = get_unlocked_entry(&xas, 0); |
|---|
| 668 | + if (!entry || WARN_ON_ONCE(!xa_is_value(entry))) |
|---|
| 725 | 669 | goto out; |
|---|
| 726 | 670 | if (!trunc && |
|---|
| 727 | | - (radix_tree_tag_get(pages, index, PAGECACHE_TAG_DIRTY) || |
|---|
| 728 | | - radix_tree_tag_get(pages, index, PAGECACHE_TAG_TOWRITE))) |
|---|
| 671 | + (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) || |
|---|
| 672 | + xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE))) |
|---|
| 729 | 673 | goto out; |
|---|
| 730 | 674 | dax_disassociate_entry(entry, mapping, trunc); |
|---|
| 731 | | - radix_tree_delete(pages, index); |
|---|
| 675 | + xas_store(&xas, NULL); |
|---|
| 732 | 676 | mapping->nrexceptional--; |
|---|
| 733 | 677 | ret = 1; |
|---|
| 734 | 678 | out: |
|---|
| 735 | | - put_unlocked_mapping_entry(mapping, index, entry); |
|---|
| 736 | | - xa_unlock_irq(pages); |
|---|
| 679 | + put_unlocked_entry(&xas, entry, WAKE_ALL); |
|---|
| 680 | + xas_unlock_irq(&xas); |
|---|
| 737 | 681 | return ret; |
|---|
| 738 | 682 | } |
|---|
| 683 | + |
|---|
| 739 | 684 | /* |
|---|
| 740 | | - * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree |
|---|
| 741 | | - * entry to get unlocked before deleting it. |
|---|
| 685 | + * Delete DAX entry at @index from @mapping. Wait for it |
|---|
| 686 | + * to be unlocked before deleting it. |
|---|
| 742 | 687 | */ |
|---|
| 743 | 688 | int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index) |
|---|
| 744 | 689 | { |
|---|
| 745 | | - int ret = __dax_invalidate_mapping_entry(mapping, index, true); |
|---|
| 690 | + int ret = __dax_invalidate_entry(mapping, index, true); |
|---|
| 746 | 691 | |
|---|
| 747 | 692 | /* |
|---|
| 748 | 693 | * This gets called from truncate / punch_hole path. As such, the caller |
|---|
| 749 | 694 | * must hold locks protecting against concurrent modifications of the |
|---|
| 750 | | - * radix tree (usually fs-private i_mmap_sem for writing). Since the |
|---|
| 751 | | - * caller has seen exceptional entry for this index, we better find it |
|---|
| 695 | + * page cache (usually fs-private i_mmap_sem for writing). Since the |
|---|
| 696 | + * caller has seen a DAX entry for this index, we better find it |
|---|
| 752 | 697 | * at that index as well... |
|---|
| 753 | 698 | */ |
|---|
| 754 | 699 | WARN_ON_ONCE(!ret); |
|---|
| .. | .. |
|---|
| 756 | 701 | } |
|---|
| 757 | 702 | |
|---|
| 758 | 703 | /* |
|---|
| 759 | | - * Invalidate exceptional DAX entry if it is clean. |
|---|
| 704 | + * Invalidate DAX entry if it is clean. |
|---|
| 760 | 705 | */ |
|---|
| 761 | 706 | int dax_invalidate_mapping_entry_sync(struct address_space *mapping, |
|---|
| 762 | 707 | pgoff_t index) |
|---|
| 763 | 708 | { |
|---|
| 764 | | - return __dax_invalidate_mapping_entry(mapping, index, false); |
|---|
| 709 | + return __dax_invalidate_entry(mapping, index, false); |
|---|
| 765 | 710 | } |
|---|
| 766 | 711 | |
|---|
| 767 | | -static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev, |
|---|
| 768 | | - sector_t sector, size_t size, struct page *to, |
|---|
| 769 | | - unsigned long vaddr) |
|---|
| 712 | +static int copy_cow_page_dax(struct block_device *bdev, struct dax_device *dax_dev, |
|---|
| 713 | + sector_t sector, struct page *to, unsigned long vaddr) |
|---|
| 770 | 714 | { |
|---|
| 771 | 715 | void *vto, *kaddr; |
|---|
| 772 | 716 | pgoff_t pgoff; |
|---|
| 773 | 717 | long rc; |
|---|
| 774 | 718 | int id; |
|---|
| 775 | 719 | |
|---|
| 776 | | - rc = bdev_dax_pgoff(bdev, sector, size, &pgoff); |
|---|
| 720 | + rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff); |
|---|
| 777 | 721 | if (rc) |
|---|
| 778 | 722 | return rc; |
|---|
| 779 | 723 | |
|---|
| 780 | 724 | id = dax_read_lock(); |
|---|
| 781 | | - rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, NULL); |
|---|
| 725 | + rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(PAGE_SIZE), &kaddr, NULL); |
|---|
| 782 | 726 | if (rc < 0) { |
|---|
| 783 | 727 | dax_read_unlock(id); |
|---|
| 784 | 728 | return rc; |
|---|
| 785 | 729 | } |
|---|
| 786 | 730 | vto = kmap_atomic(to); |
|---|
| 731 | +#ifdef CONFIG_ARM |
|---|
| 732 | +#ifndef copy_user_page |
|---|
| 733 | +#define copy_user_page(to, from, vaddr, pg) copy_page(to, from) |
|---|
| 734 | +#endif |
|---|
| 735 | +#endif |
|---|
| 787 | 736 | copy_user_page(vto, (void __force *)kaddr, vaddr, to); |
|---|
| 788 | 737 | kunmap_atomic(vto); |
|---|
| 789 | 738 | dax_read_unlock(id); |
|---|
| .. | .. |
|---|
| 797 | 746 | * already in the tree, we will skip the insertion and just dirty the PMD as |
|---|
| 798 | 747 | * appropriate. |
|---|
| 799 | 748 | */ |
|---|
| 800 | | -static void *dax_insert_mapping_entry(struct address_space *mapping, |
|---|
| 801 | | - struct vm_fault *vmf, |
|---|
| 802 | | - void *entry, pfn_t pfn_t, |
|---|
| 803 | | - unsigned long flags, bool dirty) |
|---|
| 749 | +static void *dax_insert_entry(struct xa_state *xas, |
|---|
| 750 | + struct address_space *mapping, struct vm_fault *vmf, |
|---|
| 751 | + void *entry, pfn_t pfn, unsigned long flags, bool dirty) |
|---|
| 804 | 752 | { |
|---|
| 805 | | - struct radix_tree_root *pages = &mapping->i_pages; |
|---|
| 806 | | - unsigned long pfn = pfn_t_to_pfn(pfn_t); |
|---|
| 807 | | - pgoff_t index = vmf->pgoff; |
|---|
| 808 | | - void *new_entry; |
|---|
| 753 | + void *new_entry = dax_make_entry(pfn, flags); |
|---|
| 809 | 754 | |
|---|
| 810 | 755 | if (dirty) |
|---|
| 811 | 756 | __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); |
|---|
| 812 | 757 | |
|---|
| 813 | | - if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) { |
|---|
| 758 | + if (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE)) { |
|---|
| 759 | + unsigned long index = xas->xa_index; |
|---|
| 814 | 760 | /* we are replacing a zero page with block mapping */ |
|---|
| 815 | 761 | if (dax_is_pmd_entry(entry)) |
|---|
| 816 | 762 | unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR, |
|---|
| 817 | | - PG_PMD_NR, false); |
|---|
| 763 | + PG_PMD_NR, false); |
|---|
| 818 | 764 | else /* pte entry */ |
|---|
| 819 | | - unmap_mapping_pages(mapping, vmf->pgoff, 1, false); |
|---|
| 765 | + unmap_mapping_pages(mapping, index, 1, false); |
|---|
| 820 | 766 | } |
|---|
| 821 | 767 | |
|---|
| 822 | | - xa_lock_irq(pages); |
|---|
| 823 | | - new_entry = dax_radix_locked_entry(pfn, flags); |
|---|
| 824 | | - if (dax_entry_size(entry) != dax_entry_size(new_entry)) { |
|---|
| 768 | + xas_reset(xas); |
|---|
| 769 | + xas_lock_irq(xas); |
|---|
| 770 | + if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { |
|---|
| 771 | + void *old; |
|---|
| 772 | + |
|---|
| 825 | 773 | dax_disassociate_entry(entry, mapping, false); |
|---|
| 826 | 774 | dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address); |
|---|
| 827 | | - } |
|---|
| 828 | | - |
|---|
| 829 | | - if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { |
|---|
| 830 | 775 | /* |
|---|
| 831 | | - * Only swap our new entry into the radix tree if the current |
|---|
| 776 | + * Only swap our new entry into the page cache if the current |
|---|
| 832 | 777 | * entry is a zero page or an empty entry. If a normal PTE or |
|---|
| 833 | | - * PMD entry is already in the tree, we leave it alone. This |
|---|
| 778 | + * PMD entry is already in the cache, we leave it alone. This |
|---|
| 834 | 779 | * means that if we are trying to insert a PTE and the |
|---|
| 835 | 780 | * existing entry is a PMD, we will just leave the PMD in the |
|---|
| 836 | 781 | * tree and dirty it if necessary. |
|---|
| 837 | 782 | */ |
|---|
| 838 | | - struct radix_tree_node *node; |
|---|
| 839 | | - void **slot; |
|---|
| 840 | | - void *ret; |
|---|
| 841 | | - |
|---|
| 842 | | - ret = __radix_tree_lookup(pages, index, &node, &slot); |
|---|
| 843 | | - WARN_ON_ONCE(ret != entry); |
|---|
| 844 | | - __radix_tree_replace(pages, node, slot, |
|---|
| 845 | | - new_entry, NULL); |
|---|
| 783 | + old = dax_lock_entry(xas, new_entry); |
|---|
| 784 | + WARN_ON_ONCE(old != xa_mk_value(xa_to_value(entry) | |
|---|
| 785 | + DAX_LOCKED)); |
|---|
| 846 | 786 | entry = new_entry; |
|---|
| 787 | + } else { |
|---|
| 788 | + xas_load(xas); /* Walk the xa_state */ |
|---|
| 847 | 789 | } |
|---|
| 848 | 790 | |
|---|
| 849 | 791 | if (dirty) |
|---|
| 850 | | - radix_tree_tag_set(pages, index, PAGECACHE_TAG_DIRTY); |
|---|
| 792 | + xas_set_mark(xas, PAGECACHE_TAG_DIRTY); |
|---|
| 851 | 793 | |
|---|
| 852 | | - xa_unlock_irq(pages); |
|---|
| 794 | + xas_unlock_irq(xas); |
|---|
| 853 | 795 | return entry; |
|---|
| 854 | 796 | } |
|---|
| 855 | 797 | |
|---|
| 856 | | -static inline unsigned long |
|---|
| 857 | | -pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma) |
|---|
| 798 | +static inline |
|---|
| 799 | +unsigned long pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma) |
|---|
| 858 | 800 | { |
|---|
| 859 | 801 | unsigned long address; |
|---|
| 860 | 802 | |
|---|
| .. | .. |
|---|
| 864 | 806 | } |
|---|
| 865 | 807 | |
|---|
| 866 | 808 | /* Walk all mappings of a given index of a file and writeprotect them */ |
|---|
| 867 | | -static void dax_mapping_entry_mkclean(struct address_space *mapping, |
|---|
| 868 | | - pgoff_t index, unsigned long pfn) |
|---|
| 809 | +static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index, |
|---|
| 810 | + unsigned long pfn) |
|---|
| 869 | 811 | { |
|---|
| 870 | 812 | struct vm_area_struct *vma; |
|---|
| 871 | 813 | pte_t pte, *ptep = NULL; |
|---|
| .. | .. |
|---|
| 874 | 816 | |
|---|
| 875 | 817 | i_mmap_lock_read(mapping); |
|---|
| 876 | 818 | vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) { |
|---|
| 877 | | - unsigned long address, start, end; |
|---|
| 819 | + struct mmu_notifier_range range; |
|---|
| 820 | + unsigned long address; |
|---|
| 878 | 821 | |
|---|
| 879 | 822 | cond_resched(); |
|---|
| 880 | 823 | |
|---|
| .. | .. |
|---|
| 884 | 827 | address = pgoff_address(index, vma); |
|---|
| 885 | 828 | |
|---|
| 886 | 829 | /* |
|---|
| 887 | | - * Note because we provide start/end to follow_pte_pmd it will |
|---|
| 888 | | - * call mmu_notifier_invalidate_range_start() on our behalf |
|---|
| 889 | | - * before taking any lock. |
|---|
| 830 | + * follow_invalidate_pte() will use the range to call |
|---|
| 831 | + * mmu_notifier_invalidate_range_start() on our behalf before |
|---|
| 832 | + * taking any lock. |
|---|
| 890 | 833 | */ |
|---|
| 891 | | - if (follow_pte_pmd(vma->vm_mm, address, &start, &end, &ptep, &pmdp, &ptl)) |
|---|
| 834 | + if (follow_invalidate_pte(vma->vm_mm, address, &range, &ptep, |
|---|
| 835 | + &pmdp, &ptl)) |
|---|
| 892 | 836 | continue; |
|---|
| 893 | 837 | |
|---|
| 894 | 838 | /* |
|---|
| .. | .. |
|---|
| 907 | 851 | if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp)) |
|---|
| 908 | 852 | goto unlock_pmd; |
|---|
| 909 | 853 | |
|---|
| 910 | | - flush_cache_page(vma, address, pfn); |
|---|
| 854 | + flush_cache_range(vma, address, |
|---|
| 855 | + address + HPAGE_PMD_SIZE); |
|---|
| 911 | 856 | pmd = pmdp_invalidate(vma, address, pmdp); |
|---|
| 912 | 857 | pmd = pmd_wrprotect(pmd); |
|---|
| 913 | 858 | pmd = pmd_mkclean(pmd); |
|---|
| .. | .. |
|---|
| 930 | 875 | pte_unmap_unlock(ptep, ptl); |
|---|
| 931 | 876 | } |
|---|
| 932 | 877 | |
|---|
| 933 | | - mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); |
|---|
| 878 | + mmu_notifier_invalidate_range_end(&range); |
|---|
| 934 | 879 | } |
|---|
| 935 | 880 | i_mmap_unlock_read(mapping); |
|---|
| 936 | 881 | } |
|---|
| 937 | 882 | |
|---|
| 938 | | -static int dax_writeback_one(struct dax_device *dax_dev, |
|---|
| 939 | | - struct address_space *mapping, pgoff_t index, void *entry) |
|---|
| 883 | +static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev, |
|---|
| 884 | + struct address_space *mapping, void *entry) |
|---|
| 940 | 885 | { |
|---|
| 941 | | - struct radix_tree_root *pages = &mapping->i_pages; |
|---|
| 942 | | - void *entry2, **slot; |
|---|
| 943 | | - unsigned long pfn; |
|---|
| 886 | + unsigned long pfn, index, count; |
|---|
| 944 | 887 | long ret = 0; |
|---|
| 945 | | - size_t size; |
|---|
| 946 | 888 | |
|---|
| 947 | 889 | /* |
|---|
| 948 | 890 | * A page got tagged dirty in DAX mapping? Something is seriously |
|---|
| 949 | 891 | * wrong. |
|---|
| 950 | 892 | */ |
|---|
| 951 | | - if (WARN_ON(!radix_tree_exceptional_entry(entry))) |
|---|
| 893 | + if (WARN_ON(!xa_is_value(entry))) |
|---|
| 952 | 894 | return -EIO; |
|---|
| 953 | 895 | |
|---|
| 954 | | - xa_lock_irq(pages); |
|---|
| 955 | | - entry2 = get_unlocked_mapping_entry(mapping, index, &slot); |
|---|
| 956 | | - /* Entry got punched out / reallocated? */ |
|---|
| 957 | | - if (!entry2 || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry2))) |
|---|
| 958 | | - goto put_unlocked; |
|---|
| 959 | | - /* |
|---|
| 960 | | - * Entry got reallocated elsewhere? No need to writeback. We have to |
|---|
| 961 | | - * compare pfns as we must not bail out due to difference in lockbit |
|---|
| 962 | | - * or entry type. |
|---|
| 963 | | - */ |
|---|
| 964 | | - if (dax_radix_pfn(entry2) != dax_radix_pfn(entry)) |
|---|
| 965 | | - goto put_unlocked; |
|---|
| 966 | | - if (WARN_ON_ONCE(dax_is_empty_entry(entry) || |
|---|
| 967 | | - dax_is_zero_entry(entry))) { |
|---|
| 968 | | - ret = -EIO; |
|---|
| 969 | | - goto put_unlocked; |
|---|
| 896 | + if (unlikely(dax_is_locked(entry))) { |
|---|
| 897 | + void *old_entry = entry; |
|---|
| 898 | + |
|---|
| 899 | + entry = get_unlocked_entry(xas, 0); |
|---|
| 900 | + |
|---|
| 901 | + /* Entry got punched out / reallocated? */ |
|---|
| 902 | + if (!entry || WARN_ON_ONCE(!xa_is_value(entry))) |
|---|
| 903 | + goto put_unlocked; |
|---|
| 904 | + /* |
|---|
| 905 | + * Entry got reallocated elsewhere? No need to writeback. |
|---|
| 906 | + * We have to compare pfns as we must not bail out due to |
|---|
| 907 | + * difference in lockbit or entry type. |
|---|
| 908 | + */ |
|---|
| 909 | + if (dax_to_pfn(old_entry) != dax_to_pfn(entry)) |
|---|
| 910 | + goto put_unlocked; |
|---|
| 911 | + if (WARN_ON_ONCE(dax_is_empty_entry(entry) || |
|---|
| 912 | + dax_is_zero_entry(entry))) { |
|---|
| 913 | + ret = -EIO; |
|---|
| 914 | + goto put_unlocked; |
|---|
| 915 | + } |
|---|
| 916 | + |
|---|
| 917 | + /* Another fsync thread may have already done this entry */ |
|---|
| 918 | + if (!xas_get_mark(xas, PAGECACHE_TAG_TOWRITE)) |
|---|
| 919 | + goto put_unlocked; |
|---|
| 970 | 920 | } |
|---|
| 971 | 921 | |
|---|
| 972 | | - /* Another fsync thread may have already written back this entry */ |
|---|
| 973 | | - if (!radix_tree_tag_get(pages, index, PAGECACHE_TAG_TOWRITE)) |
|---|
| 974 | | - goto put_unlocked; |
|---|
| 975 | 922 | /* Lock the entry to serialize with page faults */ |
|---|
| 976 | | - entry = lock_slot(mapping, slot); |
|---|
| 923 | + dax_lock_entry(xas, entry); |
|---|
| 924 | + |
|---|
| 977 | 925 | /* |
|---|
| 978 | 926 | * We can clear the tag now but we have to be careful so that concurrent |
|---|
| 979 | 927 | * dax_writeback_one() calls for the same index cannot finish before we |
|---|
| .. | .. |
|---|
| 981 | 929 | * at the entry only under the i_pages lock and once they do that |
|---|
| 982 | 930 | * they will see the entry locked and wait for it to unlock. |
|---|
| 983 | 931 | */ |
|---|
| 984 | | - radix_tree_tag_clear(pages, index, PAGECACHE_TAG_TOWRITE); |
|---|
| 985 | | - xa_unlock_irq(pages); |
|---|
| 932 | + xas_clear_mark(xas, PAGECACHE_TAG_TOWRITE); |
|---|
| 933 | + xas_unlock_irq(xas); |
|---|
| 986 | 934 | |
|---|
| 987 | 935 | /* |
|---|
| 988 | | - * Even if dax_writeback_mapping_range() was given a wbc->range_start |
|---|
| 989 | | - * in the middle of a PMD, the 'index' we are given will be aligned to |
|---|
| 990 | | - * the start index of the PMD, as will the pfn we pull from 'entry'. |
|---|
| 936 | + * If dax_writeback_mapping_range() was given a wbc->range_start |
|---|
| 937 | + * in the middle of a PMD, the 'index' we use needs to be |
|---|
| 938 | + * aligned to the start of the PMD. |
|---|
| 991 | 939 | * This allows us to flush for PMD_SIZE and not have to worry about |
|---|
| 992 | 940 | * partial PMD writebacks. |
|---|
| 993 | 941 | */ |
|---|
| 994 | | - pfn = dax_radix_pfn(entry); |
|---|
| 995 | | - size = PAGE_SIZE << dax_radix_order(entry); |
|---|
| 942 | + pfn = dax_to_pfn(entry); |
|---|
| 943 | + count = 1UL << dax_entry_order(entry); |
|---|
| 944 | + index = xas->xa_index & ~(count - 1); |
|---|
| 996 | 945 | |
|---|
| 997 | | - dax_mapping_entry_mkclean(mapping, index, pfn); |
|---|
| 998 | | - dax_flush(dax_dev, page_address(pfn_to_page(pfn)), size); |
|---|
| 946 | + dax_entry_mkclean(mapping, index, pfn); |
|---|
| 947 | + dax_flush(dax_dev, page_address(pfn_to_page(pfn)), count * PAGE_SIZE); |
|---|
| 999 | 948 | /* |
|---|
| 1000 | 949 | * After we have flushed the cache, we can clear the dirty tag. There |
|---|
| 1001 | 950 | * cannot be new dirty data in the pfn after the flush has completed as |
|---|
| 1002 | 951 | * the pfn mappings are writeprotected and fault waits for mapping |
|---|
| 1003 | 952 | * entry lock. |
|---|
| 1004 | 953 | */ |
|---|
| 1005 | | - xa_lock_irq(pages); |
|---|
| 1006 | | - radix_tree_tag_clear(pages, index, PAGECACHE_TAG_DIRTY); |
|---|
| 1007 | | - xa_unlock_irq(pages); |
|---|
| 1008 | | - trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT); |
|---|
| 1009 | | - put_locked_mapping_entry(mapping, index); |
|---|
| 954 | + xas_reset(xas); |
|---|
| 955 | + xas_lock_irq(xas); |
|---|
| 956 | + xas_store(xas, entry); |
|---|
| 957 | + xas_clear_mark(xas, PAGECACHE_TAG_DIRTY); |
|---|
| 958 | + dax_wake_entry(xas, entry, WAKE_NEXT); |
|---|
| 959 | + |
|---|
| 960 | + trace_dax_writeback_one(mapping->host, index, count); |
|---|
| 1010 | 961 | return ret; |
|---|
| 1011 | 962 | |
|---|
| 1012 | 963 | put_unlocked: |
|---|
| 1013 | | - put_unlocked_mapping_entry(mapping, index, entry2); |
|---|
| 1014 | | - xa_unlock_irq(pages); |
|---|
| 964 | + put_unlocked_entry(xas, entry, WAKE_NEXT); |
|---|
| 1015 | 965 | return ret; |
|---|
| 1016 | 966 | } |
|---|
| 1017 | 967 | |
|---|
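
A quick numeric illustration of the index and count computation at the end of dax_writeback_one() above: masking xas->xa_index with ~(count - 1) yields the entry-aligned file index, so the flush and mkclean always cover the whole PMD entry even when writeback started part-way into it. Userspace-style sketch with x86-64 assumptions (4K pages, 2M PMDs); the numbers are only illustrative.

```c
#include <stdio.h>

#define PAGE_SHIFT 12
#define PMD_SHIFT  21
#define PMD_ORDER  (PMD_SHIFT - PAGE_SHIFT)	/* 9: 512 pages per PMD entry */

int main(void)
{
	unsigned long xa_index = 1000;			/* where the iterator stopped */
	unsigned long count = 1UL << PMD_ORDER;		/* pages covered by the entry */
	unsigned long index = xa_index & ~(count - 1);	/* aligned start of the entry */

	/* prints: count=512 index=512 flush=2097152 bytes */
	printf("count=%lu index=%lu flush=%lu bytes\n",
	       count, index, count << PAGE_SHIFT);
	return 0;
}
```
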
| .. | .. |
|---|
| 1021 | 971 | * on persistent storage prior to completion of the operation. |
|---|
| 1022 | 972 | */ |
|---|
| 1023 | 973 | int dax_writeback_mapping_range(struct address_space *mapping, |
|---|
| 1024 | | - struct block_device *bdev, struct writeback_control *wbc) |
|---|
| 974 | + struct dax_device *dax_dev, struct writeback_control *wbc) |
|---|
| 1025 | 975 | { |
|---|
| 976 | + XA_STATE(xas, &mapping->i_pages, wbc->range_start >> PAGE_SHIFT); |
|---|
| 1026 | 977 | struct inode *inode = mapping->host; |
|---|
| 1027 | | - pgoff_t start_index, end_index; |
|---|
| 1028 | | - pgoff_t indices[PAGEVEC_SIZE]; |
|---|
| 1029 | | - struct dax_device *dax_dev; |
|---|
| 1030 | | - struct pagevec pvec; |
|---|
| 1031 | | - bool done = false; |
|---|
| 1032 | | - int i, ret = 0; |
|---|
| 978 | + pgoff_t end_index = wbc->range_end >> PAGE_SHIFT; |
|---|
| 979 | + void *entry; |
|---|
| 980 | + int ret = 0; |
|---|
| 981 | + unsigned int scanned = 0; |
|---|
| 1033 | 982 | |
|---|
| 1034 | 983 | if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT)) |
|---|
| 1035 | 984 | return -EIO; |
|---|
| .. | .. |
|---|
| 1037 | 986 | if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL) |
|---|
| 1038 | 987 | return 0; |
|---|
| 1039 | 988 | |
|---|
| 1040 | | - dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); |
|---|
| 1041 | | - if (!dax_dev) |
|---|
| 1042 | | - return -EIO; |
|---|
| 989 | + trace_dax_writeback_range(inode, xas.xa_index, end_index); |
|---|
| 1043 | 990 | |
|---|
| 1044 | | - start_index = wbc->range_start >> PAGE_SHIFT; |
|---|
| 1045 | | - end_index = wbc->range_end >> PAGE_SHIFT; |
|---|
| 991 | + tag_pages_for_writeback(mapping, xas.xa_index, end_index); |
|---|
| 1046 | 992 | |
|---|
| 1047 | | - trace_dax_writeback_range(inode, start_index, end_index); |
|---|
| 1048 | | - |
|---|
| 1049 | | - tag_pages_for_writeback(mapping, start_index, end_index); |
|---|
| 1050 | | - |
|---|
| 1051 | | - pagevec_init(&pvec); |
|---|
| 1052 | | - while (!done) { |
|---|
| 1053 | | - pvec.nr = find_get_entries_tag(mapping, start_index, |
|---|
| 1054 | | - PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE, |
|---|
| 1055 | | - pvec.pages, indices); |
|---|
| 1056 | | - |
|---|
| 1057 | | - if (pvec.nr == 0) |
|---|
| 993 | + xas_lock_irq(&xas); |
|---|
| 994 | + xas_for_each_marked(&xas, entry, end_index, PAGECACHE_TAG_TOWRITE) { |
|---|
| 995 | + ret = dax_writeback_one(&xas, dax_dev, mapping, entry); |
|---|
| 996 | + if (ret < 0) { |
|---|
| 997 | + mapping_set_error(mapping, ret); |
|---|
| 1058 | 998 | break; |
|---|
| 1059 | | - |
|---|
| 1060 | | - for (i = 0; i < pvec.nr; i++) { |
|---|
| 1061 | | - if (indices[i] > end_index) { |
|---|
| 1062 | | - done = true; |
|---|
| 1063 | | - break; |
|---|
| 1064 | | - } |
|---|
| 1065 | | - |
|---|
| 1066 | | - ret = dax_writeback_one(dax_dev, mapping, indices[i], |
|---|
| 1067 | | - pvec.pages[i]); |
|---|
| 1068 | | - if (ret < 0) { |
|---|
| 1069 | | - mapping_set_error(mapping, ret); |
|---|
| 1070 | | - goto out; |
|---|
| 1071 | | - } |
|---|
| 1072 | 999 | } |
|---|
| 1073 | | - start_index = indices[pvec.nr - 1] + 1; |
|---|
| 1000 | + if (++scanned % XA_CHECK_SCHED) |
|---|
| 1001 | + continue; |
|---|
| 1002 | + |
|---|
| 1003 | + xas_pause(&xas); |
|---|
| 1004 | + xas_unlock_irq(&xas); |
|---|
| 1005 | + cond_resched(); |
|---|
| 1006 | + xas_lock_irq(&xas); |
|---|
| 1074 | 1007 | } |
|---|
| 1075 | | -out: |
|---|
| 1076 | | - put_dax(dax_dev); |
|---|
| 1077 | | - trace_dax_writeback_range_done(inode, start_index, end_index); |
|---|
| 1078 | | - return (ret < 0 ? ret : 0); |
|---|
| 1008 | + xas_unlock_irq(&xas); |
|---|
| 1009 | + trace_dax_writeback_range_done(inode, xas.xa_index, end_index); |
|---|
| 1010 | + return ret; |
|---|
| 1079 | 1011 | } |
|---|
| 1080 | 1012 | EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); |
|---|
| 1081 | 1013 | |
|---|
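
The rewritten dax_writeback_mapping_range() is an instance of a general XArray idiom: walk the entries carrying a mark while holding the xa_lock, and periodically xas_pause() so the lock can be dropped and the CPU rescheduled without losing the iteration position. A minimal kernel-context sketch of that idiom follows; the array, mark and batch size here are arbitrary choices for illustration, not the DAX-specific ones.

```c
#include <linux/xarray.h>
#include <linux/sched.h>

/*
 * Walk every entry marked with XA_MARK_0, dropping the lock every
 * 1024 entries so it is never held across a long scan (sketch only).
 */
static void walk_marked_entries(struct xarray *xa)
{
	XA_STATE(xas, xa, 0);
	unsigned int scanned = 0;
	void *entry;

	xas_lock_irq(&xas);
	xas_for_each_marked(&xas, entry, ULONG_MAX, XA_MARK_0) {
		/* ... process 'entry' here ... */

		if (++scanned % 1024)
			continue;

		xas_pause(&xas);		/* remember position across unlock */
		xas_unlock_irq(&xas);
		cond_resched();
		xas_lock_irq(&xas);
	}
	xas_unlock_irq(&xas);
}
```
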
| .. | .. |
|---|
| 1123 | 1055 | * If this page is ever written to we will re-fault and change the mapping to |
|---|
| 1124 | 1056 | * point to real DAX storage instead. |
|---|
| 1125 | 1057 | */ |
|---|
| 1126 | | -static vm_fault_t dax_load_hole(struct address_space *mapping, void *entry, |
|---|
| 1127 | | - struct vm_fault *vmf) |
|---|
| 1058 | +static vm_fault_t dax_load_hole(struct xa_state *xas, |
|---|
| 1059 | + struct address_space *mapping, void **entry, |
|---|
| 1060 | + struct vm_fault *vmf) |
|---|
| 1128 | 1061 | { |
|---|
| 1129 | 1062 | struct inode *inode = mapping->host; |
|---|
| 1130 | 1063 | unsigned long vaddr = vmf->address; |
|---|
| 1131 | 1064 | pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr)); |
|---|
| 1132 | 1065 | vm_fault_t ret; |
|---|
| 1133 | 1066 | |
|---|
| 1134 | | - dax_insert_mapping_entry(mapping, vmf, entry, pfn, RADIX_DAX_ZERO_PAGE, |
|---|
| 1135 | | - false); |
|---|
| 1067 | + *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn, |
|---|
| 1068 | + DAX_ZERO_PAGE, false); |
|---|
| 1069 | + |
|---|
| 1136 | 1070 | ret = vmf_insert_mixed(vmf->vma, vaddr, pfn); |
|---|
| 1137 | 1071 | trace_dax_load_hole(inode, vmf, ret); |
|---|
| 1138 | 1072 | return ret; |
|---|
| 1139 | 1073 | } |
|---|
| 1140 | 1074 | |
|---|
| 1141 | | -static bool dax_range_is_aligned(struct block_device *bdev, |
|---|
| 1142 | | - unsigned int offset, unsigned int length) |
|---|
| 1075 | +s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap) |
|---|
| 1143 | 1076 | { |
|---|
| 1144 | | - unsigned short sector_size = bdev_logical_block_size(bdev); |
|---|
| 1077 | + sector_t sector = iomap_sector(iomap, pos & PAGE_MASK); |
|---|
| 1078 | + pgoff_t pgoff; |
|---|
| 1079 | + long rc, id; |
|---|
| 1080 | + void *kaddr; |
|---|
| 1081 | + bool page_aligned = false; |
|---|
| 1082 | + unsigned offset = offset_in_page(pos); |
|---|
| 1083 | + unsigned size = min_t(u64, PAGE_SIZE - offset, length); |
|---|
| 1145 | 1084 | |
|---|
| 1146 | | - if (!IS_ALIGNED(offset, sector_size)) |
|---|
| 1147 | | - return false; |
|---|
| 1148 | | - if (!IS_ALIGNED(length, sector_size)) |
|---|
| 1149 | | - return false; |
|---|
| 1085 | + if (IS_ALIGNED(sector << SECTOR_SHIFT, PAGE_SIZE) && |
|---|
| 1086 | + (size == PAGE_SIZE)) |
|---|
| 1087 | + page_aligned = true; |
|---|
| 1150 | 1088 | |
|---|
| 1151 | | - return true; |
|---|
| 1152 | | -} |
|---|
| 1089 | + rc = bdev_dax_pgoff(iomap->bdev, sector, PAGE_SIZE, &pgoff); |
|---|
| 1090 | + if (rc) |
|---|
| 1091 | + return rc; |
|---|
| 1153 | 1092 | |
|---|
| 1154 | | -int __dax_zero_page_range(struct block_device *bdev, |
|---|
| 1155 | | - struct dax_device *dax_dev, sector_t sector, |
|---|
| 1156 | | - unsigned int offset, unsigned int size) |
|---|
| 1157 | | -{ |
|---|
| 1158 | | - if (dax_range_is_aligned(bdev, offset, size)) { |
|---|
| 1159 | | - sector_t start_sector = sector + (offset >> 9); |
|---|
| 1093 | + id = dax_read_lock(); |
|---|
| 1160 | 1094 | |
|---|
| 1161 | | - return blkdev_issue_zeroout(bdev, start_sector, |
|---|
| 1162 | | - size >> 9, GFP_NOFS, 0); |
|---|
| 1163 | | - } else { |
|---|
| 1164 | | - pgoff_t pgoff; |
|---|
| 1165 | | - long rc, id; |
|---|
| 1166 | | - void *kaddr; |
|---|
| 1167 | | - |
|---|
| 1168 | | - rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff); |
|---|
| 1169 | | - if (rc) |
|---|
| 1170 | | - return rc; |
|---|
| 1171 | | - |
|---|
| 1172 | | - id = dax_read_lock(); |
|---|
| 1173 | | - rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL); |
|---|
| 1174 | | - if (rc < 0) { |
|---|
| 1175 | | - dax_read_unlock(id); |
|---|
| 1176 | | - return rc; |
|---|
| 1177 | | - } |
|---|
| 1178 | | - memset(kaddr + offset, 0, size); |
|---|
| 1179 | | - dax_flush(dax_dev, kaddr + offset, size); |
|---|
| 1095 | + if (page_aligned) |
|---|
| 1096 | + rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1); |
|---|
| 1097 | + else |
|---|
| 1098 | + rc = dax_direct_access(iomap->dax_dev, pgoff, 1, &kaddr, NULL); |
|---|
| 1099 | + if (rc < 0) { |
|---|
| 1180 | 1100 | dax_read_unlock(id); |
|---|
| 1101 | + return rc; |
|---|
| 1181 | 1102 | } |
|---|
| 1182 | | - return 0; |
|---|
| 1103 | + |
|---|
| 1104 | + if (!page_aligned) { |
|---|
| 1105 | + memset(kaddr + offset, 0, size); |
|---|
| 1106 | + dax_flush(iomap->dax_dev, kaddr + offset, size); |
|---|
| 1107 | + } |
|---|
| 1108 | + dax_read_unlock(id); |
|---|
| 1109 | + return size; |
|---|
| 1183 | 1110 | } |
|---|
| 1184 | | -EXPORT_SYMBOL_GPL(__dax_zero_page_range); |
|---|
| 1185 | 1111 | |
|---|
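
The new dax_iomap_zero() chooses between two paths: a page-aligned, page-sized request is handed to dax_zero_page_range(), while anything smaller maps the page via dax_direct_access() and uses memset() plus dax_flush(). A userspace-style sketch of the test that picks the path (assumes 4K pages and 512-byte sectors; illustrative only):

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE    4096u
#define SECTOR_SHIFT 9

/* True when the zeroing request covers exactly one whole page. */
static bool zero_covers_full_page(uint64_t sector, unsigned int size)
{
	return (sector << SECTOR_SHIFT) % PAGE_SIZE == 0 && size == PAGE_SIZE;
}

int main(void)
{
	/* sector 8 = byte 4096, full page: dax_zero_page_range() path */
	printf("%d\n", zero_covers_full_page(8, PAGE_SIZE));	/* 1 */
	/* sector 1 = byte 512, 512 bytes: memset() + dax_flush() path */
	printf("%d\n", zero_covers_full_page(1, 512));		/* 0 */
	return 0;
}
```
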
| 1186 | 1112 | static loff_t |
|---|
| 1187 | 1113 | dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, |
|---|
| 1188 | | - struct iomap *iomap) |
|---|
| 1114 | + struct iomap *iomap, struct iomap *srcmap) |
|---|
| 1189 | 1115 | { |
|---|
| 1190 | 1116 | struct block_device *bdev = iomap->bdev; |
|---|
| 1191 | 1117 | struct dax_device *dax_dev = iomap->dax_dev; |
|---|
| .. | .. |
|---|
| 1295 | 1221 | unsigned flags = 0; |
|---|
| 1296 | 1222 | |
|---|
| 1297 | 1223 | if (iov_iter_rw(iter) == WRITE) { |
|---|
| 1298 | | - lockdep_assert_held_exclusive(&inode->i_rwsem); |
|---|
| 1224 | + lockdep_assert_held_write(&inode->i_rwsem); |
|---|
| 1299 | 1225 | flags |= IOMAP_WRITE; |
|---|
| 1300 | 1226 | } else { |
|---|
| 1301 | 1227 | lockdep_assert_held(&inode->i_rwsem); |
|---|
| .. | .. |
|---|
| 1322 | 1248 | { |
|---|
| 1323 | 1249 | if (error == 0) |
|---|
| 1324 | 1250 | return VM_FAULT_NOPAGE; |
|---|
| 1325 | | - if (error == -ENOMEM) |
|---|
| 1326 | | - return VM_FAULT_OOM; |
|---|
| 1327 | | - return VM_FAULT_SIGBUS; |
|---|
| 1251 | + return vmf_error(error); |
|---|
| 1328 | 1252 | } |
|---|
| 1329 | 1253 | |
|---|
| 1330 | 1254 | /* |
|---|
| .. | .. |
|---|
| 1343 | 1267 | { |
|---|
| 1344 | 1268 | struct vm_area_struct *vma = vmf->vma; |
|---|
| 1345 | 1269 | struct address_space *mapping = vma->vm_file->f_mapping; |
|---|
| 1270 | + XA_STATE(xas, &mapping->i_pages, vmf->pgoff); |
|---|
| 1346 | 1271 | struct inode *inode = mapping->host; |
|---|
| 1347 | 1272 | unsigned long vaddr = vmf->address; |
|---|
| 1348 | 1273 | loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; |
|---|
| 1349 | | - struct iomap iomap = { 0 }; |
|---|
| 1274 | + struct iomap iomap = { .type = IOMAP_HOLE }; |
|---|
| 1275 | + struct iomap srcmap = { .type = IOMAP_HOLE }; |
|---|
| 1350 | 1276 | unsigned flags = IOMAP_FAULT; |
|---|
| 1351 | 1277 | int error, major = 0; |
|---|
| 1352 | 1278 | bool write = vmf->flags & FAULT_FLAG_WRITE; |
|---|
| .. | .. |
|---|
| 1369 | 1295 | if (write && !vmf->cow_page) |
|---|
| 1370 | 1296 | flags |= IOMAP_WRITE; |
|---|
| 1371 | 1297 | |
|---|
| 1372 | | - entry = grab_mapping_entry(mapping, vmf->pgoff, 0); |
|---|
| 1373 | | - if (IS_ERR(entry)) { |
|---|
| 1374 | | - ret = dax_fault_return(PTR_ERR(entry)); |
|---|
| 1298 | + entry = grab_mapping_entry(&xas, mapping, 0); |
|---|
| 1299 | + if (xa_is_internal(entry)) { |
|---|
| 1300 | + ret = xa_to_internal(entry); |
|---|
| 1375 | 1301 | goto out; |
|---|
| 1376 | 1302 | } |
|---|
| 1377 | 1303 | |
|---|
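
grab_mapping_entry() now reports failure by returning an XArray internal entry with the vm_fault_t code folded in, which the fault handler unpacks with xa_to_internal() as shown above; an internal entry can never be confused with a real DAX value entry, so no separate error channel is needed. A minimal kernel-context sketch of that convention (only xa_mk_internal(), xa_is_internal() and xa_to_internal() are real XArray API; the helper names are hypothetical):

```c
#include <linux/xarray.h>
#include <linux/mm_types.h>	/* vm_fault_t */

/* Hypothetical helpers illustrating the encode/decode round trip. */
static void *fault_to_entry(vm_fault_t ret)
{
	return xa_mk_internal(ret);	/* tagged value, never a valid entry */
}

static vm_fault_t entry_to_fault(void *entry)
{
	if (xa_is_internal(entry))
		return xa_to_internal(entry);
	return 0;			/* a real entry: no error to report */
}
```
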
| .. | .. |
|---|
| 1391 | 1317 | * the file system block size to be equal the page size, which means |
|---|
| 1392 | 1318 | * that we never have to deal with more than a single extent here. |
|---|
| 1393 | 1319 | */ |
|---|
| 1394 | | - error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap); |
|---|
| 1320 | + error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap, &srcmap); |
|---|
| 1395 | 1321 | if (iomap_errp) |
|---|
| 1396 | 1322 | *iomap_errp = error; |
|---|
| 1397 | 1323 | if (error) { |
|---|
| .. | .. |
|---|
| 1412 | 1338 | clear_user_highpage(vmf->cow_page, vaddr); |
|---|
| 1413 | 1339 | break; |
|---|
| 1414 | 1340 | case IOMAP_MAPPED: |
|---|
| 1415 | | - error = copy_user_dax(iomap.bdev, iomap.dax_dev, |
|---|
| 1416 | | - sector, PAGE_SIZE, vmf->cow_page, vaddr); |
|---|
| 1341 | + error = copy_cow_page_dax(iomap.bdev, iomap.dax_dev, |
|---|
| 1342 | + sector, vmf->cow_page, vaddr); |
|---|
| 1417 | 1343 | break; |
|---|
| 1418 | 1344 | default: |
|---|
| 1419 | 1345 | WARN_ON_ONCE(1); |
|---|
| .. | .. |
|---|
| 1444 | 1370 | if (error < 0) |
|---|
| 1445 | 1371 | goto error_finish_iomap; |
|---|
| 1446 | 1372 | |
|---|
| 1447 | | - entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn, |
|---|
| 1373 | + entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn, |
|---|
| 1448 | 1374 | 0, write && !sync); |
|---|
| 1449 | 1375 | |
|---|
| 1450 | 1376 | /* |
|---|
| .. | .. |
|---|
| 1472 | 1398 | case IOMAP_UNWRITTEN: |
|---|
| 1473 | 1399 | case IOMAP_HOLE: |
|---|
| 1474 | 1400 | if (!write) { |
|---|
| 1475 | | - ret = dax_load_hole(mapping, entry, vmf); |
|---|
| 1401 | + ret = dax_load_hole(&xas, mapping, &entry, vmf); |
|---|
| 1476 | 1402 | goto finish_iomap; |
|---|
| 1477 | 1403 | } |
|---|
| 1478 | | - /*FALLTHRU*/ |
|---|
| 1404 | + fallthrough; |
|---|
| 1479 | 1405 | default: |
|---|
| 1480 | 1406 | WARN_ON_ONCE(1); |
|---|
| 1481 | 1407 | error = -EIO; |
|---|
| .. | .. |
|---|
| 1499 | 1425 | ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap); |
|---|
| 1500 | 1426 | } |
|---|
| 1501 | 1427 | unlock_entry: |
|---|
| 1502 | | - put_locked_mapping_entry(mapping, vmf->pgoff); |
|---|
| 1428 | + dax_unlock_entry(&xas, entry); |
|---|
| 1503 | 1429 | out: |
|---|
| 1504 | 1430 | trace_dax_pte_fault_done(inode, vmf, ret); |
|---|
| 1505 | 1431 | return ret | major; |
|---|
| 1506 | 1432 | } |
|---|
| 1507 | 1433 | |
|---|
| 1508 | 1434 | #ifdef CONFIG_FS_DAX_PMD |
|---|
| 1509 | | -static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, |
|---|
| 1510 | | - void *entry) |
|---|
| 1435 | +static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, |
|---|
| 1436 | + struct iomap *iomap, void **entry) |
|---|
| 1511 | 1437 | { |
|---|
| 1512 | 1438 | struct address_space *mapping = vmf->vma->vm_file->f_mapping; |
|---|
| 1513 | 1439 | unsigned long pmd_addr = vmf->address & PMD_MASK; |
|---|
| 1440 | + struct vm_area_struct *vma = vmf->vma; |
|---|
| 1514 | 1441 | struct inode *inode = mapping->host; |
|---|
| 1442 | + pgtable_t pgtable = NULL; |
|---|
| 1515 | 1443 | struct page *zero_page; |
|---|
| 1516 | | - void *ret = NULL; |
|---|
| 1517 | 1444 | spinlock_t *ptl; |
|---|
| 1518 | 1445 | pmd_t pmd_entry; |
|---|
| 1519 | 1446 | pfn_t pfn; |
|---|
| .. | .. |
|---|
| 1524 | 1451 | goto fallback; |
|---|
| 1525 | 1452 | |
|---|
| 1526 | 1453 | pfn = page_to_pfn_t(zero_page); |
|---|
| 1527 | | - ret = dax_insert_mapping_entry(mapping, vmf, entry, pfn, |
|---|
| 1528 | | - RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false); |
|---|
| 1454 | + *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn, |
|---|
| 1455 | + DAX_PMD | DAX_ZERO_PAGE, false); |
|---|
| 1456 | + |
|---|
| 1457 | + if (arch_needs_pgtable_deposit()) { |
|---|
| 1458 | + pgtable = pte_alloc_one(vma->vm_mm); |
|---|
| 1459 | + if (!pgtable) |
|---|
| 1460 | + return VM_FAULT_OOM; |
|---|
| 1461 | + } |
|---|
| 1529 | 1462 | |
|---|
| 1530 | 1463 | ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); |
|---|
| 1531 | 1464 | if (!pmd_none(*(vmf->pmd))) { |
|---|
| .. | .. |
|---|
| 1533 | 1466 | goto fallback; |
|---|
| 1534 | 1467 | } |
|---|
| 1535 | 1468 | |
|---|
| 1469 | + if (pgtable) { |
|---|
| 1470 | + pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); |
|---|
| 1471 | + mm_inc_nr_ptes(vma->vm_mm); |
|---|
| 1472 | + } |
|---|
| 1536 | 1473 | pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot); |
|---|
| 1537 | 1474 | pmd_entry = pmd_mkhuge(pmd_entry); |
|---|
| 1538 | 1475 | set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry); |
|---|
| 1539 | 1476 | spin_unlock(ptl); |
|---|
| 1540 | | - trace_dax_pmd_load_hole(inode, vmf, zero_page, ret); |
|---|
| 1477 | + trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry); |
|---|
| 1541 | 1478 | return VM_FAULT_NOPAGE; |
|---|
| 1542 | 1479 | |
|---|
| 1543 | 1480 | fallback: |
|---|
| 1544 | | - trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, ret); |
|---|
| 1481 | + if (pgtable) |
|---|
| 1482 | + pte_free(vma->vm_mm, pgtable); |
|---|
| 1483 | + trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry); |
|---|
| 1545 | 1484 | return VM_FAULT_FALLBACK; |
|---|
| 1546 | 1485 | } |
|---|
| 1547 | 1486 | |
|---|
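
On most architectures arch_needs_pgtable_deposit() is false and the new pgtable handling above compiles away; where it is true (for example powerpc's hash MMU), a page-table page has to be pre-allocated and deposited behind the huge PMD so that a later zap or split has something to withdraw. A hedged kernel-context sketch of that withdraw side, roughly what the generic huge-page teardown does (simplified; not the actual mm/huge_memory.c code):

```c
#include <linux/mm.h>
#include <linux/pgtable.h>
#include <asm/pgalloc.h>

/*
 * Free the page table that was deposited behind a huge PMD once the
 * PMD it backed has been removed (simplified sketch).
 */
static void free_deposited_table(struct mm_struct *mm, pmd_t *pmdp)
{
	pgtable_t pgtable = pgtable_trans_huge_withdraw(mm, pmdp);

	pte_free(mm, pgtable);
	mm_dec_nr_ptes(mm);
}
```
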
| .. | .. |
|---|
| 1550 | 1489 | { |
|---|
| 1551 | 1490 | struct vm_area_struct *vma = vmf->vma; |
|---|
| 1552 | 1491 | struct address_space *mapping = vma->vm_file->f_mapping; |
|---|
| 1492 | + XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER); |
|---|
| 1553 | 1493 | unsigned long pmd_addr = vmf->address & PMD_MASK; |
|---|
| 1554 | 1494 | bool write = vmf->flags & FAULT_FLAG_WRITE; |
|---|
| 1555 | 1495 | bool sync; |
|---|
| 1556 | 1496 | unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT; |
|---|
| 1557 | 1497 | struct inode *inode = mapping->host; |
|---|
| 1558 | 1498 | vm_fault_t result = VM_FAULT_FALLBACK; |
|---|
| 1559 | | - struct iomap iomap = { 0 }; |
|---|
| 1560 | | - pgoff_t max_pgoff, pgoff; |
|---|
| 1499 | + struct iomap iomap = { .type = IOMAP_HOLE }; |
|---|
| 1500 | + struct iomap srcmap = { .type = IOMAP_HOLE }; |
|---|
| 1501 | + pgoff_t max_pgoff; |
|---|
| 1561 | 1502 | void *entry; |
|---|
| 1562 | 1503 | loff_t pos; |
|---|
| 1563 | 1504 | int error; |
|---|
| .. | .. |
|---|
| 1568 | 1509 | * supposed to hold locks serializing us with truncate / punch hole so |
|---|
| 1569 | 1510 | * this is a reliable test. |
|---|
| 1570 | 1511 | */ |
|---|
| 1571 | | - pgoff = linear_page_index(vma, pmd_addr); |
|---|
| 1572 | 1512 | max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); |
|---|
| 1573 | 1513 | |
|---|
| 1574 | 1514 | trace_dax_pmd_fault(inode, vmf, max_pgoff, 0); |
|---|
| .. | .. |
|---|
| 1577 | 1517 | * Make sure that the faulting address's PMD offset (color) matches |
|---|
| 1578 | 1518 | * the PMD offset from the start of the file. This is necessary so |
|---|
| 1579 | 1519 | * that a PMD range in the page table overlaps exactly with a PMD |
|---|
| 1580 | | - * range in the radix tree. |
|---|
| 1520 | + * range in the page cache. |
|---|
| 1581 | 1521 | */ |
|---|
| 1582 | 1522 | if ((vmf->pgoff & PG_PMD_COLOUR) != |
|---|
| 1583 | 1523 | ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR)) |
|---|
| .. | .. |
|---|
| 1593 | 1533 | if ((pmd_addr + PMD_SIZE) > vma->vm_end) |
|---|
| 1594 | 1534 | goto fallback; |
|---|
| 1595 | 1535 | |
|---|
| 1596 | | - if (pgoff >= max_pgoff) { |
|---|
| 1536 | + if (xas.xa_index >= max_pgoff) { |
|---|
| 1597 | 1537 | result = VM_FAULT_SIGBUS; |
|---|
| 1598 | 1538 | goto out; |
|---|
| 1599 | 1539 | } |
|---|
| 1600 | 1540 | |
|---|
| 1601 | 1541 | /* If the PMD would extend beyond the file size */ |
|---|
| 1602 | | - if ((pgoff | PG_PMD_COLOUR) >= max_pgoff) |
|---|
| 1542 | + if ((xas.xa_index | PG_PMD_COLOUR) >= max_pgoff) |
|---|
| 1603 | 1543 | goto fallback; |
|---|
| 1604 | 1544 | |
|---|
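
Taken together, the checks above admit a PMD mapping only when the faulting virtual address and the file offset share the same offset within a PMD-sized block (the colour), the whole PMD lies inside the VMA, and it does not run past end-of-file. A userspace-style sketch of the colour test with x86-64 constants (4K pages, 512-page PMDs); the values are illustrative:

```c
#include <stdbool.h>
#include <stdio.h>

#define PAGE_SHIFT    12
#define PG_PMD_NR     512			/* pages per PMD on x86-64 */
#define PG_PMD_COLOUR (PG_PMD_NR - 1)

/* True when the file offset (in pages) and the virtual address share a
 * colour, i.e. a PMD-aligned mapping of one can cover the other. */
static bool pmd_colour_matches(unsigned long pgoff, unsigned long address)
{
	return (pgoff & PG_PMD_COLOUR) ==
	       ((address >> PAGE_SHIFT) & PG_PMD_COLOUR);
}

int main(void)
{
	/* file page 512 mapped at 2MiB: both colours are 0, PMD possible */
	printf("%d\n", pmd_colour_matches(512, 2UL << 20));	/* 1 */
	/* file page 513 mapped at 2MiB: colours 1 vs 0, fall back to PTEs */
	printf("%d\n", pmd_colour_matches(513, 2UL << 20));	/* 0 */
	return 0;
}
```
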
| 1605 | 1545 | /* |
|---|
| 1606 | | - * grab_mapping_entry() will make sure we get a 2MiB empty entry, a |
|---|
| 1607 | | - * 2MiB zero page entry or a DAX PMD. If it can't (because a 4k page |
|---|
| 1608 | | - * is already in the tree, for instance), it will return -EEXIST and |
|---|
| 1609 | | - * we just fall back to 4k entries. |
|---|
| 1546 | + * grab_mapping_entry() will make sure we get an empty PMD entry, |
|---|
| 1547 | + * a zero PMD entry or a DAX PMD. If it can't (because a PTE |
|---|
| 1548 | + * entry is already in the array, for instance), it will return |
|---|
| 1549 | + * VM_FAULT_FALLBACK. |
|---|
| 1610 | 1550 | */ |
|---|
| 1611 | | - entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD); |
|---|
| 1612 | | - if (IS_ERR(entry)) |
|---|
| 1551 | + entry = grab_mapping_entry(&xas, mapping, PMD_ORDER); |
|---|
| 1552 | + if (xa_is_internal(entry)) { |
|---|
| 1553 | + result = xa_to_internal(entry); |
|---|
| 1613 | 1554 | goto fallback; |
|---|
| 1555 | + } |
|---|
| 1614 | 1556 | |
|---|
| 1615 | 1557 | /* |
|---|
| 1616 | 1558 | * It is possible, particularly with mixed reads & writes to private |
|---|
| .. | .. |
|---|
| 1629 | 1571 | * setting up a mapping, so really we're using iomap_begin() as a way |
|---|
| 1630 | 1572 | * to look up our filesystem block. |
|---|
| 1631 | 1573 | */ |
|---|
| 1632 | | - pos = (loff_t)pgoff << PAGE_SHIFT; |
|---|
| 1633 | | - error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap); |
|---|
| 1574 | + pos = (loff_t)xas.xa_index << PAGE_SHIFT; |
|---|
| 1575 | + error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap, |
|---|
| 1576 | + &srcmap); |
|---|
| 1634 | 1577 | if (error) |
|---|
| 1635 | 1578 | goto unlock_entry; |
|---|
| 1636 | 1579 | |
|---|
| .. | .. |
|---|
| 1645 | 1588 | if (error < 0) |
|---|
| 1646 | 1589 | goto finish_iomap; |
|---|
| 1647 | 1590 | |
|---|
| 1648 | | - entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn, |
|---|
| 1649 | | - RADIX_DAX_PMD, write && !sync); |
|---|
| 1591 | + entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn, |
|---|
| 1592 | + DAX_PMD, write && !sync); |
|---|
| 1650 | 1593 | |
|---|
| 1651 | 1594 | /* |
|---|
| 1652 | 1595 | * If we are doing synchronous page fault and inode needs fsync, |
|---|
| .. | .. |
|---|
| 1669 | 1612 | case IOMAP_HOLE: |
|---|
| 1670 | 1613 | if (WARN_ON_ONCE(write)) |
|---|
| 1671 | 1614 | break; |
|---|
| 1672 | | - result = dax_pmd_load_hole(vmf, &iomap, entry); |
|---|
| 1615 | + result = dax_pmd_load_hole(&xas, vmf, &iomap, &entry); |
|---|
| 1673 | 1616 | break; |
|---|
| 1674 | 1617 | default: |
|---|
| 1675 | 1618 | WARN_ON_ONCE(1); |
|---|
| .. | .. |
|---|
| 1692 | 1635 | &iomap); |
|---|
| 1693 | 1636 | } |
|---|
| 1694 | 1637 | unlock_entry: |
|---|
| 1695 | | - put_locked_mapping_entry(mapping, pgoff); |
|---|
| 1638 | + dax_unlock_entry(&xas, entry); |
|---|
| 1696 | 1639 | fallback: |
|---|
| 1697 | 1640 | if (result == VM_FAULT_FALLBACK) { |
|---|
| 1698 | 1641 | split_huge_pmd(vma, vmf->pmd, vmf->address); |
|---|
| .. | .. |
|---|
| 1737 | 1680 | } |
|---|
| 1738 | 1681 | EXPORT_SYMBOL_GPL(dax_iomap_fault); |
|---|
| 1739 | 1682 | |
|---|
| 1740 | | -/** |
|---|
| 1683 | +/* |
|---|
| 1741 | 1684 | * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables |
|---|
| 1742 | 1685 | * @vmf: The description of the fault |
|---|
| 1743 | | - * @pe_size: Size of entry to be inserted |
|---|
| 1744 | 1686 | * @pfn: PFN to insert |
|---|
| 1687 | + * @order: Order of entry to insert. |
|---|
| 1745 | 1688 | * |
|---|
| 1746 | | - * This function inserts writeable PTE or PMD entry into page tables for mmaped |
|---|
| 1747 | | - * DAX file. It takes care of marking corresponding radix tree entry as dirty |
|---|
| 1748 | | - * as well. |
|---|
| 1689 | + * This function inserts a writeable PTE or PMD entry into the page tables |
|---|
| 1690 | + * for an mmaped DAX file. It also marks the page cache entry as dirty. |
|---|
| 1749 | 1691 | */ |
|---|
| 1750 | | -static vm_fault_t dax_insert_pfn_mkwrite(struct vm_fault *vmf, |
|---|
| 1751 | | - enum page_entry_size pe_size, |
|---|
| 1752 | | - pfn_t pfn) |
|---|
| 1692 | +static vm_fault_t |
|---|
| 1693 | +dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order) |
|---|
| 1753 | 1694 | { |
|---|
| 1754 | 1695 | struct address_space *mapping = vmf->vma->vm_file->f_mapping; |
|---|
| 1755 | | - void *entry, **slot; |
|---|
| 1756 | | - pgoff_t index = vmf->pgoff; |
|---|
| 1696 | + XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order); |
|---|
| 1697 | + void *entry; |
|---|
| 1757 | 1698 | vm_fault_t ret; |
|---|
| 1758 | 1699 | |
|---|
| 1759 | | - xa_lock_irq(&mapping->i_pages); |
|---|
| 1760 | | - entry = get_unlocked_mapping_entry(mapping, index, &slot); |
|---|
| 1700 | + xas_lock_irq(&xas); |
|---|
| 1701 | + entry = get_unlocked_entry(&xas, order); |
|---|
| 1761 | 1702 | /* Did we race with someone splitting entry or so? */ |
|---|
| 1762 | | - if (!entry || |
|---|
| 1763 | | - (pe_size == PE_SIZE_PTE && !dax_is_pte_entry(entry)) || |
|---|
| 1764 | | - (pe_size == PE_SIZE_PMD && !dax_is_pmd_entry(entry))) { |
|---|
| 1765 | | - put_unlocked_mapping_entry(mapping, index, entry); |
|---|
| 1766 | | - xa_unlock_irq(&mapping->i_pages); |
|---|
| 1703 | + if (!entry || dax_is_conflict(entry) || |
|---|
| 1704 | + (order == 0 && !dax_is_pte_entry(entry))) { |
|---|
| 1705 | + put_unlocked_entry(&xas, entry, WAKE_NEXT); |
|---|
| 1706 | + xas_unlock_irq(&xas); |
|---|
| 1767 | 1707 | trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf, |
|---|
| 1768 | 1708 | VM_FAULT_NOPAGE); |
|---|
| 1769 | 1709 | return VM_FAULT_NOPAGE; |
|---|
| 1770 | 1710 | } |
|---|
| 1771 | | - radix_tree_tag_set(&mapping->i_pages, index, PAGECACHE_TAG_DIRTY); |
|---|
| 1772 | | - entry = lock_slot(mapping, slot); |
|---|
| 1773 | | - xa_unlock_irq(&mapping->i_pages); |
|---|
| 1774 | | - switch (pe_size) { |
|---|
| 1775 | | - case PE_SIZE_PTE: |
|---|
| 1711 | + xas_set_mark(&xas, PAGECACHE_TAG_DIRTY); |
|---|
| 1712 | + dax_lock_entry(&xas, entry); |
|---|
| 1713 | + xas_unlock_irq(&xas); |
|---|
| 1714 | + if (order == 0) |
|---|
| 1776 | 1715 | ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn); |
|---|
| 1777 | | - break; |
|---|
| 1778 | 1716 | #ifdef CONFIG_FS_DAX_PMD |
|---|
| 1779 | | - case PE_SIZE_PMD: |
|---|
| 1717 | + else if (order == PMD_ORDER) |
|---|
| 1780 | 1718 | ret = vmf_insert_pfn_pmd(vmf, pfn, FAULT_FLAG_WRITE); |
|---|
| 1781 | | - break; |
|---|
| 1782 | 1719 | #endif |
|---|
| 1783 | | - default: |
|---|
| 1720 | + else |
|---|
| 1784 | 1721 | ret = VM_FAULT_FALLBACK; |
|---|
| 1785 | | - } |
|---|
| 1786 | | - put_locked_mapping_entry(mapping, index); |
|---|
| 1722 | + dax_unlock_entry(&xas, entry); |
|---|
| 1787 | 1723 | trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret); |
|---|
| 1788 | 1724 | return ret; |
|---|
| 1789 | 1725 | } |
|---|
| .. | .. |
|---|
| 1803 | 1739 | { |
|---|
| 1804 | 1740 | int err; |
|---|
| 1805 | 1741 | loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT; |
|---|
| 1806 | | - size_t len = 0; |
|---|
| 1742 | + unsigned int order = pe_order(pe_size); |
|---|
| 1743 | + size_t len = PAGE_SIZE << order; |
|---|
| 1807 | 1744 | |
|---|
| 1808 | | - if (pe_size == PE_SIZE_PTE) |
|---|
| 1809 | | - len = PAGE_SIZE; |
|---|
| 1810 | | - else if (pe_size == PE_SIZE_PMD) |
|---|
| 1811 | | - len = PMD_SIZE; |
|---|
| 1812 | | - else |
|---|
| 1813 | | - WARN_ON_ONCE(1); |
|---|
| 1814 | 1745 | err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1); |
|---|
| 1815 | 1746 | if (err) |
|---|
| 1816 | 1747 | return VM_FAULT_SIGBUS; |
|---|
| 1817 | | - return dax_insert_pfn_mkwrite(vmf, pe_size, pfn); |
|---|
| 1748 | + return dax_insert_pfn_mkwrite(vmf, pfn, order); |
|---|
| 1818 | 1749 | } |
|---|
| 1819 | 1750 | EXPORT_SYMBOL_GPL(dax_finish_sync_fault); |
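
With the pe_order() conversion, dax_finish_sync_fault() derives both the fsync length and the insertion order from the fault size instead of open-coding a size switch. A userspace-style sketch of the resulting lengths with x86-64 constants (4K pages, 2M PMDs); illustrative only:

```c
#include <stdio.h>

#define PAGE_SHIFT 12
#define PMD_SHIFT  21
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PMD_ORDER  (PMD_SHIFT - PAGE_SHIFT)

int main(void)
{
	/* len = PAGE_SIZE << order, as computed before vfs_fsync_range() */
	printf("PTE fault (order 0): %lu bytes\n", PAGE_SIZE << 0);
	printf("PMD fault (order %d): %lu bytes\n", PMD_ORDER,
	       PAGE_SIZE << PMD_ORDER);
	/* prints 4096 and 2097152 */
	return 0;
}
```
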
|---|