| .. | .. |
|---|
| 21 | 21 | #include <linux/vmalloc.h> |
|---|
| 22 | 22 | #include <linux/swap_slots.h> |
|---|
| 23 | 23 | #include <linux/huge_mm.h> |
|---|
| 24 | | - |
|---|
| 25 | | -#include <asm/pgtable.h> |
|---|
| 24 | +#include <linux/shmem_fs.h> |
|---|
| 26 | 25 | #include "internal.h" |
|---|
| 27 | 26 | |
|---|
| 28 | 27 | /* |
|---|
| .. | .. |
|---|
| 59 | 58 | #define GET_SWAP_RA_VAL(vma) \ |
|---|
| 60 | 59 | (atomic_long_read(&(vma)->swap_readahead_info) ? : 4) |
|---|
| 61 | 60 | |
|---|
| 62 | | -#define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) |
|---|
| 63 | | -#define ADD_CACHE_INFO(x, nr) do { swap_cache_info.x += (nr); } while (0) |
|---|
| 61 | +#define INC_CACHE_INFO(x) data_race(swap_cache_info.x++) |
|---|
| 62 | +#define ADD_CACHE_INFO(x, nr) data_race(swap_cache_info.x += (nr)) |
|---|
| 64 | 63 | |
|---|
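The two macros above drop the `do { } while (0)` wrappers and annotate the updates with `data_race()`, which tells KCSAN that these unlocked statistics increments are racy by design. A minimal sketch of the same annotation pattern, assuming a kernel build where `data_race()` is available via `<linux/compiler.h>`; the `demo_stats` structure and helper are illustrative, not part of the patch:

```c
#include <linux/compiler.h>

static struct {
	unsigned long add_total;
	unsigned long del_total;
} demo_stats;

static inline void demo_stats_add(unsigned long nr)
{
	/* Plain, unlocked update: data_race() marks the race as intentional
	 * so KCSAN does not report it, without adding any ordering. */
	data_race(demo_stats.add_total += nr);
}
```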
| 65 | 64 | static struct { |
|---|
| 66 | 65 | unsigned long add_total; |
|---|
| .. | .. |
|---|
| 74 | 73 | unsigned int i, j, nr; |
|---|
| 75 | 74 | unsigned long ret = 0; |
|---|
| 76 | 75 | struct address_space *spaces; |
|---|
| 76 | + struct swap_info_struct *si; |
|---|
| 77 | 77 | |
|---|
| 78 | | - rcu_read_lock(); |
|---|
| 79 | 78 | for (i = 0; i < MAX_SWAPFILES; i++) { |
|---|
| 80 | | - /* |
|---|
| 81 | | - * The corresponding entries in nr_swapper_spaces and |
|---|
| 82 | | - * swapper_spaces will be reused only after at least |
|---|
| 83 | | - * one grace period. So it is impossible for them |
|---|
| 84 | | - * belongs to different usage. |
|---|
| 85 | | - */ |
|---|
| 86 | | - nr = nr_swapper_spaces[i]; |
|---|
| 87 | | - spaces = rcu_dereference(swapper_spaces[i]); |
|---|
| 88 | | - if (!nr || !spaces) |
|---|
| 79 | + swp_entry_t entry = swp_entry(i, 1); |
|---|
| 80 | + |
|---|
| 81 | + /* Avoid get_swap_device() to warn for bad swap entry */ |
|---|
| 82 | + if (!swp_swap_info(entry)) |
|---|
| 89 | 83 | continue; |
|---|
| 84 | + /* Prevent swapoff to free swapper_spaces */ |
|---|
| 85 | + si = get_swap_device(entry); |
|---|
| 86 | + if (!si) |
|---|
| 87 | + continue; |
|---|
| 88 | + nr = nr_swapper_spaces[i]; |
|---|
| 89 | + spaces = swapper_spaces[i]; |
|---|
| 90 | 90 | for (j = 0; j < nr; j++) |
|---|
| 91 | 91 | ret += spaces[j].nrpages; |
|---|
| 92 | + put_swap_device(si); |
|---|
| 92 | 93 | } |
|---|
| 93 | | - rcu_read_unlock(); |
|---|
| 94 | 94 | return ret; |
|---|
| 95 | 95 | } |
|---|
| 96 | +EXPORT_SYMBOL_GPL(total_swapcache_pages); |
|---|
| 96 | 97 | |
|---|
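The rewrite of `total_swapcache_pages()` above trades the RCU read-side protection for a reference on the swap device: `get_swap_device()` fails once swapoff has started, and `put_swap_device()` drops the reference after the device's swap cache spaces have been walked. A stripped-down sketch of that guard as it would sit in mm/swap_state.c; the helper name is illustrative:

```c
/* Pin the swap device behind @entry while its swap cache is inspected. */
static void demo_touch_swap_cache(swp_entry_t entry)
{
	struct swap_info_struct *si;

	si = get_swap_device(entry);	/* NULL once swapoff has begun */
	if (!si)
		return;
	/* ... safe to dereference swap_address_space(entry) here ... */
	put_swap_device(si);
}
```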
| 97 | 98 | static atomic_t swapin_readahead_hits = ATOMIC_INIT(4); |
|---|
| 98 | 99 | |
|---|
| .. | .. |
|---|
| 107 | 108 | printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); |
|---|
| 108 | 109 | } |
|---|
| 109 | 110 | |
|---|
| 111 | +void *get_shadow_from_swap_cache(swp_entry_t entry) |
|---|
| 112 | +{ |
|---|
| 113 | + struct address_space *address_space = swap_address_space(entry); |
|---|
| 114 | + pgoff_t idx = swp_offset(entry); |
|---|
| 115 | + struct page *page; |
|---|
| 116 | + |
|---|
| 117 | + page = find_get_entry(address_space, idx); |
|---|
| 118 | + if (xa_is_value(page)) |
|---|
| 119 | + return page; |
|---|
| 120 | + if (page) |
|---|
| 121 | + put_page(page); |
|---|
| 122 | + return NULL; |
|---|
| 123 | +} |
|---|
| 124 | + |
|---|
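`get_shadow_from_swap_cache()` above relies on the XArray convention that shadow entries are value entries: tagged small integers that share slots with page pointers and are told apart with `xa_is_value()`. A minimal sketch of that convention on a standalone XArray; the array and cookie are illustrative, not the swap cache:

```c
#include <linux/xarray.h>

static DEFINE_XARRAY(demo_xa);

/* Park a small integer "shadow" in a slot, then read it back.
 * The cookie must fit in a value entry (top bit clear). */
static void *demo_shadow_roundtrip(unsigned long index, unsigned long cookie)
{
	void *entry;

	xa_store(&demo_xa, index, xa_mk_value(cookie), GFP_KERNEL);
	entry = xa_load(&demo_xa, index);
	if (xa_is_value(entry))		/* value entry, not a page pointer */
		return entry;		/* caller can xa_to_value() it */
	return NULL;
}
```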
| 110 | 125 | /* |
|---|
| 111 | | - * __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space, |
|---|
| 126 | + * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space, |
|---|
| 112 | 127 | * but sets SwapCache flag and private instead of mapping and index. |
|---|
| 113 | 128 | */ |
|---|
| 114 | | -int __add_to_swap_cache(struct page *page, swp_entry_t entry) |
|---|
| 129 | +int add_to_swap_cache(struct page *page, swp_entry_t entry, |
|---|
| 130 | + gfp_t gfp, void **shadowp) |
|---|
| 115 | 131 | { |
|---|
| 116 | | - int error, i, nr = hpage_nr_pages(page); |
|---|
| 117 | | - struct address_space *address_space; |
|---|
| 132 | + struct address_space *address_space = swap_address_space(entry); |
|---|
| 118 | 133 | pgoff_t idx = swp_offset(entry); |
|---|
| 134 | + XA_STATE_ORDER(xas, &address_space->i_pages, idx, compound_order(page)); |
|---|
| 135 | + unsigned long i, nr = thp_nr_pages(page); |
|---|
| 136 | + void *old; |
|---|
| 119 | 137 | |
|---|
| 120 | 138 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
|---|
| 121 | 139 | VM_BUG_ON_PAGE(PageSwapCache(page), page); |
|---|
| .. | .. |
|---|
| 124 | 142 | page_ref_add(page, nr); |
|---|
| 125 | 143 | SetPageSwapCache(page); |
|---|
| 126 | 144 | |
|---|
| 127 | | - address_space = swap_address_space(entry); |
|---|
| 128 | | - xa_lock_irq(&address_space->i_pages); |
|---|
| 129 | | - for (i = 0; i < nr; i++) { |
|---|
| 130 | | - set_page_private(page + i, entry.val + i); |
|---|
| 131 | | - error = radix_tree_insert(&address_space->i_pages, |
|---|
| 132 | | - idx + i, page + i); |
|---|
| 133 | | - if (unlikely(error)) |
|---|
| 134 | | - break; |
|---|
| 135 | | - } |
|---|
| 136 | | - if (likely(!error)) { |
|---|
| 145 | + do { |
|---|
| 146 | + unsigned long nr_shadows = 0; |
|---|
| 147 | + |
|---|
| 148 | + xas_lock_irq(&xas); |
|---|
| 149 | + xas_create_range(&xas); |
|---|
| 150 | + if (xas_error(&xas)) |
|---|
| 151 | + goto unlock; |
|---|
| 152 | + for (i = 0; i < nr; i++) { |
|---|
| 153 | + VM_BUG_ON_PAGE(xas.xa_index != idx + i, page); |
|---|
| 154 | + old = xas_load(&xas); |
|---|
| 155 | + if (xa_is_value(old)) { |
|---|
| 156 | + nr_shadows++; |
|---|
| 157 | + if (shadowp) |
|---|
| 158 | + *shadowp = old; |
|---|
| 159 | + } |
|---|
| 160 | + set_page_private(page + i, entry.val + i); |
|---|
| 161 | + xas_store(&xas, page); |
|---|
| 162 | + xas_next(&xas); |
|---|
| 163 | + } |
|---|
| 164 | + address_space->nrexceptional -= nr_shadows; |
|---|
| 137 | 165 | address_space->nrpages += nr; |
|---|
| 138 | 166 | __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); |
|---|
| 139 | 167 | ADD_CACHE_INFO(add_total, nr); |
|---|
| 140 | | - } else { |
|---|
| 141 | | - /* |
|---|
| 142 | | - * Only the context which have set SWAP_HAS_CACHE flag |
|---|
| 143 | | - * would call add_to_swap_cache(). |
|---|
| 144 | | - * So add_to_swap_cache() doesn't returns -EEXIST. |
|---|
| 145 | | - */ |
|---|
| 146 | | - VM_BUG_ON(error == -EEXIST); |
|---|
| 147 | | - set_page_private(page + i, 0UL); |
|---|
| 148 | | - while (i--) { |
|---|
| 149 | | - radix_tree_delete(&address_space->i_pages, idx + i); |
|---|
| 150 | | - set_page_private(page + i, 0UL); |
|---|
| 151 | | - } |
|---|
| 152 | | - ClearPageSwapCache(page); |
|---|
| 153 | | - page_ref_sub(page, nr); |
|---|
| 154 | | - } |
|---|
| 155 | | - xa_unlock_irq(&address_space->i_pages); |
|---|
| 168 | +unlock: |
|---|
| 169 | + xas_unlock_irq(&xas); |
|---|
| 170 | + } while (xas_nomem(&xas, gfp)); |
|---|
| 156 | 171 | |
|---|
| 157 | | - return error; |
|---|
| 158 | | -} |
|---|
| 172 | + if (!xas_error(&xas)) |
|---|
| 173 | + return 0; |
|---|
| 159 | 174 | |
|---|
| 160 | | - |
|---|
| 161 | | -int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) |
|---|
| 162 | | -{ |
|---|
| 163 | | - int error; |
|---|
| 164 | | - |
|---|
| 165 | | - error = radix_tree_maybe_preload_order(gfp_mask, compound_order(page)); |
|---|
| 166 | | - if (!error) { |
|---|
| 167 | | - error = __add_to_swap_cache(page, entry); |
|---|
| 168 | | - radix_tree_preload_end(); |
|---|
| 169 | | - } |
|---|
| 170 | | - return error; |
|---|
| 175 | + ClearPageSwapCache(page); |
|---|
| 176 | + page_ref_sub(page, nr); |
|---|
| 177 | + return xas_error(&xas); |
|---|
| 171 | 178 | } |
|---|
| 172 | 179 | |
|---|
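The new `add_to_swap_cache()` follows the standard XArray pattern for inserting under a spinlock: do the store with the lock held, and if the XArray needed a node it could not allocate atomically, let `xas_nomem()` allocate it with the caller's gfp mask after the lock is dropped, then retry. A stripped-down sketch of that idiom with an illustrative array and item rather than the swap cache (the real function additionally uses `XA_STATE_ORDER()` plus `xas_next()` to fill one slot per subpage of a THP):

```c
#include <linux/xarray.h>

static int demo_locked_insert(struct xarray *xa, unsigned long index,
			      void *item, gfp_t gfp)
{
	XA_STATE(xas, xa, index);

	do {
		xas_lock_irq(&xas);
		xas_store(&xas, item);		/* may record -ENOMEM in xas */
		xas_unlock_irq(&xas);
	} while (xas_nomem(&xas, gfp));		/* preallocate and retry */

	return xas_error(&xas);			/* 0 on success */
}
```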
| 173 | 180 | /* |
|---|
| 174 | 181 | * This must be called only on pages that have |
|---|
| 175 | 182 | * been verified to be in the swap cache. |
|---|
| 176 | 183 | */ |
|---|
| 177 | | -void __delete_from_swap_cache(struct page *page) |
|---|
| 184 | +void __delete_from_swap_cache(struct page *page, |
|---|
| 185 | + swp_entry_t entry, void *shadow) |
|---|
| 178 | 186 | { |
|---|
| 179 | | - struct address_space *address_space; |
|---|
| 180 | | - int i, nr = hpage_nr_pages(page); |
|---|
| 181 | | - swp_entry_t entry; |
|---|
| 182 | | - pgoff_t idx; |
|---|
| 187 | + struct address_space *address_space = swap_address_space(entry); |
|---|
| 188 | + int i, nr = thp_nr_pages(page); |
|---|
| 189 | + pgoff_t idx = swp_offset(entry); |
|---|
| 190 | + XA_STATE(xas, &address_space->i_pages, idx); |
|---|
| 183 | 191 | |
|---|
| 184 | 192 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
|---|
| 185 | 193 | VM_BUG_ON_PAGE(!PageSwapCache(page), page); |
|---|
| 186 | 194 | VM_BUG_ON_PAGE(PageWriteback(page), page); |
|---|
| 187 | 195 | |
|---|
| 188 | | - entry.val = page_private(page); |
|---|
| 189 | | - address_space = swap_address_space(entry); |
|---|
| 190 | | - idx = swp_offset(entry); |
|---|
| 191 | 196 | for (i = 0; i < nr; i++) { |
|---|
| 192 | | - radix_tree_delete(&address_space->i_pages, idx + i); |
|---|
| 197 | + void *entry = xas_store(&xas, shadow); |
|---|
| 198 | + VM_BUG_ON_PAGE(entry != page, entry); |
|---|
| 193 | 199 | set_page_private(page + i, 0); |
|---|
| 200 | + xas_next(&xas); |
|---|
| 194 | 201 | } |
|---|
| 195 | 202 | ClearPageSwapCache(page); |
|---|
| 203 | + if (shadow) |
|---|
| 204 | + address_space->nrexceptional += nr; |
|---|
| 196 | 205 | address_space->nrpages -= nr; |
|---|
| 197 | 206 | __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); |
|---|
| 198 | 207 | ADD_CACHE_INFO(del_total, nr); |
|---|
| .. | .. |
|---|
| 218 | 227 | return 0; |
|---|
| 219 | 228 | |
|---|
| 220 | 229 | /* |
|---|
| 221 | | - * Radix-tree node allocations from PF_MEMALLOC contexts could |
|---|
| 230 | + * XArray node allocations from PF_MEMALLOC contexts could |
|---|
| 222 | 231 | * completely exhaust the page allocator. __GFP_NOMEMALLOC |
|---|
| 223 | 232 | * stops emergency reserves from being allocated. |
|---|
| 224 | 233 | * |
|---|
| .. | .. |
|---|
| 229 | 238 | * Add it to the swap cache. |
|---|
| 230 | 239 | */ |
|---|
| 231 | 240 | err = add_to_swap_cache(page, entry, |
|---|
| 232 | | - __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN); |
|---|
| 233 | | - /* -ENOMEM radix-tree allocation failure */ |
|---|
| 241 | + __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN, NULL); |
|---|
| 234 | 242 | if (err) |
|---|
| 235 | 243 | /* |
|---|
| 236 | 244 | * add_to_swap_cache() doesn't return -EEXIST, so we can safely |
|---|
| .. | .. |
|---|
| 239 | 247 | goto fail; |
|---|
| 240 | 248 | /* |
|---|
| 241 | 249 | * Normally the page will be dirtied in unmap because its pte should be |
|---|
| 242 | | - * dirty. A special case is MADV_FREE page. The page'e pte could have |
|---|
| 250 | + * dirty. A special case is MADV_FREE page. The page's pte could have |
|---|
| 243 | 251 | * dirty bit cleared but the page's SwapBacked bit is still set because |
|---|
| 244 | 252 | * clearing the dirty bit and SwapBacked bit has no lock protected. For |
|---|
| 245 | 253 | * such page, unmap will not set dirty bit for it, so page reclaim will |
|---|
| .. | .. |
|---|
| 264 | 272 | */ |
|---|
| 265 | 273 | void delete_from_swap_cache(struct page *page) |
|---|
| 266 | 274 | { |
|---|
| 267 | | - swp_entry_t entry; |
|---|
| 268 | | - struct address_space *address_space; |
|---|
| 275 | + swp_entry_t entry = { .val = page_private(page) }; |
|---|
| 276 | + struct address_space *address_space = swap_address_space(entry); |
|---|
| 269 | 277 | |
|---|
| 270 | | - entry.val = page_private(page); |
|---|
| 271 | | - |
|---|
| 272 | | - address_space = swap_address_space(entry); |
|---|
| 273 | 278 | xa_lock_irq(&address_space->i_pages); |
|---|
| 274 | | - __delete_from_swap_cache(page); |
|---|
| 279 | + __delete_from_swap_cache(page, entry, NULL); |
|---|
| 275 | 280 | xa_unlock_irq(&address_space->i_pages); |
|---|
| 276 | 281 | |
|---|
| 277 | 282 | put_swap_page(page, entry); |
|---|
| 278 | | - page_ref_sub(page, hpage_nr_pages(page)); |
|---|
| 283 | + page_ref_sub(page, thp_nr_pages(page)); |
|---|
| 284 | +} |
|---|
| 285 | + |
|---|
| 286 | +void clear_shadow_from_swap_cache(int type, unsigned long begin, |
|---|
| 287 | + unsigned long end) |
|---|
| 288 | +{ |
|---|
| 289 | + unsigned long curr = begin; |
|---|
| 290 | + void *old; |
|---|
| 291 | + |
|---|
| 292 | + for (;;) { |
|---|
| 293 | + unsigned long nr_shadows = 0; |
|---|
| 294 | + swp_entry_t entry = swp_entry(type, curr); |
|---|
| 295 | + struct address_space *address_space = swap_address_space(entry); |
|---|
| 296 | + XA_STATE(xas, &address_space->i_pages, curr); |
|---|
| 297 | + |
|---|
| 298 | + xa_lock_irq(&address_space->i_pages); |
|---|
| 299 | + xas_for_each(&xas, old, end) { |
|---|
| 300 | + if (!xa_is_value(old)) |
|---|
| 301 | + continue; |
|---|
| 302 | + xas_store(&xas, NULL); |
|---|
| 303 | + nr_shadows++; |
|---|
| 304 | + } |
|---|
| 305 | + address_space->nrexceptional -= nr_shadows; |
|---|
| 306 | + xa_unlock_irq(&address_space->i_pages); |
|---|
| 307 | + |
|---|
| 308 | + /* search the next swapcache until we meet end */ |
|---|
| 309 | + curr >>= SWAP_ADDRESS_SPACE_SHIFT; |
|---|
| 310 | + curr++; |
|---|
| 311 | + curr <<= SWAP_ADDRESS_SPACE_SHIFT; |
|---|
| 312 | + if (curr > end) |
|---|
| 313 | + break; |
|---|
| 314 | + } |
|---|
| 279 | 315 | } |
|---|
| 280 | 316 | |
|---|
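`clear_shadow_from_swap_cache()` above visits one swap address space at a time (each covers a fixed block of offsets, hence the shift/increment dance on `curr`) and erases any value entries it finds in the range. The core of it is the generic XArray range walk sketched below; the array and bounds are illustrative:

```c
#include <linux/xarray.h>

/* Drop all value ("shadow") entries in [first, last], keeping real pointers. */
static void demo_drop_shadows(struct xarray *xa, unsigned long first,
			      unsigned long last)
{
	XA_STATE(xas, xa, first);
	void *entry;

	xas_lock_irq(&xas);
	xas_for_each(&xas, entry, last) {
		if (!xa_is_value(entry))
			continue;		/* a real page pointer: keep it */
		xas_store(&xas, NULL);		/* erase the shadow in place */
	}
	xas_unlock_irq(&xas);
}
```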
| 281 | 317 | /* |
|---|
| .. | .. |
|---|
| 335 | 371 | unsigned long addr) |
|---|
| 336 | 372 | { |
|---|
| 337 | 373 | struct page *page; |
|---|
| 374 | + struct swap_info_struct *si; |
|---|
| 338 | 375 | |
|---|
| 376 | + si = get_swap_device(entry); |
|---|
| 377 | + if (!si) |
|---|
| 378 | + return NULL; |
|---|
| 339 | 379 | page = find_get_page(swap_address_space(entry), swp_offset(entry)); |
|---|
| 380 | + put_swap_device(si); |
|---|
| 340 | 381 | |
|---|
| 341 | 382 | INC_CACHE_INFO(find_total); |
|---|
| 342 | 383 | if (page) { |
|---|
| .. | .. |
|---|
| 375 | 416 | return page; |
|---|
| 376 | 417 | } |
|---|
| 377 | 418 | |
|---|
| 419 | +/** |
|---|
| 420 | + * find_get_incore_page - Find and get a page from the page or swap caches. |
|---|
| 421 | + * @mapping: The address_space to search. |
|---|
| 422 | + * @index: The page cache index. |
|---|
| 423 | + * |
|---|
| 424 | + * This differs from find_get_page() in that it will also look for the |
|---|
| 425 | + * page in the swap cache. |
|---|
| 426 | + * |
|---|
| 427 | + * Return: The found page or %NULL. |
|---|
| 428 | + */ |
|---|
| 429 | +struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index) |
|---|
| 430 | +{ |
|---|
| 431 | + swp_entry_t swp; |
|---|
| 432 | + struct swap_info_struct *si; |
|---|
| 433 | + struct page *page = find_get_entry(mapping, index); |
|---|
| 434 | + |
|---|
| 435 | + if (!page) |
|---|
| 436 | + return page; |
|---|
| 437 | + if (!xa_is_value(page)) |
|---|
| 438 | + return find_subpage(page, index); |
|---|
| 439 | + if (!shmem_mapping(mapping)) |
|---|
| 440 | + return NULL; |
|---|
| 441 | + |
|---|
| 442 | + swp = radix_to_swp_entry(page); |
|---|
| 443 | + /* Prevent swapoff from happening to us */ |
|---|
| 444 | + si = get_swap_device(swp); |
|---|
| 445 | + if (!si) |
|---|
| 446 | + return NULL; |
|---|
| 447 | + page = find_get_page(swap_address_space(swp), swp_offset(swp)); |
|---|
| 448 | + put_swap_device(si); |
|---|
| 449 | + return page; |
|---|
| 450 | +} |
|---|
| 451 | + |
|---|
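`find_get_incore_page()` gives callers a single lookup that covers both the page cache and, for shmem entries that have been swapped out, the swap cache. A sketch of a mincore-style caller, assuming it only wants a residency bit and must drop the reference it gets back; the helper name is illustrative:

```c
/* Returns 1 if the page at @index is resident and up to date, else 0. */
static unsigned char demo_page_incore(struct address_space *mapping,
				      pgoff_t index)
{
	struct page *page = find_get_incore_page(mapping, index);
	unsigned char present = 0;

	if (page) {
		present = PageUptodate(page);	/* resident, read completed */
		put_page(page);			/* drop the reference we took */
	}
	return present;
}
```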
| 378 | 452 | struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, |
|---|
| 379 | 453 | struct vm_area_struct *vma, unsigned long addr, |
|---|
| 380 | 454 | bool *new_page_allocated) |
|---|
| 381 | 455 | { |
|---|
| 382 | | - struct page *found_page, *new_page = NULL; |
|---|
| 383 | | - struct address_space *swapper_space = swap_address_space(entry); |
|---|
| 384 | | - int err; |
|---|
| 456 | + struct swap_info_struct *si; |
|---|
| 457 | + struct page *page; |
|---|
| 458 | + void *shadow = NULL; |
|---|
| 459 | + |
|---|
| 385 | 460 | *new_page_allocated = false; |
|---|
| 386 | 461 | |
|---|
| 387 | | - do { |
|---|
| 462 | + for (;;) { |
|---|
| 463 | + int err; |
|---|
| 388 | 464 | /* |
|---|
| 389 | 465 | * First check the swap cache. Since this is normally |
|---|
| 390 | 466 | * called after lookup_swap_cache() failed, re-calling |
|---|
| 391 | 467 | * that would confuse statistics. |
|---|
| 392 | 468 | */ |
|---|
| 393 | | - found_page = find_get_page(swapper_space, swp_offset(entry)); |
|---|
| 394 | | - if (found_page) |
|---|
| 395 | | - break; |
|---|
| 469 | + si = get_swap_device(entry); |
|---|
| 470 | + if (!si) |
|---|
| 471 | + return NULL; |
|---|
| 472 | + page = find_get_page(swap_address_space(entry), |
|---|
| 473 | + swp_offset(entry)); |
|---|
| 474 | + put_swap_device(si); |
|---|
| 475 | + if (page) |
|---|
| 476 | + return page; |
|---|
| 396 | 477 | |
|---|
| 397 | 478 | /* |
|---|
| 398 | 479 | * Just skip read ahead for unused swap slot. |
|---|
| .. | .. |
|---|
| 403 | 484 | * else swap_off will be aborted if we return NULL. |
|---|
| 404 | 485 | */ |
|---|
| 405 | 486 | if (!__swp_swapcount(entry) && swap_slot_cache_enabled) |
|---|
| 406 | | - break; |
|---|
| 487 | + return NULL; |
|---|
| 407 | 488 | |
|---|
| 408 | 489 | /* |
|---|
| 409 | | - * Get a new page to read into from swap. |
|---|
| 490 | + * Get a new page to read into from swap. Allocate it now, |
|---|
| 491 | + * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will |
|---|
| 492 | + * cause any racers to loop around until we add it to cache. |
|---|
| 410 | 493 | */ |
|---|
| 411 | | - if (!new_page) { |
|---|
| 412 | | - new_page = alloc_page_vma(gfp_mask, vma, addr); |
|---|
| 413 | | - if (!new_page) |
|---|
| 414 | | - break; /* Out of memory */ |
|---|
| 415 | | - } |
|---|
| 416 | | - |
|---|
| 417 | | - /* |
|---|
| 418 | | - * call radix_tree_preload() while we can wait. |
|---|
| 419 | | - */ |
|---|
| 420 | | - err = radix_tree_maybe_preload(gfp_mask & GFP_RECLAIM_MASK); |
|---|
| 421 | | - if (err) |
|---|
| 422 | | - break; |
|---|
| 494 | + page = alloc_page_vma(gfp_mask, vma, addr); |
|---|
| 495 | + if (!page) |
|---|
| 496 | + return NULL; |
|---|
| 423 | 497 | |
|---|
| 424 | 498 | /* |
|---|
| 425 | 499 | * Swap entry may have been freed since our caller observed it. |
|---|
| 426 | 500 | */ |
|---|
| 427 | 501 | err = swapcache_prepare(entry); |
|---|
| 428 | | - if (err == -EEXIST) { |
|---|
| 429 | | - radix_tree_preload_end(); |
|---|
| 430 | | - /* |
|---|
| 431 | | - * We might race against get_swap_page() and stumble |
|---|
| 432 | | - * across a SWAP_HAS_CACHE swap_map entry whose page |
|---|
| 433 | | - * has not been brought into the swapcache yet. |
|---|
| 434 | | - */ |
|---|
| 435 | | - cond_resched(); |
|---|
| 436 | | - continue; |
|---|
| 437 | | - } |
|---|
| 438 | | - if (err) { /* swp entry is obsolete ? */ |
|---|
| 439 | | - radix_tree_preload_end(); |
|---|
| 502 | + if (!err) |
|---|
| 440 | 503 | break; |
|---|
| 441 | | - } |
|---|
| 442 | 504 | |
|---|
| 443 | | - /* May fail (-ENOMEM) if radix-tree node allocation failed. */ |
|---|
| 444 | | - __SetPageLocked(new_page); |
|---|
| 445 | | - __SetPageSwapBacked(new_page); |
|---|
| 446 | | - err = __add_to_swap_cache(new_page, entry); |
|---|
| 447 | | - if (likely(!err)) { |
|---|
| 448 | | - radix_tree_preload_end(); |
|---|
| 449 | | - /* |
|---|
| 450 | | - * Initiate read into locked page and return. |
|---|
| 451 | | - */ |
|---|
| 452 | | - SetPageWorkingset(new_page); |
|---|
| 453 | | - lru_cache_add_anon(new_page); |
|---|
| 454 | | - *new_page_allocated = true; |
|---|
| 455 | | - return new_page; |
|---|
| 456 | | - } |
|---|
| 457 | | - radix_tree_preload_end(); |
|---|
| 458 | | - __ClearPageLocked(new_page); |
|---|
| 505 | + put_page(page); |
|---|
| 506 | + if (err != -EEXIST) |
|---|
| 507 | + return NULL; |
|---|
| 508 | + |
|---|
| 459 | 509 | /* |
|---|
| 460 | | - * add_to_swap_cache() doesn't return -EEXIST, so we can safely |
|---|
| 461 | | - * clear SWAP_HAS_CACHE flag. |
|---|
| 510 | + * We might race against __delete_from_swap_cache(), and |
|---|
| 511 | + * stumble across a swap_map entry whose SWAP_HAS_CACHE |
|---|
| 512 | + * has not yet been cleared. Or race against another |
|---|
| 513 | + * __read_swap_cache_async(), which has set SWAP_HAS_CACHE |
|---|
| 514 | + * in swap_map, but not yet added its page to swap cache. |
|---|
| 462 | 515 | */ |
|---|
| 463 | | - put_swap_page(new_page, entry); |
|---|
| 464 | | - } while (err != -ENOMEM); |
|---|
| 516 | + schedule_timeout_uninterruptible(1); |
|---|
| 517 | + } |
|---|
| 465 | 518 | |
|---|
| 466 | | - if (new_page) |
|---|
| 467 | | - put_page(new_page); |
|---|
| 468 | | - return found_page; |
|---|
| 519 | + /* |
|---|
| 520 | + * The swap entry is ours to swap in. Prepare the new page. |
|---|
| 521 | + */ |
|---|
| 522 | + |
|---|
| 523 | + __SetPageLocked(page); |
|---|
| 524 | + __SetPageSwapBacked(page); |
|---|
| 525 | + |
|---|
| 526 | + /* May fail (-ENOMEM) if XArray node allocation failed. */ |
|---|
| 527 | + if (add_to_swap_cache(page, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) { |
|---|
| 528 | + put_swap_page(page, entry); |
|---|
| 529 | + goto fail_unlock; |
|---|
| 530 | + } |
|---|
| 531 | + |
|---|
| 532 | + if (mem_cgroup_charge(page, NULL, gfp_mask)) { |
|---|
| 533 | + delete_from_swap_cache(page); |
|---|
| 534 | + goto fail_unlock; |
|---|
| 535 | + } |
|---|
| 536 | + |
|---|
| 537 | + if (shadow) |
|---|
| 538 | + workingset_refault(page, shadow); |
|---|
| 539 | + |
|---|
| 540 | + /* Caller will initiate read into locked page */ |
|---|
| 541 | + SetPageWorkingset(page); |
|---|
| 542 | + lru_cache_add(page); |
|---|
| 543 | + *new_page_allocated = true; |
|---|
| 544 | + return page; |
|---|
| 545 | + |
|---|
| 546 | +fail_unlock: |
|---|
| 547 | + unlock_page(page); |
|---|
| 548 | + put_page(page); |
|---|
| 549 | + return NULL; |
|---|
| 469 | 550 | } |
|---|
| 470 | 551 | |
|---|
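After the restructuring above, `__read_swap_cache_async()` either returns a page that is already in (or on its way into) the swap cache, or returns a freshly allocated, locked page with `*new_page_allocated` set, in which case the caller owns starting the I/O. A hedged sketch of how a caller drives it; `swap_readpage()` is the existing swap I/O entry point, and the wrapper name is illustrative:

```c
static struct page *demo_read_swap_page(swp_entry_t entry, gfp_t gfp,
					struct vm_area_struct *vma,
					unsigned long addr)
{
	bool new_page_allocated;
	struct page *page;

	page = __read_swap_cache_async(entry, gfp, vma, addr,
				       &new_page_allocated);
	if (page && new_page_allocated)
		swap_readpage(page, false);	/* we allocated it: start the read */
	return page;	/* NULL, an existing cache page, or the page being read */
}
```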
| 471 | 552 | /* |
|---|
| .. | .. |
|---|
| 565 | 646 | * the readahead. |
|---|
| 566 | 647 | * |
|---|
| 567 | 648 | * Caller must hold down_read on the vma->vm_mm if vmf->vma is not NULL. |
|---|
| 649 | + * This is needed to ensure the VMA will not be freed behind our back. In the case |
|---|
| 650 | + * of the speculative page fault handler, this cannot happen, even if we don't |
|---|
| 651 | + * hold the mmap_sem. Callees are assumed to take care of reading VMA's fields |
|---|
| 652 | + * using READ_ONCE() to read consistent values. |
|---|
| 568 | 653 | */ |
|---|
| 569 | 654 | struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, |
|---|
| 570 | 655 | struct vm_fault *vmf) |
|---|
| .. | .. |
|---|
| 583 | 668 | mask = swapin_nr_pages(offset) - 1; |
|---|
| 584 | 669 | if (!mask) |
|---|
| 585 | 670 | goto skip; |
|---|
| 671 | + |
|---|
| 672 | + /* Test swap type to make sure the dereference is safe */ |
|---|
| 673 | + if (likely(si->flags & (SWP_BLKDEV | SWP_FS_OPS))) { |
|---|
| 674 | + struct inode *inode = si->swap_file->f_mapping->host; |
|---|
| 675 | + if (inode_read_congested(inode)) |
|---|
| 676 | + goto skip; |
|---|
| 677 | + } |
|---|
| 586 | 678 | |
|---|
| 587 | 679 | do_poll = false; |
|---|
| 588 | 680 | /* Read a page_cluster sized and aligned cluster around offset. */ |
|---|
| .. | .. |
|---|
| 628 | 720 | return -ENOMEM; |
|---|
| 629 | 721 | for (i = 0; i < nr; i++) { |
|---|
| 630 | 722 | space = spaces + i; |
|---|
| 631 | | - INIT_RADIX_TREE(&space->i_pages, GFP_ATOMIC|__GFP_NOWARN); |
|---|
| 723 | + xa_init_flags(&space->i_pages, XA_FLAGS_LOCK_IRQ); |
|---|
| 632 | 724 | atomic_set(&space->i_mmap_writable, 0); |
|---|
| 633 | 725 | space->a_ops = &swap_aops; |
|---|
| 634 | 726 | /* swap cache doesn't use writeback related tags */ |
|---|
| 635 | 727 | mapping_set_no_writeback_tags(space); |
|---|
| 636 | 728 | } |
|---|
| 637 | 729 | nr_swapper_spaces[type] = nr; |
|---|
| 638 | | - rcu_assign_pointer(swapper_spaces[type], spaces); |
|---|
| 730 | + swapper_spaces[type] = spaces; |
|---|
| 639 | 731 | |
|---|
| 640 | 732 | return 0; |
|---|
| 641 | 733 | } |
|---|
| 642 | 734 | |
|---|
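`init_swap_address_space()` now initialises each space's `i_pages` with `xa_init_flags()` instead of `INIT_RADIX_TREE()`; `XA_FLAGS_LOCK_IRQ` records that the tree lock is taken with interrupts disabled, which keeps lockdep and the XArray's internal locking assertions honest. A minimal sketch of the same initialisation on an illustrative structure:

```c
#include <linux/xarray.h>

struct demo_cache {
	struct xarray slots;
};

static void demo_cache_init(struct demo_cache *cache)
{
	/* Users of this XArray must take its lock as xa_lock_irq()/irqsave(). */
	xa_init_flags(&cache->slots, XA_FLAGS_LOCK_IRQ);
}
```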
| 643 | 735 | void exit_swap_address_space(unsigned int type) |
|---|
| 644 | 736 | { |
|---|
| 645 | | - struct address_space *spaces; |
|---|
| 646 | | - |
|---|
| 647 | | - spaces = swapper_spaces[type]; |
|---|
| 737 | + kvfree(swapper_spaces[type]); |
|---|
| 648 | 738 | nr_swapper_spaces[type] = 0; |
|---|
| 649 | | - rcu_assign_pointer(swapper_spaces[type], NULL); |
|---|
| 650 | | - synchronize_rcu(); |
|---|
| 651 | | - kvfree(spaces); |
|---|
| 739 | + swapper_spaces[type] = NULL; |
|---|
| 652 | 740 | } |
|---|
| 653 | 741 | |
|---|
| 654 | 742 | static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma, |
|---|
| .. | .. |
|---|
| 658 | 746 | unsigned long *start, |
|---|
| 659 | 747 | unsigned long *end) |
|---|
| 660 | 748 | { |
|---|
| 661 | | - *start = max3(lpfn, PFN_DOWN(vma->vm_start), |
|---|
| 749 | + *start = max3(lpfn, PFN_DOWN(READ_ONCE(vma->vm_start)), |
|---|
| 662 | 750 | PFN_DOWN(faddr & PMD_MASK)); |
|---|
| 663 | | - *end = min3(rpfn, PFN_DOWN(vma->vm_end), |
|---|
| 751 | + *end = min3(rpfn, PFN_DOWN(READ_ONCE(vma->vm_end)), |
|---|
| 664 | 752 | PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE)); |
|---|
| 665 | 753 | } |
|---|
| 666 | 754 | |
|---|
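`swap_ra_clamp_pfn()` now reads `vma->vm_start` and `vma->vm_end` through `READ_ONCE()` so that a walker not holding `mmap_sem` (the speculative fault path mentioned earlier) gets a single, untorn load of each field. A minimal sketch of the same snapshot pattern; the bounds-check helper is illustrative:

```c
#include <linux/mm.h>

static bool demo_addr_in_vma(struct vm_area_struct *vma, unsigned long addr)
{
	/* One load per field: the compiler may neither re-read nor tear these. */
	unsigned long start = READ_ONCE(vma->vm_start);
	unsigned long end   = READ_ONCE(vma->vm_end);

	return addr >= start && addr < end;
}
```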
| .. | .. |
|---|
| 732 | 820 | pte_unmap(orig_pte); |
|---|
| 733 | 821 | } |
|---|
| 734 | 822 | |
|---|
| 823 | +/** |
|---|
| 824 | + * swap_vma_readahead - swap in pages in hope we need them soon |
|---|
| 825 | + * @fentry: swap entry of this memory |
|---|
| 826 | + * @gfp_mask: memory allocation flags |
|---|
| 827 | + * @vmf: fault information |
|---|
| 828 | + * |
|---|
| 829 | + * Returns the struct page for entry and addr, after queueing swapin. |
|---|
| 830 | + * |
|---|
| 831 | + * Primitive swap readahead code. We simply read in a few pages whose |
|---|
| 832 | + * virtual addresses are around the fault address in the same vma. |
|---|
| 833 | + * |
|---|
| 834 | + * Caller must hold read mmap_lock if vmf->vma is not NULL. |
|---|
| 835 | + * |
|---|
| 836 | + */ |
|---|
| 735 | 837 | static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, |
|---|
| 736 | 838 | struct vm_fault *vmf) |
|---|
| 737 | 839 | { |
|---|