2024-12-19 9370bb92b2d16684ee45cf24e879c93c509162da
kernel/mm/swap_state.c
@@ -21,8 +21,7 @@
 #include <linux/vmalloc.h>
 #include <linux/swap_slots.h>
 #include <linux/huge_mm.h>
-
-#include <asm/pgtable.h>
+#include <linux/shmem_fs.h>
 #include "internal.h"
 
 /*
@@ -59,8 +58,8 @@
 #define GET_SWAP_RA_VAL(vma)					\
 	(atomic_long_read(&(vma)->swap_readahead_info) ? : 4)
 
-#define INC_CACHE_INFO(x)	do { swap_cache_info.x++; } while (0)
-#define ADD_CACHE_INFO(x, nr)	do { swap_cache_info.x += (nr); } while (0)
+#define INC_CACHE_INFO(x)	data_race(swap_cache_info.x++)
+#define ADD_CACHE_INFO(x, nr)	data_race(swap_cache_info.x += (nr))
 
 static struct {
 	unsigned long add_total;
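
For context on the INC_CACHE_INFO()/ADD_CACHE_INFO() change above: data_race() (from <linux/compiler.h>) marks a plain access as an intentional, tolerated data race so KCSAN does not flag it; it adds no synchronization of its own. A minimal sketch of the pattern, with invented counter and helper names:

static unsigned long hits;		/* updated locklessly from many CPUs */

static inline void note_hit(void)
{
	data_race(hits++);		/* racy by design; stats may be approximate */
}

static inline unsigned long read_hits(void)
{
	return data_race(hits);		/* possibly stale snapshot is acceptable */
}
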
@@ -74,25 +73,27 @@
 	unsigned int i, j, nr;
 	unsigned long ret = 0;
 	struct address_space *spaces;
+	struct swap_info_struct *si;
 
-	rcu_read_lock();
 	for (i = 0; i < MAX_SWAPFILES; i++) {
-		/*
-		 * The corresponding entries in nr_swapper_spaces and
-		 * swapper_spaces will be reused only after at least
-		 * one grace period. So it is impossible for them
-		 * belongs to different usage.
-		 */
-		nr = nr_swapper_spaces[i];
-		spaces = rcu_dereference(swapper_spaces[i]);
-		if (!nr || !spaces)
+		swp_entry_t entry = swp_entry(i, 1);
+
+		/* Avoid get_swap_device() to warn for bad swap entry */
+		if (!swp_swap_info(entry))
 			continue;
+		/* Prevent swapoff to free swapper_spaces */
+		si = get_swap_device(entry);
+		if (!si)
+			continue;
+		nr = nr_swapper_spaces[i];
+		spaces = swapper_spaces[i];
 		for (j = 0; j < nr; j++)
 			ret += spaces[j].nrpages;
+		put_swap_device(si);
 	}
-	rcu_read_unlock();
 	return ret;
 }
+EXPORT_SYMBOL_GPL(total_swapcache_pages);
 
 static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);
 
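
The rework of total_swapcache_pages() above replaces the RCU convention with a get_swap_device()/put_swap_device() reference, which pins the swap_info_struct so a racing swapoff cannot free swapper_spaces[] under the loop. The same pairing, reduced to a minimal sketch (the helper name peek_swap_cache() is invented; the calls are the real interfaces used throughout this patch):

static struct page *peek_swap_cache(swp_entry_t entry)
{
	struct swap_info_struct *si;
	struct page *page = NULL;

	si = get_swap_device(entry);		/* fails if swapoff already ran */
	if (si) {
		page = find_get_page(swap_address_space(entry),
				     swp_offset(entry));
		put_swap_device(si);		/* drop the pin once done */
	}
	return page;
}
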
@@ -107,15 +108,32 @@
 	printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
 }
 
+void *get_shadow_from_swap_cache(swp_entry_t entry)
+{
+	struct address_space *address_space = swap_address_space(entry);
+	pgoff_t idx = swp_offset(entry);
+	struct page *page;
+
+	page = find_get_entry(address_space, idx);
+	if (xa_is_value(page))
+		return page;
+	if (page)
+		put_page(page);
+	return NULL;
+}
+
 /*
- * __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
+ * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
  * but sets SwapCache flag and private instead of mapping and index.
  */
-int __add_to_swap_cache(struct page *page, swp_entry_t entry)
+int add_to_swap_cache(struct page *page, swp_entry_t entry,
+			gfp_t gfp, void **shadowp)
 {
-	int error, i, nr = hpage_nr_pages(page);
-	struct address_space *address_space;
+	struct address_space *address_space = swap_address_space(entry);
 	pgoff_t idx = swp_offset(entry);
+	XA_STATE_ORDER(xas, &address_space->i_pages, idx, compound_order(page));
+	unsigned long i, nr = thp_nr_pages(page);
+	void *old;
 
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
 	VM_BUG_ON_PAGE(PageSwapCache(page), page);
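
get_shadow_from_swap_cache() above returns the XArray value entry (the workingset shadow) left behind when a page was reclaimed, or NULL. A hedged sketch of how a swap-in path can consume it; the wrapper name note_refault() is invented, while workingset_refault() is the real consumer (and __read_swap_cache_async() later in this patch does the same thing through the shadowp argument of add_to_swap_cache()):

static void note_refault(struct page *page, swp_entry_t entry)
{
	void *shadow = get_shadow_from_swap_cache(entry);

	if (shadow)			/* shadow is an xa_is_value() entry */
		workingset_refault(page, shadow);
}
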
@@ -124,75 +142,66 @@
 	page_ref_add(page, nr);
 	SetPageSwapCache(page);
 
-	address_space = swap_address_space(entry);
-	xa_lock_irq(&address_space->i_pages);
-	for (i = 0; i < nr; i++) {
-		set_page_private(page + i, entry.val + i);
-		error = radix_tree_insert(&address_space->i_pages,
-					  idx + i, page + i);
-		if (unlikely(error))
-			break;
-	}
-	if (likely(!error)) {
+	do {
+		unsigned long nr_shadows = 0;
+
+		xas_lock_irq(&xas);
+		xas_create_range(&xas);
+		if (xas_error(&xas))
+			goto unlock;
+		for (i = 0; i < nr; i++) {
+			VM_BUG_ON_PAGE(xas.xa_index != idx + i, page);
+			old = xas_load(&xas);
+			if (xa_is_value(old)) {
+				nr_shadows++;
+				if (shadowp)
+					*shadowp = old;
+			}
+			set_page_private(page + i, entry.val + i);
+			xas_store(&xas, page);
+			xas_next(&xas);
+		}
+		address_space->nrexceptional -= nr_shadows;
 		address_space->nrpages += nr;
 		__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
 		ADD_CACHE_INFO(add_total, nr);
-	} else {
-		/*
-		 * Only the context which have set SWAP_HAS_CACHE flag
-		 * would call add_to_swap_cache().
-		 * So add_to_swap_cache() doesn't returns -EEXIST.
-		 */
-		VM_BUG_ON(error == -EEXIST);
-		set_page_private(page + i, 0UL);
-		while (i--) {
-			radix_tree_delete(&address_space->i_pages, idx + i);
-			set_page_private(page + i, 0UL);
-		}
-		ClearPageSwapCache(page);
-		page_ref_sub(page, nr);
-	}
-	xa_unlock_irq(&address_space->i_pages);
+unlock:
+		xas_unlock_irq(&xas);
+	} while (xas_nomem(&xas, gfp));
 
-	return error;
-}
+	if (!xas_error(&xas))
+		return 0;
 
-
-int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
-{
-	int error;
-
-	error = radix_tree_maybe_preload_order(gfp_mask, compound_order(page));
-	if (!error) {
-		error = __add_to_swap_cache(page, entry);
-		radix_tree_preload_end();
-	}
-	return error;
+	ClearPageSwapCache(page);
+	page_ref_sub(page, nr);
+	return xas_error(&xas);
 }
 
 /*
  * This must be called only on pages that have
  * been verified to be in the swap cache.
  */
-void __delete_from_swap_cache(struct page *page)
+void __delete_from_swap_cache(struct page *page,
+			swp_entry_t entry, void *shadow)
 {
-	struct address_space *address_space;
-	int i, nr = hpage_nr_pages(page);
-	swp_entry_t entry;
-	pgoff_t idx;
+	struct address_space *address_space = swap_address_space(entry);
+	int i, nr = thp_nr_pages(page);
+	pgoff_t idx = swp_offset(entry);
+	XA_STATE(xas, &address_space->i_pages, idx);
 
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
 	VM_BUG_ON_PAGE(!PageSwapCache(page), page);
 	VM_BUG_ON_PAGE(PageWriteback(page), page);
 
-	entry.val = page_private(page);
-	address_space = swap_address_space(entry);
-	idx = swp_offset(entry);
 	for (i = 0; i < nr; i++) {
-		radix_tree_delete(&address_space->i_pages, idx + i);
+		void *entry = xas_store(&xas, shadow);
+		VM_BUG_ON_PAGE(entry != page, entry);
 		set_page_private(page + i, 0);
+		xas_next(&xas);
 	}
 	ClearPageSwapCache(page);
+	if (shadow)
+		address_space->nrexceptional += nr;
 	address_space->nrpages -= nr;
 	__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
 	ADD_CACHE_INFO(del_total, nr);
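
The do { ... } while (xas_nomem(...)) loop in add_to_swap_cache() above is the standard XArray pattern: attempt the store under the lock, and if it failed for lack of memory, let xas_nomem() allocate a node outside the lock and request a retry. A stripped-down sketch of the same pattern, using an invented array/index/item triple rather than anything from this patch:

static int store_item(struct xarray *xa, unsigned long index,
		      void *item, gfp_t gfp)
{
	XA_STATE(xas, xa, index);

	do {
		xas_lock_irq(&xas);
		xas_store(&xas, item);		/* may record -ENOMEM in xas */
		xas_unlock_irq(&xas);
	} while (xas_nomem(&xas, gfp));		/* allocate outside the lock, retry */

	return xas_error(&xas);
}
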
@@ -218,7 +227,7 @@
 		return 0;
 
 	/*
-	 * Radix-tree node allocations from PF_MEMALLOC contexts could
+	 * XArray node allocations from PF_MEMALLOC contexts could
 	 * completely exhaust the page allocator.  __GFP_NOMEMALLOC
 	 * stops emergency reserves from being allocated.
 	 *
@@ -229,8 +238,7 @@
 	 * Add it to the swap cache.
 	 */
 	err = add_to_swap_cache(page, entry,
-			__GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN);
-	/* -ENOMEM radix-tree allocation failure */
+			__GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN, NULL);
 	if (err)
 		/*
 		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
@@ -239,7 +247,7 @@
 		goto fail;
 	/*
 	 * Normally the page will be dirtied in unmap because its pte should be
-	 * dirty. A special case is MADV_FREE page. The page'e pte could have
+	 * dirty. A special case is MADV_FREE page. The page's pte could have
 	 * dirty bit cleared but the page's SwapBacked bit is still set because
 	 * clearing the dirty bit and SwapBacked bit has no lock protected. For
 	 * such page, unmap will not set dirty bit for it, so page reclaim will
@@ -264,18 +272,46 @@
  */
 void delete_from_swap_cache(struct page *page)
 {
-	swp_entry_t entry;
-	struct address_space *address_space;
+	swp_entry_t entry = { .val = page_private(page) };
+	struct address_space *address_space = swap_address_space(entry);
 
-	entry.val = page_private(page);
-
-	address_space = swap_address_space(entry);
 	xa_lock_irq(&address_space->i_pages);
-	__delete_from_swap_cache(page);
+	__delete_from_swap_cache(page, entry, NULL);
 	xa_unlock_irq(&address_space->i_pages);
 
 	put_swap_page(page, entry);
-	page_ref_sub(page, hpage_nr_pages(page));
+	page_ref_sub(page, thp_nr_pages(page));
+}
+
+void clear_shadow_from_swap_cache(int type, unsigned long begin,
+				unsigned long end)
+{
+	unsigned long curr = begin;
+	void *old;
+
+	for (;;) {
+		unsigned long nr_shadows = 0;
+		swp_entry_t entry = swp_entry(type, curr);
+		struct address_space *address_space = swap_address_space(entry);
+		XA_STATE(xas, &address_space->i_pages, curr);
+
+		xa_lock_irq(&address_space->i_pages);
+		xas_for_each(&xas, old, end) {
+			if (!xa_is_value(old))
+				continue;
+			xas_store(&xas, NULL);
+			nr_shadows++;
+		}
+		address_space->nrexceptional -= nr_shadows;
+		xa_unlock_irq(&address_space->i_pages);
+
+		/* search the next swapcache until we meet end */
+		curr >>= SWAP_ADDRESS_SPACE_SHIFT;
+		curr++;
+		curr <<= SWAP_ADDRESS_SPACE_SHIFT;
+		if (curr > end)
+			break;
+	}
 }
 
 /*
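
In clear_shadow_from_swap_cache() above, the shift/increment/shift sequence advances curr to the first offset of the next per-swapfile address_space chunk (each chunk covers SWAP_ADDRESS_SPACE_PAGES = 1 << SWAP_ADDRESS_SPACE_SHIFT swap slots). The same step written as a rounding helper, purely for illustration (the helper name is invented):

static unsigned long next_swapcache_offset(unsigned long curr)
{
	/* Equivalent to: curr >>= SHIFT; curr++; curr <<= SHIFT; */
	return round_down(curr, SWAP_ADDRESS_SPACE_PAGES) +
	       SWAP_ADDRESS_SPACE_PAGES;
}
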
@@ -335,8 +371,13 @@
 			       unsigned long addr)
 {
 	struct page *page;
+	struct swap_info_struct *si;
 
+	si = get_swap_device(entry);
+	if (!si)
+		return NULL;
 	page = find_get_page(swap_address_space(entry), swp_offset(entry));
+	put_swap_device(si);
 
 	INC_CACHE_INFO(find_total);
 	if (page) {
@@ -375,24 +416,64 @@
 	return page;
 }
 
+/**
+ * find_get_incore_page - Find and get a page from the page or swap caches.
+ * @mapping: The address_space to search.
+ * @index: The page cache index.
+ *
+ * This differs from find_get_page() in that it will also look for the
+ * page in the swap cache.
+ *
+ * Return: The found page or %NULL.
+ */
+struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index)
+{
+	swp_entry_t swp;
+	struct swap_info_struct *si;
+	struct page *page = find_get_entry(mapping, index);
+
+	if (!page)
+		return page;
+	if (!xa_is_value(page))
+		return find_subpage(page, index);
+	if (!shmem_mapping(mapping))
+		return NULL;
+
+	swp = radix_to_swp_entry(page);
+	/* Prevent swapoff from happening to us */
+	si = get_swap_device(swp);
+	if (!si)
+		return NULL;
+	page = find_get_page(swap_address_space(swp), swp_offset(swp));
+	put_swap_device(si);
+	return page;
+}
+
 struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 			struct vm_area_struct *vma, unsigned long addr,
 			bool *new_page_allocated)
 {
-	struct page *found_page, *new_page = NULL;
-	struct address_space *swapper_space = swap_address_space(entry);
-	int err;
+	struct swap_info_struct *si;
+	struct page *page;
+	void *shadow = NULL;
+
 	*new_page_allocated = false;
 
-	do {
+	for (;;) {
+		int err;
 		/*
 		 * First check the swap cache.  Since this is normally
 		 * called after lookup_swap_cache() failed, re-calling
 		 * that would confuse statistics.
 		 */
-		found_page = find_get_page(swapper_space, swp_offset(entry));
-		if (found_page)
-			break;
+		si = get_swap_device(entry);
+		if (!si)
+			return NULL;
+		page = find_get_page(swap_address_space(entry),
+				     swp_offset(entry));
+		put_swap_device(si);
+		if (page)
+			return page;
 
 		/*
 		 * Just skip read ahead for unused swap slot.
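
find_get_incore_page() above reports a page as present whether it sits in the page cache or, for shmem, in the swap cache. A sketch of the kind of presence test a caller such as mincore() can build on top of it (the wrapper name page_is_incore() is invented):

static bool page_is_incore(struct address_space *mapping, pgoff_t index)
{
	struct page *page = find_get_incore_page(mapping, index);

	if (!page)
		return false;
	put_page(page);			/* drop the reference taken by the lookup */
	return true;
}
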
@@ -403,69 +484,69 @@
 		 * else swap_off will be aborted if we return NULL.
 		 */
 		if (!__swp_swapcount(entry) && swap_slot_cache_enabled)
-			break;
+			return NULL;
 
 		/*
-		 * Get a new page to read into from swap.
+		 * Get a new page to read into from swap.  Allocate it now,
+		 * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will
+		 * cause any racers to loop around until we add it to cache.
 		 */
-		if (!new_page) {
-			new_page = alloc_page_vma(gfp_mask, vma, addr);
-			if (!new_page)
-				break;		/* Out of memory */
-		}
-
-		/*
-		 * call radix_tree_preload() while we can wait.
-		 */
-		err = radix_tree_maybe_preload(gfp_mask & GFP_RECLAIM_MASK);
-		if (err)
-			break;
+		page = alloc_page_vma(gfp_mask, vma, addr);
+		if (!page)
+			return NULL;
 
 		/*
 		 * Swap entry may have been freed since our caller observed it.
 		 */
 		err = swapcache_prepare(entry);
-		if (err == -EEXIST) {
-			radix_tree_preload_end();
-			/*
-			 * We might race against get_swap_page() and stumble
-			 * across a SWAP_HAS_CACHE swap_map entry whose page
-			 * has not been brought into the swapcache yet.
-			 */
-			cond_resched();
-			continue;
-		}
-		if (err) {		/* swp entry is obsolete ? */
-			radix_tree_preload_end();
+		if (!err)
 			break;
-		}
 
-		/* May fail (-ENOMEM) if radix-tree node allocation failed. */
-		__SetPageLocked(new_page);
-		__SetPageSwapBacked(new_page);
-		err = __add_to_swap_cache(new_page, entry);
-		if (likely(!err)) {
-			radix_tree_preload_end();
-			/*
-			 * Initiate read into locked page and return.
-			 */
-			SetPageWorkingset(new_page);
-			lru_cache_add_anon(new_page);
-			*new_page_allocated = true;
-			return new_page;
-		}
-		radix_tree_preload_end();
-		__ClearPageLocked(new_page);
+		put_page(page);
+		if (err != -EEXIST)
+			return NULL;
+
 		/*
-		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
-		 * clear SWAP_HAS_CACHE flag.
+		 * We might race against __delete_from_swap_cache(), and
+		 * stumble across a swap_map entry whose SWAP_HAS_CACHE
+		 * has not yet been cleared.  Or race against another
+		 * __read_swap_cache_async(), which has set SWAP_HAS_CACHE
+		 * in swap_map, but not yet added its page to swap cache.
 		 */
-		put_swap_page(new_page, entry);
-	} while (err != -ENOMEM);
+		schedule_timeout_uninterruptible(1);
+	}
 
-	if (new_page)
-		put_page(new_page);
-	return found_page;
+	/*
+	 * The swap entry is ours to swap in. Prepare the new page.
+	 */
+
+	__SetPageLocked(page);
+	__SetPageSwapBacked(page);
+
+	/* May fail (-ENOMEM) if XArray node allocation failed. */
+	if (add_to_swap_cache(page, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) {
+		put_swap_page(page, entry);
+		goto fail_unlock;
+	}
+
+	if (mem_cgroup_charge(page, NULL, gfp_mask)) {
+		delete_from_swap_cache(page);
+		goto fail_unlock;
+	}
+
+	if (shadow)
+		workingset_refault(page, shadow);
+
+	/* Caller will initiate read into locked page */
+	SetPageWorkingset(page);
+	lru_cache_add(page);
+	*new_page_allocated = true;
+	return page;
+
+fail_unlock:
+	unlock_page(page);
+	put_page(page);
+	return NULL;
 }
 
 /*
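
The restructured __read_swap_cache_async() above returns the page locked, with *new_page_allocated set only in the context that actually inserted it into the swap cache; that context is responsible for starting the read. A sketch of the usual caller pattern, mirroring read_swap_cache_async() elsewhere in this file (the wrapper name swapin_one_page() is invented):

static struct page *swapin_one_page(swp_entry_t entry, gfp_t gfp_mask,
				    struct vm_area_struct *vma,
				    unsigned long addr, bool do_poll)
{
	bool page_allocated;
	struct page *page = __read_swap_cache_async(entry, gfp_mask, vma,
						    addr, &page_allocated);

	if (page_allocated)
		swap_readpage(page, do_poll);	/* start I/O on the locked page */
	return page;
}
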
@@ -565,6 +646,10 @@
  * the readahead.
  *
  * Caller must hold down_read on the vma->vm_mm if vmf->vma is not NULL.
+ * This is needed to ensure the VMA will not be freed behind our back. In the case
+ * of the speculative page fault handler, this cannot happen, even if we don't
+ * hold the mmap_sem. Callees are assumed to take care of reading VMA's fields
+ * using READ_ONCE() to read consistent values.
  */
 struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
 				struct vm_fault *vmf)
@@ -583,6 +668,13 @@
 	mask = swapin_nr_pages(offset) - 1;
 	if (!mask)
 		goto skip;
+
+	/* Test swap type to make sure the dereference is safe */
+	if (likely(si->flags & (SWP_BLKDEV | SWP_FS_OPS))) {
+		struct inode *inode = si->swap_file->f_mapping->host;
+		if (inode_read_congested(inode))
+			goto skip;
+	}
 
 	do_poll = false;
 	/* Read a page_cluster sized and aligned cluster around offset. */
@@ -628,27 +720,23 @@
 		return -ENOMEM;
 	for (i = 0; i < nr; i++) {
 		space = spaces + i;
-		INIT_RADIX_TREE(&space->i_pages, GFP_ATOMIC|__GFP_NOWARN);
+		xa_init_flags(&space->i_pages, XA_FLAGS_LOCK_IRQ);
 		atomic_set(&space->i_mmap_writable, 0);
 		space->a_ops = &swap_aops;
 		/* swap cache doesn't use writeback related tags */
 		mapping_set_no_writeback_tags(space);
 	}
 	nr_swapper_spaces[type] = nr;
-	rcu_assign_pointer(swapper_spaces[type], spaces);
+	swapper_spaces[type] = spaces;
 
 	return 0;
 }
 
 void exit_swap_address_space(unsigned int type)
 {
-	struct address_space *spaces;
-
-	spaces = swapper_spaces[type];
+	kvfree(swapper_spaces[type]);
 	nr_swapper_spaces[type] = 0;
-	rcu_assign_pointer(swapper_spaces[type], NULL);
-	synchronize_rcu();
-	kvfree(spaces);
+	swapper_spaces[type] = NULL;
 }
 
 static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma,
@@ -658,9 +746,9 @@
 				     unsigned long *start,
 				     unsigned long *end)
 {
-	*start = max3(lpfn, PFN_DOWN(vma->vm_start),
+	*start = max3(lpfn, PFN_DOWN(READ_ONCE(vma->vm_start)),
 		      PFN_DOWN(faddr & PMD_MASK));
-	*end = min3(rpfn, PFN_DOWN(vma->vm_end),
+	*end = min3(rpfn, PFN_DOWN(READ_ONCE(vma->vm_end)),
 		    PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE));
 }
 
@@ -732,6 +820,20 @@
 	pte_unmap(orig_pte);
 }
 
+/**
+ * swap_vma_readahead - swap in pages in hope we need them soon
+ * @fentry: swap entry of this memory
+ * @gfp_mask: memory allocation flags
+ * @vmf: fault information
+ *
+ * Returns the struct page for entry and addr, after queueing swapin.
+ *
+ * Primitive swap readahead code. We simply read in a few pages whose
+ * virtual addresses are around the fault address in the same vma.
+ *
+ * Caller must hold read mmap_lock if vmf->vma is not NULL.
+ *
+ */
 static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
 				       struct vm_fault *vmf)
 {