2023-12-09 95099d4622f8cb224d94e314c7a8e0df60b13f87
kernel/mm/shmem.c
....@@ -36,8 +36,17 @@
3636 #include <linux/uio.h>
3737 #include <linux/khugepaged.h>
3838 #include <linux/hugetlb.h>
39
+#include <linux/frontswap.h>
40
+#include <linux/fs_parser.h>
41
+#include <linux/mm_inline.h>
3942
4043 #include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */
44
+
45
+#include "internal.h"
46
+
47
+#undef CREATE_TRACE_POINTS
48
+#include <trace/hooks/shmem_fs.h>
49
+#include <trace/hooks/mm.h>
4150
4251 static struct vfsmount *shm_mnt;
4352
....@@ -80,7 +89,6 @@
8089 #include <linux/uuid.h>
8190
8291 #include <linux/uaccess.h>
83
-#include <asm/pgtable.h>
8492
8593 #include "internal.h"
8694
....@@ -106,21 +114,43 @@
106114 pgoff_t nr_unswapped; /* how often writepage refused to swap out */
107115 };
108116
117
+struct shmem_options {
118
+ unsigned long long blocks;
119
+ unsigned long long inodes;
120
+ struct mempolicy *mpol;
121
+ kuid_t uid;
122
+ kgid_t gid;
123
+ umode_t mode;
124
+ bool full_inums;
125
+ int huge;
126
+ int seen;
127
+#define SHMEM_SEEN_BLOCKS 1
128
+#define SHMEM_SEEN_INODES 2
129
+#define SHMEM_SEEN_HUGE 4
130
+#define SHMEM_SEEN_INUMS 8
131
+};
132
+
109133 #ifdef CONFIG_TMPFS
110134 static unsigned long shmem_default_max_blocks(void)
111135 {
112
- return totalram_pages / 2;
136
+ return totalram_pages() / 2;
113137 }
114138
115139 static unsigned long shmem_default_max_inodes(void)
116140 {
117
- return min(totalram_pages - totalhigh_pages, totalram_pages / 2);
141
+ unsigned long nr_pages = totalram_pages();
142
+
143
+ return min(nr_pages - totalhigh_pages(), nr_pages / 2);
118144 }
119145 #endif
120146
121147 static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
122148 static int shmem_replace_page(struct page **pagep, gfp_t gfp,
123149 struct shmem_inode_info *info, pgoff_t index);
150
+static int shmem_swapin_page(struct inode *inode, pgoff_t index,
151
+ struct page **pagep, enum sgp_type sgp,
152
+ gfp_t gfp, struct vm_area_struct *vma,
153
+ vm_fault_t *fault_type);
124154 static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
125155 struct page **pagep, enum sgp_type sgp,
126156 gfp_t gfp, struct vm_area_struct *vma,
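The struct shmem_options added here gathers the parsed tmpfs mount options, and its "seen" bitmask records which options the user actually passed so a later remount only overrides those. A minimal userspace sketch of that bookkeeping follows; the hand-rolled parser and the bytes-to-pages division are assumptions for illustration, only the SHMEM_SEEN_* names and the field names come from the diff (the real code parses via fs_parser):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define SHMEM_SEEN_BLOCKS 1
#define SHMEM_SEEN_INODES 2
#define SHMEM_SEEN_HUGE   4
#define SHMEM_SEEN_INUMS  8

struct shmem_options {
	unsigned long long blocks;
	unsigned long long inodes;
	int full_inums;
	int seen;
};

static void parse_one(struct shmem_options *ctx, char *opt)
{
	if (!strncmp(opt, "size=", 5)) {
		/* stand-in for bytes -> PAGE_SIZE blocks (assumed 4K pages) */
		ctx->blocks = strtoull(opt + 5, NULL, 0) / 4096;
		ctx->seen |= SHMEM_SEEN_BLOCKS;
	} else if (!strncmp(opt, "nr_inodes=", 10)) {
		ctx->inodes = strtoull(opt + 10, NULL, 0);
		ctx->seen |= SHMEM_SEEN_INODES;
	} else if (!strcmp(opt, "inode64")) {
		ctx->full_inums = 1;
		ctx->seen |= SHMEM_SEEN_INUMS;
	}
	/* unrecognised options are ignored in this sketch */
}

int main(void)
{
	char opts[] = "size=1048576,inode64";	/* nr_inodes deliberately absent */
	struct shmem_options ctx = { 0 };

	for (char *o = strtok(opts, ","); o; o = strtok(NULL, ","))
		parse_one(&ctx, o);

	/* seen shows BLOCKS and INUMS set, INODES untouched by a remount */
	printf("seen=%#x blocks=%llu full_inums=%d\n",
	       ctx.seen, ctx.blocks, ctx.full_inums);
	return 0;
}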
....@@ -239,18 +269,79 @@
239269 static LIST_HEAD(shmem_swaplist);
240270 static DEFINE_MUTEX(shmem_swaplist_mutex);
241271
242
-static int shmem_reserve_inode(struct super_block *sb)
272
+/*
273
+ * shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and
274
+ * produces a novel ino for the newly allocated inode.
275
+ *
276
+ * It may also be called when making a hard link to permit the space needed by
277
+ * each dentry. However, in that case, no new inode number is needed since that
278
+ * internally draws from another pool of inode numbers (currently global
279
+ * get_next_ino()). This case is indicated by passing NULL as inop.
280
+ */
281
+#define SHMEM_INO_BATCH 1024
282
+static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
243283 {
244284 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
245
- if (sbinfo->max_inodes) {
246
- spin_lock(&sbinfo->stat_lock);
247
- if (!sbinfo->free_inodes) {
248
- spin_unlock(&sbinfo->stat_lock);
249
- return -ENOSPC;
285
+ ino_t ino;
286
+
287
+ if (!(sb->s_flags & SB_KERNMOUNT)) {
288
+ raw_spin_lock(&sbinfo->stat_lock);
289
+ if (sbinfo->max_inodes) {
290
+ if (!sbinfo->free_inodes) {
291
+ raw_spin_unlock(&sbinfo->stat_lock);
292
+ return -ENOSPC;
293
+ }
294
+ sbinfo->free_inodes--;
250295 }
251
- sbinfo->free_inodes--;
252
- spin_unlock(&sbinfo->stat_lock);
296
+ if (inop) {
297
+ ino = sbinfo->next_ino++;
298
+ if (unlikely(is_zero_ino(ino)))
299
+ ino = sbinfo->next_ino++;
300
+ if (unlikely(!sbinfo->full_inums &&
301
+ ino > UINT_MAX)) {
302
+ /*
303
+ * Emulate get_next_ino uint wraparound for
304
+ * compatibility
305
+ */
306
+ if (IS_ENABLED(CONFIG_64BIT))
307
+ pr_warn("%s: inode number overflow on device %d, consider using inode64 mount option\n",
308
+ __func__, MINOR(sb->s_dev));
309
+ sbinfo->next_ino = 1;
310
+ ino = sbinfo->next_ino++;
311
+ }
312
+ *inop = ino;
313
+ }
314
+ raw_spin_unlock(&sbinfo->stat_lock);
315
+ } else if (inop) {
316
+ /*
317
+ * __shmem_file_setup, one of our callers, is lock-free: it
318
+ * doesn't hold stat_lock in shmem_reserve_inode since
319
+ * max_inodes is always 0, and is called from potentially
320
+ * unknown contexts. As such, use a per-cpu batched allocator
321
+ * which doesn't require the per-sb stat_lock unless we are at
322
+ * the batch boundary.
323
+ *
324
+ * We don't need to worry about inode{32,64} since SB_KERNMOUNT
325
+ * shmem mounts are not exposed to userspace, so we don't need
326
+ * to worry about things like glibc compatibility.
327
+ */
328
+ ino_t *next_ino;
329
+
330
+ next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu());
331
+ ino = *next_ino;
332
+ if (unlikely(ino % SHMEM_INO_BATCH == 0)) {
333
+ raw_spin_lock(&sbinfo->stat_lock);
334
+ ino = sbinfo->next_ino;
335
+ sbinfo->next_ino += SHMEM_INO_BATCH;
336
+ raw_spin_unlock(&sbinfo->stat_lock);
337
+ if (unlikely(is_zero_ino(ino)))
338
+ ino++;
339
+ }
340
+ *inop = ino;
341
+ *next_ino = ++ino;
342
+ put_cpu();
253343 }
344
+
254345 return 0;
255346 }
256347
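shmem_reserve_inode() now hands out inode numbers itself instead of relying on the global get_next_ino(), and for SB_KERNMOUNT mounts it allocates from a per-cpu batch so stat_lock is only taken once every SHMEM_INO_BATCH inodes. A small userspace model of that batching, with __thread standing in for the per-cpu slot and a pthread mutex for stat_lock (a sketch of the idea, not the kernel code; build with cc -pthread):

#include <pthread.h>
#include <stdio.h>

#define SHMEM_INO_BATCH 1024

static unsigned long next_ino = 1;		/* like sbinfo->next_ino */
static pthread_mutex_t stat_lock = PTHREAD_MUTEX_INITIALIZER;
static __thread unsigned long cpu_next_ino;	/* stands in for the per-cpu slot */

static unsigned long reserve_ino(void)
{
	unsigned long ino = cpu_next_ino;

	if (ino % SHMEM_INO_BATCH == 0) {	/* batch exhausted (or first use) */
		pthread_mutex_lock(&stat_lock);
		ino = next_ino;
		next_ino += SHMEM_INO_BATCH;
		pthread_mutex_unlock(&stat_lock);
		if (ino == 0)			/* is_zero_ino(): 0 is reserved */
			ino++;
	}
	cpu_next_ino = ino + 1;
	return ino;
}

static void *worker(void *name)
{
	for (int i = 0; i < 3; i++)
		printf("%s: ino %lu\n", (const char *)name, reserve_ino());
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	/* each thread draws from its own batch, so their numbers don't interleave */
	pthread_create(&a, NULL, worker, "thread A");
	pthread_create(&b, NULL, worker, "thread B");
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}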
....@@ -258,9 +349,9 @@
258349 {
259350 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
260351 if (sbinfo->max_inodes) {
261
- spin_lock(&sbinfo->stat_lock);
352
+ raw_spin_lock(&sbinfo->stat_lock);
262353 sbinfo->free_inodes++;
263
- spin_unlock(&sbinfo->stat_lock);
354
+ raw_spin_unlock(&sbinfo->stat_lock);
264355 }
265356 }
266357
....@@ -326,24 +417,20 @@
326417 }
327418
328419 /*
329
- * Replace item expected in radix tree by a new item, while holding tree lock.
420
+ * Replace item expected in xarray by a new item, while holding xa_lock.
330421 */
331
-static int shmem_radix_tree_replace(struct address_space *mapping,
422
+static int shmem_replace_entry(struct address_space *mapping,
332423 pgoff_t index, void *expected, void *replacement)
333424 {
334
- struct radix_tree_node *node;
335
- void __rcu **pslot;
425
+ XA_STATE(xas, &mapping->i_pages, index);
336426 void *item;
337427
338428 VM_BUG_ON(!expected);
339429 VM_BUG_ON(!replacement);
340
- item = __radix_tree_lookup(&mapping->i_pages, index, &node, &pslot);
341
- if (!item)
342
- return -ENOENT;
430
+ item = xas_load(&xas);
343431 if (item != expected)
344432 return -ENOENT;
345
- __radix_tree_replace(&mapping->i_pages, node, pslot,
346
- replacement, NULL);
433
+ xas_store(&xas, replacement);
347434 return 0;
348435 }
349436
....@@ -357,12 +444,7 @@
357444 static bool shmem_confirm_swap(struct address_space *mapping,
358445 pgoff_t index, swp_entry_t swap)
359446 {
360
- void *item;
361
-
362
- rcu_read_lock();
363
- item = radix_tree_lookup(&mapping->i_pages, index);
364
- rcu_read_unlock();
365
- return item == swp_to_radix_entry(swap);
447
+ return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap);
366448 }
367449
368450 /*
....@@ -397,12 +479,12 @@
397479 #define SHMEM_HUGE_DENY (-1)
398480 #define SHMEM_HUGE_FORCE (-2)
399481
400
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
482
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
401483 /* ifdef here to avoid bloating shmem.o when not necessary */
402484
403485 static int shmem_huge __read_mostly;
404486
405
-#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
487
+#if defined(CONFIG_SYSFS)
406488 static int shmem_parse_huge(const char *str)
407489 {
408490 if (!strcmp(str, "never"))
....@@ -419,7 +501,9 @@
419501 return SHMEM_HUGE_FORCE;
420502 return -EINVAL;
421503 }
504
+#endif
422505
506
+#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
423507 static const char *shmem_format_huge(int huge)
424508 {
425509 switch (huge) {
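shmem_parse_huge() itself is untouched here; the hunk only narrows its #ifdef to CONFIG_SYSFS, presumably because the mount option now goes through the fs_parser tables. For reference, a standalone userspace replica of the mapping it implements between the huge= policy strings and the SHMEM_HUGE_* constants; the NEVER..ADVISE values (0..3) are not visible in this hunk and are assumed:

#include <stdio.h>
#include <string.h>

#define SHMEM_HUGE_NEVER	0
#define SHMEM_HUGE_ALWAYS	1
#define SHMEM_HUGE_WITHIN_SIZE	2
#define SHMEM_HUGE_ADVISE	3
#define SHMEM_HUGE_DENY		(-1)
#define SHMEM_HUGE_FORCE	(-2)

static int shmem_parse_huge(const char *str)
{
	if (!strcmp(str, "never"))
		return SHMEM_HUGE_NEVER;
	if (!strcmp(str, "always"))
		return SHMEM_HUGE_ALWAYS;
	if (!strcmp(str, "within_size"))
		return SHMEM_HUGE_WITHIN_SIZE;
	if (!strcmp(str, "advise"))
		return SHMEM_HUGE_ADVISE;
	if (!strcmp(str, "deny"))
		return SHMEM_HUGE_DENY;
	if (!strcmp(str, "force"))
		return SHMEM_HUGE_FORCE;
	return -22;	/* -EINVAL */
}

int main(int argc, char **argv)
{
	const char *arg = argc > 1 ? argv[1] : "within_size";

	printf("huge=%s -> %d\n", arg, shmem_parse_huge(arg));
	return 0;
}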
....@@ -570,7 +654,7 @@
570654 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
571655 return READ_ONCE(sbinfo->shrinklist_len);
572656 }
573
-#else /* !CONFIG_TRANSPARENT_HUGE_PAGECACHE */
657
+#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
574658
575659 #define shmem_huge SHMEM_HUGE_DENY
576660
....@@ -579,11 +663,11 @@
579663 {
580664 return 0;
581665 }
582
-#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */
666
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
583667
584668 static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo)
585669 {
586
- if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
670
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
587671 (shmem_huge == SHMEM_HUGE_FORCE || sbinfo->huge) &&
588672 shmem_huge != SHMEM_HUGE_DENY)
589673 return true;
....@@ -595,9 +679,13 @@
595679 */
596680 static int shmem_add_to_page_cache(struct page *page,
597681 struct address_space *mapping,
598
- pgoff_t index, void *expected)
682
+ pgoff_t index, void *expected, gfp_t gfp,
683
+ struct mm_struct *charge_mm)
599684 {
600
- int error, nr = hpage_nr_pages(page);
685
+ XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page));
686
+ unsigned long i = 0;
687
+ unsigned long nr = compound_nr(page);
688
+ int error;
601689
602690 VM_BUG_ON_PAGE(PageTail(page), page);
603691 VM_BUG_ON_PAGE(index != round_down(index, nr), page);
....@@ -609,46 +697,53 @@
609697 page->mapping = mapping;
610698 page->index = index;
611699
612
- xa_lock_irq(&mapping->i_pages);
613
- if (PageTransHuge(page)) {
614
- void __rcu **results;
615
- pgoff_t idx;
616
- int i;
617
-
618
- error = 0;
619
- if (radix_tree_gang_lookup_slot(&mapping->i_pages,
620
- &results, &idx, index, 1) &&
621
- idx < index + HPAGE_PMD_NR) {
622
- error = -EEXIST;
623
- }
624
-
625
- if (!error) {
626
- for (i = 0; i < HPAGE_PMD_NR; i++) {
627
- error = radix_tree_insert(&mapping->i_pages,
628
- index + i, page + i);
629
- VM_BUG_ON(error);
700
+ if (!PageSwapCache(page)) {
701
+ error = mem_cgroup_charge(page, charge_mm, gfp);
702
+ if (error) {
703
+ if (PageTransHuge(page)) {
704
+ count_vm_event(THP_FILE_FALLBACK);
705
+ count_vm_event(THP_FILE_FALLBACK_CHARGE);
630706 }
631
- count_vm_event(THP_FILE_ALLOC);
707
+ goto error;
632708 }
633
- } else if (!expected) {
634
- error = radix_tree_insert(&mapping->i_pages, index, page);
635
- } else {
636
- error = shmem_radix_tree_replace(mapping, index, expected,
637
- page);
709
+ }
710
+ cgroup_throttle_swaprate(page, gfp);
711
+
712
+ do {
713
+ void *entry;
714
+ xas_lock_irq(&xas);
715
+ entry = xas_find_conflict(&xas);
716
+ if (entry != expected)
717
+ xas_set_err(&xas, -EEXIST);
718
+ xas_create_range(&xas);
719
+ if (xas_error(&xas))
720
+ goto unlock;
721
+next:
722
+ xas_store(&xas, page);
723
+ if (++i < nr) {
724
+ xas_next(&xas);
725
+ goto next;
726
+ }
727
+ if (PageTransHuge(page)) {
728
+ count_vm_event(THP_FILE_ALLOC);
729
+ __inc_node_page_state(page, NR_SHMEM_THPS);
730
+ }
731
+ mapping->nrpages += nr;
732
+ __mod_lruvec_page_state(page, NR_FILE_PAGES, nr);
733
+ __mod_lruvec_page_state(page, NR_SHMEM, nr);
734
+unlock:
735
+ xas_unlock_irq(&xas);
736
+ } while (xas_nomem(&xas, gfp));
737
+
738
+ if (xas_error(&xas)) {
739
+ error = xas_error(&xas);
740
+ goto error;
638741 }
639742
640
- if (!error) {
641
- mapping->nrpages += nr;
642
- if (PageTransHuge(page))
643
- __inc_node_page_state(page, NR_SHMEM_THPS);
644
- __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
645
- __mod_node_page_state(page_pgdat(page), NR_SHMEM, nr);
646
- xa_unlock_irq(&mapping->i_pages);
647
- } else {
648
- page->mapping = NULL;
649
- xa_unlock_irq(&mapping->i_pages);
650
- page_ref_sub(page, nr);
651
- }
743
+ return 0;
744
+error:
745
+ page->mapping = NULL;
746
+ page_ref_sub(page, nr);
652747 return error;
653748 }
654749
....@@ -663,27 +758,25 @@
663758 VM_BUG_ON_PAGE(PageCompound(page), page);
664759
665760 xa_lock_irq(&mapping->i_pages);
666
- error = shmem_radix_tree_replace(mapping, page->index, page, radswap);
761
+ error = shmem_replace_entry(mapping, page->index, page, radswap);
667762 page->mapping = NULL;
668763 mapping->nrpages--;
669
- __dec_node_page_state(page, NR_FILE_PAGES);
670
- __dec_node_page_state(page, NR_SHMEM);
764
+ __dec_lruvec_page_state(page, NR_FILE_PAGES);
765
+ __dec_lruvec_page_state(page, NR_SHMEM);
671766 xa_unlock_irq(&mapping->i_pages);
672767 put_page(page);
673768 BUG_ON(error);
674769 }
675770
676771 /*
677
- * Remove swap entry from radix tree, free the swap and its page cache.
772
+ * Remove swap entry from page cache, free the swap and its page cache.
678773 */
679774 static int shmem_free_swap(struct address_space *mapping,
680775 pgoff_t index, void *radswap)
681776 {
682777 void *old;
683778
684
- xa_lock_irq(&mapping->i_pages);
685
- old = radix_tree_delete_item(&mapping->i_pages, index, radswap);
686
- xa_unlock_irq(&mapping->i_pages);
779
+ old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0);
687780 if (old != radswap)
688781 return -ENOENT;
689782 free_swap_and_cache(radix_to_swp_entry(radswap));
....@@ -700,29 +793,19 @@
700793 unsigned long shmem_partial_swap_usage(struct address_space *mapping,
701794 pgoff_t start, pgoff_t end)
702795 {
703
- struct radix_tree_iter iter;
704
- void __rcu **slot;
796
+ XA_STATE(xas, &mapping->i_pages, start);
705797 struct page *page;
706798 unsigned long swapped = 0;
707799
708800 rcu_read_lock();
709
-
710
- radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
711
- if (iter.index >= end)
712
- break;
713
-
714
- page = radix_tree_deref_slot(slot);
715
-
716
- if (radix_tree_deref_retry(page)) {
717
- slot = radix_tree_iter_retry(&iter);
801
+ xas_for_each(&xas, page, end - 1) {
802
+ if (xas_retry(&xas, page))
718803 continue;
719
- }
720
-
721
- if (radix_tree_exceptional_entry(page))
804
+ if (xa_is_value(page))
722805 swapped++;
723806
724807 if (need_resched()) {
725
- slot = radix_tree_iter_resume(slot, &iter);
808
+ xas_pause(&xas);
726809 cond_resched_rcu();
727810 }
728811 }
....@@ -797,7 +880,33 @@
797880 }
798881
799882 /*
800
- * Remove range of pages and swap entries from radix tree, and free them.
883
+ * Check whether a hole-punch or truncation needs to split a huge page,
884
+ * returning true if no split was required, or the split has been successful.
885
+ *
886
+ * Eviction (or truncation to 0 size) should never need to split a huge page;
887
+ * but in rare cases might do so, if shmem_undo_range() failed to trylock on
888
+ * head, and then succeeded to trylock on tail.
889
+ *
890
+ * A split can only succeed when there are no additional references on the
891
+ * huge page: so the split below relies upon find_get_entries() having stopped
892
+ * when it found a subpage of the huge page, without getting further references.
893
+ */
894
+static bool shmem_punch_compound(struct page *page, pgoff_t start, pgoff_t end)
895
+{
896
+ if (!PageTransCompound(page))
897
+ return true;
898
+
899
+ /* Just proceed to delete a huge page wholly within the range punched */
900
+ if (PageHead(page) &&
901
+ page->index >= start && page->index + HPAGE_PMD_NR <= end)
902
+ return true;
903
+
904
+ /* Try to split huge page, so we can truly punch the hole or truncate */
905
+ return split_huge_page(page) >= 0;
906
+}
907
+
908
+/*
909
+ * Remove range of pages and swap entries from page cache, and free them.
801910 * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
802911 */
803912 static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
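shmem_punch_compound(), added above, decides whether a hole-punch or truncation can delete a huge page outright or must first split it. The user-visible operation it serves can be exercised from userspace with fallocate(FALLOC_FL_PUNCH_HOLE) on a tmpfs-backed file; a small demo (memfd_create needs glibc >= 2.27, error handling kept minimal):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t len = 8 * 1024 * 1024;
	int fd = memfd_create("punch-demo", 0);

	if (fd < 0 || ftruncate(fd, len) < 0)
		return 1;

	char *map = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (map == MAP_FAILED)
		return 1;
	memset(map, 'x', len);			/* populate the page cache */

	/* Punch 1MiB starting at 2MiB; the file size stays the same. */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      2 * 1024 * 1024, 1 * 1024 * 1024) < 0) {
		perror("fallocate");
		return 1;
	}

	/* the punched range reads back as zeroes */
	printf("byte at 2MiB after punch: %d\n", map[2 * 1024 * 1024]);
	return 0;
}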
....@@ -833,7 +942,7 @@
833942 if (index >= end)
834943 break;
835944
836
- if (radix_tree_exceptional_entry(page)) {
945
+ if (xa_is_value(page)) {
837946 if (unfalloc)
838947 continue;
839948 nr_swaps_freed += !shmem_free_swap(mapping,
....@@ -846,31 +955,11 @@
846955 if (!trylock_page(page))
847956 continue;
848957
849
- if (PageTransTail(page)) {
850
- /* Middle of THP: zero out the page */
851
- clear_highpage(page);
852
- unlock_page(page);
853
- continue;
854
- } else if (PageTransHuge(page)) {
855
- if (index == round_down(end, HPAGE_PMD_NR)) {
856
- /*
857
- * Range ends in the middle of THP:
858
- * zero out the page
859
- */
860
- clear_highpage(page);
861
- unlock_page(page);
862
- continue;
863
- }
864
- index += HPAGE_PMD_NR - 1;
865
- i += HPAGE_PMD_NR - 1;
866
- }
867
-
868
- if (!unfalloc || !PageUptodate(page)) {
869
- VM_BUG_ON_PAGE(PageTail(page), page);
870
- if (page_mapping(page) == mapping) {
871
- VM_BUG_ON_PAGE(PageWriteback(page), page);
958
+ if ((!unfalloc || !PageUptodate(page)) &&
959
+ page_mapping(page) == mapping) {
960
+ VM_BUG_ON_PAGE(PageWriteback(page), page);
961
+ if (shmem_punch_compound(page, start, end))
872962 truncate_inode_page(mapping, page);
873
- }
874963 }
875964 unlock_page(page);
876965 }
....@@ -930,7 +1019,7 @@
9301019 if (index >= end)
9311020 break;
9321021
933
- if (radix_tree_exceptional_entry(page)) {
1022
+ if (xa_is_value(page)) {
9341023 if (unfalloc)
9351024 continue;
9361025 if (shmem_free_swap(mapping, index, page)) {
....@@ -944,42 +1033,24 @@
9441033
9451034 lock_page(page);
9461035
947
- if (PageTransTail(page)) {
948
- /* Middle of THP: zero out the page */
949
- clear_highpage(page);
950
- unlock_page(page);
951
- /*
952
- * Partial thp truncate due 'start' in middle
953
- * of THP: don't need to look on these pages
954
- * again on !pvec.nr restart.
955
- */
956
- if (index != round_down(end, HPAGE_PMD_NR))
957
- start++;
958
- continue;
959
- } else if (PageTransHuge(page)) {
960
- if (index == round_down(end, HPAGE_PMD_NR)) {
961
- /*
962
- * Range ends in the middle of THP:
963
- * zero out the page
964
- */
965
- clear_highpage(page);
966
- unlock_page(page);
967
- continue;
968
- }
969
- index += HPAGE_PMD_NR - 1;
970
- i += HPAGE_PMD_NR - 1;
971
- }
972
-
9731036 if (!unfalloc || !PageUptodate(page)) {
974
- VM_BUG_ON_PAGE(PageTail(page), page);
975
- if (page_mapping(page) == mapping) {
976
- VM_BUG_ON_PAGE(PageWriteback(page), page);
977
- truncate_inode_page(mapping, page);
978
- } else {
1037
+ if (page_mapping(page) != mapping) {
9791038 /* Page was replaced by swap: retry */
9801039 unlock_page(page);
9811040 index--;
9821041 break;
1042
+ }
1043
+ VM_BUG_ON_PAGE(PageWriteback(page), page);
1044
+ if (shmem_punch_compound(page, start, end))
1045
+ truncate_inode_page(mapping, page);
1046
+ else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
1047
+ /* Wipe the page and don't get stuck */
1048
+ clear_highpage(page);
1049
+ flush_dcache_page(page);
1050
+ set_page_dirty(page);
1051
+ if (index <
1052
+ round_up(start, HPAGE_PMD_NR))
1053
+ start = index + 1;
9831054 }
9841055 }
9851056 unlock_page(page);
....@@ -1067,7 +1138,7 @@
10671138 * Part of the huge page can be beyond i_size: subject
10681139 * to shrink under memory pressure.
10691140 */
1070
- if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
1141
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
10711142 spin_lock(&sbinfo->shrinklist_lock);
10721143 /*
10731144 * _careful to defend against unlocked access to
....@@ -1106,9 +1177,14 @@
11061177 }
11071178 spin_unlock(&sbinfo->shrinklist_lock);
11081179 }
1109
- if (!list_empty(&info->swaplist)) {
1180
+ while (!list_empty(&info->swaplist)) {
1181
+ /* Wait while shmem_unuse() is scanning this inode... */
1182
+ wait_var_event(&info->stop_eviction,
1183
+ !atomic_read(&info->stop_eviction));
11101184 mutex_lock(&shmem_swaplist_mutex);
1111
- list_del_init(&info->swaplist);
1185
+ /* ...but beware of the race if we peeked too early */
1186
+ if (!atomic_read(&info->stop_eviction))
1187
+ list_del_init(&info->swaplist);
11121188 mutex_unlock(&shmem_swaplist_mutex);
11131189 }
11141190 }
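Eviction now synchronizes on info->stop_eviction: shmem_unuse() raises it before dropping the swaplist mutex to scan an inode, and shmem_evict_inode() waits for it to drop back to zero instead of racing with the swapoff scan. A userspace model of that handshake, using a mutex/condvar pair in place of wait_var_event()/wake_up_var() (a sketch under those assumptions, not kernel code; build with cc -pthread):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static int stop_eviction;		/* like atomic_t info->stop_eviction */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;

static void *swapoff_scan(void *arg)	/* stands in for shmem_unuse_inode() */
{
	pthread_mutex_lock(&lock);
	stop_eviction++;		/* atomic_inc(&info->stop_eviction) */
	pthread_mutex_unlock(&lock);

	usleep(100 * 1000);		/* pretend to scan the inode for swap */

	pthread_mutex_lock(&lock);
	if (--stop_eviction == 0)	/* atomic_dec_and_test() */
		pthread_cond_broadcast(&cond);	/* wake_up_var() */
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, swapoff_scan, NULL);
	usleep(10 * 1000);		/* give the scanner a head start */

	/* shmem_evict_inode(): wait while a scan still holds the inode */
	pthread_mutex_lock(&lock);
	while (stop_eviction)		/* wait_var_event(..., !atomic_read(...)) */
		pthread_cond_wait(&cond, &lock);
	pthread_mutex_unlock(&lock);

	puts("eviction may proceed");
	pthread_join(t, NULL);
	return 0;
}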
....@@ -1119,166 +1195,174 @@
11191195 clear_inode(inode);
11201196 }
11211197
1122
-static unsigned long find_swap_entry(struct radix_tree_root *root, void *item)
1198
+extern struct swap_info_struct *swap_info[];
1199
+
1200
+static int shmem_find_swap_entries(struct address_space *mapping,
1201
+ pgoff_t start, unsigned int nr_entries,
1202
+ struct page **entries, pgoff_t *indices,
1203
+ unsigned int type, bool frontswap)
11231204 {
1124
- struct radix_tree_iter iter;
1125
- void __rcu **slot;
1126
- unsigned long found = -1;
1127
- unsigned int checked = 0;
1205
+ XA_STATE(xas, &mapping->i_pages, start);
1206
+ struct page *page;
1207
+ swp_entry_t entry;
1208
+ unsigned int ret = 0;
1209
+
1210
+ if (!nr_entries)
1211
+ return 0;
11281212
11291213 rcu_read_lock();
1130
- radix_tree_for_each_slot(slot, root, &iter, 0) {
1131
- void *entry = radix_tree_deref_slot(slot);
1132
-
1133
- if (radix_tree_deref_retry(entry)) {
1134
- slot = radix_tree_iter_retry(&iter);
1214
+ xas_for_each(&xas, page, ULONG_MAX) {
1215
+ if (xas_retry(&xas, page))
11351216 continue;
1217
+
1218
+ if (!xa_is_value(page))
1219
+ continue;
1220
+
1221
+ entry = radix_to_swp_entry(page);
1222
+ if (swp_type(entry) != type)
1223
+ continue;
1224
+ if (frontswap &&
1225
+ !frontswap_test(swap_info[type], swp_offset(entry)))
1226
+ continue;
1227
+
1228
+ indices[ret] = xas.xa_index;
1229
+ entries[ret] = page;
1230
+
1231
+ if (need_resched()) {
1232
+ xas_pause(&xas);
1233
+ cond_resched_rcu();
11361234 }
1137
- if (entry == item) {
1138
- found = iter.index;
1235
+ if (++ret == nr_entries)
11391236 break;
1140
- }
1141
- checked++;
1142
- if ((checked % 4096) != 0)
1143
- continue;
1144
- slot = radix_tree_iter_resume(slot, &iter);
1145
- cond_resched_rcu();
11461237 }
1147
-
11481238 rcu_read_unlock();
1149
- return found;
1239
+
1240
+ return ret;
1241
+}
1242
+
1243
+/*
1244
+ * Move the swapped pages for an inode to page cache. Returns the count
1245
+ * of pages swapped in, or the error in case of failure.
1246
+ */
1247
+static int shmem_unuse_swap_entries(struct inode *inode, struct pagevec pvec,
1248
+ pgoff_t *indices)
1249
+{
1250
+ int i = 0;
1251
+ int ret = 0;
1252
+ int error = 0;
1253
+ struct address_space *mapping = inode->i_mapping;
1254
+
1255
+ for (i = 0; i < pvec.nr; i++) {
1256
+ struct page *page = pvec.pages[i];
1257
+
1258
+ if (!xa_is_value(page))
1259
+ continue;
1260
+ error = shmem_swapin_page(inode, indices[i],
1261
+ &page, SGP_CACHE,
1262
+ mapping_gfp_mask(mapping),
1263
+ NULL, NULL);
1264
+ if (error == 0) {
1265
+ unlock_page(page);
1266
+ put_page(page);
1267
+ ret++;
1268
+ }
1269
+ if (error == -ENOMEM)
1270
+ break;
1271
+ error = 0;
1272
+ }
1273
+ return error ? error : ret;
11501274 }
11511275
11521276 /*
11531277 * If swap found in inode, free it and move page from swapcache to filecache.
11541278 */
1155
-static int shmem_unuse_inode(struct shmem_inode_info *info,
1156
- swp_entry_t swap, struct page **pagep)
1279
+static int shmem_unuse_inode(struct inode *inode, unsigned int type,
1280
+ bool frontswap, unsigned long *fs_pages_to_unuse)
11571281 {
1158
- struct address_space *mapping = info->vfs_inode.i_mapping;
1159
- void *radswap;
1160
- pgoff_t index;
1161
- gfp_t gfp;
1162
- int error = 0;
1282
+ struct address_space *mapping = inode->i_mapping;
1283
+ pgoff_t start = 0;
1284
+ struct pagevec pvec;
1285
+ pgoff_t indices[PAGEVEC_SIZE];
1286
+ bool frontswap_partial = (frontswap && *fs_pages_to_unuse > 0);
1287
+ int ret = 0;
11631288
1164
- radswap = swp_to_radix_entry(swap);
1165
- index = find_swap_entry(&mapping->i_pages, radswap);
1166
- if (index == -1)
1167
- return -EAGAIN; /* tell shmem_unuse we found nothing */
1289
+ pagevec_init(&pvec);
1290
+ do {
1291
+ unsigned int nr_entries = PAGEVEC_SIZE;
11681292
1169
- /*
1170
- * Move _head_ to start search for next from here.
1171
- * But be careful: shmem_evict_inode checks list_empty without taking
1172
- * mutex, and there's an instant in list_move_tail when info->swaplist
1173
- * would appear empty, if it were the only one on shmem_swaplist.
1174
- */
1175
- if (shmem_swaplist.next != &info->swaplist)
1176
- list_move_tail(&shmem_swaplist, &info->swaplist);
1293
+ if (frontswap_partial && *fs_pages_to_unuse < PAGEVEC_SIZE)
1294
+ nr_entries = *fs_pages_to_unuse;
11771295
1178
- gfp = mapping_gfp_mask(mapping);
1179
- if (shmem_should_replace_page(*pagep, gfp)) {
1180
- mutex_unlock(&shmem_swaplist_mutex);
1181
- error = shmem_replace_page(pagep, gfp, info, index);
1182
- mutex_lock(&shmem_swaplist_mutex);
1183
- /*
1184
- * We needed to drop mutex to make that restrictive page
1185
- * allocation, but the inode might have been freed while we
1186
- * dropped it: although a racing shmem_evict_inode() cannot
1187
- * complete without emptying the radix_tree, our page lock
1188
- * on this swapcache page is not enough to prevent that -
1189
- * free_swap_and_cache() of our swap entry will only
1190
- * trylock_page(), removing swap from radix_tree whatever.
1191
- *
1192
- * We must not proceed to shmem_add_to_page_cache() if the
1193
- * inode has been freed, but of course we cannot rely on
1194
- * inode or mapping or info to check that. However, we can
1195
- * safely check if our swap entry is still in use (and here
1196
- * it can't have got reused for another page): if it's still
1197
- * in use, then the inode cannot have been freed yet, and we
1198
- * can safely proceed (if it's no longer in use, that tells
1199
- * nothing about the inode, but we don't need to unuse swap).
1200
- */
1201
- if (!page_swapcount(*pagep))
1202
- error = -ENOENT;
1203
- }
1204
-
1205
- /*
1206
- * We rely on shmem_swaplist_mutex, not only to protect the swaplist,
1207
- * but also to hold up shmem_evict_inode(): so inode cannot be freed
1208
- * beneath us (pagelock doesn't help until the page is in pagecache).
1209
- */
1210
- if (!error)
1211
- error = shmem_add_to_page_cache(*pagep, mapping, index,
1212
- radswap);
1213
- if (error != -ENOMEM) {
1214
- /*
1215
- * Truncation and eviction use free_swap_and_cache(), which
1216
- * only does trylock page: if we raced, best clean up here.
1217
- */
1218
- delete_from_swap_cache(*pagep);
1219
- set_page_dirty(*pagep);
1220
- if (!error) {
1221
- spin_lock_irq(&info->lock);
1222
- info->swapped--;
1223
- spin_unlock_irq(&info->lock);
1224
- swap_free(swap);
1296
+ pvec.nr = shmem_find_swap_entries(mapping, start, nr_entries,
1297
+ pvec.pages, indices,
1298
+ type, frontswap);
1299
+ if (pvec.nr == 0) {
1300
+ ret = 0;
1301
+ break;
12251302 }
1226
- }
1227
- return error;
1303
+
1304
+ ret = shmem_unuse_swap_entries(inode, pvec, indices);
1305
+ if (ret < 0)
1306
+ break;
1307
+
1308
+ if (frontswap_partial) {
1309
+ *fs_pages_to_unuse -= ret;
1310
+ if (*fs_pages_to_unuse == 0) {
1311
+ ret = FRONTSWAP_PAGES_UNUSED;
1312
+ break;
1313
+ }
1314
+ }
1315
+
1316
+ start = indices[pvec.nr - 1];
1317
+ } while (true);
1318
+
1319
+ return ret;
12281320 }
12291321
12301322 /*
1231
- * Search through swapped inodes to find and replace swap by page.
1323
+ * Read all the shared memory data that resides in the swap
1324
+ * device 'type' back into memory, so the swap device can be
1325
+ * unused.
12321326 */
1233
-int shmem_unuse(swp_entry_t swap, struct page *page)
1327
+int shmem_unuse(unsigned int type, bool frontswap,
1328
+ unsigned long *fs_pages_to_unuse)
12341329 {
1235
- struct list_head *this, *next;
1236
- struct shmem_inode_info *info;
1237
- struct mem_cgroup *memcg;
1330
+ struct shmem_inode_info *info, *next;
12381331 int error = 0;
12391332
1240
- /*
1241
- * There's a faint possibility that swap page was replaced before
1242
- * caller locked it: caller will come back later with the right page.
1243
- */
1244
- if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val))
1245
- goto out;
1246
-
1247
- /*
1248
- * Charge page using GFP_KERNEL while we can wait, before taking
1249
- * the shmem_swaplist_mutex which might hold up shmem_writepage().
1250
- * Charged back to the user (not to caller) when swap account is used.
1251
- */
1252
- error = mem_cgroup_try_charge_delay(page, current->mm, GFP_KERNEL,
1253
- &memcg, false);
1254
- if (error)
1255
- goto out;
1256
- /* No radix_tree_preload: swap entry keeps a place for page in tree */
1257
- error = -EAGAIN;
1333
+ if (list_empty(&shmem_swaplist))
1334
+ return 0;
12581335
12591336 mutex_lock(&shmem_swaplist_mutex);
1260
- list_for_each_safe(this, next, &shmem_swaplist) {
1261
- info = list_entry(this, struct shmem_inode_info, swaplist);
1262
- if (info->swapped)
1263
- error = shmem_unuse_inode(info, swap, &page);
1264
- else
1337
+ list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
1338
+ if (!info->swapped) {
12651339 list_del_init(&info->swaplist);
1340
+ continue;
1341
+ }
1342
+ /*
1343
+ * Drop the swaplist mutex while searching the inode for swap;
1344
+ * but before doing so, make sure shmem_evict_inode() will not
1345
+ * remove placeholder inode from swaplist, nor let it be freed
1346
+ * (igrab() would protect from unlink, but not from unmount).
1347
+ */
1348
+ atomic_inc(&info->stop_eviction);
1349
+ mutex_unlock(&shmem_swaplist_mutex);
1350
+
1351
+ error = shmem_unuse_inode(&info->vfs_inode, type, frontswap,
1352
+ fs_pages_to_unuse);
12661353 cond_resched();
1267
- if (error != -EAGAIN)
1354
+
1355
+ mutex_lock(&shmem_swaplist_mutex);
1356
+ next = list_next_entry(info, swaplist);
1357
+ if (!info->swapped)
1358
+ list_del_init(&info->swaplist);
1359
+ if (atomic_dec_and_test(&info->stop_eviction))
1360
+ wake_up_var(&info->stop_eviction);
1361
+ if (error)
12681362 break;
1269
- /* found nothing in this: move on to search the next */
12701363 }
12711364 mutex_unlock(&shmem_swaplist_mutex);
12721365
1273
- if (error) {
1274
- if (error != -ENOMEM)
1275
- error = 0;
1276
- mem_cgroup_cancel_charge(page, memcg, false);
1277
- } else
1278
- mem_cgroup_commit_charge(page, memcg, true, false);
1279
-out:
1280
- unlock_page(page);
1281
- put_page(page);
12821366 return error;
12831367 }
12841368
....@@ -1348,6 +1432,7 @@
13481432 SetPageUptodate(page);
13491433 }
13501434
1435
+ trace_android_vh_set_shmem_page_flag(page);
13511436 swap = get_swap_page(page);
13521437 if (!swap.val)
13531438 goto redirty;
....@@ -1362,9 +1447,11 @@
13621447 */
13631448 mutex_lock(&shmem_swaplist_mutex);
13641449 if (list_empty(&info->swaplist))
1365
- list_add_tail(&info->swaplist, &shmem_swaplist);
1450
+ list_add(&info->swaplist, &shmem_swaplist);
13661451
1367
- if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
1452
+ if (add_to_swap_cache(page, swap,
1453
+ __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN,
1454
+ NULL) == 0) {
13681455 spin_lock_irq(&info->lock);
13691456 shmem_recalc_inode(inode);
13701457 info->swapped++;
....@@ -1406,10 +1493,10 @@
14061493 {
14071494 struct mempolicy *mpol = NULL;
14081495 if (sbinfo->mpol) {
1409
- spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */
1496
+ raw_spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */
14101497 mpol = sbinfo->mpol;
14111498 mpol_get(mpol);
1412
- spin_unlock(&sbinfo->stat_lock);
1499
+ raw_spin_unlock(&sbinfo->stat_lock);
14131500 }
14141501 return mpol;
14151502 }
....@@ -1447,11 +1534,11 @@
14471534 {
14481535 struct vm_area_struct pvma;
14491536 struct page *page;
1450
- struct vm_fault vmf;
1537
+ struct vm_fault vmf = {
1538
+ .vma = &pvma,
1539
+ };
14511540
14521541 shmem_pseudo_vma_init(&pvma, info, index);
1453
- vmf.vma = &pvma;
1454
- vmf.address = 0;
14551542 page = swap_cluster_readahead(swap, gfp, &vmf);
14561543 shmem_pseudo_vma_destroy(&pvma);
14571544
....@@ -1462,23 +1549,14 @@
14621549 struct shmem_inode_info *info, pgoff_t index)
14631550 {
14641551 struct vm_area_struct pvma;
1465
- struct inode *inode = &info->vfs_inode;
1466
- struct address_space *mapping = inode->i_mapping;
1467
- pgoff_t idx, hindex;
1468
- void __rcu **results;
1552
+ struct address_space *mapping = info->vfs_inode.i_mapping;
1553
+ pgoff_t hindex;
14691554 struct page *page;
14701555
1471
- if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
1472
- return NULL;
1473
-
14741556 hindex = round_down(index, HPAGE_PMD_NR);
1475
- rcu_read_lock();
1476
- if (radix_tree_gang_lookup_slot(&mapping->i_pages, &results, &idx,
1477
- hindex, 1) && idx < hindex + HPAGE_PMD_NR) {
1478
- rcu_read_unlock();
1557
+ if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1,
1558
+ XA_PRESENT))
14791559 return NULL;
1480
- }
1481
- rcu_read_unlock();
14821560
14831561 shmem_pseudo_vma_init(&pvma, info, hindex);
14841562 page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN,
....@@ -1486,6 +1564,8 @@
14861564 shmem_pseudo_vma_destroy(&pvma);
14871565 if (page)
14881566 prep_transhuge_page(page);
1567
+ else
1568
+ count_vm_event(THP_FILE_FALLBACK);
14891569 return page;
14901570 }
14911571
....@@ -1493,7 +1573,11 @@
14931573 struct shmem_inode_info *info, pgoff_t index)
14941574 {
14951575 struct vm_area_struct pvma;
1496
- struct page *page;
1576
+ struct page *page = NULL;
1577
+
1578
+ trace_android_vh_shmem_alloc_page(&page);
1579
+ if (page)
1580
+ return page;
14971581
14981582 shmem_pseudo_vma_init(&pvma, info, index);
14991583 page = alloc_page_vma(gfp, &pvma, 0);
....@@ -1511,7 +1595,7 @@
15111595 int nr;
15121596 int err = -ENOSPC;
15131597
1514
- if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
1598
+ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
15151599 huge = false;
15161600 nr = huge ? HPAGE_PMD_NR : 1;
15171601
....@@ -1589,11 +1673,11 @@
15891673 * a nice clean interface for us to replace oldpage by newpage there.
15901674 */
15911675 xa_lock_irq(&swap_mapping->i_pages);
1592
- error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage,
1593
- newpage);
1676
+ error = shmem_replace_entry(swap_mapping, swap_index, oldpage, newpage);
15941677 if (!error) {
1595
- __inc_node_page_state(newpage, NR_FILE_PAGES);
1596
- __dec_node_page_state(oldpage, NR_FILE_PAGES);
1678
+ mem_cgroup_migrate(oldpage, newpage);
1679
+ __inc_lruvec_page_state(newpage, NR_FILE_PAGES);
1680
+ __dec_lruvec_page_state(oldpage, NR_FILE_PAGES);
15971681 }
15981682 xa_unlock_irq(&swap_mapping->i_pages);
15991683
....@@ -1605,8 +1689,7 @@
16051689 */
16061690 oldpage = newpage;
16071691 } else {
1608
- mem_cgroup_migrate(oldpage, newpage);
1609
- lru_cache_add_anon(newpage);
1692
+ lru_cache_add(newpage);
16101693 *pagep = newpage;
16111694 }
16121695
....@@ -1620,13 +1703,109 @@
16201703 }
16211704
16221705 /*
1706
+ * Swap in the page pointed to by *pagep.
1707
+ * Caller has to make sure that *pagep contains a valid swapped page.
1708
+ * Returns 0 and the page in pagep if success. On failure, returns the
1709
+ * error code and NULL in *pagep.
1710
+ */
1711
+static int shmem_swapin_page(struct inode *inode, pgoff_t index,
1712
+ struct page **pagep, enum sgp_type sgp,
1713
+ gfp_t gfp, struct vm_area_struct *vma,
1714
+ vm_fault_t *fault_type)
1715
+{
1716
+ struct address_space *mapping = inode->i_mapping;
1717
+ struct shmem_inode_info *info = SHMEM_I(inode);
1718
+ struct mm_struct *charge_mm = vma ? vma->vm_mm : current->mm;
1719
+ struct page *page;
1720
+ swp_entry_t swap;
1721
+ int error;
1722
+
1723
+ VM_BUG_ON(!*pagep || !xa_is_value(*pagep));
1724
+ swap = radix_to_swp_entry(*pagep);
1725
+ *pagep = NULL;
1726
+
1727
+ /* Look it up and read it in.. */
1728
+ page = lookup_swap_cache(swap, NULL, 0);
1729
+ if (!page) {
1730
+ /* Or update major stats only when swapin succeeds?? */
1731
+ if (fault_type) {
1732
+ *fault_type |= VM_FAULT_MAJOR;
1733
+ count_vm_event(PGMAJFAULT);
1734
+ count_memcg_event_mm(charge_mm, PGMAJFAULT);
1735
+ }
1736
+ /* Here we actually start the io */
1737
+ page = shmem_swapin(swap, gfp, info, index);
1738
+ if (!page) {
1739
+ error = -ENOMEM;
1740
+ goto failed;
1741
+ }
1742
+ }
1743
+
1744
+ /* We have to do this with page locked to prevent races */
1745
+ lock_page(page);
1746
+ if (!PageSwapCache(page) || page_private(page) != swap.val ||
1747
+ !shmem_confirm_swap(mapping, index, swap)) {
1748
+ error = -EEXIST;
1749
+ goto unlock;
1750
+ }
1751
+ if (!PageUptodate(page)) {
1752
+ error = -EIO;
1753
+ goto failed;
1754
+ }
1755
+ wait_on_page_writeback(page);
1756
+
1757
+ /*
1758
+ * Some architectures may have to restore extra metadata to the
1759
+ * physical page after reading from swap.
1760
+ */
1761
+ arch_swap_restore(swap, page);
1762
+
1763
+ if (shmem_should_replace_page(page, gfp)) {
1764
+ error = shmem_replace_page(&page, gfp, info, index);
1765
+ if (error)
1766
+ goto failed;
1767
+ }
1768
+
1769
+ error = shmem_add_to_page_cache(page, mapping, index,
1770
+ swp_to_radix_entry(swap), gfp,
1771
+ charge_mm);
1772
+ if (error)
1773
+ goto failed;
1774
+
1775
+ spin_lock_irq(&info->lock);
1776
+ info->swapped--;
1777
+ shmem_recalc_inode(inode);
1778
+ spin_unlock_irq(&info->lock);
1779
+
1780
+ if (sgp == SGP_WRITE)
1781
+ mark_page_accessed(page);
1782
+
1783
+ delete_from_swap_cache(page);
1784
+ set_page_dirty(page);
1785
+ swap_free(swap);
1786
+
1787
+ *pagep = page;
1788
+ return 0;
1789
+failed:
1790
+ if (!shmem_confirm_swap(mapping, index, swap))
1791
+ error = -EEXIST;
1792
+unlock:
1793
+ if (page) {
1794
+ unlock_page(page);
1795
+ put_page(page);
1796
+ }
1797
+
1798
+ return error;
1799
+}
1800
+
1801
+/*
16231802 * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
16241803 *
16251804 * If we allocate a new one we do not mark it dirty. That's up to the
16261805 * vm. If we swap it in we mark it dirty since we also free the swap
16271806 * entry since a page cannot live in both the swap and page cache.
16281807 *
1629
- * fault_mm and fault_type are only supplied by shmem_fault:
1808
+ * vma, vmf, and fault_type are only supplied by shmem_fault:
16301809 * otherwise they are NULL.
16311810 */
16321811 static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
....@@ -1638,9 +1817,7 @@
16381817 struct shmem_inode_info *info = SHMEM_I(inode);
16391818 struct shmem_sb_info *sbinfo;
16401819 struct mm_struct *charge_mm;
1641
- struct mem_cgroup *memcg;
16421820 struct page *page;
1643
- swp_entry_t swap;
16441821 enum sgp_type sgp_huge = sgp;
16451822 pgoff_t hindex = index;
16461823 int error;
....@@ -1652,19 +1829,37 @@
16521829 if (sgp == SGP_NOHUGE || sgp == SGP_HUGE)
16531830 sgp = SGP_CACHE;
16541831 repeat:
1655
- swap.val = 0;
1656
- page = find_lock_entry(mapping, index);
1657
- if (radix_tree_exceptional_entry(page)) {
1658
- swap = radix_to_swp_entry(page);
1659
- page = NULL;
1660
- }
1661
-
16621832 if (sgp <= SGP_CACHE &&
16631833 ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
1664
- error = -EINVAL;
1665
- goto unlock;
1834
+ return -EINVAL;
16661835 }
16671836
1837
+ sbinfo = SHMEM_SB(inode->i_sb);
1838
+ charge_mm = vma ? vma->vm_mm : current->mm;
1839
+
1840
+ page = find_lock_entry(mapping, index);
1841
+
1842
+ if (page && vma && userfaultfd_minor(vma)) {
1843
+ if (!xa_is_value(page)) {
1844
+ unlock_page(page);
1845
+ put_page(page);
1846
+ }
1847
+ *fault_type = handle_userfault(vmf, VM_UFFD_MINOR);
1848
+ return 0;
1849
+ }
1850
+
1851
+ if (xa_is_value(page)) {
1852
+ error = shmem_swapin_page(inode, index, &page,
1853
+ sgp, gfp, vma, fault_type);
1854
+ if (error == -EEXIST)
1855
+ goto repeat;
1856
+
1857
+ *pagep = page;
1858
+ return error;
1859
+ }
1860
+
1861
+ if (page)
1862
+ hindex = page->index;
16681863 if (page && sgp == SGP_WRITE)
16691864 mark_page_accessed(page);
16701865
....@@ -1675,230 +1870,141 @@
16751870 unlock_page(page);
16761871 put_page(page);
16771872 page = NULL;
1873
+ hindex = index;
16781874 }
1679
- if (page || (sgp == SGP_READ && !swap.val)) {
1680
- *pagep = page;
1681
- return 0;
1682
- }
1875
+ if (page || sgp == SGP_READ)
1876
+ goto out;
16831877
16841878 /*
16851879 * Fast cache lookup did not find it:
16861880 * bring it back from swap or allocate.
16871881 */
1688
- sbinfo = SHMEM_SB(inode->i_sb);
1689
- charge_mm = vma ? vma->vm_mm : current->mm;
16901882
1691
- if (swap.val) {
1692
- /* Look it up and read it in.. */
1693
- page = lookup_swap_cache(swap, NULL, 0);
1694
- if (!page) {
1695
- /* Or update major stats only when swapin succeeds?? */
1696
- if (fault_type) {
1697
- *fault_type |= VM_FAULT_MAJOR;
1698
- count_vm_event(PGMAJFAULT);
1699
- count_memcg_event_mm(charge_mm, PGMAJFAULT);
1700
- }
1701
- /* Here we actually start the io */
1702
- page = shmem_swapin(swap, gfp, info, index);
1703
- if (!page) {
1704
- error = -ENOMEM;
1705
- goto failed;
1706
- }
1707
- }
1883
+ if (vma && userfaultfd_missing(vma)) {
1884
+ *fault_type = handle_userfault(vmf, VM_UFFD_MISSING);
1885
+ return 0;
1886
+ }
17081887
1709
- /* We have to do this with page locked to prevent races */
1710
- lock_page(page);
1711
- if (!PageSwapCache(page) || page_private(page) != swap.val ||
1712
- !shmem_confirm_swap(mapping, index, swap)) {
1713
- error = -EEXIST; /* try again */
1714
- goto unlock;
1715
- }
1716
- if (!PageUptodate(page)) {
1717
- error = -EIO;
1718
- goto failed;
1719
- }
1720
- wait_on_page_writeback(page);
1888
+ /* shmem_symlink() */
1889
+ if (mapping->a_ops != &shmem_aops)
1890
+ goto alloc_nohuge;
1891
+ if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE)
1892
+ goto alloc_nohuge;
1893
+ if (shmem_huge == SHMEM_HUGE_FORCE)
1894
+ goto alloc_huge;
1895
+ switch (sbinfo->huge) {
1896
+ case SHMEM_HUGE_NEVER:
1897
+ goto alloc_nohuge;
1898
+ case SHMEM_HUGE_WITHIN_SIZE: {
1899
+ loff_t i_size;
1900
+ pgoff_t off;
17211901
1722
- if (shmem_should_replace_page(page, gfp)) {
1723
- error = shmem_replace_page(&page, gfp, info, index);
1724
- if (error)
1725
- goto failed;
1726
- }
1727
-
1728
- error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
1729
- false);
1730
- if (!error) {
1731
- error = shmem_add_to_page_cache(page, mapping, index,
1732
- swp_to_radix_entry(swap));
1733
- /*
1734
- * We already confirmed swap under page lock, and make
1735
- * no memory allocation here, so usually no possibility
1736
- * of error; but free_swap_and_cache() only trylocks a
1737
- * page, so it is just possible that the entry has been
1738
- * truncated or holepunched since swap was confirmed.
1739
- * shmem_undo_range() will have done some of the
1740
- * unaccounting, now delete_from_swap_cache() will do
1741
- * the rest.
1742
- * Reset swap.val? No, leave it so "failed" goes back to
1743
- * "repeat": reading a hole and writing should succeed.
1744
- */
1745
- if (error) {
1746
- mem_cgroup_cancel_charge(page, memcg, false);
1747
- delete_from_swap_cache(page);
1748
- }
1749
- }
1750
- if (error)
1751
- goto failed;
1752
-
1753
- mem_cgroup_commit_charge(page, memcg, true, false);
1754
-
1755
- spin_lock_irq(&info->lock);
1756
- info->swapped--;
1757
- shmem_recalc_inode(inode);
1758
- spin_unlock_irq(&info->lock);
1759
-
1760
- if (sgp == SGP_WRITE)
1761
- mark_page_accessed(page);
1762
-
1763
- delete_from_swap_cache(page);
1764
- set_page_dirty(page);
1765
- swap_free(swap);
1766
-
1767
- } else {
1768
- if (vma && userfaultfd_missing(vma)) {
1769
- *fault_type = handle_userfault(vmf, VM_UFFD_MISSING);
1770
- return 0;
1771
- }
1772
-
1773
- /* shmem_symlink() */
1774
- if (mapping->a_ops != &shmem_aops)
1775
- goto alloc_nohuge;
1776
- if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE)
1777
- goto alloc_nohuge;
1778
- if (shmem_huge == SHMEM_HUGE_FORCE)
1902
+ off = round_up(index, HPAGE_PMD_NR);
1903
+ i_size = round_up(i_size_read(inode), PAGE_SIZE);
1904
+ if (i_size >= HPAGE_PMD_SIZE &&
1905
+ i_size >> PAGE_SHIFT >= off)
17791906 goto alloc_huge;
1780
- switch (sbinfo->huge) {
1781
- loff_t i_size;
1782
- pgoff_t off;
1783
- case SHMEM_HUGE_NEVER:
1784
- goto alloc_nohuge;
1785
- case SHMEM_HUGE_WITHIN_SIZE:
1786
- off = round_up(index, HPAGE_PMD_NR);
1787
- i_size = round_up(i_size_read(inode), PAGE_SIZE);
1788
- if (i_size >= HPAGE_PMD_SIZE &&
1789
- i_size >> PAGE_SHIFT >= off)
1790
- goto alloc_huge;
1791
- /* fallthrough */
1792
- case SHMEM_HUGE_ADVISE:
1793
- if (sgp_huge == SGP_HUGE)
1794
- goto alloc_huge;
1795
- /* TODO: implement fadvise() hints */
1796
- goto alloc_nohuge;
1797
- }
1907
+
1908
+ fallthrough;
1909
+ }
1910
+ case SHMEM_HUGE_ADVISE:
1911
+ if (sgp_huge == SGP_HUGE)
1912
+ goto alloc_huge;
1913
+ /* TODO: implement fadvise() hints */
1914
+ goto alloc_nohuge;
1915
+ }
17981916
17991917 alloc_huge:
1800
- page = shmem_alloc_and_acct_page(gfp, inode, index, true);
1801
- if (IS_ERR(page)) {
1802
-alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, inode,
1803
- index, false);
1804
- }
1805
- if (IS_ERR(page)) {
1806
- int retry = 5;
1807
- error = PTR_ERR(page);
1808
- page = NULL;
1809
- if (error != -ENOSPC)
1810
- goto failed;
1811
- /*
1812
- * Try to reclaim some spece by splitting a huge page
1813
- * beyond i_size on the filesystem.
1814
- */
1815
- while (retry--) {
1816
- int ret;
1817
- ret = shmem_unused_huge_shrink(sbinfo, NULL, 1);
1818
- if (ret == SHRINK_STOP)
1819
- break;
1820
- if (ret)
1821
- goto alloc_nohuge;
1822
- }
1823
- goto failed;
1824
- }
1918
+ page = shmem_alloc_and_acct_page(gfp, inode, index, true);
1919
+ if (IS_ERR(page)) {
1920
+alloc_nohuge:
1921
+ page = shmem_alloc_and_acct_page(gfp, inode,
1922
+ index, false);
1923
+ }
1924
+ if (IS_ERR(page)) {
1925
+ int retry = 5;
18251926
1826
- if (PageTransHuge(page))
1827
- hindex = round_down(index, HPAGE_PMD_NR);
1828
- else
1829
- hindex = index;
1830
-
1831
- if (sgp == SGP_WRITE)
1832
- __SetPageReferenced(page);
1833
-
1834
- error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
1835
- PageTransHuge(page));
1836
- if (error)
1837
- goto unacct;
1838
- error = radix_tree_maybe_preload_order(gfp & GFP_RECLAIM_MASK,
1839
- compound_order(page));
1840
- if (!error) {
1841
- error = shmem_add_to_page_cache(page, mapping, hindex,
1842
- NULL);
1843
- radix_tree_preload_end();
1844
- }
1845
- if (error) {
1846
- mem_cgroup_cancel_charge(page, memcg,
1847
- PageTransHuge(page));
1848
- goto unacct;
1849
- }
1850
- mem_cgroup_commit_charge(page, memcg, false,
1851
- PageTransHuge(page));
1852
- lru_cache_add_anon(page);
1853
-
1854
- spin_lock_irq(&info->lock);
1855
- info->alloced += 1 << compound_order(page);
1856
- inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page);
1857
- shmem_recalc_inode(inode);
1858
- spin_unlock_irq(&info->lock);
1859
- alloced = true;
1860
-
1861
- if (PageTransHuge(page) &&
1862
- DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
1863
- hindex + HPAGE_PMD_NR - 1) {
1864
- /*
1865
- * Part of the huge page is beyond i_size: subject
1866
- * to shrink under memory pressure.
1867
- */
1868
- spin_lock(&sbinfo->shrinklist_lock);
1869
- /*
1870
- * _careful to defend against unlocked access to
1871
- * ->shrink_list in shmem_unused_huge_shrink()
1872
- */
1873
- if (list_empty_careful(&info->shrinklist)) {
1874
- list_add_tail(&info->shrinklist,
1875
- &sbinfo->shrinklist);
1876
- sbinfo->shrinklist_len++;
1877
- }
1878
- spin_unlock(&sbinfo->shrinklist_lock);
1879
- }
1880
-
1927
+ error = PTR_ERR(page);
1928
+ page = NULL;
1929
+ if (error != -ENOSPC)
1930
+ goto unlock;
18811931 /*
1882
- * Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
1932
+ * Try to reclaim some space by splitting a huge page
1933
+ * beyond i_size on the filesystem.
18831934 */
1884
- if (sgp == SGP_FALLOC)
1885
- sgp = SGP_WRITE;
1935
+ while (retry--) {
1936
+ int ret;
1937
+
1938
+ ret = shmem_unused_huge_shrink(sbinfo, NULL, 1);
1939
+ if (ret == SHRINK_STOP)
1940
+ break;
1941
+ if (ret)
1942
+ goto alloc_nohuge;
1943
+ }
1944
+ goto unlock;
1945
+ }
1946
+
1947
+ if (PageTransHuge(page))
1948
+ hindex = round_down(index, HPAGE_PMD_NR);
1949
+ else
1950
+ hindex = index;
1951
+
1952
+ if (sgp == SGP_WRITE)
1953
+ __SetPageReferenced(page);
1954
+
1955
+ error = shmem_add_to_page_cache(page, mapping, hindex,
1956
+ NULL, gfp & GFP_RECLAIM_MASK,
1957
+ charge_mm);
1958
+ if (error)
1959
+ goto unacct;
1960
+ lru_cache_add(page);
1961
+
1962
+ spin_lock_irq(&info->lock);
1963
+ info->alloced += compound_nr(page);
1964
+ inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page);
1965
+ shmem_recalc_inode(inode);
1966
+ spin_unlock_irq(&info->lock);
1967
+ alloced = true;
1968
+
1969
+ if (PageTransHuge(page) &&
1970
+ DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
1971
+ hindex + HPAGE_PMD_NR - 1) {
1972
+ /*
1973
+ * Part of the huge page is beyond i_size: subject
1974
+ * to shrink under memory pressure.
1975
+ */
1976
+ spin_lock(&sbinfo->shrinklist_lock);
1977
+ /*
1978
+ * _careful to defend against unlocked access to
1979
+ * ->shrink_list in shmem_unused_huge_shrink()
1980
+ */
1981
+ if (list_empty_careful(&info->shrinklist)) {
1982
+ list_add_tail(&info->shrinklist,
1983
+ &sbinfo->shrinklist);
1984
+ sbinfo->shrinklist_len++;
1985
+ }
1986
+ spin_unlock(&sbinfo->shrinklist_lock);
1987
+ }
1988
+
1989
+ /*
1990
+ * Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
1991
+ */
1992
+ if (sgp == SGP_FALLOC)
1993
+ sgp = SGP_WRITE;
18861994 clear:
1887
- /*
1888
- * Let SGP_WRITE caller clear ends if write does not fill page;
1889
- * but SGP_FALLOC on a page fallocated earlier must initialize
1890
- * it now, lest undo on failure cancel our earlier guarantee.
1891
- */
1892
- if (sgp != SGP_WRITE && !PageUptodate(page)) {
1893
- struct page *head = compound_head(page);
1894
- int i;
1995
+ /*
1996
+ * Let SGP_WRITE caller clear ends if write does not fill page;
1997
+ * but SGP_FALLOC on a page fallocated earlier must initialize
1998
+ * it now, lest undo on failure cancel our earlier guarantee.
1999
+ */
2000
+ if (sgp != SGP_WRITE && !PageUptodate(page)) {
2001
+ int i;
18952002
1896
- for (i = 0; i < (1 << compound_order(head)); i++) {
1897
- clear_highpage(head + i);
1898
- flush_dcache_page(head + i);
1899
- }
1900
- SetPageUptodate(head);
2003
+ for (i = 0; i < compound_nr(page); i++) {
2004
+ clear_highpage(page + i);
2005
+ flush_dcache_page(page + i);
19012006 }
2007
+ SetPageUptodate(page);
19022008 }
19032009
19042010 /* Perhaps the file has been truncated since we checked */
....@@ -1914,6 +2020,7 @@
19142020 error = -EINVAL;
19152021 goto unlock;
19162022 }
2023
+out:
19172024 *pagep = page + index - hindex;
19182025 return 0;
19192026
....@@ -1921,16 +2028,13 @@
19212028 * Error recovery.
19222029 */
19232030 unacct:
1924
- shmem_inode_unacct_blocks(inode, 1 << compound_order(page));
2031
+ shmem_inode_unacct_blocks(inode, compound_nr(page));
19252032
19262033 if (PageTransHuge(page)) {
19272034 unlock_page(page);
19282035 put_page(page);
19292036 goto alloc_nohuge;
19302037 }
1931
-failed:
1932
- if (swap.val && !shmem_confirm_swap(mapping, index, swap))
1933
- error = -EEXIST;
19342038 unlock:
19352039 if (page) {
19362040 unlock_page(page);
....@@ -1942,7 +2046,7 @@
19422046 spin_unlock_irq(&info->lock);
19432047 goto repeat;
19442048 }
1945
- if (error == -EEXIST) /* from above or from radix_tree_insert */
2049
+ if (error == -EEXIST)
19462050 goto repeat;
19472051 return error;
19482052 }
....@@ -1994,16 +2098,14 @@
19942098 shmem_falloc->waitq &&
19952099 vmf->pgoff >= shmem_falloc->start &&
19962100 vmf->pgoff < shmem_falloc->next) {
2101
+ struct file *fpin;
19972102 wait_queue_head_t *shmem_falloc_waitq;
19982103 DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);
19992104
20002105 ret = VM_FAULT_NOPAGE;
2001
- if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) &&
2002
- !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
2003
- /* It's polite to up mmap_sem if we can */
2004
- up_read(&vma->vm_mm->mmap_sem);
2106
+ fpin = maybe_unlock_mmap_for_io(vmf, NULL);
2107
+ if (fpin)
20052108 ret = VM_FAULT_RETRY;
2006
- }
20072109
20082110 shmem_falloc_waitq = shmem_falloc->waitq;
20092111 prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
....@@ -2021,6 +2123,9 @@
20212123 spin_lock(&inode->i_lock);
20222124 finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
20232125 spin_unlock(&inode->i_lock);
2126
+
2127
+ if (fpin)
2128
+ fput(fpin);
20242129 return ret;
20252130 }
20262131 spin_unlock(&inode->i_lock);
....@@ -2059,7 +2164,7 @@
20592164 get_area = current->mm->get_unmapped_area;
20602165 addr = get_area(file, uaddr, len, pgoff, flags);
20612166
2062
- if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
2167
+ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
20632168 return addr;
20642169 if (IS_ERR_VALUE(addr))
20652170 return addr;
....@@ -2179,26 +2284,18 @@
21792284 static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
21802285 {
21812286 struct shmem_inode_info *info = SHMEM_I(file_inode(file));
2287
+ int ret;
21822288
2183
- if (info->seals & F_SEAL_FUTURE_WRITE) {
2184
- /*
2185
- * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
2186
- * "future write" seal active.
2187
- */
2188
- if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
2189
- return -EPERM;
2289
+ ret = seal_check_future_write(info->seals, vma);
2290
+ if (ret)
2291
+ return ret;
21902292
2191
- /*
2192
- * Since the F_SEAL_FUTURE_WRITE seals allow for a MAP_SHARED
2193
- * read-only mapping, take care to not allow mprotect to revert
2194
- * protections.
2195
- */
2196
- vma->vm_flags &= ~(VM_MAYWRITE);
2197
- }
2293
+ /* arm64 - allow memory tagging on RAM-based files */
2294
+ vma->vm_flags |= VM_MTE_ALLOWED;
21982295
21992296 file_accessed(file);
22002297 vma->vm_ops = &shmem_vm_ops;
2201
- if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
2298
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
22022299 ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
22032300 (vma->vm_end & HPAGE_PMD_MASK)) {
22042301 khugepaged_enter(vma, vma->vm_flags);
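The open-coded F_SEAL_FUTURE_WRITE check is replaced by the shared seal_check_future_write() helper, but the contract stays the same: once the seal is set, a new writable shared mapping fails with EPERM while a read-only one still succeeds. That behaviour can be checked from userspace (assumes Linux >= 5.1 and glibc >= 2.27; F_SEAL_FUTURE_WRITE is defined locally in case the libc headers predate it):

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef F_SEAL_FUTURE_WRITE
#define F_SEAL_FUTURE_WRITE 0x0010	/* uapi value, if the libc lacks it */
#endif

int main(void)
{
	int fd = memfd_create("seal-demo", MFD_ALLOW_SEALING);

	if (fd < 0 || ftruncate(fd, 4096) < 0)
		return 1;
	if (fcntl(fd, F_ADD_SEALS, F_SEAL_FUTURE_WRITE) < 0) {
		perror("F_ADD_SEALS");
		return 1;
	}

	/* expected to fail with EPERM once the seal is active */
	void *rw = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	printf("PROT_WRITE mmap: %s\n",
	       rw == MAP_FAILED ? strerror(errno) : "unexpectedly succeeded");

	/* a read-only shared mapping is still allowed */
	void *ro = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);
	printf("PROT_READ  mmap: %s\n",
	       ro == MAP_FAILED ? strerror(errno) : "ok");
	return 0;
}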
....@@ -2212,13 +2309,14 @@
22122309 struct inode *inode;
22132310 struct shmem_inode_info *info;
22142311 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
2312
+ ino_t ino;
22152313
2216
- if (shmem_reserve_inode(sb))
2314
+ if (shmem_reserve_inode(sb, &ino))
22172315 return NULL;
22182316
22192317 inode = new_inode(sb);
22202318 if (inode) {
2221
- inode->i_ino = get_next_ino();
2319
+ inode->i_ino = ino;
22222320 inode_init_owner(inode, dir, mode);
22232321 inode->i_blocks = 0;
22242322 inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
....@@ -2226,6 +2324,7 @@
22262324 info = SHMEM_I(inode);
22272325 memset(info, 0, (char *)inode - (char *)info);
22282326 spin_lock_init(&info->lock);
2327
+ atomic_set(&info->stop_eviction, 0);
22292328 info->seals = F_SEAL_SEAL;
22302329 info->flags = flags & VM_NORESERVE;
22312330 INIT_LIST_HEAD(&info->shrinklist);
....@@ -2272,28 +2371,25 @@
22722371 return mapping->a_ops == &shmem_aops;
22732372 }
22742373
2275
-static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
2276
- pmd_t *dst_pmd,
2277
- struct vm_area_struct *dst_vma,
2278
- unsigned long dst_addr,
2279
- unsigned long src_addr,
2280
- bool zeropage,
2281
- struct page **pagep)
2374
+#ifdef CONFIG_USERFAULTFD
2375
+int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
2376
+ pmd_t *dst_pmd,
2377
+ struct vm_area_struct *dst_vma,
2378
+ unsigned long dst_addr,
2379
+ unsigned long src_addr,
2380
+ bool zeropage,
2381
+ struct page **pagep)
22822382 {
22832383 struct inode *inode = file_inode(dst_vma->vm_file);
22842384 struct shmem_inode_info *info = SHMEM_I(inode);
22852385 struct address_space *mapping = inode->i_mapping;
22862386 gfp_t gfp = mapping_gfp_mask(mapping);
22872387 pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
2288
- struct mem_cgroup *memcg;
2289
- spinlock_t *ptl;
22902388 void *page_kaddr;
22912389 struct page *page;
2292
- pte_t _dst_pte, *dst_pte;
22932390 int ret;
2294
- pgoff_t offset, max_off;
2391
+ pgoff_t max_off;
22952392
2296
- ret = -ENOMEM;
22972393 if (!shmem_inode_acct_block(inode, 1)) {
22982394 /*
22992395 * We may have got a page, returned -ENOENT triggering a retry,
....@@ -2304,29 +2400,30 @@
23042400 put_page(*pagep);
23052401 *pagep = NULL;
23062402 }
2307
- goto out;
2403
+ return -ENOMEM;
23082404 }
23092405
23102406 if (!*pagep) {
2407
+ ret = -ENOMEM;
23112408 page = shmem_alloc_page(gfp, info, pgoff);
23122409 if (!page)
23132410 goto out_unacct_blocks;
23142411
2315
- if (!zeropage) { /* mcopy_atomic */
2412
+ if (!zeropage) { /* COPY */
23162413 page_kaddr = kmap_atomic(page);
23172414 ret = copy_from_user(page_kaddr,
23182415 (const void __user *)src_addr,
23192416 PAGE_SIZE);
23202417 kunmap_atomic(page_kaddr);
23212418
2322
- /* fallback to copy_from_user outside mmap_sem */
2419
+ /* fallback to copy_from_user outside mmap_lock */
23232420 if (unlikely(ret)) {
23242421 *pagep = page;
2325
- shmem_inode_unacct_blocks(inode, 1);
2422
+ ret = -ENOENT;
23262423 /* don't free the page */
2327
- return -ENOENT;
2424
+ goto out_unacct_blocks;
23282425 }
2329
- } else { /* mfill_zeropage_atomic */
2426
+ } else { /* ZEROPAGE */
23302427 clear_highpage(page);
23312428 }
23322429 } else {
....@@ -2334,57 +2431,26 @@
23342431 *pagep = NULL;
23352432 }
23362433
2337
- VM_BUG_ON(PageLocked(page) || PageSwapBacked(page));
2434
+ VM_BUG_ON(PageLocked(page));
2435
+ VM_BUG_ON(PageSwapBacked(page));
23382436 __SetPageLocked(page);
23392437 __SetPageSwapBacked(page);
23402438 __SetPageUptodate(page);
23412439
23422440 ret = -EFAULT;
2343
- offset = linear_page_index(dst_vma, dst_addr);
23442441 max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
2345
- if (unlikely(offset >= max_off))
2442
+ if (unlikely(pgoff >= max_off))
23462443 goto out_release;
23472444
2348
- ret = mem_cgroup_try_charge_delay(page, dst_mm, gfp, &memcg, false);
2445
+ ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL,
2446
+ gfp & GFP_RECLAIM_MASK, dst_mm);
23492447 if (ret)
23502448 goto out_release;
23512449
2352
- ret = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK);
2353
- if (!ret) {
2354
- ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL);
2355
- radix_tree_preload_end();
2356
- }
2450
+ ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
2451
+ page, true, false);
23572452 if (ret)
2358
- goto out_release_uncharge;
2359
-
2360
- mem_cgroup_commit_charge(page, memcg, false, false);
2361
-
2362
- _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
2363
- if (dst_vma->vm_flags & VM_WRITE)
2364
- _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));
2365
- else {
2366
- /*
2367
- * We don't set the pte dirty if the vma has no
2368
- * VM_WRITE permission, so mark the page dirty or it
2369
- * could be freed from under us. We could do it
2370
- * unconditionally before unlock_page(), but doing it
2371
- * only if VM_WRITE is not set is faster.
2372
- */
2373
- set_page_dirty(page);
2374
- }
2375
-
2376
- dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
2377
-
2378
- ret = -EFAULT;
2379
- max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
2380
- if (unlikely(offset >= max_off))
2381
- goto out_release_uncharge_unlock;
2382
-
2383
- ret = -EEXIST;
2384
- if (!pte_none(*dst_pte))
2385
- goto out_release_uncharge_unlock;
2386
-
2387
- lru_cache_add_anon(page);
2453
+ goto out_delete_from_cache;
23882454
23892455 spin_lock_irq(&info->lock);
23902456 info->alloced++;
....@@ -2392,52 +2458,19 @@
23922458 shmem_recalc_inode(inode);
23932459 spin_unlock_irq(&info->lock);
23942460
2395
- inc_mm_counter(dst_mm, mm_counter_file(page));
2396
- page_add_file_rmap(page, false);
2397
- set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
2398
-
2399
- /* No need to invalidate - it was non-present before */
2400
- update_mmu_cache(dst_vma, dst_addr, dst_pte);
2401
- pte_unmap_unlock(dst_pte, ptl);
2461
+ SetPageDirty(page);
24022462 unlock_page(page);
2403
- ret = 0;
2404
-out:
2405
- return ret;
2406
-out_release_uncharge_unlock:
2407
- pte_unmap_unlock(dst_pte, ptl);
2408
- ClearPageDirty(page);
2463
+ return 0;
2464
+out_delete_from_cache:
24092465 delete_from_page_cache(page);
2410
-out_release_uncharge:
2411
- mem_cgroup_cancel_charge(page, memcg, false);
24122466 out_release:
24132467 unlock_page(page);
24142468 put_page(page);
24152469 out_unacct_blocks:
24162470 shmem_inode_unacct_blocks(inode, 1);
2417
- goto out;
2471
+ return ret;
24182472 }
2419
-
2420
-int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
2421
- pmd_t *dst_pmd,
2422
- struct vm_area_struct *dst_vma,
2423
- unsigned long dst_addr,
2424
- unsigned long src_addr,
2425
- struct page **pagep)
2426
-{
2427
- return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
2428
- dst_addr, src_addr, false, pagep);
2429
-}
2430
-
2431
-int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm,
2432
- pmd_t *dst_pmd,
2433
- struct vm_area_struct *dst_vma,
2434
- unsigned long dst_addr)
2435
-{
2436
- struct page *page = NULL;
2437
-
2438
- return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
2439
- dst_addr, 0, true, &page);
2440
-}
2473
+#endif /* CONFIG_USERFAULTFD */
24412474
24422475 #ifdef CONFIG_TMPFS
24432476 static const struct inode_operations shmem_symlink_inode_operations;
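
The rewritten shmem_mfill_atomic_pte() above is reached from the userfaultfd UFFDIO_COPY/UFFDIO_ZEROPAGE ioctls on a shmem-backed VMA. For orientation only, here is a minimal userspace sketch of that call path. It is not part of this patch: it assumes userfaultfd and memfd_create() are available, handles the fault inline instead of in a monitor thread, and omits all error checking.

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);

	/* shmem object with one not-yet-populated page, mapped shared */
	int mfd = memfd_create("uffd-demo", 0);
	ftruncate(mfd, psz);
	char *dst = mmap(NULL, psz, PROT_READ | PROT_WRITE, MAP_SHARED, mfd, 0);

	/* register the range for missing-page faults */
	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
	struct uffdio_api api = { .api = UFFD_API };
	ioctl(uffd, UFFDIO_API, &api);
	struct uffdio_register reg = {
		.range = { .start = (unsigned long)dst, .len = psz },
		.mode  = UFFDIO_REGISTER_MODE_MISSING,
	};
	ioctl(uffd, UFFDIO_REGISTER, &reg);

	/* source buffer whose contents will resolve the missing page */
	char *src = mmap(NULL, psz, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	memset(src, 0xaa, psz);

	/* normally issued after reading a fault event from uffd */
	struct uffdio_copy copy = {
		.dst = (unsigned long)dst,
		.src = (unsigned long)src,
		.len = psz,
	};
	ioctl(uffd, UFFDIO_COPY, &copy);	/* -> shmem_mfill_atomic_pte() */
	return 0;
}

In a real monitor the UFFDIO_COPY ioctl is issued after reading a struct uffd_msg from the uffd descriptor; either way it is that ioctl which ends up allocating, copying into and inserting the shmem page through the function above.
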
....@@ -2617,7 +2650,7 @@
26172650 }
26182651
26192652 /*
2620
- * llseek SEEK_DATA or SEEK_HOLE through the radix_tree.
2653
+ * llseek SEEK_DATA or SEEK_HOLE through the page cache.
26212654 */
26222655 static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
26232656 pgoff_t index, pgoff_t end, int whence)
....@@ -2647,7 +2680,7 @@
26472680 index = indices[i];
26482681 }
26492682 page = pvec.pages[i];
2650
- if (page && !radix_tree_exceptional_entry(page)) {
2683
+ if (page && !xa_is_value(page)) {
26512684 if (!PageUptodate(page))
26522685 page = NULL;
26532686 }
....@@ -2943,7 +2976,7 @@
29432976 * first link must skip that, to get the accounting right.
29442977 */
29452978 if (inode->i_nlink) {
2946
- ret = shmem_reserve_inode(inode->i_sb);
2979
+ ret = shmem_reserve_inode(inode->i_sb, NULL);
29472980 if (ret)
29482981 goto out;
29492982 }
....@@ -3095,12 +3128,9 @@
30953128
30963129 error = security_inode_init_security(inode, dir, &dentry->d_name,
30973130 shmem_initxattrs, NULL);
3098
- if (error) {
3099
- if (error != -EOPNOTSUPP) {
3100
- iput(inode);
3101
- return error;
3102
- }
3103
- error = 0;
3131
+ if (error && error != -EOPNOTSUPP) {
3132
+ iput(inode);
3133
+ return error;
31043134 }
31053135
31063136 inode->i_size = len-1;
....@@ -3192,7 +3222,7 @@
31923222 new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len,
31933223 GFP_KERNEL);
31943224 if (!new_xattr->name) {
3195
- kfree(new_xattr);
3225
+ kvfree(new_xattr);
31963226 return -ENOMEM;
31973227 }
31983228
....@@ -3209,7 +3239,8 @@
32093239
32103240 static int shmem_xattr_handler_get(const struct xattr_handler *handler,
32113241 struct dentry *unused, struct inode *inode,
3212
- const char *name, void *buffer, size_t size)
3242
+ const char *name, void *buffer, size_t size,
3243
+ int flags)
32133244 {
32143245 struct shmem_inode_info *info = SHMEM_I(inode);
32153246
....@@ -3225,7 +3256,7 @@
32253256 struct shmem_inode_info *info = SHMEM_I(inode);
32263257
32273258 name = xattr_full_name(handler, name);
3228
- return simple_xattr_set(&info->xattrs, name, value, size, flags);
3259
+ return simple_xattr_set(&info->xattrs, name, value, size, flags, NULL);
32293260 }
32303261
32313262 static const struct xattr_handler shmem_security_xattr_handler = {
....@@ -3352,16 +3383,142 @@
33523383 .fh_to_dentry = shmem_fh_to_dentry,
33533384 };
33543385
3355
-static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
3356
- bool remount)
3386
+enum shmem_param {
3387
+ Opt_gid,
3388
+ Opt_huge,
3389
+ Opt_mode,
3390
+ Opt_mpol,
3391
+ Opt_nr_blocks,
3392
+ Opt_nr_inodes,
3393
+ Opt_size,
3394
+ Opt_uid,
3395
+ Opt_inode32,
3396
+ Opt_inode64,
3397
+};
3398
+
3399
+static const struct constant_table shmem_param_enums_huge[] = {
3400
+ {"never", SHMEM_HUGE_NEVER },
3401
+ {"always", SHMEM_HUGE_ALWAYS },
3402
+ {"within_size", SHMEM_HUGE_WITHIN_SIZE },
3403
+ {"advise", SHMEM_HUGE_ADVISE },
3404
+ {}
3405
+};
3406
+
3407
+const struct fs_parameter_spec shmem_fs_parameters[] = {
3408
+ fsparam_u32 ("gid", Opt_gid),
3409
+ fsparam_enum ("huge", Opt_huge, shmem_param_enums_huge),
3410
+ fsparam_u32oct("mode", Opt_mode),
3411
+ fsparam_string("mpol", Opt_mpol),
3412
+ fsparam_string("nr_blocks", Opt_nr_blocks),
3413
+ fsparam_string("nr_inodes", Opt_nr_inodes),
3414
+ fsparam_string("size", Opt_size),
3415
+ fsparam_u32 ("uid", Opt_uid),
3416
+ fsparam_flag ("inode32", Opt_inode32),
3417
+ fsparam_flag ("inode64", Opt_inode64),
3418
+ {}
3419
+};
3420
+
3421
+static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
33573422 {
3358
- char *this_char, *value, *rest;
3359
- struct mempolicy *mpol = NULL;
3360
- uid_t uid;
3361
- gid_t gid;
3423
+ struct shmem_options *ctx = fc->fs_private;
3424
+ struct fs_parse_result result;
3425
+ unsigned long long size;
3426
+ char *rest;
3427
+ int opt;
3428
+
3429
+ opt = fs_parse(fc, shmem_fs_parameters, param, &result);
3430
+ if (opt < 0)
3431
+ return opt;
3432
+
3433
+ switch (opt) {
3434
+ case Opt_size:
3435
+ size = memparse(param->string, &rest);
3436
+ if (*rest == '%') {
3437
+ size <<= PAGE_SHIFT;
3438
+ size *= totalram_pages();
3439
+ do_div(size, 100);
3440
+ rest++;
3441
+ }
3442
+ if (*rest)
3443
+ goto bad_value;
3444
+ ctx->blocks = DIV_ROUND_UP(size, PAGE_SIZE);
3445
+ ctx->seen |= SHMEM_SEEN_BLOCKS;
3446
+ break;
3447
+ case Opt_nr_blocks:
3448
+ ctx->blocks = memparse(param->string, &rest);
3449
+ if (*rest)
3450
+ goto bad_value;
3451
+ ctx->seen |= SHMEM_SEEN_BLOCKS;
3452
+ break;
3453
+ case Opt_nr_inodes:
3454
+ ctx->inodes = memparse(param->string, &rest);
3455
+ if (*rest)
3456
+ goto bad_value;
3457
+ ctx->seen |= SHMEM_SEEN_INODES;
3458
+ break;
3459
+ case Opt_mode:
3460
+ ctx->mode = result.uint_32 & 07777;
3461
+ break;
3462
+ case Opt_uid:
3463
+ ctx->uid = make_kuid(current_user_ns(), result.uint_32);
3464
+ if (!uid_valid(ctx->uid))
3465
+ goto bad_value;
3466
+ break;
3467
+ case Opt_gid:
3468
+ ctx->gid = make_kgid(current_user_ns(), result.uint_32);
3469
+ if (!gid_valid(ctx->gid))
3470
+ goto bad_value;
3471
+ break;
3472
+ case Opt_huge:
3473
+ ctx->huge = result.uint_32;
3474
+ if (ctx->huge != SHMEM_HUGE_NEVER &&
3475
+ !(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
3476
+ has_transparent_hugepage()))
3477
+ goto unsupported_parameter;
3478
+ ctx->seen |= SHMEM_SEEN_HUGE;
3479
+ break;
3480
+ case Opt_mpol:
3481
+ if (IS_ENABLED(CONFIG_NUMA)) {
3482
+ mpol_put(ctx->mpol);
3483
+ ctx->mpol = NULL;
3484
+ if (mpol_parse_str(param->string, &ctx->mpol))
3485
+ goto bad_value;
3486
+ break;
3487
+ }
3488
+ goto unsupported_parameter;
3489
+ case Opt_inode32:
3490
+ ctx->full_inums = false;
3491
+ ctx->seen |= SHMEM_SEEN_INUMS;
3492
+ break;
3493
+ case Opt_inode64:
3494
+ if (sizeof(ino_t) < 8) {
3495
+ return invalfc(fc,
3496
+ "Cannot use inode64 with <64bit inums in kernel\n");
3497
+ }
3498
+ ctx->full_inums = true;
3499
+ ctx->seen |= SHMEM_SEEN_INUMS;
3500
+ break;
3501
+ }
3502
+ return 0;
3503
+
3504
+unsupported_parameter:
3505
+ return invalfc(fc, "Unsupported parameter '%s'", param->key);
3506
+bad_value:
3507
+ return invalfc(fc, "Bad value for '%s'", param->key);
3508
+}
3509
+
3510
+static int shmem_parse_options(struct fs_context *fc, void *data)
3511
+{
3512
+ char *options = data;
3513
+
3514
+ if (options) {
3515
+ int err = security_sb_eat_lsm_opts(options, &fc->security);
3516
+ if (err)
3517
+ return err;
3518
+ }
33623519
33633520 while (options != NULL) {
3364
- this_char = options;
3521
+ char *this_char = options;
33653522 for (;;) {
33663523 /*
33673524 * NUL-terminate this option: unfortunately,
....@@ -3377,139 +3534,93 @@
33773534 break;
33783535 }
33793536 }
3380
- if (!*this_char)
3381
- continue;
3382
- if ((value = strchr(this_char,'=')) != NULL) {
3383
- *value++ = 0;
3384
- } else {
3385
- pr_err("tmpfs: No value for mount option '%s'\n",
3386
- this_char);
3387
- goto error;
3388
- }
3537
+ if (*this_char) {
3538
+ char *value = strchr(this_char,'=');
3539
+ size_t len = 0;
3540
+ int err;
33893541
3390
- if (!strcmp(this_char,"size")) {
3391
- unsigned long long size;
3392
- size = memparse(value,&rest);
3393
- if (*rest == '%') {
3394
- size <<= PAGE_SHIFT;
3395
- size *= totalram_pages;
3396
- do_div(size, 100);
3397
- rest++;
3542
+ if (value) {
3543
+ *value++ = '\0';
3544
+ len = strlen(value);
33983545 }
3399
- if (*rest)
3400
- goto bad_val;
3401
- sbinfo->max_blocks =
3402
- DIV_ROUND_UP(size, PAGE_SIZE);
3403
- } else if (!strcmp(this_char,"nr_blocks")) {
3404
- sbinfo->max_blocks = memparse(value, &rest);
3405
- if (*rest)
3406
- goto bad_val;
3407
- } else if (!strcmp(this_char,"nr_inodes")) {
3408
- sbinfo->max_inodes = memparse(value, &rest);
3409
- if (*rest)
3410
- goto bad_val;
3411
- } else if (!strcmp(this_char,"mode")) {
3412
- if (remount)
3413
- continue;
3414
- sbinfo->mode = simple_strtoul(value, &rest, 8) & 07777;
3415
- if (*rest)
3416
- goto bad_val;
3417
- } else if (!strcmp(this_char,"uid")) {
3418
- if (remount)
3419
- continue;
3420
- uid = simple_strtoul(value, &rest, 0);
3421
- if (*rest)
3422
- goto bad_val;
3423
- sbinfo->uid = make_kuid(current_user_ns(), uid);
3424
- if (!uid_valid(sbinfo->uid))
3425
- goto bad_val;
3426
- } else if (!strcmp(this_char,"gid")) {
3427
- if (remount)
3428
- continue;
3429
- gid = simple_strtoul(value, &rest, 0);
3430
- if (*rest)
3431
- goto bad_val;
3432
- sbinfo->gid = make_kgid(current_user_ns(), gid);
3433
- if (!gid_valid(sbinfo->gid))
3434
- goto bad_val;
3435
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
3436
- } else if (!strcmp(this_char, "huge")) {
3437
- int huge;
3438
- huge = shmem_parse_huge(value);
3439
- if (huge < 0)
3440
- goto bad_val;
3441
- if (!has_transparent_hugepage() &&
3442
- huge != SHMEM_HUGE_NEVER)
3443
- goto bad_val;
3444
- sbinfo->huge = huge;
3445
-#endif
3446
-#ifdef CONFIG_NUMA
3447
- } else if (!strcmp(this_char,"mpol")) {
3448
- mpol_put(mpol);
3449
- mpol = NULL;
3450
- if (mpol_parse_str(value, &mpol))
3451
- goto bad_val;
3452
-#endif
3453
- } else {
3454
- pr_err("tmpfs: Bad mount option %s\n", this_char);
3455
- goto error;
3546
+ err = vfs_parse_fs_string(fc, this_char, value, len);
3547
+ if (err < 0)
3548
+ return err;
34563549 }
34573550 }
3458
- sbinfo->mpol = mpol;
34593551 return 0;
3460
-
3461
-bad_val:
3462
- pr_err("tmpfs: Bad value '%s' for mount option '%s'\n",
3463
- value, this_char);
3464
-error:
3465
- mpol_put(mpol);
3466
- return 1;
3467
-
34683552 }
34693553
3470
-static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
3554
+/*
3555
+ * Reconfigure a shmem filesystem.
3556
+ *
3557
+ * Note that we disallow change from limited->unlimited blocks/inodes while any
3558
+ * are in use; but we must separately disallow unlimited->limited, because in
3559
+ * that case we have no record of how much is already in use.
3560
+ */
3561
+static int shmem_reconfigure(struct fs_context *fc)
34713562 {
3472
- struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
3473
- struct shmem_sb_info config = *sbinfo;
3563
+ struct shmem_options *ctx = fc->fs_private;
3564
+ struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb);
34743565 unsigned long inodes;
3475
- int error = -EINVAL;
3566
+ struct mempolicy *mpol = NULL;
3567
+ const char *err;
34763568
3477
- config.mpol = NULL;
3478
- if (shmem_parse_options(data, &config, true))
3479
- return error;
3480
-
3481
- spin_lock(&sbinfo->stat_lock);
3569
+ raw_spin_lock(&sbinfo->stat_lock);
34823570 inodes = sbinfo->max_inodes - sbinfo->free_inodes;
3483
- if (percpu_counter_compare(&sbinfo->used_blocks, config.max_blocks) > 0)
3484
- goto out;
3485
- if (config.max_inodes < inodes)
3486
- goto out;
3487
- /*
3488
- * Those tests disallow limited->unlimited while any are in use;
3489
- * but we must separately disallow unlimited->limited, because
3490
- * in that case we have no record of how much is already in use.
3491
- */
3492
- if (config.max_blocks && !sbinfo->max_blocks)
3493
- goto out;
3494
- if (config.max_inodes && !sbinfo->max_inodes)
3495
- goto out;
3571
+ if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) {
3572
+ if (!sbinfo->max_blocks) {
3573
+ err = "Cannot retroactively limit size";
3574
+ goto out;
3575
+ }
3576
+ if (percpu_counter_compare(&sbinfo->used_blocks,
3577
+ ctx->blocks) > 0) {
3578
+ err = "Too small a size for current use";
3579
+ goto out;
3580
+ }
3581
+ }
3582
+ if ((ctx->seen & SHMEM_SEEN_INODES) && ctx->inodes) {
3583
+ if (!sbinfo->max_inodes) {
3584
+ err = "Cannot retroactively limit inodes";
3585
+ goto out;
3586
+ }
3587
+ if (ctx->inodes < inodes) {
3588
+ err = "Too few inodes for current use";
3589
+ goto out;
3590
+ }
3591
+ }
34963592
3497
- error = 0;
3498
- sbinfo->huge = config.huge;
3499
- sbinfo->max_blocks = config.max_blocks;
3500
- sbinfo->max_inodes = config.max_inodes;
3501
- sbinfo->free_inodes = config.max_inodes - inodes;
3593
+ if ((ctx->seen & SHMEM_SEEN_INUMS) && !ctx->full_inums &&
3594
+ sbinfo->next_ino > UINT_MAX) {
3595
+ err = "Current inum too high to switch to 32-bit inums";
3596
+ goto out;
3597
+ }
3598
+
3599
+ if (ctx->seen & SHMEM_SEEN_HUGE)
3600
+ sbinfo->huge = ctx->huge;
3601
+ if (ctx->seen & SHMEM_SEEN_INUMS)
3602
+ sbinfo->full_inums = ctx->full_inums;
3603
+ if (ctx->seen & SHMEM_SEEN_BLOCKS)
3604
+ sbinfo->max_blocks = ctx->blocks;
3605
+ if (ctx->seen & SHMEM_SEEN_INODES) {
3606
+ sbinfo->max_inodes = ctx->inodes;
3607
+ sbinfo->free_inodes = ctx->inodes - inodes;
3608
+ }
35023609
35033610 /*
35043611 * Preserve previous mempolicy unless mpol remount option was specified.
35053612 */
3506
- if (config.mpol) {
3507
- mpol_put(sbinfo->mpol);
3508
- sbinfo->mpol = config.mpol; /* transfers initial ref */
3613
+ if (ctx->mpol) {
3614
+ mpol = sbinfo->mpol;
3615
+ sbinfo->mpol = ctx->mpol; /* transfers initial ref */
3616
+ ctx->mpol = NULL;
35093617 }
3618
+ raw_spin_unlock(&sbinfo->stat_lock);
3619
+ mpol_put(mpol);
3620
+ return 0;
35103621 out:
3511
- spin_unlock(&sbinfo->stat_lock);
3512
- return error;
3622
+ raw_spin_unlock(&sbinfo->stat_lock);
3623
+ return invalfc(fc, "%s", err);
35133624 }
35143625
35153626 static int shmem_show_options(struct seq_file *seq, struct dentry *root)
....@@ -3529,7 +3640,30 @@
35293640 if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
35303641 seq_printf(seq, ",gid=%u",
35313642 from_kgid_munged(&init_user_ns, sbinfo->gid));
3532
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
3643
+
3644
+ /*
3645
+ * Showing inode{64,32} might be useful even if it's the system default,
3646
+ * since then people don't have to resort to checking both here and
3647
+ * /proc/config.gz to confirm 64-bit inums were successfully applied
3648
+ * (which may not even exist if IKCONFIG_PROC isn't enabled).
3649
+ *
3650
+ * We hide it when inode64 isn't the default and we are using 32-bit
3651
+ * inodes, since that probably just means the feature isn't even under
3652
+ * consideration.
3653
+ *
3654
+ * As such:
3655
+ *
3656
+ *                    +-----------------+-----------------+
3657
+ *                    | TMPFS_INODE64=y | TMPFS_INODE64=n |
3658
+ * +------------------+-----------------+-----------------+
3659
+ * | full_inums=true  | show            | show            |
3660
+ * | full_inums=false | show            | hide            |
3661
+ * +------------------+-----------------+-----------------+
+ *                    +-----------------+-----------------+
3657
+ *                    | TMPFS_INODE64=y | TMPFS_INODE64=n |
3658
+ * +------------------+-----------------+-----------------+
3659
+ * | full_inums=true  | show            | show            |
3660
+ * | full_inums=false | show            | hide            |
3661
+ * +------------------+-----------------+-----------------+
3662
+ *
3663
+ */
3664
+ if (IS_ENABLED(CONFIG_TMPFS_INODE64) || sbinfo->full_inums)
3665
+ seq_printf(seq, ",inode%d", (sbinfo->full_inums ? 64 : 32));
3666
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
35333667 /* Rightly or wrongly, show huge mount option unmasked by shmem_huge */
35343668 if (sbinfo->huge)
35353669 seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge));
....@@ -3544,14 +3678,16 @@
35443678 {
35453679 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
35463680
3681
+ free_percpu(sbinfo->ino_batch);
35473682 percpu_counter_destroy(&sbinfo->used_blocks);
35483683 mpol_put(sbinfo->mpol);
35493684 kfree(sbinfo);
35503685 sb->s_fs_info = NULL;
35513686 }
35523687
3553
-int shmem_fill_super(struct super_block *sb, void *data, int silent)
3688
+static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
35543689 {
3690
+ struct shmem_options *ctx = fc->fs_private;
35553691 struct inode *inode;
35563692 struct shmem_sb_info *sbinfo;
35573693 int err = -ENOMEM;
....@@ -3562,9 +3698,6 @@
35623698 if (!sbinfo)
35633699 return -ENOMEM;
35643700
3565
- sbinfo->mode = 0777 | S_ISVTX;
3566
- sbinfo->uid = current_fsuid();
3567
- sbinfo->gid = current_fsgid();
35683701 sb->s_fs_info = sbinfo;
35693702
35703703 #ifdef CONFIG_TMPFS
....@@ -3574,12 +3707,12 @@
35743707 * but the internal instance is left unlimited.
35753708 */
35763709 if (!(sb->s_flags & SB_KERNMOUNT)) {
3577
- sbinfo->max_blocks = shmem_default_max_blocks();
3578
- sbinfo->max_inodes = shmem_default_max_inodes();
3579
- if (shmem_parse_options(data, sbinfo, false)) {
3580
- err = -EINVAL;
3581
- goto failed;
3582
- }
3710
+ if (!(ctx->seen & SHMEM_SEEN_BLOCKS))
3711
+ ctx->blocks = shmem_default_max_blocks();
3712
+ if (!(ctx->seen & SHMEM_SEEN_INODES))
3713
+ ctx->inodes = shmem_default_max_inodes();
3714
+ if (!(ctx->seen & SHMEM_SEEN_INUMS))
3715
+ ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64);
35833716 } else {
35843717 sb->s_flags |= SB_NOUSER;
35853718 }
....@@ -3588,11 +3721,24 @@
35883721 #else
35893722 sb->s_flags |= SB_NOUSER;
35903723 #endif
3724
+ sbinfo->max_blocks = ctx->blocks;
3725
+ sbinfo->free_inodes = sbinfo->max_inodes = ctx->inodes;
3726
+ if (sb->s_flags & SB_KERNMOUNT) {
3727
+ sbinfo->ino_batch = alloc_percpu(ino_t);
3728
+ if (!sbinfo->ino_batch)
3729
+ goto failed;
3730
+ }
3731
+ sbinfo->uid = ctx->uid;
3732
+ sbinfo->gid = ctx->gid;
3733
+ sbinfo->full_inums = ctx->full_inums;
3734
+ sbinfo->mode = ctx->mode;
3735
+ sbinfo->huge = ctx->huge;
3736
+ sbinfo->mpol = ctx->mpol;
3737
+ ctx->mpol = NULL;
35913738
3592
- spin_lock_init(&sbinfo->stat_lock);
3739
+ raw_spin_lock_init(&sbinfo->stat_lock);
35933740 if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL))
35943741 goto failed;
3595
- sbinfo->free_inodes = sbinfo->max_inodes;
35963742 spin_lock_init(&sbinfo->shrinklist_lock);
35973743 INIT_LIST_HEAD(&sbinfo->shrinklist);
35983744
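
The alloc_percpu(ino_t) above gives the internal SB_KERNMOUNT instance a per-CPU cache of inode numbers, so the shared counter only needs stat_lock once per batch (see shmem_reserve_inode()). As a rough, standalone illustration of that batching pattern only, here is a userspace analogue with a hypothetical get_ino() helper; it is not the kernel code.

#include <pthread.h>
#include <stdint.h>

#define INO_BATCH 1024				/* mirrors SHMEM_INO_BATCH */

static pthread_mutex_t stat_lock = PTHREAD_MUTEX_INITIALIZER;
static uint64_t shared_next_ino = 2;		/* shared pool, under stat_lock */

static __thread uint64_t cached_ino;		/* stands in for the per-CPU slot */
static __thread unsigned int cached_left;

/* hand out unique ids, taking the shared lock only once per batch */
static uint64_t get_ino(void)
{
	if (!cached_left) {
		pthread_mutex_lock(&stat_lock);
		cached_ino = shared_next_ino;
		shared_next_ino += INO_BATCH;
		pthread_mutex_unlock(&stat_lock);
		cached_left = INO_BATCH;
	}
	cached_left--;
	return cached_ino++;
}

The point of the pattern is simply that contention on stat_lock is paid once per 1024 inodes instead of once per inode.
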
....@@ -3625,6 +3771,31 @@
36253771 return err;
36263772 }
36273773
3774
+static int shmem_get_tree(struct fs_context *fc)
3775
+{
3776
+ return get_tree_nodev(fc, shmem_fill_super);
3777
+}
3778
+
3779
+static void shmem_free_fc(struct fs_context *fc)
3780
+{
3781
+ struct shmem_options *ctx = fc->fs_private;
3782
+
3783
+ if (ctx) {
3784
+ mpol_put(ctx->mpol);
3785
+ kfree(ctx);
3786
+ }
3787
+}
3788
+
3789
+static const struct fs_context_operations shmem_fs_context_ops = {
3790
+ .free = shmem_free_fc,
3791
+ .get_tree = shmem_get_tree,
3792
+#ifdef CONFIG_TMPFS
3793
+ .parse_monolithic = shmem_parse_options,
3794
+ .parse_param = shmem_parse_one,
3795
+ .reconfigure = shmem_reconfigure,
3796
+#endif
3797
+};
3798
+
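
With .reconfigure wired up here, a remount reaches shmem_reconfigure() whether it comes from mount -o remount or from the new mount API. A small sketch of the latter, with the same caveats as the fsopen example above (not part of this patch, raw syscalls, recent uapi headers, no error handling):

#include <fcntl.h>
#include <linux/mount.h>
#include <sys/syscall.h>
#include <unistd.h>

/* change the inode limit of an already-mounted tmpfs at "path" */
static int tmpfs_set_nr_inodes(const char *path, const char *nr_inodes)
{
	int fd = syscall(__NR_fspick, AT_FDCWD, path, FSPICK_CLOEXEC);
	int ret;

	syscall(__NR_fsconfig, fd, FSCONFIG_SET_STRING, "nr_inodes", nr_inodes, 0);

	/* ends up in shmem_reconfigure(); rejected with "Too few inodes for
	   current use" if the new limit is below what is already allocated */
	ret = syscall(__NR_fsconfig, fd, FSCONFIG_CMD_RECONFIGURE, NULL, NULL, 0);

	close(fd);
	return ret;
}
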
36283799 static struct kmem_cache *shmem_inode_cachep;
36293800
36303801 static struct inode *shmem_alloc_inode(struct super_block *sb)
....@@ -3636,9 +3807,8 @@
36363807 return &info->vfs_inode;
36373808 }
36383809
3639
-static void shmem_destroy_callback(struct rcu_head *head)
3810
+static void shmem_free_in_core_inode(struct inode *inode)
36403811 {
3641
- struct inode *inode = container_of(head, struct inode, i_rcu);
36423812 if (S_ISLNK(inode->i_mode))
36433813 kfree(inode->i_link);
36443814 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
....@@ -3648,7 +3818,6 @@
36483818 {
36493819 if (S_ISREG(inode->i_mode))
36503820 mpol_free_shared_policy(&SHMEM_I(inode)->policy);
3651
- call_rcu(&inode->i_rcu, shmem_destroy_callback);
36523821 }
36533822
36543823 static void shmem_init_inode(void *foo)
....@@ -3739,16 +3908,16 @@
37393908
37403909 static const struct super_operations shmem_ops = {
37413910 .alloc_inode = shmem_alloc_inode,
3911
+ .free_inode = shmem_free_in_core_inode,
37423912 .destroy_inode = shmem_destroy_inode,
37433913 #ifdef CONFIG_TMPFS
37443914 .statfs = shmem_statfs,
3745
- .remount_fs = shmem_remount_fs,
37463915 .show_options = shmem_show_options,
37473916 #endif
37483917 .evict_inode = shmem_evict_inode,
37493918 .drop_inode = generic_delete_inode,
37503919 .put_super = shmem_put_super,
3751
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
3920
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
37523921 .nr_cached_objects = shmem_unused_huge_count,
37533922 .free_cached_objects = shmem_unused_huge_scan,
37543923 #endif
....@@ -3761,29 +3930,42 @@
37613930 .set_policy = shmem_set_policy,
37623931 .get_policy = shmem_get_policy,
37633932 #endif
3933
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
3934
+ .allow_speculation = filemap_allow_speculation,
3935
+#endif
37643936 };
37653937
3766
-static struct dentry *shmem_mount(struct file_system_type *fs_type,
3767
- int flags, const char *dev_name, void *data)
3938
+int shmem_init_fs_context(struct fs_context *fc)
37683939 {
3769
- return mount_nodev(fs_type, flags, data, shmem_fill_super);
3940
+ struct shmem_options *ctx;
3941
+
3942
+ ctx = kzalloc(sizeof(struct shmem_options), GFP_KERNEL);
3943
+ if (!ctx)
3944
+ return -ENOMEM;
3945
+
3946
+ ctx->mode = 0777 | S_ISVTX;
3947
+ ctx->uid = current_fsuid();
3948
+ ctx->gid = current_fsgid();
3949
+
3950
+ fc->fs_private = ctx;
3951
+ fc->ops = &shmem_fs_context_ops;
3952
+ return 0;
37703953 }
37713954
37723955 static struct file_system_type shmem_fs_type = {
37733956 .owner = THIS_MODULE,
37743957 .name = "tmpfs",
3775
- .mount = shmem_mount,
3958
+ .init_fs_context = shmem_init_fs_context,
3959
+#ifdef CONFIG_TMPFS
3960
+ .parameters = shmem_fs_parameters,
3961
+#endif
37763962 .kill_sb = kill_litter_super,
3777
- .fs_flags = FS_USERNS_MOUNT,
3963
+ .fs_flags = FS_USERNS_MOUNT | FS_THP_SUPPORT,
37783964 };
37793965
37803966 int __init shmem_init(void)
37813967 {
37823968 int error;
3783
-
3784
- /* If rootfs called this, don't re-init */
3785
- if (shmem_inode_cachep)
3786
- return 0;
37873969
37883970 shmem_init_inodecache();
37893971
....@@ -3800,7 +3982,7 @@
38003982 goto out1;
38013983 }
38023984
3803
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
3985
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
38043986 if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY)
38053987 SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
38063988 else
....@@ -3816,11 +3998,11 @@
38163998 return error;
38173999 }
38184000
3819
-#if defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && defined(CONFIG_SYSFS)
4001
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS)
38204002 static ssize_t shmem_enabled_show(struct kobject *kobj,
38214003 struct kobj_attribute *attr, char *buf)
38224004 {
3823
- int values[] = {
4005
+ static const int values[] = {
38244006 SHMEM_HUGE_ALWAYS,
38254007 SHMEM_HUGE_WITHIN_SIZE,
38264008 SHMEM_HUGE_ADVISE,
....@@ -3868,9 +4050,9 @@
38684050
38694051 struct kobj_attribute shmem_enabled_attr =
38704052 __ATTR(shmem_enabled, 0644, shmem_enabled_show, shmem_enabled_store);
3871
-#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE && CONFIG_SYSFS */
4053
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */
38724054
3873
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
4055
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
38744056 bool shmem_huge_enabled(struct vm_area_struct *vma)
38754057 {
38764058 struct inode *inode = file_inode(vma->vm_file);
....@@ -3878,6 +4060,8 @@
38784060 loff_t i_size;
38794061 pgoff_t off;
38804062
4063
+ if (!transhuge_vma_enabled(vma, vma->vm_flags))
4064
+ return false;
38814065 if (shmem_huge == SHMEM_HUGE_FORCE)
38824066 return true;
38834067 if (shmem_huge == SHMEM_HUGE_DENY)
....@@ -3893,7 +4077,7 @@
38934077 if (i_size >= HPAGE_PMD_SIZE &&
38944078 i_size >> PAGE_SHIFT >= off)
38954079 return true;
3896
- /* fall through */
4080
+ fallthrough;
38974081 case SHMEM_HUGE_ADVISE:
38984082 /* TODO: implement fadvise() hints */
38994083 return (vma->vm_flags & VM_HUGEPAGE);
....@@ -3902,7 +4086,7 @@
39024086 return false;
39034087 }
39044088 }
3905
-#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */
4089
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
39064090
39074091 #else /* !CONFIG_SHMEM */
39084092
....@@ -3917,7 +4101,8 @@
39174101
39184102 static struct file_system_type shmem_fs_type = {
39194103 .name = "tmpfs",
3920
- .mount = ramfs_mount,
4104
+ .init_fs_context = ramfs_init_fs_context,
4105
+ .parameters = ramfs_fs_parameters,
39214106 .kill_sb = kill_litter_super,
39224107 .fs_flags = FS_USERNS_MOUNT,
39234108 };
....@@ -3932,7 +4117,8 @@
39324117 return 0;
39334118 }
39344119
3935
-int shmem_unuse(swp_entry_t swap, struct page *page)
4120
+int shmem_unuse(unsigned int type, bool frontswap,
4121
+ unsigned long *fs_pages_to_unuse)
39364122 {
39374123 return 0;
39384124 }
....@@ -4047,7 +4233,7 @@
40474233
40484234 /**
40494235 * shmem_zero_setup - setup a shared anonymous mapping
4050
- * @vma: the vma to be mmapped is prepared by do_mmap_pgoff
4236
+ * @vma: the vma to be mmapped is prepared by do_mmap
40514237 */
40524238 int shmem_zero_setup(struct vm_area_struct *vma)
40534239 {
....@@ -4055,7 +4241,7 @@
40554241 loff_t size = vma->vm_end - vma->vm_start;
40564242
40574243 /*
4058
- * Cloning a new file under mmap_sem leads to a lock ordering conflict
4244
+ * Cloning a new file under mmap_lock leads to a lock ordering conflict
40594245 * between XFS directory reading and selinux: since this file is only
40604246 * accessible to the user through its mapping, use S_PRIVATE flag to
40614247 * bypass file security, in the same way as shmem_kernel_file_setup().
....@@ -4069,7 +4255,7 @@
40694255 vma->vm_file = file;
40704256 vma->vm_ops = &shmem_vm_ops;
40714257
4072
- if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
4258
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
40734259 ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
40744260 (vma->vm_end & HPAGE_PMD_MASK)) {
40754261 khugepaged_enter(vma, vma->vm_flags);
....@@ -4117,3 +4303,47 @@
41174303 #endif
41184304 }
41194305 EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
4306
+
4307
+void shmem_mark_page_lazyfree(struct page *page, bool tail)
4308
+{
4309
+ mark_page_lazyfree_movetail(page, tail);
4310
+}
4311
+EXPORT_SYMBOL_GPL(shmem_mark_page_lazyfree);
4312
+
4313
+int reclaim_shmem_address_space(struct address_space *mapping)
4314
+{
4315
+#ifdef CONFIG_SHMEM
4316
+ pgoff_t start = 0;
4317
+ struct page *page;
4318
+ LIST_HEAD(page_list);
4319
+ XA_STATE(xas, &mapping->i_pages, start);
4320
+
4321
+ if (!shmem_mapping(mapping))
4322
+ return -EINVAL;
4323
+
4324
+ lru_add_drain();
4325
+
4326
+ rcu_read_lock();
4327
+ xas_for_each(&xas, page, ULONG_MAX) {
4328
+ if (xas_retry(&xas, page))
4329
+ continue;
4330
+ if (xa_is_value(page))
4331
+ continue;
4332
+ if (isolate_lru_page(page))
4333
+ continue;
4334
+
4335
+ list_add(&page->lru, &page_list);
4336
+
4337
+ if (need_resched()) {
4338
+ xas_pause(&xas);
4339
+ cond_resched_rcu();
4340
+ }
4341
+ }
4342
+ rcu_read_unlock();
4343
+
4344
+ return reclaim_pages(&page_list);
4345
+#else
4346
+ return 0;
4347
+#endif
4348
+}
4349
+EXPORT_SYMBOL_GPL(reclaim_shmem_address_space);
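
reclaim_shmem_address_space() is exported for modules, so a driver can ask for the pages behind one shmem file to be reclaimed proactively. A hypothetical caller might look like the sketch below; it is not part of this patch and assumes the declaration is visible through <linux/shmem_fs.h>, as in the Android common kernel.

#include <linux/fs.h>
#include <linux/module.h>
#include <linux/shmem_fs.h>

/* hypothetical helper: reclaim the pages backing one shmem/memfd file */
static int example_reclaim_shmem_file(struct file *file)
{
	struct address_space *mapping = file_inode(file)->i_mapping;
	int reclaimed;

	/* returns -EINVAL for non-shmem mappings, else pages reclaimed */
	reclaimed = reclaim_shmem_address_space(mapping);
	if (reclaimed < 0)
		return reclaimed;

	pr_debug("reclaimed %d shmem pages\n", reclaimed);
	return 0;
}
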