2024-02-20 102a0743326a03cd1a1202ceda21e175b7d3575c
kernel/mm/shmem.c
@@ -36,8 +36,17 @@
 #include <linux/uio.h>
 #include <linux/khugepaged.h>
 #include <linux/hugetlb.h>
+#include <linux/frontswap.h>
+#include <linux/fs_parser.h>
+#include <linux/mm_inline.h>
 
 #include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */
+
+#include "internal.h"
+
+#undef CREATE_TRACE_POINTS
+#include <trace/hooks/shmem_fs.h>
+#include <trace/hooks/mm.h>
 
 static struct vfsmount *shm_mnt;
 
@@ -80,7 +89,6 @@
 #include <linux/uuid.h>
 
 #include <linux/uaccess.h>
-#include <asm/pgtable.h>
 
 #include "internal.h"
 
@@ -106,21 +114,43 @@
 	pgoff_t nr_unswapped;	/* how often writepage refused to swap out */
 };
 
+struct shmem_options {
+	unsigned long long blocks;
+	unsigned long long inodes;
+	struct mempolicy *mpol;
+	kuid_t uid;
+	kgid_t gid;
+	umode_t mode;
+	bool full_inums;
+	int huge;
+	int seen;
+#define SHMEM_SEEN_BLOCKS 1
+#define SHMEM_SEEN_INODES 2
+#define SHMEM_SEEN_HUGE 4
+#define SHMEM_SEEN_INUMS 8
+};
+
 #ifdef CONFIG_TMPFS
 static unsigned long shmem_default_max_blocks(void)
 {
-	return totalram_pages / 2;
+	return totalram_pages() / 2;
 }
 
 static unsigned long shmem_default_max_inodes(void)
 {
-	return min(totalram_pages - totalhigh_pages, totalram_pages / 2);
+	unsigned long nr_pages = totalram_pages();
+
+	return min(nr_pages - totalhigh_pages(), nr_pages / 2);
 }
 #endif
 
 static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
 static int shmem_replace_page(struct page **pagep, gfp_t gfp,
 				struct shmem_inode_info *info, pgoff_t index);
+static int shmem_swapin_page(struct inode *inode, pgoff_t index,
+			     struct page **pagep, enum sgp_type sgp,
+			     gfp_t gfp, struct vm_area_struct *vma,
+			     vm_fault_t *fault_type);
 static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 		struct page **pagep, enum sgp_type sgp,
 		gfp_t gfp, struct vm_area_struct *vma,
@@ -239,18 +269,78 @@
 static LIST_HEAD(shmem_swaplist);
 static DEFINE_MUTEX(shmem_swaplist_mutex);
 
-static int shmem_reserve_inode(struct super_block *sb)
+/*
+ * shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and
+ * produces a novel ino for the newly allocated inode.
+ *
+ * It may also be called when making a hard link to permit the space needed by
+ * each dentry. However, in that case, no new inode number is needed since that
+ * internally draws from another pool of inode numbers (currently global
+ * get_next_ino()). This case is indicated by passing NULL as inop.
+ */
+#define SHMEM_INO_BATCH 1024
+static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
 {
 	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
-	if (sbinfo->max_inodes) {
+	ino_t ino;
+
+	if (!(sb->s_flags & SB_KERNMOUNT)) {
 		spin_lock(&sbinfo->stat_lock);
-		if (!sbinfo->free_inodes) {
-			spin_unlock(&sbinfo->stat_lock);
-			return -ENOSPC;
+		if (sbinfo->max_inodes) {
+			if (!sbinfo->free_inodes) {
+				spin_unlock(&sbinfo->stat_lock);
+				return -ENOSPC;
+			}
+			sbinfo->free_inodes--;
 		}
-		sbinfo->free_inodes--;
+		if (inop) {
+			ino = sbinfo->next_ino++;
+			if (unlikely(is_zero_ino(ino)))
+				ino = sbinfo->next_ino++;
+			if (unlikely(!sbinfo->full_inums &&
+				     ino > UINT_MAX)) {
+				/*
+				 * Emulate get_next_ino uint wraparound for
+				 * compatibility
+				 */
+				if (IS_ENABLED(CONFIG_64BIT))
+					pr_warn("%s: inode number overflow on device %d, consider using inode64 mount option\n",
+						__func__, MINOR(sb->s_dev));
+				sbinfo->next_ino = 1;
+				ino = sbinfo->next_ino++;
+			}
+			*inop = ino;
+		}
 		spin_unlock(&sbinfo->stat_lock);
+	} else if (inop) {
+		/*
+		 * __shmem_file_setup, one of our callers, is lock-free: it
+		 * doesn't hold stat_lock in shmem_reserve_inode since
+		 * max_inodes is always 0, and is called from potentially
+		 * unknown contexts. As such, use a per-cpu batched allocator
+		 * which doesn't require the per-sb stat_lock unless we are at
+		 * the batch boundary.
+		 *
+		 * We don't need to worry about inode{32,64} since SB_KERNMOUNT
+		 * shmem mounts are not exposed to userspace, so we don't need
+		 * to worry about things like glibc compatibility.
+		 */
+		ino_t *next_ino;
+		next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu());
+		ino = *next_ino;
+		if (unlikely(ino % SHMEM_INO_BATCH == 0)) {
+			spin_lock(&sbinfo->stat_lock);
+			ino = sbinfo->next_ino;
+			sbinfo->next_ino += SHMEM_INO_BATCH;
+			spin_unlock(&sbinfo->stat_lock);
+			if (unlikely(is_zero_ino(ino)))
+				ino++;
+		}
+		*inop = ino;
+		*next_ino = ++ino;
+		put_cpu();
 	}
+
 	return 0;
 }
 
@@ -326,24 +416,20 @@
 }
 
 /*
- * Replace item expected in radix tree by a new item, while holding tree lock.
+ * Replace item expected in xarray by a new item, while holding xa_lock.
  */
-static int shmem_radix_tree_replace(struct address_space *mapping,
+static int shmem_replace_entry(struct address_space *mapping,
 			pgoff_t index, void *expected, void *replacement)
 {
-	struct radix_tree_node *node;
-	void __rcu **pslot;
+	XA_STATE(xas, &mapping->i_pages, index);
 	void *item;
 
 	VM_BUG_ON(!expected);
 	VM_BUG_ON(!replacement);
-	item = __radix_tree_lookup(&mapping->i_pages, index, &node, &pslot);
-	if (!item)
-		return -ENOENT;
+	item = xas_load(&xas);
 	if (item != expected)
 		return -ENOENT;
-	__radix_tree_replace(&mapping->i_pages, node, pslot,
-			     replacement, NULL);
+	xas_store(&xas, replacement);
 	return 0;
 }
 
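
shmem_replace_entry() is typical of the radix-tree-to-XArray conversion that runs through the rest of this diff: an XA_STATE cursor replaces the node/slot pair, and xas_load()/xas_store() replace the lookup and replace calls. As a quick orientation to the plain xa_* calls the later hunks also lean on (xa_load(), xa_cmpxchg(), xa_erase()), here is a minimal kernel-style sketch; it is an illustration only, not part of the patch.

#include <linux/xarray.h>

static DEFINE_XARRAY(demo_xa);

/* Store, look up, conditionally swap and erase one entry at index 5. */
static int demo_xarray(void *old, void *new)
{
	void *curr;
	int err;

	err = xa_err(xa_store(&demo_xa, 5, old, GFP_KERNEL));
	if (err)
		return err;

	curr = xa_load(&demo_xa, 5);	/* lockless lookup, like shmem_confirm_swap() */
	if (curr != old)
		return -ENOENT;

	/* Replace only if the old value is still there, like shmem_free_swap(). */
	curr = xa_cmpxchg(&demo_xa, 5, old, new, GFP_KERNEL);
	if (curr != old)
		return -ENOENT;

	xa_erase(&demo_xa, 5);
	return 0;
}
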
@@ -357,12 +443,7 @@
 static bool shmem_confirm_swap(struct address_space *mapping,
 			       pgoff_t index, swp_entry_t swap)
 {
-	void *item;
-
-	rcu_read_lock();
-	item = radix_tree_lookup(&mapping->i_pages, index);
-	rcu_read_unlock();
-	return item == swp_to_radix_entry(swap);
+	return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap);
 }
 
 /*
@@ -397,12 +478,12 @@
 #define SHMEM_HUGE_DENY		(-1)
 #define SHMEM_HUGE_FORCE	(-2)
 
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 /* ifdef here to avoid bloating shmem.o when not necessary */
 
 static int shmem_huge __read_mostly;
 
-#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
+#if defined(CONFIG_SYSFS)
 static int shmem_parse_huge(const char *str)
 {
 	if (!strcmp(str, "never"))
@@ -419,7 +500,9 @@
 		return SHMEM_HUGE_FORCE;
 	return -EINVAL;
 }
+#endif
 
+#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
 static const char *shmem_format_huge(int huge)
 {
 	switch (huge) {
@@ -570,7 +653,7 @@
 	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
 	return READ_ONCE(sbinfo->shrinklist_len);
 }
-#else /* !CONFIG_TRANSPARENT_HUGE_PAGECACHE */
+#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
 
 #define shmem_huge SHMEM_HUGE_DENY
 
@@ -579,11 +662,11 @@
 {
 	return 0;
 }
-#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo)
 {
-	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
+	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
 	    (shmem_huge == SHMEM_HUGE_FORCE || sbinfo->huge) &&
 	    shmem_huge != SHMEM_HUGE_DENY)
 		return true;
@@ -595,9 +678,13 @@
  */
 static int shmem_add_to_page_cache(struct page *page,
 				   struct address_space *mapping,
-				   pgoff_t index, void *expected)
+				   pgoff_t index, void *expected, gfp_t gfp,
+				   struct mm_struct *charge_mm)
 {
-	int error, nr = hpage_nr_pages(page);
+	XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page));
+	unsigned long i = 0;
+	unsigned long nr = compound_nr(page);
+	int error;
 
 	VM_BUG_ON_PAGE(PageTail(page), page);
 	VM_BUG_ON_PAGE(index != round_down(index, nr), page);
@@ -609,46 +696,53 @@
 	page->mapping = mapping;
 	page->index = index;
 
-	xa_lock_irq(&mapping->i_pages);
-	if (PageTransHuge(page)) {
-		void __rcu **results;
-		pgoff_t idx;
-		int i;
-
-		error = 0;
-		if (radix_tree_gang_lookup_slot(&mapping->i_pages,
-					&results, &idx, index, 1) &&
-				idx < index + HPAGE_PMD_NR) {
-			error = -EEXIST;
-		}
-
-		if (!error) {
-			for (i = 0; i < HPAGE_PMD_NR; i++) {
-				error = radix_tree_insert(&mapping->i_pages,
-						index + i, page + i);
-				VM_BUG_ON(error);
+	if (!PageSwapCache(page)) {
+		error = mem_cgroup_charge(page, charge_mm, gfp);
+		if (error) {
+			if (PageTransHuge(page)) {
+				count_vm_event(THP_FILE_FALLBACK);
+				count_vm_event(THP_FILE_FALLBACK_CHARGE);
 			}
-			count_vm_event(THP_FILE_ALLOC);
+			goto error;
 		}
-	} else if (!expected) {
-		error = radix_tree_insert(&mapping->i_pages, index, page);
-	} else {
-		error = shmem_radix_tree_replace(mapping, index, expected,
-								 page);
+	}
+	cgroup_throttle_swaprate(page, gfp);
+
+	do {
+		void *entry;
+		xas_lock_irq(&xas);
+		entry = xas_find_conflict(&xas);
+		if (entry != expected)
+			xas_set_err(&xas, -EEXIST);
+		xas_create_range(&xas);
+		if (xas_error(&xas))
+			goto unlock;
+next:
+		xas_store(&xas, page);
+		if (++i < nr) {
+			xas_next(&xas);
+			goto next;
+		}
+		if (PageTransHuge(page)) {
+			count_vm_event(THP_FILE_ALLOC);
+			__inc_node_page_state(page, NR_SHMEM_THPS);
+		}
+		mapping->nrpages += nr;
+		__mod_lruvec_page_state(page, NR_FILE_PAGES, nr);
+		__mod_lruvec_page_state(page, NR_SHMEM, nr);
+unlock:
+		xas_unlock_irq(&xas);
+	} while (xas_nomem(&xas, gfp));
+
+	if (xas_error(&xas)) {
+		error = xas_error(&xas);
+		goto error;
 	}
 
-	if (!error) {
-		mapping->nrpages += nr;
-		if (PageTransHuge(page))
-			__inc_node_page_state(page, NR_SHMEM_THPS);
-		__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
-		__mod_node_page_state(page_pgdat(page), NR_SHMEM, nr);
-		xa_unlock_irq(&mapping->i_pages);
-	} else {
-		page->mapping = NULL;
-		xa_unlock_irq(&mapping->i_pages);
-		page_ref_sub(page, nr);
-	}
+	return 0;
+error:
+	page->mapping = NULL;
+	page_ref_sub(page, nr);
 	return error;
 }
 
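
The do { ... } while (xas_nomem(...)) loop introduced above is the stock XArray idiom for a store that may need to allocate nodes while a spinlock is held: the store runs under xas_lock_irq(), and if it failed only for lack of memory, xas_nomem() allocates outside the lock and asks for a retry. A stripped-down sketch of just that idiom follows; insert_entry() is a hypothetical helper, not a function from this file.

#include <linux/xarray.h>

/* Insert @entry at @index in @xa, allocating nodes outside the lock if needed. */
static int insert_entry(struct xarray *xa, unsigned long index,
			void *entry, gfp_t gfp)
{
	XA_STATE(xas, xa, index);

	do {
		xas_lock_irq(&xas);
		if (xas_load(&xas))		/* slot already occupied */
			xas_set_err(&xas, -EEXIST);
		else
			xas_store(&xas, entry);
		xas_unlock_irq(&xas);
	} while (xas_nomem(&xas, gfp));		/* retry if the store needed memory */

	return xas_error(&xas);
}
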
@@ -663,27 +757,25 @@
 	VM_BUG_ON_PAGE(PageCompound(page), page);
 
 	xa_lock_irq(&mapping->i_pages);
-	error = shmem_radix_tree_replace(mapping, page->index, page, radswap);
+	error = shmem_replace_entry(mapping, page->index, page, radswap);
 	page->mapping = NULL;
 	mapping->nrpages--;
-	__dec_node_page_state(page, NR_FILE_PAGES);
-	__dec_node_page_state(page, NR_SHMEM);
+	__dec_lruvec_page_state(page, NR_FILE_PAGES);
+	__dec_lruvec_page_state(page, NR_SHMEM);
 	xa_unlock_irq(&mapping->i_pages);
 	put_page(page);
 	BUG_ON(error);
 }
 
 /*
- * Remove swap entry from radix tree, free the swap and its page cache.
+ * Remove swap entry from page cache, free the swap and its page cache.
  */
 static int shmem_free_swap(struct address_space *mapping,
 			   pgoff_t index, void *radswap)
 {
 	void *old;
 
-	xa_lock_irq(&mapping->i_pages);
-	old = radix_tree_delete_item(&mapping->i_pages, index, radswap);
-	xa_unlock_irq(&mapping->i_pages);
+	old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0);
 	if (old != radswap)
 		return -ENOENT;
 	free_swap_and_cache(radix_to_swp_entry(radswap));
@@ -700,29 +792,19 @@
 unsigned long shmem_partial_swap_usage(struct address_space *mapping,
 						pgoff_t start, pgoff_t end)
 {
-	struct radix_tree_iter iter;
-	void __rcu **slot;
+	XA_STATE(xas, &mapping->i_pages, start);
 	struct page *page;
 	unsigned long swapped = 0;
 
 	rcu_read_lock();
-
-	radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
-		if (iter.index >= end)
-			break;
-
-		page = radix_tree_deref_slot(slot);
-
-		if (radix_tree_deref_retry(page)) {
-			slot = radix_tree_iter_retry(&iter);
+	xas_for_each(&xas, page, end - 1) {
+		if (xas_retry(&xas, page))
 			continue;
-		}
-
-		if (radix_tree_exceptional_entry(page))
+		if (xa_is_value(page))
 			swapped++;
 
 		if (need_resched()) {
-			slot = radix_tree_iter_resume(slot, &iter);
+			xas_pause(&xas);
 			cond_resched_rcu();
 		}
 	}
@@ -797,7 +879,33 @@
 }
 
 /*
- * Remove range of pages and swap entries from radix tree, and free them.
+ * Check whether a hole-punch or truncation needs to split a huge page,
+ * returning true if no split was required, or the split has been successful.
+ *
+ * Eviction (or truncation to 0 size) should never need to split a huge page;
+ * but in rare cases might do so, if shmem_undo_range() failed to trylock on
+ * head, and then succeeded to trylock on tail.
+ *
+ * A split can only succeed when there are no additional references on the
+ * huge page: so the split below relies upon find_get_entries() having stopped
+ * when it found a subpage of the huge page, without getting further references.
+ */
+static bool shmem_punch_compound(struct page *page, pgoff_t start, pgoff_t end)
+{
+	if (!PageTransCompound(page))
+		return true;
+
+	/* Just proceed to delete a huge page wholly within the range punched */
+	if (PageHead(page) &&
+	    page->index >= start && page->index + HPAGE_PMD_NR <= end)
+		return true;
+
+	/* Try to split huge page, so we can truly punch the hole or truncate */
+	return split_huge_page(page) >= 0;
+}
+
+/*
+ * Remove range of pages and swap entries from page cache, and free them.
  * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
  */
 static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
@@ -833,7 +941,7 @@
 			if (index >= end)
 				break;
 
-			if (radix_tree_exceptional_entry(page)) {
+			if (xa_is_value(page)) {
 				if (unfalloc)
 					continue;
 				nr_swaps_freed += !shmem_free_swap(mapping,
@@ -846,31 +954,11 @@
846954 if (!trylock_page(page))
847955 continue;
848956
849
- if (PageTransTail(page)) {
850
- /* Middle of THP: zero out the page */
851
- clear_highpage(page);
852
- unlock_page(page);
853
- continue;
854
- } else if (PageTransHuge(page)) {
855
- if (index == round_down(end, HPAGE_PMD_NR)) {
856
- /*
857
- * Range ends in the middle of THP:
858
- * zero out the page
859
- */
860
- clear_highpage(page);
861
- unlock_page(page);
862
- continue;
863
- }
864
- index += HPAGE_PMD_NR - 1;
865
- i += HPAGE_PMD_NR - 1;
866
- }
867
-
868
- if (!unfalloc || !PageUptodate(page)) {
869
- VM_BUG_ON_PAGE(PageTail(page), page);
870
- if (page_mapping(page) == mapping) {
871
- VM_BUG_ON_PAGE(PageWriteback(page), page);
957
+ if ((!unfalloc || !PageUptodate(page)) &&
958
+ page_mapping(page) == mapping) {
959
+ VM_BUG_ON_PAGE(PageWriteback(page), page);
960
+ if (shmem_punch_compound(page, start, end))
872961 truncate_inode_page(mapping, page);
873
- }
874962 }
875963 unlock_page(page);
876964 }
@@ -930,7 +1018,7 @@
9301018 if (index >= end)
9311019 break;
9321020
933
- if (radix_tree_exceptional_entry(page)) {
1021
+ if (xa_is_value(page)) {
9341022 if (unfalloc)
9351023 continue;
9361024 if (shmem_free_swap(mapping, index, page)) {
@@ -944,42 +1032,24 @@
9441032
9451033 lock_page(page);
9461034
947
- if (PageTransTail(page)) {
948
- /* Middle of THP: zero out the page */
949
- clear_highpage(page);
950
- unlock_page(page);
951
- /*
952
- * Partial thp truncate due 'start' in middle
953
- * of THP: don't need to look on these pages
954
- * again on !pvec.nr restart.
955
- */
956
- if (index != round_down(end, HPAGE_PMD_NR))
957
- start++;
958
- continue;
959
- } else if (PageTransHuge(page)) {
960
- if (index == round_down(end, HPAGE_PMD_NR)) {
961
- /*
962
- * Range ends in the middle of THP:
963
- * zero out the page
964
- */
965
- clear_highpage(page);
966
- unlock_page(page);
967
- continue;
968
- }
969
- index += HPAGE_PMD_NR - 1;
970
- i += HPAGE_PMD_NR - 1;
971
- }
972
-
9731035 if (!unfalloc || !PageUptodate(page)) {
974
- VM_BUG_ON_PAGE(PageTail(page), page);
975
- if (page_mapping(page) == mapping) {
976
- VM_BUG_ON_PAGE(PageWriteback(page), page);
977
- truncate_inode_page(mapping, page);
978
- } else {
1036
+ if (page_mapping(page) != mapping) {
9791037 /* Page was replaced by swap: retry */
9801038 unlock_page(page);
9811039 index--;
9821040 break;
1041
+ }
1042
+ VM_BUG_ON_PAGE(PageWriteback(page), page);
1043
+ if (shmem_punch_compound(page, start, end))
1044
+ truncate_inode_page(mapping, page);
1045
+ else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
1046
+ /* Wipe the page and don't get stuck */
1047
+ clear_highpage(page);
1048
+ flush_dcache_page(page);
1049
+ set_page_dirty(page);
1050
+ if (index <
1051
+ round_up(start, HPAGE_PMD_NR))
1052
+ start = index + 1;
9831053 }
9841054 }
9851055 unlock_page(page);
@@ -1067,7 +1137,7 @@
10671137 * Part of the huge page can be beyond i_size: subject
10681138 * to shrink under memory pressure.
10691139 */
1070
- if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
1140
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
10711141 spin_lock(&sbinfo->shrinklist_lock);
10721142 /*
10731143 * _careful to defend against unlocked access to
@@ -1106,9 +1176,14 @@
11061176 }
11071177 spin_unlock(&sbinfo->shrinklist_lock);
11081178 }
1109
- if (!list_empty(&info->swaplist)) {
1179
+ while (!list_empty(&info->swaplist)) {
1180
+ /* Wait while shmem_unuse() is scanning this inode... */
1181
+ wait_var_event(&info->stop_eviction,
1182
+ !atomic_read(&info->stop_eviction));
11101183 mutex_lock(&shmem_swaplist_mutex);
1111
- list_del_init(&info->swaplist);
1184
+ /* ...but beware of the race if we peeked too early */
1185
+ if (!atomic_read(&info->stop_eviction))
1186
+ list_del_init(&info->swaplist);
11121187 mutex_unlock(&shmem_swaplist_mutex);
11131188 }
11141189 }
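
The eviction path in the hunk above now waits on info->stop_eviction, which the reworked shmem_unuse() (further down in this diff) raises around its scan of the inode; wait_var_event() and wake_up_var() pair an arbitrary address with a condition. The condensed sketch below shows the two halves of that handshake as hypothetical helpers, mirroring the pattern in the patch rather than quoting it.

#include <linux/atomic.h>
#include <linux/wait_bit.h>

static atomic_t stop_eviction = ATOMIC_INIT(0);

/* Scanner side: hold off eviction for the duration of the scan. */
static void scan_side(void)
{
	atomic_inc(&stop_eviction);
	/* ... scan the inode's swap entries ... */
	if (atomic_dec_and_test(&stop_eviction))
		wake_up_var(&stop_eviction);
}

/* Eviction side: sleep until no scanner holds the counter. */
static void evict_side(void)
{
	wait_var_event(&stop_eviction, !atomic_read(&stop_eviction));
	/* ... safe to tear the inode down ... */
}
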
@@ -1119,166 +1194,174 @@
11191194 clear_inode(inode);
11201195 }
11211196
1122
-static unsigned long find_swap_entry(struct radix_tree_root *root, void *item)
1197
+extern struct swap_info_struct *swap_info[];
1198
+
1199
+static int shmem_find_swap_entries(struct address_space *mapping,
1200
+ pgoff_t start, unsigned int nr_entries,
1201
+ struct page **entries, pgoff_t *indices,
1202
+ unsigned int type, bool frontswap)
11231203 {
1124
- struct radix_tree_iter iter;
1125
- void __rcu **slot;
1126
- unsigned long found = -1;
1127
- unsigned int checked = 0;
1204
+ XA_STATE(xas, &mapping->i_pages, start);
1205
+ struct page *page;
1206
+ swp_entry_t entry;
1207
+ unsigned int ret = 0;
1208
+
1209
+ if (!nr_entries)
1210
+ return 0;
11281211
11291212 rcu_read_lock();
1130
- radix_tree_for_each_slot(slot, root, &iter, 0) {
1131
- void *entry = radix_tree_deref_slot(slot);
1132
-
1133
- if (radix_tree_deref_retry(entry)) {
1134
- slot = radix_tree_iter_retry(&iter);
1213
+ xas_for_each(&xas, page, ULONG_MAX) {
1214
+ if (xas_retry(&xas, page))
11351215 continue;
1216
+
1217
+ if (!xa_is_value(page))
1218
+ continue;
1219
+
1220
+ entry = radix_to_swp_entry(page);
1221
+ if (swp_type(entry) != type)
1222
+ continue;
1223
+ if (frontswap &&
1224
+ !frontswap_test(swap_info[type], swp_offset(entry)))
1225
+ continue;
1226
+
1227
+ indices[ret] = xas.xa_index;
1228
+ entries[ret] = page;
1229
+
1230
+ if (need_resched()) {
1231
+ xas_pause(&xas);
1232
+ cond_resched_rcu();
11361233 }
1137
- if (entry == item) {
1138
- found = iter.index;
1234
+ if (++ret == nr_entries)
11391235 break;
1140
- }
1141
- checked++;
1142
- if ((checked % 4096) != 0)
1143
- continue;
1144
- slot = radix_tree_iter_resume(slot, &iter);
1145
- cond_resched_rcu();
11461236 }
1147
-
11481237 rcu_read_unlock();
1149
- return found;
1238
+
1239
+ return ret;
1240
+}
1241
+
1242
+/*
1243
+ * Move the swapped pages for an inode to page cache. Returns the count
1244
+ * of pages swapped in, or the error in case of failure.
1245
+ */
1246
+static int shmem_unuse_swap_entries(struct inode *inode, struct pagevec pvec,
1247
+ pgoff_t *indices)
1248
+{
1249
+ int i = 0;
1250
+ int ret = 0;
1251
+ int error = 0;
1252
+ struct address_space *mapping = inode->i_mapping;
1253
+
1254
+ for (i = 0; i < pvec.nr; i++) {
1255
+ struct page *page = pvec.pages[i];
1256
+
1257
+ if (!xa_is_value(page))
1258
+ continue;
1259
+ error = shmem_swapin_page(inode, indices[i],
1260
+ &page, SGP_CACHE,
1261
+ mapping_gfp_mask(mapping),
1262
+ NULL, NULL);
1263
+ if (error == 0) {
1264
+ unlock_page(page);
1265
+ put_page(page);
1266
+ ret++;
1267
+ }
1268
+ if (error == -ENOMEM)
1269
+ break;
1270
+ error = 0;
1271
+ }
1272
+ return error ? error : ret;
11501273 }
11511274
11521275 /*
11531276 * If swap found in inode, free it and move page from swapcache to filecache.
11541277 */
1155
-static int shmem_unuse_inode(struct shmem_inode_info *info,
1156
- swp_entry_t swap, struct page **pagep)
1278
+static int shmem_unuse_inode(struct inode *inode, unsigned int type,
1279
+ bool frontswap, unsigned long *fs_pages_to_unuse)
11571280 {
1158
- struct address_space *mapping = info->vfs_inode.i_mapping;
1159
- void *radswap;
1160
- pgoff_t index;
1161
- gfp_t gfp;
1162
- int error = 0;
1281
+ struct address_space *mapping = inode->i_mapping;
1282
+ pgoff_t start = 0;
1283
+ struct pagevec pvec;
1284
+ pgoff_t indices[PAGEVEC_SIZE];
1285
+ bool frontswap_partial = (frontswap && *fs_pages_to_unuse > 0);
1286
+ int ret = 0;
11631287
1164
- radswap = swp_to_radix_entry(swap);
1165
- index = find_swap_entry(&mapping->i_pages, radswap);
1166
- if (index == -1)
1167
- return -EAGAIN; /* tell shmem_unuse we found nothing */
1288
+ pagevec_init(&pvec);
1289
+ do {
1290
+ unsigned int nr_entries = PAGEVEC_SIZE;
11681291
1169
- /*
1170
- * Move _head_ to start search for next from here.
1171
- * But be careful: shmem_evict_inode checks list_empty without taking
1172
- * mutex, and there's an instant in list_move_tail when info->swaplist
1173
- * would appear empty, if it were the only one on shmem_swaplist.
1174
- */
1175
- if (shmem_swaplist.next != &info->swaplist)
1176
- list_move_tail(&shmem_swaplist, &info->swaplist);
1292
+ if (frontswap_partial && *fs_pages_to_unuse < PAGEVEC_SIZE)
1293
+ nr_entries = *fs_pages_to_unuse;
11771294
1178
- gfp = mapping_gfp_mask(mapping);
1179
- if (shmem_should_replace_page(*pagep, gfp)) {
1180
- mutex_unlock(&shmem_swaplist_mutex);
1181
- error = shmem_replace_page(pagep, gfp, info, index);
1182
- mutex_lock(&shmem_swaplist_mutex);
1183
- /*
1184
- * We needed to drop mutex to make that restrictive page
1185
- * allocation, but the inode might have been freed while we
1186
- * dropped it: although a racing shmem_evict_inode() cannot
1187
- * complete without emptying the radix_tree, our page lock
1188
- * on this swapcache page is not enough to prevent that -
1189
- * free_swap_and_cache() of our swap entry will only
1190
- * trylock_page(), removing swap from radix_tree whatever.
1191
- *
1192
- * We must not proceed to shmem_add_to_page_cache() if the
1193
- * inode has been freed, but of course we cannot rely on
1194
- * inode or mapping or info to check that. However, we can
1195
- * safely check if our swap entry is still in use (and here
1196
- * it can't have got reused for another page): if it's still
1197
- * in use, then the inode cannot have been freed yet, and we
1198
- * can safely proceed (if it's no longer in use, that tells
1199
- * nothing about the inode, but we don't need to unuse swap).
1200
- */
1201
- if (!page_swapcount(*pagep))
1202
- error = -ENOENT;
1203
- }
1204
-
1205
- /*
1206
- * We rely on shmem_swaplist_mutex, not only to protect the swaplist,
1207
- * but also to hold up shmem_evict_inode(): so inode cannot be freed
1208
- * beneath us (pagelock doesn't help until the page is in pagecache).
1209
- */
1210
- if (!error)
1211
- error = shmem_add_to_page_cache(*pagep, mapping, index,
1212
- radswap);
1213
- if (error != -ENOMEM) {
1214
- /*
1215
- * Truncation and eviction use free_swap_and_cache(), which
1216
- * only does trylock page: if we raced, best clean up here.
1217
- */
1218
- delete_from_swap_cache(*pagep);
1219
- set_page_dirty(*pagep);
1220
- if (!error) {
1221
- spin_lock_irq(&info->lock);
1222
- info->swapped--;
1223
- spin_unlock_irq(&info->lock);
1224
- swap_free(swap);
1295
+ pvec.nr = shmem_find_swap_entries(mapping, start, nr_entries,
1296
+ pvec.pages, indices,
1297
+ type, frontswap);
1298
+ if (pvec.nr == 0) {
1299
+ ret = 0;
1300
+ break;
12251301 }
1226
- }
1227
- return error;
1302
+
1303
+ ret = shmem_unuse_swap_entries(inode, pvec, indices);
1304
+ if (ret < 0)
1305
+ break;
1306
+
1307
+ if (frontswap_partial) {
1308
+ *fs_pages_to_unuse -= ret;
1309
+ if (*fs_pages_to_unuse == 0) {
1310
+ ret = FRONTSWAP_PAGES_UNUSED;
1311
+ break;
1312
+ }
1313
+ }
1314
+
1315
+ start = indices[pvec.nr - 1];
1316
+ } while (true);
1317
+
1318
+ return ret;
12281319 }
12291320
12301321 /*
1231
- * Search through swapped inodes to find and replace swap by page.
1322
+ * Read all the shared memory data that resides in the swap
1323
+ * device 'type' back into memory, so the swap device can be
1324
+ * unused.
12321325 */
1233
-int shmem_unuse(swp_entry_t swap, struct page *page)
1326
+int shmem_unuse(unsigned int type, bool frontswap,
1327
+ unsigned long *fs_pages_to_unuse)
12341328 {
1235
- struct list_head *this, *next;
1236
- struct shmem_inode_info *info;
1237
- struct mem_cgroup *memcg;
1329
+ struct shmem_inode_info *info, *next;
12381330 int error = 0;
12391331
1240
- /*
1241
- * There's a faint possibility that swap page was replaced before
1242
- * caller locked it: caller will come back later with the right page.
1243
- */
1244
- if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val))
1245
- goto out;
1246
-
1247
- /*
1248
- * Charge page using GFP_KERNEL while we can wait, before taking
1249
- * the shmem_swaplist_mutex which might hold up shmem_writepage().
1250
- * Charged back to the user (not to caller) when swap account is used.
1251
- */
1252
- error = mem_cgroup_try_charge_delay(page, current->mm, GFP_KERNEL,
1253
- &memcg, false);
1254
- if (error)
1255
- goto out;
1256
- /* No radix_tree_preload: swap entry keeps a place for page in tree */
1257
- error = -EAGAIN;
1332
+ if (list_empty(&shmem_swaplist))
1333
+ return 0;
12581334
12591335 mutex_lock(&shmem_swaplist_mutex);
1260
- list_for_each_safe(this, next, &shmem_swaplist) {
1261
- info = list_entry(this, struct shmem_inode_info, swaplist);
1262
- if (info->swapped)
1263
- error = shmem_unuse_inode(info, swap, &page);
1264
- else
1336
+ list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
1337
+ if (!info->swapped) {
12651338 list_del_init(&info->swaplist);
1339
+ continue;
1340
+ }
1341
+ /*
1342
+ * Drop the swaplist mutex while searching the inode for swap;
1343
+ * but before doing so, make sure shmem_evict_inode() will not
1344
+ * remove placeholder inode from swaplist, nor let it be freed
1345
+ * (igrab() would protect from unlink, but not from unmount).
1346
+ */
1347
+ atomic_inc(&info->stop_eviction);
1348
+ mutex_unlock(&shmem_swaplist_mutex);
1349
+
1350
+ error = shmem_unuse_inode(&info->vfs_inode, type, frontswap,
1351
+ fs_pages_to_unuse);
12661352 cond_resched();
1267
- if (error != -EAGAIN)
1353
+
1354
+ mutex_lock(&shmem_swaplist_mutex);
1355
+ next = list_next_entry(info, swaplist);
1356
+ if (!info->swapped)
1357
+ list_del_init(&info->swaplist);
1358
+ if (atomic_dec_and_test(&info->stop_eviction))
1359
+ wake_up_var(&info->stop_eviction);
1360
+ if (error)
12681361 break;
1269
- /* found nothing in this: move on to search the next */
12701362 }
12711363 mutex_unlock(&shmem_swaplist_mutex);
12721364
1273
- if (error) {
1274
- if (error != -ENOMEM)
1275
- error = 0;
1276
- mem_cgroup_cancel_charge(page, memcg, false);
1277
- } else
1278
- mem_cgroup_commit_charge(page, memcg, true, false);
1279
-out:
1280
- unlock_page(page);
1281
- put_page(page);
12821365 return error;
12831366 }
12841367
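
The rewritten swapoff path above gathers swap entries in pagevec-sized batches and restarts each pass from the last index it found, instead of the old one-entry-at-a-time find_swap_entry() walk. The toy user-space model below shows the same resumable batch loop; the array, batch size and helpers are made up for illustration.

#include <stdio.h>

#define BATCH 15			/* stands in for PAGEVEC_SIZE */
#define NR_SLOTS 100

static int slot_is_swap[NR_SLOTS];	/* toy model of value entries in the mapping */

/* Collect up to n indices of swap entries at or after start; returns how many. */
static int find_batch(int start, int *idx, int n)
{
	int nr = 0;

	for (int i = start; i < NR_SLOTS && nr < n; i++)
		if (slot_is_swap[i])
			idx[nr++] = i;
	return nr;
}

int main(void)
{
	int idx[BATCH];
	int start = 0, nr;

	for (int i = 0; i < NR_SLOTS; i += 3)	/* every third slot holds a swap entry */
		slot_is_swap[i] = 1;

	/* Resumable batch loop, shaped like shmem_unuse_inode(). */
	while ((nr = find_batch(start, idx, BATCH)) > 0) {
		for (int i = 0; i < nr; i++)
			slot_is_swap[idx[i]] = 0;	/* "swap it back in" */
		start = idx[nr - 1];			/* resume from the last index seen */
	}
	printf("all swap entries drained\n");
	return 0;
}
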
@@ -1348,6 +1431,7 @@
13481431 SetPageUptodate(page);
13491432 }
13501433
1434
+ trace_android_vh_set_shmem_page_flag(page);
13511435 swap = get_swap_page(page);
13521436 if (!swap.val)
13531437 goto redirty;
@@ -1362,9 +1446,11 @@
13621446 */
13631447 mutex_lock(&shmem_swaplist_mutex);
13641448 if (list_empty(&info->swaplist))
1365
- list_add_tail(&info->swaplist, &shmem_swaplist);
1449
+ list_add(&info->swaplist, &shmem_swaplist);
13661450
1367
- if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
1451
+ if (add_to_swap_cache(page, swap,
1452
+ __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN,
1453
+ NULL) == 0) {
13681454 spin_lock_irq(&info->lock);
13691455 shmem_recalc_inode(inode);
13701456 info->swapped++;
@@ -1447,11 +1533,11 @@
14471533 {
14481534 struct vm_area_struct pvma;
14491535 struct page *page;
1450
- struct vm_fault vmf;
1536
+ struct vm_fault vmf = {
1537
+ .vma = &pvma,
1538
+ };
14511539
14521540 shmem_pseudo_vma_init(&pvma, info, index);
1453
- vmf.vma = &pvma;
1454
- vmf.address = 0;
14551541 page = swap_cluster_readahead(swap, gfp, &vmf);
14561542 shmem_pseudo_vma_destroy(&pvma);
14571543
@@ -1462,23 +1548,14 @@
14621548 struct shmem_inode_info *info, pgoff_t index)
14631549 {
14641550 struct vm_area_struct pvma;
1465
- struct inode *inode = &info->vfs_inode;
1466
- struct address_space *mapping = inode->i_mapping;
1467
- pgoff_t idx, hindex;
1468
- void __rcu **results;
1551
+ struct address_space *mapping = info->vfs_inode.i_mapping;
1552
+ pgoff_t hindex;
14691553 struct page *page;
14701554
1471
- if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
1472
- return NULL;
1473
-
14741555 hindex = round_down(index, HPAGE_PMD_NR);
1475
- rcu_read_lock();
1476
- if (radix_tree_gang_lookup_slot(&mapping->i_pages, &results, &idx,
1477
- hindex, 1) && idx < hindex + HPAGE_PMD_NR) {
1478
- rcu_read_unlock();
1556
+ if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1,
1557
+ XA_PRESENT))
14791558 return NULL;
1480
- }
1481
- rcu_read_unlock();
14821559
14831560 shmem_pseudo_vma_init(&pvma, info, hindex);
14841561 page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN,
@@ -1486,6 +1563,8 @@
14861563 shmem_pseudo_vma_destroy(&pvma);
14871564 if (page)
14881565 prep_transhuge_page(page);
1566
+ else
1567
+ count_vm_event(THP_FILE_FALLBACK);
14891568 return page;
14901569 }
14911570
@@ -1493,7 +1572,11 @@
14931572 struct shmem_inode_info *info, pgoff_t index)
14941573 {
14951574 struct vm_area_struct pvma;
1496
- struct page *page;
1575
+ struct page *page = NULL;
1576
+
1577
+ trace_android_vh_shmem_alloc_page(&page);
1578
+ if (page)
1579
+ return page;
14971580
14981581 shmem_pseudo_vma_init(&pvma, info, index);
14991582 page = alloc_page_vma(gfp, &pvma, 0);
....@@ -1511,7 +1594,7 @@
15111594 int nr;
15121595 int err = -ENOSPC;
15131596
1514
- if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
1597
+ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
15151598 huge = false;
15161599 nr = huge ? HPAGE_PMD_NR : 1;
15171600
@@ -1589,11 +1672,11 @@
15891672 * a nice clean interface for us to replace oldpage by newpage there.
15901673 */
15911674 xa_lock_irq(&swap_mapping->i_pages);
1592
- error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage,
1593
- newpage);
1675
+ error = shmem_replace_entry(swap_mapping, swap_index, oldpage, newpage);
15941676 if (!error) {
1595
- __inc_node_page_state(newpage, NR_FILE_PAGES);
1596
- __dec_node_page_state(oldpage, NR_FILE_PAGES);
1677
+ mem_cgroup_migrate(oldpage, newpage);
1678
+ __inc_lruvec_page_state(newpage, NR_FILE_PAGES);
1679
+ __dec_lruvec_page_state(oldpage, NR_FILE_PAGES);
15971680 }
15981681 xa_unlock_irq(&swap_mapping->i_pages);
15991682
....@@ -1605,8 +1688,7 @@
16051688 */
16061689 oldpage = newpage;
16071690 } else {
1608
- mem_cgroup_migrate(oldpage, newpage);
1609
- lru_cache_add_anon(newpage);
1691
+ lru_cache_add(newpage);
16101692 *pagep = newpage;
16111693 }
16121694
@@ -1620,13 +1702,109 @@
16201702 }
16211703
16221704 /*
1705
+ * Swap in the page pointed to by *pagep.
1706
+ * Caller has to make sure that *pagep contains a valid swapped page.
1707
+ * Returns 0 and the page in pagep if success. On failure, returns the
1708
+ * error code and NULL in *pagep.
1709
+ */
1710
+static int shmem_swapin_page(struct inode *inode, pgoff_t index,
1711
+ struct page **pagep, enum sgp_type sgp,
1712
+ gfp_t gfp, struct vm_area_struct *vma,
1713
+ vm_fault_t *fault_type)
1714
+{
1715
+ struct address_space *mapping = inode->i_mapping;
1716
+ struct shmem_inode_info *info = SHMEM_I(inode);
1717
+ struct mm_struct *charge_mm = vma ? vma->vm_mm : current->mm;
1718
+ struct page *page;
1719
+ swp_entry_t swap;
1720
+ int error;
1721
+
1722
+ VM_BUG_ON(!*pagep || !xa_is_value(*pagep));
1723
+ swap = radix_to_swp_entry(*pagep);
1724
+ *pagep = NULL;
1725
+
1726
+ /* Look it up and read it in.. */
1727
+ page = lookup_swap_cache(swap, NULL, 0);
1728
+ if (!page) {
1729
+ /* Or update major stats only when swapin succeeds?? */
1730
+ if (fault_type) {
1731
+ *fault_type |= VM_FAULT_MAJOR;
1732
+ count_vm_event(PGMAJFAULT);
1733
+ count_memcg_event_mm(charge_mm, PGMAJFAULT);
1734
+ }
1735
+ /* Here we actually start the io */
1736
+ page = shmem_swapin(swap, gfp, info, index);
1737
+ if (!page) {
1738
+ error = -ENOMEM;
1739
+ goto failed;
1740
+ }
1741
+ }
1742
+
1743
+ /* We have to do this with page locked to prevent races */
1744
+ lock_page(page);
1745
+ if (!PageSwapCache(page) || page_private(page) != swap.val ||
1746
+ !shmem_confirm_swap(mapping, index, swap)) {
1747
+ error = -EEXIST;
1748
+ goto unlock;
1749
+ }
1750
+ if (!PageUptodate(page)) {
1751
+ error = -EIO;
1752
+ goto failed;
1753
+ }
1754
+ wait_on_page_writeback(page);
1755
+
1756
+ /*
1757
+ * Some architectures may have to restore extra metadata to the
1758
+ * physical page after reading from swap.
1759
+ */
1760
+ arch_swap_restore(swap, page);
1761
+
1762
+ if (shmem_should_replace_page(page, gfp)) {
1763
+ error = shmem_replace_page(&page, gfp, info, index);
1764
+ if (error)
1765
+ goto failed;
1766
+ }
1767
+
1768
+ error = shmem_add_to_page_cache(page, mapping, index,
1769
+ swp_to_radix_entry(swap), gfp,
1770
+ charge_mm);
1771
+ if (error)
1772
+ goto failed;
1773
+
1774
+ spin_lock_irq(&info->lock);
1775
+ info->swapped--;
1776
+ shmem_recalc_inode(inode);
1777
+ spin_unlock_irq(&info->lock);
1778
+
1779
+ if (sgp == SGP_WRITE)
1780
+ mark_page_accessed(page);
1781
+
1782
+ delete_from_swap_cache(page);
1783
+ set_page_dirty(page);
1784
+ swap_free(swap);
1785
+
1786
+ *pagep = page;
1787
+ return 0;
1788
+failed:
1789
+ if (!shmem_confirm_swap(mapping, index, swap))
1790
+ error = -EEXIST;
1791
+unlock:
1792
+ if (page) {
1793
+ unlock_page(page);
1794
+ put_page(page);
1795
+ }
1796
+
1797
+ return error;
1798
+}
1799
+
1800
+/*
16231801 * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
16241802 *
16251803 * If we allocate a new one we do not mark it dirty. That's up to the
16261804 * vm. If we swap it in we mark it dirty since we also free the swap
16271805 * entry since a page cannot live in both the swap and page cache.
16281806 *
1629
- * fault_mm and fault_type are only supplied by shmem_fault:
1807
+ * vma, vmf, and fault_type are only supplied by shmem_fault:
16301808 * otherwise they are NULL.
16311809 */
16321810 static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
@@ -1638,9 +1816,7 @@
16381816 struct shmem_inode_info *info = SHMEM_I(inode);
16391817 struct shmem_sb_info *sbinfo;
16401818 struct mm_struct *charge_mm;
1641
- struct mem_cgroup *memcg;
16421819 struct page *page;
1643
- swp_entry_t swap;
16441820 enum sgp_type sgp_huge = sgp;
16451821 pgoff_t hindex = index;
16461822 int error;
@@ -1652,19 +1828,37 @@
16521828 if (sgp == SGP_NOHUGE || sgp == SGP_HUGE)
16531829 sgp = SGP_CACHE;
16541830 repeat:
1655
- swap.val = 0;
1656
- page = find_lock_entry(mapping, index);
1657
- if (radix_tree_exceptional_entry(page)) {
1658
- swap = radix_to_swp_entry(page);
1659
- page = NULL;
1660
- }
1661
-
16621831 if (sgp <= SGP_CACHE &&
16631832 ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
1664
- error = -EINVAL;
1665
- goto unlock;
1833
+ return -EINVAL;
16661834 }
16671835
1836
+ sbinfo = SHMEM_SB(inode->i_sb);
1837
+ charge_mm = vma ? vma->vm_mm : current->mm;
1838
+
1839
+ page = find_lock_entry(mapping, index);
1840
+
1841
+ if (page && vma && userfaultfd_minor(vma)) {
1842
+ if (!xa_is_value(page)) {
1843
+ unlock_page(page);
1844
+ put_page(page);
1845
+ }
1846
+ *fault_type = handle_userfault(vmf, VM_UFFD_MINOR);
1847
+ return 0;
1848
+ }
1849
+
1850
+ if (xa_is_value(page)) {
1851
+ error = shmem_swapin_page(inode, index, &page,
1852
+ sgp, gfp, vma, fault_type);
1853
+ if (error == -EEXIST)
1854
+ goto repeat;
1855
+
1856
+ *pagep = page;
1857
+ return error;
1858
+ }
1859
+
1860
+ if (page)
1861
+ hindex = page->index;
16681862 if (page && sgp == SGP_WRITE)
16691863 mark_page_accessed(page);
16701864
@@ -1675,230 +1869,141 @@
16751869 unlock_page(page);
16761870 put_page(page);
16771871 page = NULL;
1872
+ hindex = index;
16781873 }
1679
- if (page || (sgp == SGP_READ && !swap.val)) {
1680
- *pagep = page;
1681
- return 0;
1682
- }
1874
+ if (page || sgp == SGP_READ)
1875
+ goto out;
16831876
16841877 /*
16851878 * Fast cache lookup did not find it:
16861879 * bring it back from swap or allocate.
16871880 */
1688
- sbinfo = SHMEM_SB(inode->i_sb);
1689
- charge_mm = vma ? vma->vm_mm : current->mm;
16901881
1691
- if (swap.val) {
1692
- /* Look it up and read it in.. */
1693
- page = lookup_swap_cache(swap, NULL, 0);
1694
- if (!page) {
1695
- /* Or update major stats only when swapin succeeds?? */
1696
- if (fault_type) {
1697
- *fault_type |= VM_FAULT_MAJOR;
1698
- count_vm_event(PGMAJFAULT);
1699
- count_memcg_event_mm(charge_mm, PGMAJFAULT);
1700
- }
1701
- /* Here we actually start the io */
1702
- page = shmem_swapin(swap, gfp, info, index);
1703
- if (!page) {
1704
- error = -ENOMEM;
1705
- goto failed;
1706
- }
1707
- }
1882
+ if (vma && userfaultfd_missing(vma)) {
1883
+ *fault_type = handle_userfault(vmf, VM_UFFD_MISSING);
1884
+ return 0;
1885
+ }
17081886
1709
- /* We have to do this with page locked to prevent races */
1710
- lock_page(page);
1711
- if (!PageSwapCache(page) || page_private(page) != swap.val ||
1712
- !shmem_confirm_swap(mapping, index, swap)) {
1713
- error = -EEXIST; /* try again */
1714
- goto unlock;
1715
- }
1716
- if (!PageUptodate(page)) {
1717
- error = -EIO;
1718
- goto failed;
1719
- }
1720
- wait_on_page_writeback(page);
1887
+ /* shmem_symlink() */
1888
+ if (mapping->a_ops != &shmem_aops)
1889
+ goto alloc_nohuge;
1890
+ if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE)
1891
+ goto alloc_nohuge;
1892
+ if (shmem_huge == SHMEM_HUGE_FORCE)
1893
+ goto alloc_huge;
1894
+ switch (sbinfo->huge) {
1895
+ case SHMEM_HUGE_NEVER:
1896
+ goto alloc_nohuge;
1897
+ case SHMEM_HUGE_WITHIN_SIZE: {
1898
+ loff_t i_size;
1899
+ pgoff_t off;
17211900
1722
- if (shmem_should_replace_page(page, gfp)) {
1723
- error = shmem_replace_page(&page, gfp, info, index);
1724
- if (error)
1725
- goto failed;
1726
- }
1727
-
1728
- error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
1729
- false);
1730
- if (!error) {
1731
- error = shmem_add_to_page_cache(page, mapping, index,
1732
- swp_to_radix_entry(swap));
1733
- /*
1734
- * We already confirmed swap under page lock, and make
1735
- * no memory allocation here, so usually no possibility
1736
- * of error; but free_swap_and_cache() only trylocks a
1737
- * page, so it is just possible that the entry has been
1738
- * truncated or holepunched since swap was confirmed.
1739
- * shmem_undo_range() will have done some of the
1740
- * unaccounting, now delete_from_swap_cache() will do
1741
- * the rest.
1742
- * Reset swap.val? No, leave it so "failed" goes back to
1743
- * "repeat": reading a hole and writing should succeed.
1744
- */
1745
- if (error) {
1746
- mem_cgroup_cancel_charge(page, memcg, false);
1747
- delete_from_swap_cache(page);
1748
- }
1749
- }
1750
- if (error)
1751
- goto failed;
1752
-
1753
- mem_cgroup_commit_charge(page, memcg, true, false);
1754
-
1755
- spin_lock_irq(&info->lock);
1756
- info->swapped--;
1757
- shmem_recalc_inode(inode);
1758
- spin_unlock_irq(&info->lock);
1759
-
1760
- if (sgp == SGP_WRITE)
1761
- mark_page_accessed(page);
1762
-
1763
- delete_from_swap_cache(page);
1764
- set_page_dirty(page);
1765
- swap_free(swap);
1766
-
1767
- } else {
1768
- if (vma && userfaultfd_missing(vma)) {
1769
- *fault_type = handle_userfault(vmf, VM_UFFD_MISSING);
1770
- return 0;
1771
- }
1772
-
1773
- /* shmem_symlink() */
1774
- if (mapping->a_ops != &shmem_aops)
1775
- goto alloc_nohuge;
1776
- if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE)
1777
- goto alloc_nohuge;
1778
- if (shmem_huge == SHMEM_HUGE_FORCE)
1901
+ off = round_up(index, HPAGE_PMD_NR);
1902
+ i_size = round_up(i_size_read(inode), PAGE_SIZE);
1903
+ if (i_size >= HPAGE_PMD_SIZE &&
1904
+ i_size >> PAGE_SHIFT >= off)
17791905 goto alloc_huge;
1780
- switch (sbinfo->huge) {
1781
- loff_t i_size;
1782
- pgoff_t off;
1783
- case SHMEM_HUGE_NEVER:
1784
- goto alloc_nohuge;
1785
- case SHMEM_HUGE_WITHIN_SIZE:
1786
- off = round_up(index, HPAGE_PMD_NR);
1787
- i_size = round_up(i_size_read(inode), PAGE_SIZE);
1788
- if (i_size >= HPAGE_PMD_SIZE &&
1789
- i_size >> PAGE_SHIFT >= off)
1790
- goto alloc_huge;
1791
- /* fallthrough */
1792
- case SHMEM_HUGE_ADVISE:
1793
- if (sgp_huge == SGP_HUGE)
1794
- goto alloc_huge;
1795
- /* TODO: implement fadvise() hints */
1796
- goto alloc_nohuge;
1797
- }
1906
+
1907
+ fallthrough;
1908
+ }
1909
+ case SHMEM_HUGE_ADVISE:
1910
+ if (sgp_huge == SGP_HUGE)
1911
+ goto alloc_huge;
1912
+ /* TODO: implement fadvise() hints */
1913
+ goto alloc_nohuge;
1914
+ }
17981915
17991916 alloc_huge:
1800
- page = shmem_alloc_and_acct_page(gfp, inode, index, true);
1801
- if (IS_ERR(page)) {
1802
-alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, inode,
1803
- index, false);
1804
- }
1805
- if (IS_ERR(page)) {
1806
- int retry = 5;
1807
- error = PTR_ERR(page);
1808
- page = NULL;
1809
- if (error != -ENOSPC)
1810
- goto failed;
1811
- /*
1812
- * Try to reclaim some spece by splitting a huge page
1813
- * beyond i_size on the filesystem.
1814
- */
1815
- while (retry--) {
1816
- int ret;
1817
- ret = shmem_unused_huge_shrink(sbinfo, NULL, 1);
1818
- if (ret == SHRINK_STOP)
1819
- break;
1820
- if (ret)
1821
- goto alloc_nohuge;
1822
- }
1823
- goto failed;
1824
- }
1917
+ page = shmem_alloc_and_acct_page(gfp, inode, index, true);
1918
+ if (IS_ERR(page)) {
1919
+alloc_nohuge:
1920
+ page = shmem_alloc_and_acct_page(gfp, inode,
1921
+ index, false);
1922
+ }
1923
+ if (IS_ERR(page)) {
1924
+ int retry = 5;
18251925
1826
- if (PageTransHuge(page))
1827
- hindex = round_down(index, HPAGE_PMD_NR);
1828
- else
1829
- hindex = index;
1830
-
1831
- if (sgp == SGP_WRITE)
1832
- __SetPageReferenced(page);
1833
-
1834
- error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
1835
- PageTransHuge(page));
1836
- if (error)
1837
- goto unacct;
1838
- error = radix_tree_maybe_preload_order(gfp & GFP_RECLAIM_MASK,
1839
- compound_order(page));
1840
- if (!error) {
1841
- error = shmem_add_to_page_cache(page, mapping, hindex,
1842
- NULL);
1843
- radix_tree_preload_end();
1844
- }
1845
- if (error) {
1846
- mem_cgroup_cancel_charge(page, memcg,
1847
- PageTransHuge(page));
1848
- goto unacct;
1849
- }
1850
- mem_cgroup_commit_charge(page, memcg, false,
1851
- PageTransHuge(page));
1852
- lru_cache_add_anon(page);
1853
-
1854
- spin_lock_irq(&info->lock);
1855
- info->alloced += 1 << compound_order(page);
1856
- inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page);
1857
- shmem_recalc_inode(inode);
1858
- spin_unlock_irq(&info->lock);
1859
- alloced = true;
1860
-
1861
- if (PageTransHuge(page) &&
1862
- DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
1863
- hindex + HPAGE_PMD_NR - 1) {
1864
- /*
1865
- * Part of the huge page is beyond i_size: subject
1866
- * to shrink under memory pressure.
1867
- */
1868
- spin_lock(&sbinfo->shrinklist_lock);
1869
- /*
1870
- * _careful to defend against unlocked access to
1871
- * ->shrink_list in shmem_unused_huge_shrink()
1872
- */
1873
- if (list_empty_careful(&info->shrinklist)) {
1874
- list_add_tail(&info->shrinklist,
1875
- &sbinfo->shrinklist);
1876
- sbinfo->shrinklist_len++;
1877
- }
1878
- spin_unlock(&sbinfo->shrinklist_lock);
1879
- }
1880
-
1926
+ error = PTR_ERR(page);
1927
+ page = NULL;
1928
+ if (error != -ENOSPC)
1929
+ goto unlock;
18811930 /*
1882
- * Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
1931
+ * Try to reclaim some space by splitting a huge page
1932
+ * beyond i_size on the filesystem.
18831933 */
1884
- if (sgp == SGP_FALLOC)
1885
- sgp = SGP_WRITE;
1934
+ while (retry--) {
1935
+ int ret;
1936
+
1937
+ ret = shmem_unused_huge_shrink(sbinfo, NULL, 1);
1938
+ if (ret == SHRINK_STOP)
1939
+ break;
1940
+ if (ret)
1941
+ goto alloc_nohuge;
1942
+ }
1943
+ goto unlock;
1944
+ }
1945
+
1946
+ if (PageTransHuge(page))
1947
+ hindex = round_down(index, HPAGE_PMD_NR);
1948
+ else
1949
+ hindex = index;
1950
+
1951
+ if (sgp == SGP_WRITE)
1952
+ __SetPageReferenced(page);
1953
+
1954
+ error = shmem_add_to_page_cache(page, mapping, hindex,
1955
+ NULL, gfp & GFP_RECLAIM_MASK,
1956
+ charge_mm);
1957
+ if (error)
1958
+ goto unacct;
1959
+ lru_cache_add(page);
1960
+
1961
+ spin_lock_irq(&info->lock);
1962
+ info->alloced += compound_nr(page);
1963
+ inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page);
1964
+ shmem_recalc_inode(inode);
1965
+ spin_unlock_irq(&info->lock);
1966
+ alloced = true;
1967
+
1968
+ if (PageTransHuge(page) &&
1969
+ DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
1970
+ hindex + HPAGE_PMD_NR - 1) {
1971
+ /*
1972
+ * Part of the huge page is beyond i_size: subject
1973
+ * to shrink under memory pressure.
1974
+ */
1975
+ spin_lock(&sbinfo->shrinklist_lock);
1976
+ /*
1977
+ * _careful to defend against unlocked access to
1978
+ * ->shrink_list in shmem_unused_huge_shrink()
1979
+ */
1980
+ if (list_empty_careful(&info->shrinklist)) {
1981
+ list_add_tail(&info->shrinklist,
1982
+ &sbinfo->shrinklist);
1983
+ sbinfo->shrinklist_len++;
1984
+ }
1985
+ spin_unlock(&sbinfo->shrinklist_lock);
1986
+ }
1987
+
1988
+ /*
1989
+ * Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
1990
+ */
1991
+ if (sgp == SGP_FALLOC)
1992
+ sgp = SGP_WRITE;
18861993 clear:
1887
- /*
1888
- * Let SGP_WRITE caller clear ends if write does not fill page;
1889
- * but SGP_FALLOC on a page fallocated earlier must initialize
1890
- * it now, lest undo on failure cancel our earlier guarantee.
1891
- */
1892
- if (sgp != SGP_WRITE && !PageUptodate(page)) {
1893
- struct page *head = compound_head(page);
1894
- int i;
1994
+ /*
1995
+ * Let SGP_WRITE caller clear ends if write does not fill page;
1996
+ * but SGP_FALLOC on a page fallocated earlier must initialize
1997
+ * it now, lest undo on failure cancel our earlier guarantee.
1998
+ */
1999
+ if (sgp != SGP_WRITE && !PageUptodate(page)) {
2000
+ int i;
18952001
1896
- for (i = 0; i < (1 << compound_order(head)); i++) {
1897
- clear_highpage(head + i);
1898
- flush_dcache_page(head + i);
1899
- }
1900
- SetPageUptodate(head);
2002
+ for (i = 0; i < compound_nr(page); i++) {
2003
+ clear_highpage(page + i);
2004
+ flush_dcache_page(page + i);
19012005 }
2006
+ SetPageUptodate(page);
19022007 }
19032008
19042009 /* Perhaps the file has been truncated since we checked */
@@ -1914,6 +2019,7 @@
19142019 error = -EINVAL;
19152020 goto unlock;
19162021 }
2022
+out:
19172023 *pagep = page + index - hindex;
19182024 return 0;
19192025
@@ -1921,16 +2027,13 @@
19212027 * Error recovery.
19222028 */
19232029 unacct:
1924
- shmem_inode_unacct_blocks(inode, 1 << compound_order(page));
2030
+ shmem_inode_unacct_blocks(inode, compound_nr(page));
19252031
19262032 if (PageTransHuge(page)) {
19272033 unlock_page(page);
19282034 put_page(page);
19292035 goto alloc_nohuge;
19302036 }
1931
-failed:
1932
- if (swap.val && !shmem_confirm_swap(mapping, index, swap))
1933
- error = -EEXIST;
19342037 unlock:
19352038 if (page) {
19362039 unlock_page(page);
@@ -1942,7 +2045,7 @@
19422045 spin_unlock_irq(&info->lock);
19432046 goto repeat;
19442047 }
1945
- if (error == -EEXIST) /* from above or from radix_tree_insert */
2048
+ if (error == -EEXIST)
19462049 goto repeat;
19472050 return error;
19482051 }
@@ -1994,16 +2097,14 @@
19942097 shmem_falloc->waitq &&
19952098 vmf->pgoff >= shmem_falloc->start &&
19962099 vmf->pgoff < shmem_falloc->next) {
2100
+ struct file *fpin;
19972101 wait_queue_head_t *shmem_falloc_waitq;
19982102 DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);
19992103
20002104 ret = VM_FAULT_NOPAGE;
2001
- if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) &&
2002
- !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
2003
- /* It's polite to up mmap_sem if we can */
2004
- up_read(&vma->vm_mm->mmap_sem);
2105
+ fpin = maybe_unlock_mmap_for_io(vmf, NULL);
2106
+ if (fpin)
20052107 ret = VM_FAULT_RETRY;
2006
- }
20072108
20082109 shmem_falloc_waitq = shmem_falloc->waitq;
20092110 prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
@@ -2021,6 +2122,9 @@
20212122 spin_lock(&inode->i_lock);
20222123 finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
20232124 spin_unlock(&inode->i_lock);
2125
+
2126
+ if (fpin)
2127
+ fput(fpin);
20242128 return ret;
20252129 }
20262130 spin_unlock(&inode->i_lock);
@@ -2059,7 +2163,7 @@
20592163 get_area = current->mm->get_unmapped_area;
20602164 addr = get_area(file, uaddr, len, pgoff, flags);
20612165
2062
- if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
2166
+ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
20632167 return addr;
20642168 if (IS_ERR_VALUE(addr))
20652169 return addr;
@@ -2179,26 +2283,18 @@
21792283 static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
21802284 {
21812285 struct shmem_inode_info *info = SHMEM_I(file_inode(file));
2286
+ int ret;
21822287
2183
- if (info->seals & F_SEAL_FUTURE_WRITE) {
2184
- /*
2185
- * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
2186
- * "future write" seal active.
2187
- */
2188
- if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
2189
- return -EPERM;
2288
+ ret = seal_check_future_write(info->seals, vma);
2289
+ if (ret)
2290
+ return ret;
21902291
2191
- /*
2192
- * Since the F_SEAL_FUTURE_WRITE seals allow for a MAP_SHARED
2193
- * read-only mapping, take care to not allow mprotect to revert
2194
- * protections.
2195
- */
2196
- vma->vm_flags &= ~(VM_MAYWRITE);
2197
- }
2292
+ /* arm64 - allow memory tagging on RAM-based files */
2293
+ vma->vm_flags |= VM_MTE_ALLOWED;
21982294
21992295 file_accessed(file);
22002296 vma->vm_ops = &shmem_vm_ops;
2201
- if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
2297
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
22022298 ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
22032299 (vma->vm_end & HPAGE_PMD_MASK)) {
22042300 khugepaged_enter(vma, vma->vm_flags);
@@ -2212,13 +2308,14 @@
22122308 struct inode *inode;
22132309 struct shmem_inode_info *info;
22142310 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
2311
+ ino_t ino;
22152312
2216
- if (shmem_reserve_inode(sb))
2313
+ if (shmem_reserve_inode(sb, &ino))
22172314 return NULL;
22182315
22192316 inode = new_inode(sb);
22202317 if (inode) {
2221
- inode->i_ino = get_next_ino();
2318
+ inode->i_ino = ino;
22222319 inode_init_owner(inode, dir, mode);
22232320 inode->i_blocks = 0;
22242321 inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
@@ -2226,6 +2323,7 @@
22262323 info = SHMEM_I(inode);
22272324 memset(info, 0, (char *)inode - (char *)info);
22282325 spin_lock_init(&info->lock);
2326
+ atomic_set(&info->stop_eviction, 0);
22292327 info->seals = F_SEAL_SEAL;
22302328 info->flags = flags & VM_NORESERVE;
22312329 INIT_LIST_HEAD(&info->shrinklist);
@@ -2272,28 +2370,25 @@
22722370 return mapping->a_ops == &shmem_aops;
22732371 }
22742372
2275
-static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
2276
- pmd_t *dst_pmd,
2277
- struct vm_area_struct *dst_vma,
2278
- unsigned long dst_addr,
2279
- unsigned long src_addr,
2280
- bool zeropage,
2281
- struct page **pagep)
2373
+#ifdef CONFIG_USERFAULTFD
2374
+int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
2375
+ pmd_t *dst_pmd,
2376
+ struct vm_area_struct *dst_vma,
2377
+ unsigned long dst_addr,
2378
+ unsigned long src_addr,
2379
+ bool zeropage,
2380
+ struct page **pagep)
22822381 {
22832382 struct inode *inode = file_inode(dst_vma->vm_file);
22842383 struct shmem_inode_info *info = SHMEM_I(inode);
22852384 struct address_space *mapping = inode->i_mapping;
22862385 gfp_t gfp = mapping_gfp_mask(mapping);
22872386 pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
2288
- struct mem_cgroup *memcg;
2289
- spinlock_t *ptl;
22902387 void *page_kaddr;
22912388 struct page *page;
2292
- pte_t _dst_pte, *dst_pte;
22932389 int ret;
2294
- pgoff_t offset, max_off;
2390
+ pgoff_t max_off;
22952391
2296
- ret = -ENOMEM;
22972392 if (!shmem_inode_acct_block(inode, 1)) {
22982393 /*
22992394 * We may have got a page, returned -ENOENT triggering a retry,
@@ -2304,29 +2399,30 @@
23042399 put_page(*pagep);
23052400 *pagep = NULL;
23062401 }
2307
- goto out;
2402
+ return -ENOMEM;
23082403 }
23092404
23102405 if (!*pagep) {
2406
+ ret = -ENOMEM;
23112407 page = shmem_alloc_page(gfp, info, pgoff);
23122408 if (!page)
23132409 goto out_unacct_blocks;
23142410
2315
- if (!zeropage) { /* mcopy_atomic */
2411
+ if (!zeropage) { /* COPY */
23162412 page_kaddr = kmap_atomic(page);
23172413 ret = copy_from_user(page_kaddr,
23182414 (const void __user *)src_addr,
23192415 PAGE_SIZE);
23202416 kunmap_atomic(page_kaddr);
23212417
2322
- /* fallback to copy_from_user outside mmap_sem */
2418
+ /* fallback to copy_from_user outside mmap_lock */
23232419 if (unlikely(ret)) {
23242420 *pagep = page;
2325
- shmem_inode_unacct_blocks(inode, 1);
2421
+ ret = -ENOENT;
23262422 /* don't free the page */
2327
- return -ENOENT;
2423
+ goto out_unacct_blocks;
23282424 }
2329
- } else { /* mfill_zeropage_atomic */
2425
+ } else { /* ZEROPAGE */
23302426 clear_highpage(page);
23312427 }
23322428 } else {
@@ -2334,57 +2430,26 @@
23342430 *pagep = NULL;
23352431 }
23362432
2337
- VM_BUG_ON(PageLocked(page) || PageSwapBacked(page));
2433
+ VM_BUG_ON(PageLocked(page));
2434
+ VM_BUG_ON(PageSwapBacked(page));
23382435 __SetPageLocked(page);
23392436 __SetPageSwapBacked(page);
23402437 __SetPageUptodate(page);
23412438
23422439 ret = -EFAULT;
2343
- offset = linear_page_index(dst_vma, dst_addr);
23442440 max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
2345
- if (unlikely(offset >= max_off))
2441
+ if (unlikely(pgoff >= max_off))
23462442 goto out_release;
23472443
2348
- ret = mem_cgroup_try_charge_delay(page, dst_mm, gfp, &memcg, false);
2444
+ ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL,
2445
+ gfp & GFP_RECLAIM_MASK, dst_mm);
23492446 if (ret)
23502447 goto out_release;
23512448
2352
- ret = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK);
2353
- if (!ret) {
2354
- ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL);
2355
- radix_tree_preload_end();
2356
- }
2449
+ ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
2450
+ page, true, false);
23572451 if (ret)
2358
- goto out_release_uncharge;
2359
-
2360
- mem_cgroup_commit_charge(page, memcg, false, false);
2361
-
2362
- _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
2363
- if (dst_vma->vm_flags & VM_WRITE)
2364
- _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));
2365
- else {
2366
- /*
2367
- * We don't set the pte dirty if the vma has no
2368
- * VM_WRITE permission, so mark the page dirty or it
2369
- * could be freed from under us. We could do it
2370
- * unconditionally before unlock_page(), but doing it
2371
- * only if VM_WRITE is not set is faster.
2372
- */
2373
- set_page_dirty(page);
2374
- }
2375
-
2376
- dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
2377
-
2378
- ret = -EFAULT;
2379
- max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
2380
- if (unlikely(offset >= max_off))
2381
- goto out_release_uncharge_unlock;
2382
-
2383
- ret = -EEXIST;
2384
- if (!pte_none(*dst_pte))
2385
- goto out_release_uncharge_unlock;
2386
-
2387
- lru_cache_add_anon(page);
2452
+ goto out_delete_from_cache;
23882453
23892454 spin_lock_irq(&info->lock);
23902455 info->alloced++;
....@@ -2392,52 +2457,19 @@
23922457 shmem_recalc_inode(inode);
23932458 spin_unlock_irq(&info->lock);
23942459
2395
- inc_mm_counter(dst_mm, mm_counter_file(page));
2396
- page_add_file_rmap(page, false);
2397
- set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
2398
-
2399
- /* No need to invalidate - it was non-present before */
2400
- update_mmu_cache(dst_vma, dst_addr, dst_pte);
2401
- pte_unmap_unlock(dst_pte, ptl);
2460
+ SetPageDirty(page);
24022461 unlock_page(page);
2403
- ret = 0;
2404
-out:
2405
- return ret;
2406
-out_release_uncharge_unlock:
2407
- pte_unmap_unlock(dst_pte, ptl);
2408
- ClearPageDirty(page);
2462
+ return 0;
2463
+out_delete_from_cache:
24092464 delete_from_page_cache(page);
2410
-out_release_uncharge:
2411
- mem_cgroup_cancel_charge(page, memcg, false);
24122465 out_release:
24132466 unlock_page(page);
24142467 put_page(page);
24152468 out_unacct_blocks:
24162469 shmem_inode_unacct_blocks(inode, 1);
2417
- goto out;
2470
+ return ret;
24182471 }
2419
-
2420
-int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
2421
- pmd_t *dst_pmd,
2422
- struct vm_area_struct *dst_vma,
2423
- unsigned long dst_addr,
2424
- unsigned long src_addr,
2425
- struct page **pagep)
2426
-{
2427
- return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
2428
- dst_addr, src_addr, false, pagep);
2429
-}
2430
-
2431
-int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm,
2432
- pmd_t *dst_pmd,
2433
- struct vm_area_struct *dst_vma,
2434
- unsigned long dst_addr)
2435
-{
2436
- struct page *page = NULL;
2437
-
2438
- return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
2439
- dst_addr, 0, true, &page);
2440
-}
2472
+#endif /* CONFIG_USERFAULTFD */
24412473
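/*
 * Illustrative sketch, not part of the patch above: shmem_mfill_atomic_pte()
 * is what services a userspace UFFDIO_COPY (or UFFDIO_ZEROPAGE) issued
 * against a missing page of a tmpfs/memfd mapping.  A minimal caller, with
 * error handling trimmed, could look like the following; the names and
 * constants are the standard userfaultfd uapi, and userfaultfd() may need
 * CAP_SYS_PTRACE or vm.unprivileged_userfaultfd=1 depending on the kernel
 * configuration.
 */
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	int memfd = memfd_create("uffd-shmem-demo", 0);
	int uffd = syscall(SYS_userfaultfd, O_CLOEXEC);
	struct uffdio_api api = { .api = UFFD_API };
	char *dst, *src;

	ftruncate(memfd, page);
	dst = mmap(NULL, page, PROT_READ | PROT_WRITE, MAP_SHARED, memfd, 0);
	ioctl(uffd, UFFDIO_API, &api);

	struct uffdio_register reg = {
		.range = { .start = (unsigned long)dst, .len = page },
		.mode  = UFFDIO_REGISTER_MODE_MISSING,
	};
	ioctl(uffd, UFFDIO_REGISTER, &reg);

	/* Source buffer to be copied atomically into the missing shmem page. */
	src = mmap(NULL, page, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	memset(src, 0x5a, page);

	struct uffdio_copy copy = {
		.dst = (unsigned long)dst,
		.src = (unsigned long)src,
		.len = page,
	};
	ioctl(uffd, UFFDIO_COPY, &copy);	/* -> shmem_mfill_atomic_pte() */

	return dst[0] == 0x5a ? 0 : 1;
}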
24422474 #ifdef CONFIG_TMPFS
24432475 static const struct inode_operations shmem_symlink_inode_operations;
....@@ -2617,7 +2649,7 @@
26172649 }
26182650
26192651 /*
2620
- * llseek SEEK_DATA or SEEK_HOLE through the radix_tree.
2652
+ * llseek SEEK_DATA or SEEK_HOLE through the page cache.
26212653 */
26222654 static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
26232655 pgoff_t index, pgoff_t end, int whence)
....@@ -2647,7 +2679,7 @@
26472679 index = indices[i];
26482680 }
26492681 page = pvec.pages[i];
2650
- if (page && !radix_tree_exceptional_entry(page)) {
2682
+ if (page && !xa_is_value(page)) {
26512683 if (!PageUptodate(page))
26522684 page = NULL;
26532685 }
....@@ -2943,7 +2975,7 @@
29432975 * first link must skip that, to get the accounting right.
29442976 */
29452977 if (inode->i_nlink) {
2946
- ret = shmem_reserve_inode(inode->i_sb);
2978
+ ret = shmem_reserve_inode(inode->i_sb, NULL);
29472979 if (ret)
29482980 goto out;
29492981 }
....@@ -3095,12 +3127,9 @@
30953127
30963128 error = security_inode_init_security(inode, dir, &dentry->d_name,
30973129 shmem_initxattrs, NULL);
3098
- if (error) {
3099
- if (error != -EOPNOTSUPP) {
3100
- iput(inode);
3101
- return error;
3102
- }
3103
- error = 0;
3130
+ if (error && error != -EOPNOTSUPP) {
3131
+ iput(inode);
3132
+ return error;
31043133 }
31053134
31063135 inode->i_size = len-1;
....@@ -3192,7 +3221,7 @@
31923221 new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len,
31933222 GFP_KERNEL);
31943223 if (!new_xattr->name) {
3195
- kfree(new_xattr);
3224
+ kvfree(new_xattr);
31963225 return -ENOMEM;
31973226 }
31983227
....@@ -3209,7 +3238,8 @@
32093238
32103239 static int shmem_xattr_handler_get(const struct xattr_handler *handler,
32113240 struct dentry *unused, struct inode *inode,
3212
- const char *name, void *buffer, size_t size)
3241
+ const char *name, void *buffer, size_t size,
3242
+ int flags)
32133243 {
32143244 struct shmem_inode_info *info = SHMEM_I(inode);
32153245
....@@ -3225,7 +3255,7 @@
32253255 struct shmem_inode_info *info = SHMEM_I(inode);
32263256
32273257 name = xattr_full_name(handler, name);
3228
- return simple_xattr_set(&info->xattrs, name, value, size, flags);
3258
+ return simple_xattr_set(&info->xattrs, name, value, size, flags, NULL);
32293259 }
32303260
32313261 static const struct xattr_handler shmem_security_xattr_handler = {
....@@ -3352,16 +3382,162 @@
33523382 .fh_to_dentry = shmem_fh_to_dentry,
33533383 };
33543384
3355
-static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
3356
- bool remount)
3385
+enum shmem_param {
3386
+ Opt_gid,
3387
+ Opt_huge,
3388
+ Opt_mode,
3389
+ Opt_mpol,
3390
+ Opt_nr_blocks,
3391
+ Opt_nr_inodes,
3392
+ Opt_size,
3393
+ Opt_uid,
3394
+ Opt_inode32,
3395
+ Opt_inode64,
3396
+};
3397
+
3398
+static const struct constant_table shmem_param_enums_huge[] = {
3399
+ {"never", SHMEM_HUGE_NEVER },
3400
+ {"always", SHMEM_HUGE_ALWAYS },
3401
+ {"within_size", SHMEM_HUGE_WITHIN_SIZE },
3402
+ {"advise", SHMEM_HUGE_ADVISE },
3403
+ {}
3404
+};
3405
+
3406
+const struct fs_parameter_spec shmem_fs_parameters[] = {
3407
+ fsparam_u32 ("gid", Opt_gid),
3408
+ fsparam_enum ("huge", Opt_huge, shmem_param_enums_huge),
3409
+ fsparam_u32oct("mode", Opt_mode),
3410
+ fsparam_string("mpol", Opt_mpol),
3411
+ fsparam_string("nr_blocks", Opt_nr_blocks),
3412
+ fsparam_string("nr_inodes", Opt_nr_inodes),
3413
+ fsparam_string("size", Opt_size),
3414
+ fsparam_u32 ("uid", Opt_uid),
3415
+ fsparam_flag ("inode32", Opt_inode32),
3416
+ fsparam_flag ("inode64", Opt_inode64),
3417
+ {}
3418
+};
3419
+
3420
+static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
33573421 {
3358
- char *this_char, *value, *rest;
3359
- struct mempolicy *mpol = NULL;
3360
- uid_t uid;
3361
- gid_t gid;
3422
+ struct shmem_options *ctx = fc->fs_private;
3423
+ struct fs_parse_result result;
3424
+ unsigned long long size;
3425
+ char *rest;
3426
+ int opt;
3427
+ kuid_t kuid;
3428
+ kgid_t kgid;
3429
+
3430
+ opt = fs_parse(fc, shmem_fs_parameters, param, &result);
3431
+ if (opt < 0)
3432
+ return opt;
3433
+
3434
+ switch (opt) {
3435
+ case Opt_size:
3436
+ size = memparse(param->string, &rest);
3437
+ if (*rest == '%') {
3438
+ size <<= PAGE_SHIFT;
3439
+ size *= totalram_pages();
3440
+ do_div(size, 100);
3441
+ rest++;
3442
+ }
3443
+ if (*rest)
3444
+ goto bad_value;
3445
+ ctx->blocks = DIV_ROUND_UP(size, PAGE_SIZE);
3446
+ ctx->seen |= SHMEM_SEEN_BLOCKS;
3447
+ break;
3448
+ case Opt_nr_blocks:
3449
+ ctx->blocks = memparse(param->string, &rest);
3450
+ if (*rest)
3451
+ goto bad_value;
3452
+ ctx->seen |= SHMEM_SEEN_BLOCKS;
3453
+ break;
3454
+ case Opt_nr_inodes:
3455
+ ctx->inodes = memparse(param->string, &rest);
3456
+ if (*rest)
3457
+ goto bad_value;
3458
+ ctx->seen |= SHMEM_SEEN_INODES;
3459
+ break;
3460
+ case Opt_mode:
3461
+ ctx->mode = result.uint_32 & 07777;
3462
+ break;
3463
+ case Opt_uid:
3464
+ kuid = make_kuid(current_user_ns(), result.uint_32);
3465
+ if (!uid_valid(kuid))
3466
+ goto bad_value;
3467
+
3468
+ /*
3469
+ * The requested uid must be representable in the
3470
+ * filesystem's idmapping.
3471
+ */
3472
+ if (!kuid_has_mapping(fc->user_ns, kuid))
3473
+ goto bad_value;
3474
+
3475
+ ctx->uid = kuid;
3476
+ break;
3477
+ case Opt_gid:
3478
+ kgid = make_kgid(current_user_ns(), result.uint_32);
3479
+ if (!gid_valid(kgid))
3480
+ goto bad_value;
3481
+
3482
+ /*
3483
+ * The requested gid must be representable in the
3484
+ * filesystem's idmapping.
3485
+ */
3486
+ if (!kgid_has_mapping(fc->user_ns, kgid))
3487
+ goto bad_value;
3488
+
3489
+ ctx->gid = kgid;
3490
+ break;
3491
+ case Opt_huge:
3492
+ ctx->huge = result.uint_32;
3493
+ if (ctx->huge != SHMEM_HUGE_NEVER &&
3494
+ !(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
3495
+ has_transparent_hugepage()))
3496
+ goto unsupported_parameter;
3497
+ ctx->seen |= SHMEM_SEEN_HUGE;
3498
+ break;
3499
+ case Opt_mpol:
3500
+ if (IS_ENABLED(CONFIG_NUMA)) {
3501
+ mpol_put(ctx->mpol);
3502
+ ctx->mpol = NULL;
3503
+ if (mpol_parse_str(param->string, &ctx->mpol))
3504
+ goto bad_value;
3505
+ break;
3506
+ }
3507
+ goto unsupported_parameter;
3508
+ case Opt_inode32:
3509
+ ctx->full_inums = false;
3510
+ ctx->seen |= SHMEM_SEEN_INUMS;
3511
+ break;
3512
+ case Opt_inode64:
3513
+ if (sizeof(ino_t) < 8) {
3514
+ return invalfc(fc,
3515
+ "Cannot use inode64 with <64bit inums in kernel\n");
3516
+ }
3517
+ ctx->full_inums = true;
3518
+ ctx->seen |= SHMEM_SEEN_INUMS;
3519
+ break;
3520
+ }
3521
+ return 0;
3522
+
3523
+unsupported_parameter:
3524
+ return invalfc(fc, "Unsupported parameter '%s'", param->key);
3525
+bad_value:
3526
+ return invalfc(fc, "Bad value for '%s'", param->key);
3527
+}
3528
+
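/*
 * Illustrative sketch, not part of the patch: with the conversion to the
 * fs_context API, each key set through the new mount syscalls arrives in
 * shmem_parse_one() above as a single fs_parameter.  This assumes headers
 * new enough to define SYS_fsopen/SYS_fsconfig/SYS_fsmount and the
 * FSCONFIG_* constants in <linux/mount.h> (Linux 5.2+); most error
 * handling is omitted.
 */
#include <linux/mount.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Thin syscall wrappers; glibc may not ship these yet. */
static int sys_fsopen(const char *fsname, unsigned int flags)
{
	return (int)syscall(SYS_fsopen, fsname, flags);
}

static int sys_fsconfig(int fd, unsigned int cmd, const char *key,
			const char *value, int aux)
{
	return (int)syscall(SYS_fsconfig, fd, cmd, key, value, aux);
}

static int sys_fsmount(int fd, unsigned int flags, unsigned int mnt_flags)
{
	return (int)syscall(SYS_fsmount, fd, flags, mnt_flags);
}

/* Returns a detached mount fd (attach it with move_mount(2)), or -1. */
int tmpfs_new_api_mount(void)
{
	int fsfd = sys_fsopen("tmpfs", 0);

	if (fsfd < 0)
		return -1;

	/* Each fsconfig() call below reaches shmem_parse_one() as one key. */
	sys_fsconfig(fsfd, FSCONFIG_SET_STRING, "size", "50%", 0);
	sys_fsconfig(fsfd, FSCONFIG_SET_STRING, "nr_inodes", "1m", 0);
	sys_fsconfig(fsfd, FSCONFIG_SET_FLAG, "inode64", NULL, 0);
	sys_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);

	return sys_fsmount(fsfd, 0, 0);
}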
3529
+static int shmem_parse_options(struct fs_context *fc, void *data)
3530
+{
3531
+ char *options = data;
3532
+
3533
+ if (options) {
3534
+ int err = security_sb_eat_lsm_opts(options, &fc->security);
3535
+ if (err)
3536
+ return err;
3537
+ }
33623538
33633539 while (options != NULL) {
3364
- this_char = options;
3540
+ char *this_char = options;
33653541 for (;;) {
33663542 /*
33673543 * NUL-terminate this option: unfortunately,
....@@ -3377,139 +3553,91 @@
33773553 break;
33783554 }
33793555 }
3380
- if (!*this_char)
3381
- continue;
3382
- if ((value = strchr(this_char,'=')) != NULL) {
3383
- *value++ = 0;
3384
- } else {
3385
- pr_err("tmpfs: No value for mount option '%s'\n",
3386
- this_char);
3387
- goto error;
3388
- }
3556
+ if (*this_char) {
3557
+ char *value = strchr(this_char,'=');
3558
+ size_t len = 0;
3559
+ int err;
33893560
3390
- if (!strcmp(this_char,"size")) {
3391
- unsigned long long size;
3392
- size = memparse(value,&rest);
3393
- if (*rest == '%') {
3394
- size <<= PAGE_SHIFT;
3395
- size *= totalram_pages;
3396
- do_div(size, 100);
3397
- rest++;
3561
+ if (value) {
3562
+ *value++ = '\0';
3563
+ len = strlen(value);
33983564 }
3399
- if (*rest)
3400
- goto bad_val;
3401
- sbinfo->max_blocks =
3402
- DIV_ROUND_UP(size, PAGE_SIZE);
3403
- } else if (!strcmp(this_char,"nr_blocks")) {
3404
- sbinfo->max_blocks = memparse(value, &rest);
3405
- if (*rest)
3406
- goto bad_val;
3407
- } else if (!strcmp(this_char,"nr_inodes")) {
3408
- sbinfo->max_inodes = memparse(value, &rest);
3409
- if (*rest)
3410
- goto bad_val;
3411
- } else if (!strcmp(this_char,"mode")) {
3412
- if (remount)
3413
- continue;
3414
- sbinfo->mode = simple_strtoul(value, &rest, 8) & 07777;
3415
- if (*rest)
3416
- goto bad_val;
3417
- } else if (!strcmp(this_char,"uid")) {
3418
- if (remount)
3419
- continue;
3420
- uid = simple_strtoul(value, &rest, 0);
3421
- if (*rest)
3422
- goto bad_val;
3423
- sbinfo->uid = make_kuid(current_user_ns(), uid);
3424
- if (!uid_valid(sbinfo->uid))
3425
- goto bad_val;
3426
- } else if (!strcmp(this_char,"gid")) {
3427
- if (remount)
3428
- continue;
3429
- gid = simple_strtoul(value, &rest, 0);
3430
- if (*rest)
3431
- goto bad_val;
3432
- sbinfo->gid = make_kgid(current_user_ns(), gid);
3433
- if (!gid_valid(sbinfo->gid))
3434
- goto bad_val;
3435
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
3436
- } else if (!strcmp(this_char, "huge")) {
3437
- int huge;
3438
- huge = shmem_parse_huge(value);
3439
- if (huge < 0)
3440
- goto bad_val;
3441
- if (!has_transparent_hugepage() &&
3442
- huge != SHMEM_HUGE_NEVER)
3443
- goto bad_val;
3444
- sbinfo->huge = huge;
3445
-#endif
3446
-#ifdef CONFIG_NUMA
3447
- } else if (!strcmp(this_char,"mpol")) {
3448
- mpol_put(mpol);
3449
- mpol = NULL;
3450
- if (mpol_parse_str(value, &mpol))
3451
- goto bad_val;
3452
-#endif
3453
- } else {
3454
- pr_err("tmpfs: Bad mount option %s\n", this_char);
3455
- goto error;
3565
+ err = vfs_parse_fs_string(fc, this_char, value, len);
3566
+ if (err < 0)
3567
+ return err;
34563568 }
34573569 }
3458
- sbinfo->mpol = mpol;
34593570 return 0;
3460
-
3461
-bad_val:
3462
- pr_err("tmpfs: Bad value '%s' for mount option '%s'\n",
3463
- value, this_char);
3464
-error:
3465
- mpol_put(mpol);
3466
- return 1;
3467
-
34683571 }
34693572
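/*
 * Illustrative sketch, not part of the patch: a traditional mount(2) data
 * string still works.  shmem_parse_options() above splits it and feeds
 * each key=value pair to vfs_parse_fs_string(), i.e. into
 * shmem_parse_one().  Sizes go through memparse(), so suffixes and
 * percentages such as "nr_inodes=1m" and "size=50%" are accepted.
 */
#include <sys/mount.h>

int tmpfs_legacy_mount(const char *target)
{
	return mount("tmpfs", target, "tmpfs", MS_NOSUID | MS_NODEV,
		     "size=50%,nr_inodes=1m,mode=1777");
}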
3470
-static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
3573
+/*
3574
+ * Reconfigure a shmem filesystem.
3575
+ *
3576
+ * Note that we disallow change from limited->unlimited blocks/inodes while any
3577
+ * are in use; but we must separately disallow unlimited->limited, because in
3578
+ * that case we have no record of how much is already in use.
3579
+ */
3580
+static int shmem_reconfigure(struct fs_context *fc)
34713581 {
3472
- struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
3473
- struct shmem_sb_info config = *sbinfo;
3582
+ struct shmem_options *ctx = fc->fs_private;
3583
+ struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb);
34743584 unsigned long inodes;
3475
- int error = -EINVAL;
3476
-
3477
- config.mpol = NULL;
3478
- if (shmem_parse_options(data, &config, true))
3479
- return error;
3585
+ const char *err;
34803586
34813587 spin_lock(&sbinfo->stat_lock);
34823588 inodes = sbinfo->max_inodes - sbinfo->free_inodes;
3483
- if (percpu_counter_compare(&sbinfo->used_blocks, config.max_blocks) > 0)
3484
- goto out;
3485
- if (config.max_inodes < inodes)
3486
- goto out;
3487
- /*
3488
- * Those tests disallow limited->unlimited while any are in use;
3489
- * but we must separately disallow unlimited->limited, because
3490
- * in that case we have no record of how much is already in use.
3491
- */
3492
- if (config.max_blocks && !sbinfo->max_blocks)
3493
- goto out;
3494
- if (config.max_inodes && !sbinfo->max_inodes)
3495
- goto out;
3589
+ if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) {
3590
+ if (!sbinfo->max_blocks) {
3591
+ err = "Cannot retroactively limit size";
3592
+ goto out;
3593
+ }
3594
+ if (percpu_counter_compare(&sbinfo->used_blocks,
3595
+ ctx->blocks) > 0) {
3596
+ err = "Too small a size for current use";
3597
+ goto out;
3598
+ }
3599
+ }
3600
+ if ((ctx->seen & SHMEM_SEEN_INODES) && ctx->inodes) {
3601
+ if (!sbinfo->max_inodes) {
3602
+ err = "Cannot retroactively limit inodes";
3603
+ goto out;
3604
+ }
3605
+ if (ctx->inodes < inodes) {
3606
+ err = "Too few inodes for current use";
3607
+ goto out;
3608
+ }
3609
+ }
34963610
3497
- error = 0;
3498
- sbinfo->huge = config.huge;
3499
- sbinfo->max_blocks = config.max_blocks;
3500
- sbinfo->max_inodes = config.max_inodes;
3501
- sbinfo->free_inodes = config.max_inodes - inodes;
3611
+ if ((ctx->seen & SHMEM_SEEN_INUMS) && !ctx->full_inums &&
3612
+ sbinfo->next_ino > UINT_MAX) {
3613
+ err = "Current inum too high to switch to 32-bit inums";
3614
+ goto out;
3615
+ }
3616
+
3617
+ if (ctx->seen & SHMEM_SEEN_HUGE)
3618
+ sbinfo->huge = ctx->huge;
3619
+ if (ctx->seen & SHMEM_SEEN_INUMS)
3620
+ sbinfo->full_inums = ctx->full_inums;
3621
+ if (ctx->seen & SHMEM_SEEN_BLOCKS)
3622
+ sbinfo->max_blocks = ctx->blocks;
3623
+ if (ctx->seen & SHMEM_SEEN_INODES) {
3624
+ sbinfo->max_inodes = ctx->inodes;
3625
+ sbinfo->free_inodes = ctx->inodes - inodes;
3626
+ }
35023627
35033628 /*
35043629 * Preserve previous mempolicy unless mpol remount option was specified.
35053630 */
3506
- if (config.mpol) {
3631
+ if (ctx->mpol) {
35073632 mpol_put(sbinfo->mpol);
3508
- sbinfo->mpol = config.mpol; /* transfers initial ref */
3633
+ sbinfo->mpol = ctx->mpol; /* transfers initial ref */
3634
+ ctx->mpol = NULL;
35093635 }
3636
+ spin_unlock(&sbinfo->stat_lock);
3637
+ return 0;
35103638 out:
35113639 spin_unlock(&sbinfo->stat_lock);
3512
- return error;
3640
+ return invalfc(fc, "%s", err);
35133641 }
35143642
35153643 static int shmem_show_options(struct seq_file *seq, struct dentry *root)
....@@ -3529,7 +3657,30 @@
35293657 if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
35303658 seq_printf(seq, ",gid=%u",
35313659 from_kgid_munged(&init_user_ns, sbinfo->gid));
3532
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
3660
+
3661
+ /*
3662
+ * Showing inode{64,32} might be useful even if it's the system default,
3663
+ * since then people don't have to resort to checking both here and
3664
+ * /proc/config.gz to confirm 64-bit inums were successfully applied
3665
+ * (which may not even exist if IKCONFIG_PROC isn't enabled).
3666
+ *
3667
+ * We hide it when inode64 isn't the default and we are using 32-bit
3668
+ * inodes, since that probably just means the feature isn't even under
3669
+ * consideration.
3670
+ *
3671
+ * As such:
3672
+ *
3673
+ *                    +-----------------+-----------------+
3674
+ *                    | TMPFS_INODE64=y | TMPFS_INODE64=n |
3675
+ *  +------------------+-----------------+-----------------+
3676
+ *  | full_inums=true  | show            | show            |
3677
+ *  | full_inums=false | show            | hide            |
3678
+ *  +------------------+-----------------+-----------------+
3679
+ *
3680
+ */
3681
+ if (IS_ENABLED(CONFIG_TMPFS_INODE64) || sbinfo->full_inums)
3682
+ seq_printf(seq, ",inode%d", (sbinfo->full_inums ? 64 : 32));
3683
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
35333684 /* Rightly or wrongly, show huge mount option unmasked by shmem_huge */
35343685 if (sbinfo->huge)
35353686 seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge));
....@@ -3544,14 +3695,16 @@
35443695 {
35453696 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
35463697
3698
+ free_percpu(sbinfo->ino_batch);
35473699 percpu_counter_destroy(&sbinfo->used_blocks);
35483700 mpol_put(sbinfo->mpol);
35493701 kfree(sbinfo);
35503702 sb->s_fs_info = NULL;
35513703 }
35523704
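/*
 * Illustrative sketch, not part of the patch: the inode{64,32} and huge
 * options emitted by shmem_show_options() above surface in the mount
 * table, so userspace can confirm whether 64-bit inums are in effect
 * without consulting /proc/config.gz.
 */
#include <mntent.h>
#include <stdio.h>
#include <string.h>

int tmpfs_uses_inode64(const char *target)
{
	FILE *mounts = setmntent("/proc/self/mounts", "r");
	struct mntent *m;
	int found = 0;

	if (!mounts)
		return -1;
	while ((m = getmntent(mounts)) != NULL) {
		if (!strcmp(m->mnt_dir, target) &&
		    !strcmp(m->mnt_type, "tmpfs"))
			found = hasmntopt(m, "inode64") != NULL;
	}
	endmntent(mounts);
	return found;
}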
3553
-int shmem_fill_super(struct super_block *sb, void *data, int silent)
3705
+static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
35543706 {
3707
+ struct shmem_options *ctx = fc->fs_private;
35553708 struct inode *inode;
35563709 struct shmem_sb_info *sbinfo;
35573710 int err = -ENOMEM;
....@@ -3562,9 +3715,6 @@
35623715 if (!sbinfo)
35633716 return -ENOMEM;
35643717
3565
- sbinfo->mode = 0777 | S_ISVTX;
3566
- sbinfo->uid = current_fsuid();
3567
- sbinfo->gid = current_fsgid();
35683718 sb->s_fs_info = sbinfo;
35693719
35703720 #ifdef CONFIG_TMPFS
....@@ -3574,12 +3724,12 @@
35743724 * but the internal instance is left unlimited.
35753725 */
35763726 if (!(sb->s_flags & SB_KERNMOUNT)) {
3577
- sbinfo->max_blocks = shmem_default_max_blocks();
3578
- sbinfo->max_inodes = shmem_default_max_inodes();
3579
- if (shmem_parse_options(data, sbinfo, false)) {
3580
- err = -EINVAL;
3581
- goto failed;
3582
- }
3727
+ if (!(ctx->seen & SHMEM_SEEN_BLOCKS))
3728
+ ctx->blocks = shmem_default_max_blocks();
3729
+ if (!(ctx->seen & SHMEM_SEEN_INODES))
3730
+ ctx->inodes = shmem_default_max_inodes();
3731
+ if (!(ctx->seen & SHMEM_SEEN_INUMS))
3732
+ ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64);
35833733 } else {
35843734 sb->s_flags |= SB_NOUSER;
35853735 }
....@@ -3588,11 +3738,24 @@
35883738 #else
35893739 sb->s_flags |= SB_NOUSER;
35903740 #endif
3741
+ sbinfo->max_blocks = ctx->blocks;
3742
+ sbinfo->free_inodes = sbinfo->max_inodes = ctx->inodes;
3743
+ if (sb->s_flags & SB_KERNMOUNT) {
3744
+ sbinfo->ino_batch = alloc_percpu(ino_t);
3745
+ if (!sbinfo->ino_batch)
3746
+ goto failed;
3747
+ }
3748
+ sbinfo->uid = ctx->uid;
3749
+ sbinfo->gid = ctx->gid;
3750
+ sbinfo->full_inums = ctx->full_inums;
3751
+ sbinfo->mode = ctx->mode;
3752
+ sbinfo->huge = ctx->huge;
3753
+ sbinfo->mpol = ctx->mpol;
3754
+ ctx->mpol = NULL;
35913755
35923756 spin_lock_init(&sbinfo->stat_lock);
35933757 if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL))
35943758 goto failed;
3595
- sbinfo->free_inodes = sbinfo->max_inodes;
35963759 spin_lock_init(&sbinfo->shrinklist_lock);
35973760 INIT_LIST_HEAD(&sbinfo->shrinklist);
35983761
....@@ -3625,6 +3788,31 @@
36253788 return err;
36263789 }
36273790
3791
+static int shmem_get_tree(struct fs_context *fc)
3792
+{
3793
+ return get_tree_nodev(fc, shmem_fill_super);
3794
+}
3795
+
3796
+static void shmem_free_fc(struct fs_context *fc)
3797
+{
3798
+ struct shmem_options *ctx = fc->fs_private;
3799
+
3800
+ if (ctx) {
3801
+ mpol_put(ctx->mpol);
3802
+ kfree(ctx);
3803
+ }
3804
+}
3805
+
3806
+static const struct fs_context_operations shmem_fs_context_ops = {
3807
+ .free = shmem_free_fc,
3808
+ .get_tree = shmem_get_tree,
3809
+#ifdef CONFIG_TMPFS
3810
+ .parse_monolithic = shmem_parse_options,
3811
+ .parse_param = shmem_parse_one,
3812
+ .reconfigure = shmem_reconfigure,
3813
+#endif
3814
+};
3815
+
36283816 static struct kmem_cache *shmem_inode_cachep;
36293817
36303818 static struct inode *shmem_alloc_inode(struct super_block *sb)
....@@ -3636,9 +3824,8 @@
36363824 return &info->vfs_inode;
36373825 }
36383826
3639
-static void shmem_destroy_callback(struct rcu_head *head)
3827
+static void shmem_free_in_core_inode(struct inode *inode)
36403828 {
3641
- struct inode *inode = container_of(head, struct inode, i_rcu);
36423829 if (S_ISLNK(inode->i_mode))
36433830 kfree(inode->i_link);
36443831 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
....@@ -3648,7 +3835,6 @@
36483835 {
36493836 if (S_ISREG(inode->i_mode))
36503837 mpol_free_shared_policy(&SHMEM_I(inode)->policy);
3651
- call_rcu(&inode->i_rcu, shmem_destroy_callback);
36523838 }
36533839
36543840 static void shmem_init_inode(void *foo)
....@@ -3739,16 +3925,16 @@
37393925
37403926 static const struct super_operations shmem_ops = {
37413927 .alloc_inode = shmem_alloc_inode,
3928
+ .free_inode = shmem_free_in_core_inode,
37423929 .destroy_inode = shmem_destroy_inode,
37433930 #ifdef CONFIG_TMPFS
37443931 .statfs = shmem_statfs,
3745
- .remount_fs = shmem_remount_fs,
37463932 .show_options = shmem_show_options,
37473933 #endif
37483934 .evict_inode = shmem_evict_inode,
37493935 .drop_inode = generic_delete_inode,
37503936 .put_super = shmem_put_super,
3751
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
3937
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
37523938 .nr_cached_objects = shmem_unused_huge_count,
37533939 .free_cached_objects = shmem_unused_huge_scan,
37543940 #endif
....@@ -3761,29 +3947,42 @@
37613947 .set_policy = shmem_set_policy,
37623948 .get_policy = shmem_get_policy,
37633949 #endif
3950
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
3951
+ .allow_speculation = filemap_allow_speculation,
3952
+#endif
37643953 };
37653954
3766
-static struct dentry *shmem_mount(struct file_system_type *fs_type,
3767
- int flags, const char *dev_name, void *data)
3955
+int shmem_init_fs_context(struct fs_context *fc)
37683956 {
3769
- return mount_nodev(fs_type, flags, data, shmem_fill_super);
3957
+ struct shmem_options *ctx;
3958
+
3959
+ ctx = kzalloc(sizeof(struct shmem_options), GFP_KERNEL);
3960
+ if (!ctx)
3961
+ return -ENOMEM;
3962
+
3963
+ ctx->mode = 0777 | S_ISVTX;
3964
+ ctx->uid = current_fsuid();
3965
+ ctx->gid = current_fsgid();
3966
+
3967
+ fc->fs_private = ctx;
3968
+ fc->ops = &shmem_fs_context_ops;
3969
+ return 0;
37703970 }
37713971
37723972 static struct file_system_type shmem_fs_type = {
37733973 .owner = THIS_MODULE,
37743974 .name = "tmpfs",
3775
- .mount = shmem_mount,
3975
+ .init_fs_context = shmem_init_fs_context,
3976
+#ifdef CONFIG_TMPFS
3977
+ .parameters = shmem_fs_parameters,
3978
+#endif
37763979 .kill_sb = kill_litter_super,
3777
- .fs_flags = FS_USERNS_MOUNT,
3980
+ .fs_flags = FS_USERNS_MOUNT | FS_THP_SUPPORT,
37783981 };
37793982
37803983 int __init shmem_init(void)
37813984 {
37823985 int error;
3783
-
3784
- /* If rootfs called this, don't re-init */
3785
- if (shmem_inode_cachep)
3786
- return 0;
37873986
37883987 shmem_init_inodecache();
37893988
....@@ -3800,7 +3999,7 @@
38003999 goto out1;
38014000 }
38024001
3803
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
4002
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
38044003 if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY)
38054004 SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
38064005 else
....@@ -3816,11 +4015,11 @@
38164015 return error;
38174016 }
38184017
3819
-#if defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && defined(CONFIG_SYSFS)
4018
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS)
38204019 static ssize_t shmem_enabled_show(struct kobject *kobj,
38214020 struct kobj_attribute *attr, char *buf)
38224021 {
3823
- int values[] = {
4022
+ static const int values[] = {
38244023 SHMEM_HUGE_ALWAYS,
38254024 SHMEM_HUGE_WITHIN_SIZE,
38264025 SHMEM_HUGE_ADVISE,
....@@ -3868,9 +4067,9 @@
38684067
38694068 struct kobj_attribute shmem_enabled_attr =
38704069 __ATTR(shmem_enabled, 0644, shmem_enabled_show, shmem_enabled_store);
3871
-#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE && CONFIG_SYSFS */
4070
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */
38724071
3873
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
4072
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
38744073 bool shmem_huge_enabled(struct vm_area_struct *vma)
38754074 {
38764075 struct inode *inode = file_inode(vma->vm_file);
....@@ -3878,6 +4077,8 @@
38784077 loff_t i_size;
38794078 pgoff_t off;
38804079
4080
+ if (!transhuge_vma_enabled(vma, vma->vm_flags))
4081
+ return false;
38814082 if (shmem_huge == SHMEM_HUGE_FORCE)
38824083 return true;
38834084 if (shmem_huge == SHMEM_HUGE_DENY)
....@@ -3893,7 +4094,7 @@
38934094 if (i_size >= HPAGE_PMD_SIZE &&
38944095 i_size >> PAGE_SHIFT >= off)
38954096 return true;
3896
- /* fall through */
4097
+ fallthrough;
38974098 case SHMEM_HUGE_ADVISE:
38984099 /* TODO: implement fadvise() hints */
38994100 return (vma->vm_flags & VM_HUGEPAGE);
....@@ -3902,7 +4103,7 @@
39024103 return false;
39034104 }
39044105 }
3905
-#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */
4106
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
39064107
39074108 #else /* !CONFIG_SHMEM */
39084109
....@@ -3917,8 +4118,9 @@
39174118
39184119 static struct file_system_type shmem_fs_type = {
39194120 .name = "tmpfs",
3920
- .mount = ramfs_mount,
3921
- .kill_sb = kill_litter_super,
4121
+ .init_fs_context = ramfs_init_fs_context,
4122
+ .parameters = ramfs_fs_parameters,
4123
+ .kill_sb = ramfs_kill_sb,
39224124 .fs_flags = FS_USERNS_MOUNT,
39234125 };
39244126
....@@ -3932,7 +4134,8 @@
39324134 return 0;
39334135 }
39344136
3935
-int shmem_unuse(swp_entry_t swap, struct page *page)
4137
+int shmem_unuse(unsigned int type, bool frontswap,
4138
+ unsigned long *fs_pages_to_unuse)
39364139 {
39374140 return 0;
39384141 }
....@@ -4047,7 +4250,7 @@
40474250
40484251 /**
40494252 * shmem_zero_setup - setup a shared anonymous mapping
4050
- * @vma: the vma to be mmapped is prepared by do_mmap_pgoff
4253
+ * @vma: the vma to be mmapped is prepared by do_mmap
40514254 */
40524255 int shmem_zero_setup(struct vm_area_struct *vma)
40534256 {
....@@ -4055,7 +4258,7 @@
40554258 loff_t size = vma->vm_end - vma->vm_start;
40564259
40574260 /*
4058
- * Cloning a new file under mmap_sem leads to a lock ordering conflict
4261
+ * Cloning a new file under mmap_lock leads to a lock ordering conflict
40594262 * between XFS directory reading and selinux: since this file is only
40604263 * accessible to the user through its mapping, use S_PRIVATE flag to
40614264 * bypass file security, in the same way as shmem_kernel_file_setup().
....@@ -4069,7 +4272,7 @@
40694272 vma->vm_file = file;
40704273 vma->vm_ops = &shmem_vm_ops;
40714274
4072
- if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
4275
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
40734276 ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
40744277 (vma->vm_end & HPAGE_PMD_MASK)) {
40754278 khugepaged_enter(vma, vma->vm_flags);
....@@ -4117,3 +4320,47 @@
41174320 #endif
41184321 }
41194322 EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
4323
+
4324
+void shmem_mark_page_lazyfree(struct page *page, bool tail)
4325
+{
4326
+ mark_page_lazyfree_movetail(page, tail);
4327
+}
4328
+EXPORT_SYMBOL_GPL(shmem_mark_page_lazyfree);
4329
+
4330
+int reclaim_shmem_address_space(struct address_space *mapping)
4331
+{
4332
+#ifdef CONFIG_SHMEM
4333
+ pgoff_t start = 0;
4334
+ struct page *page;
4335
+ LIST_HEAD(page_list);
4336
+ XA_STATE(xas, &mapping->i_pages, start);
4337
+
4338
+ if (!shmem_mapping(mapping))
4339
+ return -EINVAL;
4340
+
4341
+ lru_add_drain();
4342
+
4343
+ rcu_read_lock();
4344
+ xas_for_each(&xas, page, ULONG_MAX) {
4345
+ if (xas_retry(&xas, page))
4346
+ continue;
4347
+ if (xa_is_value(page))
4348
+ continue;
4349
+ if (isolate_lru_page(page))
4350
+ continue;
4351
+
4352
+ list_add(&page->lru, &page_list);
4353
+
4354
+ if (need_resched()) {
4355
+ xas_pause(&xas);
4356
+ cond_resched_rcu();
4357
+ }
4358
+ }
4359
+ rcu_read_unlock();
4360
+
4361
+ return reclaim_pages(&page_list);
4362
+#else
4363
+ return 0;
4364
+#endif
4365
+}
4366
+EXPORT_SYMBOL_GPL(reclaim_shmem_address_space);
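/*
 * Illustrative sketch, not part of the patch: reclaim_shmem_address_space()
 * is a vendor export rather than an upstream interface.  Assuming a GPL
 * module built against this tree, with the declaration made visible
 * through <linux/shmem_fs.h>, a caller could ask reclaim to push a shmem
 * file's resident pages out roughly like this.
 */
#include <linux/err.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/shmem_fs.h>
#include <linux/sizes.h>

static int demo_reclaim_shmem_file(void)
{
	struct file *filp;
	int reclaimed;

	/* A 1M tmpfs-backed file; in practice the mapping would be populated. */
	filp = shmem_file_setup("demo-shmem", SZ_1M, 0);
	if (IS_ERR(filp))
		return PTR_ERR(filp);

	/* Isolates the file's resident pages and hands them to reclaim_pages(). */
	reclaimed = reclaim_shmem_address_space(filp->f_mapping);

	fput(filp);
	return reclaimed;
}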