2024-02-20 102a0743326a03cd1a1202ceda21e175b7d3575c
kernel/mm/shmem.c
@@ -36,8 +36,17 @@
 #include <linux/uio.h>
 #include <linux/khugepaged.h>
 #include <linux/hugetlb.h>
+#include <linux/frontswap.h>
+#include <linux/fs_parser.h>
+#include <linux/mm_inline.h>
 
 #include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */
+
+#include "internal.h"
+
+#undef CREATE_TRACE_POINTS
+#include <trace/hooks/shmem_fs.h>
+#include <trace/hooks/mm.h>
 
 static struct vfsmount *shm_mnt;
 
@@ -80,7 +89,6 @@
 #include <linux/uuid.h>
 
 #include <linux/uaccess.h>
-#include <asm/pgtable.h>
 
 #include "internal.h"
 
@@ -106,21 +114,43 @@
 	pgoff_t nr_unswapped;	/* how often writepage refused to swap out */
 };
 
+struct shmem_options {
+	unsigned long long blocks;
+	unsigned long long inodes;
+	struct mempolicy *mpol;
+	kuid_t uid;
+	kgid_t gid;
+	umode_t mode;
+	bool full_inums;
+	int huge;
+	int seen;
+#define SHMEM_SEEN_BLOCKS 1
+#define SHMEM_SEEN_INODES 2
+#define SHMEM_SEEN_HUGE 4
+#define SHMEM_SEEN_INUMS 8
+};
+
 #ifdef CONFIG_TMPFS
 static unsigned long shmem_default_max_blocks(void)
 {
-	return totalram_pages / 2;
+	return totalram_pages() / 2;
 }
 
 static unsigned long shmem_default_max_inodes(void)
 {
-	return min(totalram_pages - totalhigh_pages, totalram_pages / 2);
+	unsigned long nr_pages = totalram_pages();
+
+	return min(nr_pages - totalhigh_pages(), nr_pages / 2);
 }
 #endif
 
 static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
 static int shmem_replace_page(struct page **pagep, gfp_t gfp,
 				struct shmem_inode_info *info, pgoff_t index);
+static int shmem_swapin_page(struct inode *inode, pgoff_t index,
+			     struct page **pagep, enum sgp_type sgp,
+			     gfp_t gfp, struct vm_area_struct *vma,
+			     vm_fault_t *fault_type);
 static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 		struct page **pagep, enum sgp_type sgp,
 		gfp_t gfp, struct vm_area_struct *vma,
@@ -239,18 +269,78 @@
 static LIST_HEAD(shmem_swaplist);
 static DEFINE_MUTEX(shmem_swaplist_mutex);
 
-static int shmem_reserve_inode(struct super_block *sb)
+/*
+ * shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and
+ * produces a novel ino for the newly allocated inode.
+ *
+ * It may also be called when making a hard link to permit the space needed by
+ * each dentry. However, in that case, no new inode number is needed since that
+ * internally draws from another pool of inode numbers (currently global
+ * get_next_ino()). This case is indicated by passing NULL as inop.
+ */
+#define SHMEM_INO_BATCH 1024
+static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
 {
 	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
-	if (sbinfo->max_inodes) {
+	ino_t ino;
+
+	if (!(sb->s_flags & SB_KERNMOUNT)) {
 		spin_lock(&sbinfo->stat_lock);
-		if (!sbinfo->free_inodes) {
-			spin_unlock(&sbinfo->stat_lock);
-			return -ENOSPC;
+		if (sbinfo->max_inodes) {
+			if (!sbinfo->free_inodes) {
+				spin_unlock(&sbinfo->stat_lock);
+				return -ENOSPC;
+			}
+			sbinfo->free_inodes--;
 		}
-		sbinfo->free_inodes--;
+		if (inop) {
+			ino = sbinfo->next_ino++;
+			if (unlikely(is_zero_ino(ino)))
+				ino = sbinfo->next_ino++;
+			if (unlikely(!sbinfo->full_inums &&
+				     ino > UINT_MAX)) {
+				/*
+				 * Emulate get_next_ino uint wraparound for
+				 * compatibility
+				 */
+				if (IS_ENABLED(CONFIG_64BIT))
+					pr_warn("%s: inode number overflow on device %d, consider using inode64 mount option\n",
+						__func__, MINOR(sb->s_dev));
+				sbinfo->next_ino = 1;
+				ino = sbinfo->next_ino++;
+			}
+			*inop = ino;
+		}
 		spin_unlock(&sbinfo->stat_lock);
+	} else if (inop) {
+		/*
+		 * __shmem_file_setup, one of our callers, is lock-free: it
+		 * doesn't hold stat_lock in shmem_reserve_inode since
+		 * max_inodes is always 0, and is called from potentially
+		 * unknown contexts. As such, use a per-cpu batched allocator
+		 * which doesn't require the per-sb stat_lock unless we are at
+		 * the batch boundary.
+		 *
+		 * We don't need to worry about inode{32,64} since SB_KERNMOUNT
+		 * shmem mounts are not exposed to userspace, so we don't need
+		 * to worry about things like glibc compatibility.
+		 */
+		ino_t *next_ino;
+		next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu());
+		ino = *next_ino;
+		if (unlikely(ino % SHMEM_INO_BATCH == 0)) {
+			spin_lock(&sbinfo->stat_lock);
+			ino = sbinfo->next_ino;
+			sbinfo->next_ino += SHMEM_INO_BATCH;
+			spin_unlock(&sbinfo->stat_lock);
+			if (unlikely(is_zero_ino(ino)))
+				ino++;
+		}
+		*inop = ino;
+		*next_ino = ++ino;
+		put_cpu();
 	}
+
 	return 0;
 }
 
@@ -326,24 +416,20 @@
 }
 
 /*
- * Replace item expected in radix tree by a new item, while holding tree lock.
+ * Replace item expected in xarray by a new item, while holding xa_lock.
  */
-static int shmem_radix_tree_replace(struct address_space *mapping,
+static int shmem_replace_entry(struct address_space *mapping,
 			pgoff_t index, void *expected, void *replacement)
 {
-	struct radix_tree_node *node;
-	void __rcu **pslot;
+	XA_STATE(xas, &mapping->i_pages, index);
 	void *item;
 
 	VM_BUG_ON(!expected);
 	VM_BUG_ON(!replacement);
-	item = __radix_tree_lookup(&mapping->i_pages, index, &node, &pslot);
-	if (!item)
-		return -ENOENT;
+	item = xas_load(&xas);
 	if (item != expected)
 		return -ENOENT;
-	__radix_tree_replace(&mapping->i_pages, node, pslot,
-			     replacement, NULL);
+	xas_store(&xas, replacement);
 	return 0;
 }
 
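
shmem_replace_entry() is typical of the radix-tree-to-XArray conversion that runs through the rest of this diff: an XA_STATE cursor replaces the node/slot pair, and xas_load()/xas_store() replace the lookup and replace calls. As a quick orientation to the plain xa_* calls the later hunks also lean on (xa_load(), xa_cmpxchg(), xa_erase()), here is a minimal kernel-style sketch; it is an illustration only, not part of the patch.

#include <linux/xarray.h>

static DEFINE_XARRAY(demo_xa);

/* Store, look up, conditionally swap and erase one entry at index 5. */
static int demo_xarray(void *old, void *new)
{
	void *curr;
	int err;

	err = xa_err(xa_store(&demo_xa, 5, old, GFP_KERNEL));
	if (err)
		return err;

	curr = xa_load(&demo_xa, 5);	/* lockless lookup, like shmem_confirm_swap() */
	if (curr != old)
		return -ENOENT;

	/* Replace only if the old value is still there, like shmem_free_swap(). */
	curr = xa_cmpxchg(&demo_xa, 5, old, new, GFP_KERNEL);
	if (curr != old)
		return -ENOENT;

	xa_erase(&demo_xa, 5);
	return 0;
}
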
@@ -357,12 +443,7 @@
 static bool shmem_confirm_swap(struct address_space *mapping,
 			       pgoff_t index, swp_entry_t swap)
 {
-	void *item;
-
-	rcu_read_lock();
-	item = radix_tree_lookup(&mapping->i_pages, index);
-	rcu_read_unlock();
-	return item == swp_to_radix_entry(swap);
+	return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap);
 }
 
 /*
@@ -397,12 +478,12 @@
 #define SHMEM_HUGE_DENY		(-1)
 #define SHMEM_HUGE_FORCE	(-2)
 
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 /* ifdef here to avoid bloating shmem.o when not necessary */
 
 static int shmem_huge __read_mostly;
 
-#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
+#if defined(CONFIG_SYSFS)
 static int shmem_parse_huge(const char *str)
 {
 	if (!strcmp(str, "never"))
@@ -419,7 +500,9 @@
 		return SHMEM_HUGE_FORCE;
 	return -EINVAL;
 }
+#endif
 
+#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
 static const char *shmem_format_huge(int huge)
 {
 	switch (huge) {
@@ -570,7 +653,7 @@
 	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
 	return READ_ONCE(sbinfo->shrinklist_len);
 }
-#else /* !CONFIG_TRANSPARENT_HUGE_PAGECACHE */
+#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
 
 #define shmem_huge SHMEM_HUGE_DENY
 
@@ -579,11 +662,11 @@
 {
 	return 0;
 }
-#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo)
 {
-	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
+	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
 	    (shmem_huge == SHMEM_HUGE_FORCE || sbinfo->huge) &&
 	    shmem_huge != SHMEM_HUGE_DENY)
 		return true;
@@ -595,9 +678,13 @@
  */
 static int shmem_add_to_page_cache(struct page *page,
 				   struct address_space *mapping,
-				   pgoff_t index, void *expected)
+				   pgoff_t index, void *expected, gfp_t gfp,
+				   struct mm_struct *charge_mm)
 {
-	int error, nr = hpage_nr_pages(page);
+	XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page));
+	unsigned long i = 0;
+	unsigned long nr = compound_nr(page);
+	int error;
 
 	VM_BUG_ON_PAGE(PageTail(page), page);
 	VM_BUG_ON_PAGE(index != round_down(index, nr), page);
@@ -609,46 +696,53 @@
 	page->mapping = mapping;
 	page->index = index;
 
-	xa_lock_irq(&mapping->i_pages);
-	if (PageTransHuge(page)) {
-		void __rcu **results;
-		pgoff_t idx;
-		int i;
-
-		error = 0;
-		if (radix_tree_gang_lookup_slot(&mapping->i_pages,
-					&results, &idx, index, 1) &&
-				idx < index + HPAGE_PMD_NR) {
-			error = -EEXIST;
-		}
-
-		if (!error) {
-			for (i = 0; i < HPAGE_PMD_NR; i++) {
-				error = radix_tree_insert(&mapping->i_pages,
-						index + i, page + i);
-				VM_BUG_ON(error);
+	if (!PageSwapCache(page)) {
+		error = mem_cgroup_charge(page, charge_mm, gfp);
+		if (error) {
+			if (PageTransHuge(page)) {
+				count_vm_event(THP_FILE_FALLBACK);
+				count_vm_event(THP_FILE_FALLBACK_CHARGE);
 			}
-			count_vm_event(THP_FILE_ALLOC);
+			goto error;
 		}
-	} else if (!expected) {
-		error = radix_tree_insert(&mapping->i_pages, index, page);
-	} else {
-		error = shmem_radix_tree_replace(mapping, index, expected,
-								 page);
+	}
+	cgroup_throttle_swaprate(page, gfp);
+
+	do {
+		void *entry;
+		xas_lock_irq(&xas);
+		entry = xas_find_conflict(&xas);
+		if (entry != expected)
+			xas_set_err(&xas, -EEXIST);
+		xas_create_range(&xas);
+		if (xas_error(&xas))
+			goto unlock;
+next:
+		xas_store(&xas, page);
+		if (++i < nr) {
+			xas_next(&xas);
+			goto next;
+		}
+		if (PageTransHuge(page)) {
+			count_vm_event(THP_FILE_ALLOC);
+			__inc_node_page_state(page, NR_SHMEM_THPS);
+		}
+		mapping->nrpages += nr;
+		__mod_lruvec_page_state(page, NR_FILE_PAGES, nr);
+		__mod_lruvec_page_state(page, NR_SHMEM, nr);
+unlock:
+		xas_unlock_irq(&xas);
+	} while (xas_nomem(&xas, gfp));
+
+	if (xas_error(&xas)) {
+		error = xas_error(&xas);
+		goto error;
 	}
 
-	if (!error) {
-		mapping->nrpages += nr;
-		if (PageTransHuge(page))
-			__inc_node_page_state(page, NR_SHMEM_THPS);
-		__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
-		__mod_node_page_state(page_pgdat(page), NR_SHMEM, nr);
-		xa_unlock_irq(&mapping->i_pages);
-	} else {
-		page->mapping = NULL;
-		xa_unlock_irq(&mapping->i_pages);
-		page_ref_sub(page, nr);
-	}
+	return 0;
+error:
+	page->mapping = NULL;
+	page_ref_sub(page, nr);
 	return error;
 }
 
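
The do { ... } while (xas_nomem(...)) loop introduced above is the stock XArray idiom for a store that may need to allocate nodes while a spinlock is held: the store runs under xas_lock_irq(), and if it failed only for lack of memory, xas_nomem() allocates outside the lock and asks for a retry. A stripped-down sketch of just that idiom follows; insert_entry() is a hypothetical helper, not a function from this file.

#include <linux/xarray.h>

/* Insert @entry at @index in @xa, allocating nodes outside the lock if needed. */
static int insert_entry(struct xarray *xa, unsigned long index,
			void *entry, gfp_t gfp)
{
	XA_STATE(xas, xa, index);

	do {
		xas_lock_irq(&xas);
		if (xas_load(&xas))		/* slot already occupied */
			xas_set_err(&xas, -EEXIST);
		else
			xas_store(&xas, entry);
		xas_unlock_irq(&xas);
	} while (xas_nomem(&xas, gfp));		/* retry if the store needed memory */

	return xas_error(&xas);
}
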
@@ -663,27 +757,25 @@
 	VM_BUG_ON_PAGE(PageCompound(page), page);
 
 	xa_lock_irq(&mapping->i_pages);
-	error = shmem_radix_tree_replace(mapping, page->index, page, radswap);
+	error = shmem_replace_entry(mapping, page->index, page, radswap);
 	page->mapping = NULL;
 	mapping->nrpages--;
-	__dec_node_page_state(page, NR_FILE_PAGES);
-	__dec_node_page_state(page, NR_SHMEM);
+	__dec_lruvec_page_state(page, NR_FILE_PAGES);
+	__dec_lruvec_page_state(page, NR_SHMEM);
 	xa_unlock_irq(&mapping->i_pages);
 	put_page(page);
 	BUG_ON(error);
 }
 
 /*
- * Remove swap entry from radix tree, free the swap and its page cache.
+ * Remove swap entry from page cache, free the swap and its page cache.
  */
 static int shmem_free_swap(struct address_space *mapping,
 			   pgoff_t index, void *radswap)
 {
 	void *old;
 
-	xa_lock_irq(&mapping->i_pages);
-	old = radix_tree_delete_item(&mapping->i_pages, index, radswap);
-	xa_unlock_irq(&mapping->i_pages);
+	old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0);
 	if (old != radswap)
 		return -ENOENT;
 	free_swap_and_cache(radix_to_swp_entry(radswap));
@@ -700,29 +792,19 @@
 unsigned long shmem_partial_swap_usage(struct address_space *mapping,
 						pgoff_t start, pgoff_t end)
 {
-	struct radix_tree_iter iter;
-	void __rcu **slot;
+	XA_STATE(xas, &mapping->i_pages, start);
 	struct page *page;
 	unsigned long swapped = 0;
 
 	rcu_read_lock();
-
-	radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
-		if (iter.index >= end)
-			break;
-
-		page = radix_tree_deref_slot(slot);
-
-		if (radix_tree_deref_retry(page)) {
-			slot = radix_tree_iter_retry(&iter);
+	xas_for_each(&xas, page, end - 1) {
+		if (xas_retry(&xas, page))
 			continue;
-		}
-
-		if (radix_tree_exceptional_entry(page))
+		if (xa_is_value(page))
 			swapped++;
 
 		if (need_resched()) {
-			slot = radix_tree_iter_resume(slot, &iter);
+			xas_pause(&xas);
 			cond_resched_rcu();
 		}
 	}
@@ -797,7 +879,33 @@
 }
 
 /*
- * Remove range of pages and swap entries from radix tree, and free them.
+ * Check whether a hole-punch or truncation needs to split a huge page,
+ * returning true if no split was required, or the split has been successful.
+ *
+ * Eviction (or truncation to 0 size) should never need to split a huge page;
+ * but in rare cases might do so, if shmem_undo_range() failed to trylock on
+ * head, and then succeeded to trylock on tail.
+ *
+ * A split can only succeed when there are no additional references on the
+ * huge page: so the split below relies upon find_get_entries() having stopped
+ * when it found a subpage of the huge page, without getting further references.
+ */
+static bool shmem_punch_compound(struct page *page, pgoff_t start, pgoff_t end)
+{
+	if (!PageTransCompound(page))
+		return true;
+
+	/* Just proceed to delete a huge page wholly within the range punched */
+	if (PageHead(page) &&
+	    page->index >= start && page->index + HPAGE_PMD_NR <= end)
+		return true;
+
+	/* Try to split huge page, so we can truly punch the hole or truncate */
+	return split_huge_page(page) >= 0;
+}
+
+/*
+ * Remove range of pages and swap entries from page cache, and free them.
  * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
  */
 static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
@@ -833,7 +941,7 @@
 			if (index >= end)
 				break;
 
-			if (radix_tree_exceptional_entry(page)) {
+			if (xa_is_value(page)) {
 				if (unfalloc)
 					continue;
 				nr_swaps_freed += !shmem_free_swap(mapping,
@@ -846,31 +954,11 @@
846954 if (!trylock_page(page))
847955 continue;
848956
849
- if (PageTransTail(page)) {
850
- /* Middle of THP: zero out the page */
851
- clear_highpage(page);
852
- unlock_page(page);
853
- continue;
854
- } else if (PageTransHuge(page)) {
855
- if (index == round_down(end, HPAGE_PMD_NR)) {
856
- /*
857
- * Range ends in the middle of THP:
858
- * zero out the page
859
- */
860
- clear_highpage(page);
861
- unlock_page(page);
862
- continue;
863
- }
864
- index += HPAGE_PMD_NR - 1;
865
- i += HPAGE_PMD_NR - 1;
866
- }
867
-
868
- if (!unfalloc || !PageUptodate(page)) {
869
- VM_BUG_ON_PAGE(PageTail(page), page);
870
- if (page_mapping(page) == mapping) {
871
- VM_BUG_ON_PAGE(PageWriteback(page), page);
957
+ if ((!unfalloc || !PageUptodate(page)) &&
958
+ page_mapping(page) == mapping) {
959
+ VM_BUG_ON_PAGE(PageWriteback(page), page);
960
+ if (shmem_punch_compound(page, start, end))
872961 truncate_inode_page(mapping, page);
873
- }
874962 }
875963 unlock_page(page);
876964 }
@@ -930,7 +1018,7 @@
9301018 if (index >= end)
9311019 break;
9321020
933
- if (radix_tree_exceptional_entry(page)) {
1021
+ if (xa_is_value(page)) {
9341022 if (unfalloc)
9351023 continue;
9361024 if (shmem_free_swap(mapping, index, page)) {
@@ -944,42 +1032,24 @@
9441032
9451033 lock_page(page);
9461034
947
- if (PageTransTail(page)) {
948
- /* Middle of THP: zero out the page */
949
- clear_highpage(page);
950
- unlock_page(page);
951
- /*
952
- * Partial thp truncate due 'start' in middle
953
- * of THP: don't need to look on these pages
954
- * again on !pvec.nr restart.
955
- */
956
- if (index != round_down(end, HPAGE_PMD_NR))
957
- start++;
958
- continue;
959
- } else if (PageTransHuge(page)) {
960
- if (index == round_down(end, HPAGE_PMD_NR)) {
961
- /*
962
- * Range ends in the middle of THP:
963
- * zero out the page
964
- */
965
- clear_highpage(page);
966
- unlock_page(page);
967
- continue;
968
- }
969
- index += HPAGE_PMD_NR - 1;
970
- i += HPAGE_PMD_NR - 1;
971
- }
972
-
9731035 if (!unfalloc || !PageUptodate(page)) {
974
- VM_BUG_ON_PAGE(PageTail(page), page);
975
- if (page_mapping(page) == mapping) {
976
- VM_BUG_ON_PAGE(PageWriteback(page), page);
977
- truncate_inode_page(mapping, page);
978
- } else {
1036
+ if (page_mapping(page) != mapping) {
9791037 /* Page was replaced by swap: retry */
9801038 unlock_page(page);
9811039 index--;
9821040 break;
1041
+ }
1042
+ VM_BUG_ON_PAGE(PageWriteback(page), page);
1043
+ if (shmem_punch_compound(page, start, end))
1044
+ truncate_inode_page(mapping, page);
1045
+ else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
1046
+ /* Wipe the page and don't get stuck */
1047
+ clear_highpage(page);
1048
+ flush_dcache_page(page);
1049
+ set_page_dirty(page);
1050
+ if (index <
1051
+ round_up(start, HPAGE_PMD_NR))
1052
+ start = index + 1;
9831053 }
9841054 }
9851055 unlock_page(page);
@@ -1067,7 +1137,7 @@
10671137 * Part of the huge page can be beyond i_size: subject
10681138 * to shrink under memory pressure.
10691139 */
1070
- if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
1140
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
10711141 spin_lock(&sbinfo->shrinklist_lock);
10721142 /*
10731143 * _careful to defend against unlocked access to
@@ -1106,9 +1176,14 @@
11061176 }
11071177 spin_unlock(&sbinfo->shrinklist_lock);
11081178 }
1109
- if (!list_empty(&info->swaplist)) {
1179
+ while (!list_empty(&info->swaplist)) {
1180
+ /* Wait while shmem_unuse() is scanning this inode... */
1181
+ wait_var_event(&info->stop_eviction,
1182
+ !atomic_read(&info->stop_eviction));
11101183 mutex_lock(&shmem_swaplist_mutex);
1111
- list_del_init(&info->swaplist);
1184
+ /* ...but beware of the race if we peeked too early */
1185
+ if (!atomic_read(&info->stop_eviction))
1186
+ list_del_init(&info->swaplist);
11121187 mutex_unlock(&shmem_swaplist_mutex);
11131188 }
11141189 }
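
The eviction path in the hunk above now waits on info->stop_eviction, which the reworked shmem_unuse() (further down in this diff) raises around its scan of the inode; wait_var_event() and wake_up_var() pair an arbitrary address with a condition. The condensed sketch below shows the two halves of that handshake as hypothetical helpers, mirroring the pattern in the patch rather than quoting it.

#include <linux/atomic.h>
#include <linux/wait_bit.h>

static atomic_t stop_eviction = ATOMIC_INIT(0);

/* Scanner side: hold off eviction for the duration of the scan. */
static void scan_side(void)
{
	atomic_inc(&stop_eviction);
	/* ... scan the inode's swap entries ... */
	if (atomic_dec_and_test(&stop_eviction))
		wake_up_var(&stop_eviction);
}

/* Eviction side: sleep until no scanner holds the counter. */
static void evict_side(void)
{
	wait_var_event(&stop_eviction, !atomic_read(&stop_eviction));
	/* ... safe to tear the inode down ... */
}
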
@@ -1119,166 +1194,174 @@
11191194 clear_inode(inode);
11201195 }
11211196
1122
-static unsigned long find_swap_entry(struct radix_tree_root *root, void *item)
1197
+extern struct swap_info_struct *swap_info[];
1198
+
1199
+static int shmem_find_swap_entries(struct address_space *mapping,
1200
+ pgoff_t start, unsigned int nr_entries,
1201
+ struct page **entries, pgoff_t *indices,
1202
+ unsigned int type, bool frontswap)
11231203 {
1124
- struct radix_tree_iter iter;
1125
- void __rcu **slot;
1126
- unsigned long found = -1;
1127
- unsigned int checked = 0;
1204
+ XA_STATE(xas, &mapping->i_pages, start);
1205
+ struct page *page;
1206
+ swp_entry_t entry;
1207
+ unsigned int ret = 0;
1208
+
1209
+ if (!nr_entries)
1210
+ return 0;
11281211
11291212 rcu_read_lock();
1130
- radix_tree_for_each_slot(slot, root, &iter, 0) {
1131
- void *entry = radix_tree_deref_slot(slot);
1132
-
1133
- if (radix_tree_deref_retry(entry)) {
1134
- slot = radix_tree_iter_retry(&iter);
1213
+ xas_for_each(&xas, page, ULONG_MAX) {
1214
+ if (xas_retry(&xas, page))
11351215 continue;
1216
+
1217
+ if (!xa_is_value(page))
1218
+ continue;
1219
+
1220
+ entry = radix_to_swp_entry(page);
1221
+ if (swp_type(entry) != type)
1222
+ continue;
1223
+ if (frontswap &&
1224
+ !frontswap_test(swap_info[type], swp_offset(entry)))
1225
+ continue;
1226
+
1227
+ indices[ret] = xas.xa_index;
1228
+ entries[ret] = page;
1229
+
1230
+ if (need_resched()) {
1231
+ xas_pause(&xas);
1232
+ cond_resched_rcu();
11361233 }
1137
- if (entry == item) {
1138
- found = iter.index;
1234
+ if (++ret == nr_entries)
11391235 break;
1140
- }
1141
- checked++;
1142
- if ((checked % 4096) != 0)
1143
- continue;
1144
- slot = radix_tree_iter_resume(slot, &iter);
1145
- cond_resched_rcu();
11461236 }
1147
-
11481237 rcu_read_unlock();
1149
- return found;
1238
+
1239
+ return ret;
1240
+}
1241
+
1242
+/*
1243
+ * Move the swapped pages for an inode to page cache. Returns the count
1244
+ * of pages swapped in, or the error in case of failure.
1245
+ */
1246
+static int shmem_unuse_swap_entries(struct inode *inode, struct pagevec pvec,
1247
+ pgoff_t *indices)
1248
+{
1249
+ int i = 0;
1250
+ int ret = 0;
1251
+ int error = 0;
1252
+ struct address_space *mapping = inode->i_mapping;
1253
+
1254
+ for (i = 0; i < pvec.nr; i++) {
1255
+ struct page *page = pvec.pages[i];
1256
+
1257
+ if (!xa_is_value(page))
1258
+ continue;
1259
+ error = shmem_swapin_page(inode, indices[i],
1260
+ &page, SGP_CACHE,
1261
+ mapping_gfp_mask(mapping),
1262
+ NULL, NULL);
1263
+ if (error == 0) {
1264
+ unlock_page(page);
1265
+ put_page(page);
1266
+ ret++;
1267
+ }
1268
+ if (error == -ENOMEM)
1269
+ break;
1270
+ error = 0;
1271
+ }
1272
+ return error ? error : ret;
11501273 }
11511274
11521275 /*
11531276 * If swap found in inode, free it and move page from swapcache to filecache.
11541277 */
1155
-static int shmem_unuse_inode(struct shmem_inode_info *info,
1156
- swp_entry_t swap, struct page **pagep)
1278
+static int shmem_unuse_inode(struct inode *inode, unsigned int type,
1279
+ bool frontswap, unsigned long *fs_pages_to_unuse)
11571280 {
1158
- struct address_space *mapping = info->vfs_inode.i_mapping;
1159
- void *radswap;
1160
- pgoff_t index;
1161
- gfp_t gfp;
1162
- int error = 0;
1281
+ struct address_space *mapping = inode->i_mapping;
1282
+ pgoff_t start = 0;
1283
+ struct pagevec pvec;
1284
+ pgoff_t indices[PAGEVEC_SIZE];
1285
+ bool frontswap_partial = (frontswap && *fs_pages_to_unuse > 0);
1286
+ int ret = 0;
11631287
1164
- radswap = swp_to_radix_entry(swap);
1165
- index = find_swap_entry(&mapping->i_pages, radswap);
1166
- if (index == -1)
1167
- return -EAGAIN; /* tell shmem_unuse we found nothing */
1288
+ pagevec_init(&pvec);
1289
+ do {
1290
+ unsigned int nr_entries = PAGEVEC_SIZE;
11681291
1169
- /*
1170
- * Move _head_ to start search for next from here.
1171
- * But be careful: shmem_evict_inode checks list_empty without taking
1172
- * mutex, and there's an instant in list_move_tail when info->swaplist
1173
- * would appear empty, if it were the only one on shmem_swaplist.
1174
- */
1175
- if (shmem_swaplist.next != &info->swaplist)
1176
- list_move_tail(&shmem_swaplist, &info->swaplist);
1292
+ if (frontswap_partial && *fs_pages_to_unuse < PAGEVEC_SIZE)
1293
+ nr_entries = *fs_pages_to_unuse;
11771294
1178
- gfp = mapping_gfp_mask(mapping);
1179
- if (shmem_should_replace_page(*pagep, gfp)) {
1180
- mutex_unlock(&shmem_swaplist_mutex);
1181
- error = shmem_replace_page(pagep, gfp, info, index);
1182
- mutex_lock(&shmem_swaplist_mutex);
1183
- /*
1184
- * We needed to drop mutex to make that restrictive page
1185
- * allocation, but the inode might have been freed while we
1186
- * dropped it: although a racing shmem_evict_inode() cannot
1187
- * complete without emptying the radix_tree, our page lock
1188
- * on this swapcache page is not enough to prevent that -
1189
- * free_swap_and_cache() of our swap entry will only
1190
- * trylock_page(), removing swap from radix_tree whatever.
1191
- *
1192
- * We must not proceed to shmem_add_to_page_cache() if the
1193
- * inode has been freed, but of course we cannot rely on
1194
- * inode or mapping or info to check that. However, we can
1195
- * safely check if our swap entry is still in use (and here
1196
- * it can't have got reused for another page): if it's still
1197
- * in use, then the inode cannot have been freed yet, and we
1198
- * can safely proceed (if it's no longer in use, that tells
1199
- * nothing about the inode, but we don't need to unuse swap).
1200
- */
1201
- if (!page_swapcount(*pagep))
1202
- error = -ENOENT;
1203
- }
1204
-
1205
- /*
1206
- * We rely on shmem_swaplist_mutex, not only to protect the swaplist,
1207
- * but also to hold up shmem_evict_inode(): so inode cannot be freed
1208
- * beneath us (pagelock doesn't help until the page is in pagecache).
1209
- */
1210
- if (!error)
1211
- error = shmem_add_to_page_cache(*pagep, mapping, index,
1212
- radswap);
1213
- if (error != -ENOMEM) {
1214
- /*
1215
- * Truncation and eviction use free_swap_and_cache(), which
1216
- * only does trylock page: if we raced, best clean up here.
1217
- */
1218
- delete_from_swap_cache(*pagep);
1219
- set_page_dirty(*pagep);
1220
- if (!error) {
1221
- spin_lock_irq(&info->lock);
1222
- info->swapped--;
1223
- spin_unlock_irq(&info->lock);
1224
- swap_free(swap);
1295
+ pvec.nr = shmem_find_swap_entries(mapping, start, nr_entries,
1296
+ pvec.pages, indices,
1297
+ type, frontswap);
1298
+ if (pvec.nr == 0) {
1299
+ ret = 0;
1300
+ break;
12251301 }
1226
- }
1227
- return error;
1302
+
1303
+ ret = shmem_unuse_swap_entries(inode, pvec, indices);
1304
+ if (ret < 0)
1305
+ break;
1306
+
1307
+ if (frontswap_partial) {
1308
+ *fs_pages_to_unuse -= ret;
1309
+ if (*fs_pages_to_unuse == 0) {
1310
+ ret = FRONTSWAP_PAGES_UNUSED;
1311
+ break;
1312
+ }
1313
+ }
1314
+
1315
+ start = indices[pvec.nr - 1];
1316
+ } while (true);
1317
+
1318
+ return ret;
12281319 }
12291320
12301321 /*
1231
- * Search through swapped inodes to find and replace swap by page.
1322
+ * Read all the shared memory data that resides in the swap
1323
+ * device 'type' back into memory, so the swap device can be
1324
+ * unused.
12321325 */
1233
-int shmem_unuse(swp_entry_t swap, struct page *page)
1326
+int shmem_unuse(unsigned int type, bool frontswap,
1327
+ unsigned long *fs_pages_to_unuse)
12341328 {
1235
- struct list_head *this, *next;
1236
- struct shmem_inode_info *info;
1237
- struct mem_cgroup *memcg;
1329
+ struct shmem_inode_info *info, *next;
12381330 int error = 0;
12391331
1240
- /*
1241
- * There's a faint possibility that swap page was replaced before
1242
- * caller locked it: caller will come back later with the right page.
1243
- */
1244
- if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val))
1245
- goto out;
1246
-
1247
- /*
1248
- * Charge page using GFP_KERNEL while we can wait, before taking
1249
- * the shmem_swaplist_mutex which might hold up shmem_writepage().
1250
- * Charged back to the user (not to caller) when swap account is used.
1251
- */
1252
- error = mem_cgroup_try_charge_delay(page, current->mm, GFP_KERNEL,
1253
- &memcg, false);
1254
- if (error)
1255
- goto out;
1256
- /* No radix_tree_preload: swap entry keeps a place for page in tree */
1257
- error = -EAGAIN;
1332
+ if (list_empty(&shmem_swaplist))
1333
+ return 0;
12581334
12591335 mutex_lock(&shmem_swaplist_mutex);
1260
- list_for_each_safe(this, next, &shmem_swaplist) {
1261
- info = list_entry(this, struct shmem_inode_info, swaplist);
1262
- if (info->swapped)
1263
- error = shmem_unuse_inode(info, swap, &page);
1264
- else
1336
+ list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
1337
+ if (!info->swapped) {
12651338 list_del_init(&info->swaplist);
1339
+ continue;
1340
+ }
1341
+ /*
1342
+ * Drop the swaplist mutex while searching the inode for swap;
1343
+ * but before doing so, make sure shmem_evict_inode() will not
1344
+ * remove placeholder inode from swaplist, nor let it be freed
1345
+ * (igrab() would protect from unlink, but not from unmount).
1346
+ */
1347
+ atomic_inc(&info->stop_eviction);
1348
+ mutex_unlock(&shmem_swaplist_mutex);
1349
+
1350
+ error = shmem_unuse_inode(&info->vfs_inode, type, frontswap,
1351
+ fs_pages_to_unuse);
12661352 cond_resched();
1267
- if (error != -EAGAIN)
1353
+
1354
+ mutex_lock(&shmem_swaplist_mutex);
1355
+ next = list_next_entry(info, swaplist);
1356
+ if (!info->swapped)
1357
+ list_del_init(&info->swaplist);
1358
+ if (atomic_dec_and_test(&info->stop_eviction))
1359
+ wake_up_var(&info->stop_eviction);
1360
+ if (error)
12681361 break;
1269
- /* found nothing in this: move on to search the next */
12701362 }
12711363 mutex_unlock(&shmem_swaplist_mutex);
12721364
1273
- if (error) {
1274
- if (error != -ENOMEM)
1275
- error = 0;
1276
- mem_cgroup_cancel_charge(page, memcg, false);
1277
- } else
1278
- mem_cgroup_commit_charge(page, memcg, true, false);
1279
-out:
1280
- unlock_page(page);
1281
- put_page(page);
12821365 return error;
12831366 }
12841367
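
The rewritten swapoff path above gathers swap entries in pagevec-sized batches and restarts each pass from the last index it found, instead of the old one-entry-at-a-time find_swap_entry() walk. The toy user-space model below shows the same resumable batch loop; the array, batch size and helpers are made up for illustration.

#include <stdio.h>

#define BATCH 15			/* stands in for PAGEVEC_SIZE */
#define NR_SLOTS 100

static int slot_is_swap[NR_SLOTS];	/* toy model of value entries in the mapping */

/* Collect up to n indices of swap entries at or after start; returns how many. */
static int find_batch(int start, int *idx, int n)
{
	int nr = 0;

	for (int i = start; i < NR_SLOTS && nr < n; i++)
		if (slot_is_swap[i])
			idx[nr++] = i;
	return nr;
}

int main(void)
{
	int idx[BATCH];
	int start = 0, nr;

	for (int i = 0; i < NR_SLOTS; i += 3)	/* every third slot holds a swap entry */
		slot_is_swap[i] = 1;

	/* Resumable batch loop, shaped like shmem_unuse_inode(). */
	while ((nr = find_batch(start, idx, BATCH)) > 0) {
		for (int i = 0; i < nr; i++)
			slot_is_swap[idx[i]] = 0;	/* "swap it back in" */
		start = idx[nr - 1];			/* resume from the last index seen */
	}
	printf("all swap entries drained\n");
	return 0;
}
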
@@ -1348,6 +1431,7 @@
13481431 SetPageUptodate(page);
13491432 }
13501433
1434
+ trace_android_vh_set_shmem_page_flag(page);
13511435 swap = get_swap_page(page);
13521436 if (!swap.val)
13531437 goto redirty;
@@ -1362,9 +1446,11 @@
13621446 */
13631447 mutex_lock(&shmem_swaplist_mutex);
13641448 if (list_empty(&info->swaplist))
1365
- list_add_tail(&info->swaplist, &shmem_swaplist);
1449
+ list_add(&info->swaplist, &shmem_swaplist);
13661450
1367
- if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
1451
+ if (add_to_swap_cache(page, swap,
1452
+ __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN,
1453
+ NULL) == 0) {
13681454 spin_lock_irq(&info->lock);
13691455 shmem_recalc_inode(inode);
13701456 info->swapped++;
@@ -1447,11 +1533,11 @@
14471533 {
14481534 struct vm_area_struct pvma;
14491535 struct page *page;
1450
- struct vm_fault vmf;
1536
+ struct vm_fault vmf = {
1537
+ .vma = &pvma,
1538
+ };
14511539
14521540 shmem_pseudo_vma_init(&pvma, info, index);
1453
- vmf.vma = &pvma;
1454
- vmf.address = 0;
14551541 page = swap_cluster_readahead(swap, gfp, &vmf);
14561542 shmem_pseudo_vma_destroy(&pvma);
14571543
@@ -1462,23 +1548,14 @@
14621548 struct shmem_inode_info *info, pgoff_t index)
14631549 {
14641550 struct vm_area_struct pvma;
1465
- struct inode *inode = &info->vfs_inode;
1466
- struct address_space *mapping = inode->i_mapping;
1467
- pgoff_t idx, hindex;
1468
- void __rcu **results;
1551
+ struct address_space *mapping = info->vfs_inode.i_mapping;
1552
+ pgoff_t hindex;
14691553 struct page *page;
14701554
1471
- if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
1472
- return NULL;
1473
-
14741555 hindex = round_down(index, HPAGE_PMD_NR);
1475
- rcu_read_lock();
1476
- if (radix_tree_gang_lookup_slot(&mapping->i_pages, &results, &idx,
1477
- hindex, 1) && idx < hindex + HPAGE_PMD_NR) {
1478
- rcu_read_unlock();
1556
+ if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1,
1557
+ XA_PRESENT))
14791558 return NULL;
1480
- }
1481
- rcu_read_unlock();
14821559
14831560 shmem_pseudo_vma_init(&pvma, info, hindex);
14841561 page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN,
@@ -1486,6 +1563,8 @@
14861563 shmem_pseudo_vma_destroy(&pvma);
14871564 if (page)
14881565 prep_transhuge_page(page);
1566
+ else
1567
+ count_vm_event(THP_FILE_FALLBACK);
14891568 return page;
14901569 }
14911570
@@ -1493,7 +1572,11 @@
14931572 struct shmem_inode_info *info, pgoff_t index)
14941573 {
14951574 struct vm_area_struct pvma;
1496
- struct page *page;
1575
+ struct page *page = NULL;
1576
+
1577
+ trace_android_vh_shmem_alloc_page(&page);
1578
+ if (page)
1579
+ return page;
14971580
14981581 shmem_pseudo_vma_init(&pvma, info, index);
14991582 page = alloc_page_vma(gfp, &pvma, 0);
....@@ -1511,7 +1594,7 @@
15111594 int nr;
15121595 int err = -ENOSPC;
15131596
1514
- if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
1597
+ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
15151598 huge = false;
15161599 nr = huge ? HPAGE_PMD_NR : 1;
15171600
@@ -1589,11 +1672,11 @@
15891672 * a nice clean interface for us to replace oldpage by newpage there.
15901673 */
15911674 xa_lock_irq(&swap_mapping->i_pages);
1592
- error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage,
1593
- newpage);
1675
+ error = shmem_replace_entry(swap_mapping, swap_index, oldpage, newpage);
15941676 if (!error) {
1595
- __inc_node_page_state(newpage, NR_FILE_PAGES);
1596
- __dec_node_page_state(oldpage, NR_FILE_PAGES);
1677
+ mem_cgroup_migrate(oldpage, newpage);
1678
+ __inc_lruvec_page_state(newpage, NR_FILE_PAGES);
1679
+ __dec_lruvec_page_state(oldpage, NR_FILE_PAGES);
15971680 }
15981681 xa_unlock_irq(&swap_mapping->i_pages);
15991682
....@@ -1605,8 +1688,7 @@
16051688 */
16061689 oldpage = newpage;
16071690 } else {
1608
- mem_cgroup_migrate(oldpage, newpage);
1609
- lru_cache_add_anon(newpage);
1691
+ lru_cache_add(newpage);
16101692 *pagep = newpage;
16111693 }
16121694
@@ -1620,13 +1702,109 @@
16201702 }
16211703
16221704 /*
1705
+ * Swap in the page pointed to by *pagep.
1706
+ * Caller has to make sure that *pagep contains a valid swapped page.
1707
+ * Returns 0 and the page in pagep if success. On failure, returns the
1708
+ * error code and NULL in *pagep.
1709
+ */
1710
+static int shmem_swapin_page(struct inode *inode, pgoff_t index,
1711
+ struct page **pagep, enum sgp_type sgp,
1712
+ gfp_t gfp, struct vm_area_struct *vma,
1713
+ vm_fault_t *fault_type)
1714
+{
1715
+ struct address_space *mapping = inode->i_mapping;
1716
+ struct shmem_inode_info *info = SHMEM_I(inode);
1717
+ struct mm_struct *charge_mm = vma ? vma->vm_mm : current->mm;
1718
+ struct page *page;
1719
+ swp_entry_t swap;
1720
+ int error;
1721
+
1722
+ VM_BUG_ON(!*pagep || !xa_is_value(*pagep));
1723
+ swap = radix_to_swp_entry(*pagep);
1724
+ *pagep = NULL;
1725
+
1726
+ /* Look it up and read it in.. */
1727
+ page = lookup_swap_cache(swap, NULL, 0);
1728
+ if (!page) {
1729
+ /* Or update major stats only when swapin succeeds?? */
1730
+ if (fault_type) {
1731
+ *fault_type |= VM_FAULT_MAJOR;
1732
+ count_vm_event(PGMAJFAULT);
1733
+ count_memcg_event_mm(charge_mm, PGMAJFAULT);
1734
+ }
1735
+ /* Here we actually start the io */
1736
+ page = shmem_swapin(swap, gfp, info, index);
1737
+ if (!page) {
1738
+ error = -ENOMEM;
1739
+ goto failed;
1740
+ }
1741
+ }
1742
+
1743
+ /* We have to do this with page locked to prevent races */
1744
+ lock_page(page);
1745
+ if (!PageSwapCache(page) || page_private(page) != swap.val ||
1746
+ !shmem_confirm_swap(mapping, index, swap)) {
1747
+ error = -EEXIST;
1748
+ goto unlock;
1749
+ }
1750
+ if (!PageUptodate(page)) {
1751
+ error = -EIO;
1752
+ goto failed;
1753
+ }
1754
+ wait_on_page_writeback(page);
1755
+
1756
+ /*
1757
+ * Some architectures may have to restore extra metadata to the
1758
+ * physical page after reading from swap.
1759
+ */
1760
+ arch_swap_restore(swap, page);
1761
+
1762
+ if (shmem_should_replace_page(page, gfp)) {
1763
+ error = shmem_replace_page(&page, gfp, info, index);
1764
+ if (error)
1765
+ goto failed;
1766
+ }
1767
+
1768
+ error = shmem_add_to_page_cache(page, mapping, index,
1769
+ swp_to_radix_entry(swap), gfp,
1770
+ charge_mm);
1771
+ if (error)
1772
+ goto failed;
1773
+
1774
+ spin_lock_irq(&info->lock);
1775
+ info->swapped--;
1776
+ shmem_recalc_inode(inode);
1777
+ spin_unlock_irq(&info->lock);
1778
+
1779
+ if (sgp == SGP_WRITE)
1780
+ mark_page_accessed(page);
1781
+
1782
+ delete_from_swap_cache(page);
1783
+ set_page_dirty(page);
1784
+ swap_free(swap);
1785
+
1786
+ *pagep = page;
1787
+ return 0;
1788
+failed:
1789
+ if (!shmem_confirm_swap(mapping, index, swap))
1790
+ error = -EEXIST;
1791
+unlock:
1792
+ if (page) {
1793
+ unlock_page(page);
1794
+ put_page(page);
1795
+ }
1796
+
1797
+ return error;
1798
+}
1799
+
1800
+/*
16231801 * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
16241802 *
16251803 * If we allocate a new one we do not mark it dirty. That's up to the
16261804 * vm. If we swap it in we mark it dirty since we also free the swap
16271805 * entry since a page cannot live in both the swap and page cache.
16281806 *
1629
- * fault_mm and fault_type are only supplied by shmem_fault:
1807
+ * vma, vmf, and fault_type are only supplied by shmem_fault:
16301808 * otherwise they are NULL.
16311809 */
16321810 static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
@@ -1638,9 +1816,7 @@
16381816 struct shmem_inode_info *info = SHMEM_I(inode);
16391817 struct shmem_sb_info *sbinfo;
16401818 struct mm_struct *charge_mm;
1641
- struct mem_cgroup *memcg;
16421819 struct page *page;
1643
- swp_entry_t swap;
16441820 enum sgp_type sgp_huge = sgp;
16451821 pgoff_t hindex = index;
16461822 int error;
@@ -1652,19 +1828,37 @@
16521828 if (sgp == SGP_NOHUGE || sgp == SGP_HUGE)
16531829 sgp = SGP_CACHE;
16541830 repeat:
1655
- swap.val = 0;
1656
- page = find_lock_entry(mapping, index);
1657
- if (radix_tree_exceptional_entry(page)) {
1658
- swap = radix_to_swp_entry(page);
1659
- page = NULL;
1660
- }
1661
-
16621831 if (sgp <= SGP_CACHE &&
16631832 ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
1664
- error = -EINVAL;
1665
- goto unlock;
1833
+ return -EINVAL;
16661834 }
16671835
1836
+ sbinfo = SHMEM_SB(inode->i_sb);
1837
+ charge_mm = vma ? vma->vm_mm : current->mm;
1838
+
1839
+ page = find_lock_entry(mapping, index);
1840
+
1841
+ if (page && vma && userfaultfd_minor(vma)) {
1842
+ if (!xa_is_value(page)) {
1843
+ unlock_page(page);
1844
+ put_page(page);
1845
+ }
1846
+ *fault_type = handle_userfault(vmf, VM_UFFD_MINOR);
1847
+ return 0;
1848
+ }
1849
+
1850
+ if (xa_is_value(page)) {
1851
+ error = shmem_swapin_page(inode, index, &page,
1852
+ sgp, gfp, vma, fault_type);
1853
+ if (error == -EEXIST)
1854
+ goto repeat;
1855
+
1856
+ *pagep = page;
1857
+ return error;
1858
+ }
1859
+
1860
+ if (page)
1861
+ hindex = page->index;
16681862 if (page && sgp == SGP_WRITE)
16691863 mark_page_accessed(page);
16701864
@@ -1675,230 +1869,141 @@
16751869 unlock_page(page);
16761870 put_page(page);
16771871 page = NULL;
1872
+ hindex = index;
16781873 }
1679
- if (page || (sgp == SGP_READ && !swap.val)) {
1680
- *pagep = page;
1681
- return 0;
1682
- }
1874
+ if (page || sgp == SGP_READ)
1875
+ goto out;
16831876
16841877 /*
16851878 * Fast cache lookup did not find it:
16861879 * bring it back from swap or allocate.
16871880 */
1688
- sbinfo = SHMEM_SB(inode->i_sb);
1689
- charge_mm = vma ? vma->vm_mm : current->mm;
16901881
1691
- if (swap.val) {
1692
- /* Look it up and read it in.. */
1693
- page = lookup_swap_cache(swap, NULL, 0);
1694
- if (!page) {
1695
- /* Or update major stats only when swapin succeeds?? */
1696
- if (fault_type) {
1697
- *fault_type |= VM_FAULT_MAJOR;
1698
- count_vm_event(PGMAJFAULT);
1699
- count_memcg_event_mm(charge_mm, PGMAJFAULT);
1700
- }
1701
- /* Here we actually start the io */
1702
- page = shmem_swapin(swap, gfp, info, index);
1703
- if (!page) {
1704
- error = -ENOMEM;
1705
- goto failed;
1706
- }
1707
- }
1882
+ if (vma && userfaultfd_missing(vma)) {
1883
+ *fault_type = handle_userfault(vmf, VM_UFFD_MISSING);
1884
+ return 0;
1885
+ }
17081886
1709
- /* We have to do this with page locked to prevent races */
1710
- lock_page(page);
1711
- if (!PageSwapCache(page) || page_private(page) != swap.val ||
1712
- !shmem_confirm_swap(mapping, index, swap)) {
1713
- error = -EEXIST; /* try again */
1714
- goto unlock;
1715
- }
1716
- if (!PageUptodate(page)) {
1717
- error = -EIO;
1718
- goto failed;
1719
- }
1720
- wait_on_page_writeback(page);
1887
+ /* shmem_symlink() */
1888
+ if (mapping->a_ops != &shmem_aops)
1889
+ goto alloc_nohuge;
1890
+ if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE)
1891
+ goto alloc_nohuge;
1892
+ if (shmem_huge == SHMEM_HUGE_FORCE)
1893
+ goto alloc_huge;
1894
+ switch (sbinfo->huge) {
1895
+ case SHMEM_HUGE_NEVER:
1896
+ goto alloc_nohuge;
1897
+ case SHMEM_HUGE_WITHIN_SIZE: {
1898
+ loff_t i_size;
1899
+ pgoff_t off;
17211900
1722
- if (shmem_should_replace_page(page, gfp)) {
1723
- error = shmem_replace_page(&page, gfp, info, index);
1724
- if (error)
1725
- goto failed;
1726
- }
1727
-
1728
- error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
1729
- false);
1730
- if (!error) {
1731
- error = shmem_add_to_page_cache(page, mapping, index,
1732
- swp_to_radix_entry(swap));
1733
- /*
1734
- * We already confirmed swap under page lock, and make
1735
- * no memory allocation here, so usually no possibility
1736
- * of error; but free_swap_and_cache() only trylocks a
1737
- * page, so it is just possible that the entry has been
1738
- * truncated or holepunched since swap was confirmed.
1739
- * shmem_undo_range() will have done some of the
1740
- * unaccounting, now delete_from_swap_cache() will do
1741
- * the rest.
1742
- * Reset swap.val? No, leave it so "failed" goes back to
1743
- * "repeat": reading a hole and writing should succeed.
1744
- */
1745
- if (error) {
1746
- mem_cgroup_cancel_charge(page, memcg, false);
1747
- delete_from_swap_cache(page);
1748
- }
1749
- }
1750
- if (error)
1751
- goto failed;
1752
-
1753
- mem_cgroup_commit_charge(page, memcg, true, false);
1754
-
1755
- spin_lock_irq(&info->lock);
1756
- info->swapped--;
1757
- shmem_recalc_inode(inode);
1758
- spin_unlock_irq(&info->lock);
1759
-
1760
- if (sgp == SGP_WRITE)
1761
- mark_page_accessed(page);
1762
-
1763
- delete_from_swap_cache(page);
1764
- set_page_dirty(page);
1765
- swap_free(swap);
1766
-
1767
- } else {
1768
- if (vma && userfaultfd_missing(vma)) {
1769
- *fault_type = handle_userfault(vmf, VM_UFFD_MISSING);
1770
- return 0;
1771
- }
1772
-
1773
- /* shmem_symlink() */
1774
- if (mapping->a_ops != &shmem_aops)
1775
- goto alloc_nohuge;
1776
- if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE)
1777
- goto alloc_nohuge;
1778
- if (shmem_huge == SHMEM_HUGE_FORCE)
1901
+ off = round_up(index, HPAGE_PMD_NR);
1902
+ i_size = round_up(i_size_read(inode), PAGE_SIZE);
1903
+ if (i_size >= HPAGE_PMD_SIZE &&
1904
+ i_size >> PAGE_SHIFT >= off)
17791905 goto alloc_huge;
1780
- switch (sbinfo->huge) {
1781
- loff_t i_size;
1782
- pgoff_t off;
1783
- case SHMEM_HUGE_NEVER:
1784
- goto alloc_nohuge;
1785
- case SHMEM_HUGE_WITHIN_SIZE:
1786
- off = round_up(index, HPAGE_PMD_NR);
1787
- i_size = round_up(i_size_read(inode), PAGE_SIZE);
1788
- if (i_size >= HPAGE_PMD_SIZE &&
1789
- i_size >> PAGE_SHIFT >= off)
1790
- goto alloc_huge;
1791
- /* fallthrough */
1792
- case SHMEM_HUGE_ADVISE:
1793
- if (sgp_huge == SGP_HUGE)
1794
- goto alloc_huge;
1795
- /* TODO: implement fadvise() hints */
1796
- goto alloc_nohuge;
1797
- }
1906
+
1907
+ fallthrough;
1908
+ }
1909
+ case SHMEM_HUGE_ADVISE:
1910
+ if (sgp_huge == SGP_HUGE)
1911
+ goto alloc_huge;
1912
+ /* TODO: implement fadvise() hints */
1913
+ goto alloc_nohuge;
1914
+ }
17981915
17991916 alloc_huge:
1800
- page = shmem_alloc_and_acct_page(gfp, inode, index, true);
1801
- if (IS_ERR(page)) {
1802
-alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, inode,
1803
- index, false);
1804
- }
1805
- if (IS_ERR(page)) {
1806
- int retry = 5;
1807
- error = PTR_ERR(page);
1808
- page = NULL;
1809
- if (error != -ENOSPC)
1810
- goto failed;
1811
- /*
1812
- * Try to reclaim some spece by splitting a huge page
1813
- * beyond i_size on the filesystem.
1814
- */
1815
- while (retry--) {
1816
- int ret;
1817
- ret = shmem_unused_huge_shrink(sbinfo, NULL, 1);
1818
- if (ret == SHRINK_STOP)
1819
- break;
1820
- if (ret)
1821
- goto alloc_nohuge;
1822
- }
1823
- goto failed;
1824
- }
1917
+ page = shmem_alloc_and_acct_page(gfp, inode, index, true);
1918
+ if (IS_ERR(page)) {
1919
+alloc_nohuge:
1920
+ page = shmem_alloc_and_acct_page(gfp, inode,
1921
+ index, false);
1922
+ }
1923
+ if (IS_ERR(page)) {
1924
+ int retry = 5;
18251925
1826
- if (PageTransHuge(page))
1827
- hindex = round_down(index, HPAGE_PMD_NR);
1828
- else
1829
- hindex = index;
1830
-
1831
- if (sgp == SGP_WRITE)
1832
- __SetPageReferenced(page);
1833
-
1834
- error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
1835
- PageTransHuge(page));
1836
- if (error)
1837
- goto unacct;
1838
- error = radix_tree_maybe_preload_order(gfp & GFP_RECLAIM_MASK,
1839
- compound_order(page));
1840
- if (!error) {
1841
- error = shmem_add_to_page_cache(page, mapping, hindex,
1842
- NULL);
1843
- radix_tree_preload_end();
1844
- }
1845
- if (error) {
1846
- mem_cgroup_cancel_charge(page, memcg,
1847
- PageTransHuge(page));
1848
- goto unacct;
1849
- }
1850
- mem_cgroup_commit_charge(page, memcg, false,
1851
- PageTransHuge(page));
1852
- lru_cache_add_anon(page);
1853
-
1854
- spin_lock_irq(&info->lock);
1855
- info->alloced += 1 << compound_order(page);
1856
- inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page);
1857
- shmem_recalc_inode(inode);
1858
- spin_unlock_irq(&info->lock);
1859
- alloced = true;
1860
-
1861
- if (PageTransHuge(page) &&
1862
- DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
1863
- hindex + HPAGE_PMD_NR - 1) {
1864
- /*
1865
- * Part of the huge page is beyond i_size: subject
1866
- * to shrink under memory pressure.
1867
- */
1868
- spin_lock(&sbinfo->shrinklist_lock);
1869
- /*
1870
- * _careful to defend against unlocked access to
1871
- * ->shrink_list in shmem_unused_huge_shrink()
1872
- */
1873
- if (list_empty_careful(&info->shrinklist)) {
1874
- list_add_tail(&info->shrinklist,
1875
- &sbinfo->shrinklist);
1876
- sbinfo->shrinklist_len++;
1877
- }
1878
- spin_unlock(&sbinfo->shrinklist_lock);
1879
- }
1880
-
1926
+ error = PTR_ERR(page);
1927
+ page = NULL;
1928
+ if (error != -ENOSPC)
1929
+ goto unlock;
18811930 /*
1882
- * Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
1931
+ * Try to reclaim some space by splitting a huge page
1932
+ * beyond i_size on the filesystem.
18831933 */
1884
- if (sgp == SGP_FALLOC)
1885
- sgp = SGP_WRITE;
1934
+ while (retry--) {
1935
+ int ret;
1936
+
1937
+ ret = shmem_unused_huge_shrink(sbinfo, NULL, 1);
1938
+ if (ret == SHRINK_STOP)
1939
+ break;
1940
+ if (ret)
1941
+ goto alloc_nohuge;
1942
+ }
1943
+ goto unlock;
1944
+ }
1945
+
1946
+ if (PageTransHuge(page))
1947
+ hindex = round_down(index, HPAGE_PMD_NR);
1948
+ else
1949
+ hindex = index;
1950
+
1951
+ if (sgp == SGP_WRITE)
1952
+ __SetPageReferenced(page);
1953
+
1954
+ error = shmem_add_to_page_cache(page, mapping, hindex,
1955
+ NULL, gfp & GFP_RECLAIM_MASK,
1956
+ charge_mm);
1957
+ if (error)
1958
+ goto unacct;
1959
+ lru_cache_add(page);
1960
+
1961
+ spin_lock_irq(&info->lock);
1962
+ info->alloced += compound_nr(page);
1963
+ inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page);
1964
+ shmem_recalc_inode(inode);
1965
+ spin_unlock_irq(&info->lock);
1966
+ alloced = true;
1967
+
1968
+ if (PageTransHuge(page) &&
1969
+ DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
1970
+ hindex + HPAGE_PMD_NR - 1) {
1971
+ /*
1972
+ * Part of the huge page is beyond i_size: subject
1973
+ * to shrink under memory pressure.
1974
+ */
1975
+ spin_lock(&sbinfo->shrinklist_lock);
1976
+ /*
1977
+ * _careful to defend against unlocked access to
1978
+ * ->shrink_list in shmem_unused_huge_shrink()
1979
+ */
1980
+ if (list_empty_careful(&info->shrinklist)) {
1981
+ list_add_tail(&info->shrinklist,
1982
+ &sbinfo->shrinklist);
1983
+ sbinfo->shrinklist_len++;
1984
+ }
1985
+ spin_unlock(&sbinfo->shrinklist_lock);
1986
+ }
1987
+
1988
+ /*
1989
+ * Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
1990
+ */
1991
+ if (sgp == SGP_FALLOC)
1992
+ sgp = SGP_WRITE;
18861993 clear:
1887
- /*
1888
- * Let SGP_WRITE caller clear ends if write does not fill page;
1889
- * but SGP_FALLOC on a page fallocated earlier must initialize
1890
- * it now, lest undo on failure cancel our earlier guarantee.
1891
- */
1892
- if (sgp != SGP_WRITE && !PageUptodate(page)) {
1893
- struct page *head = compound_head(page);
1894
- int i;
1994
+ /*
1995
+ * Let SGP_WRITE caller clear ends if write does not fill page;
1996
+ * but SGP_FALLOC on a page fallocated earlier must initialize
1997
+ * it now, lest undo on failure cancel our earlier guarantee.
1998
+ */
1999
+ if (sgp != SGP_WRITE && !PageUptodate(page)) {
2000
+ int i;
18952001
1896
- for (i = 0; i < (1 << compound_order(head)); i++) {
1897
- clear_highpage(head + i);
1898
- flush_dcache_page(head + i);
1899
- }
1900
- SetPageUptodate(head);
2002
+ for (i = 0; i < compound_nr(page); i++) {
2003
+ clear_highpage(page + i);
2004
+ flush_dcache_page(page + i);
19012005 }
2006
+ SetPageUptodate(page);
19022007 }
19032008
19042009 /* Perhaps the file has been truncated since we checked */
@@ -1914,6 +2019,7 @@
19142019 error = -EINVAL;
19152020 goto unlock;
19162021 }
2022
+out:
19172023 *pagep = page + index - hindex;
19182024 return 0;
19192025
@@ -1921,16 +2027,13 @@
19212027 * Error recovery.
19222028 */
19232029 unacct:
1924
- shmem_inode_unacct_blocks(inode, 1 << compound_order(page));
2030
+ shmem_inode_unacct_blocks(inode, compound_nr(page));
19252031
19262032 if (PageTransHuge(page)) {
19272033 unlock_page(page);
19282034 put_page(page);
19292035 goto alloc_nohuge;
19302036 }
1931
-failed:
1932
- if (swap.val && !shmem_confirm_swap(mapping, index, swap))
1933
- error = -EEXIST;
19342037 unlock:
19352038 if (page) {
19362039 unlock_page(page);
@@ -1942,7 +2045,7 @@
19422045 spin_unlock_irq(&info->lock);
19432046 goto repeat;
19442047 }
1945
- if (error == -EEXIST) /* from above or from radix_tree_insert */
2048
+ if (error == -EEXIST)
19462049 goto repeat;
19472050 return error;
19482051 }
@@ -1994,16 +2097,14 @@
19942097 shmem_falloc->waitq &&
19952098 vmf->pgoff >= shmem_falloc->start &&
19962099 vmf->pgoff < shmem_falloc->next) {
2100
+ struct file *fpin;
19972101 wait_queue_head_t *shmem_falloc_waitq;
19982102 DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);
19992103
20002104 ret = VM_FAULT_NOPAGE;
2001
- if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) &&
2002
- !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
2003
- /* It's polite to up mmap_sem if we can */
2004
- up_read(&vma->vm_mm->mmap_sem);
2105
+ fpin = maybe_unlock_mmap_for_io(vmf, NULL);
2106
+ if (fpin)
20052107 ret = VM_FAULT_RETRY;
2006
- }
20072108
20082109 shmem_falloc_waitq = shmem_falloc->waitq;
20092110 prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
@@ -2021,6 +2122,9 @@
20212122 spin_lock(&inode->i_lock);
20222123 finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
20232124 spin_unlock(&inode->i_lock);
2125
+
2126
+ if (fpin)
2127
+ fput(fpin);
20242128 return ret;
20252129 }
20262130 spin_unlock(&inode->i_lock);
@@ -2059,7 +2163,7 @@
20592163 get_area = current->mm->get_unmapped_area;
20602164 addr = get_area(file, uaddr, len, pgoff, flags);
20612165
2062
- if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
2166
+ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
20632167 return addr;
20642168 if (IS_ERR_VALUE(addr))
20652169 return addr;
@@ -2179,26 +2283,18 @@
21792283 static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
21802284 {
21812285 struct shmem_inode_info *info = SHMEM_I(file_inode(file));
2286
+ int ret;
21822287
2183
- if (info->seals & F_SEAL_FUTURE_WRITE) {
2184
- /*
2185
- * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
2186
- * "future write" seal active.
2187
- */
2188
- if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
2189
- return -EPERM;
2288
+ ret = seal_check_future_write(info->seals, vma);
2289
+ if (ret)
2290
+ return ret;
21902291
2191
- /*
2192
- * Since the F_SEAL_FUTURE_WRITE seals allow for a MAP_SHARED
2193
- * read-only mapping, take care to not allow mprotect to revert
2194
- * protections.
2195
- */
2196
- vma->vm_flags &= ~(VM_MAYWRITE);
2197
- }
2292
+ /* arm64 - allow memory tagging on RAM-based files */
2293
+ vma->vm_flags |= VM_MTE_ALLOWED;
21982294
21992295 file_accessed(file);
22002296 vma->vm_ops = &shmem_vm_ops;
2201
- if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
2297
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
22022298 ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
22032299 (vma->vm_end & HPAGE_PMD_MASK)) {
22042300 khugepaged_enter(vma, vma->vm_flags);
@@ -2212,13 +2308,14 @@
22122308 struct inode *inode;
22132309 struct shmem_inode_info *info;
22142310 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
2311
+ ino_t ino;
22152312
2216
- if (shmem_reserve_inode(sb))
2313
+ if (shmem_reserve_inode(sb, &ino))
22172314 return NULL;
22182315
22192316 inode = new_inode(sb);
22202317 if (inode) {
2221
- inode->i_ino = get_next_ino();
2318
+ inode->i_ino = ino;
22222319 inode_init_owner(inode, dir, mode);
22232320 inode->i_blocks = 0;
22242321 inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
@@ -2226,6 +2323,7 @@
22262323 info = SHMEM_I(inode);
22272324 memset(info, 0, (char *)inode - (char *)info);
22282325 spin_lock_init(&info->lock);
2326
+ atomic_set(&info->stop_eviction, 0);
22292327 info->seals = F_SEAL_SEAL;
22302328 info->flags = flags & VM_NORESERVE;
22312329 INIT_LIST_HEAD(&info->shrinklist);
@@ -2272,28 +2370,25 @@
22722370 return mapping->a_ops == &shmem_aops;
22732371 }
22742372
2275
-static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
2276
- pmd_t *dst_pmd,
2277
- struct vm_area_struct *dst_vma,
2278
- unsigned long dst_addr,
2279
- unsigned long src_addr,
2280
- bool zeropage,
2281
- struct page **pagep)
2373
+#ifdef CONFIG_USERFAULTFD
2374
+int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
2375
+ pmd_t *dst_pmd,
2376
+ struct vm_area_struct *dst_vma,
2377
+ unsigned long dst_addr,
2378
+ unsigned long src_addr,
2379
+ bool zeropage,
2380
+ struct page **pagep)
22822381 {
22832382 struct inode *inode = file_inode(dst_vma->vm_file);
22842383 struct shmem_inode_info *info = SHMEM_I(inode);
22852384 struct address_space *mapping = inode->i_mapping;
22862385 gfp_t gfp = mapping_gfp_mask(mapping);
22872386 pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
2288
- struct mem_cgroup *memcg;
2289
- spinlock_t *ptl;
22902387 void *page_kaddr;
22912388 struct page *page;
2292
- pte_t _dst_pte, *dst_pte;
22932389 int ret;
2294
- pgoff_t offset, max_off;
2390
+ pgoff_t max_off;
22952391
2296
- ret = -ENOMEM;
22972392 if (!shmem_inode_acct_block(inode, 1)) {
22982393 /*
22992394 * We may have got a page, returned -ENOENT triggering a retry,
@@ -2304,29 +2399,30 @@
23042399 put_page(*pagep);
23052400 *pagep = NULL;
23062401 }
2307
- goto out;
2402
+ return -ENOMEM;
23082403 }
23092404
23102405 if (!*pagep) {
2406
+ ret = -ENOMEM;
23112407 page = shmem_alloc_page(gfp, info, pgoff);
23122408 if (!page)
23132409 goto out_unacct_blocks;
23142410
2315
- if (!zeropage) { /* mcopy_atomic */
2411
+ if (!zeropage) { /* COPY */
23162412 page_kaddr = kmap_atomic(page);
23172413 ret = copy_from_user(page_kaddr,
23182414 (const void __user *)src_addr,
23192415 PAGE_SIZE);
23202416 kunmap_atomic(page_kaddr);
23212417
2322
- /* fallback to copy_from_user outside mmap_sem */
2418
+ /* fallback to copy_from_user outside mmap_lock */
23232419 if (unlikely(ret)) {
23242420 *pagep = page;
2325
- shmem_inode_unacct_blocks(inode, 1);
2421
+ ret = -ENOENT;
23262422 /* don't free the page */
2327
- return -ENOENT;
2423
+ goto out_unacct_blocks;
23282424 }
2329
- } else { /* mfill_zeropage_atomic */
2425
+ } else { /* ZEROPAGE */
23302426 clear_highpage(page);
23312427 }
23322428 } else {
@@ -2334,57 +2430,26 @@
23342430 *pagep = NULL;
23352431 }
23362432
2337
- VM_BUG_ON(PageLocked(page) || PageSwapBacked(page));
2433
+ VM_BUG_ON(PageLocked(page));
2434
+ VM_BUG_ON(PageSwapBacked(page));
23382435 __SetPageLocked(page);
23392436 __SetPageSwapBacked(page);
23402437 __SetPageUptodate(page);
23412438
23422439 ret = -EFAULT;
2343
- offset = linear_page_index(dst_vma, dst_addr);
23442440 max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
2345
- if (unlikely(offset >= max_off))
2441
+ if (unlikely(pgoff >= max_off))
23462442 goto out_release;
23472443
2348
- ret = mem_cgroup_try_charge_delay(page, dst_mm, gfp, &memcg, false);
2444
+ ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL,
2445
+ gfp & GFP_RECLAIM_MASK, dst_mm);
23492446 if (ret)
23502447 goto out_release;
23512448
2352
- ret = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK);
2353
- if (!ret) {
2354
- ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL);
2355
- radix_tree_preload_end();
2356
- }
2449
+ ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
2450
+ page, true, false);
23572451 if (ret)
2358
- goto out_release_uncharge;
2359
-
2360
- mem_cgroup_commit_charge(page, memcg, false, false);
2361
-
2362
- _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
2363
- if (dst_vma->vm_flags & VM_WRITE)
2364
- _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));
2365
- else {
2366
- /*
2367
- * We don't set the pte dirty if the vma has no
2368
- * VM_WRITE permission, so mark the page dirty or it
2369
- * could be freed from under us. We could do it
2370
- * unconditionally before unlock_page(), but doing it
2371
- * only if VM_WRITE is not set is faster.
2372
- */
2373
- set_page_dirty(page);
2374
- }
2375
-
2376
- dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
2377
-
2378
- ret = -EFAULT;
2379
- max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
2380
- if (unlikely(offset >= max_off))
2381
- goto out_release_uncharge_unlock;
2382
-
2383
- ret = -EEXIST;
2384
- if (!pte_none(*dst_pte))
2385
- goto out_release_uncharge_unlock;
2386
-
2387
- lru_cache_add_anon(page);
2452
+ goto out_delete_from_cache;
23882453
23892454 spin_lock_irq(&info->lock);
23902455 info->alloced++;
....@@ -2392,52 +2457,19 @@
23922457 shmem_recalc_inode(inode);
23932458 spin_unlock_irq(&info->lock);
23942459
2395
- inc_mm_counter(dst_mm, mm_counter_file(page));
2396
- page_add_file_rmap(page, false);
2397
- set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
2398
-
2399
- /* No need to invalidate - it was non-present before */
2400
- update_mmu_cache(dst_vma, dst_addr, dst_pte);
2401
- pte_unmap_unlock(dst_pte, ptl);
2460
+ SetPageDirty(page);
24022461 unlock_page(page);
2403
- ret = 0;
2404
-out:
2405
- return ret;
2406
-out_release_uncharge_unlock:
2407
- pte_unmap_unlock(dst_pte, ptl);
2408
- ClearPageDirty(page);
2462
+ return 0;
2463
+out_delete_from_cache:
24092464 delete_from_page_cache(page);
2410
-out_release_uncharge:
2411
- mem_cgroup_cancel_charge(page, memcg, false);
24122465 out_release:
24132466 unlock_page(page);
24142467 put_page(page);
24152468 out_unacct_blocks:
24162469 shmem_inode_unacct_blocks(inode, 1);
2417
- goto out;
2470
+ return ret;
24182471 }
2419
-
2420
-int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
2421
- pmd_t *dst_pmd,
2422
- struct vm_area_struct *dst_vma,
2423
- unsigned long dst_addr,
2424
- unsigned long src_addr,
2425
- struct page **pagep)
2426
-{
2427
- return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
2428
- dst_addr, src_addr, false, pagep);
2429
-}
2430
-
2431
-int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm,
2432
- pmd_t *dst_pmd,
2433
- struct vm_area_struct *dst_vma,
2434
- unsigned long dst_addr)
2435
-{
2436
- struct page *page = NULL;
2437
-
2438
- return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
2439
- dst_addr, 0, true, &page);
2440
-}
2472
+#endif /* CONFIG_USERFAULTFD */
24412473
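/*
 * Illustrative sketch, not part of the patch above: shmem_mfill_atomic_pte()
 * is what services a userspace UFFDIO_COPY (or UFFDIO_ZEROPAGE) issued
 * against a missing page of a tmpfs/memfd mapping.  A minimal caller, with
 * error handling trimmed, could look like the following; the names and
 * constants are the standard userfaultfd uapi, and userfaultfd() may need
 * CAP_SYS_PTRACE or vm.unprivileged_userfaultfd=1 depending on the kernel
 * configuration.
 */
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	int memfd = memfd_create("uffd-shmem-demo", 0);
	int uffd = syscall(SYS_userfaultfd, O_CLOEXEC);
	struct uffdio_api api = { .api = UFFD_API };
	char *dst, *src;

	ftruncate(memfd, page);
	dst = mmap(NULL, page, PROT_READ | PROT_WRITE, MAP_SHARED, memfd, 0);
	ioctl(uffd, UFFDIO_API, &api);

	struct uffdio_register reg = {
		.range = { .start = (unsigned long)dst, .len = page },
		.mode  = UFFDIO_REGISTER_MODE_MISSING,
	};
	ioctl(uffd, UFFDIO_REGISTER, &reg);

	/* Source buffer to be copied atomically into the missing shmem page. */
	src = mmap(NULL, page, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	memset(src, 0x5a, page);

	struct uffdio_copy copy = {
		.dst = (unsigned long)dst,
		.src = (unsigned long)src,
		.len = page,
	};
	ioctl(uffd, UFFDIO_COPY, &copy);	/* -> shmem_mfill_atomic_pte() */

	return dst[0] == 0x5a ? 0 : 1;
}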
24422474 #ifdef CONFIG_TMPFS
24432475 static const struct inode_operations shmem_symlink_inode_operations;
....@@ -2617,7 +2649,7 @@
26172649 }
26182650
26192651 /*
2620
- * llseek SEEK_DATA or SEEK_HOLE through the radix_tree.
2652
+ * llseek SEEK_DATA or SEEK_HOLE through the page cache.
26212653 */
26222654 static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
26232655 pgoff_t index, pgoff_t end, int whence)
....@@ -2647,7 +2679,7 @@
26472679 index = indices[i];
26482680 }
26492681 page = pvec.pages[i];
2650
- if (page && !radix_tree_exceptional_entry(page)) {
2682
+ if (page && !xa_is_value(page)) {
26512683 if (!PageUptodate(page))
26522684 page = NULL;
26532685 }
....@@ -2943,7 +2975,7 @@
29432975 * first link must skip that, to get the accounting right.
29442976 */
29452977 if (inode->i_nlink) {
2946
- ret = shmem_reserve_inode(inode->i_sb);
2978
+ ret = shmem_reserve_inode(inode->i_sb, NULL);
29472979 if (ret)
29482980 goto out;
29492981 }
....@@ -3095,12 +3127,9 @@
30953127
30963128 error = security_inode_init_security(inode, dir, &dentry->d_name,
30973129 shmem_initxattrs, NULL);
3098
- if (error) {
3099
- if (error != -EOPNOTSUPP) {
3100
- iput(inode);
3101
- return error;
3102
- }
3103
- error = 0;
3130
+ if (error && error != -EOPNOTSUPP) {
3131
+ iput(inode);
3132
+ return error;
31043133 }
31053134
31063135 inode->i_size = len-1;
....@@ -3192,7 +3221,7 @@
31923221 new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len,
31933222 GFP_KERNEL);
31943223 if (!new_xattr->name) {
3195
- kfree(new_xattr);
3224
+ kvfree(new_xattr);
31963225 return -ENOMEM;
31973226 }
31983227
....@@ -3209,7 +3238,8 @@
32093238
32103239 static int shmem_xattr_handler_get(const struct xattr_handler *handler,
32113240 struct dentry *unused, struct inode *inode,
3212
- const char *name, void *buffer, size_t size)
3241
+ const char *name, void *buffer, size_t size,
3242
+ int flags)
32133243 {
32143244 struct shmem_inode_info *info = SHMEM_I(inode);
32153245
....@@ -3225,7 +3255,7 @@
32253255 struct shmem_inode_info *info = SHMEM_I(inode);
32263256
32273257 name = xattr_full_name(handler, name);
3228
- return simple_xattr_set(&info->xattrs, name, value, size, flags);
3258
+ return simple_xattr_set(&info->xattrs, name, value, size, flags, NULL);
32293259 }
32303260
32313261 static const struct xattr_handler shmem_security_xattr_handler = {
....@@ -3352,16 +3382,162 @@
33523382 .fh_to_dentry = shmem_fh_to_dentry,
33533383 };
33543384
3355
-static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
3356
- bool remount)
3385
+enum shmem_param {
3386
+ Opt_gid,
3387
+ Opt_huge,
3388
+ Opt_mode,
3389
+ Opt_mpol,
3390
+ Opt_nr_blocks,
3391
+ Opt_nr_inodes,
3392
+ Opt_size,
3393
+ Opt_uid,
3394
+ Opt_inode32,
3395
+ Opt_inode64,
3396
+};
3397
+
3398
+static const struct constant_table shmem_param_enums_huge[] = {
3399
+ {"never", SHMEM_HUGE_NEVER },
3400
+ {"always", SHMEM_HUGE_ALWAYS },
3401
+ {"within_size", SHMEM_HUGE_WITHIN_SIZE },
3402
+ {"advise", SHMEM_HUGE_ADVISE },
3403
+ {}
3404
+};
3405
+
3406
+const struct fs_parameter_spec shmem_fs_parameters[] = {
3407
+ fsparam_u32 ("gid", Opt_gid),
3408
+ fsparam_enum ("huge", Opt_huge, shmem_param_enums_huge),
3409
+ fsparam_u32oct("mode", Opt_mode),
3410
+ fsparam_string("mpol", Opt_mpol),
3411
+ fsparam_string("nr_blocks", Opt_nr_blocks),
3412
+ fsparam_string("nr_inodes", Opt_nr_inodes),
3413
+ fsparam_string("size", Opt_size),
3414
+ fsparam_u32 ("uid", Opt_uid),
3415
+ fsparam_flag ("inode32", Opt_inode32),
3416
+ fsparam_flag ("inode64", Opt_inode64),
3417
+ {}
3418
+};
3419
+
3420
+static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
33573421 {
3358
- char *this_char, *value, *rest;
3359
- struct mempolicy *mpol = NULL;
3360
- uid_t uid;
3361
- gid_t gid;
3422
+ struct shmem_options *ctx = fc->fs_private;
3423
+ struct fs_parse_result result;
3424
+ unsigned long long size;
3425
+ char *rest;
3426
+ int opt;
3427
+ kuid_t kuid;
3428
+ kgid_t kgid;
3429
+
3430
+ opt = fs_parse(fc, shmem_fs_parameters, param, &result);
3431
+ if (opt < 0)
3432
+ return opt;
3433
+
3434
+ switch (opt) {
3435
+ case Opt_size:
3436
+ size = memparse(param->string, &rest);
3437
+ if (*rest == '%') {
3438
+ size <<= PAGE_SHIFT;
3439
+ size *= totalram_pages();
3440
+ do_div(size, 100);
3441
+ rest++;
3442
+ }
3443
+ if (*rest)
3444
+ goto bad_value;
3445
+ ctx->blocks = DIV_ROUND_UP(size, PAGE_SIZE);
3446
+ ctx->seen |= SHMEM_SEEN_BLOCKS;
3447
+ break;
3448
+ case Opt_nr_blocks:
3449
+ ctx->blocks = memparse(param->string, &rest);
3450
+ if (*rest)
3451
+ goto bad_value;
3452
+ ctx->seen |= SHMEM_SEEN_BLOCKS;
3453
+ break;
3454
+ case Opt_nr_inodes:
3455
+ ctx->inodes = memparse(param->string, &rest);
3456
+ if (*rest)
3457
+ goto bad_value;
3458
+ ctx->seen |= SHMEM_SEEN_INODES;
3459
+ break;
3460
+ case Opt_mode:
3461
+ ctx->mode = result.uint_32 & 07777;
3462
+ break;
3463
+ case Opt_uid:
3464
+ kuid = make_kuid(current_user_ns(), result.uint_32);
3465
+ if (!uid_valid(kuid))
3466
+ goto bad_value;
3467
+
3468
+ /*
3469
+ * The requested uid must be representable in the
3470
+ * filesystem's idmapping.
3471
+ */
3472
+ if (!kuid_has_mapping(fc->user_ns, kuid))
3473
+ goto bad_value;
3474
+
3475
+ ctx->uid = kuid;
3476
+ break;
3477
+ case Opt_gid:
3478
+ kgid = make_kgid(current_user_ns(), result.uint_32);
3479
+ if (!gid_valid(kgid))
3480
+ goto bad_value;
3481
+
3482
+ /*
3483
+ * The requested gid must be representable in the
3484
+ * filesystem's idmapping.
3485
+ */
3486
+ if (!kgid_has_mapping(fc->user_ns, kgid))
3487
+ goto bad_value;
3488
+
3489
+ ctx->gid = kgid;
3490
+ break;
3491
+ case Opt_huge:
3492
+ ctx->huge = result.uint_32;
3493
+ if (ctx->huge != SHMEM_HUGE_NEVER &&
3494
+ !(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
3495
+ has_transparent_hugepage()))
3496
+ goto unsupported_parameter;
3497
+ ctx->seen |= SHMEM_SEEN_HUGE;
3498
+ break;
3499
+ case Opt_mpol:
3500
+ if (IS_ENABLED(CONFIG_NUMA)) {
3501
+ mpol_put(ctx->mpol);
3502
+ ctx->mpol = NULL;
3503
+ if (mpol_parse_str(param->string, &ctx->mpol))
3504
+ goto bad_value;
3505
+ break;
3506
+ }
3507
+ goto unsupported_parameter;
3508
+ case Opt_inode32:
3509
+ ctx->full_inums = false;
3510
+ ctx->seen |= SHMEM_SEEN_INUMS;
3511
+ break;
3512
+ case Opt_inode64:
3513
+ if (sizeof(ino_t) < 8) {
3514
+ return invalfc(fc,
3515
+ "Cannot use inode64 with <64bit inums in kernel\n");
3516
+ }
3517
+ ctx->full_inums = true;
3518
+ ctx->seen |= SHMEM_SEEN_INUMS;
3519
+ break;
3520
+ }
3521
+ return 0;
3522
+
3523
+unsupported_parameter:
3524
+ return invalfc(fc, "Unsupported parameter '%s'", param->key);
3525
+bad_value:
3526
+ return invalfc(fc, "Bad value for '%s'", param->key);
3527
+}
3528
+
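/*
 * Illustrative sketch, not part of the patch: with the conversion to the
 * fs_context API, each key set through the new mount syscalls arrives in
 * shmem_parse_one() above as a single fs_parameter.  This assumes headers
 * new enough to define SYS_fsopen/SYS_fsconfig/SYS_fsmount and the
 * FSCONFIG_* constants in <linux/mount.h> (Linux 5.2+); most error
 * handling is omitted.
 */
#include <linux/mount.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Thin syscall wrappers; glibc may not ship these yet. */
static int sys_fsopen(const char *fsname, unsigned int flags)
{
	return (int)syscall(SYS_fsopen, fsname, flags);
}

static int sys_fsconfig(int fd, unsigned int cmd, const char *key,
			const char *value, int aux)
{
	return (int)syscall(SYS_fsconfig, fd, cmd, key, value, aux);
}

static int sys_fsmount(int fd, unsigned int flags, unsigned int mnt_flags)
{
	return (int)syscall(SYS_fsmount, fd, flags, mnt_flags);
}

/* Returns a detached mount fd (attach it with move_mount(2)), or -1. */
int tmpfs_new_api_mount(void)
{
	int fsfd = sys_fsopen("tmpfs", 0);

	if (fsfd < 0)
		return -1;

	/* Each fsconfig() call below reaches shmem_parse_one() as one key. */
	sys_fsconfig(fsfd, FSCONFIG_SET_STRING, "size", "50%", 0);
	sys_fsconfig(fsfd, FSCONFIG_SET_STRING, "nr_inodes", "1m", 0);
	sys_fsconfig(fsfd, FSCONFIG_SET_FLAG, "inode64", NULL, 0);
	sys_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);

	return sys_fsmount(fsfd, 0, 0);
}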
3529
+static int shmem_parse_options(struct fs_context *fc, void *data)
3530
+{
3531
+ char *options = data;
3532
+
3533
+ if (options) {
3534
+ int err = security_sb_eat_lsm_opts(options, &fc->security);
3535
+ if (err)
3536
+ return err;
3537
+ }
33623538
33633539 while (options != NULL) {
3364
- this_char = options;
3540
+ char *this_char = options;
33653541 for (;;) {
33663542 /*
33673543 * NUL-terminate this option: unfortunately,
....@@ -3377,139 +3553,91 @@
33773553 break;
33783554 }
33793555 }
3380
- if (!*this_char)
3381
- continue;
3382
- if ((value = strchr(this_char,'=')) != NULL) {
3383
- *value++ = 0;
3384
- } else {
3385
- pr_err("tmpfs: No value for mount option '%s'\n",
3386
- this_char);
3387
- goto error;
3388
- }
3556
+ if (*this_char) {
3557
+ char *value = strchr(this_char,'=');
3558
+ size_t len = 0;
3559
+ int err;
33893560
3390
- if (!strcmp(this_char,"size")) {
3391
- unsigned long long size;
3392
- size = memparse(value,&rest);
3393
- if (*rest == '%') {
3394
- size <<= PAGE_SHIFT;
3395
- size *= totalram_pages;
3396
- do_div(size, 100);
3397
- rest++;
3561
+ if (value) {
3562
+ *value++ = '\0';
3563
+ len = strlen(value);
33983564 }
3399
- if (*rest)
3400
- goto bad_val;
3401
- sbinfo->max_blocks =
3402
- DIV_ROUND_UP(size, PAGE_SIZE);
3403
- } else if (!strcmp(this_char,"nr_blocks")) {
3404
- sbinfo->max_blocks = memparse(value, &rest);
3405
- if (*rest)
3406
- goto bad_val;
3407
- } else if (!strcmp(this_char,"nr_inodes")) {
3408
- sbinfo->max_inodes = memparse(value, &rest);
3409
- if (*rest)
3410
- goto bad_val;
3411
- } else if (!strcmp(this_char,"mode")) {
3412
- if (remount)
3413
- continue;
3414
- sbinfo->mode = simple_strtoul(value, &rest, 8) & 07777;
3415
- if (*rest)
3416
- goto bad_val;
3417
- } else if (!strcmp(this_char,"uid")) {
3418
- if (remount)
3419
- continue;
3420
- uid = simple_strtoul(value, &rest, 0);
3421
- if (*rest)
3422
- goto bad_val;
3423
- sbinfo->uid = make_kuid(current_user_ns(), uid);
3424
- if (!uid_valid(sbinfo->uid))
3425
- goto bad_val;
3426
- } else if (!strcmp(this_char,"gid")) {
3427
- if (remount)
3428
- continue;
3429
- gid = simple_strtoul(value, &rest, 0);
3430
- if (*rest)
3431
- goto bad_val;
3432
- sbinfo->gid = make_kgid(current_user_ns(), gid);
3433
- if (!gid_valid(sbinfo->gid))
3434
- goto bad_val;
3435
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
3436
- } else if (!strcmp(this_char, "huge")) {
3437
- int huge;
3438
- huge = shmem_parse_huge(value);
3439
- if (huge < 0)
3440
- goto bad_val;
3441
- if (!has_transparent_hugepage() &&
3442
- huge != SHMEM_HUGE_NEVER)
3443
- goto bad_val;
3444
- sbinfo->huge = huge;
3445
-#endif
3446
-#ifdef CONFIG_NUMA
3447
- } else if (!strcmp(this_char,"mpol")) {
3448
- mpol_put(mpol);
3449
- mpol = NULL;
3450
- if (mpol_parse_str(value, &mpol))
3451
- goto bad_val;
3452
-#endif
3453
- } else {
3454
- pr_err("tmpfs: Bad mount option %s\n", this_char);
3455
- goto error;
3565
+ err = vfs_parse_fs_string(fc, this_char, value, len);
3566
+ if (err < 0)
3567
+ return err;
34563568 }
34573569 }
3458
- sbinfo->mpol = mpol;
34593570 return 0;
3460
-
3461
-bad_val:
3462
- pr_err("tmpfs: Bad value '%s' for mount option '%s'\n",
3463
- value, this_char);
3464
-error:
3465
- mpol_put(mpol);
3466
- return 1;
3467
-
34683571 }
34693572
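/*
 * Illustrative sketch, not part of the patch: a traditional mount(2) data
 * string still works.  shmem_parse_options() above splits it and feeds
 * each key=value pair to vfs_parse_fs_string(), i.e. into
 * shmem_parse_one().  Sizes go through memparse(), so suffixes and
 * percentages such as "nr_inodes=1m" and "size=50%" are accepted.
 */
#include <sys/mount.h>

int tmpfs_legacy_mount(const char *target)
{
	return mount("tmpfs", target, "tmpfs", MS_NOSUID | MS_NODEV,
		     "size=50%,nr_inodes=1m,mode=1777");
}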
3470
-static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
3573
+/*
3574
+ * Reconfigure a shmem filesystem.
3575
+ *
3576
+ * Note that we disallow change from limited->unlimited blocks/inodes while any
3577
+ * are in use; but we must separately disallow unlimited->limited, because in
3578
+ * that case we have no record of how much is already in use.
3579
+ */
3580
+static int shmem_reconfigure(struct fs_context *fc)
34713581 {
3472
- struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
3473
- struct shmem_sb_info config = *sbinfo;
3582
+ struct shmem_options *ctx = fc->fs_private;
3583
+ struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb);
34743584 unsigned long inodes;
3475
- int error = -EINVAL;
3476
-
3477
- config.mpol = NULL;
3478
- if (shmem_parse_options(data, &config, true))
3479
- return error;
3585
+ const char *err;
34803586
34813587 spin_lock(&sbinfo->stat_lock);
34823588 inodes = sbinfo->max_inodes - sbinfo->free_inodes;
3483
- if (percpu_counter_compare(&sbinfo->used_blocks, config.max_blocks) > 0)
3484
- goto out;
3485
- if (config.max_inodes < inodes)
3486
- goto out;
3487
- /*
3488
- * Those tests disallow limited->unlimited while any are in use;
3489
- * but we must separately disallow unlimited->limited, because
3490
- * in that case we have no record of how much is already in use.
3491
- */
3492
- if (config.max_blocks && !sbinfo->max_blocks)
3493
- goto out;
3494
- if (config.max_inodes && !sbinfo->max_inodes)
3495
- goto out;
3589
+ if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) {
3590
+ if (!sbinfo->max_blocks) {
3591
+ err = "Cannot retroactively limit size";
3592
+ goto out;
3593
+ }
3594
+ if (percpu_counter_compare(&sbinfo->used_blocks,
3595
+ ctx->blocks) > 0) {
3596
+ err = "Too small a size for current use";
3597
+ goto out;
3598
+ }
3599
+ }
3600
+ if ((ctx->seen & SHMEM_SEEN_INODES) && ctx->inodes) {
3601
+ if (!sbinfo->max_inodes) {
3602
+ err = "Cannot retroactively limit inodes";
3603
+ goto out;
3604
+ }
3605
+ if (ctx->inodes < inodes) {
3606
+ err = "Too few inodes for current use";
3607
+ goto out;
3608
+ }
3609
+ }
34963610
3497
- error = 0;
3498
- sbinfo->huge = config.huge;
3499
- sbinfo->max_blocks = config.max_blocks;
3500
- sbinfo->max_inodes = config.max_inodes;
3501
- sbinfo->free_inodes = config.max_inodes - inodes;
3611
+ if ((ctx->seen & SHMEM_SEEN_INUMS) && !ctx->full_inums &&
3612
+ sbinfo->next_ino > UINT_MAX) {
3613
+ err = "Current inum too high to switch to 32-bit inums";
3614
+ goto out;
3615
+ }
3616
+
3617
+ if (ctx->seen & SHMEM_SEEN_HUGE)
3618
+ sbinfo->huge = ctx->huge;
3619
+ if (ctx->seen & SHMEM_SEEN_INUMS)
3620
+ sbinfo->full_inums = ctx->full_inums;
3621
+ if (ctx->seen & SHMEM_SEEN_BLOCKS)
3622
+ sbinfo->max_blocks = ctx->blocks;
3623
+ if (ctx->seen & SHMEM_SEEN_INODES) {
3624
+ sbinfo->max_inodes = ctx->inodes;
3625
+ sbinfo->free_inodes = ctx->inodes - inodes;
3626
+ }
35023627
35033628 /*
35043629 * Preserve previous mempolicy unless mpol remount option was specified.
35053630 */
3506
- if (config.mpol) {
3631
+ if (ctx->mpol) {
35073632 mpol_put(sbinfo->mpol);
3508
- sbinfo->mpol = config.mpol; /* transfers initial ref */
3633
+ sbinfo->mpol = ctx->mpol; /* transfers initial ref */
3634
+ ctx->mpol = NULL;
35093635 }
3636
+ spin_unlock(&sbinfo->stat_lock);
3637
+ return 0;
35103638 out:
35113639 spin_unlock(&sbinfo->stat_lock);
3512
- return error;
3640
+ return invalfc(fc, "%s", err);
35133641 }
35143642
35153643 static int shmem_show_options(struct seq_file *seq, struct dentry *root)
....@@ -3529,7 +3657,30 @@
35293657 if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
35303658 seq_printf(seq, ",gid=%u",
35313659 from_kgid_munged(&init_user_ns, sbinfo->gid));
3532
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
3660
+
3661
+ /*
3662
+ * Showing inode{64,32} might be useful even if it's the system default,
3663
+ * since then people don't have to resort to checking both here and
3664
+ * /proc/config.gz to confirm 64-bit inums were successfully applied
3665
+ * (which may not even exist if IKCONFIG_PROC isn't enabled).
3666
+ *
3667
+ * We hide it when inode64 isn't the default and we are using 32-bit
3668
+ * inodes, since that probably just means the feature isn't even under
3669
+ * consideration.
3670
+ *
3671
+ * As such:
3672
+ *
3673
+ *                    +-----------------+-----------------+
3674
+ *                    | TMPFS_INODE64=y | TMPFS_INODE64=n |
3675
+ *  +------------------+-----------------+-----------------+
3676
+ *  | full_inums=true  | show            | show            |
3677
+ *  | full_inums=false | show            | hide            |
3678
+ *  +------------------+-----------------+-----------------+
3679
+ *
3680
+ */
3681
+ if (IS_ENABLED(CONFIG_TMPFS_INODE64) || sbinfo->full_inums)
3682
+ seq_printf(seq, ",inode%d", (sbinfo->full_inums ? 64 : 32));
3683
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
35333684 /* Rightly or wrongly, show huge mount option unmasked by shmem_huge */
35343685 if (sbinfo->huge)
35353686 seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge));
....@@ -3544,14 +3695,16 @@
35443695 {
35453696 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
35463697
3698
+ free_percpu(sbinfo->ino_batch);
35473699 percpu_counter_destroy(&sbinfo->used_blocks);
35483700 mpol_put(sbinfo->mpol);
35493701 kfree(sbinfo);
35503702 sb->s_fs_info = NULL;
35513703 }
35523704
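/*
 * Illustrative sketch, not part of the patch: the inode{64,32} and huge
 * options emitted by shmem_show_options() above surface in the mount
 * table, so userspace can confirm whether 64-bit inums are in effect
 * without consulting /proc/config.gz.
 */
#include <mntent.h>
#include <stdio.h>
#include <string.h>

int tmpfs_uses_inode64(const char *target)
{
	FILE *mounts = setmntent("/proc/self/mounts", "r");
	struct mntent *m;
	int found = 0;

	if (!mounts)
		return -1;
	while ((m = getmntent(mounts)) != NULL) {
		if (!strcmp(m->mnt_dir, target) &&
		    !strcmp(m->mnt_type, "tmpfs"))
			found = hasmntopt(m, "inode64") != NULL;
	}
	endmntent(mounts);
	return found;
}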
3553
-int shmem_fill_super(struct super_block *sb, void *data, int silent)
3705
+static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
35543706 {
3707
+ struct shmem_options *ctx = fc->fs_private;
35553708 struct inode *inode;
35563709 struct shmem_sb_info *sbinfo;
35573710 int err = -ENOMEM;
....@@ -3562,9 +3715,6 @@
35623715 if (!sbinfo)
35633716 return -ENOMEM;
35643717
3565
- sbinfo->mode = 0777 | S_ISVTX;
3566
- sbinfo->uid = current_fsuid();
3567
- sbinfo->gid = current_fsgid();
35683718 sb->s_fs_info = sbinfo;
35693719
35703720 #ifdef CONFIG_TMPFS
....@@ -3574,12 +3724,12 @@
35743724 * but the internal instance is left unlimited.
35753725 */
35763726 if (!(sb->s_flags & SB_KERNMOUNT)) {
3577
- sbinfo->max_blocks = shmem_default_max_blocks();
3578
- sbinfo->max_inodes = shmem_default_max_inodes();
3579
- if (shmem_parse_options(data, sbinfo, false)) {
3580
- err = -EINVAL;
3581
- goto failed;
3582
- }
3727
+ if (!(ctx->seen & SHMEM_SEEN_BLOCKS))
3728
+ ctx->blocks = shmem_default_max_blocks();
3729
+ if (!(ctx->seen & SHMEM_SEEN_INODES))
3730
+ ctx->inodes = shmem_default_max_inodes();
3731
+ if (!(ctx->seen & SHMEM_SEEN_INUMS))
3732
+ ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64);
35833733 } else {
35843734 sb->s_flags |= SB_NOUSER;
35853735 }
....@@ -3588,11 +3738,24 @@
35883738 #else
35893739 sb->s_flags |= SB_NOUSER;
35903740 #endif
3741
+ sbinfo->max_blocks = ctx->blocks;
3742
+ sbinfo->free_inodes = sbinfo->max_inodes = ctx->inodes;
3743
+ if (sb->s_flags & SB_KERNMOUNT) {
3744
+ sbinfo->ino_batch = alloc_percpu(ino_t);
3745
+ if (!sbinfo->ino_batch)
3746
+ goto failed;
3747
+ }
3748
+ sbinfo->uid = ctx->uid;
3749
+ sbinfo->gid = ctx->gid;
3750
+ sbinfo->full_inums = ctx->full_inums;
3751
+ sbinfo->mode = ctx->mode;
3752
+ sbinfo->huge = ctx->huge;
3753
+ sbinfo->mpol = ctx->mpol;
3754
+ ctx->mpol = NULL;
35913755
35923756 spin_lock_init(&sbinfo->stat_lock);
35933757 if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL))
35943758 goto failed;
3595
- sbinfo->free_inodes = sbinfo->max_inodes;
35963759 spin_lock_init(&sbinfo->shrinklist_lock);
35973760 INIT_LIST_HEAD(&sbinfo->shrinklist);
35983761
....@@ -3625,6 +3788,31 @@
36253788 return err;
36263789 }
36273790
3791
+static int shmem_get_tree(struct fs_context *fc)
3792
+{
3793
+ return get_tree_nodev(fc, shmem_fill_super);
3794
+}
3795
+
3796
+static void shmem_free_fc(struct fs_context *fc)
3797
+{
3798
+ struct shmem_options *ctx = fc->fs_private;
3799
+
3800
+ if (ctx) {
3801
+ mpol_put(ctx->mpol);
3802
+ kfree(ctx);
3803
+ }
3804
+}
3805
+
3806
+static const struct fs_context_operations shmem_fs_context_ops = {
3807
+ .free = shmem_free_fc,
3808
+ .get_tree = shmem_get_tree,
3809
+#ifdef CONFIG_TMPFS
3810
+ .parse_monolithic = shmem_parse_options,
3811
+ .parse_param = shmem_parse_one,
3812
+ .reconfigure = shmem_reconfigure,
3813
+#endif
3814
+};
3815
+
36283816 static struct kmem_cache *shmem_inode_cachep;
36293817
36303818 static struct inode *shmem_alloc_inode(struct super_block *sb)
....@@ -3636,9 +3824,8 @@
36363824 return &info->vfs_inode;
36373825 }
36383826
3639
-static void shmem_destroy_callback(struct rcu_head *head)
3827
+static void shmem_free_in_core_inode(struct inode *inode)
36403828 {
3641
- struct inode *inode = container_of(head, struct inode, i_rcu);
36423829 if (S_ISLNK(inode->i_mode))
36433830 kfree(inode->i_link);
36443831 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
....@@ -3648,7 +3835,6 @@
36483835 {
36493836 if (S_ISREG(inode->i_mode))
36503837 mpol_free_shared_policy(&SHMEM_I(inode)->policy);
3651
- call_rcu(&inode->i_rcu, shmem_destroy_callback);
36523838 }
36533839
36543840 static void shmem_init_inode(void *foo)
....@@ -3739,16 +3925,16 @@
37393925
37403926 static const struct super_operations shmem_ops = {
37413927 .alloc_inode = shmem_alloc_inode,
3928
+ .free_inode = shmem_free_in_core_inode,
37423929 .destroy_inode = shmem_destroy_inode,
37433930 #ifdef CONFIG_TMPFS
37443931 .statfs = shmem_statfs,
3745
- .remount_fs = shmem_remount_fs,
37463932 .show_options = shmem_show_options,
37473933 #endif
37483934 .evict_inode = shmem_evict_inode,
37493935 .drop_inode = generic_delete_inode,
37503936 .put_super = shmem_put_super,
3751
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
3937
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
37523938 .nr_cached_objects = shmem_unused_huge_count,
37533939 .free_cached_objects = shmem_unused_huge_scan,
37543940 #endif
....@@ -3761,29 +3947,42 @@
37613947 .set_policy = shmem_set_policy,
37623948 .get_policy = shmem_get_policy,
37633949 #endif
3950
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
3951
+ .allow_speculation = filemap_allow_speculation,
3952
+#endif
37643953 };
37653954
3766
-static struct dentry *shmem_mount(struct file_system_type *fs_type,
3767
- int flags, const char *dev_name, void *data)
3955
+int shmem_init_fs_context(struct fs_context *fc)
37683956 {
3769
- return mount_nodev(fs_type, flags, data, shmem_fill_super);
3957
+ struct shmem_options *ctx;
3958
+
3959
+ ctx = kzalloc(sizeof(struct shmem_options), GFP_KERNEL);
3960
+ if (!ctx)
3961
+ return -ENOMEM;
3962
+
3963
+ ctx->mode = 0777 | S_ISVTX;
3964
+ ctx->uid = current_fsuid();
3965
+ ctx->gid = current_fsgid();
3966
+
3967
+ fc->fs_private = ctx;
3968
+ fc->ops = &shmem_fs_context_ops;
3969
+ return 0;
37703970 }
37713971
37723972 static struct file_system_type shmem_fs_type = {
37733973 .owner = THIS_MODULE,
37743974 .name = "tmpfs",
3775
- .mount = shmem_mount,
3975
+ .init_fs_context = shmem_init_fs_context,
3976
+#ifdef CONFIG_TMPFS
3977
+ .parameters = shmem_fs_parameters,
3978
+#endif
37763979 .kill_sb = kill_litter_super,
3777
- .fs_flags = FS_USERNS_MOUNT,
3980
+ .fs_flags = FS_USERNS_MOUNT | FS_THP_SUPPORT,
37783981 };
37793982
37803983 int __init shmem_init(void)
37813984 {
37823985 int error;
3783
-
3784
- /* If rootfs called this, don't re-init */
3785
- if (shmem_inode_cachep)
3786
- return 0;
37873986
37883987 shmem_init_inodecache();
37893988
....@@ -3800,7 +3999,7 @@
38003999 goto out1;
38014000 }
38024001
3803
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
4002
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
38044003 if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY)
38054004 SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
38064005 else
....@@ -3816,11 +4015,11 @@
38164015 return error;
38174016 }
38184017
3819
-#if defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && defined(CONFIG_SYSFS)
4018
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS)
38204019 static ssize_t shmem_enabled_show(struct kobject *kobj,
38214020 struct kobj_attribute *attr, char *buf)
38224021 {
3823
- int values[] = {
4022
+ static const int values[] = {
38244023 SHMEM_HUGE_ALWAYS,
38254024 SHMEM_HUGE_WITHIN_SIZE,
38264025 SHMEM_HUGE_ADVISE,
....@@ -3868,9 +4067,9 @@
38684067
38694068 struct kobj_attribute shmem_enabled_attr =
38704069 __ATTR(shmem_enabled, 0644, shmem_enabled_show, shmem_enabled_store);
3871
-#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE && CONFIG_SYSFS */
4070
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */
38724071
3873
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
4072
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
38744073 bool shmem_huge_enabled(struct vm_area_struct *vma)
38754074 {
38764075 struct inode *inode = file_inode(vma->vm_file);
....@@ -3878,6 +4077,8 @@
38784077 loff_t i_size;
38794078 pgoff_t off;
38804079
4080
+ if (!transhuge_vma_enabled(vma, vma->vm_flags))
4081
+ return false;
38814082 if (shmem_huge == SHMEM_HUGE_FORCE)
38824083 return true;
38834084 if (shmem_huge == SHMEM_HUGE_DENY)
....@@ -3893,7 +4094,7 @@
38934094 if (i_size >= HPAGE_PMD_SIZE &&
38944095 i_size >> PAGE_SHIFT >= off)
38954096 return true;
3896
- /* fall through */
4097
+ fallthrough;
38974098 case SHMEM_HUGE_ADVISE:
38984099 /* TODO: implement fadvise() hints */
38994100 return (vma->vm_flags & VM_HUGEPAGE);
....@@ -3902,7 +4103,7 @@
39024103 return false;
39034104 }
39044105 }
3905
-#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */
4106
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
39064107
39074108 #else /* !CONFIG_SHMEM */
39084109
....@@ -3917,8 +4118,9 @@
39174118
39184119 static struct file_system_type shmem_fs_type = {
39194120 .name = "tmpfs",
3920
- .mount = ramfs_mount,
3921
- .kill_sb = kill_litter_super,
4121
+ .init_fs_context = ramfs_init_fs_context,
4122
+ .parameters = ramfs_fs_parameters,
4123
+ .kill_sb = ramfs_kill_sb,
39224124 .fs_flags = FS_USERNS_MOUNT,
39234125 };
39244126
....@@ -3932,7 +4134,8 @@
39324134 return 0;
39334135 }
39344136
3935
-int shmem_unuse(swp_entry_t swap, struct page *page)
4137
+int shmem_unuse(unsigned int type, bool frontswap,
4138
+ unsigned long *fs_pages_to_unuse)
39364139 {
39374140 return 0;
39384141 }
....@@ -4047,7 +4250,7 @@
40474250
40484251 /**
40494252 * shmem_zero_setup - setup a shared anonymous mapping
4050
- * @vma: the vma to be mmapped is prepared by do_mmap_pgoff
4253
+ * @vma: the vma to be mmapped is prepared by do_mmap
40514254 */
40524255 int shmem_zero_setup(struct vm_area_struct *vma)
40534256 {
....@@ -4055,7 +4258,7 @@
40554258 loff_t size = vma->vm_end - vma->vm_start;
40564259
40574260 /*
4058
- * Cloning a new file under mmap_sem leads to a lock ordering conflict
4261
+ * Cloning a new file under mmap_lock leads to a lock ordering conflict
40594262 * between XFS directory reading and selinux: since this file is only
40604263 * accessible to the user through its mapping, use S_PRIVATE flag to
40614264 * bypass file security, in the same way as shmem_kernel_file_setup().
....@@ -4069,7 +4272,7 @@
40694272 vma->vm_file = file;
40704273 vma->vm_ops = &shmem_vm_ops;
40714274
4072
- if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
4275
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
40734276 ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
40744277 (vma->vm_end & HPAGE_PMD_MASK)) {
40754278 khugepaged_enter(vma, vma->vm_flags);
....@@ -4117,3 +4320,47 @@
41174320 #endif
41184321 }
41194322 EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
4323
+
4324
+void shmem_mark_page_lazyfree(struct page *page, bool tail)
4325
+{
4326
+ mark_page_lazyfree_movetail(page, tail);
4327
+}
4328
+EXPORT_SYMBOL_GPL(shmem_mark_page_lazyfree);
4329
+
4330
+int reclaim_shmem_address_space(struct address_space *mapping)
4331
+{
4332
+#ifdef CONFIG_SHMEM
4333
+ pgoff_t start = 0;
4334
+ struct page *page;
4335
+ LIST_HEAD(page_list);
4336
+ XA_STATE(xas, &mapping->i_pages, start);
4337
+
4338
+ if (!shmem_mapping(mapping))
4339
+ return -EINVAL;
4340
+
4341
+ lru_add_drain();
4342
+
4343
+ rcu_read_lock();
4344
+ xas_for_each(&xas, page, ULONG_MAX) {
4345
+ if (xas_retry(&xas, page))
4346
+ continue;
4347
+ if (xa_is_value(page))
4348
+ continue;
4349
+ if (isolate_lru_page(page))
4350
+ continue;
4351
+
4352
+ list_add(&page->lru, &page_list);
4353
+
4354
+ if (need_resched()) {
4355
+ xas_pause(&xas);
4356
+ cond_resched_rcu();
4357
+ }
4358
+ }
4359
+ rcu_read_unlock();
4360
+
4361
+ return reclaim_pages(&page_list);
4362
+#else
4363
+ return 0;
4364
+#endif
4365
+}
4366
+EXPORT_SYMBOL_GPL(reclaim_shmem_address_space);
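/*
 * Illustrative sketch, not part of the patch: reclaim_shmem_address_space()
 * is a vendor export rather than an upstream interface.  Assuming a GPL
 * module built against this tree, with the declaration made visible
 * through <linux/shmem_fs.h>, a caller could ask reclaim to push a shmem
 * file's resident pages out roughly like this.
 */
#include <linux/err.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/shmem_fs.h>
#include <linux/sizes.h>

static int demo_reclaim_shmem_file(void)
{
	struct file *filp;
	int reclaimed;

	/* A 1M tmpfs-backed file; in practice the mapping would be populated. */
	filp = shmem_file_setup("demo-shmem", SZ_1M, 0);
	if (IS_ERR(filp))
		return PTR_ERR(filp);

	/* Isolates the file's resident pages and hands them to reclaim_pages(). */
	reclaimed = reclaim_shmem_address_space(filp->f_mapping);

	fput(filp);
	return reclaimed;
}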