2023-12-09 95099d4622f8cb224d94e314c7a8e0df60b13f87
kernel/mm/shmem.c
....@@ -36,8 +36,17 @@
3636 #include <linux/uio.h>
3737 #include <linux/khugepaged.h>
3838 #include <linux/hugetlb.h>
39
+#include <linux/frontswap.h>
40
+#include <linux/fs_parser.h>
41
+#include <linux/mm_inline.h>
3942
4043 #include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */
44
+
45
+#include "internal.h"
46
+
47
+#undef CREATE_TRACE_POINTS
48
+#include <trace/hooks/shmem_fs.h>
49
+#include <trace/hooks/mm.h>
4150
4251 static struct vfsmount *shm_mnt;
4352
....@@ -80,7 +89,6 @@
8089 #include <linux/uuid.h>
8190
8291 #include <linux/uaccess.h>
83
-#include <asm/pgtable.h>
8492
8593 #include "internal.h"
8694
....@@ -106,21 +114,43 @@
106114 pgoff_t nr_unswapped; /* how often writepage refused to swap out */
107115 };
108116
117
+struct shmem_options {
118
+ unsigned long long blocks;
119
+ unsigned long long inodes;
120
+ struct mempolicy *mpol;
121
+ kuid_t uid;
122
+ kgid_t gid;
123
+ umode_t mode;
124
+ bool full_inums;
125
+ int huge;
126
+ int seen;
127
+#define SHMEM_SEEN_BLOCKS 1
128
+#define SHMEM_SEEN_INODES 2
129
+#define SHMEM_SEEN_HUGE 4
130
+#define SHMEM_SEEN_INUMS 8
131
+};
132
+
109133 #ifdef CONFIG_TMPFS
110134 static unsigned long shmem_default_max_blocks(void)
111135 {
112
- return totalram_pages / 2;
136
+ return totalram_pages() / 2;
113137 }
114138
115139 static unsigned long shmem_default_max_inodes(void)
116140 {
117
- return min(totalram_pages - totalhigh_pages, totalram_pages / 2);
141
+ unsigned long nr_pages = totalram_pages();
142
+
143
+ return min(nr_pages - totalhigh_pages(), nr_pages / 2);
118144 }
119145 #endif
120146
121147 static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
122148 static int shmem_replace_page(struct page **pagep, gfp_t gfp,
123149 struct shmem_inode_info *info, pgoff_t index);
150
+static int shmem_swapin_page(struct inode *inode, pgoff_t index,
151
+ struct page **pagep, enum sgp_type sgp,
152
+ gfp_t gfp, struct vm_area_struct *vma,
153
+ vm_fault_t *fault_type);
124154 static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
125155 struct page **pagep, enum sgp_type sgp,
126156 gfp_t gfp, struct vm_area_struct *vma,
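The struct shmem_options added here gathers the parsed tmpfs mount options, and its "seen" bitmask records which options the user actually passed so a later remount only overrides those. A minimal userspace sketch of that bookkeeping follows; the hand-rolled parser and the bytes-to-pages division are assumptions for illustration, only the SHMEM_SEEN_* names and the field names come from the diff (the real code parses via fs_parser):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define SHMEM_SEEN_BLOCKS 1
#define SHMEM_SEEN_INODES 2
#define SHMEM_SEEN_HUGE   4
#define SHMEM_SEEN_INUMS  8

struct shmem_options {
	unsigned long long blocks;
	unsigned long long inodes;
	int full_inums;
	int seen;
};

static void parse_one(struct shmem_options *ctx, char *opt)
{
	if (!strncmp(opt, "size=", 5)) {
		/* stand-in for bytes -> PAGE_SIZE blocks (assumed 4K pages) */
		ctx->blocks = strtoull(opt + 5, NULL, 0) / 4096;
		ctx->seen |= SHMEM_SEEN_BLOCKS;
	} else if (!strncmp(opt, "nr_inodes=", 10)) {
		ctx->inodes = strtoull(opt + 10, NULL, 0);
		ctx->seen |= SHMEM_SEEN_INODES;
	} else if (!strcmp(opt, "inode64")) {
		ctx->full_inums = 1;
		ctx->seen |= SHMEM_SEEN_INUMS;
	}
	/* unrecognised options are ignored in this sketch */
}

int main(void)
{
	char opts[] = "size=1048576,inode64";	/* nr_inodes deliberately absent */
	struct shmem_options ctx = { 0 };

	for (char *o = strtok(opts, ","); o; o = strtok(NULL, ","))
		parse_one(&ctx, o);

	/* seen shows BLOCKS and INUMS set, INODES untouched by a remount */
	printf("seen=%#x blocks=%llu full_inums=%d\n",
	       ctx.seen, ctx.blocks, ctx.full_inums);
	return 0;
}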
....@@ -239,18 +269,79 @@
239269 static LIST_HEAD(shmem_swaplist);
240270 static DEFINE_MUTEX(shmem_swaplist_mutex);
241271
242
-static int shmem_reserve_inode(struct super_block *sb)
272
+/*
273
+ * shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and
274
+ * produces a novel ino for the newly allocated inode.
275
+ *
276
+ * It may also be called when making a hard link to permit the space needed by
277
+ * each dentry. However, in that case, no new inode number is needed since that
278
+ * internally draws from another pool of inode numbers (currently global
279
+ * get_next_ino()). This case is indicated by passing NULL as inop.
280
+ */
281
+#define SHMEM_INO_BATCH 1024
282
+static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
243283 {
244284 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
245
- if (sbinfo->max_inodes) {
246
- spin_lock(&sbinfo->stat_lock);
247
- if (!sbinfo->free_inodes) {
248
- spin_unlock(&sbinfo->stat_lock);
249
- return -ENOSPC;
285
+ ino_t ino;
286
+
287
+ if (!(sb->s_flags & SB_KERNMOUNT)) {
288
+ raw_spin_lock(&sbinfo->stat_lock);
289
+ if (sbinfo->max_inodes) {
290
+ if (!sbinfo->free_inodes) {
291
+ raw_spin_unlock(&sbinfo->stat_lock);
292
+ return -ENOSPC;
293
+ }
294
+ sbinfo->free_inodes--;
250295 }
251
- sbinfo->free_inodes--;
252
- spin_unlock(&sbinfo->stat_lock);
296
+ if (inop) {
297
+ ino = sbinfo->next_ino++;
298
+ if (unlikely(is_zero_ino(ino)))
299
+ ino = sbinfo->next_ino++;
300
+ if (unlikely(!sbinfo->full_inums &&
301
+ ino > UINT_MAX)) {
302
+ /*
303
+ * Emulate get_next_ino uint wraparound for
304
+ * compatibility
305
+ */
306
+ if (IS_ENABLED(CONFIG_64BIT))
307
+ pr_warn("%s: inode number overflow on device %d, consider using inode64 mount option\n",
308
+ __func__, MINOR(sb->s_dev));
309
+ sbinfo->next_ino = 1;
310
+ ino = sbinfo->next_ino++;
311
+ }
312
+ *inop = ino;
313
+ }
314
+ raw_spin_unlock(&sbinfo->stat_lock);
315
+ } else if (inop) {
316
+ /*
317
+ * __shmem_file_setup, one of our callers, is lock-free: it
318
+ * doesn't hold stat_lock in shmem_reserve_inode since
319
+ * max_inodes is always 0, and is called from potentially
320
+ * unknown contexts. As such, use a per-cpu batched allocator
321
+ * which doesn't require the per-sb stat_lock unless we are at
322
+ * the batch boundary.
323
+ *
324
+ * We don't need to worry about inode{32,64} since SB_KERNMOUNT
325
+ * shmem mounts are not exposed to userspace, so we don't need
326
+ * to worry about things like glibc compatibility.
327
+ */
328
+ ino_t *next_ino;
329
+
330
+ next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu());
331
+ ino = *next_ino;
332
+ if (unlikely(ino % SHMEM_INO_BATCH == 0)) {
333
+ raw_spin_lock(&sbinfo->stat_lock);
334
+ ino = sbinfo->next_ino;
335
+ sbinfo->next_ino += SHMEM_INO_BATCH;
336
+ raw_spin_unlock(&sbinfo->stat_lock);
337
+ if (unlikely(is_zero_ino(ino)))
338
+ ino++;
339
+ }
340
+ *inop = ino;
341
+ *next_ino = ++ino;
342
+ put_cpu();
253343 }
344
+
254345 return 0;
255346 }
256347
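shmem_reserve_inode() now hands out inode numbers itself instead of relying on the global get_next_ino(), and for SB_KERNMOUNT mounts it allocates from a per-cpu batch so stat_lock is only taken once every SHMEM_INO_BATCH inodes. A small userspace model of that batching, with __thread standing in for the per-cpu slot and a pthread mutex for stat_lock (a sketch of the idea, not the kernel code; build with cc -pthread):

#include <pthread.h>
#include <stdio.h>

#define SHMEM_INO_BATCH 1024

static unsigned long next_ino = 1;		/* like sbinfo->next_ino */
static pthread_mutex_t stat_lock = PTHREAD_MUTEX_INITIALIZER;
static __thread unsigned long cpu_next_ino;	/* stands in for the per-cpu slot */

static unsigned long reserve_ino(void)
{
	unsigned long ino = cpu_next_ino;

	if (ino % SHMEM_INO_BATCH == 0) {	/* batch exhausted (or first use) */
		pthread_mutex_lock(&stat_lock);
		ino = next_ino;
		next_ino += SHMEM_INO_BATCH;
		pthread_mutex_unlock(&stat_lock);
		if (ino == 0)			/* is_zero_ino(): 0 is reserved */
			ino++;
	}
	cpu_next_ino = ino + 1;
	return ino;
}

static void *worker(void *name)
{
	for (int i = 0; i < 3; i++)
		printf("%s: ino %lu\n", (const char *)name, reserve_ino());
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	/* each thread draws from its own batch, so their numbers don't interleave */
	pthread_create(&a, NULL, worker, "thread A");
	pthread_create(&b, NULL, worker, "thread B");
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}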
....@@ -258,9 +349,9 @@
258349 {
259350 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
260351 if (sbinfo->max_inodes) {
261
- spin_lock(&sbinfo->stat_lock);
352
+ raw_spin_lock(&sbinfo->stat_lock);
262353 sbinfo->free_inodes++;
263
- spin_unlock(&sbinfo->stat_lock);
354
+ raw_spin_unlock(&sbinfo->stat_lock);
264355 }
265356 }
266357
....@@ -326,24 +417,20 @@
326417 }
327418
328419 /*
329
- * Replace item expected in radix tree by a new item, while holding tree lock.
420
+ * Replace item expected in xarray by a new item, while holding xa_lock.
330421 */
331
-static int shmem_radix_tree_replace(struct address_space *mapping,
422
+static int shmem_replace_entry(struct address_space *mapping,
332423 pgoff_t index, void *expected, void *replacement)
333424 {
334
- struct radix_tree_node *node;
335
- void __rcu **pslot;
425
+ XA_STATE(xas, &mapping->i_pages, index);
336426 void *item;
337427
338428 VM_BUG_ON(!expected);
339429 VM_BUG_ON(!replacement);
340
- item = __radix_tree_lookup(&mapping->i_pages, index, &node, &pslot);
341
- if (!item)
342
- return -ENOENT;
430
+ item = xas_load(&xas);
343431 if (item != expected)
344432 return -ENOENT;
345
- __radix_tree_replace(&mapping->i_pages, node, pslot,
346
- replacement, NULL);
433
+ xas_store(&xas, replacement);
347434 return 0;
348435 }
349436
....@@ -357,12 +444,7 @@
357444 static bool shmem_confirm_swap(struct address_space *mapping,
358445 pgoff_t index, swp_entry_t swap)
359446 {
360
- void *item;
361
-
362
- rcu_read_lock();
363
- item = radix_tree_lookup(&mapping->i_pages, index);
364
- rcu_read_unlock();
365
- return item == swp_to_radix_entry(swap);
447
+ return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap);
366448 }
367449
368450 /*
....@@ -397,12 +479,12 @@
397479 #define SHMEM_HUGE_DENY (-1)
398480 #define SHMEM_HUGE_FORCE (-2)
399481
400
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
482
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
401483 /* ifdef here to avoid bloating shmem.o when not necessary */
402484
403485 static int shmem_huge __read_mostly;
404486
405
-#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
487
+#if defined(CONFIG_SYSFS)
406488 static int shmem_parse_huge(const char *str)
407489 {
408490 if (!strcmp(str, "never"))
....@@ -419,7 +501,9 @@
419501 return SHMEM_HUGE_FORCE;
420502 return -EINVAL;
421503 }
504
+#endif
422505
506
+#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
423507 static const char *shmem_format_huge(int huge)
424508 {
425509 switch (huge) {
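shmem_parse_huge() itself is untouched here; the hunk only narrows its #ifdef to CONFIG_SYSFS, presumably because the mount option now goes through the fs_parser tables. For reference, a standalone userspace replica of the mapping it implements between the huge= policy strings and the SHMEM_HUGE_* constants; the NEVER..ADVISE values (0..3) are not visible in this hunk and are assumed:

#include <stdio.h>
#include <string.h>

#define SHMEM_HUGE_NEVER	0
#define SHMEM_HUGE_ALWAYS	1
#define SHMEM_HUGE_WITHIN_SIZE	2
#define SHMEM_HUGE_ADVISE	3
#define SHMEM_HUGE_DENY		(-1)
#define SHMEM_HUGE_FORCE	(-2)

static int shmem_parse_huge(const char *str)
{
	if (!strcmp(str, "never"))
		return SHMEM_HUGE_NEVER;
	if (!strcmp(str, "always"))
		return SHMEM_HUGE_ALWAYS;
	if (!strcmp(str, "within_size"))
		return SHMEM_HUGE_WITHIN_SIZE;
	if (!strcmp(str, "advise"))
		return SHMEM_HUGE_ADVISE;
	if (!strcmp(str, "deny"))
		return SHMEM_HUGE_DENY;
	if (!strcmp(str, "force"))
		return SHMEM_HUGE_FORCE;
	return -22;	/* -EINVAL */
}

int main(int argc, char **argv)
{
	const char *arg = argc > 1 ? argv[1] : "within_size";

	printf("huge=%s -> %d\n", arg, shmem_parse_huge(arg));
	return 0;
}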
....@@ -570,7 +654,7 @@
570654 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
571655 return READ_ONCE(sbinfo->shrinklist_len);
572656 }
573
-#else /* !CONFIG_TRANSPARENT_HUGE_PAGECACHE */
657
+#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
574658
575659 #define shmem_huge SHMEM_HUGE_DENY
576660
....@@ -579,11 +663,11 @@
579663 {
580664 return 0;
581665 }
582
-#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */
666
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
583667
584668 static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo)
585669 {
586
- if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
670
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
587671 (shmem_huge == SHMEM_HUGE_FORCE || sbinfo->huge) &&
588672 shmem_huge != SHMEM_HUGE_DENY)
589673 return true;
....@@ -595,9 +679,13 @@
595679 */
596680 static int shmem_add_to_page_cache(struct page *page,
597681 struct address_space *mapping,
598
- pgoff_t index, void *expected)
682
+ pgoff_t index, void *expected, gfp_t gfp,
683
+ struct mm_struct *charge_mm)
599684 {
600
- int error, nr = hpage_nr_pages(page);
685
+ XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page));
686
+ unsigned long i = 0;
687
+ unsigned long nr = compound_nr(page);
688
+ int error;
601689
602690 VM_BUG_ON_PAGE(PageTail(page), page);
603691 VM_BUG_ON_PAGE(index != round_down(index, nr), page);
....@@ -609,46 +697,53 @@
609697 page->mapping = mapping;
610698 page->index = index;
611699
612
- xa_lock_irq(&mapping->i_pages);
613
- if (PageTransHuge(page)) {
614
- void __rcu **results;
615
- pgoff_t idx;
616
- int i;
617
-
618
- error = 0;
619
- if (radix_tree_gang_lookup_slot(&mapping->i_pages,
620
- &results, &idx, index, 1) &&
621
- idx < index + HPAGE_PMD_NR) {
622
- error = -EEXIST;
623
- }
624
-
625
- if (!error) {
626
- for (i = 0; i < HPAGE_PMD_NR; i++) {
627
- error = radix_tree_insert(&mapping->i_pages,
628
- index + i, page + i);
629
- VM_BUG_ON(error);
700
+ if (!PageSwapCache(page)) {
701
+ error = mem_cgroup_charge(page, charge_mm, gfp);
702
+ if (error) {
703
+ if (PageTransHuge(page)) {
704
+ count_vm_event(THP_FILE_FALLBACK);
705
+ count_vm_event(THP_FILE_FALLBACK_CHARGE);
630706 }
631
- count_vm_event(THP_FILE_ALLOC);
707
+ goto error;
632708 }
633
- } else if (!expected) {
634
- error = radix_tree_insert(&mapping->i_pages, index, page);
635
- } else {
636
- error = shmem_radix_tree_replace(mapping, index, expected,
637
- page);
709
+ }
710
+ cgroup_throttle_swaprate(page, gfp);
711
+
712
+ do {
713
+ void *entry;
714
+ xas_lock_irq(&xas);
715
+ entry = xas_find_conflict(&xas);
716
+ if (entry != expected)
717
+ xas_set_err(&xas, -EEXIST);
718
+ xas_create_range(&xas);
719
+ if (xas_error(&xas))
720
+ goto unlock;
721
+next:
722
+ xas_store(&xas, page);
723
+ if (++i < nr) {
724
+ xas_next(&xas);
725
+ goto next;
726
+ }
727
+ if (PageTransHuge(page)) {
728
+ count_vm_event(THP_FILE_ALLOC);
729
+ __inc_node_page_state(page, NR_SHMEM_THPS);
730
+ }
731
+ mapping->nrpages += nr;
732
+ __mod_lruvec_page_state(page, NR_FILE_PAGES, nr);
733
+ __mod_lruvec_page_state(page, NR_SHMEM, nr);
734
+unlock:
735
+ xas_unlock_irq(&xas);
736
+ } while (xas_nomem(&xas, gfp));
737
+
738
+ if (xas_error(&xas)) {
739
+ error = xas_error(&xas);
740
+ goto error;
638741 }
639742
640
- if (!error) {
641
- mapping->nrpages += nr;
642
- if (PageTransHuge(page))
643
- __inc_node_page_state(page, NR_SHMEM_THPS);
644
- __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
645
- __mod_node_page_state(page_pgdat(page), NR_SHMEM, nr);
646
- xa_unlock_irq(&mapping->i_pages);
647
- } else {
648
- page->mapping = NULL;
649
- xa_unlock_irq(&mapping->i_pages);
650
- page_ref_sub(page, nr);
651
- }
743
+ return 0;
744
+error:
745
+ page->mapping = NULL;
746
+ page_ref_sub(page, nr);
652747 return error;
653748 }
654749
....@@ -663,27 +758,25 @@
663758 VM_BUG_ON_PAGE(PageCompound(page), page);
664759
665760 xa_lock_irq(&mapping->i_pages);
666
- error = shmem_radix_tree_replace(mapping, page->index, page, radswap);
761
+ error = shmem_replace_entry(mapping, page->index, page, radswap);
667762 page->mapping = NULL;
668763 mapping->nrpages--;
669
- __dec_node_page_state(page, NR_FILE_PAGES);
670
- __dec_node_page_state(page, NR_SHMEM);
764
+ __dec_lruvec_page_state(page, NR_FILE_PAGES);
765
+ __dec_lruvec_page_state(page, NR_SHMEM);
671766 xa_unlock_irq(&mapping->i_pages);
672767 put_page(page);
673768 BUG_ON(error);
674769 }
675770
676771 /*
677
- * Remove swap entry from radix tree, free the swap and its page cache.
772
+ * Remove swap entry from page cache, free the swap and its page cache.
678773 */
679774 static int shmem_free_swap(struct address_space *mapping,
680775 pgoff_t index, void *radswap)
681776 {
682777 void *old;
683778
684
- xa_lock_irq(&mapping->i_pages);
685
- old = radix_tree_delete_item(&mapping->i_pages, index, radswap);
686
- xa_unlock_irq(&mapping->i_pages);
779
+ old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0);
687780 if (old != radswap)
688781 return -ENOENT;
689782 free_swap_and_cache(radix_to_swp_entry(radswap));
....@@ -700,29 +793,19 @@
700793 unsigned long shmem_partial_swap_usage(struct address_space *mapping,
701794 pgoff_t start, pgoff_t end)
702795 {
703
- struct radix_tree_iter iter;
704
- void __rcu **slot;
796
+ XA_STATE(xas, &mapping->i_pages, start);
705797 struct page *page;
706798 unsigned long swapped = 0;
707799
708800 rcu_read_lock();
709
-
710
- radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
711
- if (iter.index >= end)
712
- break;
713
-
714
- page = radix_tree_deref_slot(slot);
715
-
716
- if (radix_tree_deref_retry(page)) {
717
- slot = radix_tree_iter_retry(&iter);
801
+ xas_for_each(&xas, page, end - 1) {
802
+ if (xas_retry(&xas, page))
718803 continue;
719
- }
720
-
721
- if (radix_tree_exceptional_entry(page))
804
+ if (xa_is_value(page))
722805 swapped++;
723806
724807 if (need_resched()) {
725
- slot = radix_tree_iter_resume(slot, &iter);
808
+ xas_pause(&xas);
726809 cond_resched_rcu();
727810 }
728811 }
....@@ -797,7 +880,33 @@
797880 }
798881
799882 /*
800
- * Remove range of pages and swap entries from radix tree, and free them.
883
+ * Check whether a hole-punch or truncation needs to split a huge page,
884
+ * returning true if no split was required, or the split has been successful.
885
+ *
886
+ * Eviction (or truncation to 0 size) should never need to split a huge page;
887
+ * but in rare cases might do so, if shmem_undo_range() failed to trylock on
888
+ * head, and then succeeded to trylock on tail.
889
+ *
890
+ * A split can only succeed when there are no additional references on the
891
+ * huge page: so the split below relies upon find_get_entries() having stopped
892
+ * when it found a subpage of the huge page, without getting further references.
893
+ */
894
+static bool shmem_punch_compound(struct page *page, pgoff_t start, pgoff_t end)
895
+{
896
+ if (!PageTransCompound(page))
897
+ return true;
898
+
899
+ /* Just proceed to delete a huge page wholly within the range punched */
900
+ if (PageHead(page) &&
901
+ page->index >= start && page->index + HPAGE_PMD_NR <= end)
902
+ return true;
903
+
904
+ /* Try to split huge page, so we can truly punch the hole or truncate */
905
+ return split_huge_page(page) >= 0;
906
+}
907
+
908
+/*
909
+ * Remove range of pages and swap entries from page cache, and free them.
801910 * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
802911 */
803912 static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
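shmem_punch_compound(), added above, decides whether a hole-punch or truncation can delete a huge page outright or must first split it. The user-visible operation it serves can be exercised from userspace with fallocate(FALLOC_FL_PUNCH_HOLE) on a tmpfs-backed file; a small demo (memfd_create needs glibc >= 2.27, error handling kept minimal):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t len = 8 * 1024 * 1024;
	int fd = memfd_create("punch-demo", 0);

	if (fd < 0 || ftruncate(fd, len) < 0)
		return 1;

	char *map = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (map == MAP_FAILED)
		return 1;
	memset(map, 'x', len);			/* populate the page cache */

	/* Punch 1MiB starting at 2MiB; the file size stays the same. */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      2 * 1024 * 1024, 1 * 1024 * 1024) < 0) {
		perror("fallocate");
		return 1;
	}

	/* the punched range reads back as zeroes */
	printf("byte at 2MiB after punch: %d\n", map[2 * 1024 * 1024]);
	return 0;
}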
....@@ -833,7 +942,7 @@
833942 if (index >= end)
834943 break;
835944
836
- if (radix_tree_exceptional_entry(page)) {
945
+ if (xa_is_value(page)) {
837946 if (unfalloc)
838947 continue;
839948 nr_swaps_freed += !shmem_free_swap(mapping,
....@@ -846,31 +955,11 @@
846955 if (!trylock_page(page))
847956 continue;
848957
849
- if (PageTransTail(page)) {
850
- /* Middle of THP: zero out the page */
851
- clear_highpage(page);
852
- unlock_page(page);
853
- continue;
854
- } else if (PageTransHuge(page)) {
855
- if (index == round_down(end, HPAGE_PMD_NR)) {
856
- /*
857
- * Range ends in the middle of THP:
858
- * zero out the page
859
- */
860
- clear_highpage(page);
861
- unlock_page(page);
862
- continue;
863
- }
864
- index += HPAGE_PMD_NR - 1;
865
- i += HPAGE_PMD_NR - 1;
866
- }
867
-
868
- if (!unfalloc || !PageUptodate(page)) {
869
- VM_BUG_ON_PAGE(PageTail(page), page);
870
- if (page_mapping(page) == mapping) {
871
- VM_BUG_ON_PAGE(PageWriteback(page), page);
958
+ if ((!unfalloc || !PageUptodate(page)) &&
959
+ page_mapping(page) == mapping) {
960
+ VM_BUG_ON_PAGE(PageWriteback(page), page);
961
+ if (shmem_punch_compound(page, start, end))
872962 truncate_inode_page(mapping, page);
873
- }
874963 }
875964 unlock_page(page);
876965 }
....@@ -930,7 +1019,7 @@
9301019 if (index >= end)
9311020 break;
9321021
933
- if (radix_tree_exceptional_entry(page)) {
1022
+ if (xa_is_value(page)) {
9341023 if (unfalloc)
9351024 continue;
9361025 if (shmem_free_swap(mapping, index, page)) {
....@@ -944,42 +1033,24 @@
9441033
9451034 lock_page(page);
9461035
947
- if (PageTransTail(page)) {
948
- /* Middle of THP: zero out the page */
949
- clear_highpage(page);
950
- unlock_page(page);
951
- /*
952
- * Partial thp truncate due 'start' in middle
953
- * of THP: don't need to look on these pages
954
- * again on !pvec.nr restart.
955
- */
956
- if (index != round_down(end, HPAGE_PMD_NR))
957
- start++;
958
- continue;
959
- } else if (PageTransHuge(page)) {
960
- if (index == round_down(end, HPAGE_PMD_NR)) {
961
- /*
962
- * Range ends in the middle of THP:
963
- * zero out the page
964
- */
965
- clear_highpage(page);
966
- unlock_page(page);
967
- continue;
968
- }
969
- index += HPAGE_PMD_NR - 1;
970
- i += HPAGE_PMD_NR - 1;
971
- }
972
-
9731036 if (!unfalloc || !PageUptodate(page)) {
974
- VM_BUG_ON_PAGE(PageTail(page), page);
975
- if (page_mapping(page) == mapping) {
976
- VM_BUG_ON_PAGE(PageWriteback(page), page);
977
- truncate_inode_page(mapping, page);
978
- } else {
1037
+ if (page_mapping(page) != mapping) {
9791038 /* Page was replaced by swap: retry */
9801039 unlock_page(page);
9811040 index--;
9821041 break;
1042
+ }
1043
+ VM_BUG_ON_PAGE(PageWriteback(page), page);
1044
+ if (shmem_punch_compound(page, start, end))
1045
+ truncate_inode_page(mapping, page);
1046
+ else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
1047
+ /* Wipe the page and don't get stuck */
1048
+ clear_highpage(page);
1049
+ flush_dcache_page(page);
1050
+ set_page_dirty(page);
1051
+ if (index <
1052
+ round_up(start, HPAGE_PMD_NR))
1053
+ start = index + 1;
9831054 }
9841055 }
9851056 unlock_page(page);
....@@ -1067,7 +1138,7 @@
10671138 * Part of the huge page can be beyond i_size: subject
10681139 * to shrink under memory pressure.
10691140 */
1070
- if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
1141
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
10711142 spin_lock(&sbinfo->shrinklist_lock);
10721143 /*
10731144 * _careful to defend against unlocked access to
....@@ -1106,9 +1177,14 @@
11061177 }
11071178 spin_unlock(&sbinfo->shrinklist_lock);
11081179 }
1109
- if (!list_empty(&info->swaplist)) {
1180
+ while (!list_empty(&info->swaplist)) {
1181
+ /* Wait while shmem_unuse() is scanning this inode... */
1182
+ wait_var_event(&info->stop_eviction,
1183
+ !atomic_read(&info->stop_eviction));
11101184 mutex_lock(&shmem_swaplist_mutex);
1111
- list_del_init(&info->swaplist);
1185
+ /* ...but beware of the race if we peeked too early */
1186
+ if (!atomic_read(&info->stop_eviction))
1187
+ list_del_init(&info->swaplist);
11121188 mutex_unlock(&shmem_swaplist_mutex);
11131189 }
11141190 }
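Eviction now synchronizes on info->stop_eviction: shmem_unuse() raises it before dropping the swaplist mutex to scan an inode, and shmem_evict_inode() waits for it to drop back to zero instead of racing with the swapoff scan. A userspace model of that handshake, using a mutex/condvar pair in place of wait_var_event()/wake_up_var() (a sketch under those assumptions, not kernel code; build with cc -pthread):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static int stop_eviction;		/* like atomic_t info->stop_eviction */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;

static void *swapoff_scan(void *arg)	/* stands in for shmem_unuse_inode() */
{
	pthread_mutex_lock(&lock);
	stop_eviction++;		/* atomic_inc(&info->stop_eviction) */
	pthread_mutex_unlock(&lock);

	usleep(100 * 1000);		/* pretend to scan the inode for swap */

	pthread_mutex_lock(&lock);
	if (--stop_eviction == 0)	/* atomic_dec_and_test() */
		pthread_cond_broadcast(&cond);	/* wake_up_var() */
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, swapoff_scan, NULL);
	usleep(10 * 1000);		/* give the scanner a head start */

	/* shmem_evict_inode(): wait while a scan still holds the inode */
	pthread_mutex_lock(&lock);
	while (stop_eviction)		/* wait_var_event(..., !atomic_read(...)) */
		pthread_cond_wait(&cond, &lock);
	pthread_mutex_unlock(&lock);

	puts("eviction may proceed");
	pthread_join(t, NULL);
	return 0;
}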
....@@ -1119,166 +1195,174 @@
11191195 clear_inode(inode);
11201196 }
11211197
1122
-static unsigned long find_swap_entry(struct radix_tree_root *root, void *item)
1198
+extern struct swap_info_struct *swap_info[];
1199
+
1200
+static int shmem_find_swap_entries(struct address_space *mapping,
1201
+ pgoff_t start, unsigned int nr_entries,
1202
+ struct page **entries, pgoff_t *indices,
1203
+ unsigned int type, bool frontswap)
11231204 {
1124
- struct radix_tree_iter iter;
1125
- void __rcu **slot;
1126
- unsigned long found = -1;
1127
- unsigned int checked = 0;
1205
+ XA_STATE(xas, &mapping->i_pages, start);
1206
+ struct page *page;
1207
+ swp_entry_t entry;
1208
+ unsigned int ret = 0;
1209
+
1210
+ if (!nr_entries)
1211
+ return 0;
11281212
11291213 rcu_read_lock();
1130
- radix_tree_for_each_slot(slot, root, &iter, 0) {
1131
- void *entry = radix_tree_deref_slot(slot);
1132
-
1133
- if (radix_tree_deref_retry(entry)) {
1134
- slot = radix_tree_iter_retry(&iter);
1214
+ xas_for_each(&xas, page, ULONG_MAX) {
1215
+ if (xas_retry(&xas, page))
11351216 continue;
1217
+
1218
+ if (!xa_is_value(page))
1219
+ continue;
1220
+
1221
+ entry = radix_to_swp_entry(page);
1222
+ if (swp_type(entry) != type)
1223
+ continue;
1224
+ if (frontswap &&
1225
+ !frontswap_test(swap_info[type], swp_offset(entry)))
1226
+ continue;
1227
+
1228
+ indices[ret] = xas.xa_index;
1229
+ entries[ret] = page;
1230
+
1231
+ if (need_resched()) {
1232
+ xas_pause(&xas);
1233
+ cond_resched_rcu();
11361234 }
1137
- if (entry == item) {
1138
- found = iter.index;
1235
+ if (++ret == nr_entries)
11391236 break;
1140
- }
1141
- checked++;
1142
- if ((checked % 4096) != 0)
1143
- continue;
1144
- slot = radix_tree_iter_resume(slot, &iter);
1145
- cond_resched_rcu();
11461237 }
1147
-
11481238 rcu_read_unlock();
1149
- return found;
1239
+
1240
+ return ret;
1241
+}
1242
+
1243
+/*
1244
+ * Move the swapped pages for an inode to page cache. Returns the count
1245
+ * of pages swapped in, or the error in case of failure.
1246
+ */
1247
+static int shmem_unuse_swap_entries(struct inode *inode, struct pagevec pvec,
1248
+ pgoff_t *indices)
1249
+{
1250
+ int i = 0;
1251
+ int ret = 0;
1252
+ int error = 0;
1253
+ struct address_space *mapping = inode->i_mapping;
1254
+
1255
+ for (i = 0; i < pvec.nr; i++) {
1256
+ struct page *page = pvec.pages[i];
1257
+
1258
+ if (!xa_is_value(page))
1259
+ continue;
1260
+ error = shmem_swapin_page(inode, indices[i],
1261
+ &page, SGP_CACHE,
1262
+ mapping_gfp_mask(mapping),
1263
+ NULL, NULL);
1264
+ if (error == 0) {
1265
+ unlock_page(page);
1266
+ put_page(page);
1267
+ ret++;
1268
+ }
1269
+ if (error == -ENOMEM)
1270
+ break;
1271
+ error = 0;
1272
+ }
1273
+ return error ? error : ret;
11501274 }
11511275
11521276 /*
11531277 * If swap found in inode, free it and move page from swapcache to filecache.
11541278 */
1155
-static int shmem_unuse_inode(struct shmem_inode_info *info,
1156
- swp_entry_t swap, struct page **pagep)
1279
+static int shmem_unuse_inode(struct inode *inode, unsigned int type,
1280
+ bool frontswap, unsigned long *fs_pages_to_unuse)
11571281 {
1158
- struct address_space *mapping = info->vfs_inode.i_mapping;
1159
- void *radswap;
1160
- pgoff_t index;
1161
- gfp_t gfp;
1162
- int error = 0;
1282
+ struct address_space *mapping = inode->i_mapping;
1283
+ pgoff_t start = 0;
1284
+ struct pagevec pvec;
1285
+ pgoff_t indices[PAGEVEC_SIZE];
1286
+ bool frontswap_partial = (frontswap && *fs_pages_to_unuse > 0);
1287
+ int ret = 0;
11631288
1164
- radswap = swp_to_radix_entry(swap);
1165
- index = find_swap_entry(&mapping->i_pages, radswap);
1166
- if (index == -1)
1167
- return -EAGAIN; /* tell shmem_unuse we found nothing */
1289
+ pagevec_init(&pvec);
1290
+ do {
1291
+ unsigned int nr_entries = PAGEVEC_SIZE;
11681292
1169
- /*
1170
- * Move _head_ to start search for next from here.
1171
- * But be careful: shmem_evict_inode checks list_empty without taking
1172
- * mutex, and there's an instant in list_move_tail when info->swaplist
1173
- * would appear empty, if it were the only one on shmem_swaplist.
1174
- */
1175
- if (shmem_swaplist.next != &info->swaplist)
1176
- list_move_tail(&shmem_swaplist, &info->swaplist);
1293
+ if (frontswap_partial && *fs_pages_to_unuse < PAGEVEC_SIZE)
1294
+ nr_entries = *fs_pages_to_unuse;
11771295
1178
- gfp = mapping_gfp_mask(mapping);
1179
- if (shmem_should_replace_page(*pagep, gfp)) {
1180
- mutex_unlock(&shmem_swaplist_mutex);
1181
- error = shmem_replace_page(pagep, gfp, info, index);
1182
- mutex_lock(&shmem_swaplist_mutex);
1183
- /*
1184
- * We needed to drop mutex to make that restrictive page
1185
- * allocation, but the inode might have been freed while we
1186
- * dropped it: although a racing shmem_evict_inode() cannot
1187
- * complete without emptying the radix_tree, our page lock
1188
- * on this swapcache page is not enough to prevent that -
1189
- * free_swap_and_cache() of our swap entry will only
1190
- * trylock_page(), removing swap from radix_tree whatever.
1191
- *
1192
- * We must not proceed to shmem_add_to_page_cache() if the
1193
- * inode has been freed, but of course we cannot rely on
1194
- * inode or mapping or info to check that. However, we can
1195
- * safely check if our swap entry is still in use (and here
1196
- * it can't have got reused for another page): if it's still
1197
- * in use, then the inode cannot have been freed yet, and we
1198
- * can safely proceed (if it's no longer in use, that tells
1199
- * nothing about the inode, but we don't need to unuse swap).
1200
- */
1201
- if (!page_swapcount(*pagep))
1202
- error = -ENOENT;
1203
- }
1204
-
1205
- /*
1206
- * We rely on shmem_swaplist_mutex, not only to protect the swaplist,
1207
- * but also to hold up shmem_evict_inode(): so inode cannot be freed
1208
- * beneath us (pagelock doesn't help until the page is in pagecache).
1209
- */
1210
- if (!error)
1211
- error = shmem_add_to_page_cache(*pagep, mapping, index,
1212
- radswap);
1213
- if (error != -ENOMEM) {
1214
- /*
1215
- * Truncation and eviction use free_swap_and_cache(), which
1216
- * only does trylock page: if we raced, best clean up here.
1217
- */
1218
- delete_from_swap_cache(*pagep);
1219
- set_page_dirty(*pagep);
1220
- if (!error) {
1221
- spin_lock_irq(&info->lock);
1222
- info->swapped--;
1223
- spin_unlock_irq(&info->lock);
1224
- swap_free(swap);
1296
+ pvec.nr = shmem_find_swap_entries(mapping, start, nr_entries,
1297
+ pvec.pages, indices,
1298
+ type, frontswap);
1299
+ if (pvec.nr == 0) {
1300
+ ret = 0;
1301
+ break;
12251302 }
1226
- }
1227
- return error;
1303
+
1304
+ ret = shmem_unuse_swap_entries(inode, pvec, indices);
1305
+ if (ret < 0)
1306
+ break;
1307
+
1308
+ if (frontswap_partial) {
1309
+ *fs_pages_to_unuse -= ret;
1310
+ if (*fs_pages_to_unuse == 0) {
1311
+ ret = FRONTSWAP_PAGES_UNUSED;
1312
+ break;
1313
+ }
1314
+ }
1315
+
1316
+ start = indices[pvec.nr - 1];
1317
+ } while (true);
1318
+
1319
+ return ret;
12281320 }
12291321
12301322 /*
1231
- * Search through swapped inodes to find and replace swap by page.
1323
+ * Read all the shared memory data that resides in the swap
1324
+ * device 'type' back into memory, so the swap device can be
1325
+ * unused.
12321326 */
1233
-int shmem_unuse(swp_entry_t swap, struct page *page)
1327
+int shmem_unuse(unsigned int type, bool frontswap,
1328
+ unsigned long *fs_pages_to_unuse)
12341329 {
1235
- struct list_head *this, *next;
1236
- struct shmem_inode_info *info;
1237
- struct mem_cgroup *memcg;
1330
+ struct shmem_inode_info *info, *next;
12381331 int error = 0;
12391332
1240
- /*
1241
- * There's a faint possibility that swap page was replaced before
1242
- * caller locked it: caller will come back later with the right page.
1243
- */
1244
- if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val))
1245
- goto out;
1246
-
1247
- /*
1248
- * Charge page using GFP_KERNEL while we can wait, before taking
1249
- * the shmem_swaplist_mutex which might hold up shmem_writepage().
1250
- * Charged back to the user (not to caller) when swap account is used.
1251
- */
1252
- error = mem_cgroup_try_charge_delay(page, current->mm, GFP_KERNEL,
1253
- &memcg, false);
1254
- if (error)
1255
- goto out;
1256
- /* No radix_tree_preload: swap entry keeps a place for page in tree */
1257
- error = -EAGAIN;
1333
+ if (list_empty(&shmem_swaplist))
1334
+ return 0;
12581335
12591336 mutex_lock(&shmem_swaplist_mutex);
1260
- list_for_each_safe(this, next, &shmem_swaplist) {
1261
- info = list_entry(this, struct shmem_inode_info, swaplist);
1262
- if (info->swapped)
1263
- error = shmem_unuse_inode(info, swap, &page);
1264
- else
1337
+ list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
1338
+ if (!info->swapped) {
12651339 list_del_init(&info->swaplist);
1340
+ continue;
1341
+ }
1342
+ /*
1343
+ * Drop the swaplist mutex while searching the inode for swap;
1344
+ * but before doing so, make sure shmem_evict_inode() will not
1345
+ * remove placeholder inode from swaplist, nor let it be freed
1346
+ * (igrab() would protect from unlink, but not from unmount).
1347
+ */
1348
+ atomic_inc(&info->stop_eviction);
1349
+ mutex_unlock(&shmem_swaplist_mutex);
1350
+
1351
+ error = shmem_unuse_inode(&info->vfs_inode, type, frontswap,
1352
+ fs_pages_to_unuse);
12661353 cond_resched();
1267
- if (error != -EAGAIN)
1354
+
1355
+ mutex_lock(&shmem_swaplist_mutex);
1356
+ next = list_next_entry(info, swaplist);
1357
+ if (!info->swapped)
1358
+ list_del_init(&info->swaplist);
1359
+ if (atomic_dec_and_test(&info->stop_eviction))
1360
+ wake_up_var(&info->stop_eviction);
1361
+ if (error)
12681362 break;
1269
- /* found nothing in this: move on to search the next */
12701363 }
12711364 mutex_unlock(&shmem_swaplist_mutex);
12721365
1273
- if (error) {
1274
- if (error != -ENOMEM)
1275
- error = 0;
1276
- mem_cgroup_cancel_charge(page, memcg, false);
1277
- } else
1278
- mem_cgroup_commit_charge(page, memcg, true, false);
1279
-out:
1280
- unlock_page(page);
1281
- put_page(page);
12821366 return error;
12831367 }
12841368
....@@ -1348,6 +1432,7 @@
13481432 SetPageUptodate(page);
13491433 }
13501434
1435
+ trace_android_vh_set_shmem_page_flag(page);
13511436 swap = get_swap_page(page);
13521437 if (!swap.val)
13531438 goto redirty;
....@@ -1362,9 +1447,11 @@
13621447 */
13631448 mutex_lock(&shmem_swaplist_mutex);
13641449 if (list_empty(&info->swaplist))
1365
- list_add_tail(&info->swaplist, &shmem_swaplist);
1450
+ list_add(&info->swaplist, &shmem_swaplist);
13661451
1367
- if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
1452
+ if (add_to_swap_cache(page, swap,
1453
+ __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN,
1454
+ NULL) == 0) {
13681455 spin_lock_irq(&info->lock);
13691456 shmem_recalc_inode(inode);
13701457 info->swapped++;
....@@ -1406,10 +1493,10 @@
14061493 {
14071494 struct mempolicy *mpol = NULL;
14081495 if (sbinfo->mpol) {
1409
- spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */
1496
+ raw_spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */
14101497 mpol = sbinfo->mpol;
14111498 mpol_get(mpol);
1412
- spin_unlock(&sbinfo->stat_lock);
1499
+ raw_spin_unlock(&sbinfo->stat_lock);
14131500 }
14141501 return mpol;
14151502 }
....@@ -1447,11 +1534,11 @@
14471534 {
14481535 struct vm_area_struct pvma;
14491536 struct page *page;
1450
- struct vm_fault vmf;
1537
+ struct vm_fault vmf = {
1538
+ .vma = &pvma,
1539
+ };
14511540
14521541 shmem_pseudo_vma_init(&pvma, info, index);
1453
- vmf.vma = &pvma;
1454
- vmf.address = 0;
14551542 page = swap_cluster_readahead(swap, gfp, &vmf);
14561543 shmem_pseudo_vma_destroy(&pvma);
14571544
....@@ -1462,23 +1549,14 @@
14621549 struct shmem_inode_info *info, pgoff_t index)
14631550 {
14641551 struct vm_area_struct pvma;
1465
- struct inode *inode = &info->vfs_inode;
1466
- struct address_space *mapping = inode->i_mapping;
1467
- pgoff_t idx, hindex;
1468
- void __rcu **results;
1552
+ struct address_space *mapping = info->vfs_inode.i_mapping;
1553
+ pgoff_t hindex;
14691554 struct page *page;
14701555
1471
- if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
1472
- return NULL;
1473
-
14741556 hindex = round_down(index, HPAGE_PMD_NR);
1475
- rcu_read_lock();
1476
- if (radix_tree_gang_lookup_slot(&mapping->i_pages, &results, &idx,
1477
- hindex, 1) && idx < hindex + HPAGE_PMD_NR) {
1478
- rcu_read_unlock();
1557
+ if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1,
1558
+ XA_PRESENT))
14791559 return NULL;
1480
- }
1481
- rcu_read_unlock();
14821560
14831561 shmem_pseudo_vma_init(&pvma, info, hindex);
14841562 page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN,
....@@ -1486,6 +1564,8 @@
14861564 shmem_pseudo_vma_destroy(&pvma);
14871565 if (page)
14881566 prep_transhuge_page(page);
1567
+ else
1568
+ count_vm_event(THP_FILE_FALLBACK);
14891569 return page;
14901570 }
14911571
....@@ -1493,7 +1573,11 @@
14931573 struct shmem_inode_info *info, pgoff_t index)
14941574 {
14951575 struct vm_area_struct pvma;
1496
- struct page *page;
1576
+ struct page *page = NULL;
1577
+
1578
+ trace_android_vh_shmem_alloc_page(&page);
1579
+ if (page)
1580
+ return page;
14971581
14981582 shmem_pseudo_vma_init(&pvma, info, index);
14991583 page = alloc_page_vma(gfp, &pvma, 0);
....@@ -1511,7 +1595,7 @@
15111595 int nr;
15121596 int err = -ENOSPC;
15131597
1514
- if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
1598
+ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
15151599 huge = false;
15161600 nr = huge ? HPAGE_PMD_NR : 1;
15171601
....@@ -1589,11 +1673,11 @@
15891673 * a nice clean interface for us to replace oldpage by newpage there.
15901674 */
15911675 xa_lock_irq(&swap_mapping->i_pages);
1592
- error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage,
1593
- newpage);
1676
+ error = shmem_replace_entry(swap_mapping, swap_index, oldpage, newpage);
15941677 if (!error) {
1595
- __inc_node_page_state(newpage, NR_FILE_PAGES);
1596
- __dec_node_page_state(oldpage, NR_FILE_PAGES);
1678
+ mem_cgroup_migrate(oldpage, newpage);
1679
+ __inc_lruvec_page_state(newpage, NR_FILE_PAGES);
1680
+ __dec_lruvec_page_state(oldpage, NR_FILE_PAGES);
15971681 }
15981682 xa_unlock_irq(&swap_mapping->i_pages);
15991683
....@@ -1605,8 +1689,7 @@
16051689 */
16061690 oldpage = newpage;
16071691 } else {
1608
- mem_cgroup_migrate(oldpage, newpage);
1609
- lru_cache_add_anon(newpage);
1692
+ lru_cache_add(newpage);
16101693 *pagep = newpage;
16111694 }
16121695
....@@ -1620,13 +1703,109 @@
16201703 }
16211704
16221705 /*
1706
+ * Swap in the page pointed to by *pagep.
1707
+ * Caller has to make sure that *pagep contains a valid swapped page.
1708
+ * Returns 0 and the page in pagep if success. On failure, returns the
1709
+ * error code and NULL in *pagep.
1710
+ */
1711
+static int shmem_swapin_page(struct inode *inode, pgoff_t index,
1712
+ struct page **pagep, enum sgp_type sgp,
1713
+ gfp_t gfp, struct vm_area_struct *vma,
1714
+ vm_fault_t *fault_type)
1715
+{
1716
+ struct address_space *mapping = inode->i_mapping;
1717
+ struct shmem_inode_info *info = SHMEM_I(inode);
1718
+ struct mm_struct *charge_mm = vma ? vma->vm_mm : current->mm;
1719
+ struct page *page;
1720
+ swp_entry_t swap;
1721
+ int error;
1722
+
1723
+ VM_BUG_ON(!*pagep || !xa_is_value(*pagep));
1724
+ swap = radix_to_swp_entry(*pagep);
1725
+ *pagep = NULL;
1726
+
1727
+ /* Look it up and read it in.. */
1728
+ page = lookup_swap_cache(swap, NULL, 0);
1729
+ if (!page) {
1730
+ /* Or update major stats only when swapin succeeds?? */
1731
+ if (fault_type) {
1732
+ *fault_type |= VM_FAULT_MAJOR;
1733
+ count_vm_event(PGMAJFAULT);
1734
+ count_memcg_event_mm(charge_mm, PGMAJFAULT);
1735
+ }
1736
+ /* Here we actually start the io */
1737
+ page = shmem_swapin(swap, gfp, info, index);
1738
+ if (!page) {
1739
+ error = -ENOMEM;
1740
+ goto failed;
1741
+ }
1742
+ }
1743
+
1744
+ /* We have to do this with page locked to prevent races */
1745
+ lock_page(page);
1746
+ if (!PageSwapCache(page) || page_private(page) != swap.val ||
1747
+ !shmem_confirm_swap(mapping, index, swap)) {
1748
+ error = -EEXIST;
1749
+ goto unlock;
1750
+ }
1751
+ if (!PageUptodate(page)) {
1752
+ error = -EIO;
1753
+ goto failed;
1754
+ }
1755
+ wait_on_page_writeback(page);
1756
+
1757
+ /*
1758
+ * Some architectures may have to restore extra metadata to the
1759
+ * physical page after reading from swap.
1760
+ */
1761
+ arch_swap_restore(swap, page);
1762
+
1763
+ if (shmem_should_replace_page(page, gfp)) {
1764
+ error = shmem_replace_page(&page, gfp, info, index);
1765
+ if (error)
1766
+ goto failed;
1767
+ }
1768
+
1769
+ error = shmem_add_to_page_cache(page, mapping, index,
1770
+ swp_to_radix_entry(swap), gfp,
1771
+ charge_mm);
1772
+ if (error)
1773
+ goto failed;
1774
+
1775
+ spin_lock_irq(&info->lock);
1776
+ info->swapped--;
1777
+ shmem_recalc_inode(inode);
1778
+ spin_unlock_irq(&info->lock);
1779
+
1780
+ if (sgp == SGP_WRITE)
1781
+ mark_page_accessed(page);
1782
+
1783
+ delete_from_swap_cache(page);
1784
+ set_page_dirty(page);
1785
+ swap_free(swap);
1786
+
1787
+ *pagep = page;
1788
+ return 0;
1789
+failed:
1790
+ if (!shmem_confirm_swap(mapping, index, swap))
1791
+ error = -EEXIST;
1792
+unlock:
1793
+ if (page) {
1794
+ unlock_page(page);
1795
+ put_page(page);
1796
+ }
1797
+
1798
+ return error;
1799
+}
1800
+
1801
+/*
16231802 * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
16241803 *
16251804 * If we allocate a new one we do not mark it dirty. That's up to the
16261805 * vm. If we swap it in we mark it dirty since we also free the swap
16271806 * entry since a page cannot live in both the swap and page cache.
16281807 *
1629
- * fault_mm and fault_type are only supplied by shmem_fault:
1808
+ * vma, vmf, and fault_type are only supplied by shmem_fault:
16301809 * otherwise they are NULL.
16311810 */
16321811 static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
....@@ -1638,9 +1817,7 @@
16381817 struct shmem_inode_info *info = SHMEM_I(inode);
16391818 struct shmem_sb_info *sbinfo;
16401819 struct mm_struct *charge_mm;
1641
- struct mem_cgroup *memcg;
16421820 struct page *page;
1643
- swp_entry_t swap;
16441821 enum sgp_type sgp_huge = sgp;
16451822 pgoff_t hindex = index;
16461823 int error;
....@@ -1652,19 +1829,37 @@
16521829 if (sgp == SGP_NOHUGE || sgp == SGP_HUGE)
16531830 sgp = SGP_CACHE;
16541831 repeat:
1655
- swap.val = 0;
1656
- page = find_lock_entry(mapping, index);
1657
- if (radix_tree_exceptional_entry(page)) {
1658
- swap = radix_to_swp_entry(page);
1659
- page = NULL;
1660
- }
1661
-
16621832 if (sgp <= SGP_CACHE &&
16631833 ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
1664
- error = -EINVAL;
1665
- goto unlock;
1834
+ return -EINVAL;
16661835 }
16671836
1837
+ sbinfo = SHMEM_SB(inode->i_sb);
1838
+ charge_mm = vma ? vma->vm_mm : current->mm;
1839
+
1840
+ page = find_lock_entry(mapping, index);
1841
+
1842
+ if (page && vma && userfaultfd_minor(vma)) {
1843
+ if (!xa_is_value(page)) {
1844
+ unlock_page(page);
1845
+ put_page(page);
1846
+ }
1847
+ *fault_type = handle_userfault(vmf, VM_UFFD_MINOR);
1848
+ return 0;
1849
+ }
1850
+
1851
+ if (xa_is_value(page)) {
1852
+ error = shmem_swapin_page(inode, index, &page,
1853
+ sgp, gfp, vma, fault_type);
1854
+ if (error == -EEXIST)
1855
+ goto repeat;
1856
+
1857
+ *pagep = page;
1858
+ return error;
1859
+ }
1860
+
1861
+ if (page)
1862
+ hindex = page->index;
16681863 if (page && sgp == SGP_WRITE)
16691864 mark_page_accessed(page);
16701865
....@@ -1675,230 +1870,141 @@
16751870 unlock_page(page);
16761871 put_page(page);
16771872 page = NULL;
1873
+ hindex = index;
16781874 }
1679
- if (page || (sgp == SGP_READ && !swap.val)) {
1680
- *pagep = page;
1681
- return 0;
1682
- }
1875
+ if (page || sgp == SGP_READ)
1876
+ goto out;
16831877
16841878 /*
16851879 * Fast cache lookup did not find it:
16861880 * bring it back from swap or allocate.
16871881 */
1688
- sbinfo = SHMEM_SB(inode->i_sb);
1689
- charge_mm = vma ? vma->vm_mm : current->mm;
16901882
1691
- if (swap.val) {
1692
- /* Look it up and read it in.. */
1693
- page = lookup_swap_cache(swap, NULL, 0);
1694
- if (!page) {
1695
- /* Or update major stats only when swapin succeeds?? */
1696
- if (fault_type) {
1697
- *fault_type |= VM_FAULT_MAJOR;
1698
- count_vm_event(PGMAJFAULT);
1699
- count_memcg_event_mm(charge_mm, PGMAJFAULT);
1700
- }
1701
- /* Here we actually start the io */
1702
- page = shmem_swapin(swap, gfp, info, index);
1703
- if (!page) {
1704
- error = -ENOMEM;
1705
- goto failed;
1706
- }
1707
- }
1883
+ if (vma && userfaultfd_missing(vma)) {
1884
+ *fault_type = handle_userfault(vmf, VM_UFFD_MISSING);
1885
+ return 0;
1886
+ }
17081887
1709
- /* We have to do this with page locked to prevent races */
1710
- lock_page(page);
1711
- if (!PageSwapCache(page) || page_private(page) != swap.val ||
1712
- !shmem_confirm_swap(mapping, index, swap)) {
1713
- error = -EEXIST; /* try again */
1714
- goto unlock;
1715
- }
1716
- if (!PageUptodate(page)) {
1717
- error = -EIO;
1718
- goto failed;
1719
- }
1720
- wait_on_page_writeback(page);
1888
+ /* shmem_symlink() */
1889
+ if (mapping->a_ops != &shmem_aops)
1890
+ goto alloc_nohuge;
1891
+ if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE)
1892
+ goto alloc_nohuge;
1893
+ if (shmem_huge == SHMEM_HUGE_FORCE)
1894
+ goto alloc_huge;
1895
+ switch (sbinfo->huge) {
1896
+ case SHMEM_HUGE_NEVER:
1897
+ goto alloc_nohuge;
1898
+ case SHMEM_HUGE_WITHIN_SIZE: {
1899
+ loff_t i_size;
1900
+ pgoff_t off;
17211901
1722
- if (shmem_should_replace_page(page, gfp)) {
1723
- error = shmem_replace_page(&page, gfp, info, index);
1724
- if (error)
1725
- goto failed;
1726
- }
1727
-
1728
- error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
1729
- false);
1730
- if (!error) {
1731
- error = shmem_add_to_page_cache(page, mapping, index,
1732
- swp_to_radix_entry(swap));
1733
- /*
1734
- * We already confirmed swap under page lock, and make
1735
- * no memory allocation here, so usually no possibility
1736
- * of error; but free_swap_and_cache() only trylocks a
1737
- * page, so it is just possible that the entry has been
1738
- * truncated or holepunched since swap was confirmed.
1739
- * shmem_undo_range() will have done some of the
1740
- * unaccounting, now delete_from_swap_cache() will do
1741
- * the rest.
1742
- * Reset swap.val? No, leave it so "failed" goes back to
1743
- * "repeat": reading a hole and writing should succeed.
1744
- */
1745
- if (error) {
1746
- mem_cgroup_cancel_charge(page, memcg, false);
1747
- delete_from_swap_cache(page);
1748
- }
1749
- }
1750
- if (error)
1751
- goto failed;
1752
-
1753
- mem_cgroup_commit_charge(page, memcg, true, false);
1754
-
1755
- spin_lock_irq(&info->lock);
1756
- info->swapped--;
1757
- shmem_recalc_inode(inode);
1758
- spin_unlock_irq(&info->lock);
1759
-
1760
- if (sgp == SGP_WRITE)
1761
- mark_page_accessed(page);
1762
-
1763
- delete_from_swap_cache(page);
1764
- set_page_dirty(page);
1765
- swap_free(swap);
1766
-
1767
- } else {
1768
- if (vma && userfaultfd_missing(vma)) {
1769
- *fault_type = handle_userfault(vmf, VM_UFFD_MISSING);
1770
- return 0;
1771
- }
1772
-
1773
- /* shmem_symlink() */
1774
- if (mapping->a_ops != &shmem_aops)
1775
- goto alloc_nohuge;
1776
- if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE)
1777
- goto alloc_nohuge;
1778
- if (shmem_huge == SHMEM_HUGE_FORCE)
1902
+ off = round_up(index, HPAGE_PMD_NR);
1903
+ i_size = round_up(i_size_read(inode), PAGE_SIZE);
1904
+ if (i_size >= HPAGE_PMD_SIZE &&
1905
+ i_size >> PAGE_SHIFT >= off)
17791906 goto alloc_huge;
1780
- switch (sbinfo->huge) {
1781
- loff_t i_size;
1782
- pgoff_t off;
1783
- case SHMEM_HUGE_NEVER:
1784
- goto alloc_nohuge;
1785
- case SHMEM_HUGE_WITHIN_SIZE:
1786
- off = round_up(index, HPAGE_PMD_NR);
1787
- i_size = round_up(i_size_read(inode), PAGE_SIZE);
1788
- if (i_size >= HPAGE_PMD_SIZE &&
1789
- i_size >> PAGE_SHIFT >= off)
1790
- goto alloc_huge;
1791
- /* fallthrough */
1792
- case SHMEM_HUGE_ADVISE:
1793
- if (sgp_huge == SGP_HUGE)
1794
- goto alloc_huge;
1795
- /* TODO: implement fadvise() hints */
1796
- goto alloc_nohuge;
1797
- }
1907
+
1908
+ fallthrough;
1909
+ }
1910
+ case SHMEM_HUGE_ADVISE:
1911
+ if (sgp_huge == SGP_HUGE)
1912
+ goto alloc_huge;
1913
+ /* TODO: implement fadvise() hints */
1914
+ goto alloc_nohuge;
1915
+ }
17981916
17991917 alloc_huge:
1800
- page = shmem_alloc_and_acct_page(gfp, inode, index, true);
1801
- if (IS_ERR(page)) {
1802
-alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, inode,
1803
- index, false);
1804
- }
1805
- if (IS_ERR(page)) {
1806
- int retry = 5;
1807
- error = PTR_ERR(page);
1808
- page = NULL;
1809
- if (error != -ENOSPC)
1810
- goto failed;
1811
- /*
1812
- * Try to reclaim some spece by splitting a huge page
1813
- * beyond i_size on the filesystem.
1814
- */
1815
- while (retry--) {
1816
- int ret;
1817
- ret = shmem_unused_huge_shrink(sbinfo, NULL, 1);
1818
- if (ret == SHRINK_STOP)
1819
- break;
1820
- if (ret)
1821
- goto alloc_nohuge;
1822
- }
1823
- goto failed;
1824
- }
1918
+ page = shmem_alloc_and_acct_page(gfp, inode, index, true);
1919
+ if (IS_ERR(page)) {
1920
+alloc_nohuge:
1921
+ page = shmem_alloc_and_acct_page(gfp, inode,
1922
+ index, false);
1923
+ }
1924
+ if (IS_ERR(page)) {
1925
+ int retry = 5;
18251926
1826
- if (PageTransHuge(page))
1827
- hindex = round_down(index, HPAGE_PMD_NR);
1828
- else
1829
- hindex = index;
1830
-
1831
- if (sgp == SGP_WRITE)
1832
- __SetPageReferenced(page);
1833
-
1834
- error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
1835
- PageTransHuge(page));
1836
- if (error)
1837
- goto unacct;
1838
- error = radix_tree_maybe_preload_order(gfp & GFP_RECLAIM_MASK,
1839
- compound_order(page));
1840
- if (!error) {
1841
- error = shmem_add_to_page_cache(page, mapping, hindex,
1842
- NULL);
1843
- radix_tree_preload_end();
1844
- }
1845
- if (error) {
1846
- mem_cgroup_cancel_charge(page, memcg,
1847
- PageTransHuge(page));
1848
- goto unacct;
1849
- }
1850
- mem_cgroup_commit_charge(page, memcg, false,
1851
- PageTransHuge(page));
1852
- lru_cache_add_anon(page);
1853
-
1854
- spin_lock_irq(&info->lock);
1855
- info->alloced += 1 << compound_order(page);
1856
- inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page);
1857
- shmem_recalc_inode(inode);
1858
- spin_unlock_irq(&info->lock);
1859
- alloced = true;
1860
-
1861
- if (PageTransHuge(page) &&
1862
- DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
1863
- hindex + HPAGE_PMD_NR - 1) {
1864
- /*
1865
- * Part of the huge page is beyond i_size: subject
1866
- * to shrink under memory pressure.
1867
- */
1868
- spin_lock(&sbinfo->shrinklist_lock);
1869
- /*
1870
- * _careful to defend against unlocked access to
1871
- * ->shrink_list in shmem_unused_huge_shrink()
1872
- */
1873
- if (list_empty_careful(&info->shrinklist)) {
1874
- list_add_tail(&info->shrinklist,
1875
- &sbinfo->shrinklist);
1876
- sbinfo->shrinklist_len++;
1877
- }
1878
- spin_unlock(&sbinfo->shrinklist_lock);
1879
- }
1880
-
1927
+ error = PTR_ERR(page);
1928
+ page = NULL;
1929
+ if (error != -ENOSPC)
1930
+ goto unlock;
18811931 /*
1882
- * Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
1932
+ * Try to reclaim some space by splitting a huge page
1933
+ * beyond i_size on the filesystem.
18831934 */
1884
- if (sgp == SGP_FALLOC)
1885
- sgp = SGP_WRITE;
1935
+ while (retry--) {
1936
+ int ret;
1937
+
1938
+ ret = shmem_unused_huge_shrink(sbinfo, NULL, 1);
1939
+ if (ret == SHRINK_STOP)
1940
+ break;
1941
+ if (ret)
1942
+ goto alloc_nohuge;
1943
+ }
1944
+ goto unlock;
1945
+ }
1946
+
1947
+ if (PageTransHuge(page))
1948
+ hindex = round_down(index, HPAGE_PMD_NR);
1949
+ else
1950
+ hindex = index;
1951
+
1952
+ if (sgp == SGP_WRITE)
1953
+ __SetPageReferenced(page);
1954
+
1955
+ error = shmem_add_to_page_cache(page, mapping, hindex,
1956
+ NULL, gfp & GFP_RECLAIM_MASK,
1957
+ charge_mm);
1958
+ if (error)
1959
+ goto unacct;
1960
+ lru_cache_add(page);
1961
+
1962
+ spin_lock_irq(&info->lock);
1963
+ info->alloced += compound_nr(page);
1964
+ inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page);
1965
+ shmem_recalc_inode(inode);
1966
+ spin_unlock_irq(&info->lock);
1967
+ alloced = true;
1968
+
1969
+ if (PageTransHuge(page) &&
1970
+ DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
1971
+ hindex + HPAGE_PMD_NR - 1) {
1972
+ /*
1973
+ * Part of the huge page is beyond i_size: subject
1974
+ * to shrink under memory pressure.
1975
+ */
1976
+ spin_lock(&sbinfo->shrinklist_lock);
1977
+ /*
1978
+ * _careful to defend against unlocked access to
1979
+ * ->shrink_list in shmem_unused_huge_shrink()
1980
+ */
1981
+ if (list_empty_careful(&info->shrinklist)) {
1982
+ list_add_tail(&info->shrinklist,
1983
+ &sbinfo->shrinklist);
1984
+ sbinfo->shrinklist_len++;
1985
+ }
1986
+ spin_unlock(&sbinfo->shrinklist_lock);
1987
+ }
1988
+
1989
+ /*
1990
+ * Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
1991
+ */
1992
+ if (sgp == SGP_FALLOC)
1993
+ sgp = SGP_WRITE;
18861994 clear:
1887
- /*
1888
- * Let SGP_WRITE caller clear ends if write does not fill page;
1889
- * but SGP_FALLOC on a page fallocated earlier must initialize
1890
- * it now, lest undo on failure cancel our earlier guarantee.
1891
- */
1892
- if (sgp != SGP_WRITE && !PageUptodate(page)) {
1893
- struct page *head = compound_head(page);
1894
- int i;
1995
+ /*
1996
+ * Let SGP_WRITE caller clear ends if write does not fill page;
1997
+ * but SGP_FALLOC on a page fallocated earlier must initialize
1998
+ * it now, lest undo on failure cancel our earlier guarantee.
1999
+ */
2000
+ if (sgp != SGP_WRITE && !PageUptodate(page)) {
2001
+ int i;
18952002
1896
- for (i = 0; i < (1 << compound_order(head)); i++) {
1897
- clear_highpage(head + i);
1898
- flush_dcache_page(head + i);
1899
- }
1900
- SetPageUptodate(head);
2003
+ for (i = 0; i < compound_nr(page); i++) {
2004
+ clear_highpage(page + i);
2005
+ flush_dcache_page(page + i);
19012006 }
2007
+ SetPageUptodate(page);
19022008 }
19032009
19042010 /* Perhaps the file has been truncated since we checked */
....@@ -1914,6 +2020,7 @@
19142020 error = -EINVAL;
19152021 goto unlock;
19162022 }
2023
+out:
19172024 *pagep = page + index - hindex;
19182025 return 0;
19192026
....@@ -1921,16 +2028,13 @@
19212028 * Error recovery.
19222029 */
19232030 unacct:
1924
- shmem_inode_unacct_blocks(inode, 1 << compound_order(page));
2031
+ shmem_inode_unacct_blocks(inode, compound_nr(page));
19252032
19262033 if (PageTransHuge(page)) {
19272034 unlock_page(page);
19282035 put_page(page);
19292036 goto alloc_nohuge;
19302037 }
1931
-failed:
1932
- if (swap.val && !shmem_confirm_swap(mapping, index, swap))
1933
- error = -EEXIST;
19342038 unlock:
19352039 if (page) {
19362040 unlock_page(page);
....@@ -1942,7 +2046,7 @@
19422046 spin_unlock_irq(&info->lock);
19432047 goto repeat;
19442048 }
1945
- if (error == -EEXIST) /* from above or from radix_tree_insert */
2049
+ if (error == -EEXIST)
19462050 goto repeat;
19472051 return error;
19482052 }
....@@ -1994,16 +2098,14 @@
19942098 shmem_falloc->waitq &&
19952099 vmf->pgoff >= shmem_falloc->start &&
19962100 vmf->pgoff < shmem_falloc->next) {
2101
+ struct file *fpin;
19972102 wait_queue_head_t *shmem_falloc_waitq;
19982103 DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);
19992104
20002105 ret = VM_FAULT_NOPAGE;
2001
- if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) &&
2002
- !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
2003
- /* It's polite to up mmap_sem if we can */
2004
- up_read(&vma->vm_mm->mmap_sem);
2106
+ fpin = maybe_unlock_mmap_for_io(vmf, NULL);
2107
+ if (fpin)
20052108 ret = VM_FAULT_RETRY;
2006
- }
20072109
20082110 shmem_falloc_waitq = shmem_falloc->waitq;
20092111 prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
....@@ -2021,6 +2123,9 @@
20212123 spin_lock(&inode->i_lock);
20222124 finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
20232125 spin_unlock(&inode->i_lock);
2126
+
2127
+ if (fpin)
2128
+ fput(fpin);
20242129 return ret;
20252130 }
20262131 spin_unlock(&inode->i_lock);
....@@ -2059,7 +2164,7 @@
20592164 get_area = current->mm->get_unmapped_area;
20602165 addr = get_area(file, uaddr, len, pgoff, flags);
20612166
2062
- if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
2167
+ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
20632168 return addr;
20642169 if (IS_ERR_VALUE(addr))
20652170 return addr;
....@@ -2179,26 +2284,18 @@
21792284 static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
21802285 {
21812286 struct shmem_inode_info *info = SHMEM_I(file_inode(file));
2287
+ int ret;
21822288
2183
- if (info->seals & F_SEAL_FUTURE_WRITE) {
2184
- /*
2185
- * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
2186
- * "future write" seal active.
2187
- */
2188
- if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
2189
- return -EPERM;
2289
+ ret = seal_check_future_write(info->seals, vma);
2290
+ if (ret)
2291
+ return ret;
21902292
2191
- /*
2192
- * Since the F_SEAL_FUTURE_WRITE seals allow for a MAP_SHARED
2193
- * read-only mapping, take care to not allow mprotect to revert
2194
- * protections.
2195
- */
2196
- vma->vm_flags &= ~(VM_MAYWRITE);
2197
- }
2293
+ /* arm64 - allow memory tagging on RAM-based files */
2294
+ vma->vm_flags |= VM_MTE_ALLOWED;
21982295
21992296 file_accessed(file);
22002297 vma->vm_ops = &shmem_vm_ops;
2201
- if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
2298
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
22022299 ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
22032300 (vma->vm_end & HPAGE_PMD_MASK)) {
22042301 khugepaged_enter(vma, vma->vm_flags);
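The open-coded F_SEAL_FUTURE_WRITE check is replaced by the shared seal_check_future_write() helper, but the contract stays the same: once the seal is set, a new writable shared mapping fails with EPERM while a read-only one still succeeds. That behaviour can be checked from userspace (assumes Linux >= 5.1 and glibc >= 2.27; F_SEAL_FUTURE_WRITE is defined locally in case the libc headers predate it):

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef F_SEAL_FUTURE_WRITE
#define F_SEAL_FUTURE_WRITE 0x0010	/* uapi value, if the libc lacks it */
#endif

int main(void)
{
	int fd = memfd_create("seal-demo", MFD_ALLOW_SEALING);

	if (fd < 0 || ftruncate(fd, 4096) < 0)
		return 1;
	if (fcntl(fd, F_ADD_SEALS, F_SEAL_FUTURE_WRITE) < 0) {
		perror("F_ADD_SEALS");
		return 1;
	}

	/* expected to fail with EPERM once the seal is active */
	void *rw = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	printf("PROT_WRITE mmap: %s\n",
	       rw == MAP_FAILED ? strerror(errno) : "unexpectedly succeeded");

	/* a read-only shared mapping is still allowed */
	void *ro = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);
	printf("PROT_READ  mmap: %s\n",
	       ro == MAP_FAILED ? strerror(errno) : "ok");
	return 0;
}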
....@@ -2212,13 +2309,14 @@
22122309 struct inode *inode;
22132310 struct shmem_inode_info *info;
22142311 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
2312
+ ino_t ino;
22152313
2216
- if (shmem_reserve_inode(sb))
2314
+ if (shmem_reserve_inode(sb, &ino))
22172315 return NULL;
22182316
22192317 inode = new_inode(sb);
22202318 if (inode) {
2221
- inode->i_ino = get_next_ino();
2319
+ inode->i_ino = ino;
22222320 inode_init_owner(inode, dir, mode);
22232321 inode->i_blocks = 0;
22242322 inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
....@@ -2226,6 +2324,7 @@
22262324 info = SHMEM_I(inode);
22272325 memset(info, 0, (char *)inode - (char *)info);
22282326 spin_lock_init(&info->lock);
2327
+ atomic_set(&info->stop_eviction, 0);
22292328 info->seals = F_SEAL_SEAL;
22302329 info->flags = flags & VM_NORESERVE;
22312330 INIT_LIST_HEAD(&info->shrinklist);
....@@ -2272,28 +2371,25 @@
22722371 return mapping->a_ops == &shmem_aops;
22732372 }
22742373
2275
-static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
2276
- pmd_t *dst_pmd,
2277
- struct vm_area_struct *dst_vma,
2278
- unsigned long dst_addr,
2279
- unsigned long src_addr,
2280
- bool zeropage,
2281
- struct page **pagep)
2374
+#ifdef CONFIG_USERFAULTFD
2375
+int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
2376
+ pmd_t *dst_pmd,
2377
+ struct vm_area_struct *dst_vma,
2378
+ unsigned long dst_addr,
2379
+ unsigned long src_addr,
2380
+ bool zeropage,
2381
+ struct page **pagep)
22822382 {
22832383 struct inode *inode = file_inode(dst_vma->vm_file);
22842384 struct shmem_inode_info *info = SHMEM_I(inode);
22852385 struct address_space *mapping = inode->i_mapping;
22862386 gfp_t gfp = mapping_gfp_mask(mapping);
22872387 pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
2288
- struct mem_cgroup *memcg;
2289
- spinlock_t *ptl;
22902388 void *page_kaddr;
22912389 struct page *page;
2292
- pte_t _dst_pte, *dst_pte;
22932390 int ret;
2294
- pgoff_t offset, max_off;
2391
+ pgoff_t max_off;
22952392
2296
- ret = -ENOMEM;
22972393 if (!shmem_inode_acct_block(inode, 1)) {
22982394 /*
22992395 * We may have got a page, returned -ENOENT triggering a retry,
....@@ -2304,29 +2400,30 @@
23042400 put_page(*pagep);
23052401 *pagep = NULL;
23062402 }
2307
- goto out;
2403
+ return -ENOMEM;
23082404 }
23092405
23102406 if (!*pagep) {
2407
+ ret = -ENOMEM;
23112408 page = shmem_alloc_page(gfp, info, pgoff);
23122409 if (!page)
23132410 goto out_unacct_blocks;
23142411
2315
- if (!zeropage) { /* mcopy_atomic */
2412
+ if (!zeropage) { /* COPY */
23162413 page_kaddr = kmap_atomic(page);
23172414 ret = copy_from_user(page_kaddr,
23182415 (const void __user *)src_addr,
23192416 PAGE_SIZE);
23202417 kunmap_atomic(page_kaddr);
23212418
2322
- /* fallback to copy_from_user outside mmap_sem */
2419
+ /* fallback to copy_from_user outside mmap_lock */
23232420 if (unlikely(ret)) {
23242421 *pagep = page;
2325
- shmem_inode_unacct_blocks(inode, 1);
2422
+ ret = -ENOENT;
23262423 /* don't free the page */
2327
- return -ENOENT;
2424
+ goto out_unacct_blocks;
23282425 }
2329
- } else { /* mfill_zeropage_atomic */
2426
+ } else { /* ZEROPAGE */
23302427 clear_highpage(page);
23312428 }
23322429 } else {
....@@ -2334,57 +2431,26 @@
23342431 *pagep = NULL;
23352432 }
23362433
2337
- VM_BUG_ON(PageLocked(page) || PageSwapBacked(page));
2434
+ VM_BUG_ON(PageLocked(page));
2435
+ VM_BUG_ON(PageSwapBacked(page));
23382436 __SetPageLocked(page);
23392437 __SetPageSwapBacked(page);
23402438 __SetPageUptodate(page);
23412439
23422440 ret = -EFAULT;
2343
- offset = linear_page_index(dst_vma, dst_addr);
23442441 max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
2345
- if (unlikely(offset >= max_off))
2442
+ if (unlikely(pgoff >= max_off))
23462443 goto out_release;
23472444
2348
- ret = mem_cgroup_try_charge_delay(page, dst_mm, gfp, &memcg, false);
2445
+ ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL,
2446
+ gfp & GFP_RECLAIM_MASK, dst_mm);
23492447 if (ret)
23502448 goto out_release;
23512449
2352
- ret = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK);
2353
- if (!ret) {
2354
- ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL);
2355
- radix_tree_preload_end();
2356
- }
2450
+ ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
2451
+ page, true, false);
23572452 if (ret)
2358
- goto out_release_uncharge;
2359
-
2360
- mem_cgroup_commit_charge(page, memcg, false, false);
2361
-
2362
- _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
2363
- if (dst_vma->vm_flags & VM_WRITE)
2364
- _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));
2365
- else {
2366
- /*
2367
- * We don't set the pte dirty if the vma has no
2368
- * VM_WRITE permission, so mark the page dirty or it
2369
- * could be freed from under us. We could do it
2370
- * unconditionally before unlock_page(), but doing it
2371
- * only if VM_WRITE is not set is faster.
2372
- */
2373
- set_page_dirty(page);
2374
- }
2375
-
2376
- dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
2377
-
2378
- ret = -EFAULT;
2379
- max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
2380
- if (unlikely(offset >= max_off))
2381
- goto out_release_uncharge_unlock;
2382
-
2383
- ret = -EEXIST;
2384
- if (!pte_none(*dst_pte))
2385
- goto out_release_uncharge_unlock;
2386
-
2387
- lru_cache_add_anon(page);
2453
+ goto out_delete_from_cache;
23882454
23892455 spin_lock_irq(&info->lock);
23902456 info->alloced++;
....@@ -2392,52 +2458,19 @@
23922458 shmem_recalc_inode(inode);
23932459 spin_unlock_irq(&info->lock);
23942460
2395
- inc_mm_counter(dst_mm, mm_counter_file(page));
2396
- page_add_file_rmap(page, false);
2397
- set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
2398
-
2399
- /* No need to invalidate - it was non-present before */
2400
- update_mmu_cache(dst_vma, dst_addr, dst_pte);
2401
- pte_unmap_unlock(dst_pte, ptl);
2461
+ SetPageDirty(page);
24022462 unlock_page(page);
2403
- ret = 0;
2404
-out:
2405
- return ret;
2406
-out_release_uncharge_unlock:
2407
- pte_unmap_unlock(dst_pte, ptl);
2408
- ClearPageDirty(page);
2463
+ return 0;
2464
+out_delete_from_cache:
24092465 delete_from_page_cache(page);
2410
-out_release_uncharge:
2411
- mem_cgroup_cancel_charge(page, memcg, false);
24122466 out_release:
24132467 unlock_page(page);
24142468 put_page(page);
24152469 out_unacct_blocks:
24162470 shmem_inode_unacct_blocks(inode, 1);
2417
- goto out;
2471
+ return ret;
24182472 }
2419
-
2420
-int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
2421
- pmd_t *dst_pmd,
2422
- struct vm_area_struct *dst_vma,
2423
- unsigned long dst_addr,
2424
- unsigned long src_addr,
2425
- struct page **pagep)
2426
-{
2427
- return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
2428
- dst_addr, src_addr, false, pagep);
2429
-}
2430
-
2431
-int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm,
2432
- pmd_t *dst_pmd,
2433
- struct vm_area_struct *dst_vma,
2434
- unsigned long dst_addr)
2435
-{
2436
- struct page *page = NULL;
2437
-
2438
- return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
2439
- dst_addr, 0, true, &page);
2440
-}
2473
+#endif /* CONFIG_USERFAULTFD */
24412474
24422475 #ifdef CONFIG_TMPFS
24432476 static const struct inode_operations shmem_symlink_inode_operations;
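
The rewritten shmem_mfill_atomic_pte() above is reached from the userfaultfd UFFDIO_COPY/UFFDIO_ZEROPAGE ioctls on a shmem-backed VMA. For orientation only, here is a minimal userspace sketch of that call path. It is not part of this patch: it assumes userfaultfd and memfd_create() are available, handles the fault inline instead of in a monitor thread, and omits all error checking.

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);

	/* shmem object with one not-yet-populated page, mapped shared */
	int mfd = memfd_create("uffd-demo", 0);
	ftruncate(mfd, psz);
	char *dst = mmap(NULL, psz, PROT_READ | PROT_WRITE, MAP_SHARED, mfd, 0);

	/* register the range for missing-page faults */
	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
	struct uffdio_api api = { .api = UFFD_API };
	ioctl(uffd, UFFDIO_API, &api);
	struct uffdio_register reg = {
		.range = { .start = (unsigned long)dst, .len = psz },
		.mode  = UFFDIO_REGISTER_MODE_MISSING,
	};
	ioctl(uffd, UFFDIO_REGISTER, &reg);

	/* source buffer whose contents will resolve the missing page */
	char *src = mmap(NULL, psz, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	memset(src, 0xaa, psz);

	/* normally issued after reading a fault event from uffd */
	struct uffdio_copy copy = {
		.dst = (unsigned long)dst,
		.src = (unsigned long)src,
		.len = psz,
	};
	ioctl(uffd, UFFDIO_COPY, &copy);	/* -> shmem_mfill_atomic_pte() */
	return 0;
}

In a real monitor the UFFDIO_COPY ioctl is issued after reading a struct uffd_msg from the uffd descriptor; either way it is that ioctl which ends up allocating, copying into and inserting the shmem page through the function above.
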
....@@ -2617,7 +2650,7 @@
26172650 }
26182651
26192652 /*
2620
- * llseek SEEK_DATA or SEEK_HOLE through the radix_tree.
2653
+ * llseek SEEK_DATA or SEEK_HOLE through the page cache.
26212654 */
26222655 static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
26232656 pgoff_t index, pgoff_t end, int whence)
....@@ -2647,7 +2680,7 @@
26472680 index = indices[i];
26482681 }
26492682 page = pvec.pages[i];
2650
- if (page && !radix_tree_exceptional_entry(page)) {
2683
+ if (page && !xa_is_value(page)) {
26512684 if (!PageUptodate(page))
26522685 page = NULL;
26532686 }
....@@ -2943,7 +2976,7 @@
29432976 * first link must skip that, to get the accounting right.
29442977 */
29452978 if (inode->i_nlink) {
2946
- ret = shmem_reserve_inode(inode->i_sb);
2979
+ ret = shmem_reserve_inode(inode->i_sb, NULL);
29472980 if (ret)
29482981 goto out;
29492982 }
....@@ -3095,12 +3128,9 @@
30953128
30963129 error = security_inode_init_security(inode, dir, &dentry->d_name,
30973130 shmem_initxattrs, NULL);
3098
- if (error) {
3099
- if (error != -EOPNOTSUPP) {
3100
- iput(inode);
3101
- return error;
3102
- }
3103
- error = 0;
3131
+ if (error && error != -EOPNOTSUPP) {
3132
+ iput(inode);
3133
+ return error;
31043134 }
31053135
31063136 inode->i_size = len-1;
....@@ -3192,7 +3222,7 @@
31923222 new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len,
31933223 GFP_KERNEL);
31943224 if (!new_xattr->name) {
3195
- kfree(new_xattr);
3225
+ kvfree(new_xattr);
31963226 return -ENOMEM;
31973227 }
31983228
....@@ -3209,7 +3239,8 @@
32093239
32103240 static int shmem_xattr_handler_get(const struct xattr_handler *handler,
32113241 struct dentry *unused, struct inode *inode,
3212
- const char *name, void *buffer, size_t size)
3242
+ const char *name, void *buffer, size_t size,
3243
+ int flags)
32133244 {
32143245 struct shmem_inode_info *info = SHMEM_I(inode);
32153246
....@@ -3225,7 +3256,7 @@
32253256 struct shmem_inode_info *info = SHMEM_I(inode);
32263257
32273258 name = xattr_full_name(handler, name);
3228
- return simple_xattr_set(&info->xattrs, name, value, size, flags);
3259
+ return simple_xattr_set(&info->xattrs, name, value, size, flags, NULL);
32293260 }
32303261
32313262 static const struct xattr_handler shmem_security_xattr_handler = {
....@@ -3352,16 +3383,142 @@
33523383 .fh_to_dentry = shmem_fh_to_dentry,
33533384 };
33543385
3355
-static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
3356
- bool remount)
3386
+enum shmem_param {
3387
+ Opt_gid,
3388
+ Opt_huge,
3389
+ Opt_mode,
3390
+ Opt_mpol,
3391
+ Opt_nr_blocks,
3392
+ Opt_nr_inodes,
3393
+ Opt_size,
3394
+ Opt_uid,
3395
+ Opt_inode32,
3396
+ Opt_inode64,
3397
+};
3398
+
3399
+static const struct constant_table shmem_param_enums_huge[] = {
3400
+ {"never", SHMEM_HUGE_NEVER },
3401
+ {"always", SHMEM_HUGE_ALWAYS },
3402
+ {"within_size", SHMEM_HUGE_WITHIN_SIZE },
3403
+ {"advise", SHMEM_HUGE_ADVISE },
3404
+ {}
3405
+};
3406
+
3407
+const struct fs_parameter_spec shmem_fs_parameters[] = {
3408
+ fsparam_u32 ("gid", Opt_gid),
3409
+ fsparam_enum ("huge", Opt_huge, shmem_param_enums_huge),
3410
+ fsparam_u32oct("mode", Opt_mode),
3411
+ fsparam_string("mpol", Opt_mpol),
3412
+ fsparam_string("nr_blocks", Opt_nr_blocks),
3413
+ fsparam_string("nr_inodes", Opt_nr_inodes),
3414
+ fsparam_string("size", Opt_size),
3415
+ fsparam_u32 ("uid", Opt_uid),
3416
+ fsparam_flag ("inode32", Opt_inode32),
3417
+ fsparam_flag ("inode64", Opt_inode64),
3418
+ {}
3419
+};
3420
+
3421
+static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
33573422 {
3358
- char *this_char, *value, *rest;
3359
- struct mempolicy *mpol = NULL;
3360
- uid_t uid;
3361
- gid_t gid;
3423
+ struct shmem_options *ctx = fc->fs_private;
3424
+ struct fs_parse_result result;
3425
+ unsigned long long size;
3426
+ char *rest;
3427
+ int opt;
3428
+
3429
+ opt = fs_parse(fc, shmem_fs_parameters, param, &result);
3430
+ if (opt < 0)
3431
+ return opt;
3432
+
3433
+ switch (opt) {
3434
+ case Opt_size:
3435
+ size = memparse(param->string, &rest);
3436
+ if (*rest == '%') {
3437
+ size <<= PAGE_SHIFT;
3438
+ size *= totalram_pages();
3439
+ do_div(size, 100);
3440
+ rest++;
3441
+ }
3442
+ if (*rest)
3443
+ goto bad_value;
3444
+ ctx->blocks = DIV_ROUND_UP(size, PAGE_SIZE);
3445
+ ctx->seen |= SHMEM_SEEN_BLOCKS;
3446
+ break;
3447
+ case Opt_nr_blocks:
3448
+ ctx->blocks = memparse(param->string, &rest);
3449
+ if (*rest)
3450
+ goto bad_value;
3451
+ ctx->seen |= SHMEM_SEEN_BLOCKS;
3452
+ break;
3453
+ case Opt_nr_inodes:
3454
+ ctx->inodes = memparse(param->string, &rest);
3455
+ if (*rest)
3456
+ goto bad_value;
3457
+ ctx->seen |= SHMEM_SEEN_INODES;
3458
+ break;
3459
+ case Opt_mode:
3460
+ ctx->mode = result.uint_32 & 07777;
3461
+ break;
3462
+ case Opt_uid:
3463
+ ctx->uid = make_kuid(current_user_ns(), result.uint_32);
3464
+ if (!uid_valid(ctx->uid))
3465
+ goto bad_value;
3466
+ break;
3467
+ case Opt_gid:
3468
+ ctx->gid = make_kgid(current_user_ns(), result.uint_32);
3469
+ if (!gid_valid(ctx->gid))
3470
+ goto bad_value;
3471
+ break;
3472
+ case Opt_huge:
3473
+ ctx->huge = result.uint_32;
3474
+ if (ctx->huge != SHMEM_HUGE_NEVER &&
3475
+ !(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
3476
+ has_transparent_hugepage()))
3477
+ goto unsupported_parameter;
3478
+ ctx->seen |= SHMEM_SEEN_HUGE;
3479
+ break;
3480
+ case Opt_mpol:
3481
+ if (IS_ENABLED(CONFIG_NUMA)) {
3482
+ mpol_put(ctx->mpol);
3483
+ ctx->mpol = NULL;
3484
+ if (mpol_parse_str(param->string, &ctx->mpol))
3485
+ goto bad_value;
3486
+ break;
3487
+ }
3488
+ goto unsupported_parameter;
3489
+ case Opt_inode32:
3490
+ ctx->full_inums = false;
3491
+ ctx->seen |= SHMEM_SEEN_INUMS;
3492
+ break;
3493
+ case Opt_inode64:
3494
+ if (sizeof(ino_t) < 8) {
3495
+ return invalfc(fc,
3496
+ "Cannot use inode64 with <64bit inums in kernel\n");
3497
+ }
3498
+ ctx->full_inums = true;
3499
+ ctx->seen |= SHMEM_SEEN_INUMS;
3500
+ break;
3501
+ }
3502
+ return 0;
3503
+
3504
+unsupported_parameter:
3505
+ return invalfc(fc, "Unsupported parameter '%s'", param->key);
3506
+bad_value:
3507
+ return invalfc(fc, "Bad value for '%s'", param->key);
3508
+}
3509
+
3510
+static int shmem_parse_options(struct fs_context *fc, void *data)
3511
+{
3512
+ char *options = data;
3513
+
3514
+ if (options) {
3515
+ int err = security_sb_eat_lsm_opts(options, &fc->security);
3516
+ if (err)
3517
+ return err;
3518
+ }
33623519
33633520 while (options != NULL) {
3364
- this_char = options;
3521
+ char *this_char = options;
33653522 for (;;) {
33663523 /*
33673524 * NUL-terminate this option: unfortunately,
....@@ -3377,139 +3534,93 @@
33773534 break;
33783535 }
33793536 }
3380
- if (!*this_char)
3381
- continue;
3382
- if ((value = strchr(this_char,'=')) != NULL) {
3383
- *value++ = 0;
3384
- } else {
3385
- pr_err("tmpfs: No value for mount option '%s'\n",
3386
- this_char);
3387
- goto error;
3388
- }
3537
+ if (*this_char) {
3538
+ char *value = strchr(this_char,'=');
3539
+ size_t len = 0;
3540
+ int err;
33893541
3390
- if (!strcmp(this_char,"size")) {
3391
- unsigned long long size;
3392
- size = memparse(value,&rest);
3393
- if (*rest == '%') {
3394
- size <<= PAGE_SHIFT;
3395
- size *= totalram_pages;
3396
- do_div(size, 100);
3397
- rest++;
3542
+ if (value) {
3543
+ *value++ = '\0';
3544
+ len = strlen(value);
33983545 }
3399
- if (*rest)
3400
- goto bad_val;
3401
- sbinfo->max_blocks =
3402
- DIV_ROUND_UP(size, PAGE_SIZE);
3403
- } else if (!strcmp(this_char,"nr_blocks")) {
3404
- sbinfo->max_blocks = memparse(value, &rest);
3405
- if (*rest)
3406
- goto bad_val;
3407
- } else if (!strcmp(this_char,"nr_inodes")) {
3408
- sbinfo->max_inodes = memparse(value, &rest);
3409
- if (*rest)
3410
- goto bad_val;
3411
- } else if (!strcmp(this_char,"mode")) {
3412
- if (remount)
3413
- continue;
3414
- sbinfo->mode = simple_strtoul(value, &rest, 8) & 07777;
3415
- if (*rest)
3416
- goto bad_val;
3417
- } else if (!strcmp(this_char,"uid")) {
3418
- if (remount)
3419
- continue;
3420
- uid = simple_strtoul(value, &rest, 0);
3421
- if (*rest)
3422
- goto bad_val;
3423
- sbinfo->uid = make_kuid(current_user_ns(), uid);
3424
- if (!uid_valid(sbinfo->uid))
3425
- goto bad_val;
3426
- } else if (!strcmp(this_char,"gid")) {
3427
- if (remount)
3428
- continue;
3429
- gid = simple_strtoul(value, &rest, 0);
3430
- if (*rest)
3431
- goto bad_val;
3432
- sbinfo->gid = make_kgid(current_user_ns(), gid);
3433
- if (!gid_valid(sbinfo->gid))
3434
- goto bad_val;
3435
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
3436
- } else if (!strcmp(this_char, "huge")) {
3437
- int huge;
3438
- huge = shmem_parse_huge(value);
3439
- if (huge < 0)
3440
- goto bad_val;
3441
- if (!has_transparent_hugepage() &&
3442
- huge != SHMEM_HUGE_NEVER)
3443
- goto bad_val;
3444
- sbinfo->huge = huge;
3445
-#endif
3446
-#ifdef CONFIG_NUMA
3447
- } else if (!strcmp(this_char,"mpol")) {
3448
- mpol_put(mpol);
3449
- mpol = NULL;
3450
- if (mpol_parse_str(value, &mpol))
3451
- goto bad_val;
3452
-#endif
3453
- } else {
3454
- pr_err("tmpfs: Bad mount option %s\n", this_char);
3455
- goto error;
3546
+ err = vfs_parse_fs_string(fc, this_char, value, len);
3547
+ if (err < 0)
3548
+ return err;
34563549 }
34573550 }
3458
- sbinfo->mpol = mpol;
34593551 return 0;
3460
-
3461
-bad_val:
3462
- pr_err("tmpfs: Bad value '%s' for mount option '%s'\n",
3463
- value, this_char);
3464
-error:
3465
- mpol_put(mpol);
3466
- return 1;
3467
-
34683552 }
34693553
3470
-static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
3554
+/*
3555
+ * Reconfigure a shmem filesystem.
3556
+ *
3557
+ * Note that we disallow change from limited->unlimited blocks/inodes while any
3558
+ * are in use; but we must separately disallow unlimited->limited, because in
3559
+ * that case we have no record of how much is already in use.
3560
+ */
3561
+static int shmem_reconfigure(struct fs_context *fc)
34713562 {
3472
- struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
3473
- struct shmem_sb_info config = *sbinfo;
3563
+ struct shmem_options *ctx = fc->fs_private;
3564
+ struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb);
34743565 unsigned long inodes;
3475
- int error = -EINVAL;
3566
+ struct mempolicy *mpol = NULL;
3567
+ const char *err;
34763568
3477
- config.mpol = NULL;
3478
- if (shmem_parse_options(data, &config, true))
3479
- return error;
3480
-
3481
- spin_lock(&sbinfo->stat_lock);
3569
+ raw_spin_lock(&sbinfo->stat_lock);
34823570 inodes = sbinfo->max_inodes - sbinfo->free_inodes;
3483
- if (percpu_counter_compare(&sbinfo->used_blocks, config.max_blocks) > 0)
3484
- goto out;
3485
- if (config.max_inodes < inodes)
3486
- goto out;
3487
- /*
3488
- * Those tests disallow limited->unlimited while any are in use;
3489
- * but we must separately disallow unlimited->limited, because
3490
- * in that case we have no record of how much is already in use.
3491
- */
3492
- if (config.max_blocks && !sbinfo->max_blocks)
3493
- goto out;
3494
- if (config.max_inodes && !sbinfo->max_inodes)
3495
- goto out;
3571
+ if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) {
3572
+ if (!sbinfo->max_blocks) {
3573
+ err = "Cannot retroactively limit size";
3574
+ goto out;
3575
+ }
3576
+ if (percpu_counter_compare(&sbinfo->used_blocks,
3577
+ ctx->blocks) > 0) {
3578
+ err = "Too small a size for current use";
3579
+ goto out;
3580
+ }
3581
+ }
3582
+ if ((ctx->seen & SHMEM_SEEN_INODES) && ctx->inodes) {
3583
+ if (!sbinfo->max_inodes) {
3584
+ err = "Cannot retroactively limit inodes";
3585
+ goto out;
3586
+ }
3587
+ if (ctx->inodes < inodes) {
3588
+ err = "Too few inodes for current use";
3589
+ goto out;
3590
+ }
3591
+ }
34963592
3497
- error = 0;
3498
- sbinfo->huge = config.huge;
3499
- sbinfo->max_blocks = config.max_blocks;
3500
- sbinfo->max_inodes = config.max_inodes;
3501
- sbinfo->free_inodes = config.max_inodes - inodes;
3593
+ if ((ctx->seen & SHMEM_SEEN_INUMS) && !ctx->full_inums &&
3594
+ sbinfo->next_ino > UINT_MAX) {
3595
+ err = "Current inum too high to switch to 32-bit inums";
3596
+ goto out;
3597
+ }
3598
+
3599
+ if (ctx->seen & SHMEM_SEEN_HUGE)
3600
+ sbinfo->huge = ctx->huge;
3601
+ if (ctx->seen & SHMEM_SEEN_INUMS)
3602
+ sbinfo->full_inums = ctx->full_inums;
3603
+ if (ctx->seen & SHMEM_SEEN_BLOCKS)
3604
+ sbinfo->max_blocks = ctx->blocks;
3605
+ if (ctx->seen & SHMEM_SEEN_INODES) {
3606
+ sbinfo->max_inodes = ctx->inodes;
3607
+ sbinfo->free_inodes = ctx->inodes - inodes;
3608
+ }
35023609
35033610 /*
35043611 * Preserve previous mempolicy unless mpol remount option was specified.
35053612 */
3506
- if (config.mpol) {
3507
- mpol_put(sbinfo->mpol);
3508
- sbinfo->mpol = config.mpol; /* transfers initial ref */
3613
+ if (ctx->mpol) {
3614
+ mpol = sbinfo->mpol;
3615
+ sbinfo->mpol = ctx->mpol; /* transfers initial ref */
3616
+ ctx->mpol = NULL;
35093617 }
3618
+ raw_spin_unlock(&sbinfo->stat_lock);
3619
+ mpol_put(mpol);
3620
+ return 0;
35103621 out:
3511
- spin_unlock(&sbinfo->stat_lock);
3512
- return error;
3622
+ raw_spin_unlock(&sbinfo->stat_lock);
3623
+ return invalfc(fc, "%s", err);
35133624 }
35143625
35153626 static int shmem_show_options(struct seq_file *seq, struct dentry *root)
....@@ -3529,7 +3640,30 @@
35293640 if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
35303641 seq_printf(seq, ",gid=%u",
35313642 from_kgid_munged(&init_user_ns, sbinfo->gid));
3532
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
3643
+
3644
+ /*
3645
+ * Showing inode{64,32} might be useful even if it's the system default,
3646
+ * since then people don't have to resort to checking both here and
3647
+ * /proc/config.gz to confirm 64-bit inums were successfully applied
3648
+ * (which may not even exist if IKCONFIG_PROC isn't enabled).
3649
+ *
3650
+ * We hide it when inode64 isn't the default and we are using 32-bit
3651
+ * inodes, since that probably just means the feature isn't even under
3652
+ * consideration.
3653
+ *
3654
+ * As such:
3655
+ *
3656
+ *                    +-----------------+-----------------+
3657
+ *                    | TMPFS_INODE64=y | TMPFS_INODE64=n |
3658
+ * +------------------+-----------------+-----------------+
3659
+ * | full_inums=true  | show            | show            |
3660
+ * | full_inums=false | show            | hide            |
3661
+ * +------------------+-----------------+-----------------+
+ *                    +-----------------+-----------------+
3657
+ *                    | TMPFS_INODE64=y | TMPFS_INODE64=n |
3658
+ * +------------------+-----------------+-----------------+
3659
+ * | full_inums=true  | show            | show            |
3660
+ * | full_inums=false | show            | hide            |
3661
+ * +------------------+-----------------+-----------------+
3662
+ *
3663
+ */
3664
+ if (IS_ENABLED(CONFIG_TMPFS_INODE64) || sbinfo->full_inums)
3665
+ seq_printf(seq, ",inode%d", (sbinfo->full_inums ? 64 : 32));
3666
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
35333667 /* Rightly or wrongly, show huge mount option unmasked by shmem_huge */
35343668 if (sbinfo->huge)
35353669 seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge));
....@@ -3544,14 +3678,16 @@
35443678 {
35453679 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
35463680
3681
+ free_percpu(sbinfo->ino_batch);
35473682 percpu_counter_destroy(&sbinfo->used_blocks);
35483683 mpol_put(sbinfo->mpol);
35493684 kfree(sbinfo);
35503685 sb->s_fs_info = NULL;
35513686 }
35523687
3553
-int shmem_fill_super(struct super_block *sb, void *data, int silent)
3688
+static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
35543689 {
3690
+ struct shmem_options *ctx = fc->fs_private;
35553691 struct inode *inode;
35563692 struct shmem_sb_info *sbinfo;
35573693 int err = -ENOMEM;
....@@ -3562,9 +3698,6 @@
35623698 if (!sbinfo)
35633699 return -ENOMEM;
35643700
3565
- sbinfo->mode = 0777 | S_ISVTX;
3566
- sbinfo->uid = current_fsuid();
3567
- sbinfo->gid = current_fsgid();
35683701 sb->s_fs_info = sbinfo;
35693702
35703703 #ifdef CONFIG_TMPFS
....@@ -3574,12 +3707,12 @@
35743707 * but the internal instance is left unlimited.
35753708 */
35763709 if (!(sb->s_flags & SB_KERNMOUNT)) {
3577
- sbinfo->max_blocks = shmem_default_max_blocks();
3578
- sbinfo->max_inodes = shmem_default_max_inodes();
3579
- if (shmem_parse_options(data, sbinfo, false)) {
3580
- err = -EINVAL;
3581
- goto failed;
3582
- }
3710
+ if (!(ctx->seen & SHMEM_SEEN_BLOCKS))
3711
+ ctx->blocks = shmem_default_max_blocks();
3712
+ if (!(ctx->seen & SHMEM_SEEN_INODES))
3713
+ ctx->inodes = shmem_default_max_inodes();
3714
+ if (!(ctx->seen & SHMEM_SEEN_INUMS))
3715
+ ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64);
35833716 } else {
35843717 sb->s_flags |= SB_NOUSER;
35853718 }
....@@ -3588,11 +3721,24 @@
35883721 #else
35893722 sb->s_flags |= SB_NOUSER;
35903723 #endif
3724
+ sbinfo->max_blocks = ctx->blocks;
3725
+ sbinfo->free_inodes = sbinfo->max_inodes = ctx->inodes;
3726
+ if (sb->s_flags & SB_KERNMOUNT) {
3727
+ sbinfo->ino_batch = alloc_percpu(ino_t);
3728
+ if (!sbinfo->ino_batch)
3729
+ goto failed;
3730
+ }
3731
+ sbinfo->uid = ctx->uid;
3732
+ sbinfo->gid = ctx->gid;
3733
+ sbinfo->full_inums = ctx->full_inums;
3734
+ sbinfo->mode = ctx->mode;
3735
+ sbinfo->huge = ctx->huge;
3736
+ sbinfo->mpol = ctx->mpol;
3737
+ ctx->mpol = NULL;
35913738
3592
- spin_lock_init(&sbinfo->stat_lock);
3739
+ raw_spin_lock_init(&sbinfo->stat_lock);
35933740 if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL))
35943741 goto failed;
3595
- sbinfo->free_inodes = sbinfo->max_inodes;
35963742 spin_lock_init(&sbinfo->shrinklist_lock);
35973743 INIT_LIST_HEAD(&sbinfo->shrinklist);
35983744
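
The alloc_percpu(ino_t) above gives the internal SB_KERNMOUNT instance a per-CPU cache of inode numbers, so the shared counter only needs stat_lock once per batch (see shmem_reserve_inode()). As a rough, standalone illustration of that batching pattern only, here is a userspace analogue with a hypothetical get_ino() helper; it is not the kernel code.

#include <pthread.h>
#include <stdint.h>

#define INO_BATCH 1024				/* mirrors SHMEM_INO_BATCH */

static pthread_mutex_t stat_lock = PTHREAD_MUTEX_INITIALIZER;
static uint64_t shared_next_ino = 2;		/* shared pool, under stat_lock */

static __thread uint64_t cached_ino;		/* stands in for the per-CPU slot */
static __thread unsigned int cached_left;

/* hand out unique ids, taking the shared lock only once per batch */
static uint64_t get_ino(void)
{
	if (!cached_left) {
		pthread_mutex_lock(&stat_lock);
		cached_ino = shared_next_ino;
		shared_next_ino += INO_BATCH;
		pthread_mutex_unlock(&stat_lock);
		cached_left = INO_BATCH;
	}
	cached_left--;
	return cached_ino++;
}

The point of the pattern is simply that contention on stat_lock is paid once per 1024 inodes instead of once per inode.
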
....@@ -3625,6 +3771,31 @@
36253771 return err;
36263772 }
36273773
3774
+static int shmem_get_tree(struct fs_context *fc)
3775
+{
3776
+ return get_tree_nodev(fc, shmem_fill_super);
3777
+}
3778
+
3779
+static void shmem_free_fc(struct fs_context *fc)
3780
+{
3781
+ struct shmem_options *ctx = fc->fs_private;
3782
+
3783
+ if (ctx) {
3784
+ mpol_put(ctx->mpol);
3785
+ kfree(ctx);
3786
+ }
3787
+}
3788
+
3789
+static const struct fs_context_operations shmem_fs_context_ops = {
3790
+ .free = shmem_free_fc,
3791
+ .get_tree = shmem_get_tree,
3792
+#ifdef CONFIG_TMPFS
3793
+ .parse_monolithic = shmem_parse_options,
3794
+ .parse_param = shmem_parse_one,
3795
+ .reconfigure = shmem_reconfigure,
3796
+#endif
3797
+};
3798
+
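
With .reconfigure wired up here, a remount reaches shmem_reconfigure() whether it comes from mount -o remount or from the new mount API. A small sketch of the latter, with the same caveats as the fsopen example above (not part of this patch, raw syscalls, recent uapi headers, no error handling):

#include <fcntl.h>
#include <linux/mount.h>
#include <sys/syscall.h>
#include <unistd.h>

/* change the inode limit of an already-mounted tmpfs at "path" */
static int tmpfs_set_nr_inodes(const char *path, const char *nr_inodes)
{
	int fd = syscall(__NR_fspick, AT_FDCWD, path, FSPICK_CLOEXEC);
	int ret;

	syscall(__NR_fsconfig, fd, FSCONFIG_SET_STRING, "nr_inodes", nr_inodes, 0);

	/* ends up in shmem_reconfigure(); rejected with "Too few inodes for
	   current use" if the new limit is below what is already allocated */
	ret = syscall(__NR_fsconfig, fd, FSCONFIG_CMD_RECONFIGURE, NULL, NULL, 0);

	close(fd);
	return ret;
}
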
36283799 static struct kmem_cache *shmem_inode_cachep;
36293800
36303801 static struct inode *shmem_alloc_inode(struct super_block *sb)
....@@ -3636,9 +3807,8 @@
36363807 return &info->vfs_inode;
36373808 }
36383809
3639
-static void shmem_destroy_callback(struct rcu_head *head)
3810
+static void shmem_free_in_core_inode(struct inode *inode)
36403811 {
3641
- struct inode *inode = container_of(head, struct inode, i_rcu);
36423812 if (S_ISLNK(inode->i_mode))
36433813 kfree(inode->i_link);
36443814 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
....@@ -3648,7 +3818,6 @@
36483818 {
36493819 if (S_ISREG(inode->i_mode))
36503820 mpol_free_shared_policy(&SHMEM_I(inode)->policy);
3651
- call_rcu(&inode->i_rcu, shmem_destroy_callback);
36523821 }
36533822
36543823 static void shmem_init_inode(void *foo)
....@@ -3739,16 +3908,16 @@
37393908
37403909 static const struct super_operations shmem_ops = {
37413910 .alloc_inode = shmem_alloc_inode,
3911
+ .free_inode = shmem_free_in_core_inode,
37423912 .destroy_inode = shmem_destroy_inode,
37433913 #ifdef CONFIG_TMPFS
37443914 .statfs = shmem_statfs,
3745
- .remount_fs = shmem_remount_fs,
37463915 .show_options = shmem_show_options,
37473916 #endif
37483917 .evict_inode = shmem_evict_inode,
37493918 .drop_inode = generic_delete_inode,
37503919 .put_super = shmem_put_super,
3751
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
3920
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
37523921 .nr_cached_objects = shmem_unused_huge_count,
37533922 .free_cached_objects = shmem_unused_huge_scan,
37543923 #endif
....@@ -3761,29 +3930,42 @@
37613930 .set_policy = shmem_set_policy,
37623931 .get_policy = shmem_get_policy,
37633932 #endif
3933
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
3934
+ .allow_speculation = filemap_allow_speculation,
3935
+#endif
37643936 };
37653937
3766
-static struct dentry *shmem_mount(struct file_system_type *fs_type,
3767
- int flags, const char *dev_name, void *data)
3938
+int shmem_init_fs_context(struct fs_context *fc)
37683939 {
3769
- return mount_nodev(fs_type, flags, data, shmem_fill_super);
3940
+ struct shmem_options *ctx;
3941
+
3942
+ ctx = kzalloc(sizeof(struct shmem_options), GFP_KERNEL);
3943
+ if (!ctx)
3944
+ return -ENOMEM;
3945
+
3946
+ ctx->mode = 0777 | S_ISVTX;
3947
+ ctx->uid = current_fsuid();
3948
+ ctx->gid = current_fsgid();
3949
+
3950
+ fc->fs_private = ctx;
3951
+ fc->ops = &shmem_fs_context_ops;
3952
+ return 0;
37703953 }
37713954
37723955 static struct file_system_type shmem_fs_type = {
37733956 .owner = THIS_MODULE,
37743957 .name = "tmpfs",
3775
- .mount = shmem_mount,
3958
+ .init_fs_context = shmem_init_fs_context,
3959
+#ifdef CONFIG_TMPFS
3960
+ .parameters = shmem_fs_parameters,
3961
+#endif
37763962 .kill_sb = kill_litter_super,
3777
- .fs_flags = FS_USERNS_MOUNT,
3963
+ .fs_flags = FS_USERNS_MOUNT | FS_THP_SUPPORT,
37783964 };
37793965
37803966 int __init shmem_init(void)
37813967 {
37823968 int error;
3783
-
3784
- /* If rootfs called this, don't re-init */
3785
- if (shmem_inode_cachep)
3786
- return 0;
37873969
37883970 shmem_init_inodecache();
37893971
....@@ -3800,7 +3982,7 @@
38003982 goto out1;
38013983 }
38023984
3803
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
3985
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
38043986 if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY)
38053987 SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
38063988 else
....@@ -3816,11 +3998,11 @@
38163998 return error;
38173999 }
38184000
3819
-#if defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && defined(CONFIG_SYSFS)
4001
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS)
38204002 static ssize_t shmem_enabled_show(struct kobject *kobj,
38214003 struct kobj_attribute *attr, char *buf)
38224004 {
3823
- int values[] = {
4005
+ static const int values[] = {
38244006 SHMEM_HUGE_ALWAYS,
38254007 SHMEM_HUGE_WITHIN_SIZE,
38264008 SHMEM_HUGE_ADVISE,
....@@ -3868,9 +4050,9 @@
38684050
38694051 struct kobj_attribute shmem_enabled_attr =
38704052 __ATTR(shmem_enabled, 0644, shmem_enabled_show, shmem_enabled_store);
3871
-#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE && CONFIG_SYSFS */
4053
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */
38724054
3873
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
4055
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
38744056 bool shmem_huge_enabled(struct vm_area_struct *vma)
38754057 {
38764058 struct inode *inode = file_inode(vma->vm_file);
....@@ -3878,6 +4060,8 @@
38784060 loff_t i_size;
38794061 pgoff_t off;
38804062
4063
+ if (!transhuge_vma_enabled(vma, vma->vm_flags))
4064
+ return false;
38814065 if (shmem_huge == SHMEM_HUGE_FORCE)
38824066 return true;
38834067 if (shmem_huge == SHMEM_HUGE_DENY)
....@@ -3893,7 +4077,7 @@
38934077 if (i_size >= HPAGE_PMD_SIZE &&
38944078 i_size >> PAGE_SHIFT >= off)
38954079 return true;
3896
- /* fall through */
4080
+ fallthrough;
38974081 case SHMEM_HUGE_ADVISE:
38984082 /* TODO: implement fadvise() hints */
38994083 return (vma->vm_flags & VM_HUGEPAGE);
....@@ -3902,7 +4086,7 @@
39024086 return false;
39034087 }
39044088 }
3905
-#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */
4089
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
39064090
39074091 #else /* !CONFIG_SHMEM */
39084092
....@@ -3917,7 +4101,8 @@
39174101
39184102 static struct file_system_type shmem_fs_type = {
39194103 .name = "tmpfs",
3920
- .mount = ramfs_mount,
4104
+ .init_fs_context = ramfs_init_fs_context,
4105
+ .parameters = ramfs_fs_parameters,
39214106 .kill_sb = kill_litter_super,
39224107 .fs_flags = FS_USERNS_MOUNT,
39234108 };
....@@ -3932,7 +4117,8 @@
39324117 return 0;
39334118 }
39344119
3935
-int shmem_unuse(swp_entry_t swap, struct page *page)
4120
+int shmem_unuse(unsigned int type, bool frontswap,
4121
+ unsigned long *fs_pages_to_unuse)
39364122 {
39374123 return 0;
39384124 }
....@@ -4047,7 +4233,7 @@
40474233
40484234 /**
40494235 * shmem_zero_setup - setup a shared anonymous mapping
4050
- * @vma: the vma to be mmapped is prepared by do_mmap_pgoff
4236
+ * @vma: the vma to be mmapped is prepared by do_mmap
40514237 */
40524238 int shmem_zero_setup(struct vm_area_struct *vma)
40534239 {
....@@ -4055,7 +4241,7 @@
40554241 loff_t size = vma->vm_end - vma->vm_start;
40564242
40574243 /*
4058
- * Cloning a new file under mmap_sem leads to a lock ordering conflict
4244
+ * Cloning a new file under mmap_lock leads to a lock ordering conflict
40594245 * between XFS directory reading and selinux: since this file is only
40604246 * accessible to the user through its mapping, use S_PRIVATE flag to
40614247 * bypass file security, in the same way as shmem_kernel_file_setup().
....@@ -4069,7 +4255,7 @@
40694255 vma->vm_file = file;
40704256 vma->vm_ops = &shmem_vm_ops;
40714257
4072
- if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
4258
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
40734259 ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
40744260 (vma->vm_end & HPAGE_PMD_MASK)) {
40754261 khugepaged_enter(vma, vma->vm_flags);
....@@ -4117,3 +4303,47 @@
41174303 #endif
41184304 }
41194305 EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
4306
+
4307
+void shmem_mark_page_lazyfree(struct page *page, bool tail)
4308
+{
4309
+ mark_page_lazyfree_movetail(page, tail);
4310
+}
4311
+EXPORT_SYMBOL_GPL(shmem_mark_page_lazyfree);
4312
+
4313
+int reclaim_shmem_address_space(struct address_space *mapping)
4314
+{
4315
+#ifdef CONFIG_SHMEM
4316
+ pgoff_t start = 0;
4317
+ struct page *page;
4318
+ LIST_HEAD(page_list);
4319
+ XA_STATE(xas, &mapping->i_pages, start);
4320
+
4321
+ if (!shmem_mapping(mapping))
4322
+ return -EINVAL;
4323
+
4324
+ lru_add_drain();
4325
+
4326
+ rcu_read_lock();
4327
+ xas_for_each(&xas, page, ULONG_MAX) {
4328
+ if (xas_retry(&xas, page))
4329
+ continue;
4330
+ if (xa_is_value(page))
4331
+ continue;
4332
+ if (isolate_lru_page(page))
4333
+ continue;
4334
+
4335
+ list_add(&page->lru, &page_list);
4336
+
4337
+ if (need_resched()) {
4338
+ xas_pause(&xas);
4339
+ cond_resched_rcu();
4340
+ }
4341
+ }
4342
+ rcu_read_unlock();
4343
+
4344
+ return reclaim_pages(&page_list);
4345
+#else
4346
+ return 0;
4347
+#endif
4348
+}
4349
+EXPORT_SYMBOL_GPL(reclaim_shmem_address_space);
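
reclaim_shmem_address_space() is exported for modules, so a driver can ask for the pages behind one shmem file to be reclaimed proactively. A hypothetical caller might look like the sketch below; it is not part of this patch and assumes the declaration is visible through <linux/shmem_fs.h>, as in the Android common kernel.

#include <linux/fs.h>
#include <linux/module.h>
#include <linux/shmem_fs.h>

/* hypothetical helper: reclaim the pages backing one shmem/memfd file */
static int example_reclaim_shmem_file(struct file *file)
{
	struct address_space *mapping = file_inode(file)->i_mapping;
	int reclaimed;

	/* returns -EINVAL for non-shmem mappings, else pages reclaimed */
	reclaimed = reclaim_shmem_address_space(mapping);
	if (reclaimed < 0)
		return reclaimed;

	pr_debug("reclaimed %d shmem pages\n", reclaimed);
	return 0;
}
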