2024-12-19 9370bb92b2d16684ee45cf24e879c93c509162da
kernel/mm/swapfile.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * linux/mm/swapfile.c
  *
@@ -39,10 +40,10 @@
 #include <linux/swap_slots.h>
 #include <linux/sort.h>
 
-#include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <linux/swapops.h>
 #include <linux/swap_cgroup.h>
+#include <trace/hooks/mm.h>
 
 static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
                                  unsigned char);
@@ -98,7 +99,7 @@
 
 atomic_t nr_rotate_swap = ATOMIC_INIT(0);
 
-static struct swap_info_struct *swap_type_to_swap_info(int type)
+struct swap_info_struct *swap_type_to_swap_info(int type)
 {
         if (type >= READ_ONCE(nr_swapfiles))
                 return NULL;
@@ -106,36 +107,62 @@
         smp_rmb();      /* Pairs with smp_wmb in alloc_swap_info. */
         return READ_ONCE(swap_info[type]);
 }
+EXPORT_SYMBOL_GPL(swap_type_to_swap_info);
 
 static inline unsigned char swap_count(unsigned char ent)
 {
         return ent & ~SWAP_HAS_CACHE;   /* may include COUNT_CONTINUED flag */
 }
 
+/* Reclaim the swap entry anyway if possible */
+#define TTRS_ANYWAY             0x1
+/*
+ * Reclaim the swap entry if there are no more mappings of the
+ * corresponding page
+ */
+#define TTRS_UNMAPPED           0x2
+/* Reclaim the swap entry if swap is getting full*/
+#define TTRS_FULL               0x4
+
 /* returns 1 if swap entry is freed */
-static int
-__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
+static int __try_to_reclaim_swap(struct swap_info_struct *si,
+                                 unsigned long offset, unsigned long flags)
 {
         swp_entry_t entry = swp_entry(si->type, offset);
         struct page *page;
         int ret = 0;
 
-        page = find_get_page(swap_address_space(entry), swp_offset(entry));
+        page = find_get_page(swap_address_space(entry), offset);
         if (!page)
                 return 0;
         /*
-         * This function is called from scan_swap_map() and it's called
-         * by vmscan.c at reclaiming pages. So, we hold a lock on a page, here.
-         * We have to use trylock for avoiding deadlock. This is a special
+         * When this function is called from scan_swap_map_slots() and it's
+         * called by vmscan.c at reclaiming pages. So, we hold a lock on a page,
+         * here. We have to use trylock for avoiding deadlock. This is a special
          * case and you should use try_to_free_swap() with explicit lock_page()
          * in usual operations.
          */
         if (trylock_page(page)) {
-                ret = try_to_free_swap(page);
+                if ((flags & TTRS_ANYWAY) ||
+                    ((flags & TTRS_UNMAPPED) && !page_mapped(page)) ||
+                    ((flags & TTRS_FULL) && mem_cgroup_swap_full(page)))
+                        ret = try_to_free_swap(page);
                 unlock_page(page);
         }
         put_page(page);
         return ret;
+}
+
+static inline struct swap_extent *first_se(struct swap_info_struct *sis)
+{
+        struct rb_node *rb = rb_first(&sis->swap_extent_root);
+        return rb_entry(rb, struct swap_extent, rb_node);
+}
+
+static inline struct swap_extent *next_se(struct swap_extent *se)
+{
+        struct rb_node *rb = rb_next(&se->rb_node);
+        return rb ? rb_entry(rb, struct swap_extent, rb_node) : NULL;
 }
 
 /*
@@ -150,7 +177,7 @@
         int err = 0;
 
         /* Do not discard the swap header page! */
-        se = &si->first_swap_extent;
+        se = first_se(si);
         start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
         nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
         if (nr_blocks) {
@@ -161,7 +188,7 @@
                 cond_resched();
         }
 
-        list_for_each_entry(se, &si->first_swap_extent.list, list) {
+        for (se = next_se(se); se; se = next_se(se)) {
                 start_block = se->start_block << (PAGE_SHIFT - 9);
                 nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
 
@@ -175,6 +202,39 @@
         return err;             /* That will often be -EOPNOTSUPP */
 }
 
+static struct swap_extent *
+offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset)
+{
+        struct swap_extent *se;
+        struct rb_node *rb;
+
+        rb = sis->swap_extent_root.rb_node;
+        while (rb) {
+                se = rb_entry(rb, struct swap_extent, rb_node);
+                if (offset < se->start_page)
+                        rb = rb->rb_left;
+                else if (offset >= se->start_page + se->nr_pages)
+                        rb = rb->rb_right;
+                else
+                        return se;
+        }
+        /* It *must* be present */
+        BUG();
+}
+
+sector_t swap_page_sector(struct page *page)
+{
+        struct swap_info_struct *sis = page_swap_info(page);
+        struct swap_extent *se;
+        sector_t sector;
+        pgoff_t offset;
+
+        offset = __page_file_index(page);
+        se = offset_to_swap_extent(sis, offset);
+        sector = se->start_block + (offset - se->start_page);
+        return sector << (PAGE_SHIFT - 9);
+}
+
 /*
  * swap allocation tell device that a cluster of swap can now be discarded,
  * to allow the swap device to optimize its wear-levelling.
@@ -182,32 +242,25 @@
 static void discard_swap_cluster(struct swap_info_struct *si,
                                  pgoff_t start_page, pgoff_t nr_pages)
 {
-        struct swap_extent *se = si->curr_swap_extent;
-        int found_extent = 0;
+        struct swap_extent *se = offset_to_swap_extent(si, start_page);
 
         while (nr_pages) {
-                if (se->start_page <= start_page &&
-                    start_page < se->start_page + se->nr_pages) {
-                        pgoff_t offset = start_page - se->start_page;
-                        sector_t start_block = se->start_block + offset;
-                        sector_t nr_blocks = se->nr_pages - offset;
+                pgoff_t offset = start_page - se->start_page;
+                sector_t start_block = se->start_block + offset;
+                sector_t nr_blocks = se->nr_pages - offset;
 
-                        if (nr_blocks > nr_pages)
-                                nr_blocks = nr_pages;
-                        start_page += nr_blocks;
-                        nr_pages -= nr_blocks;
+                if (nr_blocks > nr_pages)
+                        nr_blocks = nr_pages;
+                start_page += nr_blocks;
+                nr_pages -= nr_blocks;
 
-                        if (!found_extent++)
-                                si->curr_swap_extent = se;
+                start_block <<= PAGE_SHIFT - 9;
+                nr_blocks <<= PAGE_SHIFT - 9;
+                if (blkdev_issue_discard(si->bdev, start_block,
+                                nr_blocks, GFP_NOIO, 0))
+                        break;
 
-                        start_block <<= PAGE_SHIFT - 9;
-                        nr_blocks <<= PAGE_SHIFT - 9;
-                        if (blkdev_issue_discard(si->bdev, start_block,
-                                nr_blocks, GFP_NOIO, 0))
-                                break;
-                }
-
-                se = list_next_entry(se, list);
+                se = next_se(se);
         }
 }
 
@@ -562,7 +615,6 @@
 {
         struct percpu_cluster *cluster;
         struct swap_cluster_info *ci;
-        bool found_free;
         unsigned long tmp, max;
 
 new_cluster:
@@ -575,16 +627,16 @@
         } else if (!cluster_list_empty(&si->discard_clusters)) {
                 /*
                  * we don't have free cluster but have some clusters in
-                 * discarding, do discard now and reclaim them
+                 * discarding, do discard now and reclaim them, then
+                 * reread cluster_next_cpu since we dropped si->lock
                  */
                 swap_do_scheduled_discard(si);
-                *scan_base = *offset = si->cluster_next;
+                *scan_base = this_cpu_read(*si->cluster_next_cpu);
+                *offset = *scan_base;
                 goto new_cluster;
         } else
                 return false;
         }
-
-        found_free = false;
 
         /*
          * Other CPUs can use our cluster if they can't find a free cluster,
@@ -593,39 +645,42 @@
         tmp = cluster->next;
         max = min_t(unsigned long, si->max,
                     (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER);
-        if (tmp >= max) {
-                cluster_set_null(&cluster->index);
-                goto new_cluster;
-        }
-        ci = lock_cluster(si, tmp);
-        while (tmp < max) {
-                if (!si->swap_map[tmp]) {
-                        found_free = true;
-                        break;
+        if (tmp < max) {
+                ci = lock_cluster(si, tmp);
+                while (tmp < max) {
+                        if (!si->swap_map[tmp])
+                                break;
+                        tmp++;
                 }
-                tmp++;
+                unlock_cluster(ci);
         }
-        unlock_cluster(ci);
-        if (!found_free) {
+        if (tmp >= max) {
                 cluster_set_null(&cluster->index);
                 goto new_cluster;
         }
         cluster->next = tmp + 1;
         *offset = tmp;
         *scan_base = tmp;
-        return found_free;
+        return true;
 }
 
 static void __del_from_avail_list(struct swap_info_struct *p)
 {
         int nid;
 
+        assert_spin_locked(&p->lock);
         for_each_node(nid)
                 plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]);
 }
 
 static void del_from_avail_list(struct swap_info_struct *p)
 {
+        bool skip = false;
+
+        trace_android_vh_del_from_avail_list(p, &skip);
+        if (skip)
+                return;
+
         spin_lock(&swap_avail_lock);
         __del_from_avail_list(p);
         spin_unlock(&swap_avail_lock);
@@ -639,7 +694,7 @@
         if (offset == si->lowest_bit)
                 si->lowest_bit += nr_entries;
         if (end == si->highest_bit)
-                si->highest_bit -= nr_entries;
+                WRITE_ONCE(si->highest_bit, si->highest_bit - nr_entries);
         si->inuse_pages += nr_entries;
         if (si->inuse_pages == si->pages) {
                 si->lowest_bit = si->max;
@@ -651,6 +706,11 @@
 static void add_to_avail_list(struct swap_info_struct *p)
 {
         int nid;
+        bool skip = false;
+
+        trace_android_vh_add_to_avail_list(p, &skip);
+        if (skip)
+                return;
 
         spin_lock(&swap_avail_lock);
         for_each_node(nid) {
@@ -663,19 +723,23 @@
 static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
                             unsigned int nr_entries)
 {
+        unsigned long begin = offset;
         unsigned long end = offset + nr_entries - 1;
         void (*swap_slot_free_notify)(struct block_device *, unsigned long);
+        bool skip = false;
 
         if (offset < si->lowest_bit)
                 si->lowest_bit = offset;
         if (end > si->highest_bit) {
                 bool was_full = !si->highest_bit;
 
-                si->highest_bit = end;
+                WRITE_ONCE(si->highest_bit, end);
                 if (was_full && (si->flags & SWP_WRITEOK))
                         add_to_avail_list(si);
         }
-        atomic_long_add(nr_entries, &nr_swap_pages);
+        trace_android_vh_account_swap_pages(si, &skip);
+        if (!skip)
+                atomic_long_add(nr_entries, &nr_swap_pages);
         si->inuse_pages -= nr_entries;
         if (si->flags & SWP_BLKDEV)
                 swap_slot_free_notify =
@@ -683,14 +747,44 @@
         else
                 swap_slot_free_notify = NULL;
         while (offset <= end) {
+                arch_swap_invalidate_page(si->type, offset);
                 frontswap_invalidate_page(si->type, offset);
                 if (swap_slot_free_notify)
                         swap_slot_free_notify(si->bdev, offset);
                 offset++;
         }
+        clear_shadow_from_swap_cache(si->type, begin, end);
 }
 
-static int scan_swap_map_slots(struct swap_info_struct *si,
+static void set_cluster_next(struct swap_info_struct *si, unsigned long next)
+{
+        unsigned long prev;
+
+        if (!(si->flags & SWP_SOLIDSTATE)) {
+                si->cluster_next = next;
+                return;
+        }
+
+        prev = this_cpu_read(*si->cluster_next_cpu);
+        /*
+         * Cross the swap address space size aligned trunk, choose
+         * another trunk randomly to avoid lock contention on swap
+         * address space if possible.
+         */
+        if ((prev >> SWAP_ADDRESS_SPACE_SHIFT) !=
+            (next >> SWAP_ADDRESS_SPACE_SHIFT)) {
+                /* No free swap slots available */
+                if (si->highest_bit <= si->lowest_bit)
+                        return;
+                next = si->lowest_bit +
+                        prandom_u32_max(si->highest_bit - si->lowest_bit + 1);
+                next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES);
+                next = max_t(unsigned int, next, si->lowest_bit);
+        }
+        this_cpu_write(*si->cluster_next_cpu, next);
+}
+
+int scan_swap_map_slots(struct swap_info_struct *si,
                                unsigned char usage, int nr,
                                swp_entry_t slots[])
 {
@@ -700,9 +794,7 @@
         unsigned long last_in_cluster = 0;
         int latency_ration = LATENCY_LIMIT;
         int n_ret = 0;
-
-        if (nr > SWAP_BATCH)
-                nr = SWAP_BATCH;
+        bool scanned_many = false;
 
         /*
          * We try to cluster swap pages by allocating them sequentially
@@ -716,17 +808,22 @@
          */
 
         si->flags += SWP_SCANNING;
-        scan_base = offset = si->cluster_next;
+        /*
+         * Use percpu scan base for SSD to reduce lock contention on
+         * cluster and swap cache. For HDD, sequential access is more
+         * important.
+         */
+        if (si->flags & SWP_SOLIDSTATE)
+                scan_base = this_cpu_read(*si->cluster_next_cpu);
+        else
+                scan_base = si->cluster_next;
+        offset = scan_base;
 
         /* SSD algorithm */
         if (si->cluster_info) {
-                if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
-                        goto checks;
-                else
+                if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
                         goto scan;
-        }
-
-        if (unlikely(!si->cluster_nr--)) {
+        } else if (unlikely(!si->cluster_nr--)) {
                 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
                         si->cluster_nr = SWAPFILE_CLUSTER - 1;
                         goto checks;
@@ -789,7 +886,7 @@
                 int swap_was_freed;
                 unlock_cluster(ci);
                 spin_unlock(&si->lock);
-                swap_was_freed = __try_to_reclaim_swap(si, offset);
+                swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
                 spin_lock(&si->lock);
                 /* entry was freed successfully, try to use this again */
                 if (swap_was_freed)
@@ -804,12 +901,11 @@
                 else
                         goto done;
         }
-        si->swap_map[offset] = usage;
+        WRITE_ONCE(si->swap_map[offset], usage);
         inc_cluster_info_page(si, si->cluster_info, offset);
         unlock_cluster(ci);
 
         swap_range_alloc(si, offset, 1);
-        si->cluster_next = offset + 1;
         slots[n_ret++] = swp_entry(si->type, offset);
 
         /* got enough slots or reach max slots? */
@@ -832,51 +928,69 @@
         if (si->cluster_info) {
                 if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
                         goto checks;
-                else
-                        goto done;
-        }
-        /* non-ssd case */
-        ++offset;
-
-        /* non-ssd case, still more slots in cluster? */
-        if (si->cluster_nr && !si->swap_map[offset]) {
+        } else if (si->cluster_nr && !si->swap_map[++offset]) {
+                /* non-ssd case, still more slots in cluster? */
                 --si->cluster_nr;
                 goto checks;
         }
 
+        /*
+         * Even if there's no free clusters available (fragmented),
+         * try to scan a little more quickly with lock held unless we
+         * have scanned too many slots already.
+         */
+        if (!scanned_many) {
+                unsigned long scan_limit;
+
+                if (offset < scan_base)
+                        scan_limit = scan_base;
+                else
+                        scan_limit = si->highest_bit;
+                for (; offset <= scan_limit && --latency_ration > 0;
+                     offset++) {
+                        if (!si->swap_map[offset])
+                                goto checks;
+                }
+        }
+
 done:
+        set_cluster_next(si, offset + 1);
         si->flags -= SWP_SCANNING;
         return n_ret;
 
 scan:
         spin_unlock(&si->lock);
-        while (++offset <= si->highest_bit) {
-                if (!si->swap_map[offset]) {
+        while (++offset <= READ_ONCE(si->highest_bit)) {
+                if (data_race(!si->swap_map[offset])) {
                         spin_lock(&si->lock);
                         goto checks;
                 }
-                if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
+                if (vm_swap_full() &&
+                    READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
                         spin_lock(&si->lock);
                         goto checks;
                 }
                 if (unlikely(--latency_ration < 0)) {
                         cond_resched();
                         latency_ration = LATENCY_LIMIT;
+                        scanned_many = true;
                 }
         }
         offset = si->lowest_bit;
         while (offset < scan_base) {
-                if (!si->swap_map[offset]) {
+                if (data_race(!si->swap_map[offset])) {
                         spin_lock(&si->lock);
                         goto checks;
                 }
-                if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
+                if (vm_swap_full() &&
+                    READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
                         spin_lock(&si->lock);
                         goto checks;
                 }
                 if (unlikely(--latency_ration < 0)) {
                         cond_resched();
                         latency_ration = LATENCY_LIMIT;
+                        scanned_many = true;
                 }
                 offset++;
         }
@@ -886,8 +1000,9 @@
         si->flags -= SWP_SCANNING;
         return n_ret;
 }
+EXPORT_SYMBOL_GPL(scan_swap_map_slots);
 
-static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
+int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
 {
         unsigned long idx;
         struct swap_cluster_info *ci;
@@ -921,6 +1036,7 @@
 
         return 1;
 }
+EXPORT_SYMBOL_GPL(swap_alloc_cluster);
 
 static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
 {
@@ -928,6 +1044,7 @@
         struct swap_cluster_info *ci;
 
         ci = lock_cluster(si, offset);
+        memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
         cluster_set_count_flag(ci, 0, 0);
         free_cluster(si, idx);
         unlock_cluster(ci);
....@@ -960,19 +1077,17 @@
9601077 /* Only single cluster request supported */
9611078 WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER);
9621079
1080
+ spin_lock(&swap_avail_lock);
1081
+
9631082 avail_pgs = atomic_long_read(&nr_swap_pages) / size;
964
- if (avail_pgs <= 0)
1083
+ if (avail_pgs <= 0) {
1084
+ spin_unlock(&swap_avail_lock);
9651085 goto noswap;
1086
+ }
9661087
967
- if (n_goal > SWAP_BATCH)
968
- n_goal = SWAP_BATCH;
969
-
970
- if (n_goal > avail_pgs)
971
- n_goal = avail_pgs;
1088
+ n_goal = min3((long)n_goal, (long)SWAP_BATCH, avail_pgs);
9721089
9731090 atomic_long_sub(n_goal * size, &nr_swap_pages);
974
-
975
- spin_lock(&swap_avail_lock);
9761091
9771092 start_over:
9781093 node = numa_node_id();
....@@ -1008,6 +1123,7 @@
10081123 goto check_out;
10091124 pr_debug("scan_swap_map of si %d failed to find offset\n",
10101125 si->type);
1126
+ cond_resched();
10111127
10121128 spin_lock(&swap_avail_lock);
10131129 nextsi:
....@@ -1041,20 +1157,22 @@
10411157 {
10421158 struct swap_info_struct *si = swap_type_to_swap_info(type);
10431159 pgoff_t offset;
1160
+ bool skip = false;
10441161
10451162 if (!si)
10461163 goto fail;
10471164
10481165 spin_lock(&si->lock);
10491166 if (si->flags & SWP_WRITEOK) {
1050
- atomic_long_dec(&nr_swap_pages);
10511167 /* This is called for allocating swap entry, not cache */
10521168 offset = scan_swap_map(si, 1);
10531169 if (offset) {
1170
+ trace_android_vh_account_swap_pages(si, &skip);
1171
+ if (!skip)
1172
+ atomic_long_dec(&nr_swap_pages);
10541173 spin_unlock(&si->lock);
10551174 return swp_entry(type, offset);
10561175 }
1057
- atomic_long_inc(&nr_swap_pages);
10581176 }
10591177 spin_unlock(&si->lock);
10601178 fail:
....@@ -1064,15 +1182,14 @@
10641182 static struct swap_info_struct *__swap_info_get(swp_entry_t entry)
10651183 {
10661184 struct swap_info_struct *p;
1067
- unsigned long offset, type;
1185
+ unsigned long offset;
10681186
10691187 if (!entry.val)
10701188 goto out;
1071
- type = swp_type(entry);
1072
- p = swap_type_to_swap_info(type);
1189
+ p = swp_swap_info(entry);
10731190 if (!p)
10741191 goto bad_nofile;
1075
- if (!(p->flags & SWP_USED))
1192
+ if (data_race(!(p->flags & SWP_USED)))
10761193 goto bad_device;
10771194 offset = swp_offset(entry);
10781195 if (offset >= p->max)
....@@ -1098,13 +1215,12 @@
10981215 p = __swap_info_get(entry);
10991216 if (!p)
11001217 goto out;
1101
- if (!p->swap_map[swp_offset(entry)])
1218
+ if (data_race(!p->swap_map[swp_offset(entry)]))
11021219 goto bad_free;
11031220 return p;
11041221
11051222 bad_free:
11061223 pr_err("swap_info_get: %s%08lx\n", Unused_offset, entry.val);
1107
- goto out;
11081224 out:
11091225 return NULL;
11101226 }
....@@ -1167,20 +1283,89 @@
11671283 }
11681284
11691285 usage = count | has_cache;
1170
- p->swap_map[offset] = usage ? : SWAP_HAS_CACHE;
1286
+ if (usage)
1287
+ WRITE_ONCE(p->swap_map[offset], usage);
1288
+ else
1289
+ WRITE_ONCE(p->swap_map[offset], SWAP_HAS_CACHE);
11711290
11721291 return usage;
11731292 }
11741293
1294
+/*
1295
+ * Check whether swap entry is valid in the swap device. If so,
1296
+ * return pointer to swap_info_struct, and keep the swap entry valid
1297
+ * via preventing the swap device from being swapoff, until
1298
+ * put_swap_device() is called. Otherwise return NULL.
1299
+ *
1300
+ * The entirety of the RCU read critical section must come before the
1301
+ * return from or after the call to synchronize_rcu() in
1302
+ * enable_swap_info() or swapoff(). So if "si->flags & SWP_VALID" is
1303
+ * true, the si->map, si->cluster_info, etc. must be valid in the
1304
+ * critical section.
1305
+ *
1306
+ * Notice that swapoff or swapoff+swapon can still happen before the
1307
+ * rcu_read_lock() in get_swap_device() or after the rcu_read_unlock()
1308
+ * in put_swap_device() if there isn't any other way to prevent
1309
+ * swapoff, such as page lock, page table lock, etc. The caller must
1310
+ * be prepared for that. For example, the following situation is
1311
+ * possible.
1312
+ *
1313
+ * CPU1 CPU2
1314
+ * do_swap_page()
1315
+ * ... swapoff+swapon
1316
+ * __read_swap_cache_async()
1317
+ * swapcache_prepare()
1318
+ * __swap_duplicate()
1319
+ * // check swap_map
1320
+ * // verify PTE not changed
1321
+ *
1322
+ * In __swap_duplicate(), the swap_map need to be checked before
1323
+ * changing partly because the specified swap entry may be for another
1324
+ * swap device which has been swapoff. And in do_swap_page(), after
1325
+ * the page is read from the swap device, the PTE is verified not
1326
+ * changed with the page table locked to check whether the swap device
1327
+ * has been swapoff or swapoff+swapon.
1328
+ */
1329
+struct swap_info_struct *get_swap_device(swp_entry_t entry)
1330
+{
1331
+ struct swap_info_struct *si;
1332
+ unsigned long offset;
1333
+
1334
+ if (!entry.val)
1335
+ goto out;
1336
+ si = swp_swap_info(entry);
1337
+ if (!si)
1338
+ goto bad_nofile;
1339
+
1340
+ rcu_read_lock();
1341
+ if (data_race(!(si->flags & SWP_VALID)))
1342
+ goto unlock_out;
1343
+ offset = swp_offset(entry);
1344
+ if (offset >= si->max)
1345
+ goto unlock_out;
1346
+
1347
+ return si;
1348
+bad_nofile:
1349
+ pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
1350
+out:
1351
+ return NULL;
1352
+unlock_out:
1353
+ rcu_read_unlock();
1354
+ return NULL;
1355
+}
1356
+
11751357 static unsigned char __swap_entry_free(struct swap_info_struct *p,
1176
- swp_entry_t entry, unsigned char usage)
1358
+ swp_entry_t entry)
11771359 {
11781360 struct swap_cluster_info *ci;
11791361 unsigned long offset = swp_offset(entry);
1362
+ unsigned char usage;
11801363
11811364 ci = lock_cluster_or_swap_info(p, offset);
1182
- usage = __swap_entry_free_locked(p, offset, usage);
1365
+ usage = __swap_entry_free_locked(p, offset, 1);
11831366 unlock_cluster_or_swap_info(p, ci);
1367
+ if (!usage)
1368
+ free_swap_slot(entry);
11841369
11851370 return usage;
11861371 }
....@@ -1211,10 +1396,8 @@
12111396 struct swap_info_struct *p;
12121397
12131398 p = _swap_info_get(entry);
1214
- if (p) {
1215
- if (!__swap_entry_free(p, entry, 1))
1216
- free_swap_slot(entry);
1217
- }
1399
+ if (p)
1400
+ __swap_entry_free(p, entry);
12181401 }
12191402
12201403 /*
....@@ -1229,7 +1412,7 @@
12291412 unsigned char *map;
12301413 unsigned int i, free_entries = 0;
12311414 unsigned char val;
1232
- int size = swap_entry_size(hpage_nr_pages(page));
1415
+ int size = swap_entry_size(thp_nr_pages(page));
12331416
12341417 si = _swap_info_get(entry);
12351418 if (!si)
....@@ -1249,9 +1432,6 @@
12491432 if (free_entries == SWAPFILE_CLUSTER) {
12501433 unlock_cluster_or_swap_info(si, ci);
12511434 spin_lock(&si->lock);
1252
- ci = lock_cluster(si, offset);
1253
- memset(map, 0, SWAPFILE_CLUSTER);
1254
- unlock_cluster(ci);
12551435 mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
12561436 swap_free_cluster(si, idx);
12571437 spin_unlock(&si->lock);
....@@ -1321,6 +1501,7 @@
13211501 if (p)
13221502 spin_unlock(&p->lock);
13231503 }
1504
+EXPORT_SYMBOL_GPL(swapcache_free_entries);
13241505
13251506 /*
13261507 * How many references to page are currently swapped out?
....@@ -1346,11 +1527,18 @@
13461527 return count;
13471528 }
13481529
1349
-int __swap_count(struct swap_info_struct *si, swp_entry_t entry)
1530
+int __swap_count(swp_entry_t entry)
13501531 {
1532
+ struct swap_info_struct *si;
13511533 pgoff_t offset = swp_offset(entry);
1534
+ int count = 0;
13521535
1353
- return swap_count(si->swap_map[offset]);
1536
+ si = get_swap_device(entry);
1537
+ if (si) {
1538
+ count = swap_count(si->swap_map[offset]);
1539
+ put_swap_device(si);
1540
+ }
1541
+ return count;
13541542 }
13551543
13561544 static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
....@@ -1375,9 +1563,11 @@
13751563 int count = 0;
13761564 struct swap_info_struct *si;
13771565
1378
- si = __swap_info_get(entry);
1379
- if (si)
1566
+ si = get_swap_device(entry);
1567
+ if (si) {
13801568 count = swap_swapcount(si, entry);
1569
+ put_swap_device(si);
1570
+ }
13811571 return count;
13821572 }
13831573
....@@ -1624,7 +1814,6 @@
16241814 int free_swap_and_cache(swp_entry_t entry)
16251815 {
16261816 struct swap_info_struct *p;
1627
- struct page *page = NULL;
16281817 unsigned char count;
16291818
16301819 if (non_swap_entry(entry))
....@@ -1632,32 +1821,11 @@
16321821
16331822 p = _swap_info_get(entry);
16341823 if (p) {
1635
- count = __swap_entry_free(p, entry, 1);
1824
+ count = __swap_entry_free(p, entry);
16361825 if (count == SWAP_HAS_CACHE &&
1637
- !swap_page_trans_huge_swapped(p, entry)) {
1638
- page = find_get_page(swap_address_space(entry),
1639
- swp_offset(entry));
1640
- if (page && !trylock_page(page)) {
1641
- put_page(page);
1642
- page = NULL;
1643
- }
1644
- } else if (!count)
1645
- free_swap_slot(entry);
1646
- }
1647
- if (page) {
1648
- /*
1649
- * Not mapped elsewhere, or swap space full? Free it!
1650
- * Also recheck PageSwapCache now page is locked (above).
1651
- */
1652
- if (PageSwapCache(page) && !PageWriteback(page) &&
1653
- (!page_mapped(page) || mem_cgroup_swap_full(page)) &&
1654
- !swap_page_trans_huge_swapped(p, entry)) {
1655
- page = compound_head(page);
1656
- delete_from_swap_cache(page);
1657
- SetPageDirty(page);
1658
- }
1659
- unlock_page(page);
1660
- put_page(page);
1826
+ !swap_page_trans_huge_swapped(p, entry))
1827
+ __try_to_reclaim_swap(p, swp_offset(entry),
1828
+ TTRS_UNMAPPED | TTRS_FULL);
16611829 }
16621830 return p != NULL;
16631831 }
....@@ -1671,13 +1839,12 @@
16711839 *
16721840 * This is needed for the suspend to disk (aka swsusp).
16731841 */
1674
-int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
1842
+int swap_type_of(dev_t device, sector_t offset)
16751843 {
1676
- struct block_device *bdev = NULL;
16771844 int type;
16781845
1679
- if (device)
1680
- bdev = bdget(device);
1846
+ if (!device)
1847
+ return -1;
16811848
16821849 spin_lock(&swap_lock);
16831850 for (type = 0; type < nr_swapfiles; type++) {
....@@ -1686,30 +1853,34 @@
16861853 if (!(sis->flags & SWP_WRITEOK))
16871854 continue;
16881855
1689
- if (!bdev) {
1690
- if (bdev_p)
1691
- *bdev_p = bdgrab(sis->bdev);
1692
-
1693
- spin_unlock(&swap_lock);
1694
- return type;
1695
- }
1696
- if (bdev == sis->bdev) {
1697
- struct swap_extent *se = &sis->first_swap_extent;
1856
+ if (device == sis->bdev->bd_dev) {
1857
+ struct swap_extent *se = first_se(sis);
16981858
16991859 if (se->start_block == offset) {
1700
- if (bdev_p)
1701
- *bdev_p = bdgrab(sis->bdev);
1702
-
17031860 spin_unlock(&swap_lock);
1704
- bdput(bdev);
17051861 return type;
17061862 }
17071863 }
17081864 }
17091865 spin_unlock(&swap_lock);
1710
- if (bdev)
1711
- bdput(bdev);
1866
+ return -ENODEV;
1867
+}
17121868
1869
+int find_first_swap(dev_t *device)
1870
+{
1871
+ int type;
1872
+
1873
+ spin_lock(&swap_lock);
1874
+ for (type = 0; type < nr_swapfiles; type++) {
1875
+ struct swap_info_struct *sis = swap_info[type];
1876
+
1877
+ if (!(sis->flags & SWP_WRITEOK))
1878
+ continue;
1879
+ *device = sis->bdev->bd_dev;
1880
+ spin_unlock(&swap_lock);
1881
+ return type;
1882
+ }
1883
+ spin_unlock(&swap_lock);
17131884 return -ENODEV;
17141885 }
17151886
....@@ -1756,7 +1927,7 @@
17561927
17571928 static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte)
17581929 {
1759
- return pte_same(pte_swp_clear_soft_dirty(pte), swp_pte);
1930
+ return pte_same(pte_swp_clear_flags(pte), swp_pte);
17601931 }
17611932
17621933 /*
....@@ -1768,7 +1939,6 @@
17681939 unsigned long addr, swp_entry_t entry, struct page *page)
17691940 {
17701941 struct page *swapcache;
1771
- struct mem_cgroup *memcg;
17721942 spinlock_t *ptl;
17731943 pte_t *pte;
17741944 int ret = 1;
....@@ -1778,15 +1948,8 @@
17781948 if (unlikely(!page))
17791949 return -ENOMEM;
17801950
1781
- if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL,
1782
- &memcg, false)) {
1783
- ret = -ENOMEM;
1784
- goto out_nolock;
1785
- }
1786
-
17871951 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
17881952 if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) {
1789
- mem_cgroup_cancel_charge(page, memcg, false);
17901953 ret = 0;
17911954 goto out;
17921955 }
....@@ -1798,21 +1961,13 @@
17981961 pte_mkold(mk_pte(page, vma->vm_page_prot)));
17991962 if (page == swapcache) {
18001963 page_add_anon_rmap(page, vma, addr, false);
1801
- mem_cgroup_commit_charge(page, memcg, true, false);
18021964 } else { /* ksm created a completely new copy */
18031965 page_add_new_anon_rmap(page, vma, addr, false);
1804
- mem_cgroup_commit_charge(page, memcg, false, false);
1805
- lru_cache_add_active_or_unevictable(page, vma);
1966
+ lru_cache_add_inactive_or_unevictable(page, vma);
18061967 }
18071968 swap_free(entry);
1808
- /*
1809
- * Move the page to the active list so it is not
1810
- * immediately swapped out again after swapon.
1811
- */
1812
- activate_page(page);
18131969 out:
18141970 pte_unmap_unlock(pte, ptl);
1815
-out_nolock:
18161971 if (page != swapcache) {
18171972 unlock_page(page);
18181973 put_page(page);
....@@ -1821,44 +1976,83 @@
18211976 }
18221977
18231978 static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
1824
- unsigned long addr, unsigned long end,
1825
- swp_entry_t entry, struct page *page)
1979
+ unsigned long addr, unsigned long end,
1980
+ unsigned int type, bool frontswap,
1981
+ unsigned long *fs_pages_to_unuse)
18261982 {
1827
- pte_t swp_pte = swp_entry_to_pte(entry);
1983
+ struct page *page;
1984
+ swp_entry_t entry;
18281985 pte_t *pte;
1986
+ struct swap_info_struct *si;
1987
+ unsigned long offset;
18291988 int ret = 0;
1989
+ volatile unsigned char *swap_map;
18301990
1831
- /*
1832
- * We don't actually need pte lock while scanning for swp_pte: since
1833
- * we hold page lock and mmap_sem, swp_pte cannot be inserted into the
1834
- * page table while we're scanning; though it could get zapped, and on
1835
- * some architectures (e.g. x86_32 with PAE) we might catch a glimpse
1836
- * of unmatched parts which look like swp_pte, so unuse_pte must
1837
- * recheck under pte lock. Scanning without pte lock lets it be
1838
- * preemptable whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE.
1839
- */
1991
+ si = swap_info[type];
18401992 pte = pte_offset_map(pmd, addr);
18411993 do {
1842
- /*
1843
- * swapoff spends a _lot_ of time in this loop!
1844
- * Test inline before going to call unuse_pte.
1845
- */
1846
- if (unlikely(pte_same_as_swp(*pte, swp_pte))) {
1847
- pte_unmap(pte);
1848
- ret = unuse_pte(vma, pmd, addr, entry, page);
1849
- if (ret)
1850
- goto out;
1851
- pte = pte_offset_map(pmd, addr);
1994
+ if (!is_swap_pte(*pte))
1995
+ continue;
1996
+
1997
+ entry = pte_to_swp_entry(*pte);
1998
+ if (swp_type(entry) != type)
1999
+ continue;
2000
+
2001
+ offset = swp_offset(entry);
2002
+ if (frontswap && !frontswap_test(si, offset))
2003
+ continue;
2004
+
2005
+ pte_unmap(pte);
2006
+ swap_map = &si->swap_map[offset];
2007
+ page = lookup_swap_cache(entry, vma, addr);
2008
+ if (!page) {
2009
+ struct vm_fault vmf = {
2010
+ .vma = vma,
2011
+ .address = addr,
2012
+ .pmd = pmd,
2013
+ };
2014
+
2015
+ page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
2016
+ &vmf);
18522017 }
2018
+ if (!page) {
2019
+ if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD)
2020
+ goto try_next;
2021
+ return -ENOMEM;
2022
+ }
2023
+
2024
+ lock_page(page);
2025
+ wait_on_page_writeback(page);
2026
+ ret = unuse_pte(vma, pmd, addr, entry, page);
2027
+ if (ret < 0) {
2028
+ unlock_page(page);
2029
+ put_page(page);
2030
+ goto out;
2031
+ }
2032
+
2033
+ try_to_free_swap(page);
2034
+ trace_android_vh_unuse_swap_page(si, page);
2035
+ unlock_page(page);
2036
+ put_page(page);
2037
+
2038
+ if (*fs_pages_to_unuse && !--(*fs_pages_to_unuse)) {
2039
+ ret = FRONTSWAP_PAGES_UNUSED;
2040
+ goto out;
2041
+ }
2042
+try_next:
2043
+ pte = pte_offset_map(pmd, addr);
18532044 } while (pte++, addr += PAGE_SIZE, addr != end);
18542045 pte_unmap(pte - 1);
2046
+
2047
+ ret = 0;
18552048 out:
18562049 return ret;
18572050 }
18582051
18592052 static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
18602053 unsigned long addr, unsigned long end,
1861
- swp_entry_t entry, struct page *page)
2054
+ unsigned int type, bool frontswap,
2055
+ unsigned long *fs_pages_to_unuse)
18622056 {
18632057 pmd_t *pmd;
18642058 unsigned long next;
....@@ -1870,7 +2064,8 @@
18702064 next = pmd_addr_end(addr, end);
18712065 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
18722066 continue;
1873
- ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
2067
+ ret = unuse_pte_range(vma, pmd, addr, next, type,
2068
+ frontswap, fs_pages_to_unuse);
18742069 if (ret)
18752070 return ret;
18762071 } while (pmd++, addr = next, addr != end);
....@@ -1879,7 +2074,8 @@
18792074
18802075 static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
18812076 unsigned long addr, unsigned long end,
1882
- swp_entry_t entry, struct page *page)
2077
+ unsigned int type, bool frontswap,
2078
+ unsigned long *fs_pages_to_unuse)
18832079 {
18842080 pud_t *pud;
18852081 unsigned long next;
....@@ -1890,7 +2086,8 @@
18902086 next = pud_addr_end(addr, end);
18912087 if (pud_none_or_clear_bad(pud))
18922088 continue;
1893
- ret = unuse_pmd_range(vma, pud, addr, next, entry, page);
2089
+ ret = unuse_pmd_range(vma, pud, addr, next, type,
2090
+ frontswap, fs_pages_to_unuse);
18942091 if (ret)
18952092 return ret;
18962093 } while (pud++, addr = next, addr != end);
....@@ -1899,7 +2096,8 @@
18992096
19002097 static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
19012098 unsigned long addr, unsigned long end,
1902
- swp_entry_t entry, struct page *page)
2099
+ unsigned int type, bool frontswap,
2100
+ unsigned long *fs_pages_to_unuse)
19032101 {
19042102 p4d_t *p4d;
19052103 unsigned long next;
....@@ -1910,78 +2108,66 @@
19102108 next = p4d_addr_end(addr, end);
19112109 if (p4d_none_or_clear_bad(p4d))
19122110 continue;
1913
- ret = unuse_pud_range(vma, p4d, addr, next, entry, page);
2111
+ ret = unuse_pud_range(vma, p4d, addr, next, type,
2112
+ frontswap, fs_pages_to_unuse);
19142113 if (ret)
19152114 return ret;
19162115 } while (p4d++, addr = next, addr != end);
19172116 return 0;
19182117 }
19192118
1920
-static int unuse_vma(struct vm_area_struct *vma,
1921
- swp_entry_t entry, struct page *page)
2119
+static int unuse_vma(struct vm_area_struct *vma, unsigned int type,
2120
+ bool frontswap, unsigned long *fs_pages_to_unuse)
19222121 {
19232122 pgd_t *pgd;
19242123 unsigned long addr, end, next;
19252124 int ret;
19262125
1927
- if (page_anon_vma(page)) {
1928
- addr = page_address_in_vma(page, vma);
1929
- if (addr == -EFAULT)
1930
- return 0;
1931
- else
1932
- end = addr + PAGE_SIZE;
1933
- } else {
1934
- addr = vma->vm_start;
1935
- end = vma->vm_end;
1936
- }
2126
+ addr = vma->vm_start;
2127
+ end = vma->vm_end;
19372128
19382129 pgd = pgd_offset(vma->vm_mm, addr);
19392130 do {
19402131 next = pgd_addr_end(addr, end);
19412132 if (pgd_none_or_clear_bad(pgd))
19422133 continue;
1943
- ret = unuse_p4d_range(vma, pgd, addr, next, entry, page);
2134
+ ret = unuse_p4d_range(vma, pgd, addr, next, type,
2135
+ frontswap, fs_pages_to_unuse);
19442136 if (ret)
19452137 return ret;
19462138 } while (pgd++, addr = next, addr != end);
19472139 return 0;
19482140 }
19492141
1950
-static int unuse_mm(struct mm_struct *mm,
1951
- swp_entry_t entry, struct page *page)
2142
+static int unuse_mm(struct mm_struct *mm, unsigned int type,
2143
+ bool frontswap, unsigned long *fs_pages_to_unuse)
19522144 {
19532145 struct vm_area_struct *vma;
19542146 int ret = 0;
19552147
1956
- if (!down_read_trylock(&mm->mmap_sem)) {
1957
- /*
1958
- * Activate page so shrink_inactive_list is unlikely to unmap
1959
- * its ptes while lock is dropped, so swapoff can make progress.
1960
- */
1961
- activate_page(page);
1962
- unlock_page(page);
1963
- down_read(&mm->mmap_sem);
1964
- lock_page(page);
1965
- }
2148
+ mmap_read_lock(mm);
19662149 for (vma = mm->mmap; vma; vma = vma->vm_next) {
1967
- if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
1968
- break;
2150
+ if (vma->anon_vma) {
2151
+ ret = unuse_vma(vma, type, frontswap,
2152
+ fs_pages_to_unuse);
2153
+ if (ret)
2154
+ break;
2155
+ }
19692156 cond_resched();
19702157 }
1971
- up_read(&mm->mmap_sem);
1972
- return (ret < 0)? ret: 0;
2158
+ mmap_read_unlock(mm);
2159
+ return ret;
19732160 }
19742161
19752162 /*
19762163 * Scan swap_map (or frontswap_map if frontswap parameter is true)
1977
- * from current position to next entry still in use.
1978
- * Recycle to start on reaching the end, returning 0 when empty.
2164
+ * from current position to next entry still in use. Return 0
2165
+ * if there are no inuse entries after prev till end of the map.
19792166 */
19802167 static unsigned int find_next_to_unuse(struct swap_info_struct *si,
19812168 unsigned int prev, bool frontswap)
19822169 {
1983
- unsigned int max = si->max;
1984
- unsigned int i = prev;
2170
+ unsigned int i;
19852171 unsigned char count;
19862172
19872173 /*
....@@ -1990,20 +2176,7 @@
19902176 * hits are okay, and sys_swapoff() has already prevented new
19912177 * allocations from this area (while holding swap_lock).
19922178 */
1993
- for (;;) {
1994
- if (++i >= max) {
1995
- if (!prev) {
1996
- i = 0;
1997
- break;
1998
- }
1999
- /*
2000
- * No entries in use at top of swap_map,
2001
- * loop back to start and recheck there.
2002
- */
2003
- max = prev + 1;
2004
- prev = 0;
2005
- i = 1;
2006
- }
2179
+ for (i = prev + 1; i < si->max; i++) {
20072180 count = READ_ONCE(si->swap_map[i]);
20082181 if (count && swap_count(count) != SWAP_MAP_BAD)
20092182 if (!frontswap || frontswap_test(si, i))
....@@ -2011,240 +2184,124 @@
20112184 if ((i % LATENCY_LIMIT) == 0)
20122185 cond_resched();
20132186 }
2187
+
2188
+ if (i == si->max)
2189
+ i = 0;
2190
+
20142191 return i;
20152192 }
20162193
20172194 /*
2018
- * We completely avoid races by reading each swap page in advance,
2019
- * and then search for the process using it. All the necessary
2020
- * page table adjustments can then be made atomically.
2021
- *
2022
- * if the boolean frontswap is true, only unuse pages_to_unuse pages;
2195
+ * If the boolean frontswap is true, only unuse pages_to_unuse pages;
20232196 * pages_to_unuse==0 means all pages; ignored if frontswap is false
20242197 */
20252198 int try_to_unuse(unsigned int type, bool frontswap,
20262199 unsigned long pages_to_unuse)
20272200 {
2201
+ struct mm_struct *prev_mm;
2202
+ struct mm_struct *mm;
2203
+ struct list_head *p;
2204
+ int retval = 0;
20282205 struct swap_info_struct *si = swap_info[type];
2029
- struct mm_struct *start_mm;
2030
- volatile unsigned char *swap_map; /* swap_map is accessed without
2031
- * locking. Mark it as volatile
2032
- * to prevent compiler doing
2033
- * something odd.
2034
- */
2035
- unsigned char swcount;
20362206 struct page *page;
20372207 swp_entry_t entry;
2038
- unsigned int i = 0;
2039
- int retval = 0;
2208
+ unsigned int i;
20402209
2041
- /*
2042
- * When searching mms for an entry, a good strategy is to
2043
- * start at the first mm we freed the previous entry from
2044
- * (though actually we don't notice whether we or coincidence
2045
- * freed the entry). Initialize this start_mm with a hold.
2046
- *
2047
- * A simpler strategy would be to start at the last mm we
2048
- * freed the previous entry from; but that would take less
2049
- * advantage of mmlist ordering, which clusters forked mms
2050
- * together, child after parent. If we race with dup_mmap(), we
2051
- * prefer to resolve parent before child, lest we miss entries
2052
- * duplicated after we scanned child: using last mm would invert
2053
- * that.
2054
- */
2055
- start_mm = &init_mm;
2056
- mmget(&init_mm);
2210
+ if (!READ_ONCE(si->inuse_pages))
2211
+ return 0;
20572212
2058
- /*
2059
- * Keep on scanning until all entries have gone. Usually,
2060
- * one pass through swap_map is enough, but not necessarily:
2061
- * there are races when an instance of an entry might be missed.
2062
- */
2063
- while ((i = find_next_to_unuse(si, i, frontswap)) != 0) {
2064
- if (signal_pending(current)) {
2065
- retval = -EINTR;
2066
- break;
2067
- }
2213
+ if (!frontswap)
2214
+ pages_to_unuse = 0;
20682215
2069
- /*
2070
- * Get a page for the entry, using the existing swap
2071
- * cache page if there is one. Otherwise, get a clean
2072
- * page and read the swap into it.
2073
- */
2074
- swap_map = &si->swap_map[i];
2075
- entry = swp_entry(type, i);
2076
- page = read_swap_cache_async(entry,
2077
- GFP_HIGHUSER_MOVABLE, NULL, 0, false);
2078
- if (!page) {
2079
- /*
2080
- * Either swap_duplicate() failed because entry
2081
- * has been freed independently, and will not be
2082
- * reused since sys_swapoff() already disabled
2083
- * allocation from here, or alloc_page() failed.
2084
- */
2085
- swcount = *swap_map;
2086
- /*
2087
- * We don't hold lock here, so the swap entry could be
2088
- * SWAP_MAP_BAD (when the cluster is discarding).
2089
- * Instead of fail out, We can just skip the swap
2090
- * entry because swapoff will wait for discarding
2091
- * finish anyway.
2092
- */
2093
- if (!swcount || swcount == SWAP_MAP_BAD)
2094
- continue;
2095
- retval = -ENOMEM;
2096
- break;
2097
- }
2216
+retry:
2217
+ retval = shmem_unuse(type, frontswap, &pages_to_unuse);
2218
+ if (retval)
2219
+ goto out;
20982220
2099
- /*
2100
- * Don't hold on to start_mm if it looks like exiting.
2101
- */
2102
- if (atomic_read(&start_mm->mm_users) == 1) {
2103
- mmput(start_mm);
2104
- start_mm = &init_mm;
2105
- mmget(&init_mm);
2106
- }
2221
+ prev_mm = &init_mm;
2222
+ mmget(prev_mm);
21072223
2108
- /*
2109
- * Wait for and lock page. When do_swap_page races with
2110
- * try_to_unuse, do_swap_page can handle the fault much
2111
- * faster than try_to_unuse can locate the entry. This
2112
- * apparently redundant "wait_on_page_locked" lets try_to_unuse
2113
- * defer to do_swap_page in such a case - in some tests,
2114
- * do_swap_page and try_to_unuse repeatedly compete.
2115
- */
2116
- wait_on_page_locked(page);
2117
- wait_on_page_writeback(page);
2118
- lock_page(page);
2119
- wait_on_page_writeback(page);
2224
+ spin_lock(&mmlist_lock);
2225
+ p = &init_mm.mmlist;
2226
+ while (READ_ONCE(si->inuse_pages) &&
2227
+ !signal_pending(current) &&
2228
+ (p = p->next) != &init_mm.mmlist) {
21202229
2121
- /*
2122
- * Remove all references to entry.
2123
- */
2124
- swcount = *swap_map;
2125
- if (swap_count(swcount) == SWAP_MAP_SHMEM) {
2126
- retval = shmem_unuse(entry, page);
2127
- /* page has already been unlocked and released */
2128
- if (retval < 0)
2129
- break;
2230
+ mm = list_entry(p, struct mm_struct, mmlist);
2231
+ if (!mmget_not_zero(mm))
21302232 continue;
2131
- }
2132
- if (swap_count(swcount) && start_mm != &init_mm)
2133
- retval = unuse_mm(start_mm, entry, page);
2233
+ spin_unlock(&mmlist_lock);
2234
+ mmput(prev_mm);
2235
+ prev_mm = mm;
2236
+ retval = unuse_mm(mm, type, frontswap, &pages_to_unuse);
21342237
2135
- if (swap_count(*swap_map)) {
2136
- int set_start_mm = (*swap_map >= swcount);
2137
- struct list_head *p = &start_mm->mmlist;
2138
- struct mm_struct *new_start_mm = start_mm;
2139
- struct mm_struct *prev_mm = start_mm;
2140
- struct mm_struct *mm;
2141
-
2142
- mmget(new_start_mm);
2143
- mmget(prev_mm);
2144
- spin_lock(&mmlist_lock);
2145
- while (swap_count(*swap_map) && !retval &&
2146
- (p = p->next) != &start_mm->mmlist) {
2147
- mm = list_entry(p, struct mm_struct, mmlist);
2148
- if (!mmget_not_zero(mm))
2149
- continue;
2150
- spin_unlock(&mmlist_lock);
2151
- mmput(prev_mm);
2152
- prev_mm = mm;
2153
-
2154
- cond_resched();
2155
-
2156
- swcount = *swap_map;
2157
- if (!swap_count(swcount)) /* any usage ? */
2158
- ;
2159
- else if (mm == &init_mm)
2160
- set_start_mm = 1;
2161
- else
2162
- retval = unuse_mm(mm, entry, page);
2163
-
2164
- if (set_start_mm && *swap_map < swcount) {
2165
- mmput(new_start_mm);
2166
- mmget(mm);
2167
- new_start_mm = mm;
2168
- set_start_mm = 0;
2169
- }
2170
- spin_lock(&mmlist_lock);
2171
- }
2172
- spin_unlock(&mmlist_lock);
2173
- mmput(prev_mm);
2174
- mmput(start_mm);
2175
- start_mm = new_start_mm;
2176
- }
21772238 if (retval) {
2178
- unlock_page(page);
2179
- put_page(page);
2180
- break;
2239
+ mmput(prev_mm);
2240
+ goto out;
21812241 }
2182
-
2183
- /*
2184
- * If a reference remains (rare), we would like to leave
2185
- * the page in the swap cache; but try_to_unmap could
2186
- * then re-duplicate the entry once we drop page lock,
2187
- * so we might loop indefinitely; also, that page could
2188
- * not be swapped out to other storage meanwhile. So:
2189
- * delete from cache even if there's another reference,
2190
- * after ensuring that the data has been saved to disk -
2191
- * since if the reference remains (rarer), it will be
2192
- * read from disk into another page. Splitting into two
2193
- * pages would be incorrect if swap supported "shared
2194
- * private" pages, but they are handled by tmpfs files.
2195
- *
2196
- * Given how unuse_vma() targets one particular offset
2197
- * in an anon_vma, once the anon_vma has been determined,
2198
- * this splitting happens to be just what is needed to
2199
- * handle where KSM pages have been swapped out: re-reading
2200
- * is unnecessarily slow, but we can fix that later on.
2201
- */
2202
- if (swap_count(*swap_map) &&
2203
- PageDirty(page) && PageSwapCache(page)) {
2204
- struct writeback_control wbc = {
2205
- .sync_mode = WB_SYNC_NONE,
2206
- };
2207
-
2208
- swap_writepage(compound_head(page), &wbc);
2209
- lock_page(page);
2210
- wait_on_page_writeback(page);
2211
- }
2212
-
2213
- /*
2214
- * It is conceivable that a racing task removed this page from
2215
- * swap cache just before we acquired the page lock at the top,
2216
- * or while we dropped it in unuse_mm(). The page might even
2217
- * be back in swap cache on another swap area: that we must not
2218
- * delete, since it may not have been written out to swap yet.
2219
- */
2220
- if (PageSwapCache(page) &&
2221
- likely(page_private(page) == entry.val) &&
2222
- (!PageTransCompound(page) ||
2223
- !swap_page_trans_huge_swapped(si, entry)))
2224
- delete_from_swap_cache(compound_head(page));
2225
-
2226
- /*
2227
- * So we could skip searching mms once swap count went
2228
- * to 1, we did not mark any present ptes as dirty: must
2229
- * mark page dirty so shrink_page_list will preserve it.
2230
- */
2231
- SetPageDirty(page);
2232
- unlock_page(page);
2233
- put_page(page);
22342242
22352243 /*
22362244 * Make sure that we aren't completely killing
22372245 * interactive performance.
22382246 */
22392247 cond_resched();
2240
- if (frontswap && pages_to_unuse > 0) {
2241
- if (!--pages_to_unuse)
2242
- break;
2243
- }
2248
+ spin_lock(&mmlist_lock);
2249
+ }
2250
+ spin_unlock(&mmlist_lock);
2251
+
2252
+ mmput(prev_mm);
2253
+
2254
+ i = 0;
2255
+ while (READ_ONCE(si->inuse_pages) &&
2256
+ !signal_pending(current) &&
2257
+ (i = find_next_to_unuse(si, i, frontswap)) != 0) {
2258
+
2259
+ entry = swp_entry(type, i);
2260
+ page = find_get_page(swap_address_space(entry), i);
2261
+ if (!page)
2262
+ continue;
2263
+
2264
+ /*
2265
+ * It is conceivable that a racing task removed this page from
2266
+ * swap cache just before we acquired the page lock. The page
2267
+ * might even be back in swap cache on another swap area. But
2268
+ * that is okay, try_to_free_swap() only removes stale pages.
2269
+ */
2270
+ lock_page(page);
2271
+ wait_on_page_writeback(page);
2272
+ try_to_free_swap(page);
2273
+ trace_android_vh_unuse_swap_page(si, page);
2274
+ unlock_page(page);
2275
+ put_page(page);
2276
+
2277
+ /*
2278
+ * For frontswap, we just need to unuse pages_to_unuse, if
2279
+ * it was specified. Need not check frontswap again here as
2280
+ * we already zeroed out pages_to_unuse if not frontswap.
2281
+ */
2282
+ if (pages_to_unuse && --pages_to_unuse == 0)
2283
+ goto out;
22442284 }
22452285
2246
- mmput(start_mm);
2247
- return retval;
2286
+ /*
2287
+ * Lets check again to see if there are still swap entries in the map.
2288
+ * If yes, we would need to do retry the unuse logic again.
2289
+ * Under global memory pressure, swap entries can be reinserted back
2290
+ * into process space after the mmlist loop above passes over them.
2291
+ *
2292
+ * Limit the number of retries? No: when mmget_not_zero() above fails,
2293
+ * that mm is likely to be freeing swap from exit_mmap(), which proceeds
2294
+ * at its own independent pace; and even shmem_writepage() could have
2295
+ * been preempted after get_swap_page(), temporarily hiding that swap.
2296
+ * It's easy and robust (though cpu-intensive) just to keep retrying.
2297
+ */
2298
+ if (READ_ONCE(si->inuse_pages)) {
2299
+ if (!signal_pending(current))
2300
+ goto retry;
2301
+ retval = -EINTR;
2302
+ }
2303
+out:
2304
+ return (retval == FRONTSWAP_PAGES_UNUSED) ? 0 : retval;
22482305 }
22492306
22502307 /*
....@@ -2276,7 +2333,6 @@
22762333 static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
22772334 {
22782335 struct swap_info_struct *sis;
2279
- struct swap_extent *start_se;
22802336 struct swap_extent *se;
22812337 pgoff_t offset;
22822338
....@@ -2284,18 +2340,8 @@
22842340 *bdev = sis->bdev;
22852341
22862342 offset = swp_offset(entry);
2287
- start_se = sis->curr_swap_extent;
2288
- se = start_se;
2289
-
2290
- for ( ; ; ) {
2291
- if (se->start_page <= offset &&
2292
- offset < (se->start_page + se->nr_pages)) {
2293
- return se->start_block + (offset - se->start_page);
2294
- }
2295
- se = list_next_entry(se, list);
2296
- sis->curr_swap_extent = se;
2297
- BUG_ON(se == start_se); /* It *must* be present */
2298
- }
2343
+ se = offset_to_swap_extent(sis, offset);
2344
+ return se->start_block + (offset - se->start_page);
22992345 }
23002346
23012347 /*
....@@ -2305,7 +2351,7 @@
23052351 {
23062352 swp_entry_t entry;
23072353 entry.val = page_private(page);
2308
- return map_swap_entry(entry, bdev) << (PAGE_SHIFT - 9);
2354
+ return map_swap_entry(entry, bdev);
23092355 }
23102356
23112357 /*
....@@ -2313,27 +2359,27 @@
23132359 */
23142360 static void destroy_swap_extents(struct swap_info_struct *sis)
23152361 {
2316
- while (!list_empty(&sis->first_swap_extent.list)) {
2317
- struct swap_extent *se;
2362
+ while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) {
2363
+ struct rb_node *rb = sis->swap_extent_root.rb_node;
2364
+ struct swap_extent *se = rb_entry(rb, struct swap_extent, rb_node);
23182365
2319
- se = list_first_entry(&sis->first_swap_extent.list,
2320
- struct swap_extent, list);
2321
- list_del(&se->list);
2366
+ rb_erase(rb, &sis->swap_extent_root);
23222367 kfree(se);
23232368 }
23242369
2325
- if (sis->flags & SWP_FILE) {
2370
+ if (sis->flags & SWP_ACTIVATED) {
23262371 struct file *swap_file = sis->swap_file;
23272372 struct address_space *mapping = swap_file->f_mapping;
23282373
2329
- sis->flags &= ~SWP_FILE;
2330
- mapping->a_ops->swap_deactivate(swap_file);
2374
+ sis->flags &= ~SWP_ACTIVATED;
2375
+ if (mapping->a_ops->swap_deactivate)
2376
+ mapping->a_ops->swap_deactivate(swap_file);
23312377 }
23322378 }
23332379
23342380 /*
23352381 * Add a block range (and the corresponding page range) into this swapdev's
2336
- * extent list. The extent list is kept sorted in page order.
2382
+ * extent tree.
23372383 *
23382384 * This function rather assumes that it is called in ascending page order.
23392385 */
....@@ -2341,20 +2387,21 @@
23412387 add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
23422388 unsigned long nr_pages, sector_t start_block)
23432389 {
2390
+ struct rb_node **link = &sis->swap_extent_root.rb_node, *parent = NULL;
23442391 struct swap_extent *se;
23452392 struct swap_extent *new_se;
2346
- struct list_head *lh;
23472393
2348
- if (start_page == 0) {
2349
- se = &sis->first_swap_extent;
2350
- sis->curr_swap_extent = se;
2351
- se->start_page = 0;
2352
- se->nr_pages = nr_pages;
2353
- se->start_block = start_block;
2354
- return 1;
2355
- } else {
2356
- lh = sis->first_swap_extent.list.prev; /* Highest extent */
2357
- se = list_entry(lh, struct swap_extent, list);
2394
+ /*
2395
+ * place the new node at the right most since the
2396
+ * function is called in ascending page order.
2397
+ */
2398
+ while (*link) {
2399
+ parent = *link;
2400
+ link = &parent->rb_right;
2401
+ }
2402
+
2403
+ if (parent) {
2404
+ se = rb_entry(parent, struct swap_extent, rb_node);
23582405 BUG_ON(se->start_page + se->nr_pages != start_page);
23592406 if (se->start_block + se->nr_pages == start_block) {
23602407 /* Merge it */
....@@ -2363,9 +2410,7 @@
23632410 }
23642411 }
23652412
2366
- /*
2367
- * No merge. Insert a new extent, preserving ordering.
2368
- */
2413
+ /* No merge, insert a new extent. */
23692414 new_se = kmalloc(sizeof(*se), GFP_KERNEL);
23702415 if (new_se == NULL)
23712416 return -ENOMEM;
....@@ -2373,7 +2418,8 @@
23732418 new_se->nr_pages = nr_pages;
23742419 new_se->start_block = start_block;
23752420
2376
- list_add_tail(&new_se->list, &sis->first_swap_extent.list);
2421
+ rb_link_node(&new_se->rb_node, parent, link);
2422
+ rb_insert_color(&new_se->rb_node, &sis->swap_extent_root);
23772423 return 1;
23782424 }
23792425 EXPORT_SYMBOL_GPL(add_swap_extent);
....@@ -2423,8 +2469,10 @@
24232469
24242470 if (mapping->a_ops->swap_activate) {
24252471 ret = mapping->a_ops->swap_activate(sis, swap_file, span);
2472
+ if (ret >= 0)
2473
+ sis->flags |= SWP_ACTIVATED;
24262474 if (!ret) {
2427
- sis->flags |= SWP_FILE;
2475
+ sis->flags |= SWP_FS_OPS;
24282476 ret = add_swap_extent(sis, 0, sis->max, 0);
24292477 *span = sis->pages;
24302478 }
....@@ -2446,9 +2494,9 @@
24462494 return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
24472495 }
24482496
2449
-static void _enable_swap_info(struct swap_info_struct *p, int prio,
2450
- unsigned char *swap_map,
2451
- struct swap_cluster_info *cluster_info)
2497
+static void setup_swap_info(struct swap_info_struct *p, int prio,
2498
+ unsigned char *swap_map,
2499
+ struct swap_cluster_info *cluster_info)
24522500 {
24532501 int i;
24542502
....@@ -2473,10 +2521,18 @@
24732521 }
24742522 p->swap_map = swap_map;
24752523 p->cluster_info = cluster_info;
2476
- p->flags |= SWP_WRITEOK;
2477
- atomic_long_add(p->pages, &nr_swap_pages);
2478
- total_swap_pages += p->pages;
2524
+}
24792525
2526
+static void _enable_swap_info(struct swap_info_struct *p)
2527
+{
2528
+ bool skip = false;
2529
+
2530
+ p->flags |= SWP_WRITEOK | SWP_VALID;
2531
+ trace_android_vh_account_swap_pages(p, &skip);
2532
+ if (!skip) {
2533
+ atomic_long_add(p->pages, &nr_swap_pages);
2534
+ total_swap_pages += p->pages;
2535
+ }
24802536 assert_spin_locked(&swap_lock);
24812537 /*
24822538 * both lists are plists, and thus priority ordered.
....@@ -2500,7 +2556,17 @@
25002556 frontswap_init(p->type, frontswap_map);
25012557 spin_lock(&swap_lock);
25022558 spin_lock(&p->lock);
2503
- _enable_swap_info(p, prio, swap_map, cluster_info);
2559
+ setup_swap_info(p, prio, swap_map, cluster_info);
2560
+ spin_unlock(&p->lock);
2561
+ spin_unlock(&swap_lock);
2562
+ /*
2563
+ * Guarantee swap_map, cluster_info, etc. fields are valid
2564
+ * between get/put_swap_device() if SWP_VALID bit is set
2565
+ */
2566
+ synchronize_rcu();
2567
+ spin_lock(&swap_lock);
2568
+ spin_lock(&p->lock);
2569
+ _enable_swap_info(p);
25042570 spin_unlock(&p->lock);
25052571 spin_unlock(&swap_lock);
25062572 }
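Splitting _enable_swap_info() lets enable_swap_info() publish the metadata in two steps with a synchronize_rcu() in between: after the grace period, any reader that subsequently observes SWP_VALID is also guaranteed to observe the swap_map/cluster_info stores. A stripped-down sketch of that writer-side ordering; demo_swapdev, demo_publish and the boolean valid field are illustrative stand-ins, and d->lock is assumed to be initialized elsewhere.

#include <linux/rcupdate.h>
#include <linux/spinlock.h>

struct demo_swapdev {
	spinlock_t lock;
	unsigned char *swap_map;
	bool valid;			/* stands in for the SWP_VALID flag bit */
};

static void demo_publish(struct demo_swapdev *d, unsigned char *map)
{
	/* Step 1: install the data structures under the lock. */
	spin_lock(&d->lock);
	d->swap_map = map;
	spin_unlock(&d->lock);

	/*
	 * Step 2: the grace period orders the store above against the
	 * 'valid' store below for every RCU read-side critical section
	 * that starts afterwards.
	 */
	synchronize_rcu();

	/* Step 3: only now advertise the device as usable. */
	spin_lock(&d->lock);
	d->valid = true;
	spin_unlock(&d->lock);
}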
....@@ -2509,7 +2575,8 @@
25092575 {
25102576 spin_lock(&swap_lock);
25112577 spin_lock(&p->lock);
2512
- _enable_swap_info(p, p->prio, p->swap_map, p->cluster_info);
2578
+ setup_swap_info(p, p->prio, p->swap_map, p->cluster_info);
2579
+ _enable_swap_info(p);
25132580 spin_unlock(&p->lock);
25142581 spin_unlock(&swap_lock);
25152582 }
....@@ -2537,6 +2604,7 @@
25372604 struct filename *pathname;
25382605 int err, found = 0;
25392606 unsigned int old_block_size;
2607
+ bool skip = false;
25402608
25412609 if (!capable(CAP_SYS_ADMIN))
25422610 return -EPERM;
....@@ -2574,8 +2642,8 @@
25742642 spin_unlock(&swap_lock);
25752643 goto out_dput;
25762644 }
2577
- del_from_avail_list(p);
25782645 spin_lock(&p->lock);
2646
+ del_from_avail_list(p);
25792647 if (p->prio < 0) {
25802648 struct swap_info_struct *si = p;
25812649 int nid;
....@@ -2591,8 +2659,11 @@
25912659 least_priority++;
25922660 }
25932661 plist_del(&p->list, &swap_active_head);
2594
- atomic_long_sub(p->pages, &nr_swap_pages);
2595
- total_swap_pages -= p->pages;
2662
+ trace_android_vh_account_swap_pages(p, &skip);
2663
+ if (!skip) {
2664
+ atomic_long_sub(p->pages, &nr_swap_pages);
2665
+ total_swap_pages -= p->pages;
2666
+ }
25962667 p->flags &= ~SWP_WRITEOK;
25972668 spin_unlock(&p->lock);
25982669 spin_unlock(&swap_lock);
....@@ -2611,6 +2682,17 @@
26112682 }
26122683
26132684 reenable_swap_slots_cache_unlock();
2685
+
2686
+ spin_lock(&swap_lock);
2687
+ spin_lock(&p->lock);
2688
+ p->flags &= ~SWP_VALID; /* mark swap device as invalid */
2689
+ spin_unlock(&p->lock);
2690
+ spin_unlock(&swap_lock);
2691
+ /*
2692
+ * wait for swap operations protected by get/put_swap_device()
2693
+ * to complete
2694
+ */
2695
+ synchronize_rcu();
26142696
26152697 flush_work(&p->discard_work);
26162698
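This is the swapoff half of the same protocol: SWP_VALID is cleared under the locks, then synchronize_rcu() waits out every reader that may still be inside an RCU read-side critical section before swap_map and friends are torn down further below. The reader side it pairs with is get_swap_device()/put_swap_device(); a hedged sketch of its shape, reusing the illustrative demo_swapdev type from the previous sketch:

#include <linux/rcupdate.h>

/*
 * Everything between rcu_read_lock() and rcu_read_unlock() may safely
 * dereference d->swap_map, because the teardown path clears 'valid' and
 * then waits in synchronize_rcu() before freeing the array.
 */
static unsigned char demo_peek(struct demo_swapdev *d, unsigned long offset)
{
	unsigned char count = 0;

	rcu_read_lock();
	if (d->valid)			/* stands in for the SWP_VALID check */
		count = READ_ONCE(d->swap_map[offset]);
	rcu_read_unlock();

	return count;
}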
....@@ -2647,11 +2729,14 @@
26472729 frontswap_map = frontswap_map_get(p);
26482730 spin_unlock(&p->lock);
26492731 spin_unlock(&swap_lock);
2732
+ arch_swap_invalidate_area(p->type);
26502733 frontswap_invalidate_area(p->type);
26512734 frontswap_map_set(p, NULL);
26522735 mutex_unlock(&swapon_mutex);
26532736 free_percpu(p->percpu_cluster);
26542737 p->percpu_cluster = NULL;
2738
+ free_percpu(p->cluster_next_cpu);
2739
+ p->cluster_next_cpu = NULL;
26552740 vfree(swap_map);
26562741 kvfree(cluster_info);
26572742 kvfree(frontswap_map);
....@@ -2759,20 +2844,24 @@
27592844 struct swap_info_struct *si = v;
27602845 struct file *file;
27612846 int len;
2847
+ unsigned int bytes, inuse;
27622848
27632849 if (si == SEQ_START_TOKEN) {
2764
- seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
2850
+ seq_puts(swap,"Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n");
27652851 return 0;
27662852 }
27672853
2854
+ bytes = si->pages << (PAGE_SHIFT - 10);
2855
+ inuse = si->inuse_pages << (PAGE_SHIFT - 10);
2856
+
27682857 file = si->swap_file;
27692858 len = seq_file_path(swap, file, " \t\n\\");
2770
- seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
2859
+ seq_printf(swap, "%*s%s\t%u\t%s%u\t%s%d\n",
27712860 len < 40 ? 40 - len : 1, " ",
27722861 S_ISBLK(file_inode(file)->i_mode) ?
27732862 "partition" : "file\t",
2774
- si->pages << (PAGE_SHIFT - 10),
2775
- si->inuse_pages << (PAGE_SHIFT - 10),
2863
+ bytes, bytes < 10000000 ? "\t" : "",
2864
+ inuse, inuse < 10000000 ? "\t" : "",
27762865 si->prio);
27772866 return 0;
27782867 }
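The /proc/swaps change precomputes the Size/Used values in KiB and pads short values with an extra tab, so a large swap area no longer pushes the Priority column out of line. A small sketch of the same arithmetic and padding rule; it is plain userspace C with PAGE_SHIFT assumed to be 12 (4 KiB pages) purely for illustration.

#include <stdio.h>

#define DEMO_PAGE_SHIFT 12	/* assumed 4 KiB pages, for the example only */

static void demo_show_row(const char *name, unsigned long pages,
			  unsigned long inuse_pages, int prio)
{
	/* pages -> KiB: shift by (PAGE_SHIFT - 10), since 1 KiB == 1 << 10 bytes */
	unsigned int size = pages << (DEMO_PAGE_SHIFT - 10);
	unsigned int inuse = inuse_pages << (DEMO_PAGE_SHIFT - 10);

	/* values below eight digits get one extra tab to keep the columns aligned */
	printf("%-40s\t%u\t%s%u\t%s%d\n", name,
	       size, size < 10000000 ? "\t" : "",
	       inuse, inuse < 10000000 ? "\t" : "",
	       prio);
}

int main(void)
{
	demo_show_row("/dev/sda2", 2097152, 1024, -2);	/* an 8 GiB swap partition */
	return 0;
}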
....@@ -2798,17 +2887,18 @@
27982887 return 0;
27992888 }
28002889
2801
-static const struct file_operations proc_swaps_operations = {
2802
- .open = swaps_open,
2803
- .read = seq_read,
2804
- .llseek = seq_lseek,
2805
- .release = seq_release,
2806
- .poll = swaps_poll,
2890
+static const struct proc_ops swaps_proc_ops = {
2891
+ .proc_flags = PROC_ENTRY_PERMANENT,
2892
+ .proc_open = swaps_open,
2893
+ .proc_read = seq_read,
2894
+ .proc_lseek = seq_lseek,
2895
+ .proc_release = seq_release,
2896
+ .proc_poll = swaps_poll,
28072897 };
28082898
28092899 static int __init procswaps_init(void)
28102900 {
2811
- proc_create("swaps", 0, NULL, &proc_swaps_operations);
2901
+ proc_create("swaps", 0, NULL, &swaps_proc_ops);
28122902 return 0;
28132903 }
28142904 __initcall(procswaps_init);
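Switching from file_operations to proc_ops follows the procfs interface change that gives /proc entries their own, smaller ops table; PROC_ENTRY_PERMANENT additionally tells procfs the entry is never removed, so it can skip some per-call refcounting. A minimal, hedged sketch of registering a seq_file-backed entry through the new interface; every demo_* identifier is illustrative.

#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int demo_show(struct seq_file *m, void *v)
{
	seq_puts(m, "hello from /proc/demo\n");
	return 0;
}

static int demo_open(struct inode *inode, struct file *file)
{
	return single_open(file, demo_show, NULL);
}

static const struct proc_ops demo_proc_ops = {
	.proc_open	= demo_open,
	.proc_read	= seq_read,
	.proc_lseek	= seq_lseek,
	.proc_release	= single_release,
};

static int __init demo_proc_init(void)
{
	proc_create("demo", 0444, NULL, &demo_proc_ops);
	return 0;
}
__initcall(demo_proc_init);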
....@@ -2825,13 +2915,16 @@
28252915
28262916 static struct swap_info_struct *alloc_swap_info(void)
28272917 {
2828
- struct swap_info_struct *p;
2918
+ struct swap_info_struct *p = NULL;
28292919 struct swap_info_struct *defer = NULL;
28302920 unsigned int type;
28312921 int i;
2832
- int size = sizeof(*p) + nr_node_ids * sizeof(struct plist_node);
2922
+ bool skip = false;
28332923
2834
- p = kvzalloc(size, GFP_KERNEL);
2924
+ trace_android_rvh_alloc_si(&p, &skip);
2925
+ trace_android_vh_alloc_si(&p, &skip);
2926
+ if (!skip)
2927
+ p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL);
28352928 if (!p)
28362929 return ERR_PTR(-ENOMEM);
28372930
....@@ -2863,7 +2956,7 @@
28632956 * would be relying on p->type to remain valid.
28642957 */
28652958 }
2866
- INIT_LIST_HEAD(&p->first_swap_extent.list);
2959
+ p->swap_extent_root = RB_ROOT;
28672960 plist_node_init(&p->list, 0);
28682961 for_each_node(i)
28692962 plist_node_init(&p->avail_lists[i], 0);
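alloc_swap_info() now sizes the allocation with struct_size(), the overflow-checked helper for structures that end in a flexible array (here the per-node avail_lists[]). A hedged sketch of the idiom on a hypothetical structure; demo_device and demo_alloc are illustrative.

#include <linux/mm.h>
#include <linux/overflow.h>
#include <linux/slab.h>

struct demo_per_node {
	int nid;
	unsigned long weight;
};

struct demo_device {
	int type;
	unsigned long pages;
	struct demo_per_node per_node[];	/* flexible array member */
};

static struct demo_device *demo_alloc(unsigned int nr_nodes)
{
	struct demo_device *d;

	/*
	 * struct_size(d, per_node, nr_nodes) ==
	 *     sizeof(*d) + nr_nodes * sizeof(d->per_node[0]),
	 * saturating at SIZE_MAX on overflow so the allocation fails
	 * instead of being silently undersized. Only sizeof is applied
	 * to 'd', so using it before assignment is fine.
	 */
	d = kvzalloc(struct_size(d, per_node, nr_nodes), GFP_KERNEL);
	return d;
}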
....@@ -2881,10 +2974,10 @@
28812974 int error;
28822975
28832976 if (S_ISBLK(inode->i_mode)) {
2884
- p->bdev = bdgrab(I_BDEV(inode));
2885
- error = blkdev_get(p->bdev,
2977
+ p->bdev = blkdev_get_by_dev(inode->i_rdev,
28862978 FMODE_READ | FMODE_WRITE | FMODE_EXCL, p);
2887
- if (error < 0) {
2979
+ if (IS_ERR(p->bdev)) {
2980
+ error = PTR_ERR(p->bdev);
28882981 p->bdev = NULL;
28892982 return error;
28902983 }
....@@ -2892,6 +2985,13 @@
28922985 error = set_blocksize(p->bdev, PAGE_SIZE);
28932986 if (error < 0)
28942987 return error;
2988
+ /*
2989
+ * Zoned block devices contain zones that have a sequential
2990
+ * write only restriction. Hence zoned block devices are not
2991
+ * suitable for swapping. Disallow them here.
2992
+ */
2993
+ if (blk_queue_is_zoned(p->bdev->bd_disk->queue))
2994
+ return -EINVAL;
28952995 p->flags |= SWP_BLKDEV;
28962996 } else if (S_ISREG(inode->i_mode)) {
28972997 p->bdev = inode->i_sb->s_bdev;
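Two separate changes land in these claim_swapfile() hunks: bdgrab()+blkdev_get() collapse into blkdev_get_by_dev(), which returns the block_device or an ERR_PTR() rather than an int, and zoned block devices are rejected because their zones only accept sequential writes. A hedged sketch of that open-and-validate pattern in isolation; demo_open_swap_bdev is illustrative and, unlike the patch, releases the device itself on the zoned error path so the snippet is self-contained.

#include <linux/blkdev.h>
#include <linux/err.h>
#include <linux/fs.h>

static struct block_device *demo_open_swap_bdev(dev_t dev, void *holder)
{
	struct block_device *bdev;

	bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE | FMODE_EXCL,
				 holder);
	if (IS_ERR(bdev))
		return bdev;			/* propagate the ERR_PTR as-is */

	/* Zoned devices enforce sequential writes per zone: unusable for swap. */
	if (blk_queue_is_zoned(bdev->bd_disk->queue)) {
		blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
		return ERR_PTR(-EINVAL);
	}

	return bdev;
}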
....@@ -3188,10 +3288,10 @@
31883288 goto bad_swap_unlock_inode;
31893289 }
31903290
3191
- if (bdi_cap_stable_pages_required(inode_to_bdi(inode)))
3291
+ if (p->bdev && blk_queue_stable_writes(p->bdev->bd_disk->queue))
31923292 p->flags |= SWP_STABLE_WRITES;
31933293
3194
- if (bdi_cap_synchronous_io(inode_to_bdi(inode)))
3294
+ if (p->bdev && p->bdev->bd_disk->fops->rw_page)
31953295 p->flags |= SWP_SYNCHRONOUS_IO;
31963296
31973297 if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) {
....@@ -3199,11 +3299,19 @@
31993299 unsigned long ci, nr_cluster;
32003300
32013301 p->flags |= SWP_SOLIDSTATE;
3302
+ p->cluster_next_cpu = alloc_percpu(unsigned int);
3303
+ if (!p->cluster_next_cpu) {
3304
+ error = -ENOMEM;
3305
+ goto bad_swap_unlock_inode;
3306
+ }
32023307 /*
32033308 * select a random position to start with to help wear leveling
32043309 * SSD
32053310 */
3206
- p->cluster_next = 1 + (prandom_u32() % p->highest_bit);
3311
+ for_each_possible_cpu(cpu) {
3312
+ per_cpu(*p->cluster_next_cpu, cpu) =
3313
+ 1 + prandom_u32_max(p->highest_bit);
3314
+ }
32073315 nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
32083316
32093317 cluster_info = kvcalloc(nr_cluster, sizeof(*cluster_info),
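cluster_next becomes a per-CPU value so that concurrent allocations on an SSD start scanning from different offsets, and each CPU's starting point is randomized for wear leveling. A hedged sketch of the allocate-and-seed step; alloc_percpu(), per_cpu() and prandom_u32_max() are the real kernel interfaces, while demo_cluster_next and demo_init_cluster_next are illustrative.

#include <linux/errno.h>
#include <linux/percpu.h>
#include <linux/random.h>

static unsigned int __percpu *demo_cluster_next;

static int demo_init_cluster_next(unsigned int highest_bit)
{
	int cpu;

	demo_cluster_next = alloc_percpu(unsigned int);
	if (!demo_cluster_next)
		return -ENOMEM;

	/* Seed each CPU with its own offset in [1, highest_bit]. */
	for_each_possible_cpu(cpu)
		per_cpu(*demo_cluster_next, cpu) = 1 + prandom_u32_max(highest_bit);

	return 0;
}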
....@@ -3289,7 +3397,7 @@
32893397 error = inode_drain_writes(inode);
32903398 if (error) {
32913399 inode->i_flags &= ~S_SWAPFILE;
3292
- goto bad_swap_unlock_inode;
3400
+ goto free_swap_address_space;
32933401 }
32943402
32953403 mutex_lock(&swapon_mutex);
....@@ -3297,8 +3405,11 @@
32973405 if (swap_flags & SWAP_FLAG_PREFER)
32983406 prio =
32993407 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
3408
+
3409
+ trace_android_vh_swap_avail_heads_init(swap_avail_heads);
33003410 enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map);
33013411
3412
+ trace_android_vh_init_swap_info_struct(p, swap_avail_heads);
33023413 pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",
33033414 p->pages<<(PAGE_SHIFT-10), name->name, p->prio,
33043415 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
....@@ -3314,11 +3425,15 @@
33143425
33153426 error = 0;
33163427 goto out;
3428
+free_swap_address_space:
3429
+ exit_swap_address_space(p->type);
33173430 bad_swap_unlock_inode:
33183431 inode_unlock(inode);
33193432 bad_swap:
33203433 free_percpu(p->percpu_cluster);
33213434 p->percpu_cluster = NULL;
3435
+ free_percpu(p->cluster_next_cpu);
3436
+ p->cluster_next_cpu = NULL;
33223437 if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
33233438 set_blocksize(p->bdev, p->old_block_size);
33243439 blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
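The error-path hunks mirror the setup changes: cluster_next_cpu is freed alongside percpu_cluster, and the new free_swap_address_space label makes a failure in inode_drain_writes() also unwind the swap address space that was already set up, keeping teardown in strict reverse order of construction. As a hedged illustration of that goto-unwind shape (all demo_* functions are stubs invented for the sketch):

#include <linux/errno.h>

static int demo_step_a(void) { return 0; }	/* e.g. init_swap_address_space() */
static void demo_undo_a(void) { }		/* e.g. exit_swap_address_space() */
static int demo_step_b(void) { return -EIO; }	/* e.g. inode_drain_writes() */

static int demo_setup(void)
{
	int err;

	err = demo_step_a();
	if (err)
		return err;

	err = demo_step_b();
	if (err)
		goto undo_a;	/* undo step A before reporting the failure */

	return 0;

undo_a:
	demo_undo_a();
	return err;
}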
....@@ -3359,14 +3474,17 @@
33593474 spin_lock(&swap_lock);
33603475 for (type = 0; type < nr_swapfiles; type++) {
33613476 struct swap_info_struct *si = swap_info[type];
3477
+ bool skip = false;
33623478
3363
- if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
3479
+ trace_android_vh_si_swapinfo(si, &skip);
3480
+ if (!skip && (si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
33643481 nr_to_be_unused += si->inuse_pages;
33653482 }
33663483 val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused;
33673484 val->totalswap = total_swap_pages + nr_to_be_unused;
33683485 spin_unlock(&swap_lock);
33693486 }
3487
+EXPORT_SYMBOL_GPL(si_swapinfo);
33703488
33713489 /*
33723490 * Verify that a swap entry is valid and increment its swap map count.
....@@ -3388,17 +3506,11 @@
33883506 unsigned char has_cache;
33893507 int err = -EINVAL;
33903508
3391
- if (non_swap_entry(entry))
3392
- goto out;
3393
-
3394
- p = swp_swap_info(entry);
3509
+ p = get_swap_device(entry);
33953510 if (!p)
3396
- goto bad_file;
3511
+ goto out;
33973512
33983513 offset = swp_offset(entry);
3399
- if (unlikely(offset >= p->max))
3400
- goto out;
3401
-
34023514 ci = lock_cluster_or_swap_info(p, offset);
34033515
34043516 count = p->swap_map[offset];
....@@ -3439,16 +3551,14 @@
34393551 } else
34403552 err = -ENOENT; /* unused swap entry */
34413553
3442
- p->swap_map[offset] = count | has_cache;
3554
+ WRITE_ONCE(p->swap_map[offset], count | has_cache);
34433555
34443556 unlock_out:
34453557 unlock_cluster_or_swap_info(p, ci);
34463558 out:
3559
+ if (p)
3560
+ put_swap_device(p);
34473561 return err;
3448
-
3449
-bad_file:
3450
- pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val);
3451
- goto out;
34523562 }
34533563
34543564 /*
....@@ -3481,7 +3591,7 @@
34813591 *
34823592 * Called when allocating swap cache for existing swap entry,
34833593 * This can return error codes. Returns 0 at success.
3484
- * -EBUSY means there is a swap cache.
3594
+ * -EEXIST means there is a swap cache.
34853595 * Note: return code is different from swap_duplicate().
34863596 */
34873597 int swapcache_prepare(swp_entry_t entry)
....@@ -3493,6 +3603,7 @@
34933603 {
34943604 return swap_type_to_swap_info(swp_type(entry));
34953605 }
3606
+EXPORT_SYMBOL_GPL(swp_swap_info);
34963607
34973608 struct swap_info_struct *page_swap_info(struct page *page)
34983609 {
....@@ -3540,6 +3651,7 @@
35403651 struct page *list_page;
35413652 pgoff_t offset;
35423653 unsigned char count;
3654
+ int ret = 0;
35433655
35443656 /*
35453657 * When debugging, it's easier to use __GFP_ZERO here; but it's better
....@@ -3547,15 +3659,15 @@
35473659 */
35483660 page = alloc_page(gfp_mask | __GFP_HIGHMEM);
35493661
3550
- si = swap_info_get(entry);
3662
+ si = get_swap_device(entry);
35513663 if (!si) {
35523664 /*
35533665 * An acceptable race has occurred since the failing
3554
- * __swap_duplicate(): the swap entry has been freed,
3555
- * perhaps even the whole swap_map cleared for swapoff.
3666
+ * __swap_duplicate(): the swap device may have been swapped off
35563667 */
35573668 goto outer;
35583669 }
3670
+ spin_lock(&si->lock);
35593671
35603672 offset = swp_offset(entry);
35613673
....@@ -3573,9 +3685,8 @@
35733685 }
35743686
35753687 if (!page) {
3576
- unlock_cluster(ci);
3577
- spin_unlock(&si->lock);
3578
- return -ENOMEM;
3688
+ ret = -ENOMEM;
3689
+ goto out;
35793690 }
35803691
35813692 /*
....@@ -3627,10 +3738,11 @@
36273738 out:
36283739 unlock_cluster(ci);
36293740 spin_unlock(&si->lock);
3741
+ put_swap_device(si);
36303742 outer:
36313743 if (page)
36323744 __free_page(page);
3633
- return 0;
3745
+ return ret;
36343746 }
36353747
36363748 /*
....@@ -3658,7 +3770,7 @@
36583770
36593771 spin_lock(&si->cont_lock);
36603772 offset &= ~PAGE_MASK;
3661
- page = list_entry(head->lru.next, struct page, lru);
3773
+ page = list_next_entry(head, lru);
36623774 map = kmap_atomic(page) + offset;
36633775
36643776 if (count == SWAP_MAP_MAX) /* initial increment from swap_map */
....@@ -3670,13 +3782,13 @@
36703782 */
36713783 while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
36723784 kunmap_atomic(map);
3673
- page = list_entry(page->lru.next, struct page, lru);
3785
+ page = list_next_entry(page, lru);
36743786 BUG_ON(page == head);
36753787 map = kmap_atomic(page) + offset;
36763788 }
36773789 if (*map == SWAP_CONT_MAX) {
36783790 kunmap_atomic(map);
3679
- page = list_entry(page->lru.next, struct page, lru);
3791
+ page = list_next_entry(page, lru);
36803792 if (page == head) {
36813793 ret = false; /* add count continuation */
36823794 goto out;
....@@ -3686,12 +3798,10 @@
36863798 }
36873799 *map += 1;
36883800 kunmap_atomic(map);
3689
- page = list_entry(page->lru.prev, struct page, lru);
3690
- while (page != head) {
3801
+ while ((page = list_prev_entry(page, lru)) != head) {
36913802 map = kmap_atomic(page) + offset;
36923803 *map = COUNT_CONTINUED;
36933804 kunmap_atomic(map);
3694
- page = list_entry(page->lru.prev, struct page, lru);
36953805 }
36963806 ret = true; /* incremented */
36973807
....@@ -3702,7 +3812,7 @@
37023812 BUG_ON(count != COUNT_CONTINUED);
37033813 while (*map == COUNT_CONTINUED) {
37043814 kunmap_atomic(map);
3705
- page = list_entry(page->lru.next, struct page, lru);
3815
+ page = list_next_entry(page, lru);
37063816 BUG_ON(page == head);
37073817 map = kmap_atomic(page) + offset;
37083818 }
....@@ -3711,13 +3821,11 @@
37113821 if (*map == 0)
37123822 count = 0;
37133823 kunmap_atomic(map);
3714
- page = list_entry(page->lru.prev, struct page, lru);
3715
- while (page != head) {
3824
+ while ((page = list_prev_entry(page, lru)) != head) {
37163825 map = kmap_atomic(page) + offset;
37173826 *map = SWAP_CONT_MAX | count;
37183827 count = COUNT_CONTINUED;
37193828 kunmap_atomic(map);
3720
- page = list_entry(page->lru.prev, struct page, lru);
37213829 }
37223830 ret = count == COUNT_CONTINUED;
37233831 }
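The continuation-page walks switch from open-coded list_entry(page->lru.next, ...) hops to list_next_entry()/list_prev_entry(), which name the intent directly: the neighbouring element of the same type on the same list. A small hedged sketch of the two helpers on an illustrative demo_item type:

#include <linux/list.h>

struct demo_item {
	struct list_head lru;
	int value;
};

/* Return the value of the element that follows @item on its list. */
static int demo_peek_next(struct demo_item *item)
{
	return list_next_entry(item, lru)->value;
}

/* Walk backwards from @item until @head is reached again, clearing values. */
static void demo_clear_backwards(struct demo_item *item, struct demo_item *head)
{
	while ((item = list_prev_entry(item, lru)) != head)
		item->value = 0;
}

The while form in demo_clear_backwards() is the same shape the hunks above adopt for the backward COUNT_CONTINUED fix-ups.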
....@@ -3749,11 +3857,13 @@
37493857 }
37503858
37513859 #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
3752
-void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node,
3753
- gfp_t gfp_mask)
3860
+void __cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
37543861 {
37553862 struct swap_info_struct *si, *next;
3756
- if (!(gfp_mask & __GFP_IO) || !memcg)
3863
+ int nid = page_to_nid(page);
3864
+ bool skip = false;
3865
+
3866
+ if (!(gfp_mask & __GFP_IO))
37573867 return;
37583868
37593869 if (!blk_cgroup_congested())
....@@ -3766,12 +3876,15 @@
37663876 if (current->throttle_queue)
37673877 return;
37683878
3879
+ trace_android_vh___cgroup_throttle_swaprate(nid, &skip);
3880
+ if (skip)
3881
+ return;
3882
+
37693883 spin_lock(&swap_avail_lock);
3770
- plist_for_each_entry_safe(si, next, &swap_avail_heads[node],
3771
- avail_lists[node]) {
3884
+ plist_for_each_entry_safe(si, next, &swap_avail_heads[nid],
3885
+ avail_lists[nid]) {
37723886 if (si->bdev) {
3773
- blkcg_schedule_throttle(bdev_get_queue(si->bdev),
3774
- true);
3887
+ blkcg_schedule_throttle(bdev_get_queue(si->bdev), true);
37753888 break;
37763889 }
37773890 }
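__cgroup_throttle_swaprate() now derives the NUMA node from the page itself and walks that node's swap_avail_heads plist until it finds a block-backed device whose queue can be throttled. A hedged sketch of that priority-ordered walk; plist_for_each_entry_safe() is the real API, while demo_throttle_dev and its fields are illustrative stand-ins for swap_info_struct.

#include <linux/plist.h>

struct demo_throttle_dev {
	struct plist_node avail_node;	/* stand-in for avail_lists[nid] */
	void *bdev;			/* non-NULL for block-backed swap */
};

/* Pick the highest-priority block-backed device on one node's avail list. */
static struct demo_throttle_dev *demo_pick_bdev(struct plist_head *avail_head)
{
	struct demo_throttle_dev *si, *next;

	plist_for_each_entry_safe(si, next, avail_head, avail_node) {
		if (si->bdev)
			return si;	/* plists are priority ordered: first hit wins */
	}
	return NULL;
}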