2023-12-11 072de836f53be56a70cecf70b43ae43b7ce17376
kernel/mm/swapfile.c
....@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0-only
12 /*
23 * linux/mm/swapfile.c
34 *
....@@ -39,10 +40,10 @@
3940 #include <linux/swap_slots.h>
4041 #include <linux/sort.h>
4142
42
-#include <asm/pgtable.h>
4343 #include <asm/tlbflush.h>
4444 #include <linux/swapops.h>
4545 #include <linux/swap_cgroup.h>
46
+#include <trace/hooks/mm.h>
4647
4748 static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
4849 unsigned char);
....@@ -98,7 +99,7 @@
9899
99100 atomic_t nr_rotate_swap = ATOMIC_INIT(0);
100101
101
-static struct swap_info_struct *swap_type_to_swap_info(int type)
102
+struct swap_info_struct *swap_type_to_swap_info(int type)
102103 {
103104 if (type >= READ_ONCE(nr_swapfiles))
104105 return NULL;
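
The now-exported swap_type_to_swap_info() (continued in the next hunk) guards the swap_info[] dereference with the nr_swapfiles check above and the smp_rmb() shown just below, which pairs with the smp_wmb() in alloc_swap_info(). As a rough userspace analogy only, not kernel code, the same publish/consume ordering can be expressed with C11 release/acquire atomics; every name here is illustrative:

#include <stdatomic.h>
#include <stdio.h>

#define MAX_SLOTS 8

static int *slots[MAX_SLOTS];
static atomic_int nr_slots;

static void publish_slot(int *p)                /* writer side */
{
        int idx = atomic_load_explicit(&nr_slots, memory_order_relaxed);

        slots[idx] = p;                         /* initialise the slot ...  */
        atomic_store_explicit(&nr_slots, idx + 1,
                              memory_order_release); /* ... then publish it */
}

static int *lookup_slot(int type)               /* reader side */
{
        /* acquire pairs with the release above, like smp_rmb()/smp_wmb() */
        if (type >= atomic_load_explicit(&nr_slots, memory_order_acquire))
                return NULL;
        return slots[type];
}

int main(void)
{
        static int payload = 42;

        publish_slot(&payload);
        printf("slot 0: %d\n", *lookup_slot(0));
        printf("slot 1: %s\n", lookup_slot(1) ? "present" : "NULL");
        return 0;
}

In the kernel the reader side additionally uses READ_ONCE() on the array slot; the analogy leans on the acquire load for the same effect.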
....@@ -106,36 +107,62 @@
106107 smp_rmb(); /* Pairs with smp_wmb in alloc_swap_info. */
107108 return READ_ONCE(swap_info[type]);
108109 }
110
+EXPORT_SYMBOL_GPL(swap_type_to_swap_info);
109111
110112 static inline unsigned char swap_count(unsigned char ent)
111113 {
112114 return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */
113115 }
114116
117
+/* Reclaim the swap entry anyway if possible */
118
+#define TTRS_ANYWAY 0x1
119
+/*
120
+ * Reclaim the swap entry if there are no more mappings of the
121
+ * corresponding page
122
+ */
123
+#define TTRS_UNMAPPED 0x2
124
+/* Reclaim the swap entry if swap is getting full*/
125
+#define TTRS_FULL 0x4
126
+
115127 /* returns 1 if swap entry is freed */
116
-static int
117
-__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
128
+static int __try_to_reclaim_swap(struct swap_info_struct *si,
129
+ unsigned long offset, unsigned long flags)
118130 {
119131 swp_entry_t entry = swp_entry(si->type, offset);
120132 struct page *page;
121133 int ret = 0;
122134
123
- page = find_get_page(swap_address_space(entry), swp_offset(entry));
135
+ page = find_get_page(swap_address_space(entry), offset);
124136 if (!page)
125137 return 0;
126138 /*
127
- * This function is called from scan_swap_map() and it's called
128
- * by vmscan.c at reclaiming pages. So, we hold a lock on a page, here.
129
- * We have to use trylock for avoiding deadlock. This is a special
139
+ * When this function is called from scan_swap_map_slots() and it's
140
+ * called by vmscan.c at reclaiming pages. So, we hold a lock on a page,
141
+ * here. We have to use trylock for avoiding deadlock. This is a special
130142 * case and you should use try_to_free_swap() with explicit lock_page()
131143 * in usual operations.
132144 */
133145 if (trylock_page(page)) {
134
- ret = try_to_free_swap(page);
146
+ if ((flags & TTRS_ANYWAY) ||
147
+ ((flags & TTRS_UNMAPPED) && !page_mapped(page)) ||
148
+ ((flags & TTRS_FULL) && mem_cgroup_swap_full(page)))
149
+ ret = try_to_free_swap(page);
135150 unlock_page(page);
136151 }
137152 put_page(page);
138153 return ret;
154
+}
155
+
156
+static inline struct swap_extent *first_se(struct swap_info_struct *sis)
157
+{
158
+ struct rb_node *rb = rb_first(&sis->swap_extent_root);
159
+ return rb_entry(rb, struct swap_extent, rb_node);
160
+}
161
+
162
+static inline struct swap_extent *next_se(struct swap_extent *se)
163
+{
164
+ struct rb_node *rb = rb_next(&se->rb_node);
165
+ return rb ? rb_entry(rb, struct swap_extent, rb_node) : NULL;
139166 }
140167
141168 /*
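
A minimal userspace sketch of the TTRS_* gating that __try_to_reclaim_swap() now applies before calling try_to_free_swap(); the helper below and its boolean arguments are stand-ins for the real page and memcg checks:

#include <stdbool.h>
#include <stdio.h>

#define TTRS_ANYWAY   0x1
#define TTRS_UNMAPPED 0x2
#define TTRS_FULL     0x4

static bool should_reclaim(unsigned long flags, bool page_mapped, bool swap_full)
{
        return (flags & TTRS_ANYWAY) ||
               ((flags & TTRS_UNMAPPED) && !page_mapped) ||
               ((flags & TTRS_FULL) && swap_full);
}

int main(void)
{
        /* free_swap_and_cache() passes TTRS_UNMAPPED | TTRS_FULL: reclaim
         * only once the page is unmapped or swap is getting full. */
        printf("%d\n", should_reclaim(TTRS_UNMAPPED | TTRS_FULL, true,  false)); /* 0 */
        printf("%d\n", should_reclaim(TTRS_UNMAPPED | TTRS_FULL, false, false)); /* 1 */
        /* scan_swap_map_slots() passes TTRS_ANYWAY: always try. */
        printf("%d\n", should_reclaim(TTRS_ANYWAY, true, false));                /* 1 */
        return 0;
}

The callers in this patch pick the policy: scan_swap_map_slots() uses TTRS_ANYWAY when it hits a stale SWAP_HAS_CACHE slot, while free_swap_and_cache() uses TTRS_UNMAPPED | TTRS_FULL so a still-mapped page is kept unless swap is nearly full.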
....@@ -150,7 +177,7 @@
150177 int err = 0;
151178
152179 /* Do not discard the swap header page! */
153
- se = &si->first_swap_extent;
180
+ se = first_se(si);
154181 start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
155182 nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
156183 if (nr_blocks) {
....@@ -161,7 +188,7 @@
161188 cond_resched();
162189 }
163190
164
- list_for_each_entry(se, &si->first_swap_extent.list, list) {
191
+ for (se = next_se(se); se; se = next_se(se)) {
165192 start_block = se->start_block << (PAGE_SHIFT - 9);
166193 nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
167194
....@@ -175,6 +202,39 @@
175202 return err; /* That will often be -EOPNOTSUPP */
176203 }
177204
205
+static struct swap_extent *
206
+offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset)
207
+{
208
+ struct swap_extent *se;
209
+ struct rb_node *rb;
210
+
211
+ rb = sis->swap_extent_root.rb_node;
212
+ while (rb) {
213
+ se = rb_entry(rb, struct swap_extent, rb_node);
214
+ if (offset < se->start_page)
215
+ rb = rb->rb_left;
216
+ else if (offset >= se->start_page + se->nr_pages)
217
+ rb = rb->rb_right;
218
+ else
219
+ return se;
220
+ }
221
+ /* It *must* be present */
222
+ BUG();
223
+}
224
+
225
+sector_t swap_page_sector(struct page *page)
226
+{
227
+ struct swap_info_struct *sis = page_swap_info(page);
228
+ struct swap_extent *se;
229
+ sector_t sector;
230
+ pgoff_t offset;
231
+
232
+ offset = __page_file_index(page);
233
+ se = offset_to_swap_extent(sis, offset);
234
+ sector = se->start_block + (offset - se->start_page);
235
+ return sector << (PAGE_SHIFT - 9);
236
+}
237
+
178238 /*
179239 * swap allocation tell device that a cluster of swap can now be discarded,
180240 * to allow the swap device to optimize its wear-levelling.
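
The rbtree walk in offset_to_swap_extent() and the arithmetic in swap_page_sector() can be sketched in plain C as follows; a sorted array plus binary search stands in for the rbtree, and the extents, PAGE_SHIFT value and offsets are invented for the demo:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12   /* 4 KiB pages, 512-byte sectors */

struct extent {
        unsigned long start_page;
        unsigned long nr_pages;
        uint64_t      start_block;
};

static const struct extent extents[] = {
        { .start_page = 0,   .nr_pages = 100, .start_block = 1000 },
        { .start_page = 100, .nr_pages = 50,  .start_block = 5000 },
};

static const struct extent *offset_to_extent(unsigned long offset)
{
        int lo = 0, hi = (int)(sizeof(extents) / sizeof(extents[0])) - 1;

        while (lo <= hi) {
                int mid = (lo + hi) / 2;
                const struct extent *se = &extents[mid];

                if (offset < se->start_page)
                        hi = mid - 1;                   /* go left  */
                else if (offset >= se->start_page + se->nr_pages)
                        lo = mid + 1;                   /* go right */
                else
                        return se;                      /* found    */
        }
        return NULL;    /* the kernel BUG()s instead: it must be present */
}

int main(void)
{
        unsigned long offset = 120;
        const struct extent *se = offset_to_extent(offset);

        if (!se)
                return 1;
        printf("page offset %lu -> sector %llu\n", offset,
               (unsigned long long)((se->start_block +
                        (offset - se->start_page)) << (PAGE_SHIFT - 9)));
        return 0;
}

The comparisons mirror the kernel walk (descend left below the extent, right at or past its end, otherwise hit), and the sector is start_block plus the offset into the extent, scaled from pages to 512-byte sectors.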
....@@ -182,32 +242,25 @@
182242 static void discard_swap_cluster(struct swap_info_struct *si,
183243 pgoff_t start_page, pgoff_t nr_pages)
184244 {
185
- struct swap_extent *se = si->curr_swap_extent;
186
- int found_extent = 0;
245
+ struct swap_extent *se = offset_to_swap_extent(si, start_page);
187246
188247 while (nr_pages) {
189
- if (se->start_page <= start_page &&
190
- start_page < se->start_page + se->nr_pages) {
191
- pgoff_t offset = start_page - se->start_page;
192
- sector_t start_block = se->start_block + offset;
193
- sector_t nr_blocks = se->nr_pages - offset;
248
+ pgoff_t offset = start_page - se->start_page;
249
+ sector_t start_block = se->start_block + offset;
250
+ sector_t nr_blocks = se->nr_pages - offset;
194251
195
- if (nr_blocks > nr_pages)
196
- nr_blocks = nr_pages;
197
- start_page += nr_blocks;
198
- nr_pages -= nr_blocks;
252
+ if (nr_blocks > nr_pages)
253
+ nr_blocks = nr_pages;
254
+ start_page += nr_blocks;
255
+ nr_pages -= nr_blocks;
199256
200
- if (!found_extent++)
201
- si->curr_swap_extent = se;
257
+ start_block <<= PAGE_SHIFT - 9;
258
+ nr_blocks <<= PAGE_SHIFT - 9;
259
+ if (blkdev_issue_discard(si->bdev, start_block,
260
+ nr_blocks, GFP_NOIO, 0))
261
+ break;
202262
203
- start_block <<= PAGE_SHIFT - 9;
204
- nr_blocks <<= PAGE_SHIFT - 9;
205
- if (blkdev_issue_discard(si->bdev, start_block,
206
- nr_blocks, GFP_NOIO, 0))
207
- break;
208
- }
209
-
210
- se = list_next_entry(se, list);
263
+ se = next_se(se);
211264 }
212265 }
213266
....@@ -562,7 +615,6 @@
562615 {
563616 struct percpu_cluster *cluster;
564617 struct swap_cluster_info *ci;
565
- bool found_free;
566618 unsigned long tmp, max;
567619
568620 new_cluster:
....@@ -575,16 +627,16 @@
575627 } else if (!cluster_list_empty(&si->discard_clusters)) {
576628 /*
577629 * we don't have free cluster but have some clusters in
578
- * discarding, do discard now and reclaim them
630
+ * discarding, do discard now and reclaim them, then
631
+ * reread cluster_next_cpu since we dropped si->lock
579632 */
580633 swap_do_scheduled_discard(si);
581
- *scan_base = *offset = si->cluster_next;
634
+ *scan_base = this_cpu_read(*si->cluster_next_cpu);
635
+ *offset = *scan_base;
582636 goto new_cluster;
583637 } else
584638 return false;
585639 }
586
-
587
- found_free = false;
588640
589641 /*
590642 * Other CPUs can use our cluster if they can't find a free cluster,
....@@ -593,27 +645,23 @@
593645 tmp = cluster->next;
594646 max = min_t(unsigned long, si->max,
595647 (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER);
596
- if (tmp >= max) {
597
- cluster_set_null(&cluster->index);
598
- goto new_cluster;
599
- }
600
- ci = lock_cluster(si, tmp);
601
- while (tmp < max) {
602
- if (!si->swap_map[tmp]) {
603
- found_free = true;
604
- break;
648
+ if (tmp < max) {
649
+ ci = lock_cluster(si, tmp);
650
+ while (tmp < max) {
651
+ if (!si->swap_map[tmp])
652
+ break;
653
+ tmp++;
605654 }
606
- tmp++;
655
+ unlock_cluster(ci);
607656 }
608
- unlock_cluster(ci);
609
- if (!found_free) {
657
+ if (tmp >= max) {
610658 cluster_set_null(&cluster->index);
611659 goto new_cluster;
612660 }
613661 cluster->next = tmp + 1;
614662 *offset = tmp;
615663 *scan_base = tmp;
616
- return found_free;
664
+ return true;
617665 }
618666
619667 static void __del_from_avail_list(struct swap_info_struct *p)
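
The reworked tail of scan_swap_map_try_ssd_cluster() can be modelled like this; a userspace sketch in which the swap_map contents are invented and the cluster locking survives only as comments:

#include <stdbool.h>
#include <stdio.h>

static unsigned char swap_map[16] = {
        1, 1, 1, 0, 1, 0, 0, 1,  1, 1, 1, 1, 1, 1, 1, 1,
};

static bool scan_cluster(unsigned long tmp, unsigned long max,
                         unsigned long *offset)
{
        if (tmp < max) {
                /* kernel: lock_cluster(si, tmp) held around this walk */
                while (tmp < max) {
                        if (!swap_map[tmp])
                                break;
                        tmp++;
                }
                /* kernel: unlock_cluster(ci) */
        }
        if (tmp >= max)
                return false;   /* kernel: cluster_set_null() + goto new_cluster */
        *offset = tmp;
        return true;
}

int main(void)
{
        unsigned long offset;

        if (scan_cluster(0, 8, &offset))
                printf("free slot at %lu\n", offset);   /* prints 3 */
        if (!scan_cluster(8, 16, &offset))
                printf("cluster exhausted, pick a new cluster\n");
        return 0;
}

Falling out with tmp >= max now carries the same information the old found_free flag did; in the kernel that case drops the per-cpu cluster and jumps back to new_cluster rather than returning.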
....@@ -639,7 +687,7 @@
639687 if (offset == si->lowest_bit)
640688 si->lowest_bit += nr_entries;
641689 if (end == si->highest_bit)
642
- si->highest_bit -= nr_entries;
690
+ WRITE_ONCE(si->highest_bit, si->highest_bit - nr_entries);
643691 si->inuse_pages += nr_entries;
644692 if (si->inuse_pages == si->pages) {
645693 si->lowest_bit = si->max;
....@@ -663,19 +711,23 @@
663711 static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
664712 unsigned int nr_entries)
665713 {
714
+ unsigned long begin = offset;
666715 unsigned long end = offset + nr_entries - 1;
667716 void (*swap_slot_free_notify)(struct block_device *, unsigned long);
717
+ bool skip = false;
668718
669719 if (offset < si->lowest_bit)
670720 si->lowest_bit = offset;
671721 if (end > si->highest_bit) {
672722 bool was_full = !si->highest_bit;
673723
674
- si->highest_bit = end;
724
+ WRITE_ONCE(si->highest_bit, end);
675725 if (was_full && (si->flags & SWP_WRITEOK))
676726 add_to_avail_list(si);
677727 }
678
- atomic_long_add(nr_entries, &nr_swap_pages);
728
+ trace_android_vh_account_swap_pages(si, &skip);
729
+ if (!skip)
730
+ atomic_long_add(nr_entries, &nr_swap_pages);
679731 si->inuse_pages -= nr_entries;
680732 if (si->flags & SWP_BLKDEV)
681733 swap_slot_free_notify =
....@@ -683,14 +735,44 @@
683735 else
684736 swap_slot_free_notify = NULL;
685737 while (offset <= end) {
738
+ arch_swap_invalidate_page(si->type, offset);
686739 frontswap_invalidate_page(si->type, offset);
687740 if (swap_slot_free_notify)
688741 swap_slot_free_notify(si->bdev, offset);
689742 offset++;
690743 }
744
+ clear_shadow_from_swap_cache(si->type, begin, end);
691745 }
692746
693
-static int scan_swap_map_slots(struct swap_info_struct *si,
747
+static void set_cluster_next(struct swap_info_struct *si, unsigned long next)
748
+{
749
+ unsigned long prev;
750
+
751
+ if (!(si->flags & SWP_SOLIDSTATE)) {
752
+ si->cluster_next = next;
753
+ return;
754
+ }
755
+
756
+ prev = this_cpu_read(*si->cluster_next_cpu);
757
+ /*
758
+ * Cross the swap address space size aligned trunk, choose
759
+ * another trunk randomly to avoid lock contention on swap
760
+ * address space if possible.
761
+ */
762
+ if ((prev >> SWAP_ADDRESS_SPACE_SHIFT) !=
763
+ (next >> SWAP_ADDRESS_SPACE_SHIFT)) {
764
+ /* No free swap slots available */
765
+ if (si->highest_bit <= si->lowest_bit)
766
+ return;
767
+ next = si->lowest_bit +
768
+ prandom_u32_max(si->highest_bit - si->lowest_bit + 1);
769
+ next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES);
770
+ next = max_t(unsigned int, next, si->lowest_bit);
771
+ }
772
+ this_cpu_write(*si->cluster_next_cpu, next);
773
+}
774
+
775
+int scan_swap_map_slots(struct swap_info_struct *si,
694776 unsigned char usage, int nr,
695777 swp_entry_t slots[])
696778 {
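
The trunk-hopping in the set_cluster_next() helper added in this hunk is mostly arithmetic. A hedged userspace sketch; the SWAP_ADDRESS_SPACE_* values are illustrative and rand() stands in for prandom_u32_max():

#include <stdio.h>
#include <stdlib.h>

#define SWAP_ADDRESS_SPACE_SHIFT 14
#define SWAP_ADDRESS_SPACE_PAGES (1UL << SWAP_ADDRESS_SPACE_SHIFT)
#define ALIGN_DOWN(x, a)         ((x) & ~((a) - 1))

static unsigned long pick_next(unsigned long prev, unsigned long next,
                               unsigned long lowest, unsigned long highest)
{
        if ((prev >> SWAP_ADDRESS_SPACE_SHIFT) !=
            (next >> SWAP_ADDRESS_SPACE_SHIFT)) {
                if (highest <= lowest)
                        return prev;    /* no free slots: keep the old value */
                next = lowest + rand() % (highest - lowest + 1);
                next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES);
                if (next < lowest)      /* kernel: max_t(..., lowest_bit) */
                        next = lowest;
        }
        return next;
}

int main(void)
{
        srand(1);
        printf("same trunk:    %lu\n", pick_next(100, 101, 1, 1 << 20));
        printf("crossed trunk: %lu\n", pick_next(16383, 16384, 1, 1 << 20));
        return 0;
}

Keeping each CPU's next offset inside one SWAP_ADDRESS_SPACE_PAGES trunk means different CPUs tend to land in different swap-cache address spaces, which is the lock-contention point the comment in set_cluster_next() mentions.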
....@@ -700,9 +782,7 @@
700782 unsigned long last_in_cluster = 0;
701783 int latency_ration = LATENCY_LIMIT;
702784 int n_ret = 0;
703
-
704
- if (nr > SWAP_BATCH)
705
- nr = SWAP_BATCH;
785
+ bool scanned_many = false;
706786
707787 /*
708788 * We try to cluster swap pages by allocating them sequentially
....@@ -716,17 +796,22 @@
716796 */
717797
718798 si->flags += SWP_SCANNING;
719
- scan_base = offset = si->cluster_next;
799
+ /*
800
+ * Use percpu scan base for SSD to reduce lock contention on
801
+ * cluster and swap cache. For HDD, sequential access is more
802
+ * important.
803
+ */
804
+ if (si->flags & SWP_SOLIDSTATE)
805
+ scan_base = this_cpu_read(*si->cluster_next_cpu);
806
+ else
807
+ scan_base = si->cluster_next;
808
+ offset = scan_base;
720809
721810 /* SSD algorithm */
722811 if (si->cluster_info) {
723
- if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
724
- goto checks;
725
- else
812
+ if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
726813 goto scan;
727
- }
728
-
729
- if (unlikely(!si->cluster_nr--)) {
814
+ } else if (unlikely(!si->cluster_nr--)) {
730815 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
731816 si->cluster_nr = SWAPFILE_CLUSTER - 1;
732817 goto checks;
....@@ -789,7 +874,7 @@
789874 int swap_was_freed;
790875 unlock_cluster(ci);
791876 spin_unlock(&si->lock);
792
- swap_was_freed = __try_to_reclaim_swap(si, offset);
877
+ swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
793878 spin_lock(&si->lock);
794879 /* entry was freed successfully, try to use this again */
795880 if (swap_was_freed)
....@@ -804,12 +889,11 @@
804889 else
805890 goto done;
806891 }
807
- si->swap_map[offset] = usage;
892
+ WRITE_ONCE(si->swap_map[offset], usage);
808893 inc_cluster_info_page(si, si->cluster_info, offset);
809894 unlock_cluster(ci);
810895
811896 swap_range_alloc(si, offset, 1);
812
- si->cluster_next = offset + 1;
813897 slots[n_ret++] = swp_entry(si->type, offset);
814898
815899 /* got enough slots or reach max slots? */
....@@ -832,51 +916,69 @@
832916 if (si->cluster_info) {
833917 if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
834918 goto checks;
835
- else
836
- goto done;
837
- }
838
- /* non-ssd case */
839
- ++offset;
840
-
841
- /* non-ssd case, still more slots in cluster? */
842
- if (si->cluster_nr && !si->swap_map[offset]) {
919
+ } else if (si->cluster_nr && !si->swap_map[++offset]) {
920
+ /* non-ssd case, still more slots in cluster? */
843921 --si->cluster_nr;
844922 goto checks;
845923 }
846924
925
+ /*
926
+ * Even if there's no free clusters available (fragmented),
927
+ * try to scan a little more quickly with lock held unless we
928
+ * have scanned too many slots already.
929
+ */
930
+ if (!scanned_many) {
931
+ unsigned long scan_limit;
932
+
933
+ if (offset < scan_base)
934
+ scan_limit = scan_base;
935
+ else
936
+ scan_limit = si->highest_bit;
937
+ for (; offset <= scan_limit && --latency_ration > 0;
938
+ offset++) {
939
+ if (!si->swap_map[offset])
940
+ goto checks;
941
+ }
942
+ }
943
+
847944 done:
945
+ set_cluster_next(si, offset + 1);
848946 si->flags -= SWP_SCANNING;
849947 return n_ret;
850948
851949 scan:
852950 spin_unlock(&si->lock);
853
- while (++offset <= si->highest_bit) {
854
- if (!si->swap_map[offset]) {
951
+ while (++offset <= READ_ONCE(si->highest_bit)) {
952
+ if (data_race(!si->swap_map[offset])) {
855953 spin_lock(&si->lock);
856954 goto checks;
857955 }
858
- if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
956
+ if (vm_swap_full() &&
957
+ READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
859958 spin_lock(&si->lock);
860959 goto checks;
861960 }
862961 if (unlikely(--latency_ration < 0)) {
863962 cond_resched();
864963 latency_ration = LATENCY_LIMIT;
964
+ scanned_many = true;
865965 }
866966 }
867967 offset = si->lowest_bit;
868968 while (offset < scan_base) {
869
- if (!si->swap_map[offset]) {
969
+ if (data_race(!si->swap_map[offset])) {
870970 spin_lock(&si->lock);
871971 goto checks;
872972 }
873
- if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
973
+ if (vm_swap_full() &&
974
+ READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
874975 spin_lock(&si->lock);
875976 goto checks;
876977 }
877978 if (unlikely(--latency_ration < 0)) {
878979 cond_resched();
879980 latency_ration = LATENCY_LIMIT;
981
+ scanned_many = true;
880982 }
881983 offset++;
882984 }
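
The scan loop above now peeks at swap_map[] without si->lock (the READ_ONCE()/data_race() annotations) and only trusts the value after retaking the lock at the checks: label. A userspace sketch of that peek-then-recheck pattern, with relaxed C11 atomic loads standing in for READ_ONCE(); all names are illustrative:

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NSLOTS 8

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static _Atomic unsigned char swap_map[NSLOTS];

static bool claim_free_slot(unsigned int *out)
{
        for (unsigned int i = 0; i < NSLOTS; i++) {
                /* cheap unlocked peek, may be stale (kernel: data_race()) */
                if (atomic_load_explicit(&swap_map[i], memory_order_relaxed))
                        continue;

                pthread_mutex_lock(&lock);
                if (swap_map[i] == 0) {         /* recheck under the lock */
                        swap_map[i] = 1;
                        pthread_mutex_unlock(&lock);
                        *out = i;
                        return true;
                }
                pthread_mutex_unlock(&lock);
        }
        return false;
}

int main(void)
{
        unsigned int slot;

        swap_map[0] = 1;
        if (claim_free_slot(&slot))
                printf("claimed slot %u\n", slot);      /* prints 1 */
        return 0;
}

The unlocked read may be stale in either direction; that is acceptable because the slot is re-validated under the lock before it is actually claimed.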
....@@ -886,8 +988,9 @@
886988 si->flags -= SWP_SCANNING;
887989 return n_ret;
888990 }
991
+EXPORT_SYMBOL_GPL(scan_swap_map_slots);
889992
890
-static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
993
+int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
891994 {
892995 unsigned long idx;
893996 struct swap_cluster_info *ci;
....@@ -921,6 +1024,7 @@
9211024
9221025 return 1;
9231026 }
1027
+EXPORT_SYMBOL_GPL(swap_alloc_cluster);
9241028
9251029 static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
9261030 {
....@@ -928,6 +1032,7 @@
9281032 struct swap_cluster_info *ci;
9291033
9301034 ci = lock_cluster(si, offset);
1035
+ memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
9311036 cluster_set_count_flag(ci, 0, 0);
9321037 free_cluster(si, idx);
9331038 unlock_cluster(ci);
....@@ -960,19 +1065,17 @@
9601065 /* Only single cluster request supported */
9611066 WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER);
9621067
1068
+ spin_lock(&swap_avail_lock);
1069
+
9631070 avail_pgs = atomic_long_read(&nr_swap_pages) / size;
964
- if (avail_pgs <= 0)
1071
+ if (avail_pgs <= 0) {
1072
+ spin_unlock(&swap_avail_lock);
9651073 goto noswap;
1074
+ }
9661075
967
- if (n_goal > SWAP_BATCH)
968
- n_goal = SWAP_BATCH;
969
-
970
- if (n_goal > avail_pgs)
971
- n_goal = avail_pgs;
1076
+ n_goal = min3((long)n_goal, (long)SWAP_BATCH, avail_pgs);
9721077
9731078 atomic_long_sub(n_goal * size, &nr_swap_pages);
974
-
975
- spin_lock(&swap_avail_lock);
9761079
9771080 start_over:
9781081 node = numa_node_id();
....@@ -1041,20 +1144,22 @@
10411144 {
10421145 struct swap_info_struct *si = swap_type_to_swap_info(type);
10431146 pgoff_t offset;
1147
+ bool skip = false;
10441148
10451149 if (!si)
10461150 goto fail;
10471151
10481152 spin_lock(&si->lock);
10491153 if (si->flags & SWP_WRITEOK) {
1050
- atomic_long_dec(&nr_swap_pages);
10511154 /* This is called for allocating swap entry, not cache */
10521155 offset = scan_swap_map(si, 1);
10531156 if (offset) {
1157
+ trace_android_vh_account_swap_pages(si, &skip);
1158
+ if (!skip)
1159
+ atomic_long_dec(&nr_swap_pages);
10541160 spin_unlock(&si->lock);
10551161 return swp_entry(type, offset);
10561162 }
1057
- atomic_long_inc(&nr_swap_pages);
10581163 }
10591164 spin_unlock(&si->lock);
10601165 fail:
....@@ -1064,15 +1169,14 @@
10641169 static struct swap_info_struct *__swap_info_get(swp_entry_t entry)
10651170 {
10661171 struct swap_info_struct *p;
1067
- unsigned long offset, type;
1172
+ unsigned long offset;
10681173
10691174 if (!entry.val)
10701175 goto out;
1071
- type = swp_type(entry);
1072
- p = swap_type_to_swap_info(type);
1176
+ p = swp_swap_info(entry);
10731177 if (!p)
10741178 goto bad_nofile;
1075
- if (!(p->flags & SWP_USED))
1179
+ if (data_race(!(p->flags & SWP_USED)))
10761180 goto bad_device;
10771181 offset = swp_offset(entry);
10781182 if (offset >= p->max)
....@@ -1098,13 +1202,12 @@
10981202 p = __swap_info_get(entry);
10991203 if (!p)
11001204 goto out;
1101
- if (!p->swap_map[swp_offset(entry)])
1205
+ if (data_race(!p->swap_map[swp_offset(entry)]))
11021206 goto bad_free;
11031207 return p;
11041208
11051209 bad_free:
11061210 pr_err("swap_info_get: %s%08lx\n", Unused_offset, entry.val);
1107
- goto out;
11081211 out:
11091212 return NULL;
11101213 }
....@@ -1167,20 +1270,89 @@
11671270 }
11681271
11691272 usage = count | has_cache;
1170
- p->swap_map[offset] = usage ? : SWAP_HAS_CACHE;
1273
+ if (usage)
1274
+ WRITE_ONCE(p->swap_map[offset], usage);
1275
+ else
1276
+ WRITE_ONCE(p->swap_map[offset], SWAP_HAS_CACHE);
11711277
11721278 return usage;
11731279 }
11741280
1281
+/*
1282
+ * Check whether swap entry is valid in the swap device. If so,
1283
+ * return pointer to swap_info_struct, and keep the swap entry valid
1284
+ * via preventing the swap device from being swapoff, until
1285
+ * put_swap_device() is called. Otherwise return NULL.
1286
+ *
1287
+ * The entirety of the RCU read critical section must come before the
1288
+ * return from or after the call to synchronize_rcu() in
1289
+ * enable_swap_info() or swapoff(). So if "si->flags & SWP_VALID" is
1290
+ * true, the si->map, si->cluster_info, etc. must be valid in the
1291
+ * critical section.
1292
+ *
1293
+ * Notice that swapoff or swapoff+swapon can still happen before the
1294
+ * rcu_read_lock() in get_swap_device() or after the rcu_read_unlock()
1295
+ * in put_swap_device() if there isn't any other way to prevent
1296
+ * swapoff, such as page lock, page table lock, etc. The caller must
1297
+ * be prepared for that. For example, the following situation is
1298
+ * possible.
1299
+ *
1300
+ * CPU1 CPU2
1301
+ * do_swap_page()
1302
+ * ... swapoff+swapon
1303
+ * __read_swap_cache_async()
1304
+ * swapcache_prepare()
1305
+ * __swap_duplicate()
1306
+ * // check swap_map
1307
+ * // verify PTE not changed
1308
+ *
1309
+ * In __swap_duplicate(), the swap_map need to be checked before
1310
+ * changing partly because the specified swap entry may be for another
1311
+ * swap device which has been swapoff. And in do_swap_page(), after
1312
+ * the page is read from the swap device, the PTE is verified not
1313
+ * changed with the page table locked to check whether the swap device
1314
+ * has been swapoff or swapoff+swapon.
1315
+ */
1316
+struct swap_info_struct *get_swap_device(swp_entry_t entry)
1317
+{
1318
+ struct swap_info_struct *si;
1319
+ unsigned long offset;
1320
+
1321
+ if (!entry.val)
1322
+ goto out;
1323
+ si = swp_swap_info(entry);
1324
+ if (!si)
1325
+ goto bad_nofile;
1326
+
1327
+ rcu_read_lock();
1328
+ if (data_race(!(si->flags & SWP_VALID)))
1329
+ goto unlock_out;
1330
+ offset = swp_offset(entry);
1331
+ if (offset >= si->max)
1332
+ goto unlock_out;
1333
+
1334
+ return si;
1335
+bad_nofile:
1336
+ pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
1337
+out:
1338
+ return NULL;
1339
+unlock_out:
1340
+ rcu_read_unlock();
1341
+ return NULL;
1342
+}
1343
+
11751344 static unsigned char __swap_entry_free(struct swap_info_struct *p,
1176
- swp_entry_t entry, unsigned char usage)
1345
+ swp_entry_t entry)
11771346 {
11781347 struct swap_cluster_info *ci;
11791348 unsigned long offset = swp_offset(entry);
1349
+ unsigned char usage;
11801350
11811351 ci = lock_cluster_or_swap_info(p, offset);
1182
- usage = __swap_entry_free_locked(p, offset, usage);
1352
+ usage = __swap_entry_free_locked(p, offset, 1);
11831353 unlock_cluster_or_swap_info(p, ci);
1354
+ if (!usage)
1355
+ free_swap_slot(entry);
11841356
11851357 return usage;
11861358 }
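
For reference, the swap_map byte that __swap_entry_free_locked() now stores back with WRITE_ONCE() packs a reference count together with SWAP_HAS_CACHE (0x40 in include/linux/swap.h). A simplified sketch that ignores count continuation and the SWAP_MAP_SHMEM special case:

#include <stdio.h>

#define SWAP_HAS_CACHE 0x40     /* value from include/linux/swap.h */

static unsigned char swap_count(unsigned char ent)
{
        return ent & ~SWAP_HAS_CACHE;   /* may include COUNT_CONTINUED */
}

int main(void)
{
        unsigned char ent = 2 | SWAP_HAS_CACHE; /* two map users + swap cache */

        printf("count=%u has_cache=%d\n",
               swap_count(ent), !!(ent & SWAP_HAS_CACHE));

        /* __swap_entry_free(p, entry) drops one map reference: */
        unsigned char usage = (swap_count(ent) - 1) | (ent & SWAP_HAS_CACHE);

        /* usage is non-zero, so the kernel stores it back with WRITE_ONCE() */
        printf("after free: count=%u has_cache=%d\n",
               swap_count(usage), !!(usage & SWAP_HAS_CACHE));
        return 0;
}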
....@@ -1211,10 +1383,8 @@
12111383 struct swap_info_struct *p;
12121384
12131385 p = _swap_info_get(entry);
1214
- if (p) {
1215
- if (!__swap_entry_free(p, entry, 1))
1216
- free_swap_slot(entry);
1217
- }
1386
+ if (p)
1387
+ __swap_entry_free(p, entry);
12181388 }
12191389
12201390 /*
....@@ -1229,7 +1399,7 @@
12291399 unsigned char *map;
12301400 unsigned int i, free_entries = 0;
12311401 unsigned char val;
1232
- int size = swap_entry_size(hpage_nr_pages(page));
1402
+ int size = swap_entry_size(thp_nr_pages(page));
12331403
12341404 si = _swap_info_get(entry);
12351405 if (!si)
....@@ -1249,9 +1419,6 @@
12491419 if (free_entries == SWAPFILE_CLUSTER) {
12501420 unlock_cluster_or_swap_info(si, ci);
12511421 spin_lock(&si->lock);
1252
- ci = lock_cluster(si, offset);
1253
- memset(map, 0, SWAPFILE_CLUSTER);
1254
- unlock_cluster(ci);
12551422 mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
12561423 swap_free_cluster(si, idx);
12571424 spin_unlock(&si->lock);
....@@ -1321,6 +1488,7 @@
13211488 if (p)
13221489 spin_unlock(&p->lock);
13231490 }
1491
+EXPORT_SYMBOL_GPL(swapcache_free_entries);
13241492
13251493 /*
13261494 * How many references to page are currently swapped out?
....@@ -1346,11 +1514,18 @@
13461514 return count;
13471515 }
13481516
1349
-int __swap_count(struct swap_info_struct *si, swp_entry_t entry)
1517
+int __swap_count(swp_entry_t entry)
13501518 {
1519
+ struct swap_info_struct *si;
13511520 pgoff_t offset = swp_offset(entry);
1521
+ int count = 0;
13521522
1353
- return swap_count(si->swap_map[offset]);
1523
+ si = get_swap_device(entry);
1524
+ if (si) {
1525
+ count = swap_count(si->swap_map[offset]);
1526
+ put_swap_device(si);
1527
+ }
1528
+ return count;
13541529 }
13551530
13561531 static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
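
__swap_count() above is the simplest in-tree user of the new get_swap_device()/put_swap_device() pair. Below is a heavily mocked userspace sketch of that calling pattern; the structure, the SWP_VALID bit value and the offset-based signature are stand-ins, and the RCU protection survives only as comments:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define SWP_VALID 0x1           /* stand-in flag, not the kernel's value */

struct swap_info {
        unsigned long flags;
        unsigned long max;
        unsigned char swap_map[128];
};

static struct swap_info si_mock = { .flags = SWP_VALID, .max = 128 };

static struct swap_info *get_swap_device(unsigned long offset)
{
        struct swap_info *si = &si_mock;        /* kernel: swp_swap_info()  */

        /* kernel: rcu_read_lock() here                                     */
        if (!(si->flags & SWP_VALID) || offset >= si->max)
                return NULL;    /* kernel unlocks RCU before returning NULL */
        return si;
}

static void put_swap_device(struct swap_info *si)
{
        (void)si;                               /* kernel: rcu_read_unlock() */
}

static int count_at(unsigned long offset)
{
        struct swap_info *si = get_swap_device(offset);
        int count = 0;

        if (si) {
                count = si->swap_map[offset];   /* arrays cannot vanish here */
                put_swap_device(si);
        }
        return count;
}

int main(void)
{
        si_mock.swap_map[5] = 2;
        printf("count at offset 5: %d\n", count_at(5));
        printf("count at offset 500: %d\n", count_at(500));
        return 0;
}

The pattern only guarantees that si and its arrays cannot be freed by swapoff between the two calls; it does not pin the individual entry, which still needs the checks described in the get_swap_device() comment.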
....@@ -1375,9 +1550,11 @@
13751550 int count = 0;
13761551 struct swap_info_struct *si;
13771552
1378
- si = __swap_info_get(entry);
1379
- if (si)
1553
+ si = get_swap_device(entry);
1554
+ if (si) {
13801555 count = swap_swapcount(si, entry);
1556
+ put_swap_device(si);
1557
+ }
13811558 return count;
13821559 }
13831560
....@@ -1624,7 +1801,6 @@
16241801 int free_swap_and_cache(swp_entry_t entry)
16251802 {
16261803 struct swap_info_struct *p;
1627
- struct page *page = NULL;
16281804 unsigned char count;
16291805
16301806 if (non_swap_entry(entry))
....@@ -1632,32 +1808,11 @@
16321808
16331809 p = _swap_info_get(entry);
16341810 if (p) {
1635
- count = __swap_entry_free(p, entry, 1);
1811
+ count = __swap_entry_free(p, entry);
16361812 if (count == SWAP_HAS_CACHE &&
1637
- !swap_page_trans_huge_swapped(p, entry)) {
1638
- page = find_get_page(swap_address_space(entry),
1639
- swp_offset(entry));
1640
- if (page && !trylock_page(page)) {
1641
- put_page(page);
1642
- page = NULL;
1643
- }
1644
- } else if (!count)
1645
- free_swap_slot(entry);
1646
- }
1647
- if (page) {
1648
- /*
1649
- * Not mapped elsewhere, or swap space full? Free it!
1650
- * Also recheck PageSwapCache now page is locked (above).
1651
- */
1652
- if (PageSwapCache(page) && !PageWriteback(page) &&
1653
- (!page_mapped(page) || mem_cgroup_swap_full(page)) &&
1654
- !swap_page_trans_huge_swapped(p, entry)) {
1655
- page = compound_head(page);
1656
- delete_from_swap_cache(page);
1657
- SetPageDirty(page);
1658
- }
1659
- unlock_page(page);
1660
- put_page(page);
1813
+ !swap_page_trans_huge_swapped(p, entry))
1814
+ __try_to_reclaim_swap(p, swp_offset(entry),
1815
+ TTRS_UNMAPPED | TTRS_FULL);
16611816 }
16621817 return p != NULL;
16631818 }
....@@ -1671,13 +1826,12 @@
16711826 *
16721827 * This is needed for the suspend to disk (aka swsusp).
16731828 */
1674
-int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
1829
+int swap_type_of(dev_t device, sector_t offset)
16751830 {
1676
- struct block_device *bdev = NULL;
16771831 int type;
16781832
1679
- if (device)
1680
- bdev = bdget(device);
1833
+ if (!device)
1834
+ return -1;
16811835
16821836 spin_lock(&swap_lock);
16831837 for (type = 0; type < nr_swapfiles; type++) {
....@@ -1686,30 +1840,34 @@
16861840 if (!(sis->flags & SWP_WRITEOK))
16871841 continue;
16881842
1689
- if (!bdev) {
1690
- if (bdev_p)
1691
- *bdev_p = bdgrab(sis->bdev);
1692
-
1693
- spin_unlock(&swap_lock);
1694
- return type;
1695
- }
1696
- if (bdev == sis->bdev) {
1697
- struct swap_extent *se = &sis->first_swap_extent;
1843
+ if (device == sis->bdev->bd_dev) {
1844
+ struct swap_extent *se = first_se(sis);
16981845
16991846 if (se->start_block == offset) {
1700
- if (bdev_p)
1701
- *bdev_p = bdgrab(sis->bdev);
1702
-
17031847 spin_unlock(&swap_lock);
1704
- bdput(bdev);
17051848 return type;
17061849 }
17071850 }
17081851 }
17091852 spin_unlock(&swap_lock);
1710
- if (bdev)
1711
- bdput(bdev);
1853
+ return -ENODEV;
1854
+}
17121855
1856
+int find_first_swap(dev_t *device)
1857
+{
1858
+ int type;
1859
+
1860
+ spin_lock(&swap_lock);
1861
+ for (type = 0; type < nr_swapfiles; type++) {
1862
+ struct swap_info_struct *sis = swap_info[type];
1863
+
1864
+ if (!(sis->flags & SWP_WRITEOK))
1865
+ continue;
1866
+ *device = sis->bdev->bd_dev;
1867
+ spin_unlock(&swap_lock);
1868
+ return type;
1869
+ }
1870
+ spin_unlock(&swap_lock);
17131871 return -ENODEV;
17141872 }
17151873
....@@ -1756,7 +1914,7 @@
17561914
17571915 static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte)
17581916 {
1759
- return pte_same(pte_swp_clear_soft_dirty(pte), swp_pte);
1917
+ return pte_same(pte_swp_clear_flags(pte), swp_pte);
17601918 }
17611919
17621920 /*
....@@ -1768,7 +1926,6 @@
17681926 unsigned long addr, swp_entry_t entry, struct page *page)
17691927 {
17701928 struct page *swapcache;
1771
- struct mem_cgroup *memcg;
17721929 spinlock_t *ptl;
17731930 pte_t *pte;
17741931 int ret = 1;
....@@ -1778,15 +1935,8 @@
17781935 if (unlikely(!page))
17791936 return -ENOMEM;
17801937
1781
- if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL,
1782
- &memcg, false)) {
1783
- ret = -ENOMEM;
1784
- goto out_nolock;
1785
- }
1786
-
17871938 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
17881939 if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) {
1789
- mem_cgroup_cancel_charge(page, memcg, false);
17901940 ret = 0;
17911941 goto out;
17921942 }
....@@ -1798,21 +1948,13 @@
17981948 pte_mkold(mk_pte(page, vma->vm_page_prot)));
17991949 if (page == swapcache) {
18001950 page_add_anon_rmap(page, vma, addr, false);
1801
- mem_cgroup_commit_charge(page, memcg, true, false);
18021951 } else { /* ksm created a completely new copy */
18031952 page_add_new_anon_rmap(page, vma, addr, false);
1804
- mem_cgroup_commit_charge(page, memcg, false, false);
1805
- lru_cache_add_active_or_unevictable(page, vma);
1953
+ lru_cache_add_inactive_or_unevictable(page, vma);
18061954 }
18071955 swap_free(entry);
1808
- /*
1809
- * Move the page to the active list so it is not
1810
- * immediately swapped out again after swapon.
1811
- */
1812
- activate_page(page);
18131956 out:
18141957 pte_unmap_unlock(pte, ptl);
1815
-out_nolock:
18161958 if (page != swapcache) {
18171959 unlock_page(page);
18181960 put_page(page);
....@@ -1821,44 +1963,83 @@
18211963 }
18221964
18231965 static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
1824
- unsigned long addr, unsigned long end,
1825
- swp_entry_t entry, struct page *page)
1966
+ unsigned long addr, unsigned long end,
1967
+ unsigned int type, bool frontswap,
1968
+ unsigned long *fs_pages_to_unuse)
18261969 {
1827
- pte_t swp_pte = swp_entry_to_pte(entry);
1970
+ struct page *page;
1971
+ swp_entry_t entry;
18281972 pte_t *pte;
1973
+ struct swap_info_struct *si;
1974
+ unsigned long offset;
18291975 int ret = 0;
1976
+ volatile unsigned char *swap_map;
18301977
1831
- /*
1832
- * We don't actually need pte lock while scanning for swp_pte: since
1833
- * we hold page lock and mmap_sem, swp_pte cannot be inserted into the
1834
- * page table while we're scanning; though it could get zapped, and on
1835
- * some architectures (e.g. x86_32 with PAE) we might catch a glimpse
1836
- * of unmatched parts which look like swp_pte, so unuse_pte must
1837
- * recheck under pte lock. Scanning without pte lock lets it be
1838
- * preemptable whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE.
1839
- */
1978
+ si = swap_info[type];
18401979 pte = pte_offset_map(pmd, addr);
18411980 do {
1842
- /*
1843
- * swapoff spends a _lot_ of time in this loop!
1844
- * Test inline before going to call unuse_pte.
1845
- */
1846
- if (unlikely(pte_same_as_swp(*pte, swp_pte))) {
1847
- pte_unmap(pte);
1848
- ret = unuse_pte(vma, pmd, addr, entry, page);
1849
- if (ret)
1850
- goto out;
1851
- pte = pte_offset_map(pmd, addr);
1981
+ if (!is_swap_pte(*pte))
1982
+ continue;
1983
+
1984
+ entry = pte_to_swp_entry(*pte);
1985
+ if (swp_type(entry) != type)
1986
+ continue;
1987
+
1988
+ offset = swp_offset(entry);
1989
+ if (frontswap && !frontswap_test(si, offset))
1990
+ continue;
1991
+
1992
+ pte_unmap(pte);
1993
+ swap_map = &si->swap_map[offset];
1994
+ page = lookup_swap_cache(entry, vma, addr);
1995
+ if (!page) {
1996
+ struct vm_fault vmf = {
1997
+ .vma = vma,
1998
+ .address = addr,
1999
+ .pmd = pmd,
2000
+ };
2001
+
2002
+ page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
2003
+ &vmf);
18522004 }
2005
+ if (!page) {
2006
+ if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD)
2007
+ goto try_next;
2008
+ return -ENOMEM;
2009
+ }
2010
+
2011
+ lock_page(page);
2012
+ wait_on_page_writeback(page);
2013
+ ret = unuse_pte(vma, pmd, addr, entry, page);
2014
+ if (ret < 0) {
2015
+ unlock_page(page);
2016
+ put_page(page);
2017
+ goto out;
2018
+ }
2019
+
2020
+ try_to_free_swap(page);
2021
+ trace_android_vh_unuse_swap_page(si, page);
2022
+ unlock_page(page);
2023
+ put_page(page);
2024
+
2025
+ if (*fs_pages_to_unuse && !--(*fs_pages_to_unuse)) {
2026
+ ret = FRONTSWAP_PAGES_UNUSED;
2027
+ goto out;
2028
+ }
2029
+try_next:
2030
+ pte = pte_offset_map(pmd, addr);
18532031 } while (pte++, addr += PAGE_SIZE, addr != end);
18542032 pte_unmap(pte - 1);
2033
+
2034
+ ret = 0;
18552035 out:
18562036 return ret;
18572037 }
18582038
18592039 static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
18602040 unsigned long addr, unsigned long end,
1861
- swp_entry_t entry, struct page *page)
2041
+ unsigned int type, bool frontswap,
2042
+ unsigned long *fs_pages_to_unuse)
18622043 {
18632044 pmd_t *pmd;
18642045 unsigned long next;
....@@ -1870,7 +2051,8 @@
18702051 next = pmd_addr_end(addr, end);
18712052 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
18722053 continue;
1873
- ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
2054
+ ret = unuse_pte_range(vma, pmd, addr, next, type,
2055
+ frontswap, fs_pages_to_unuse);
18742056 if (ret)
18752057 return ret;
18762058 } while (pmd++, addr = next, addr != end);
....@@ -1879,7 +2061,8 @@
18792061
18802062 static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
18812063 unsigned long addr, unsigned long end,
1882
- swp_entry_t entry, struct page *page)
2064
+ unsigned int type, bool frontswap,
2065
+ unsigned long *fs_pages_to_unuse)
18832066 {
18842067 pud_t *pud;
18852068 unsigned long next;
....@@ -1890,7 +2073,8 @@
18902073 next = pud_addr_end(addr, end);
18912074 if (pud_none_or_clear_bad(pud))
18922075 continue;
1893
- ret = unuse_pmd_range(vma, pud, addr, next, entry, page);
2076
+ ret = unuse_pmd_range(vma, pud, addr, next, type,
2077
+ frontswap, fs_pages_to_unuse);
18942078 if (ret)
18952079 return ret;
18962080 } while (pud++, addr = next, addr != end);
....@@ -1899,7 +2083,8 @@
18992083
19002084 static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
19012085 unsigned long addr, unsigned long end,
1902
- swp_entry_t entry, struct page *page)
2086
+ unsigned int type, bool frontswap,
2087
+ unsigned long *fs_pages_to_unuse)
19032088 {
19042089 p4d_t *p4d;
19052090 unsigned long next;
....@@ -1910,78 +2095,66 @@
19102095 next = p4d_addr_end(addr, end);
19112096 if (p4d_none_or_clear_bad(p4d))
19122097 continue;
1913
- ret = unuse_pud_range(vma, p4d, addr, next, entry, page);
2098
+ ret = unuse_pud_range(vma, p4d, addr, next, type,
2099
+ frontswap, fs_pages_to_unuse);
19142100 if (ret)
19152101 return ret;
19162102 } while (p4d++, addr = next, addr != end);
19172103 return 0;
19182104 }
19192105
1920
-static int unuse_vma(struct vm_area_struct *vma,
1921
- swp_entry_t entry, struct page *page)
2106
+static int unuse_vma(struct vm_area_struct *vma, unsigned int type,
2107
+ bool frontswap, unsigned long *fs_pages_to_unuse)
19222108 {
19232109 pgd_t *pgd;
19242110 unsigned long addr, end, next;
19252111 int ret;
19262112
1927
- if (page_anon_vma(page)) {
1928
- addr = page_address_in_vma(page, vma);
1929
- if (addr == -EFAULT)
1930
- return 0;
1931
- else
1932
- end = addr + PAGE_SIZE;
1933
- } else {
1934
- addr = vma->vm_start;
1935
- end = vma->vm_end;
1936
- }
2113
+ addr = vma->vm_start;
2114
+ end = vma->vm_end;
19372115
19382116 pgd = pgd_offset(vma->vm_mm, addr);
19392117 do {
19402118 next = pgd_addr_end(addr, end);
19412119 if (pgd_none_or_clear_bad(pgd))
19422120 continue;
1943
- ret = unuse_p4d_range(vma, pgd, addr, next, entry, page);
2121
+ ret = unuse_p4d_range(vma, pgd, addr, next, type,
2122
+ frontswap, fs_pages_to_unuse);
19442123 if (ret)
19452124 return ret;
19462125 } while (pgd++, addr = next, addr != end);
19472126 return 0;
19482127 }
19492128
1950
-static int unuse_mm(struct mm_struct *mm,
1951
- swp_entry_t entry, struct page *page)
2129
+static int unuse_mm(struct mm_struct *mm, unsigned int type,
2130
+ bool frontswap, unsigned long *fs_pages_to_unuse)
19522131 {
19532132 struct vm_area_struct *vma;
19542133 int ret = 0;
19552134
1956
- if (!down_read_trylock(&mm->mmap_sem)) {
1957
- /*
1958
- * Activate page so shrink_inactive_list is unlikely to unmap
1959
- * its ptes while lock is dropped, so swapoff can make progress.
1960
- */
1961
- activate_page(page);
1962
- unlock_page(page);
1963
- down_read(&mm->mmap_sem);
1964
- lock_page(page);
1965
- }
2135
+ mmap_read_lock(mm);
19662136 for (vma = mm->mmap; vma; vma = vma->vm_next) {
1967
- if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
1968
- break;
2137
+ if (vma->anon_vma) {
2138
+ ret = unuse_vma(vma, type, frontswap,
2139
+ fs_pages_to_unuse);
2140
+ if (ret)
2141
+ break;
2142
+ }
19692143 cond_resched();
19702144 }
1971
- up_read(&mm->mmap_sem);
1972
- return (ret < 0)? ret: 0;
2145
+ mmap_read_unlock(mm);
2146
+ return ret;
19732147 }
19742148
19752149 /*
19762150 * Scan swap_map (or frontswap_map if frontswap parameter is true)
1977
- * from current position to next entry still in use.
1978
- * Recycle to start on reaching the end, returning 0 when empty.
2151
+ * from current position to next entry still in use. Return 0
2152
+ * if there are no inuse entries after prev till end of the map.
19792153 */
19802154 static unsigned int find_next_to_unuse(struct swap_info_struct *si,
19812155 unsigned int prev, bool frontswap)
19822156 {
1983
- unsigned int max = si->max;
1984
- unsigned int i = prev;
2157
+ unsigned int i;
19852158 unsigned char count;
19862159
19872160 /*
....@@ -1990,20 +2163,7 @@
19902163 * hits are okay, and sys_swapoff() has already prevented new
19912164 * allocations from this area (while holding swap_lock).
19922165 */
1993
- for (;;) {
1994
- if (++i >= max) {
1995
- if (!prev) {
1996
- i = 0;
1997
- break;
1998
- }
1999
- /*
2000
- * No entries in use at top of swap_map,
2001
- * loop back to start and recheck there.
2002
- */
2003
- max = prev + 1;
2004
- prev = 0;
2005
- i = 1;
2006
- }
2166
+ for (i = prev + 1; i < si->max; i++) {
20072167 count = READ_ONCE(si->swap_map[i]);
20082168 if (count && swap_count(count) != SWAP_MAP_BAD)
20092169 if (!frontswap || frontswap_test(si, i))
....@@ -2011,240 +2171,124 @@
20112171 if ((i % LATENCY_LIMIT) == 0)
20122172 cond_resched();
20132173 }
2174
+
2175
+ if (i == si->max)
2176
+ i = 0;
2177
+
20142178 return i;
20152179 }
20162180
20172181 /*
2018
- * We completely avoid races by reading each swap page in advance,
2019
- * and then search for the process using it. All the necessary
2020
- * page table adjustments can then be made atomically.
2021
- *
2022
- * if the boolean frontswap is true, only unuse pages_to_unuse pages;
2182
+ * If the boolean frontswap is true, only unuse pages_to_unuse pages;
20232183 * pages_to_unuse==0 means all pages; ignored if frontswap is false
20242184 */
20252185 int try_to_unuse(unsigned int type, bool frontswap,
20262186 unsigned long pages_to_unuse)
20272187 {
2188
+ struct mm_struct *prev_mm;
2189
+ struct mm_struct *mm;
2190
+ struct list_head *p;
2191
+ int retval = 0;
20282192 struct swap_info_struct *si = swap_info[type];
2029
- struct mm_struct *start_mm;
2030
- volatile unsigned char *swap_map; /* swap_map is accessed without
2031
- * locking. Mark it as volatile
2032
- * to prevent compiler doing
2033
- * something odd.
2034
- */
2035
- unsigned char swcount;
20362193 struct page *page;
20372194 swp_entry_t entry;
2038
- unsigned int i = 0;
2039
- int retval = 0;
2195
+ unsigned int i;
20402196
2041
- /*
2042
- * When searching mms for an entry, a good strategy is to
2043
- * start at the first mm we freed the previous entry from
2044
- * (though actually we don't notice whether we or coincidence
2045
- * freed the entry). Initialize this start_mm with a hold.
2046
- *
2047
- * A simpler strategy would be to start at the last mm we
2048
- * freed the previous entry from; but that would take less
2049
- * advantage of mmlist ordering, which clusters forked mms
2050
- * together, child after parent. If we race with dup_mmap(), we
2051
- * prefer to resolve parent before child, lest we miss entries
2052
- * duplicated after we scanned child: using last mm would invert
2053
- * that.
2054
- */
2055
- start_mm = &init_mm;
2056
- mmget(&init_mm);
2197
+ if (!READ_ONCE(si->inuse_pages))
2198
+ return 0;
20572199
2058
- /*
2059
- * Keep on scanning until all entries have gone. Usually,
2060
- * one pass through swap_map is enough, but not necessarily:
2061
- * there are races when an instance of an entry might be missed.
2062
- */
2063
- while ((i = find_next_to_unuse(si, i, frontswap)) != 0) {
2064
- if (signal_pending(current)) {
2065
- retval = -EINTR;
2066
- break;
2067
- }
2200
+ if (!frontswap)
2201
+ pages_to_unuse = 0;
20682202
2069
- /*
2070
- * Get a page for the entry, using the existing swap
2071
- * cache page if there is one. Otherwise, get a clean
2072
- * page and read the swap into it.
2073
- */
2074
- swap_map = &si->swap_map[i];
2075
- entry = swp_entry(type, i);
2076
- page = read_swap_cache_async(entry,
2077
- GFP_HIGHUSER_MOVABLE, NULL, 0, false);
2078
- if (!page) {
2079
- /*
2080
- * Either swap_duplicate() failed because entry
2081
- * has been freed independently, and will not be
2082
- * reused since sys_swapoff() already disabled
2083
- * allocation from here, or alloc_page() failed.
2084
- */
2085
- swcount = *swap_map;
2086
- /*
2087
- * We don't hold lock here, so the swap entry could be
2088
- * SWAP_MAP_BAD (when the cluster is discarding).
2089
- * Instead of fail out, We can just skip the swap
2090
- * entry because swapoff will wait for discarding
2091
- * finish anyway.
2092
- */
2093
- if (!swcount || swcount == SWAP_MAP_BAD)
2094
- continue;
2095
- retval = -ENOMEM;
2096
- break;
2097
- }
2203
+retry:
2204
+ retval = shmem_unuse(type, frontswap, &pages_to_unuse);
2205
+ if (retval)
2206
+ goto out;
20982207
2099
- /*
2100
- * Don't hold on to start_mm if it looks like exiting.
2101
- */
2102
- if (atomic_read(&start_mm->mm_users) == 1) {
2103
- mmput(start_mm);
2104
- start_mm = &init_mm;
2105
- mmget(&init_mm);
2106
- }
2208
+ prev_mm = &init_mm;
2209
+ mmget(prev_mm);
21072210
2108
- /*
2109
- * Wait for and lock page. When do_swap_page races with
2110
- * try_to_unuse, do_swap_page can handle the fault much
2111
- * faster than try_to_unuse can locate the entry. This
2112
- * apparently redundant "wait_on_page_locked" lets try_to_unuse
2113
- * defer to do_swap_page in such a case - in some tests,
2114
- * do_swap_page and try_to_unuse repeatedly compete.
2115
- */
2116
- wait_on_page_locked(page);
2117
- wait_on_page_writeback(page);
2118
- lock_page(page);
2119
- wait_on_page_writeback(page);
2211
+ spin_lock(&mmlist_lock);
2212
+ p = &init_mm.mmlist;
2213
+ while (READ_ONCE(si->inuse_pages) &&
2214
+ !signal_pending(current) &&
2215
+ (p = p->next) != &init_mm.mmlist) {
21202216
2121
- /*
2122
- * Remove all references to entry.
2123
- */
2124
- swcount = *swap_map;
2125
- if (swap_count(swcount) == SWAP_MAP_SHMEM) {
2126
- retval = shmem_unuse(entry, page);
2127
- /* page has already been unlocked and released */
2128
- if (retval < 0)
2129
- break;
2217
+ mm = list_entry(p, struct mm_struct, mmlist);
2218
+ if (!mmget_not_zero(mm))
21302219 continue;
2131
- }
2132
- if (swap_count(swcount) && start_mm != &init_mm)
2133
- retval = unuse_mm(start_mm, entry, page);
2220
+ spin_unlock(&mmlist_lock);
2221
+ mmput(prev_mm);
2222
+ prev_mm = mm;
2223
+ retval = unuse_mm(mm, type, frontswap, &pages_to_unuse);
21342224
2135
- if (swap_count(*swap_map)) {
2136
- int set_start_mm = (*swap_map >= swcount);
2137
- struct list_head *p = &start_mm->mmlist;
2138
- struct mm_struct *new_start_mm = start_mm;
2139
- struct mm_struct *prev_mm = start_mm;
2140
- struct mm_struct *mm;
2141
-
2142
- mmget(new_start_mm);
2143
- mmget(prev_mm);
2144
- spin_lock(&mmlist_lock);
2145
- while (swap_count(*swap_map) && !retval &&
2146
- (p = p->next) != &start_mm->mmlist) {
2147
- mm = list_entry(p, struct mm_struct, mmlist);
2148
- if (!mmget_not_zero(mm))
2149
- continue;
2150
- spin_unlock(&mmlist_lock);
2151
- mmput(prev_mm);
2152
- prev_mm = mm;
2153
-
2154
- cond_resched();
2155
-
2156
- swcount = *swap_map;
2157
- if (!swap_count(swcount)) /* any usage ? */
2158
- ;
2159
- else if (mm == &init_mm)
2160
- set_start_mm = 1;
2161
- else
2162
- retval = unuse_mm(mm, entry, page);
2163
-
2164
- if (set_start_mm && *swap_map < swcount) {
2165
- mmput(new_start_mm);
2166
- mmget(mm);
2167
- new_start_mm = mm;
2168
- set_start_mm = 0;
2169
- }
2170
- spin_lock(&mmlist_lock);
2171
- }
2172
- spin_unlock(&mmlist_lock);
2173
- mmput(prev_mm);
2174
- mmput(start_mm);
2175
- start_mm = new_start_mm;
2176
- }
21772225 if (retval) {
2178
- unlock_page(page);
2179
- put_page(page);
2180
- break;
2226
+ mmput(prev_mm);
2227
+ goto out;
21812228 }
2182
-
2183
- /*
2184
- * If a reference remains (rare), we would like to leave
2185
- * the page in the swap cache; but try_to_unmap could
2186
- * then re-duplicate the entry once we drop page lock,
2187
- * so we might loop indefinitely; also, that page could
2188
- * not be swapped out to other storage meanwhile. So:
2189
- * delete from cache even if there's another reference,
2190
- * after ensuring that the data has been saved to disk -
2191
- * since if the reference remains (rarer), it will be
2192
- * read from disk into another page. Splitting into two
2193
- * pages would be incorrect if swap supported "shared
2194
- * private" pages, but they are handled by tmpfs files.
2195
- *
2196
- * Given how unuse_vma() targets one particular offset
2197
- * in an anon_vma, once the anon_vma has been determined,
2198
- * this splitting happens to be just what is needed to
2199
- * handle where KSM pages have been swapped out: re-reading
2200
- * is unnecessarily slow, but we can fix that later on.
2201
- */
2202
- if (swap_count(*swap_map) &&
2203
- PageDirty(page) && PageSwapCache(page)) {
2204
- struct writeback_control wbc = {
2205
- .sync_mode = WB_SYNC_NONE,
2206
- };
2207
-
2208
- swap_writepage(compound_head(page), &wbc);
2209
- lock_page(page);
2210
- wait_on_page_writeback(page);
2211
- }
2212
-
2213
- /*
2214
- * It is conceivable that a racing task removed this page from
2215
- * swap cache just before we acquired the page lock at the top,
2216
- * or while we dropped it in unuse_mm(). The page might even
2217
- * be back in swap cache on another swap area: that we must not
2218
- * delete, since it may not have been written out to swap yet.
2219
- */
2220
- if (PageSwapCache(page) &&
2221
- likely(page_private(page) == entry.val) &&
2222
- (!PageTransCompound(page) ||
2223
- !swap_page_trans_huge_swapped(si, entry)))
2224
- delete_from_swap_cache(compound_head(page));
2225
-
2226
- /*
2227
- * So we could skip searching mms once swap count went
2228
- * to 1, we did not mark any present ptes as dirty: must
2229
- * mark page dirty so shrink_page_list will preserve it.
2230
- */
2231
- SetPageDirty(page);
2232
- unlock_page(page);
2233
- put_page(page);
22342229
22352230 /*
22362231 * Make sure that we aren't completely killing
22372232 * interactive performance.
22382233 */
22392234 cond_resched();
2240
- if (frontswap && pages_to_unuse > 0) {
2241
- if (!--pages_to_unuse)
2242
- break;
2243
- }
2235
+ spin_lock(&mmlist_lock);
2236
+ }
2237
+ spin_unlock(&mmlist_lock);
2238
+
2239
+ mmput(prev_mm);
2240
+
2241
+ i = 0;
2242
+ while (READ_ONCE(si->inuse_pages) &&
2243
+ !signal_pending(current) &&
2244
+ (i = find_next_to_unuse(si, i, frontswap)) != 0) {
2245
+
2246
+ entry = swp_entry(type, i);
2247
+ page = find_get_page(swap_address_space(entry), i);
2248
+ if (!page)
2249
+ continue;
2250
+
2251
+ /*
2252
+ * It is conceivable that a racing task removed this page from
2253
+ * swap cache just before we acquired the page lock. The page
2254
+ * might even be back in swap cache on another swap area. But
2255
+ * that is okay, try_to_free_swap() only removes stale pages.
2256
+ */
2257
+ lock_page(page);
2258
+ wait_on_page_writeback(page);
2259
+ try_to_free_swap(page);
2260
+ trace_android_vh_unuse_swap_page(si, page);
2261
+ unlock_page(page);
2262
+ put_page(page);
2263
+
2264
+ /*
2265
+ * For frontswap, we just need to unuse pages_to_unuse, if
2266
+ * it was specified. Need not check frontswap again here as
2267
+ * we already zeroed out pages_to_unuse if not frontswap.
2268
+ */
2269
+ if (pages_to_unuse && --pages_to_unuse == 0)
2270
+ goto out;
22442271 }
22452272
2246
- mmput(start_mm);
2247
- return retval;
2273
+ /*
2274
+ * Lets check again to see if there are still swap entries in the map.
2275
+ * If yes, we would need to do retry the unuse logic again.
2276
+ * Under global memory pressure, swap entries can be reinserted back
2277
+ * into process space after the mmlist loop above passes over them.
2278
+ *
2279
+ * Limit the number of retries? No: when mmget_not_zero() above fails,
2280
+ * that mm is likely to be freeing swap from exit_mmap(), which proceeds
2281
+ * at its own independent pace; and even shmem_writepage() could have
2282
+ * been preempted after get_swap_page(), temporarily hiding that swap.
2283
+ * It's easy and robust (though cpu-intensive) just to keep retrying.
2284
+ */
2285
+ if (READ_ONCE(si->inuse_pages)) {
2286
+ if (!signal_pending(current))
2287
+ goto retry;
2288
+ retval = -EINTR;
2289
+ }
2290
+out:
2291
+ return (retval == FRONTSWAP_PAGES_UNUSED) ? 0 : retval;
22482292 }
22492293
22502294 /*
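
The control flow of the rewritten try_to_unuse() above compresses to the sketch below; every helper is a stub and the counter is invented, it only shows the mm-walk, swap-cache drain and retry-until-empty structure:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

static int inuse_pages = 3;     /* pretend entries are still swapped out */

static bool signal_pending(void)   { return false; }
static void unuse_all_mms(void)    { inuse_pages -= 2; }
static void drain_swap_cache(void) { if (inuse_pages) inuse_pages--; }

static int try_to_unuse_sketch(void)
{
retry:
        unuse_all_mms();        /* kernel: shmem_unuse() + unuse_mm() loop  */
        drain_swap_cache();     /* kernel: find_next_to_unuse() + free loop */

        /*
         * Entries can be re-added behind our back (exit_mmap(),
         * shmem_writepage()), so keep retrying until none remain or a
         * signal arrives.
         */
        if (inuse_pages) {
                if (!signal_pending())
                        goto retry;
                return -EINTR;
        }
        return 0;
}

int main(void)
{
        printf("try_to_unuse() -> %d\n", try_to_unuse_sketch());
        return 0;
}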
....@@ -2276,7 +2320,6 @@
22762320 static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
22772321 {
22782322 struct swap_info_struct *sis;
2279
- struct swap_extent *start_se;
22802323 struct swap_extent *se;
22812324 pgoff_t offset;
22822325
....@@ -2284,18 +2327,8 @@
22842327 *bdev = sis->bdev;
22852328
22862329 offset = swp_offset(entry);
2287
- start_se = sis->curr_swap_extent;
2288
- se = start_se;
2289
-
2290
- for ( ; ; ) {
2291
- if (se->start_page <= offset &&
2292
- offset < (se->start_page + se->nr_pages)) {
2293
- return se->start_block + (offset - se->start_page);
2294
- }
2295
- se = list_next_entry(se, list);
2296
- sis->curr_swap_extent = se;
2297
- BUG_ON(se == start_se); /* It *must* be present */
2298
- }
2330
+ se = offset_to_swap_extent(sis, offset);
2331
+ return se->start_block + (offset - se->start_page);
22992332 }
23002333
23012334 /*
....@@ -2305,7 +2338,7 @@
23052338 {
23062339 swp_entry_t entry;
23072340 entry.val = page_private(page);
2308
- return map_swap_entry(entry, bdev) << (PAGE_SHIFT - 9);
2341
+ return map_swap_entry(entry, bdev);
23092342 }
23102343
23112344 /*
....@@ -2313,27 +2346,27 @@
23132346 */
23142347 static void destroy_swap_extents(struct swap_info_struct *sis)
23152348 {
2316
- while (!list_empty(&sis->first_swap_extent.list)) {
2317
- struct swap_extent *se;
2349
+ while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) {
2350
+ struct rb_node *rb = sis->swap_extent_root.rb_node;
2351
+ struct swap_extent *se = rb_entry(rb, struct swap_extent, rb_node);
23182352
2319
- se = list_first_entry(&sis->first_swap_extent.list,
2320
- struct swap_extent, list);
2321
- list_del(&se->list);
2353
+ rb_erase(rb, &sis->swap_extent_root);
23222354 kfree(se);
23232355 }
23242356
2325
- if (sis->flags & SWP_FILE) {
2357
+ if (sis->flags & SWP_ACTIVATED) {
23262358 struct file *swap_file = sis->swap_file;
23272359 struct address_space *mapping = swap_file->f_mapping;
23282360
2329
- sis->flags &= ~SWP_FILE;
2330
- mapping->a_ops->swap_deactivate(swap_file);
2361
+ sis->flags &= ~SWP_ACTIVATED;
2362
+ if (mapping->a_ops->swap_deactivate)
2363
+ mapping->a_ops->swap_deactivate(swap_file);
23312364 }
23322365 }
23332366
23342367 /*
23352368 * Add a block range (and the corresponding page range) into this swapdev's
2336
- * extent list. The extent list is kept sorted in page order.
2369
+ * extent tree.
23372370 *
23382371 * This function rather assumes that it is called in ascending page order.
23392372 */
....@@ -2341,20 +2374,21 @@
23412374 add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
23422375 unsigned long nr_pages, sector_t start_block)
23432376 {
2377
+ struct rb_node **link = &sis->swap_extent_root.rb_node, *parent = NULL;
23442378 struct swap_extent *se;
23452379 struct swap_extent *new_se;
2346
- struct list_head *lh;
23472380
2348
- if (start_page == 0) {
2349
- se = &sis->first_swap_extent;
2350
- sis->curr_swap_extent = se;
2351
- se->start_page = 0;
2352
- se->nr_pages = nr_pages;
2353
- se->start_block = start_block;
2354
- return 1;
2355
- } else {
2356
- lh = sis->first_swap_extent.list.prev; /* Highest extent */
2357
- se = list_entry(lh, struct swap_extent, list);
2381
+ /*
2382
+ * place the new node at the right most since the
2383
+ * function is called in ascending page order.
2384
+ */
2385
+ while (*link) {
2386
+ parent = *link;
2387
+ link = &parent->rb_right;
2388
+ }
2389
+
2390
+ if (parent) {
2391
+ se = rb_entry(parent, struct swap_extent, rb_node);
23582392 BUG_ON(se->start_page + se->nr_pages != start_page);
23592393 if (se->start_block + se->nr_pages == start_block) {
23602394 /* Merge it */
....@@ -2363,9 +2397,7 @@
23632397 }
23642398 }
23652399
2366
- /*
2367
- * No merge. Insert a new extent, preserving ordering.
2368
- */
2400
+ /* No merge, insert a new extent. */
23692401 new_se = kmalloc(sizeof(*se), GFP_KERNEL);
23702402 if (new_se == NULL)
23712403 return -ENOMEM;
....@@ -2373,7 +2405,8 @@
23732405 new_se->nr_pages = nr_pages;
23742406 new_se->start_block = start_block;
23752407
2376
- list_add_tail(&new_se->list, &sis->first_swap_extent.list);
2408
+ rb_link_node(&new_se->rb_node, parent, link);
2409
+ rb_insert_color(&new_se->rb_node, &sis->swap_extent_root);
23772410 return 1;
23782411 }
23792412 EXPORT_SYMBOL_GPL(add_swap_extent);
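
A small sketch of the append-or-merge rule that add_swap_extent() keeps after the move to an rbtree; a flat array replaces the tree and the page/block numbers are made up:

#include <assert.h>
#include <stdio.h>

struct extent {
        unsigned long start_page, nr_pages, start_block;
};

static struct extent extents[16];
static int nr_extents;

static void add_extent(unsigned long start_page, unsigned long nr_pages,
                       unsigned long start_block)
{
        if (nr_extents) {
                struct extent *se = &extents[nr_extents - 1];

                assert(se->start_page + se->nr_pages == start_page);
                if (se->start_block + se->nr_pages == start_block) {
                        se->nr_pages += nr_pages;       /* merge */
                        return;
                }
        }
        extents[nr_extents++] = (struct extent){ start_page, nr_pages,
                                                 start_block };
}

int main(void)
{
        add_extent(0, 100, 1000);
        add_extent(100, 50, 1100);      /* contiguous on disk: merged  */
        add_extent(150, 20, 9000);      /* discontiguous: new extent   */
        printf("%d extents, first covers %lu pages\n",
               nr_extents, extents[0].nr_pages);        /* 2 and 150   */
        return 0;
}

Because setup_swap_extents() feeds ranges in ascending page order, only the rightmost node can ever be a merge candidate, which is why the kernel's insertion walk simply descends to the right.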
....@@ -2423,8 +2456,10 @@
24232456
24242457 if (mapping->a_ops->swap_activate) {
24252458 ret = mapping->a_ops->swap_activate(sis, swap_file, span);
2459
+ if (ret >= 0)
2460
+ sis->flags |= SWP_ACTIVATED;
24262461 if (!ret) {
2427
- sis->flags |= SWP_FILE;
2462
+ sis->flags |= SWP_FS_OPS;
24282463 ret = add_swap_extent(sis, 0, sis->max, 0);
24292464 *span = sis->pages;
24302465 }
....@@ -2446,9 +2481,9 @@
24462481 return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
24472482 }
24482483
2449
-static void _enable_swap_info(struct swap_info_struct *p, int prio,
2450
- unsigned char *swap_map,
2451
- struct swap_cluster_info *cluster_info)
2484
+static void setup_swap_info(struct swap_info_struct *p, int prio,
2485
+ unsigned char *swap_map,
2486
+ struct swap_cluster_info *cluster_info)
24522487 {
24532488 int i;
24542489
....@@ -2473,10 +2508,18 @@
24732508 }
24742509 p->swap_map = swap_map;
24752510 p->cluster_info = cluster_info;
2476
- p->flags |= SWP_WRITEOK;
2477
- atomic_long_add(p->pages, &nr_swap_pages);
2478
- total_swap_pages += p->pages;
2511
+}
24792512
2513
+static void _enable_swap_info(struct swap_info_struct *p)
2514
+{
2515
+ bool skip = false;
2516
+
2517
+ p->flags |= SWP_WRITEOK | SWP_VALID;
2518
+ trace_android_vh_account_swap_pages(p, &skip);
2519
+ if (!skip) {
2520
+ atomic_long_add(p->pages, &nr_swap_pages);
2521
+ total_swap_pages += p->pages;
2522
+ }
24802523 assert_spin_locked(&swap_lock);
24812524 /*
24822525 * both lists are plists, and thus priority ordered.
....@@ -2500,7 +2543,17 @@
25002543 frontswap_init(p->type, frontswap_map);
25012544 spin_lock(&swap_lock);
25022545 spin_lock(&p->lock);
2503
- _enable_swap_info(p, prio, swap_map, cluster_info);
2546
+ setup_swap_info(p, prio, swap_map, cluster_info);
2547
+ spin_unlock(&p->lock);
2548
+ spin_unlock(&swap_lock);
2549
+ /*
2550
+ * Guarantee swap_map, cluster_info, etc. fields are valid
2551
+ * between get/put_swap_device() if SWP_VALID bit is set
2552
+ */
2553
+ synchronize_rcu();
2554
+ spin_lock(&swap_lock);
2555
+ spin_lock(&p->lock);
2556
+ _enable_swap_info(p);
25042557 spin_unlock(&p->lock);
25052558 spin_unlock(&swap_lock);
25062559 }
....@@ -2509,7 +2562,8 @@
25092562 {
25102563 spin_lock(&swap_lock);
25112564 spin_lock(&p->lock);
2512
- _enable_swap_info(p, p->prio, p->swap_map, p->cluster_info);
2565
+ setup_swap_info(p, p->prio, p->swap_map, p->cluster_info);
2566
+ _enable_swap_info(p);
25132567 spin_unlock(&p->lock);
25142568 spin_unlock(&swap_lock);
25152569 }
....@@ -2537,6 +2591,7 @@
25372591 struct filename *pathname;
25382592 int err, found = 0;
25392593 unsigned int old_block_size;
2594
+ bool skip = false;
25402595
25412596 if (!capable(CAP_SYS_ADMIN))
25422597 return -EPERM;
....@@ -2591,8 +2646,11 @@
25912646 least_priority++;
25922647 }
25932648 plist_del(&p->list, &swap_active_head);
2594
- atomic_long_sub(p->pages, &nr_swap_pages);
2595
- total_swap_pages -= p->pages;
2649
+ trace_android_vh_account_swap_pages(p, &skip);
2650
+ if (!skip) {
2651
+ atomic_long_sub(p->pages, &nr_swap_pages);
2652
+ total_swap_pages -= p->pages;
2653
+ }
25962654 p->flags &= ~SWP_WRITEOK;
25972655 spin_unlock(&p->lock);
25982656 spin_unlock(&swap_lock);
....@@ -2611,6 +2669,17 @@
26112669 }
26122670
26132671 reenable_swap_slots_cache_unlock();
2672
+
2673
+ spin_lock(&swap_lock);
2674
+ spin_lock(&p->lock);
2675
+ p->flags &= ~SWP_VALID; /* mark swap device as invalid */
2676
+ spin_unlock(&p->lock);
2677
+ spin_unlock(&swap_lock);
2678
+ /*
2679
+ * wait for swap operations protected by get/put_swap_device()
2680
+ * to complete
2681
+ */
2682
+ synchronize_rcu();
26142683
26152684 flush_work(&p->discard_work);
26162685
....@@ -2647,11 +2716,14 @@
26472716 frontswap_map = frontswap_map_get(p);
26482717 spin_unlock(&p->lock);
26492718 spin_unlock(&swap_lock);
2719
+ arch_swap_invalidate_area(p->type);
26502720 frontswap_invalidate_area(p->type);
26512721 frontswap_map_set(p, NULL);
26522722 mutex_unlock(&swapon_mutex);
26532723 free_percpu(p->percpu_cluster);
26542724 p->percpu_cluster = NULL;
2725
+ free_percpu(p->cluster_next_cpu);
2726
+ p->cluster_next_cpu = NULL;
26552727 vfree(swap_map);
26562728 kvfree(cluster_info);
26572729 kvfree(frontswap_map);
....@@ -2759,20 +2831,24 @@
27592831 struct swap_info_struct *si = v;
27602832 struct file *file;
27612833 int len;
2834
+ unsigned int bytes, inuse;
27622835
27632836 if (si == SEQ_START_TOKEN) {
2764
- seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
2837
+ seq_puts(swap,"Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n");
27652838 return 0;
27662839 }
27672840
2841
+ bytes = si->pages << (PAGE_SHIFT - 10);
2842
+ inuse = si->inuse_pages << (PAGE_SHIFT - 10);
2843
+
27682844 file = si->swap_file;
27692845 len = seq_file_path(swap, file, " \t\n\\");
2770
- seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
2846
+ seq_printf(swap, "%*s%s\t%u\t%s%u\t%s%d\n",
27712847 len < 40 ? 40 - len : 1, " ",
27722848 S_ISBLK(file_inode(file)->i_mode) ?
27732849 "partition" : "file\t",
2774
- si->pages << (PAGE_SHIFT - 10),
2775
- si->inuse_pages << (PAGE_SHIFT - 10),
2850
+ bytes, bytes < 10000000 ? "\t" : "",
2851
+ inuse, inuse < 10000000 ? "\t" : "",
27762852 si->prio);
27772853 return 0;
27782854 }
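
The conditional "\t" above keeps the /proc/swaps columns aligned once a swap area's size or usage reaches eight digits of KiB. A small userspace illustration of the same trick (simplified: the filename padding here uses %-40s instead of the kernel's computed %*s width):

#include <stdio.h>

static void show_row(const char *name, unsigned int kb_size,
		     unsigned int kb_used, int prio)
{
	/* Values under 10000000 (at most 7 digits) get one extra tab so the
	 * Used and Priority columns stay where the header put them. */
	printf("%-40s%s\t%u\t%s%u\t%s%d\n", name, "file\t",
	       kb_size, kb_size < 10000000 ? "\t" : "",
	       kb_used, kb_used < 10000000 ? "\t" : "",
	       prio);
}

int main(void)
{
	printf("Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n");
	show_row("/swapfile", 8388604, 1024, -2);	/* ~8 GiB  */
	show_row("/big-swapfile", 134217724, 0, -3);	/* ~128 GiB */
	return 0;
}
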
....@@ -2798,17 +2874,18 @@
27982874 return 0;
27992875 }
28002876
2801
-static const struct file_operations proc_swaps_operations = {
2802
- .open = swaps_open,
2803
- .read = seq_read,
2804
- .llseek = seq_lseek,
2805
- .release = seq_release,
2806
- .poll = swaps_poll,
2877
+static const struct proc_ops swaps_proc_ops = {
2878
+ .proc_flags = PROC_ENTRY_PERMANENT,
2879
+ .proc_open = swaps_open,
2880
+ .proc_read = seq_read,
2881
+ .proc_lseek = seq_lseek,
2882
+ .proc_release = seq_release,
2883
+ .proc_poll = swaps_poll,
28072884 };
28082885
28092886 static int __init procswaps_init(void)
28102887 {
2811
- proc_create("swaps", 0, NULL, &proc_swaps_operations);
2888
+ proc_create("swaps", 0, NULL, &swaps_proc_ops);
28122889 return 0;
28132890 }
28142891 __initcall(procswaps_init);
....@@ -2825,13 +2902,16 @@
28252902
28262903 static struct swap_info_struct *alloc_swap_info(void)
28272904 {
2828
- struct swap_info_struct *p;
2905
+ struct swap_info_struct *p = NULL;
28292906 struct swap_info_struct *defer = NULL;
28302907 unsigned int type;
28312908 int i;
2832
- int size = sizeof(*p) + nr_node_ids * sizeof(struct plist_node);
2909
+ bool skip = false;
28332910
2834
- p = kvzalloc(size, GFP_KERNEL);
2911
+ trace_android_rvh_alloc_si(&p, &skip);
2912
+ trace_android_vh_alloc_si(&p, &skip);
2913
+ if (!skip)
2914
+ p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL);
28352915 if (!p)
28362916 return ERR_PTR(-ENOMEM);
28372917
....@@ -2863,7 +2943,7 @@
28632943 * would be relying on p->type to remain valid.
28642944 */
28652945 }
2866
- INIT_LIST_HEAD(&p->first_swap_extent.list);
2946
+ p->swap_extent_root = RB_ROOT;
28672947 plist_node_init(&p->list, 0);
28682948 for_each_node(i)
28692949 plist_node_init(&p->avail_lists[i], 0);
....@@ -2881,10 +2961,10 @@
28812961 int error;
28822962
28832963 if (S_ISBLK(inode->i_mode)) {
2884
- p->bdev = bdgrab(I_BDEV(inode));
2885
- error = blkdev_get(p->bdev,
2964
+ p->bdev = blkdev_get_by_dev(inode->i_rdev,
28862965 FMODE_READ | FMODE_WRITE | FMODE_EXCL, p);
2887
- if (error < 0) {
2966
+ if (IS_ERR(p->bdev)) {
2967
+ error = PTR_ERR(p->bdev);
28882968 p->bdev = NULL;
28892969 return error;
28902970 }
....@@ -2892,6 +2972,13 @@
28922972 error = set_blocksize(p->bdev, PAGE_SIZE);
28932973 if (error < 0)
28942974 return error;
2975
+ /*
2976
+ * Zoned block devices contain zones that have a sequential
2977
+ * write only restriction. Hence zoned block devices are not
2978
+ * suitable for swapping. Disallow them here.
2979
+ */
2980
+ if (blk_queue_is_zoned(p->bdev->bd_disk->queue))
2981
+ return -EINVAL;
28952982 p->flags |= SWP_BLKDEV;
28962983 } else if (S_ISREG(inode->i_mode)) {
28972984 p->bdev = inode->i_sb->s_bdev;
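
The blkdev_get_by_dev() conversion earlier in this hunk switches to the kernel's error-pointer convention: a failure is encoded in the returned pointer and recovered with IS_ERR()/PTR_ERR(). A simplified, userspace-runnable illustration of that encoding (the macros below mirror <linux/err.h> but are not the real definitions):

#include <stdio.h>

#define MAX_ERRNO	4095
#define ERR_PTR(err)	((void *)(long)(err))
#define PTR_ERR(ptr)	((long)(ptr))
#define IS_ERR(ptr)	((unsigned long)(ptr) >= (unsigned long)-MAX_ERRNO)

static void *open_device(int fail)
{
	static int dummy;

	return fail ? ERR_PTR(-19) : (void *)&dummy;	/* -19 == -ENODEV */
}

int main(void)
{
	void *bdev = open_device(1);

	if (IS_ERR(bdev)) {
		printf("error %ld came back encoded in the pointer\n", PTR_ERR(bdev));
		bdev = NULL;	/* same cleanup as the hunk above */
	}
	return 0;
}
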
....@@ -3188,10 +3275,10 @@
31883275 goto bad_swap_unlock_inode;
31893276 }
31903277
3191
- if (bdi_cap_stable_pages_required(inode_to_bdi(inode)))
3278
+ if (p->bdev && blk_queue_stable_writes(p->bdev->bd_disk->queue))
31923279 p->flags |= SWP_STABLE_WRITES;
31933280
3194
- if (bdi_cap_synchronous_io(inode_to_bdi(inode)))
3281
+ if (p->bdev && p->bdev->bd_disk->fops->rw_page)
31953282 p->flags |= SWP_SYNCHRONOUS_IO;
31963283
31973284 if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) {
....@@ -3199,11 +3286,19 @@
31993286 unsigned long ci, nr_cluster;
32003287
32013288 p->flags |= SWP_SOLIDSTATE;
3289
+ p->cluster_next_cpu = alloc_percpu(unsigned int);
3290
+ if (!p->cluster_next_cpu) {
3291
+ error = -ENOMEM;
3292
+ goto bad_swap_unlock_inode;
3293
+ }
32023294 /*
32033295 * select a random position to start with to help wear leveling
32043296 * SSD
32053297 */
3206
- p->cluster_next = 1 + (prandom_u32() % p->highest_bit);
3298
+ for_each_possible_cpu(cpu) {
3299
+ per_cpu(*p->cluster_next_cpu, cpu) =
3300
+ 1 + prandom_u32_max(p->highest_bit);
3301
+ }
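
prandom_u32_max(p->highest_bit) above yields a value in [0, highest_bit) by scaling a 32-bit random number with a multiply and shift rather than a modulo. A userspace analogue of that scaling (rand()/srand() are only stand-ins for the kernel PRNG, so the range is scaled by RAND_MAX + 1 instead of 2^32):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

static uint32_t bounded_random(uint32_t bound)
{
	/* prandom_u32_max() computes ((u64)prandom_u32() * bound) >> 32 */
	return (uint32_t)(((uint64_t)rand() * bound) / ((uint64_t)RAND_MAX + 1));
}

int main(void)
{
	uint32_t highest_bit = 262144;	/* e.g. a 1 GiB swap area in 4 KiB pages */
	int cpu;

	srand((unsigned int)time(NULL));
	for (cpu = 0; cpu < 4; cpu++)
		printf("cluster_next for cpu %d: %u\n", cpu,
		       1 + bounded_random(highest_bit));
	return 0;
}
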
32073302 nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
32083303
32093304 cluster_info = kvcalloc(nr_cluster, sizeof(*cluster_info),
....@@ -3289,7 +3384,7 @@
32893384 error = inode_drain_writes(inode);
32903385 if (error) {
32913386 inode->i_flags &= ~S_SWAPFILE;
3292
- goto bad_swap_unlock_inode;
3387
+ goto free_swap_address_space;
32933388 }
32943389
32953390 mutex_lock(&swapon_mutex);
....@@ -3299,6 +3394,7 @@
32993394 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
33003395 enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map);
33013396
3397
+ trace_android_vh_init_swap_info_struct(p, swap_avail_heads);
33023398 pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",
33033399 p->pages<<(PAGE_SHIFT-10), name->name, p->prio,
33043400 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
....@@ -3314,11 +3410,15 @@
33143410
33153411 error = 0;
33163412 goto out;
3413
+free_swap_address_space:
3414
+ exit_swap_address_space(p->type);
33173415 bad_swap_unlock_inode:
33183416 inode_unlock(inode);
33193417 bad_swap:
33203418 free_percpu(p->percpu_cluster);
33213419 p->percpu_cluster = NULL;
3420
+ free_percpu(p->cluster_next_cpu);
3421
+ p->cluster_next_cpu = NULL;
33223422 if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
33233423 set_blocksize(p->bdev, p->old_block_size);
33243424 blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
....@@ -3359,14 +3459,17 @@
33593459 spin_lock(&swap_lock);
33603460 for (type = 0; type < nr_swapfiles; type++) {
33613461 struct swap_info_struct *si = swap_info[type];
3462
+ bool skip = false;
33623463
3363
- if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
3464
+ trace_android_vh_si_swapinfo(si, &skip);
3465
+ if (!skip && (si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
33643466 nr_to_be_unused += si->inuse_pages;
33653467 }
33663468 val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused;
33673469 val->totalswap = total_swap_pages + nr_to_be_unused;
33683470 spin_unlock(&swap_lock);
33693471 }
3472
+EXPORT_SYMBOL_GPL(si_swapinfo);
33703473
33713474 /*
33723475 * Verify that a swap entry is valid and increment its swap map count.
....@@ -3388,17 +3491,11 @@
33883491 unsigned char has_cache;
33893492 int err = -EINVAL;
33903493
3391
- if (non_swap_entry(entry))
3392
- goto out;
3393
-
3394
- p = swp_swap_info(entry);
3494
+ p = get_swap_device(entry);
33953495 if (!p)
3396
- goto bad_file;
3496
+ goto out;
33973497
33983498 offset = swp_offset(entry);
3399
- if (unlikely(offset >= p->max))
3400
- goto out;
3401
-
34023499 ci = lock_cluster_or_swap_info(p, offset);
34033500
34043501 count = p->swap_map[offset];
....@@ -3439,16 +3536,14 @@
34393536 } else
34403537 err = -ENOENT; /* unused swap entry */
34413538
3442
- p->swap_map[offset] = count | has_cache;
3539
+ WRITE_ONCE(p->swap_map[offset], count | has_cache);
34433540
34443541 unlock_out:
34453542 unlock_cluster_or_swap_info(p, ci);
34463543 out:
3544
+ if (p)
3545
+ put_swap_device(p);
34473546 return err;
3448
-
3449
-bad_file:
3450
- pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val);
3451
- goto out;
34523547 }
34533548
34543549 /*
....@@ -3481,7 +3576,7 @@
34813576 *
34823577 * Called when allocating swap cache for existing swap entry,
34833578 * This can return error codes. Returns 0 at success.
3484
- * -EBUSY means there is a swap cache.
3579
+ * -EEXIST means there is a swap cache.
34853580 * Note: return code is different from swap_duplicate().
34863581 */
34873582 int swapcache_prepare(swp_entry_t entry)
....@@ -3493,6 +3588,7 @@
34933588 {
34943589 return swap_type_to_swap_info(swp_type(entry));
34953590 }
3591
+EXPORT_SYMBOL_GPL(swp_swap_info);
34963592
34973593 struct swap_info_struct *page_swap_info(struct page *page)
34983594 {
....@@ -3540,6 +3636,7 @@
35403636 struct page *list_page;
35413637 pgoff_t offset;
35423638 unsigned char count;
3639
+ int ret = 0;
35433640
35443641 /*
35453642 * When debugging, it's easier to use __GFP_ZERO here; but it's better
....@@ -3547,15 +3644,15 @@
35473644 */
35483645 page = alloc_page(gfp_mask | __GFP_HIGHMEM);
35493646
3550
- si = swap_info_get(entry);
3647
+ si = get_swap_device(entry);
35513648 if (!si) {
35523649 /*
35533650 * An acceptable race has occurred since the failing
3554
- * __swap_duplicate(): the swap entry has been freed,
3555
- * perhaps even the whole swap_map cleared for swapoff.
3651
+ * __swap_duplicate(): the swap device may have been swapped off.
35563652 */
35573653 goto outer;
35583654 }
3655
+ spin_lock(&si->lock);
35593656
35603657 offset = swp_offset(entry);
35613658
....@@ -3573,9 +3670,8 @@
35733670 }
35743671
35753672 if (!page) {
3576
- unlock_cluster(ci);
3577
- spin_unlock(&si->lock);
3578
- return -ENOMEM;
3673
+ ret = -ENOMEM;
3674
+ goto out;
35793675 }
35803676
35813677 /*
....@@ -3627,10 +3723,11 @@
36273723 out:
36283724 unlock_cluster(ci);
36293725 spin_unlock(&si->lock);
3726
+ put_swap_device(si);
36303727 outer:
36313728 if (page)
36323729 __free_page(page);
3633
- return 0;
3730
+ return ret;
36343731 }
36353732
36363733 /*
....@@ -3658,7 +3755,7 @@
36583755
36593756 spin_lock(&si->cont_lock);
36603757 offset &= ~PAGE_MASK;
3661
- page = list_entry(head->lru.next, struct page, lru);
3758
+ page = list_next_entry(head, lru);
36623759 map = kmap_atomic(page) + offset;
36633760
36643761 if (count == SWAP_MAP_MAX) /* initial increment from swap_map */
....@@ -3670,13 +3767,13 @@
36703767 */
36713768 while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
36723769 kunmap_atomic(map);
3673
- page = list_entry(page->lru.next, struct page, lru);
3770
+ page = list_next_entry(page, lru);
36743771 BUG_ON(page == head);
36753772 map = kmap_atomic(page) + offset;
36763773 }
36773774 if (*map == SWAP_CONT_MAX) {
36783775 kunmap_atomic(map);
3679
- page = list_entry(page->lru.next, struct page, lru);
3776
+ page = list_next_entry(page, lru);
36803777 if (page == head) {
36813778 ret = false; /* add count continuation */
36823779 goto out;
....@@ -3686,12 +3783,10 @@
36863783 }
36873784 *map += 1;
36883785 kunmap_atomic(map);
3689
- page = list_entry(page->lru.prev, struct page, lru);
3690
- while (page != head) {
3786
+ while ((page = list_prev_entry(page, lru)) != head) {
36913787 map = kmap_atomic(page) + offset;
36923788 *map = COUNT_CONTINUED;
36933789 kunmap_atomic(map);
3694
- page = list_entry(page->lru.prev, struct page, lru);
36953790 }
36963791 ret = true; /* incremented */
36973792
....@@ -3702,7 +3797,7 @@
37023797 BUG_ON(count != COUNT_CONTINUED);
37033798 while (*map == COUNT_CONTINUED) {
37043799 kunmap_atomic(map);
3705
- page = list_entry(page->lru.next, struct page, lru);
3800
+ page = list_next_entry(page, lru);
37063801 BUG_ON(page == head);
37073802 map = kmap_atomic(page) + offset;
37083803 }
....@@ -3711,13 +3806,11 @@
37113806 if (*map == 0)
37123807 count = 0;
37133808 kunmap_atomic(map);
3714
- page = list_entry(page->lru.prev, struct page, lru);
3715
- while (page != head) {
3809
+ while ((page = list_prev_entry(page, lru)) != head) {
37163810 map = kmap_atomic(page) + offset;
37173811 *map = SWAP_CONT_MAX | count;
37183812 count = COUNT_CONTINUED;
37193813 kunmap_atomic(map);
3720
- page = list_entry(page->lru.prev, struct page, lru);
37213814 }
37223815 ret = count == COUNT_CONTINUED;
37233816 }
....@@ -3749,11 +3842,12 @@
37493842 }
37503843
37513844 #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
3752
-void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node,
3753
- gfp_t gfp_mask)
3845
+void __cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
37543846 {
37553847 struct swap_info_struct *si, *next;
3756
- if (!(gfp_mask & __GFP_IO) || !memcg)
3848
+ int nid = page_to_nid(page);
3849
+
3850
+ if (!(gfp_mask & __GFP_IO))
37573851 return;
37583852
37593853 if (!blk_cgroup_congested())
....@@ -3767,11 +3861,10 @@
37673861 return;
37683862
37693863 spin_lock(&swap_avail_lock);
3770
- plist_for_each_entry_safe(si, next, &swap_avail_heads[node],
3771
- avail_lists[node]) {
3864
+ plist_for_each_entry_safe(si, next, &swap_avail_heads[nid],
3865
+ avail_lists[nid]) {
37723866 if (si->bdev) {
3773
- blkcg_schedule_throttle(bdev_get_queue(si->bdev),
3774
- true);
3867
+ blkcg_schedule_throttle(bdev_get_queue(si->bdev), true);
37753868 break;
37763869 }
37773870 }