2023-12-11 072de836f53be56a70cecf70b43ae43b7ce17376
kernel/mm/swapfile.c
....@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0-only
12 /*
23 * linux/mm/swapfile.c
34 *
....@@ -39,10 +40,10 @@
3940 #include <linux/swap_slots.h>
4041 #include <linux/sort.h>
4142
42
-#include <asm/pgtable.h>
4343 #include <asm/tlbflush.h>
4444 #include <linux/swapops.h>
4545 #include <linux/swap_cgroup.h>
46
+#include <trace/hooks/mm.h>
4647
4748 static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
4849 unsigned char);
....@@ -98,7 +99,7 @@
9899
99100 atomic_t nr_rotate_swap = ATOMIC_INIT(0);
100101
101
-static struct swap_info_struct *swap_type_to_swap_info(int type)
102
+struct swap_info_struct *swap_type_to_swap_info(int type)
102103 {
103104 if (type >= READ_ONCE(nr_swapfiles))
104105 return NULL;
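
The now-exported swap_type_to_swap_info() (continued in the next hunk) guards the swap_info[] dereference with the nr_swapfiles check above and the smp_rmb() shown just below, which pairs with the smp_wmb() in alloc_swap_info(). As a rough userspace analogy only, not kernel code, the same publish/consume ordering can be expressed with C11 release/acquire atomics; every name here is illustrative:

#include <stdatomic.h>
#include <stdio.h>

#define MAX_SLOTS 8

static int *slots[MAX_SLOTS];
static atomic_int nr_slots;

static void publish_slot(int *p)                /* writer side */
{
        int idx = atomic_load_explicit(&nr_slots, memory_order_relaxed);

        slots[idx] = p;                         /* initialise the slot ...  */
        atomic_store_explicit(&nr_slots, idx + 1,
                              memory_order_release); /* ... then publish it */
}

static int *lookup_slot(int type)               /* reader side */
{
        /* acquire pairs with the release above, like smp_rmb()/smp_wmb() */
        if (type >= atomic_load_explicit(&nr_slots, memory_order_acquire))
                return NULL;
        return slots[type];
}

int main(void)
{
        static int payload = 42;

        publish_slot(&payload);
        printf("slot 0: %d\n", *lookup_slot(0));
        printf("slot 1: %s\n", lookup_slot(1) ? "present" : "NULL");
        return 0;
}

In the kernel the reader side additionally uses READ_ONCE() on the array slot; the analogy leans on the acquire load for the same effect.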
....@@ -106,36 +107,62 @@
106107 smp_rmb(); /* Pairs with smp_wmb in alloc_swap_info. */
107108 return READ_ONCE(swap_info[type]);
108109 }
110
+EXPORT_SYMBOL_GPL(swap_type_to_swap_info);
109111
110112 static inline unsigned char swap_count(unsigned char ent)
111113 {
112114 return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */
113115 }
114116
117
+/* Reclaim the swap entry anyway if possible */
118
+#define TTRS_ANYWAY 0x1
119
+/*
120
+ * Reclaim the swap entry if there are no more mappings of the
121
+ * corresponding page
122
+ */
123
+#define TTRS_UNMAPPED 0x2
124
+/* Reclaim the swap entry if swap is getting full*/
125
+#define TTRS_FULL 0x4
126
+
115127 /* returns 1 if swap entry is freed */
116
-static int
117
-__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
128
+static int __try_to_reclaim_swap(struct swap_info_struct *si,
129
+ unsigned long offset, unsigned long flags)
118130 {
119131 swp_entry_t entry = swp_entry(si->type, offset);
120132 struct page *page;
121133 int ret = 0;
122134
123
- page = find_get_page(swap_address_space(entry), swp_offset(entry));
135
+ page = find_get_page(swap_address_space(entry), offset);
124136 if (!page)
125137 return 0;
126138 /*
127
- * This function is called from scan_swap_map() and it's called
128
- * by vmscan.c at reclaiming pages. So, we hold a lock on a page, here.
129
- * We have to use trylock for avoiding deadlock. This is a special
139
+ * When this function is called from scan_swap_map_slots() and it's
140
+ * called by vmscan.c at reclaiming pages. So, we hold a lock on a page,
141
+ * here. We have to use trylock for avoiding deadlock. This is a special
130142 * case and you should use try_to_free_swap() with explicit lock_page()
131143 * in usual operations.
132144 */
133145 if (trylock_page(page)) {
134
- ret = try_to_free_swap(page);
146
+ if ((flags & TTRS_ANYWAY) ||
147
+ ((flags & TTRS_UNMAPPED) && !page_mapped(page)) ||
148
+ ((flags & TTRS_FULL) && mem_cgroup_swap_full(page)))
149
+ ret = try_to_free_swap(page);
135150 unlock_page(page);
136151 }
137152 put_page(page);
138153 return ret;
154
+}
155
+
156
+static inline struct swap_extent *first_se(struct swap_info_struct *sis)
157
+{
158
+ struct rb_node *rb = rb_first(&sis->swap_extent_root);
159
+ return rb_entry(rb, struct swap_extent, rb_node);
160
+}
161
+
162
+static inline struct swap_extent *next_se(struct swap_extent *se)
163
+{
164
+ struct rb_node *rb = rb_next(&se->rb_node);
165
+ return rb ? rb_entry(rb, struct swap_extent, rb_node) : NULL;
139166 }
140167
141168 /*
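
A minimal userspace sketch of the TTRS_* gating that __try_to_reclaim_swap() now applies before calling try_to_free_swap(); the helper below and its boolean arguments are stand-ins for the real page and memcg checks:

#include <stdbool.h>
#include <stdio.h>

#define TTRS_ANYWAY   0x1
#define TTRS_UNMAPPED 0x2
#define TTRS_FULL     0x4

static bool should_reclaim(unsigned long flags, bool page_mapped, bool swap_full)
{
        return (flags & TTRS_ANYWAY) ||
               ((flags & TTRS_UNMAPPED) && !page_mapped) ||
               ((flags & TTRS_FULL) && swap_full);
}

int main(void)
{
        /* free_swap_and_cache() passes TTRS_UNMAPPED | TTRS_FULL: reclaim
         * only once the page is unmapped or swap is getting full. */
        printf("%d\n", should_reclaim(TTRS_UNMAPPED | TTRS_FULL, true,  false)); /* 0 */
        printf("%d\n", should_reclaim(TTRS_UNMAPPED | TTRS_FULL, false, false)); /* 1 */
        /* scan_swap_map_slots() passes TTRS_ANYWAY: always try. */
        printf("%d\n", should_reclaim(TTRS_ANYWAY, true, false));                /* 1 */
        return 0;
}

The callers in this patch pick the policy: scan_swap_map_slots() uses TTRS_ANYWAY when it hits a stale SWAP_HAS_CACHE slot, while free_swap_and_cache() uses TTRS_UNMAPPED | TTRS_FULL so a still-mapped page is kept unless swap is nearly full.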
....@@ -150,7 +177,7 @@
150177 int err = 0;
151178
152179 /* Do not discard the swap header page! */
153
- se = &si->first_swap_extent;
180
+ se = first_se(si);
154181 start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
155182 nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
156183 if (nr_blocks) {
....@@ -161,7 +188,7 @@
161188 cond_resched();
162189 }
163190
164
- list_for_each_entry(se, &si->first_swap_extent.list, list) {
191
+ for (se = next_se(se); se; se = next_se(se)) {
165192 start_block = se->start_block << (PAGE_SHIFT - 9);
166193 nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
167194
....@@ -175,6 +202,39 @@
175202 return err; /* That will often be -EOPNOTSUPP */
176203 }
177204
205
+static struct swap_extent *
206
+offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset)
207
+{
208
+ struct swap_extent *se;
209
+ struct rb_node *rb;
210
+
211
+ rb = sis->swap_extent_root.rb_node;
212
+ while (rb) {
213
+ se = rb_entry(rb, struct swap_extent, rb_node);
214
+ if (offset < se->start_page)
215
+ rb = rb->rb_left;
216
+ else if (offset >= se->start_page + se->nr_pages)
217
+ rb = rb->rb_right;
218
+ else
219
+ return se;
220
+ }
221
+ /* It *must* be present */
222
+ BUG();
223
+}
224
+
225
+sector_t swap_page_sector(struct page *page)
226
+{
227
+ struct swap_info_struct *sis = page_swap_info(page);
228
+ struct swap_extent *se;
229
+ sector_t sector;
230
+ pgoff_t offset;
231
+
232
+ offset = __page_file_index(page);
233
+ se = offset_to_swap_extent(sis, offset);
234
+ sector = se->start_block + (offset - se->start_page);
235
+ return sector << (PAGE_SHIFT - 9);
236
+}
237
+
178238 /*
179239 * swap allocation tell device that a cluster of swap can now be discarded,
180240 * to allow the swap device to optimize its wear-levelling.
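
The rbtree walk in offset_to_swap_extent() and the arithmetic in swap_page_sector() can be sketched in plain C as follows; a sorted array plus binary search stands in for the rbtree, and the extents, PAGE_SHIFT value and offsets are invented for the demo:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12   /* 4 KiB pages, 512-byte sectors */

struct extent {
        unsigned long start_page;
        unsigned long nr_pages;
        uint64_t      start_block;
};

static const struct extent extents[] = {
        { .start_page = 0,   .nr_pages = 100, .start_block = 1000 },
        { .start_page = 100, .nr_pages = 50,  .start_block = 5000 },
};

static const struct extent *offset_to_extent(unsigned long offset)
{
        int lo = 0, hi = (int)(sizeof(extents) / sizeof(extents[0])) - 1;

        while (lo <= hi) {
                int mid = (lo + hi) / 2;
                const struct extent *se = &extents[mid];

                if (offset < se->start_page)
                        hi = mid - 1;                   /* go left  */
                else if (offset >= se->start_page + se->nr_pages)
                        lo = mid + 1;                   /* go right */
                else
                        return se;                      /* found    */
        }
        return NULL;    /* the kernel BUG()s instead: it must be present */
}

int main(void)
{
        unsigned long offset = 120;
        const struct extent *se = offset_to_extent(offset);

        if (!se)
                return 1;
        printf("page offset %lu -> sector %llu\n", offset,
               (unsigned long long)((se->start_block +
                        (offset - se->start_page)) << (PAGE_SHIFT - 9)));
        return 0;
}

The comparisons mirror the kernel walk (descend left below the extent, right at or past its end, otherwise hit), and the sector is start_block plus the offset into the extent, scaled from pages to 512-byte sectors.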
....@@ -182,32 +242,25 @@
182242 static void discard_swap_cluster(struct swap_info_struct *si,
183243 pgoff_t start_page, pgoff_t nr_pages)
184244 {
185
- struct swap_extent *se = si->curr_swap_extent;
186
- int found_extent = 0;
245
+ struct swap_extent *se = offset_to_swap_extent(si, start_page);
187246
188247 while (nr_pages) {
189
- if (se->start_page <= start_page &&
190
- start_page < se->start_page + se->nr_pages) {
191
- pgoff_t offset = start_page - se->start_page;
192
- sector_t start_block = se->start_block + offset;
193
- sector_t nr_blocks = se->nr_pages - offset;
248
+ pgoff_t offset = start_page - se->start_page;
249
+ sector_t start_block = se->start_block + offset;
250
+ sector_t nr_blocks = se->nr_pages - offset;
194251
195
- if (nr_blocks > nr_pages)
196
- nr_blocks = nr_pages;
197
- start_page += nr_blocks;
198
- nr_pages -= nr_blocks;
252
+ if (nr_blocks > nr_pages)
253
+ nr_blocks = nr_pages;
254
+ start_page += nr_blocks;
255
+ nr_pages -= nr_blocks;
199256
200
- if (!found_extent++)
201
- si->curr_swap_extent = se;
257
+ start_block <<= PAGE_SHIFT - 9;
258
+ nr_blocks <<= PAGE_SHIFT - 9;
259
+ if (blkdev_issue_discard(si->bdev, start_block,
260
+ nr_blocks, GFP_NOIO, 0))
261
+ break;
202262
203
- start_block <<= PAGE_SHIFT - 9;
204
- nr_blocks <<= PAGE_SHIFT - 9;
205
- if (blkdev_issue_discard(si->bdev, start_block,
206
- nr_blocks, GFP_NOIO, 0))
207
- break;
208
- }
209
-
210
- se = list_next_entry(se, list);
263
+ se = next_se(se);
211264 }
212265 }
213266
....@@ -562,7 +615,6 @@
562615 {
563616 struct percpu_cluster *cluster;
564617 struct swap_cluster_info *ci;
565
- bool found_free;
566618 unsigned long tmp, max;
567619
568620 new_cluster:
....@@ -575,16 +627,16 @@
575627 } else if (!cluster_list_empty(&si->discard_clusters)) {
576628 /*
577629 * we don't have free cluster but have some clusters in
578
- * discarding, do discard now and reclaim them
630
+ * discarding, do discard now and reclaim them, then
631
+ * reread cluster_next_cpu since we dropped si->lock
579632 */
580633 swap_do_scheduled_discard(si);
581
- *scan_base = *offset = si->cluster_next;
634
+ *scan_base = this_cpu_read(*si->cluster_next_cpu);
635
+ *offset = *scan_base;
582636 goto new_cluster;
583637 } else
584638 return false;
585639 }
586
-
587
- found_free = false;
588640
589641 /*
590642 * Other CPUs can use our cluster if they can't find a free cluster,
....@@ -593,27 +645,23 @@
593645 tmp = cluster->next;
594646 max = min_t(unsigned long, si->max,
595647 (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER);
596
- if (tmp >= max) {
597
- cluster_set_null(&cluster->index);
598
- goto new_cluster;
599
- }
600
- ci = lock_cluster(si, tmp);
601
- while (tmp < max) {
602
- if (!si->swap_map[tmp]) {
603
- found_free = true;
604
- break;
648
+ if (tmp < max) {
649
+ ci = lock_cluster(si, tmp);
650
+ while (tmp < max) {
651
+ if (!si->swap_map[tmp])
652
+ break;
653
+ tmp++;
605654 }
606
- tmp++;
655
+ unlock_cluster(ci);
607656 }
608
- unlock_cluster(ci);
609
- if (!found_free) {
657
+ if (tmp >= max) {
610658 cluster_set_null(&cluster->index);
611659 goto new_cluster;
612660 }
613661 cluster->next = tmp + 1;
614662 *offset = tmp;
615663 *scan_base = tmp;
616
- return found_free;
664
+ return true;
617665 }
618666
619667 static void __del_from_avail_list(struct swap_info_struct *p)
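
The reworked tail of scan_swap_map_try_ssd_cluster() can be modelled like this; a userspace sketch in which the swap_map contents are invented and the cluster locking survives only as comments:

#include <stdbool.h>
#include <stdio.h>

static unsigned char swap_map[16] = {
        1, 1, 1, 0, 1, 0, 0, 1,  1, 1, 1, 1, 1, 1, 1, 1,
};

static bool scan_cluster(unsigned long tmp, unsigned long max,
                         unsigned long *offset)
{
        if (tmp < max) {
                /* kernel: lock_cluster(si, tmp) held around this walk */
                while (tmp < max) {
                        if (!swap_map[tmp])
                                break;
                        tmp++;
                }
                /* kernel: unlock_cluster(ci) */
        }
        if (tmp >= max)
                return false;   /* kernel: cluster_set_null() + goto new_cluster */
        *offset = tmp;
        return true;
}

int main(void)
{
        unsigned long offset;

        if (scan_cluster(0, 8, &offset))
                printf("free slot at %lu\n", offset);   /* prints 3 */
        if (!scan_cluster(8, 16, &offset))
                printf("cluster exhausted, pick a new cluster\n");
        return 0;
}

Falling out with tmp >= max now carries the same information the old found_free flag did; in the kernel that case drops the per-cpu cluster and jumps back to new_cluster rather than returning.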
....@@ -639,7 +687,7 @@
639687 if (offset == si->lowest_bit)
640688 si->lowest_bit += nr_entries;
641689 if (end == si->highest_bit)
642
- si->highest_bit -= nr_entries;
690
+ WRITE_ONCE(si->highest_bit, si->highest_bit - nr_entries);
643691 si->inuse_pages += nr_entries;
644692 if (si->inuse_pages == si->pages) {
645693 si->lowest_bit = si->max;
....@@ -663,19 +711,23 @@
663711 static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
664712 unsigned int nr_entries)
665713 {
714
+ unsigned long begin = offset;
666715 unsigned long end = offset + nr_entries - 1;
667716 void (*swap_slot_free_notify)(struct block_device *, unsigned long);
717
+ bool skip = false;
668718
669719 if (offset < si->lowest_bit)
670720 si->lowest_bit = offset;
671721 if (end > si->highest_bit) {
672722 bool was_full = !si->highest_bit;
673723
674
- si->highest_bit = end;
724
+ WRITE_ONCE(si->highest_bit, end);
675725 if (was_full && (si->flags & SWP_WRITEOK))
676726 add_to_avail_list(si);
677727 }
678
- atomic_long_add(nr_entries, &nr_swap_pages);
728
+ trace_android_vh_account_swap_pages(si, &skip);
729
+ if (!skip)
730
+ atomic_long_add(nr_entries, &nr_swap_pages);
679731 si->inuse_pages -= nr_entries;
680732 if (si->flags & SWP_BLKDEV)
681733 swap_slot_free_notify =
....@@ -683,14 +735,44 @@
683735 else
684736 swap_slot_free_notify = NULL;
685737 while (offset <= end) {
738
+ arch_swap_invalidate_page(si->type, offset);
686739 frontswap_invalidate_page(si->type, offset);
687740 if (swap_slot_free_notify)
688741 swap_slot_free_notify(si->bdev, offset);
689742 offset++;
690743 }
744
+ clear_shadow_from_swap_cache(si->type, begin, end);
691745 }
692746
693
-static int scan_swap_map_slots(struct swap_info_struct *si,
747
+static void set_cluster_next(struct swap_info_struct *si, unsigned long next)
748
+{
749
+ unsigned long prev;
750
+
751
+ if (!(si->flags & SWP_SOLIDSTATE)) {
752
+ si->cluster_next = next;
753
+ return;
754
+ }
755
+
756
+ prev = this_cpu_read(*si->cluster_next_cpu);
757
+ /*
758
+ * Cross the swap address space size aligned trunk, choose
759
+ * another trunk randomly to avoid lock contention on swap
760
+ * address space if possible.
761
+ */
762
+ if ((prev >> SWAP_ADDRESS_SPACE_SHIFT) !=
763
+ (next >> SWAP_ADDRESS_SPACE_SHIFT)) {
764
+ /* No free swap slots available */
765
+ if (si->highest_bit <= si->lowest_bit)
766
+ return;
767
+ next = si->lowest_bit +
768
+ prandom_u32_max(si->highest_bit - si->lowest_bit + 1);
769
+ next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES);
770
+ next = max_t(unsigned int, next, si->lowest_bit);
771
+ }
772
+ this_cpu_write(*si->cluster_next_cpu, next);
773
+}
774
+
775
+int scan_swap_map_slots(struct swap_info_struct *si,
694776 unsigned char usage, int nr,
695777 swp_entry_t slots[])
696778 {
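
The trunk-hopping in the set_cluster_next() helper added in this hunk is mostly arithmetic. A hedged userspace sketch; the SWAP_ADDRESS_SPACE_* values are illustrative and rand() stands in for prandom_u32_max():

#include <stdio.h>
#include <stdlib.h>

#define SWAP_ADDRESS_SPACE_SHIFT 14
#define SWAP_ADDRESS_SPACE_PAGES (1UL << SWAP_ADDRESS_SPACE_SHIFT)
#define ALIGN_DOWN(x, a)         ((x) & ~((a) - 1))

static unsigned long pick_next(unsigned long prev, unsigned long next,
                               unsigned long lowest, unsigned long highest)
{
        if ((prev >> SWAP_ADDRESS_SPACE_SHIFT) !=
            (next >> SWAP_ADDRESS_SPACE_SHIFT)) {
                if (highest <= lowest)
                        return prev;    /* no free slots: keep the old value */
                next = lowest + rand() % (highest - lowest + 1);
                next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES);
                if (next < lowest)      /* kernel: max_t(..., lowest_bit) */
                        next = lowest;
        }
        return next;
}

int main(void)
{
        srand(1);
        printf("same trunk:    %lu\n", pick_next(100, 101, 1, 1 << 20));
        printf("crossed trunk: %lu\n", pick_next(16383, 16384, 1, 1 << 20));
        return 0;
}

Keeping each CPU's next offset inside one SWAP_ADDRESS_SPACE_PAGES trunk means different CPUs tend to land in different swap-cache address spaces, which is the lock-contention point the comment in set_cluster_next() mentions.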
....@@ -700,9 +782,7 @@
700782 unsigned long last_in_cluster = 0;
701783 int latency_ration = LATENCY_LIMIT;
702784 int n_ret = 0;
703
-
704
- if (nr > SWAP_BATCH)
705
- nr = SWAP_BATCH;
785
+ bool scanned_many = false;
706786
707787 /*
708788 * We try to cluster swap pages by allocating them sequentially
....@@ -716,17 +796,22 @@
716796 */
717797
718798 si->flags += SWP_SCANNING;
719
- scan_base = offset = si->cluster_next;
799
+ /*
800
+ * Use percpu scan base for SSD to reduce lock contention on
801
+ * cluster and swap cache. For HDD, sequential access is more
802
+ * important.
803
+ */
804
+ if (si->flags & SWP_SOLIDSTATE)
805
+ scan_base = this_cpu_read(*si->cluster_next_cpu);
806
+ else
807
+ scan_base = si->cluster_next;
808
+ offset = scan_base;
720809
721810 /* SSD algorithm */
722811 if (si->cluster_info) {
723
- if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
724
- goto checks;
725
- else
812
+ if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
726813 goto scan;
727
- }
728
-
729
- if (unlikely(!si->cluster_nr--)) {
814
+ } else if (unlikely(!si->cluster_nr--)) {
730815 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
731816 si->cluster_nr = SWAPFILE_CLUSTER - 1;
732817 goto checks;
....@@ -789,7 +874,7 @@
789874 int swap_was_freed;
790875 unlock_cluster(ci);
791876 spin_unlock(&si->lock);
792
- swap_was_freed = __try_to_reclaim_swap(si, offset);
877
+ swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
793878 spin_lock(&si->lock);
794879 /* entry was freed successfully, try to use this again */
795880 if (swap_was_freed)
....@@ -804,12 +889,11 @@
804889 else
805890 goto done;
806891 }
807
- si->swap_map[offset] = usage;
892
+ WRITE_ONCE(si->swap_map[offset], usage);
808893 inc_cluster_info_page(si, si->cluster_info, offset);
809894 unlock_cluster(ci);
810895
811896 swap_range_alloc(si, offset, 1);
812
- si->cluster_next = offset + 1;
813897 slots[n_ret++] = swp_entry(si->type, offset);
814898
815899 /* got enough slots or reach max slots? */
....@@ -832,51 +916,69 @@
832916 if (si->cluster_info) {
833917 if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
834918 goto checks;
835
- else
836
- goto done;
837
- }
838
- /* non-ssd case */
839
- ++offset;
840
-
841
- /* non-ssd case, still more slots in cluster? */
842
- if (si->cluster_nr && !si->swap_map[offset]) {
919
+ } else if (si->cluster_nr && !si->swap_map[++offset]) {
920
+ /* non-ssd case, still more slots in cluster? */
843921 --si->cluster_nr;
844922 goto checks;
845923 }
846924
925
+ /*
926
+ * Even if there's no free clusters available (fragmented),
927
+ * try to scan a little more quickly with lock held unless we
928
+ * have scanned too many slots already.
929
+ */
930
+ if (!scanned_many) {
931
+ unsigned long scan_limit;
932
+
933
+ if (offset < scan_base)
934
+ scan_limit = scan_base;
935
+ else
936
+ scan_limit = si->highest_bit;
937
+ for (; offset <= scan_limit && --latency_ration > 0;
938
+ offset++) {
939
+ if (!si->swap_map[offset])
940
+ goto checks;
941
+ }
942
+ }
943
+
847944 done:
945
+ set_cluster_next(si, offset + 1);
848946 si->flags -= SWP_SCANNING;
849947 return n_ret;
850948
851949 scan:
852950 spin_unlock(&si->lock);
853
- while (++offset <= si->highest_bit) {
854
- if (!si->swap_map[offset]) {
951
+ while (++offset <= READ_ONCE(si->highest_bit)) {
952
+ if (data_race(!si->swap_map[offset])) {
855953 spin_lock(&si->lock);
856954 goto checks;
857955 }
858
- if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
956
+ if (vm_swap_full() &&
957
+ READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
859958 spin_lock(&si->lock);
860959 goto checks;
861960 }
862961 if (unlikely(--latency_ration < 0)) {
863962 cond_resched();
864963 latency_ration = LATENCY_LIMIT;
964
+ scanned_many = true;
865965 }
866966 }
867967 offset = si->lowest_bit;
868968 while (offset < scan_base) {
869
- if (!si->swap_map[offset]) {
969
+ if (data_race(!si->swap_map[offset])) {
870970 spin_lock(&si->lock);
871971 goto checks;
872972 }
873
- if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
973
+ if (vm_swap_full() &&
974
+ READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
874975 spin_lock(&si->lock);
875976 goto checks;
876977 }
877978 if (unlikely(--latency_ration < 0)) {
878979 cond_resched();
879980 latency_ration = LATENCY_LIMIT;
981
+ scanned_many = true;
880982 }
881983 offset++;
882984 }
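
The scan loop above now peeks at swap_map[] without si->lock (the READ_ONCE()/data_race() annotations) and only trusts the value after retaking the lock at the checks: label. A userspace sketch of that peek-then-recheck pattern, with relaxed C11 atomic loads standing in for READ_ONCE(); all names are illustrative:

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NSLOTS 8

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static _Atomic unsigned char swap_map[NSLOTS];

static bool claim_free_slot(unsigned int *out)
{
        for (unsigned int i = 0; i < NSLOTS; i++) {
                /* cheap unlocked peek, may be stale (kernel: data_race()) */
                if (atomic_load_explicit(&swap_map[i], memory_order_relaxed))
                        continue;

                pthread_mutex_lock(&lock);
                if (swap_map[i] == 0) {         /* recheck under the lock */
                        swap_map[i] = 1;
                        pthread_mutex_unlock(&lock);
                        *out = i;
                        return true;
                }
                pthread_mutex_unlock(&lock);
        }
        return false;
}

int main(void)
{
        unsigned int slot;

        swap_map[0] = 1;
        if (claim_free_slot(&slot))
                printf("claimed slot %u\n", slot);      /* prints 1 */
        return 0;
}

The unlocked read may be stale in either direction; that is acceptable because the slot is re-validated under the lock before it is actually claimed.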
....@@ -886,8 +988,9 @@
886988 si->flags -= SWP_SCANNING;
887989 return n_ret;
888990 }
991
+EXPORT_SYMBOL_GPL(scan_swap_map_slots);
889992
890
-static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
993
+int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
891994 {
892995 unsigned long idx;
893996 struct swap_cluster_info *ci;
....@@ -921,6 +1024,7 @@
9211024
9221025 return 1;
9231026 }
1027
+EXPORT_SYMBOL_GPL(swap_alloc_cluster);
9241028
9251029 static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
9261030 {
....@@ -928,6 +1032,7 @@
9281032 struct swap_cluster_info *ci;
9291033
9301034 ci = lock_cluster(si, offset);
1035
+ memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
9311036 cluster_set_count_flag(ci, 0, 0);
9321037 free_cluster(si, idx);
9331038 unlock_cluster(ci);
....@@ -960,19 +1065,17 @@
9601065 /* Only single cluster request supported */
9611066 WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER);
9621067
1068
+ spin_lock(&swap_avail_lock);
1069
+
9631070 avail_pgs = atomic_long_read(&nr_swap_pages) / size;
964
- if (avail_pgs <= 0)
1071
+ if (avail_pgs <= 0) {
1072
+ spin_unlock(&swap_avail_lock);
9651073 goto noswap;
1074
+ }
9661075
967
- if (n_goal > SWAP_BATCH)
968
- n_goal = SWAP_BATCH;
969
-
970
- if (n_goal > avail_pgs)
971
- n_goal = avail_pgs;
1076
+ n_goal = min3((long)n_goal, (long)SWAP_BATCH, avail_pgs);
9721077
9731078 atomic_long_sub(n_goal * size, &nr_swap_pages);
974
-
975
- spin_lock(&swap_avail_lock);
9761079
9771080 start_over:
9781081 node = numa_node_id();
....@@ -1041,20 +1144,22 @@
10411144 {
10421145 struct swap_info_struct *si = swap_type_to_swap_info(type);
10431146 pgoff_t offset;
1147
+ bool skip = false;
10441148
10451149 if (!si)
10461150 goto fail;
10471151
10481152 spin_lock(&si->lock);
10491153 if (si->flags & SWP_WRITEOK) {
1050
- atomic_long_dec(&nr_swap_pages);
10511154 /* This is called for allocating swap entry, not cache */
10521155 offset = scan_swap_map(si, 1);
10531156 if (offset) {
1157
+ trace_android_vh_account_swap_pages(si, &skip);
1158
+ if (!skip)
1159
+ atomic_long_dec(&nr_swap_pages);
10541160 spin_unlock(&si->lock);
10551161 return swp_entry(type, offset);
10561162 }
1057
- atomic_long_inc(&nr_swap_pages);
10581163 }
10591164 spin_unlock(&si->lock);
10601165 fail:
....@@ -1064,15 +1169,14 @@
10641169 static struct swap_info_struct *__swap_info_get(swp_entry_t entry)
10651170 {
10661171 struct swap_info_struct *p;
1067
- unsigned long offset, type;
1172
+ unsigned long offset;
10681173
10691174 if (!entry.val)
10701175 goto out;
1071
- type = swp_type(entry);
1072
- p = swap_type_to_swap_info(type);
1176
+ p = swp_swap_info(entry);
10731177 if (!p)
10741178 goto bad_nofile;
1075
- if (!(p->flags & SWP_USED))
1179
+ if (data_race(!(p->flags & SWP_USED)))
10761180 goto bad_device;
10771181 offset = swp_offset(entry);
10781182 if (offset >= p->max)
....@@ -1098,13 +1202,12 @@
10981202 p = __swap_info_get(entry);
10991203 if (!p)
11001204 goto out;
1101
- if (!p->swap_map[swp_offset(entry)])
1205
+ if (data_race(!p->swap_map[swp_offset(entry)]))
11021206 goto bad_free;
11031207 return p;
11041208
11051209 bad_free:
11061210 pr_err("swap_info_get: %s%08lx\n", Unused_offset, entry.val);
1107
- goto out;
11081211 out:
11091212 return NULL;
11101213 }
....@@ -1167,20 +1270,89 @@
11671270 }
11681271
11691272 usage = count | has_cache;
1170
- p->swap_map[offset] = usage ? : SWAP_HAS_CACHE;
1273
+ if (usage)
1274
+ WRITE_ONCE(p->swap_map[offset], usage);
1275
+ else
1276
+ WRITE_ONCE(p->swap_map[offset], SWAP_HAS_CACHE);
11711277
11721278 return usage;
11731279 }
11741280
1281
+/*
1282
+ * Check whether swap entry is valid in the swap device. If so,
1283
+ * return pointer to swap_info_struct, and keep the swap entry valid
1284
+ * via preventing the swap device from being swapoff, until
1285
+ * put_swap_device() is called. Otherwise return NULL.
1286
+ *
1287
+ * The entirety of the RCU read critical section must come before the
1288
+ * return from or after the call to synchronize_rcu() in
1289
+ * enable_swap_info() or swapoff(). So if "si->flags & SWP_VALID" is
1290
+ * true, the si->map, si->cluster_info, etc. must be valid in the
1291
+ * critical section.
1292
+ *
1293
+ * Notice that swapoff or swapoff+swapon can still happen before the
1294
+ * rcu_read_lock() in get_swap_device() or after the rcu_read_unlock()
1295
+ * in put_swap_device() if there isn't any other way to prevent
1296
+ * swapoff, such as page lock, page table lock, etc. The caller must
1297
+ * be prepared for that. For example, the following situation is
1298
+ * possible.
1299
+ *
1300
+ * CPU1 CPU2
1301
+ * do_swap_page()
1302
+ * ... swapoff+swapon
1303
+ * __read_swap_cache_async()
1304
+ * swapcache_prepare()
1305
+ * __swap_duplicate()
1306
+ * // check swap_map
1307
+ * // verify PTE not changed
1308
+ *
1309
+ * In __swap_duplicate(), the swap_map need to be checked before
1310
+ * changing partly because the specified swap entry may be for another
1311
+ * swap device which has been swapoff. And in do_swap_page(), after
1312
+ * the page is read from the swap device, the PTE is verified not
1313
+ * changed with the page table locked to check whether the swap device
1314
+ * has been swapoff or swapoff+swapon.
1315
+ */
1316
+struct swap_info_struct *get_swap_device(swp_entry_t entry)
1317
+{
1318
+ struct swap_info_struct *si;
1319
+ unsigned long offset;
1320
+
1321
+ if (!entry.val)
1322
+ goto out;
1323
+ si = swp_swap_info(entry);
1324
+ if (!si)
1325
+ goto bad_nofile;
1326
+
1327
+ rcu_read_lock();
1328
+ if (data_race(!(si->flags & SWP_VALID)))
1329
+ goto unlock_out;
1330
+ offset = swp_offset(entry);
1331
+ if (offset >= si->max)
1332
+ goto unlock_out;
1333
+
1334
+ return si;
1335
+bad_nofile:
1336
+ pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
1337
+out:
1338
+ return NULL;
1339
+unlock_out:
1340
+ rcu_read_unlock();
1341
+ return NULL;
1342
+}
1343
+
11751344 static unsigned char __swap_entry_free(struct swap_info_struct *p,
1176
- swp_entry_t entry, unsigned char usage)
1345
+ swp_entry_t entry)
11771346 {
11781347 struct swap_cluster_info *ci;
11791348 unsigned long offset = swp_offset(entry);
1349
+ unsigned char usage;
11801350
11811351 ci = lock_cluster_or_swap_info(p, offset);
1182
- usage = __swap_entry_free_locked(p, offset, usage);
1352
+ usage = __swap_entry_free_locked(p, offset, 1);
11831353 unlock_cluster_or_swap_info(p, ci);
1354
+ if (!usage)
1355
+ free_swap_slot(entry);
11841356
11851357 return usage;
11861358 }
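
For reference, the swap_map byte that __swap_entry_free_locked() now stores back with WRITE_ONCE() packs a reference count together with SWAP_HAS_CACHE (0x40 in include/linux/swap.h). A simplified sketch that ignores count continuation and the SWAP_MAP_SHMEM special case:

#include <stdio.h>

#define SWAP_HAS_CACHE 0x40     /* value from include/linux/swap.h */

static unsigned char swap_count(unsigned char ent)
{
        return ent & ~SWAP_HAS_CACHE;   /* may include COUNT_CONTINUED */
}

int main(void)
{
        unsigned char ent = 2 | SWAP_HAS_CACHE; /* two map users + swap cache */

        printf("count=%u has_cache=%d\n",
               swap_count(ent), !!(ent & SWAP_HAS_CACHE));

        /* __swap_entry_free(p, entry) drops one map reference: */
        unsigned char usage = (swap_count(ent) - 1) | (ent & SWAP_HAS_CACHE);

        /* usage is non-zero, so the kernel stores it back with WRITE_ONCE() */
        printf("after free: count=%u has_cache=%d\n",
               swap_count(usage), !!(usage & SWAP_HAS_CACHE));
        return 0;
}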
....@@ -1211,10 +1383,8 @@
12111383 struct swap_info_struct *p;
12121384
12131385 p = _swap_info_get(entry);
1214
- if (p) {
1215
- if (!__swap_entry_free(p, entry, 1))
1216
- free_swap_slot(entry);
1217
- }
1386
+ if (p)
1387
+ __swap_entry_free(p, entry);
12181388 }
12191389
12201390 /*
....@@ -1229,7 +1399,7 @@
12291399 unsigned char *map;
12301400 unsigned int i, free_entries = 0;
12311401 unsigned char val;
1232
- int size = swap_entry_size(hpage_nr_pages(page));
1402
+ int size = swap_entry_size(thp_nr_pages(page));
12331403
12341404 si = _swap_info_get(entry);
12351405 if (!si)
....@@ -1249,9 +1419,6 @@
12491419 if (free_entries == SWAPFILE_CLUSTER) {
12501420 unlock_cluster_or_swap_info(si, ci);
12511421 spin_lock(&si->lock);
1252
- ci = lock_cluster(si, offset);
1253
- memset(map, 0, SWAPFILE_CLUSTER);
1254
- unlock_cluster(ci);
12551422 mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
12561423 swap_free_cluster(si, idx);
12571424 spin_unlock(&si->lock);
....@@ -1321,6 +1488,7 @@
13211488 if (p)
13221489 spin_unlock(&p->lock);
13231490 }
1491
+EXPORT_SYMBOL_GPL(swapcache_free_entries);
13241492
13251493 /*
13261494 * How many references to page are currently swapped out?
....@@ -1346,11 +1514,18 @@
13461514 return count;
13471515 }
13481516
1349
-int __swap_count(struct swap_info_struct *si, swp_entry_t entry)
1517
+int __swap_count(swp_entry_t entry)
13501518 {
1519
+ struct swap_info_struct *si;
13511520 pgoff_t offset = swp_offset(entry);
1521
+ int count = 0;
13521522
1353
- return swap_count(si->swap_map[offset]);
1523
+ si = get_swap_device(entry);
1524
+ if (si) {
1525
+ count = swap_count(si->swap_map[offset]);
1526
+ put_swap_device(si);
1527
+ }
1528
+ return count;
13541529 }
13551530
13561531 static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
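
__swap_count() above is the simplest in-tree user of the new get_swap_device()/put_swap_device() pair. Below is a heavily mocked userspace sketch of that calling pattern; the structure, the SWP_VALID bit value and the offset-based signature are stand-ins, and the RCU protection survives only as comments:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define SWP_VALID 0x1           /* stand-in flag, not the kernel's value */

struct swap_info {
        unsigned long flags;
        unsigned long max;
        unsigned char swap_map[128];
};

static struct swap_info si_mock = { .flags = SWP_VALID, .max = 128 };

static struct swap_info *get_swap_device(unsigned long offset)
{
        struct swap_info *si = &si_mock;        /* kernel: swp_swap_info()  */

        /* kernel: rcu_read_lock() here                                     */
        if (!(si->flags & SWP_VALID) || offset >= si->max)
                return NULL;    /* kernel unlocks RCU before returning NULL */
        return si;
}

static void put_swap_device(struct swap_info *si)
{
        (void)si;                               /* kernel: rcu_read_unlock() */
}

static int count_at(unsigned long offset)
{
        struct swap_info *si = get_swap_device(offset);
        int count = 0;

        if (si) {
                count = si->swap_map[offset];   /* arrays cannot vanish here */
                put_swap_device(si);
        }
        return count;
}

int main(void)
{
        si_mock.swap_map[5] = 2;
        printf("count at offset 5: %d\n", count_at(5));
        printf("count at offset 500: %d\n", count_at(500));
        return 0;
}

The pattern only guarantees that si and its arrays cannot be freed by swapoff between the two calls; it does not pin the individual entry, which still needs the checks described in the get_swap_device() comment.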
....@@ -1375,9 +1550,11 @@
13751550 int count = 0;
13761551 struct swap_info_struct *si;
13771552
1378
- si = __swap_info_get(entry);
1379
- if (si)
1553
+ si = get_swap_device(entry);
1554
+ if (si) {
13801555 count = swap_swapcount(si, entry);
1556
+ put_swap_device(si);
1557
+ }
13811558 return count;
13821559 }
13831560
....@@ -1624,7 +1801,6 @@
16241801 int free_swap_and_cache(swp_entry_t entry)
16251802 {
16261803 struct swap_info_struct *p;
1627
- struct page *page = NULL;
16281804 unsigned char count;
16291805
16301806 if (non_swap_entry(entry))
....@@ -1632,32 +1808,11 @@
16321808
16331809 p = _swap_info_get(entry);
16341810 if (p) {
1635
- count = __swap_entry_free(p, entry, 1);
1811
+ count = __swap_entry_free(p, entry);
16361812 if (count == SWAP_HAS_CACHE &&
1637
- !swap_page_trans_huge_swapped(p, entry)) {
1638
- page = find_get_page(swap_address_space(entry),
1639
- swp_offset(entry));
1640
- if (page && !trylock_page(page)) {
1641
- put_page(page);
1642
- page = NULL;
1643
- }
1644
- } else if (!count)
1645
- free_swap_slot(entry);
1646
- }
1647
- if (page) {
1648
- /*
1649
- * Not mapped elsewhere, or swap space full? Free it!
1650
- * Also recheck PageSwapCache now page is locked (above).
1651
- */
1652
- if (PageSwapCache(page) && !PageWriteback(page) &&
1653
- (!page_mapped(page) || mem_cgroup_swap_full(page)) &&
1654
- !swap_page_trans_huge_swapped(p, entry)) {
1655
- page = compound_head(page);
1656
- delete_from_swap_cache(page);
1657
- SetPageDirty(page);
1658
- }
1659
- unlock_page(page);
1660
- put_page(page);
1813
+ !swap_page_trans_huge_swapped(p, entry))
1814
+ __try_to_reclaim_swap(p, swp_offset(entry),
1815
+ TTRS_UNMAPPED | TTRS_FULL);
16611816 }
16621817 return p != NULL;
16631818 }
....@@ -1671,13 +1826,12 @@
16711826 *
16721827 * This is needed for the suspend to disk (aka swsusp).
16731828 */
1674
-int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
1829
+int swap_type_of(dev_t device, sector_t offset)
16751830 {
1676
- struct block_device *bdev = NULL;
16771831 int type;
16781832
1679
- if (device)
1680
- bdev = bdget(device);
1833
+ if (!device)
1834
+ return -1;
16811835
16821836 spin_lock(&swap_lock);
16831837 for (type = 0; type < nr_swapfiles; type++) {
....@@ -1686,30 +1840,34 @@
16861840 if (!(sis->flags & SWP_WRITEOK))
16871841 continue;
16881842
1689
- if (!bdev) {
1690
- if (bdev_p)
1691
- *bdev_p = bdgrab(sis->bdev);
1692
-
1693
- spin_unlock(&swap_lock);
1694
- return type;
1695
- }
1696
- if (bdev == sis->bdev) {
1697
- struct swap_extent *se = &sis->first_swap_extent;
1843
+ if (device == sis->bdev->bd_dev) {
1844
+ struct swap_extent *se = first_se(sis);
16981845
16991846 if (se->start_block == offset) {
1700
- if (bdev_p)
1701
- *bdev_p = bdgrab(sis->bdev);
1702
-
17031847 spin_unlock(&swap_lock);
1704
- bdput(bdev);
17051848 return type;
17061849 }
17071850 }
17081851 }
17091852 spin_unlock(&swap_lock);
1710
- if (bdev)
1711
- bdput(bdev);
1853
+ return -ENODEV;
1854
+}
17121855
1856
+int find_first_swap(dev_t *device)
1857
+{
1858
+ int type;
1859
+
1860
+ spin_lock(&swap_lock);
1861
+ for (type = 0; type < nr_swapfiles; type++) {
1862
+ struct swap_info_struct *sis = swap_info[type];
1863
+
1864
+ if (!(sis->flags & SWP_WRITEOK))
1865
+ continue;
1866
+ *device = sis->bdev->bd_dev;
1867
+ spin_unlock(&swap_lock);
1868
+ return type;
1869
+ }
1870
+ spin_unlock(&swap_lock);
17131871 return -ENODEV;
17141872 }
17151873
....@@ -1756,7 +1914,7 @@
17561914
17571915 static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte)
17581916 {
1759
- return pte_same(pte_swp_clear_soft_dirty(pte), swp_pte);
1917
+ return pte_same(pte_swp_clear_flags(pte), swp_pte);
17601918 }
17611919
17621920 /*
....@@ -1768,7 +1926,6 @@
17681926 unsigned long addr, swp_entry_t entry, struct page *page)
17691927 {
17701928 struct page *swapcache;
1771
- struct mem_cgroup *memcg;
17721929 spinlock_t *ptl;
17731930 pte_t *pte;
17741931 int ret = 1;
....@@ -1778,15 +1935,8 @@
17781935 if (unlikely(!page))
17791936 return -ENOMEM;
17801937
1781
- if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL,
1782
- &memcg, false)) {
1783
- ret = -ENOMEM;
1784
- goto out_nolock;
1785
- }
1786
-
17871938 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
17881939 if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) {
1789
- mem_cgroup_cancel_charge(page, memcg, false);
17901940 ret = 0;
17911941 goto out;
17921942 }
....@@ -1798,21 +1948,13 @@
17981948 pte_mkold(mk_pte(page, vma->vm_page_prot)));
17991949 if (page == swapcache) {
18001950 page_add_anon_rmap(page, vma, addr, false);
1801
- mem_cgroup_commit_charge(page, memcg, true, false);
18021951 } else { /* ksm created a completely new copy */
18031952 page_add_new_anon_rmap(page, vma, addr, false);
1804
- mem_cgroup_commit_charge(page, memcg, false, false);
1805
- lru_cache_add_active_or_unevictable(page, vma);
1953
+ lru_cache_add_inactive_or_unevictable(page, vma);
18061954 }
18071955 swap_free(entry);
1808
- /*
1809
- * Move the page to the active list so it is not
1810
- * immediately swapped out again after swapon.
1811
- */
1812
- activate_page(page);
18131956 out:
18141957 pte_unmap_unlock(pte, ptl);
1815
-out_nolock:
18161958 if (page != swapcache) {
18171959 unlock_page(page);
18181960 put_page(page);
....@@ -1821,44 +1963,83 @@
18211963 }
18221964
18231965 static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
1824
- unsigned long addr, unsigned long end,
1825
- swp_entry_t entry, struct page *page)
1966
+ unsigned long addr, unsigned long end,
1967
+ unsigned int type, bool frontswap,
1968
+ unsigned long *fs_pages_to_unuse)
18261969 {
1827
- pte_t swp_pte = swp_entry_to_pte(entry);
1970
+ struct page *page;
1971
+ swp_entry_t entry;
18281972 pte_t *pte;
1973
+ struct swap_info_struct *si;
1974
+ unsigned long offset;
18291975 int ret = 0;
1976
+ volatile unsigned char *swap_map;
18301977
1831
- /*
1832
- * We don't actually need pte lock while scanning for swp_pte: since
1833
- * we hold page lock and mmap_sem, swp_pte cannot be inserted into the
1834
- * page table while we're scanning; though it could get zapped, and on
1835
- * some architectures (e.g. x86_32 with PAE) we might catch a glimpse
1836
- * of unmatched parts which look like swp_pte, so unuse_pte must
1837
- * recheck under pte lock. Scanning without pte lock lets it be
1838
- * preemptable whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE.
1839
- */
1978
+ si = swap_info[type];
18401979 pte = pte_offset_map(pmd, addr);
18411980 do {
1842
- /*
1843
- * swapoff spends a _lot_ of time in this loop!
1844
- * Test inline before going to call unuse_pte.
1845
- */
1846
- if (unlikely(pte_same_as_swp(*pte, swp_pte))) {
1847
- pte_unmap(pte);
1848
- ret = unuse_pte(vma, pmd, addr, entry, page);
1849
- if (ret)
1850
- goto out;
1851
- pte = pte_offset_map(pmd, addr);
1981
+ if (!is_swap_pte(*pte))
1982
+ continue;
1983
+
1984
+ entry = pte_to_swp_entry(*pte);
1985
+ if (swp_type(entry) != type)
1986
+ continue;
1987
+
1988
+ offset = swp_offset(entry);
1989
+ if (frontswap && !frontswap_test(si, offset))
1990
+ continue;
1991
+
1992
+ pte_unmap(pte);
1993
+ swap_map = &si->swap_map[offset];
1994
+ page = lookup_swap_cache(entry, vma, addr);
1995
+ if (!page) {
1996
+ struct vm_fault vmf = {
1997
+ .vma = vma,
1998
+ .address = addr,
1999
+ .pmd = pmd,
2000
+ };
2001
+
2002
+ page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
2003
+ &vmf);
18522004 }
2005
+ if (!page) {
2006
+ if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD)
2007
+ goto try_next;
2008
+ return -ENOMEM;
2009
+ }
2010
+
2011
+ lock_page(page);
2012
+ wait_on_page_writeback(page);
2013
+ ret = unuse_pte(vma, pmd, addr, entry, page);
2014
+ if (ret < 0) {
2015
+ unlock_page(page);
2016
+ put_page(page);
2017
+ goto out;
2018
+ }
2019
+
2020
+ try_to_free_swap(page);
2021
+ trace_android_vh_unuse_swap_page(si, page);
2022
+ unlock_page(page);
2023
+ put_page(page);
2024
+
2025
+ if (*fs_pages_to_unuse && !--(*fs_pages_to_unuse)) {
2026
+ ret = FRONTSWAP_PAGES_UNUSED;
2027
+ goto out;
2028
+ }
2029
+try_next:
2030
+ pte = pte_offset_map(pmd, addr);
18532031 } while (pte++, addr += PAGE_SIZE, addr != end);
18542032 pte_unmap(pte - 1);
2033
+
2034
+ ret = 0;
18552035 out:
18562036 return ret;
18572037 }
18582038
18592039 static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
18602040 unsigned long addr, unsigned long end,
1861
- swp_entry_t entry, struct page *page)
2041
+ unsigned int type, bool frontswap,
2042
+ unsigned long *fs_pages_to_unuse)
18622043 {
18632044 pmd_t *pmd;
18642045 unsigned long next;
....@@ -1870,7 +2051,8 @@
18702051 next = pmd_addr_end(addr, end);
18712052 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
18722053 continue;
1873
- ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
2054
+ ret = unuse_pte_range(vma, pmd, addr, next, type,
2055
+ frontswap, fs_pages_to_unuse);
18742056 if (ret)
18752057 return ret;
18762058 } while (pmd++, addr = next, addr != end);
....@@ -1879,7 +2061,8 @@
18792061
18802062 static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
18812063 unsigned long addr, unsigned long end,
1882
- swp_entry_t entry, struct page *page)
2064
+ unsigned int type, bool frontswap,
2065
+ unsigned long *fs_pages_to_unuse)
18832066 {
18842067 pud_t *pud;
18852068 unsigned long next;
....@@ -1890,7 +2073,8 @@
18902073 next = pud_addr_end(addr, end);
18912074 if (pud_none_or_clear_bad(pud))
18922075 continue;
1893
- ret = unuse_pmd_range(vma, pud, addr, next, entry, page);
2076
+ ret = unuse_pmd_range(vma, pud, addr, next, type,
2077
+ frontswap, fs_pages_to_unuse);
18942078 if (ret)
18952079 return ret;
18962080 } while (pud++, addr = next, addr != end);
....@@ -1899,7 +2083,8 @@
18992083
19002084 static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
19012085 unsigned long addr, unsigned long end,
1902
- swp_entry_t entry, struct page *page)
2086
+ unsigned int type, bool frontswap,
2087
+ unsigned long *fs_pages_to_unuse)
19032088 {
19042089 p4d_t *p4d;
19052090 unsigned long next;
....@@ -1910,78 +2095,66 @@
19102095 next = p4d_addr_end(addr, end);
19112096 if (p4d_none_or_clear_bad(p4d))
19122097 continue;
1913
- ret = unuse_pud_range(vma, p4d, addr, next, entry, page);
2098
+ ret = unuse_pud_range(vma, p4d, addr, next, type,
2099
+ frontswap, fs_pages_to_unuse);
19142100 if (ret)
19152101 return ret;
19162102 } while (p4d++, addr = next, addr != end);
19172103 return 0;
19182104 }
19192105
1920
-static int unuse_vma(struct vm_area_struct *vma,
1921
- swp_entry_t entry, struct page *page)
2106
+static int unuse_vma(struct vm_area_struct *vma, unsigned int type,
2107
+ bool frontswap, unsigned long *fs_pages_to_unuse)
19222108 {
19232109 pgd_t *pgd;
19242110 unsigned long addr, end, next;
19252111 int ret;
19262112
1927
- if (page_anon_vma(page)) {
1928
- addr = page_address_in_vma(page, vma);
1929
- if (addr == -EFAULT)
1930
- return 0;
1931
- else
1932
- end = addr + PAGE_SIZE;
1933
- } else {
1934
- addr = vma->vm_start;
1935
- end = vma->vm_end;
1936
- }
2113
+ addr = vma->vm_start;
2114
+ end = vma->vm_end;
19372115
19382116 pgd = pgd_offset(vma->vm_mm, addr);
19392117 do {
19402118 next = pgd_addr_end(addr, end);
19412119 if (pgd_none_or_clear_bad(pgd))
19422120 continue;
1943
- ret = unuse_p4d_range(vma, pgd, addr, next, entry, page);
2121
+ ret = unuse_p4d_range(vma, pgd, addr, next, type,
2122
+ frontswap, fs_pages_to_unuse);
19442123 if (ret)
19452124 return ret;
19462125 } while (pgd++, addr = next, addr != end);
19472126 return 0;
19482127 }
19492128
1950
-static int unuse_mm(struct mm_struct *mm,
1951
- swp_entry_t entry, struct page *page)
2129
+static int unuse_mm(struct mm_struct *mm, unsigned int type,
2130
+ bool frontswap, unsigned long *fs_pages_to_unuse)
19522131 {
19532132 struct vm_area_struct *vma;
19542133 int ret = 0;
19552134
1956
- if (!down_read_trylock(&mm->mmap_sem)) {
1957
- /*
1958
- * Activate page so shrink_inactive_list is unlikely to unmap
1959
- * its ptes while lock is dropped, so swapoff can make progress.
1960
- */
1961
- activate_page(page);
1962
- unlock_page(page);
1963
- down_read(&mm->mmap_sem);
1964
- lock_page(page);
1965
- }
2135
+ mmap_read_lock(mm);
19662136 for (vma = mm->mmap; vma; vma = vma->vm_next) {
1967
- if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
1968
- break;
2137
+ if (vma->anon_vma) {
2138
+ ret = unuse_vma(vma, type, frontswap,
2139
+ fs_pages_to_unuse);
2140
+ if (ret)
2141
+ break;
2142
+ }
19692143 cond_resched();
19702144 }
1971
- up_read(&mm->mmap_sem);
1972
- return (ret < 0)? ret: 0;
2145
+ mmap_read_unlock(mm);
2146
+ return ret;
19732147 }
19742148
19752149 /*
19762150 * Scan swap_map (or frontswap_map if frontswap parameter is true)
1977
- * from current position to next entry still in use.
1978
- * Recycle to start on reaching the end, returning 0 when empty.
2151
+ * from current position to next entry still in use. Return 0
2152
+ * if there are no inuse entries after prev till end of the map.
19792153 */
19802154 static unsigned int find_next_to_unuse(struct swap_info_struct *si,
19812155 unsigned int prev, bool frontswap)
19822156 {
1983
- unsigned int max = si->max;
1984
- unsigned int i = prev;
2157
+ unsigned int i;
19852158 unsigned char count;
19862159
19872160 /*
....@@ -1990,20 +2163,7 @@
19902163 * hits are okay, and sys_swapoff() has already prevented new
19912164 * allocations from this area (while holding swap_lock).
19922165 */
1993
- for (;;) {
1994
- if (++i >= max) {
1995
- if (!prev) {
1996
- i = 0;
1997
- break;
1998
- }
1999
- /*
2000
- * No entries in use at top of swap_map,
2001
- * loop back to start and recheck there.
2002
- */
2003
- max = prev + 1;
2004
- prev = 0;
2005
- i = 1;
2006
- }
2166
+ for (i = prev + 1; i < si->max; i++) {
20072167 count = READ_ONCE(si->swap_map[i]);
20082168 if (count && swap_count(count) != SWAP_MAP_BAD)
20092169 if (!frontswap || frontswap_test(si, i))
....@@ -2011,240 +2171,124 @@
20112171 if ((i % LATENCY_LIMIT) == 0)
20122172 cond_resched();
20132173 }
2174
+
2175
+ if (i == si->max)
2176
+ i = 0;
2177
+
20142178 return i;
20152179 }
20162180
20172181 /*
2018
- * We completely avoid races by reading each swap page in advance,
2019
- * and then search for the process using it. All the necessary
2020
- * page table adjustments can then be made atomically.
2021
- *
2022
- * if the boolean frontswap is true, only unuse pages_to_unuse pages;
2182
+ * If the boolean frontswap is true, only unuse pages_to_unuse pages;
20232183 * pages_to_unuse==0 means all pages; ignored if frontswap is false
20242184 */
20252185 int try_to_unuse(unsigned int type, bool frontswap,
20262186 unsigned long pages_to_unuse)
20272187 {
2188
+ struct mm_struct *prev_mm;
2189
+ struct mm_struct *mm;
2190
+ struct list_head *p;
2191
+ int retval = 0;
20282192 struct swap_info_struct *si = swap_info[type];
2029
- struct mm_struct *start_mm;
2030
- volatile unsigned char *swap_map; /* swap_map is accessed without
2031
- * locking. Mark it as volatile
2032
- * to prevent compiler doing
2033
- * something odd.
2034
- */
2035
- unsigned char swcount;
20362193 struct page *page;
20372194 swp_entry_t entry;
2038
- unsigned int i = 0;
2039
- int retval = 0;
2195
+ unsigned int i;
20402196
2041
- /*
2042
- * When searching mms for an entry, a good strategy is to
2043
- * start at the first mm we freed the previous entry from
2044
- * (though actually we don't notice whether we or coincidence
2045
- * freed the entry). Initialize this start_mm with a hold.
2046
- *
2047
- * A simpler strategy would be to start at the last mm we
2048
- * freed the previous entry from; but that would take less
2049
- * advantage of mmlist ordering, which clusters forked mms
2050
- * together, child after parent. If we race with dup_mmap(), we
2051
- * prefer to resolve parent before child, lest we miss entries
2052
- * duplicated after we scanned child: using last mm would invert
2053
- * that.
2054
- */
2055
- start_mm = &init_mm;
2056
- mmget(&init_mm);
2197
+ if (!READ_ONCE(si->inuse_pages))
2198
+ return 0;
20572199
2058
- /*
2059
- * Keep on scanning until all entries have gone. Usually,
2060
- * one pass through swap_map is enough, but not necessarily:
2061
- * there are races when an instance of an entry might be missed.
2062
- */
2063
- while ((i = find_next_to_unuse(si, i, frontswap)) != 0) {
2064
- if (signal_pending(current)) {
2065
- retval = -EINTR;
2066
- break;
2067
- }
2200
+ if (!frontswap)
2201
+ pages_to_unuse = 0;
20682202
2069
- /*
2070
- * Get a page for the entry, using the existing swap
2071
- * cache page if there is one. Otherwise, get a clean
2072
- * page and read the swap into it.
2073
- */
2074
- swap_map = &si->swap_map[i];
2075
- entry = swp_entry(type, i);
2076
- page = read_swap_cache_async(entry,
2077
- GFP_HIGHUSER_MOVABLE, NULL, 0, false);
2078
- if (!page) {
2079
- /*
2080
- * Either swap_duplicate() failed because entry
2081
- * has been freed independently, and will not be
2082
- * reused since sys_swapoff() already disabled
2083
- * allocation from here, or alloc_page() failed.
2084
- */
2085
- swcount = *swap_map;
2086
- /*
2087
- * We don't hold lock here, so the swap entry could be
2088
- * SWAP_MAP_BAD (when the cluster is discarding).
2089
- * Instead of fail out, We can just skip the swap
2090
- * entry because swapoff will wait for discarding
2091
- * finish anyway.
2092
- */
2093
- if (!swcount || swcount == SWAP_MAP_BAD)
2094
- continue;
2095
- retval = -ENOMEM;
2096
- break;
2097
- }
2203
+retry:
2204
+ retval = shmem_unuse(type, frontswap, &pages_to_unuse);
2205
+ if (retval)
2206
+ goto out;
20982207
2099
- /*
2100
- * Don't hold on to start_mm if it looks like exiting.
2101
- */
2102
- if (atomic_read(&start_mm->mm_users) == 1) {
2103
- mmput(start_mm);
2104
- start_mm = &init_mm;
2105
- mmget(&init_mm);
2106
- }
2208
+ prev_mm = &init_mm;
2209
+ mmget(prev_mm);
21072210
2108
- /*
2109
- * Wait for and lock page. When do_swap_page races with
2110
- * try_to_unuse, do_swap_page can handle the fault much
2111
- * faster than try_to_unuse can locate the entry. This
2112
- * apparently redundant "wait_on_page_locked" lets try_to_unuse
2113
- * defer to do_swap_page in such a case - in some tests,
2114
- * do_swap_page and try_to_unuse repeatedly compete.
2115
- */
2116
- wait_on_page_locked(page);
2117
- wait_on_page_writeback(page);
2118
- lock_page(page);
2119
- wait_on_page_writeback(page);
2211
+ spin_lock(&mmlist_lock);
2212
+ p = &init_mm.mmlist;
2213
+ while (READ_ONCE(si->inuse_pages) &&
2214
+ !signal_pending(current) &&
2215
+ (p = p->next) != &init_mm.mmlist) {
21202216
2121
- /*
2122
- * Remove all references to entry.
2123
- */
2124
- swcount = *swap_map;
2125
- if (swap_count(swcount) == SWAP_MAP_SHMEM) {
2126
- retval = shmem_unuse(entry, page);
2127
- /* page has already been unlocked and released */
2128
- if (retval < 0)
2129
- break;
2217
+ mm = list_entry(p, struct mm_struct, mmlist);
2218
+ if (!mmget_not_zero(mm))
21302219 continue;
2131
- }
2132
- if (swap_count(swcount) && start_mm != &init_mm)
2133
- retval = unuse_mm(start_mm, entry, page);
2220
+ spin_unlock(&mmlist_lock);
2221
+ mmput(prev_mm);
2222
+ prev_mm = mm;
2223
+ retval = unuse_mm(mm, type, frontswap, &pages_to_unuse);
21342224
2135
- if (swap_count(*swap_map)) {
2136
- int set_start_mm = (*swap_map >= swcount);
2137
- struct list_head *p = &start_mm->mmlist;
2138
- struct mm_struct *new_start_mm = start_mm;
2139
- struct mm_struct *prev_mm = start_mm;
2140
- struct mm_struct *mm;
2141
-
2142
- mmget(new_start_mm);
2143
- mmget(prev_mm);
2144
- spin_lock(&mmlist_lock);
2145
- while (swap_count(*swap_map) && !retval &&
2146
- (p = p->next) != &start_mm->mmlist) {
2147
- mm = list_entry(p, struct mm_struct, mmlist);
2148
- if (!mmget_not_zero(mm))
2149
- continue;
2150
- spin_unlock(&mmlist_lock);
2151
- mmput(prev_mm);
2152
- prev_mm = mm;
2153
-
2154
- cond_resched();
2155
-
2156
- swcount = *swap_map;
2157
- if (!swap_count(swcount)) /* any usage ? */
2158
- ;
2159
- else if (mm == &init_mm)
2160
- set_start_mm = 1;
2161
- else
2162
- retval = unuse_mm(mm, entry, page);
2163
-
2164
- if (set_start_mm && *swap_map < swcount) {
2165
- mmput(new_start_mm);
2166
- mmget(mm);
2167
- new_start_mm = mm;
2168
- set_start_mm = 0;
2169
- }
2170
- spin_lock(&mmlist_lock);
2171
- }
2172
- spin_unlock(&mmlist_lock);
2173
- mmput(prev_mm);
2174
- mmput(start_mm);
2175
- start_mm = new_start_mm;
2176
- }
21772225 if (retval) {
2178
- unlock_page(page);
2179
- put_page(page);
2180
- break;
2226
+ mmput(prev_mm);
2227
+ goto out;
21812228 }
2182
-
2183
- /*
2184
- * If a reference remains (rare), we would like to leave
2185
- * the page in the swap cache; but try_to_unmap could
2186
- * then re-duplicate the entry once we drop page lock,
2187
- * so we might loop indefinitely; also, that page could
2188
- * not be swapped out to other storage meanwhile. So:
2189
- * delete from cache even if there's another reference,
2190
- * after ensuring that the data has been saved to disk -
2191
- * since if the reference remains (rarer), it will be
2192
- * read from disk into another page. Splitting into two
2193
- * pages would be incorrect if swap supported "shared
2194
- * private" pages, but they are handled by tmpfs files.
2195
- *
2196
- * Given how unuse_vma() targets one particular offset
2197
- * in an anon_vma, once the anon_vma has been determined,
2198
- * this splitting happens to be just what is needed to
2199
- * handle where KSM pages have been swapped out: re-reading
2200
- * is unnecessarily slow, but we can fix that later on.
2201
- */
2202
- if (swap_count(*swap_map) &&
2203
- PageDirty(page) && PageSwapCache(page)) {
2204
- struct writeback_control wbc = {
2205
- .sync_mode = WB_SYNC_NONE,
2206
- };
2207
-
2208
- swap_writepage(compound_head(page), &wbc);
2209
- lock_page(page);
2210
- wait_on_page_writeback(page);
2211
- }
2212
-
2213
- /*
2214
- * It is conceivable that a racing task removed this page from
2215
- * swap cache just before we acquired the page lock at the top,
2216
- * or while we dropped it in unuse_mm(). The page might even
2217
- * be back in swap cache on another swap area: that we must not
2218
- * delete, since it may not have been written out to swap yet.
2219
- */
2220
- if (PageSwapCache(page) &&
2221
- likely(page_private(page) == entry.val) &&
2222
- (!PageTransCompound(page) ||
2223
- !swap_page_trans_huge_swapped(si, entry)))
2224
- delete_from_swap_cache(compound_head(page));
2225
-
2226
- /*
2227
- * So we could skip searching mms once swap count went
2228
- * to 1, we did not mark any present ptes as dirty: must
2229
- * mark page dirty so shrink_page_list will preserve it.
2230
- */
2231
- SetPageDirty(page);
2232
- unlock_page(page);
2233
- put_page(page);
22342229
22352230 /*
22362231 * Make sure that we aren't completely killing
22372232 * interactive performance.
22382233 */
22392234 cond_resched();
2240
- if (frontswap && pages_to_unuse > 0) {
2241
- if (!--pages_to_unuse)
2242
- break;
2243
- }
2235
+ spin_lock(&mmlist_lock);
2236
+ }
2237
+ spin_unlock(&mmlist_lock);
2238
+
2239
+ mmput(prev_mm);
2240
+
2241
+ i = 0;
2242
+ while (READ_ONCE(si->inuse_pages) &&
2243
+ !signal_pending(current) &&
2244
+ (i = find_next_to_unuse(si, i, frontswap)) != 0) {
2245
+
2246
+ entry = swp_entry(type, i);
2247
+ page = find_get_page(swap_address_space(entry), i);
2248
+ if (!page)
2249
+ continue;
2250
+
2251
+ /*
2252
+ * It is conceivable that a racing task removed this page from
2253
+ * swap cache just before we acquired the page lock. The page
2254
+ * might even be back in swap cache on another swap area. But
2255
+ * that is okay, try_to_free_swap() only removes stale pages.
2256
+ */
2257
+ lock_page(page);
2258
+ wait_on_page_writeback(page);
2259
+ try_to_free_swap(page);
2260
+ trace_android_vh_unuse_swap_page(si, page);
2261
+ unlock_page(page);
2262
+ put_page(page);
2263
+
2264
+ /*
2265
+ * For frontswap, we just need to unuse pages_to_unuse, if
2266
+ * it was specified. Need not check frontswap again here as
2267
+ * we already zeroed out pages_to_unuse if not frontswap.
2268
+ */
2269
+ if (pages_to_unuse && --pages_to_unuse == 0)
2270
+ goto out;
22442271 }
22452272
2246
- mmput(start_mm);
2247
- return retval;
2273
+ /*
2274
+ * Lets check again to see if there are still swap entries in the map.
2275
+ * If yes, we would need to do retry the unuse logic again.
2276
+ * Under global memory pressure, swap entries can be reinserted back
2277
+ * into process space after the mmlist loop above passes over them.
2278
+ *
2279
+ * Limit the number of retries? No: when mmget_not_zero() above fails,
2280
+ * that mm is likely to be freeing swap from exit_mmap(), which proceeds
2281
+ * at its own independent pace; and even shmem_writepage() could have
2282
+ * been preempted after get_swap_page(), temporarily hiding that swap.
2283
+ * It's easy and robust (though cpu-intensive) just to keep retrying.
2284
+ */
2285
+ if (READ_ONCE(si->inuse_pages)) {
2286
+ if (!signal_pending(current))
2287
+ goto retry;
2288
+ retval = -EINTR;
2289
+ }
2290
+out:
2291
+ return (retval == FRONTSWAP_PAGES_UNUSED) ? 0 : retval;
22482292 }
22492293
22502294 /*
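
The control flow of the rewritten try_to_unuse() above compresses to the sketch below; every helper is a stub and the counter is invented, it only shows the mm-walk, swap-cache drain and retry-until-empty structure:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

static int inuse_pages = 3;     /* pretend entries are still swapped out */

static bool signal_pending(void)   { return false; }
static void unuse_all_mms(void)    { inuse_pages -= 2; }
static void drain_swap_cache(void) { if (inuse_pages) inuse_pages--; }

static int try_to_unuse_sketch(void)
{
retry:
        unuse_all_mms();        /* kernel: shmem_unuse() + unuse_mm() loop  */
        drain_swap_cache();     /* kernel: find_next_to_unuse() + free loop */

        /*
         * Entries can be re-added behind our back (exit_mmap(),
         * shmem_writepage()), so keep retrying until none remain or a
         * signal arrives.
         */
        if (inuse_pages) {
                if (!signal_pending())
                        goto retry;
                return -EINTR;
        }
        return 0;
}

int main(void)
{
        printf("try_to_unuse() -> %d\n", try_to_unuse_sketch());
        return 0;
}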
....@@ -2276,7 +2320,6 @@
22762320 static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
22772321 {
22782322 struct swap_info_struct *sis;
2279
- struct swap_extent *start_se;
22802323 struct swap_extent *se;
22812324 pgoff_t offset;
22822325
....@@ -2284,18 +2327,8 @@
22842327 *bdev = sis->bdev;
22852328
22862329 offset = swp_offset(entry);
2287
- start_se = sis->curr_swap_extent;
2288
- se = start_se;
2289
-
2290
- for ( ; ; ) {
2291
- if (se->start_page <= offset &&
2292
- offset < (se->start_page + se->nr_pages)) {
2293
- return se->start_block + (offset - se->start_page);
2294
- }
2295
- se = list_next_entry(se, list);
2296
- sis->curr_swap_extent = se;
2297
- BUG_ON(se == start_se); /* It *must* be present */
2298
- }
2330
+ se = offset_to_swap_extent(sis, offset);
2331
+ return se->start_block + (offset - se->start_page);
22992332 }
23002333
23012334 /*
....@@ -2305,7 +2338,7 @@
23052338 {
23062339 swp_entry_t entry;
23072340 entry.val = page_private(page);
2308
- return map_swap_entry(entry, bdev) << (PAGE_SHIFT - 9);
2341
+ return map_swap_entry(entry, bdev);
23092342 }
23102343
23112344 /*
....@@ -2313,27 +2346,27 @@
23132346 */
23142347 static void destroy_swap_extents(struct swap_info_struct *sis)
23152348 {
2316
- while (!list_empty(&sis->first_swap_extent.list)) {
2317
- struct swap_extent *se;
2349
+ while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) {
2350
+ struct rb_node *rb = sis->swap_extent_root.rb_node;
2351
+ struct swap_extent *se = rb_entry(rb, struct swap_extent, rb_node);
23182352
2319
- se = list_first_entry(&sis->first_swap_extent.list,
2320
- struct swap_extent, list);
2321
- list_del(&se->list);
2353
+ rb_erase(rb, &sis->swap_extent_root);
23222354 kfree(se);
23232355 }
23242356
2325
- if (sis->flags & SWP_FILE) {
2357
+ if (sis->flags & SWP_ACTIVATED) {
23262358 struct file *swap_file = sis->swap_file;
23272359 struct address_space *mapping = swap_file->f_mapping;
23282360
2329
- sis->flags &= ~SWP_FILE;
2330
- mapping->a_ops->swap_deactivate(swap_file);
2361
+ sis->flags &= ~SWP_ACTIVATED;
2362
+ if (mapping->a_ops->swap_deactivate)
2363
+ mapping->a_ops->swap_deactivate(swap_file);
23312364 }
23322365 }
23332366
23342367 /*
23352368 * Add a block range (and the corresponding page range) into this swapdev's
2336
- * extent list. The extent list is kept sorted in page order.
2369
+ * extent tree.
23372370 *
23382371 * This function rather assumes that it is called in ascending page order.
23392372 */
....@@ -2341,20 +2374,21 @@
23412374 add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
23422375 unsigned long nr_pages, sector_t start_block)
23432376 {
2377
+ struct rb_node **link = &sis->swap_extent_root.rb_node, *parent = NULL;
23442378 struct swap_extent *se;
23452379 struct swap_extent *new_se;
2346
- struct list_head *lh;
23472380
2348
- if (start_page == 0) {
2349
- se = &sis->first_swap_extent;
2350
- sis->curr_swap_extent = se;
2351
- se->start_page = 0;
2352
- se->nr_pages = nr_pages;
2353
- se->start_block = start_block;
2354
- return 1;
2355
- } else {
2356
- lh = sis->first_swap_extent.list.prev; /* Highest extent */
2357
- se = list_entry(lh, struct swap_extent, list);
2381
+ /*
2382
+ * place the new node at the right most since the
2383
+ * function is called in ascending page order.
2384
+ */
2385
+ while (*link) {
2386
+ parent = *link;
2387
+ link = &parent->rb_right;
2388
+ }
2389
+
2390
+ if (parent) {
2391
+ se = rb_entry(parent, struct swap_extent, rb_node);
23582392 BUG_ON(se->start_page + se->nr_pages != start_page);
23592393 if (se->start_block + se->nr_pages == start_block) {
23602394 /* Merge it */
....@@ -2363,9 +2397,7 @@
23632397 }
23642398 }
23652399
2366
- /*
2367
- * No merge. Insert a new extent, preserving ordering.
2368
- */
2400
+ /* No merge, insert a new extent. */
23692401 new_se = kmalloc(sizeof(*se), GFP_KERNEL);
23702402 if (new_se == NULL)
23712403 return -ENOMEM;
....@@ -2373,7 +2405,8 @@
23732405 new_se->nr_pages = nr_pages;
23742406 new_se->start_block = start_block;
23752407
2376
- list_add_tail(&new_se->list, &sis->first_swap_extent.list);
2408
+ rb_link_node(&new_se->rb_node, parent, link);
2409
+ rb_insert_color(&new_se->rb_node, &sis->swap_extent_root);
23772410 return 1;
23782411 }
23792412 EXPORT_SYMBOL_GPL(add_swap_extent);
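
A small sketch of the append-or-merge rule that add_swap_extent() keeps after the move to an rbtree; a flat array replaces the tree and the page/block numbers are made up:

#include <assert.h>
#include <stdio.h>

struct extent {
        unsigned long start_page, nr_pages, start_block;
};

static struct extent extents[16];
static int nr_extents;

static void add_extent(unsigned long start_page, unsigned long nr_pages,
                       unsigned long start_block)
{
        if (nr_extents) {
                struct extent *se = &extents[nr_extents - 1];

                assert(se->start_page + se->nr_pages == start_page);
                if (se->start_block + se->nr_pages == start_block) {
                        se->nr_pages += nr_pages;       /* merge */
                        return;
                }
        }
        extents[nr_extents++] = (struct extent){ start_page, nr_pages,
                                                 start_block };
}

int main(void)
{
        add_extent(0, 100, 1000);
        add_extent(100, 50, 1100);      /* contiguous on disk: merged  */
        add_extent(150, 20, 9000);      /* discontiguous: new extent   */
        printf("%d extents, first covers %lu pages\n",
               nr_extents, extents[0].nr_pages);        /* 2 and 150   */
        return 0;
}

Because setup_swap_extents() feeds ranges in ascending page order, only the rightmost node can ever be a merge candidate, which is why the kernel's insertion walk simply descends to the right.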
....@@ -2423,8 +2456,10 @@
24232456
24242457 if (mapping->a_ops->swap_activate) {
24252458 ret = mapping->a_ops->swap_activate(sis, swap_file, span);
2459
+ if (ret >= 0)
2460
+ sis->flags |= SWP_ACTIVATED;
24262461 if (!ret) {
2427
- sis->flags |= SWP_FILE;
2462
+ sis->flags |= SWP_FS_OPS;
24282463 ret = add_swap_extent(sis, 0, sis->max, 0);
24292464 *span = sis->pages;
24302465 }
....@@ -2446,9 +2481,9 @@
24462481 return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
24472482 }
24482483
2449
-static void _enable_swap_info(struct swap_info_struct *p, int prio,
2450
- unsigned char *swap_map,
2451
- struct swap_cluster_info *cluster_info)
2484
+static void setup_swap_info(struct swap_info_struct *p, int prio,
2485
+ unsigned char *swap_map,
2486
+ struct swap_cluster_info *cluster_info)
24522487 {
24532488 int i;
24542489
....@@ -2473,10 +2508,18 @@
24732508 }
24742509 p->swap_map = swap_map;
24752510 p->cluster_info = cluster_info;
2476
- p->flags |= SWP_WRITEOK;
2477
- atomic_long_add(p->pages, &nr_swap_pages);
2478
- total_swap_pages += p->pages;
2511
+}
24792512
2513
+static void _enable_swap_info(struct swap_info_struct *p)
2514
+{
2515
+ bool skip = false;
2516
+
2517
+ p->flags |= SWP_WRITEOK | SWP_VALID;
2518
+ trace_android_vh_account_swap_pages(p, &skip);
2519
+ if (!skip) {
2520
+ atomic_long_add(p->pages, &nr_swap_pages);
2521
+ total_swap_pages += p->pages;
2522
+ }
24802523 assert_spin_locked(&swap_lock);
24812524 /*
24822525 * both lists are plists, and thus priority ordered.
....@@ -2500,7 +2543,17 @@
25002543 frontswap_init(p->type, frontswap_map);
25012544 spin_lock(&swap_lock);
25022545 spin_lock(&p->lock);
2503
- _enable_swap_info(p, prio, swap_map, cluster_info);
2546
+ setup_swap_info(p, prio, swap_map, cluster_info);
2547
+ spin_unlock(&p->lock);
2548
+ spin_unlock(&swap_lock);
2549
+ /*
2550
+ * Guarantee swap_map, cluster_info, etc. fields are valid
2551
+ * between get/put_swap_device() if SWP_VALID bit is set
2552
+ */
2553
+ synchronize_rcu();
2554
+ spin_lock(&swap_lock);
2555
+ spin_lock(&p->lock);
2556
+ _enable_swap_info(p);
25042557 spin_unlock(&p->lock);
25052558 spin_unlock(&swap_lock);
25062559 }
....@@ -2509,7 +2562,8 @@
25092562 {
25102563 spin_lock(&swap_lock);
25112564 spin_lock(&p->lock);
2512
- _enable_swap_info(p, p->prio, p->swap_map, p->cluster_info);
2565
+ setup_swap_info(p, p->prio, p->swap_map, p->cluster_info);
2566
+ _enable_swap_info(p);
25132567 spin_unlock(&p->lock);
25142568 spin_unlock(&swap_lock);
25152569 }
....@@ -2537,6 +2591,7 @@
25372591 struct filename *pathname;
25382592 int err, found = 0;
25392593 unsigned int old_block_size;
2594
+ bool skip = false;
25402595
25412596 if (!capable(CAP_SYS_ADMIN))
25422597 return -EPERM;
....@@ -2591,8 +2646,11 @@
25912646 least_priority++;
25922647 }
25932648 plist_del(&p->list, &swap_active_head);
2594
- atomic_long_sub(p->pages, &nr_swap_pages);
2595
- total_swap_pages -= p->pages;
2649
+ trace_android_vh_account_swap_pages(p, &skip);
2650
+ if (!skip) {
2651
+ atomic_long_sub(p->pages, &nr_swap_pages);
2652
+ total_swap_pages -= p->pages;
2653
+ }
25962654 p->flags &= ~SWP_WRITEOK;
25972655 spin_unlock(&p->lock);
25982656 spin_unlock(&swap_lock);
....@@ -2611,6 +2669,17 @@
26112669 }
26122670
26132671 reenable_swap_slots_cache_unlock();
2672
+
2673
+ spin_lock(&swap_lock);
2674
+ spin_lock(&p->lock);
2675
+ p->flags &= ~SWP_VALID; /* mark swap device as invalid */
2676
+ spin_unlock(&p->lock);
2677
+ spin_unlock(&swap_lock);
2678
+ /*
2679
+ * wait for swap operations protected by get/put_swap_device()
2680
+ * to complete
2681
+ */
2682
+ synchronize_rcu();
26142683
26152684 flush_work(&p->discard_work);
26162685
....@@ -2647,11 +2716,14 @@
26472716 frontswap_map = frontswap_map_get(p);
26482717 spin_unlock(&p->lock);
26492718 spin_unlock(&swap_lock);
2719
+ arch_swap_invalidate_area(p->type);
26502720 frontswap_invalidate_area(p->type);
26512721 frontswap_map_set(p, NULL);
26522722 mutex_unlock(&swapon_mutex);
26532723 free_percpu(p->percpu_cluster);
26542724 p->percpu_cluster = NULL;
2725
+ free_percpu(p->cluster_next_cpu);
2726
+ p->cluster_next_cpu = NULL;
26552727 vfree(swap_map);
26562728 kvfree(cluster_info);
26572729 kvfree(frontswap_map);
....@@ -2759,20 +2831,24 @@
27592831 struct swap_info_struct *si = v;
27602832 struct file *file;
27612833 int len;
2834
+ unsigned int bytes, inuse;
27622835
27632836 if (si == SEQ_START_TOKEN) {
2764
- seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
2837
+ seq_puts(swap,"Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n");
27652838 return 0;
27662839 }
27672840
2841
+ bytes = si->pages << (PAGE_SHIFT - 10);
2842
+ inuse = si->inuse_pages << (PAGE_SHIFT - 10);
2843
+
27682844 file = si->swap_file;
27692845 len = seq_file_path(swap, file, " \t\n\\");
2770
- seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
2846
+ seq_printf(swap, "%*s%s\t%u\t%s%u\t%s%d\n",
27712847 len < 40 ? 40 - len : 1, " ",
27722848 S_ISBLK(file_inode(file)->i_mode) ?
27732849 "partition" : "file\t",
2774
- si->pages << (PAGE_SHIFT - 10),
2775
- si->inuse_pages << (PAGE_SHIFT - 10),
2850
+ bytes, bytes < 10000000 ? "\t" : "",
2851
+ inuse, inuse < 10000000 ? "\t" : "",
27762852 si->prio);
27772853 return 0;
27782854 }
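
The conditional "\t" above keeps the /proc/swaps columns aligned once a swap area's size or usage reaches eight digits of KiB. A small userspace illustration of the same trick (simplified: the filename padding here uses %-40s instead of the kernel's computed %*s width):

#include <stdio.h>

static void show_row(const char *name, unsigned int kb_size,
		     unsigned int kb_used, int prio)
{
	/* Values under 10000000 (at most 7 digits) get one extra tab so the
	 * Used and Priority columns stay where the header put them. */
	printf("%-40s%s\t%u\t%s%u\t%s%d\n", name, "file\t",
	       kb_size, kb_size < 10000000 ? "\t" : "",
	       kb_used, kb_used < 10000000 ? "\t" : "",
	       prio);
}

int main(void)
{
	printf("Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n");
	show_row("/swapfile", 8388604, 1024, -2);	/* ~8 GiB  */
	show_row("/big-swapfile", 134217724, 0, -3);	/* ~128 GiB */
	return 0;
}
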
....@@ -2798,17 +2874,18 @@
27982874 return 0;
27992875 }
28002876
2801
-static const struct file_operations proc_swaps_operations = {
2802
- .open = swaps_open,
2803
- .read = seq_read,
2804
- .llseek = seq_lseek,
2805
- .release = seq_release,
2806
- .poll = swaps_poll,
2877
+static const struct proc_ops swaps_proc_ops = {
2878
+ .proc_flags = PROC_ENTRY_PERMANENT,
2879
+ .proc_open = swaps_open,
2880
+ .proc_read = seq_read,
2881
+ .proc_lseek = seq_lseek,
2882
+ .proc_release = seq_release,
2883
+ .proc_poll = swaps_poll,
28072884 };
28082885
28092886 static int __init procswaps_init(void)
28102887 {
2811
- proc_create("swaps", 0, NULL, &proc_swaps_operations);
2888
+ proc_create("swaps", 0, NULL, &swaps_proc_ops);
28122889 return 0;
28132890 }
28142891 __initcall(procswaps_init);
....@@ -2825,13 +2902,16 @@
28252902
28262903 static struct swap_info_struct *alloc_swap_info(void)
28272904 {
2828
- struct swap_info_struct *p;
2905
+ struct swap_info_struct *p = NULL;
28292906 struct swap_info_struct *defer = NULL;
28302907 unsigned int type;
28312908 int i;
2832
- int size = sizeof(*p) + nr_node_ids * sizeof(struct plist_node);
2909
+ bool skip = false;
28332910
2834
- p = kvzalloc(size, GFP_KERNEL);
2911
+ trace_android_rvh_alloc_si(&p, &skip);
2912
+ trace_android_vh_alloc_si(&p, &skip);
2913
+ if (!skip)
2914
+ p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL);
28352915 if (!p)
28362916 return ERR_PTR(-ENOMEM);
28372917
....@@ -2863,7 +2943,7 @@
28632943 * would be relying on p->type to remain valid.
28642944 */
28652945 }
2866
- INIT_LIST_HEAD(&p->first_swap_extent.list);
2946
+ p->swap_extent_root = RB_ROOT;
28672947 plist_node_init(&p->list, 0);
28682948 for_each_node(i)
28692949 plist_node_init(&p->avail_lists[i], 0);
....@@ -2881,10 +2961,10 @@
28812961 int error;
28822962
28832963 if (S_ISBLK(inode->i_mode)) {
2884
- p->bdev = bdgrab(I_BDEV(inode));
2885
- error = blkdev_get(p->bdev,
2964
+ p->bdev = blkdev_get_by_dev(inode->i_rdev,
28862965 FMODE_READ | FMODE_WRITE | FMODE_EXCL, p);
2887
- if (error < 0) {
2966
+ if (IS_ERR(p->bdev)) {
2967
+ error = PTR_ERR(p->bdev);
28882968 p->bdev = NULL;
28892969 return error;
28902970 }
....@@ -2892,6 +2972,13 @@
28922972 error = set_blocksize(p->bdev, PAGE_SIZE);
28932973 if (error < 0)
28942974 return error;
2975
+ /*
2976
+ * Zoned block devices contain zones that have a sequential
2977
+ * write only restriction. Hence zoned block devices are not
2978
+ * suitable for swapping. Disallow them here.
2979
+ */
2980
+ if (blk_queue_is_zoned(p->bdev->bd_disk->queue))
2981
+ return -EINVAL;
28952982 p->flags |= SWP_BLKDEV;
28962983 } else if (S_ISREG(inode->i_mode)) {
28972984 p->bdev = inode->i_sb->s_bdev;
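
The blkdev_get_by_dev() conversion earlier in this hunk switches to the kernel's error-pointer convention: a failure is encoded in the returned pointer and recovered with IS_ERR()/PTR_ERR(). A simplified, userspace-runnable illustration of that encoding (the macros below mirror <linux/err.h> but are not the real definitions):

#include <stdio.h>

#define MAX_ERRNO	4095
#define ERR_PTR(err)	((void *)(long)(err))
#define PTR_ERR(ptr)	((long)(ptr))
#define IS_ERR(ptr)	((unsigned long)(ptr) >= (unsigned long)-MAX_ERRNO)

static void *open_device(int fail)
{
	static int dummy;

	return fail ? ERR_PTR(-19) : (void *)&dummy;	/* -19 == -ENODEV */
}

int main(void)
{
	void *bdev = open_device(1);

	if (IS_ERR(bdev)) {
		printf("error %ld came back encoded in the pointer\n", PTR_ERR(bdev));
		bdev = NULL;	/* same cleanup as the hunk above */
	}
	return 0;
}
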
....@@ -3188,10 +3275,10 @@
31883275 goto bad_swap_unlock_inode;
31893276 }
31903277
3191
- if (bdi_cap_stable_pages_required(inode_to_bdi(inode)))
3278
+ if (p->bdev && blk_queue_stable_writes(p->bdev->bd_disk->queue))
31923279 p->flags |= SWP_STABLE_WRITES;
31933280
3194
- if (bdi_cap_synchronous_io(inode_to_bdi(inode)))
3281
+ if (p->bdev && p->bdev->bd_disk->fops->rw_page)
31953282 p->flags |= SWP_SYNCHRONOUS_IO;
31963283
31973284 if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) {
....@@ -3199,11 +3286,19 @@
31993286 unsigned long ci, nr_cluster;
32003287
32013288 p->flags |= SWP_SOLIDSTATE;
3289
+ p->cluster_next_cpu = alloc_percpu(unsigned int);
3290
+ if (!p->cluster_next_cpu) {
3291
+ error = -ENOMEM;
3292
+ goto bad_swap_unlock_inode;
3293
+ }
32023294 /*
32033295 * select a random position to start with to help wear leveling
32043296 * SSD
32053297 */
3206
- p->cluster_next = 1 + (prandom_u32() % p->highest_bit);
3298
+ for_each_possible_cpu(cpu) {
3299
+ per_cpu(*p->cluster_next_cpu, cpu) =
3300
+ 1 + prandom_u32_max(p->highest_bit);
3301
+ }
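
prandom_u32_max(p->highest_bit) above yields a value in [0, highest_bit) by scaling a 32-bit random number with a multiply and shift rather than a modulo. A userspace analogue of that scaling (rand()/srand() are only stand-ins for the kernel PRNG, so the range is scaled by RAND_MAX + 1 instead of 2^32):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

static uint32_t bounded_random(uint32_t bound)
{
	/* prandom_u32_max() computes ((u64)prandom_u32() * bound) >> 32 */
	return (uint32_t)(((uint64_t)rand() * bound) / ((uint64_t)RAND_MAX + 1));
}

int main(void)
{
	uint32_t highest_bit = 262144;	/* e.g. a 1 GiB swap area in 4 KiB pages */
	int cpu;

	srand((unsigned int)time(NULL));
	for (cpu = 0; cpu < 4; cpu++)
		printf("cluster_next for cpu %d: %u\n", cpu,
		       1 + bounded_random(highest_bit));
	return 0;
}
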
32073302 nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
32083303
32093304 cluster_info = kvcalloc(nr_cluster, sizeof(*cluster_info),
....@@ -3289,7 +3384,7 @@
32893384 error = inode_drain_writes(inode);
32903385 if (error) {
32913386 inode->i_flags &= ~S_SWAPFILE;
3292
- goto bad_swap_unlock_inode;
3387
+ goto free_swap_address_space;
32933388 }
32943389
32953390 mutex_lock(&swapon_mutex);
....@@ -3299,6 +3394,7 @@
32993394 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
33003395 enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map);
33013396
3397
+ trace_android_vh_init_swap_info_struct(p, swap_avail_heads);
33023398 pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",
33033399 p->pages<<(PAGE_SHIFT-10), name->name, p->prio,
33043400 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
....@@ -3314,11 +3410,15 @@
33143410
33153411 error = 0;
33163412 goto out;
3413
+free_swap_address_space:
3414
+ exit_swap_address_space(p->type);
33173415 bad_swap_unlock_inode:
33183416 inode_unlock(inode);
33193417 bad_swap:
33203418 free_percpu(p->percpu_cluster);
33213419 p->percpu_cluster = NULL;
3420
+ free_percpu(p->cluster_next_cpu);
3421
+ p->cluster_next_cpu = NULL;
33223422 if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
33233423 set_blocksize(p->bdev, p->old_block_size);
33243424 blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
....@@ -3359,14 +3459,17 @@
33593459 spin_lock(&swap_lock);
33603460 for (type = 0; type < nr_swapfiles; type++) {
33613461 struct swap_info_struct *si = swap_info[type];
3462
+ bool skip = false;
33623463
3363
- if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
3464
+ trace_android_vh_si_swapinfo(si, &skip);
3465
+ if (!skip && (si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
33643466 nr_to_be_unused += si->inuse_pages;
33653467 }
33663468 val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused;
33673469 val->totalswap = total_swap_pages + nr_to_be_unused;
33683470 spin_unlock(&swap_lock);
33693471 }
3472
+EXPORT_SYMBOL_GPL(si_swapinfo);
33703473
33713474 /*
33723475 * Verify that a swap entry is valid and increment its swap map count.
....@@ -3388,17 +3491,11 @@
33883491 unsigned char has_cache;
33893492 int err = -EINVAL;
33903493
3391
- if (non_swap_entry(entry))
3392
- goto out;
3393
-
3394
- p = swp_swap_info(entry);
3494
+ p = get_swap_device(entry);
33953495 if (!p)
3396
- goto bad_file;
3496
+ goto out;
33973497
33983498 offset = swp_offset(entry);
3399
- if (unlikely(offset >= p->max))
3400
- goto out;
3401
-
34023499 ci = lock_cluster_or_swap_info(p, offset);
34033500
34043501 count = p->swap_map[offset];
....@@ -3439,16 +3536,14 @@
34393536 } else
34403537 err = -ENOENT; /* unused swap entry */
34413538
3442
- p->swap_map[offset] = count | has_cache;
3539
+ WRITE_ONCE(p->swap_map[offset], count | has_cache);
34433540
34443541 unlock_out:
34453542 unlock_cluster_or_swap_info(p, ci);
34463543 out:
3544
+ if (p)
3545
+ put_swap_device(p);
34473546 return err;
3448
-
3449
-bad_file:
3450
- pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val);
3451
- goto out;
34523547 }
34533548
34543549 /*
....@@ -3481,7 +3576,7 @@
34813576 *
34823577 * Called when allocating swap cache for existing swap entry,
34833578 * This can return error codes. Returns 0 at success.
3484
- * -EBUSY means there is a swap cache.
3579
+ * -EEXIST means there is a swap cache.
34853580 * Note: return code is different from swap_duplicate().
34863581 */
34873582 int swapcache_prepare(swp_entry_t entry)
....@@ -3493,6 +3588,7 @@
34933588 {
34943589 return swap_type_to_swap_info(swp_type(entry));
34953590 }
3591
+EXPORT_SYMBOL_GPL(swp_swap_info);
34963592
34973593 struct swap_info_struct *page_swap_info(struct page *page)
34983594 {
....@@ -3540,6 +3636,7 @@
35403636 struct page *list_page;
35413637 pgoff_t offset;
35423638 unsigned char count;
3639
+ int ret = 0;
35433640
35443641 /*
35453642 * When debugging, it's easier to use __GFP_ZERO here; but it's better
....@@ -3547,15 +3644,15 @@
35473644 */
35483645 page = alloc_page(gfp_mask | __GFP_HIGHMEM);
35493646
3550
- si = swap_info_get(entry);
3647
+ si = get_swap_device(entry);
35513648 if (!si) {
35523649 /*
35533650 * An acceptable race has occurred since the failing
3554
- * __swap_duplicate(): the swap entry has been freed,
3555
- * perhaps even the whole swap_map cleared for swapoff.
3651
+ * __swap_duplicate(): the swap device may have been swapped off.
35563652 */
35573653 goto outer;
35583654 }
3655
+ spin_lock(&si->lock);
35593656
35603657 offset = swp_offset(entry);
35613658
....@@ -3573,9 +3670,8 @@
35733670 }
35743671
35753672 if (!page) {
3576
- unlock_cluster(ci);
3577
- spin_unlock(&si->lock);
3578
- return -ENOMEM;
3673
+ ret = -ENOMEM;
3674
+ goto out;
35793675 }
35803676
35813677 /*
....@@ -3627,10 +3723,11 @@
36273723 out:
36283724 unlock_cluster(ci);
36293725 spin_unlock(&si->lock);
3726
+ put_swap_device(si);
36303727 outer:
36313728 if (page)
36323729 __free_page(page);
3633
- return 0;
3730
+ return ret;
36343731 }
36353732
36363733 /*
....@@ -3658,7 +3755,7 @@
36583755
36593756 spin_lock(&si->cont_lock);
36603757 offset &= ~PAGE_MASK;
3661
- page = list_entry(head->lru.next, struct page, lru);
3758
+ page = list_next_entry(head, lru);
36623759 map = kmap_atomic(page) + offset;
36633760
36643761 if (count == SWAP_MAP_MAX) /* initial increment from swap_map */
....@@ -3670,13 +3767,13 @@
36703767 */
36713768 while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
36723769 kunmap_atomic(map);
3673
- page = list_entry(page->lru.next, struct page, lru);
3770
+ page = list_next_entry(page, lru);
36743771 BUG_ON(page == head);
36753772 map = kmap_atomic(page) + offset;
36763773 }
36773774 if (*map == SWAP_CONT_MAX) {
36783775 kunmap_atomic(map);
3679
- page = list_entry(page->lru.next, struct page, lru);
3776
+ page = list_next_entry(page, lru);
36803777 if (page == head) {
36813778 ret = false; /* add count continuation */
36823779 goto out;
....@@ -3686,12 +3783,10 @@
36863783 }
36873784 *map += 1;
36883785 kunmap_atomic(map);
3689
- page = list_entry(page->lru.prev, struct page, lru);
3690
- while (page != head) {
3786
+ while ((page = list_prev_entry(page, lru)) != head) {
36913787 map = kmap_atomic(page) + offset;
36923788 *map = COUNT_CONTINUED;
36933789 kunmap_atomic(map);
3694
- page = list_entry(page->lru.prev, struct page, lru);
36953790 }
36963791 ret = true; /* incremented */
36973792
....@@ -3702,7 +3797,7 @@
37023797 BUG_ON(count != COUNT_CONTINUED);
37033798 while (*map == COUNT_CONTINUED) {
37043799 kunmap_atomic(map);
3705
- page = list_entry(page->lru.next, struct page, lru);
3800
+ page = list_next_entry(page, lru);
37063801 BUG_ON(page == head);
37073802 map = kmap_atomic(page) + offset;
37083803 }
....@@ -3711,13 +3806,11 @@
37113806 if (*map == 0)
37123807 count = 0;
37133808 kunmap_atomic(map);
3714
- page = list_entry(page->lru.prev, struct page, lru);
3715
- while (page != head) {
3809
+ while ((page = list_prev_entry(page, lru)) != head) {
37163810 map = kmap_atomic(page) + offset;
37173811 *map = SWAP_CONT_MAX | count;
37183812 count = COUNT_CONTINUED;
37193813 kunmap_atomic(map);
3720
- page = list_entry(page->lru.prev, struct page, lru);
37213814 }
37223815 ret = count == COUNT_CONTINUED;
37233816 }
....@@ -3749,11 +3842,12 @@
37493842 }
37503843
37513844 #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
3752
-void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node,
3753
- gfp_t gfp_mask)
3845
+void __cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
37543846 {
37553847 struct swap_info_struct *si, *next;
3756
- if (!(gfp_mask & __GFP_IO) || !memcg)
3848
+ int nid = page_to_nid(page);
3849
+
3850
+ if (!(gfp_mask & __GFP_IO))
37573851 return;
37583852
37593853 if (!blk_cgroup_congested())
....@@ -3767,11 +3861,10 @@
37673861 return;
37683862
37693863 spin_lock(&swap_avail_lock);
3770
- plist_for_each_entry_safe(si, next, &swap_avail_heads[node],
3771
- avail_lists[node]) {
3864
+ plist_for_each_entry_safe(si, next, &swap_avail_heads[nid],
3865
+ avail_lists[nid]) {
37723866 if (si->bdev) {
3773
- blkcg_schedule_throttle(bdev_get_queue(si->bdev),
3774
- true);
3867
+ blkcg_schedule_throttle(bdev_get_queue(si->bdev), true);
37753868 break;
37763869 }
37773870 }