2024-12-19 9370bb92b2d16684ee45cf24e879c93c509162da
kernel/mm/swapfile.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * linux/mm/swapfile.c
  *
@@ -39,10 +40,10 @@
 #include <linux/swap_slots.h>
 #include <linux/sort.h>
 
-#include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <linux/swapops.h>
 #include <linux/swap_cgroup.h>
+#include <trace/hooks/mm.h>
 
 static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
                                  unsigned char);
@@ -98,7 +99,7 @@
 
 atomic_t nr_rotate_swap = ATOMIC_INIT(0);
 
-static struct swap_info_struct *swap_type_to_swap_info(int type)
+struct swap_info_struct *swap_type_to_swap_info(int type)
 {
         if (type >= READ_ONCE(nr_swapfiles))
                 return NULL;
@@ -106,36 +107,62 @@
         smp_rmb();      /* Pairs with smp_wmb in alloc_swap_info. */
         return READ_ONCE(swap_info[type]);
 }
+EXPORT_SYMBOL_GPL(swap_type_to_swap_info);
 
 static inline unsigned char swap_count(unsigned char ent)
 {
         return ent & ~SWAP_HAS_CACHE;   /* may include COUNT_CONTINUED flag */
 }
 
+/* Reclaim the swap entry anyway if possible */
+#define TTRS_ANYWAY             0x1
+/*
+ * Reclaim the swap entry if there are no more mappings of the
+ * corresponding page
+ */
+#define TTRS_UNMAPPED           0x2
+/* Reclaim the swap entry if swap is getting full*/
+#define TTRS_FULL               0x4
+
 /* returns 1 if swap entry is freed */
-static int
-__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
+static int __try_to_reclaim_swap(struct swap_info_struct *si,
+                                 unsigned long offset, unsigned long flags)
 {
         swp_entry_t entry = swp_entry(si->type, offset);
         struct page *page;
         int ret = 0;
 
-        page = find_get_page(swap_address_space(entry), swp_offset(entry));
+        page = find_get_page(swap_address_space(entry), offset);
         if (!page)
                 return 0;
         /*
-         * This function is called from scan_swap_map() and it's called
-         * by vmscan.c at reclaiming pages. So, we hold a lock on a page, here.
-         * We have to use trylock for avoiding deadlock. This is a special
+         * When this function is called from scan_swap_map_slots() and it's
+         * called by vmscan.c at reclaiming pages. So, we hold a lock on a page,
+         * here. We have to use trylock for avoiding deadlock. This is a special
          * case and you should use try_to_free_swap() with explicit lock_page()
          * in usual operations.
          */
         if (trylock_page(page)) {
-                ret = try_to_free_swap(page);
+                if ((flags & TTRS_ANYWAY) ||
+                    ((flags & TTRS_UNMAPPED) && !page_mapped(page)) ||
+                    ((flags & TTRS_FULL) && mem_cgroup_swap_full(page)))
+                        ret = try_to_free_swap(page);
                 unlock_page(page);
         }
         put_page(page);
         return ret;
+}
+
+static inline struct swap_extent *first_se(struct swap_info_struct *sis)
+{
+        struct rb_node *rb = rb_first(&sis->swap_extent_root);
+        return rb_entry(rb, struct swap_extent, rb_node);
+}
+
+static inline struct swap_extent *next_se(struct swap_extent *se)
+{
+        struct rb_node *rb = rb_next(&se->rb_node);
+        return rb ? rb_entry(rb, struct swap_extent, rb_node) : NULL;
 }
 
 /*
@@ -150,7 +177,7 @@
         int err = 0;
 
         /* Do not discard the swap header page! */
-        se = &si->first_swap_extent;
+        se = first_se(si);
         start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
         nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
         if (nr_blocks) {
@@ -161,7 +188,7 @@
                 cond_resched();
         }
 
-        list_for_each_entry(se, &si->first_swap_extent.list, list) {
+        for (se = next_se(se); se; se = next_se(se)) {
                 start_block = se->start_block << (PAGE_SHIFT - 9);
                 nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
 
@@ -175,6 +202,39 @@
         return err;             /* That will often be -EOPNOTSUPP */
 }
 
+static struct swap_extent *
+offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset)
+{
+        struct swap_extent *se;
+        struct rb_node *rb;
+
+        rb = sis->swap_extent_root.rb_node;
+        while (rb) {
+                se = rb_entry(rb, struct swap_extent, rb_node);
+                if (offset < se->start_page)
+                        rb = rb->rb_left;
+                else if (offset >= se->start_page + se->nr_pages)
+                        rb = rb->rb_right;
+                else
+                        return se;
+        }
+        /* It *must* be present */
+        BUG();
+}
+
+sector_t swap_page_sector(struct page *page)
+{
+        struct swap_info_struct *sis = page_swap_info(page);
+        struct swap_extent *se;
+        sector_t sector;
+        pgoff_t offset;
+
+        offset = __page_file_index(page);
+        se = offset_to_swap_extent(sis, offset);
+        sector = se->start_block + (offset - se->start_page);
+        return sector << (PAGE_SHIFT - 9);
+}
+
 /*
  * swap allocation tell device that a cluster of swap can now be discarded,
  * to allow the swap device to optimize its wear-levelling.
@@ -182,32 +242,25 @@
 static void discard_swap_cluster(struct swap_info_struct *si,
                                  pgoff_t start_page, pgoff_t nr_pages)
 {
-        struct swap_extent *se = si->curr_swap_extent;
-        int found_extent = 0;
+        struct swap_extent *se = offset_to_swap_extent(si, start_page);
 
         while (nr_pages) {
-                if (se->start_page <= start_page &&
-                    start_page < se->start_page + se->nr_pages) {
-                        pgoff_t offset = start_page - se->start_page;
-                        sector_t start_block = se->start_block + offset;
-                        sector_t nr_blocks = se->nr_pages - offset;
+                pgoff_t offset = start_page - se->start_page;
+                sector_t start_block = se->start_block + offset;
+                sector_t nr_blocks = se->nr_pages - offset;
 
-                        if (nr_blocks > nr_pages)
-                                nr_blocks = nr_pages;
-                        start_page += nr_blocks;
-                        nr_pages -= nr_blocks;
+                if (nr_blocks > nr_pages)
+                        nr_blocks = nr_pages;
+                start_page += nr_blocks;
+                nr_pages -= nr_blocks;
 
-                        if (!found_extent++)
-                                si->curr_swap_extent = se;
+                start_block <<= PAGE_SHIFT - 9;
+                nr_blocks <<= PAGE_SHIFT - 9;
+                if (blkdev_issue_discard(si->bdev, start_block,
+                                nr_blocks, GFP_NOIO, 0))
+                        break;
 
-                        start_block <<= PAGE_SHIFT - 9;
-                        nr_blocks <<= PAGE_SHIFT - 9;
-                        if (blkdev_issue_discard(si->bdev, start_block,
-                                nr_blocks, GFP_NOIO, 0))
-                                break;
-                }
-
-                se = list_next_entry(se, list);
+                se = next_se(se);
         }
 }
 
@@ -562,7 +615,6 @@
 {
         struct percpu_cluster *cluster;
         struct swap_cluster_info *ci;
-        bool found_free;
         unsigned long tmp, max;
 
 new_cluster:
@@ -575,16 +627,16 @@
         } else if (!cluster_list_empty(&si->discard_clusters)) {
                 /*
                  * we don't have free cluster but have some clusters in
-                 * discarding, do discard now and reclaim them
+                 * discarding, do discard now and reclaim them, then
+                 * reread cluster_next_cpu since we dropped si->lock
                  */
                 swap_do_scheduled_discard(si);
-                *scan_base = *offset = si->cluster_next;
+                *scan_base = this_cpu_read(*si->cluster_next_cpu);
+                *offset = *scan_base;
                 goto new_cluster;
         } else
                 return false;
         }
-
-        found_free = false;
 
         /*
          * Other CPUs can use our cluster if they can't find a free cluster,
@@ -593,39 +645,42 @@
         tmp = cluster->next;
         max = min_t(unsigned long, si->max,
                     (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER);
-        if (tmp >= max) {
-                cluster_set_null(&cluster->index);
-                goto new_cluster;
-        }
-        ci = lock_cluster(si, tmp);
-        while (tmp < max) {
-                if (!si->swap_map[tmp]) {
-                        found_free = true;
-                        break;
+        if (tmp < max) {
+                ci = lock_cluster(si, tmp);
+                while (tmp < max) {
+                        if (!si->swap_map[tmp])
+                                break;
+                        tmp++;
                 }
-                tmp++;
+                unlock_cluster(ci);
         }
-        unlock_cluster(ci);
-        if (!found_free) {
+        if (tmp >= max) {
                 cluster_set_null(&cluster->index);
                 goto new_cluster;
         }
         cluster->next = tmp + 1;
         *offset = tmp;
         *scan_base = tmp;
-        return found_free;
+        return true;
 }
 
 static void __del_from_avail_list(struct swap_info_struct *p)
 {
         int nid;
 
+        assert_spin_locked(&p->lock);
         for_each_node(nid)
                 plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]);
 }
 
 static void del_from_avail_list(struct swap_info_struct *p)
 {
+        bool skip = false;
+
+        trace_android_vh_del_from_avail_list(p, &skip);
+        if (skip)
+                return;
+
         spin_lock(&swap_avail_lock);
         __del_from_avail_list(p);
         spin_unlock(&swap_avail_lock);
@@ -639,7 +694,7 @@
         if (offset == si->lowest_bit)
                 si->lowest_bit += nr_entries;
         if (end == si->highest_bit)
-                si->highest_bit -= nr_entries;
+                WRITE_ONCE(si->highest_bit, si->highest_bit - nr_entries);
         si->inuse_pages += nr_entries;
         if (si->inuse_pages == si->pages) {
                 si->lowest_bit = si->max;
@@ -651,6 +706,11 @@
 static void add_to_avail_list(struct swap_info_struct *p)
 {
         int nid;
+        bool skip = false;
+
+        trace_android_vh_add_to_avail_list(p, &skip);
+        if (skip)
+                return;
 
         spin_lock(&swap_avail_lock);
         for_each_node(nid) {
@@ -663,19 +723,23 @@
 static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
                             unsigned int nr_entries)
 {
+        unsigned long begin = offset;
         unsigned long end = offset + nr_entries - 1;
         void (*swap_slot_free_notify)(struct block_device *, unsigned long);
+        bool skip = false;
 
         if (offset < si->lowest_bit)
                 si->lowest_bit = offset;
         if (end > si->highest_bit) {
                 bool was_full = !si->highest_bit;
 
-                si->highest_bit = end;
+                WRITE_ONCE(si->highest_bit, end);
                 if (was_full && (si->flags & SWP_WRITEOK))
                         add_to_avail_list(si);
         }
-        atomic_long_add(nr_entries, &nr_swap_pages);
+        trace_android_vh_account_swap_pages(si, &skip);
+        if (!skip)
+                atomic_long_add(nr_entries, &nr_swap_pages);
         si->inuse_pages -= nr_entries;
         if (si->flags & SWP_BLKDEV)
                 swap_slot_free_notify =
@@ -683,14 +747,44 @@
         else
                 swap_slot_free_notify = NULL;
         while (offset <= end) {
+                arch_swap_invalidate_page(si->type, offset);
                 frontswap_invalidate_page(si->type, offset);
                 if (swap_slot_free_notify)
                         swap_slot_free_notify(si->bdev, offset);
                 offset++;
         }
+        clear_shadow_from_swap_cache(si->type, begin, end);
 }
 
-static int scan_swap_map_slots(struct swap_info_struct *si,
+static void set_cluster_next(struct swap_info_struct *si, unsigned long next)
+{
+        unsigned long prev;
+
+        if (!(si->flags & SWP_SOLIDSTATE)) {
+                si->cluster_next = next;
+                return;
+        }
+
+        prev = this_cpu_read(*si->cluster_next_cpu);
+        /*
+         * Cross the swap address space size aligned trunk, choose
+         * another trunk randomly to avoid lock contention on swap
+         * address space if possible.
+         */
+        if ((prev >> SWAP_ADDRESS_SPACE_SHIFT) !=
+            (next >> SWAP_ADDRESS_SPACE_SHIFT)) {
+                /* No free swap slots available */
+                if (si->highest_bit <= si->lowest_bit)
+                        return;
+                next = si->lowest_bit +
+                        prandom_u32_max(si->highest_bit - si->lowest_bit + 1);
+                next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES);
+                next = max_t(unsigned int, next, si->lowest_bit);
+        }
+        this_cpu_write(*si->cluster_next_cpu, next);
+}
+
+int scan_swap_map_slots(struct swap_info_struct *si,
                                unsigned char usage, int nr,
                                swp_entry_t slots[])
 {
@@ -700,9 +794,7 @@
         unsigned long last_in_cluster = 0;
         int latency_ration = LATENCY_LIMIT;
         int n_ret = 0;
-
-        if (nr > SWAP_BATCH)
-                nr = SWAP_BATCH;
+        bool scanned_many = false;
 
         /*
          * We try to cluster swap pages by allocating them sequentially
@@ -716,17 +808,22 @@
          */
 
         si->flags += SWP_SCANNING;
-        scan_base = offset = si->cluster_next;
+        /*
+         * Use percpu scan base for SSD to reduce lock contention on
+         * cluster and swap cache. For HDD, sequential access is more
+         * important.
+         */
+        if (si->flags & SWP_SOLIDSTATE)
+                scan_base = this_cpu_read(*si->cluster_next_cpu);
+        else
+                scan_base = si->cluster_next;
+        offset = scan_base;
 
         /* SSD algorithm */
         if (si->cluster_info) {
-                if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
-                        goto checks;
-                else
+                if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
                         goto scan;
-        }
-
-        if (unlikely(!si->cluster_nr--)) {
+        } else if (unlikely(!si->cluster_nr--)) {
                 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
                         si->cluster_nr = SWAPFILE_CLUSTER - 1;
                         goto checks;
@@ -789,7 +886,7 @@
                 int swap_was_freed;
                 unlock_cluster(ci);
                 spin_unlock(&si->lock);
-                swap_was_freed = __try_to_reclaim_swap(si, offset);
+                swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
                 spin_lock(&si->lock);
                 /* entry was freed successfully, try to use this again */
                 if (swap_was_freed)
@@ -804,12 +901,11 @@
                 else
                         goto done;
         }
-        si->swap_map[offset] = usage;
+        WRITE_ONCE(si->swap_map[offset], usage);
         inc_cluster_info_page(si, si->cluster_info, offset);
         unlock_cluster(ci);
 
         swap_range_alloc(si, offset, 1);
-        si->cluster_next = offset + 1;
         slots[n_ret++] = swp_entry(si->type, offset);
 
         /* got enough slots or reach max slots? */
@@ -832,51 +928,69 @@
         if (si->cluster_info) {
                 if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
                         goto checks;
-                else
-                        goto done;
-        }
-        /* non-ssd case */
-        ++offset;
-
-        /* non-ssd case, still more slots in cluster? */
-        if (si->cluster_nr && !si->swap_map[offset]) {
+        } else if (si->cluster_nr && !si->swap_map[++offset]) {
+                /* non-ssd case, still more slots in cluster? */
                 --si->cluster_nr;
                 goto checks;
         }
 
+        /*
+         * Even if there's no free clusters available (fragmented),
+         * try to scan a little more quickly with lock held unless we
+         * have scanned too many slots already.
+         */
+        if (!scanned_many) {
+                unsigned long scan_limit;
+
+                if (offset < scan_base)
+                        scan_limit = scan_base;
+                else
+                        scan_limit = si->highest_bit;
+                for (; offset <= scan_limit && --latency_ration > 0;
+                     offset++) {
+                        if (!si->swap_map[offset])
+                                goto checks;
+                }
+        }
+
 done:
+        set_cluster_next(si, offset + 1);
         si->flags -= SWP_SCANNING;
         return n_ret;
 
 scan:
         spin_unlock(&si->lock);
-        while (++offset <= si->highest_bit) {
-                if (!si->swap_map[offset]) {
+        while (++offset <= READ_ONCE(si->highest_bit)) {
+                if (data_race(!si->swap_map[offset])) {
                         spin_lock(&si->lock);
                         goto checks;
                 }
-                if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
+                if (vm_swap_full() &&
+                    READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
                         spin_lock(&si->lock);
                         goto checks;
                 }
                 if (unlikely(--latency_ration < 0)) {
                         cond_resched();
                         latency_ration = LATENCY_LIMIT;
+                        scanned_many = true;
                 }
         }
         offset = si->lowest_bit;
         while (offset < scan_base) {
-                if (!si->swap_map[offset]) {
+                if (data_race(!si->swap_map[offset])) {
                         spin_lock(&si->lock);
                         goto checks;
                 }
-                if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
+                if (vm_swap_full() &&
+                    READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
                         spin_lock(&si->lock);
                         goto checks;
                 }
                 if (unlikely(--latency_ration < 0)) {
                         cond_resched();
                         latency_ration = LATENCY_LIMIT;
+                        scanned_many = true;
                 }
                 offset++;
         }
@@ -886,8 +1000,9 @@
         si->flags -= SWP_SCANNING;
         return n_ret;
 }
+EXPORT_SYMBOL_GPL(scan_swap_map_slots);
 
-static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
+int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
 {
         unsigned long idx;
         struct swap_cluster_info *ci;
@@ -921,6 +1036,7 @@
 
         return 1;
 }
+EXPORT_SYMBOL_GPL(swap_alloc_cluster);
 
 static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
 {
@@ -928,6 +1044,7 @@
         struct swap_cluster_info *ci;
 
         ci = lock_cluster(si, offset);
+        memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
         cluster_set_count_flag(ci, 0, 0);
         free_cluster(si, idx);
         unlock_cluster(ci);
....@@ -960,19 +1077,17 @@
9601077 /* Only single cluster request supported */
9611078 WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER);
9621079
1080
+ spin_lock(&swap_avail_lock);
1081
+
9631082 avail_pgs = atomic_long_read(&nr_swap_pages) / size;
964
- if (avail_pgs <= 0)
1083
+ if (avail_pgs <= 0) {
1084
+ spin_unlock(&swap_avail_lock);
9651085 goto noswap;
1086
+ }
9661087
967
- if (n_goal > SWAP_BATCH)
968
- n_goal = SWAP_BATCH;
969
-
970
- if (n_goal > avail_pgs)
971
- n_goal = avail_pgs;
1088
+ n_goal = min3((long)n_goal, (long)SWAP_BATCH, avail_pgs);
9721089
9731090 atomic_long_sub(n_goal * size, &nr_swap_pages);
974
-
975
- spin_lock(&swap_avail_lock);
9761091
9771092 start_over:
9781093 node = numa_node_id();
....@@ -1008,6 +1123,7 @@
10081123 goto check_out;
10091124 pr_debug("scan_swap_map of si %d failed to find offset\n",
10101125 si->type);
1126
+ cond_resched();
10111127
10121128 spin_lock(&swap_avail_lock);
10131129 nextsi:
....@@ -1041,20 +1157,22 @@
10411157 {
10421158 struct swap_info_struct *si = swap_type_to_swap_info(type);
10431159 pgoff_t offset;
1160
+ bool skip = false;
10441161
10451162 if (!si)
10461163 goto fail;
10471164
10481165 spin_lock(&si->lock);
10491166 if (si->flags & SWP_WRITEOK) {
1050
- atomic_long_dec(&nr_swap_pages);
10511167 /* This is called for allocating swap entry, not cache */
10521168 offset = scan_swap_map(si, 1);
10531169 if (offset) {
1170
+ trace_android_vh_account_swap_pages(si, &skip);
1171
+ if (!skip)
1172
+ atomic_long_dec(&nr_swap_pages);
10541173 spin_unlock(&si->lock);
10551174 return swp_entry(type, offset);
10561175 }
1057
- atomic_long_inc(&nr_swap_pages);
10581176 }
10591177 spin_unlock(&si->lock);
10601178 fail:
....@@ -1064,15 +1182,14 @@
10641182 static struct swap_info_struct *__swap_info_get(swp_entry_t entry)
10651183 {
10661184 struct swap_info_struct *p;
1067
- unsigned long offset, type;
1185
+ unsigned long offset;
10681186
10691187 if (!entry.val)
10701188 goto out;
1071
- type = swp_type(entry);
1072
- p = swap_type_to_swap_info(type);
1189
+ p = swp_swap_info(entry);
10731190 if (!p)
10741191 goto bad_nofile;
1075
- if (!(p->flags & SWP_USED))
1192
+ if (data_race(!(p->flags & SWP_USED)))
10761193 goto bad_device;
10771194 offset = swp_offset(entry);
10781195 if (offset >= p->max)
....@@ -1098,13 +1215,12 @@
10981215 p = __swap_info_get(entry);
10991216 if (!p)
11001217 goto out;
1101
- if (!p->swap_map[swp_offset(entry)])
1218
+ if (data_race(!p->swap_map[swp_offset(entry)]))
11021219 goto bad_free;
11031220 return p;
11041221
11051222 bad_free:
11061223 pr_err("swap_info_get: %s%08lx\n", Unused_offset, entry.val);
1107
- goto out;
11081224 out:
11091225 return NULL;
11101226 }
....@@ -1167,20 +1283,89 @@
11671283 }
11681284
11691285 usage = count | has_cache;
1170
- p->swap_map[offset] = usage ? : SWAP_HAS_CACHE;
1286
+ if (usage)
1287
+ WRITE_ONCE(p->swap_map[offset], usage);
1288
+ else
1289
+ WRITE_ONCE(p->swap_map[offset], SWAP_HAS_CACHE);
11711290
11721291 return usage;
11731292 }
11741293
1294
+/*
1295
+ * Check whether swap entry is valid in the swap device. If so,
1296
+ * return pointer to swap_info_struct, and keep the swap entry valid
1297
+ * via preventing the swap device from being swapoff, until
1298
+ * put_swap_device() is called. Otherwise return NULL.
1299
+ *
1300
+ * The entirety of the RCU read critical section must come before the
1301
+ * return from or after the call to synchronize_rcu() in
1302
+ * enable_swap_info() or swapoff(). So if "si->flags & SWP_VALID" is
1303
+ * true, the si->map, si->cluster_info, etc. must be valid in the
1304
+ * critical section.
1305
+ *
1306
+ * Notice that swapoff or swapoff+swapon can still happen before the
1307
+ * rcu_read_lock() in get_swap_device() or after the rcu_read_unlock()
1308
+ * in put_swap_device() if there isn't any other way to prevent
1309
+ * swapoff, such as page lock, page table lock, etc. The caller must
1310
+ * be prepared for that. For example, the following situation is
1311
+ * possible.
1312
+ *
1313
+ * CPU1 CPU2
1314
+ * do_swap_page()
1315
+ * ... swapoff+swapon
1316
+ * __read_swap_cache_async()
1317
+ * swapcache_prepare()
1318
+ * __swap_duplicate()
1319
+ * // check swap_map
1320
+ * // verify PTE not changed
1321
+ *
1322
+ * In __swap_duplicate(), the swap_map need to be checked before
1323
+ * changing partly because the specified swap entry may be for another
1324
+ * swap device which has been swapoff. And in do_swap_page(), after
1325
+ * the page is read from the swap device, the PTE is verified not
1326
+ * changed with the page table locked to check whether the swap device
1327
+ * has been swapoff or swapoff+swapon.
1328
+ */
1329
+struct swap_info_struct *get_swap_device(swp_entry_t entry)
1330
+{
1331
+ struct swap_info_struct *si;
1332
+ unsigned long offset;
1333
+
1334
+ if (!entry.val)
1335
+ goto out;
1336
+ si = swp_swap_info(entry);
1337
+ if (!si)
1338
+ goto bad_nofile;
1339
+
1340
+ rcu_read_lock();
1341
+ if (data_race(!(si->flags & SWP_VALID)))
1342
+ goto unlock_out;
1343
+ offset = swp_offset(entry);
1344
+ if (offset >= si->max)
1345
+ goto unlock_out;
1346
+
1347
+ return si;
1348
+bad_nofile:
1349
+ pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
1350
+out:
1351
+ return NULL;
1352
+unlock_out:
1353
+ rcu_read_unlock();
1354
+ return NULL;
1355
+}
1356
+
11751357 static unsigned char __swap_entry_free(struct swap_info_struct *p,
1176
- swp_entry_t entry, unsigned char usage)
1358
+ swp_entry_t entry)
11771359 {
11781360 struct swap_cluster_info *ci;
11791361 unsigned long offset = swp_offset(entry);
1362
+ unsigned char usage;
11801363
11811364 ci = lock_cluster_or_swap_info(p, offset);
1182
- usage = __swap_entry_free_locked(p, offset, usage);
1365
+ usage = __swap_entry_free_locked(p, offset, 1);
11831366 unlock_cluster_or_swap_info(p, ci);
1367
+ if (!usage)
1368
+ free_swap_slot(entry);
11841369
11851370 return usage;
11861371 }
....@@ -1211,10 +1396,8 @@
12111396 struct swap_info_struct *p;
12121397
12131398 p = _swap_info_get(entry);
1214
- if (p) {
1215
- if (!__swap_entry_free(p, entry, 1))
1216
- free_swap_slot(entry);
1217
- }
1399
+ if (p)
1400
+ __swap_entry_free(p, entry);
12181401 }
12191402
12201403 /*
....@@ -1229,7 +1412,7 @@
12291412 unsigned char *map;
12301413 unsigned int i, free_entries = 0;
12311414 unsigned char val;
1232
- int size = swap_entry_size(hpage_nr_pages(page));
1415
+ int size = swap_entry_size(thp_nr_pages(page));
12331416
12341417 si = _swap_info_get(entry);
12351418 if (!si)
....@@ -1249,9 +1432,6 @@
12491432 if (free_entries == SWAPFILE_CLUSTER) {
12501433 unlock_cluster_or_swap_info(si, ci);
12511434 spin_lock(&si->lock);
1252
- ci = lock_cluster(si, offset);
1253
- memset(map, 0, SWAPFILE_CLUSTER);
1254
- unlock_cluster(ci);
12551435 mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
12561436 swap_free_cluster(si, idx);
12571437 spin_unlock(&si->lock);
....@@ -1321,6 +1501,7 @@
13211501 if (p)
13221502 spin_unlock(&p->lock);
13231503 }
1504
+EXPORT_SYMBOL_GPL(swapcache_free_entries);
13241505
13251506 /*
13261507 * How many references to page are currently swapped out?
....@@ -1346,11 +1527,18 @@
13461527 return count;
13471528 }
13481529
1349
-int __swap_count(struct swap_info_struct *si, swp_entry_t entry)
1530
+int __swap_count(swp_entry_t entry)
13501531 {
1532
+ struct swap_info_struct *si;
13511533 pgoff_t offset = swp_offset(entry);
1534
+ int count = 0;
13521535
1353
- return swap_count(si->swap_map[offset]);
1536
+ si = get_swap_device(entry);
1537
+ if (si) {
1538
+ count = swap_count(si->swap_map[offset]);
1539
+ put_swap_device(si);
1540
+ }
1541
+ return count;
13541542 }
13551543
13561544 static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
....@@ -1375,9 +1563,11 @@
13751563 int count = 0;
13761564 struct swap_info_struct *si;
13771565
1378
- si = __swap_info_get(entry);
1379
- if (si)
1566
+ si = get_swap_device(entry);
1567
+ if (si) {
13801568 count = swap_swapcount(si, entry);
1569
+ put_swap_device(si);
1570
+ }
13811571 return count;
13821572 }
13831573
....@@ -1624,7 +1814,6 @@
16241814 int free_swap_and_cache(swp_entry_t entry)
16251815 {
16261816 struct swap_info_struct *p;
1627
- struct page *page = NULL;
16281817 unsigned char count;
16291818
16301819 if (non_swap_entry(entry))
....@@ -1632,32 +1821,11 @@
16321821
16331822 p = _swap_info_get(entry);
16341823 if (p) {
1635
- count = __swap_entry_free(p, entry, 1);
1824
+ count = __swap_entry_free(p, entry);
16361825 if (count == SWAP_HAS_CACHE &&
1637
- !swap_page_trans_huge_swapped(p, entry)) {
1638
- page = find_get_page(swap_address_space(entry),
1639
- swp_offset(entry));
1640
- if (page && !trylock_page(page)) {
1641
- put_page(page);
1642
- page = NULL;
1643
- }
1644
- } else if (!count)
1645
- free_swap_slot(entry);
1646
- }
1647
- if (page) {
1648
- /*
1649
- * Not mapped elsewhere, or swap space full? Free it!
1650
- * Also recheck PageSwapCache now page is locked (above).
1651
- */
1652
- if (PageSwapCache(page) && !PageWriteback(page) &&
1653
- (!page_mapped(page) || mem_cgroup_swap_full(page)) &&
1654
- !swap_page_trans_huge_swapped(p, entry)) {
1655
- page = compound_head(page);
1656
- delete_from_swap_cache(page);
1657
- SetPageDirty(page);
1658
- }
1659
- unlock_page(page);
1660
- put_page(page);
1826
+ !swap_page_trans_huge_swapped(p, entry))
1827
+ __try_to_reclaim_swap(p, swp_offset(entry),
1828
+ TTRS_UNMAPPED | TTRS_FULL);
16611829 }
16621830 return p != NULL;
16631831 }
....@@ -1671,13 +1839,12 @@
16711839 *
16721840 * This is needed for the suspend to disk (aka swsusp).
16731841 */
1674
-int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
1842
+int swap_type_of(dev_t device, sector_t offset)
16751843 {
1676
- struct block_device *bdev = NULL;
16771844 int type;
16781845
1679
- if (device)
1680
- bdev = bdget(device);
1846
+ if (!device)
1847
+ return -1;
16811848
16821849 spin_lock(&swap_lock);
16831850 for (type = 0; type < nr_swapfiles; type++) {
....@@ -1686,30 +1853,34 @@
16861853 if (!(sis->flags & SWP_WRITEOK))
16871854 continue;
16881855
1689
- if (!bdev) {
1690
- if (bdev_p)
1691
- *bdev_p = bdgrab(sis->bdev);
1692
-
1693
- spin_unlock(&swap_lock);
1694
- return type;
1695
- }
1696
- if (bdev == sis->bdev) {
1697
- struct swap_extent *se = &sis->first_swap_extent;
1856
+ if (device == sis->bdev->bd_dev) {
1857
+ struct swap_extent *se = first_se(sis);
16981858
16991859 if (se->start_block == offset) {
1700
- if (bdev_p)
1701
- *bdev_p = bdgrab(sis->bdev);
1702
-
17031860 spin_unlock(&swap_lock);
1704
- bdput(bdev);
17051861 return type;
17061862 }
17071863 }
17081864 }
17091865 spin_unlock(&swap_lock);
1710
- if (bdev)
1711
- bdput(bdev);
1866
+ return -ENODEV;
1867
+}
17121868
1869
+int find_first_swap(dev_t *device)
1870
+{
1871
+ int type;
1872
+
1873
+ spin_lock(&swap_lock);
1874
+ for (type = 0; type < nr_swapfiles; type++) {
1875
+ struct swap_info_struct *sis = swap_info[type];
1876
+
1877
+ if (!(sis->flags & SWP_WRITEOK))
1878
+ continue;
1879
+ *device = sis->bdev->bd_dev;
1880
+ spin_unlock(&swap_lock);
1881
+ return type;
1882
+ }
1883
+ spin_unlock(&swap_lock);
17131884 return -ENODEV;
17141885 }
17151886
....@@ -1756,7 +1927,7 @@
17561927
17571928 static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte)
17581929 {
1759
- return pte_same(pte_swp_clear_soft_dirty(pte), swp_pte);
1930
+ return pte_same(pte_swp_clear_flags(pte), swp_pte);
17601931 }
17611932
17621933 /*
....@@ -1768,7 +1939,6 @@
17681939 unsigned long addr, swp_entry_t entry, struct page *page)
17691940 {
17701941 struct page *swapcache;
1771
- struct mem_cgroup *memcg;
17721942 spinlock_t *ptl;
17731943 pte_t *pte;
17741944 int ret = 1;
....@@ -1778,15 +1948,8 @@
17781948 if (unlikely(!page))
17791949 return -ENOMEM;
17801950
1781
- if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL,
1782
- &memcg, false)) {
1783
- ret = -ENOMEM;
1784
- goto out_nolock;
1785
- }
1786
-
17871951 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
17881952 if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) {
1789
- mem_cgroup_cancel_charge(page, memcg, false);
17901953 ret = 0;
17911954 goto out;
17921955 }
....@@ -1798,21 +1961,13 @@
17981961 pte_mkold(mk_pte(page, vma->vm_page_prot)));
17991962 if (page == swapcache) {
18001963 page_add_anon_rmap(page, vma, addr, false);
1801
- mem_cgroup_commit_charge(page, memcg, true, false);
18021964 } else { /* ksm created a completely new copy */
18031965 page_add_new_anon_rmap(page, vma, addr, false);
1804
- mem_cgroup_commit_charge(page, memcg, false, false);
1805
- lru_cache_add_active_or_unevictable(page, vma);
1966
+ lru_cache_add_inactive_or_unevictable(page, vma);
18061967 }
18071968 swap_free(entry);
1808
- /*
1809
- * Move the page to the active list so it is not
1810
- * immediately swapped out again after swapon.
1811
- */
1812
- activate_page(page);
18131969 out:
18141970 pte_unmap_unlock(pte, ptl);
1815
-out_nolock:
18161971 if (page != swapcache) {
18171972 unlock_page(page);
18181973 put_page(page);
....@@ -1821,44 +1976,83 @@
18211976 }
18221977
18231978 static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
1824
- unsigned long addr, unsigned long end,
1825
- swp_entry_t entry, struct page *page)
1979
+ unsigned long addr, unsigned long end,
1980
+ unsigned int type, bool frontswap,
1981
+ unsigned long *fs_pages_to_unuse)
18261982 {
1827
- pte_t swp_pte = swp_entry_to_pte(entry);
1983
+ struct page *page;
1984
+ swp_entry_t entry;
18281985 pte_t *pte;
1986
+ struct swap_info_struct *si;
1987
+ unsigned long offset;
18291988 int ret = 0;
1989
+ volatile unsigned char *swap_map;
18301990
1831
- /*
1832
- * We don't actually need pte lock while scanning for swp_pte: since
1833
- * we hold page lock and mmap_sem, swp_pte cannot be inserted into the
1834
- * page table while we're scanning; though it could get zapped, and on
1835
- * some architectures (e.g. x86_32 with PAE) we might catch a glimpse
1836
- * of unmatched parts which look like swp_pte, so unuse_pte must
1837
- * recheck under pte lock. Scanning without pte lock lets it be
1838
- * preemptable whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE.
1839
- */
1991
+ si = swap_info[type];
18401992 pte = pte_offset_map(pmd, addr);
18411993 do {
1842
- /*
1843
- * swapoff spends a _lot_ of time in this loop!
1844
- * Test inline before going to call unuse_pte.
1845
- */
1846
- if (unlikely(pte_same_as_swp(*pte, swp_pte))) {
1847
- pte_unmap(pte);
1848
- ret = unuse_pte(vma, pmd, addr, entry, page);
1849
- if (ret)
1850
- goto out;
1851
- pte = pte_offset_map(pmd, addr);
1994
+ if (!is_swap_pte(*pte))
1995
+ continue;
1996
+
1997
+ entry = pte_to_swp_entry(*pte);
1998
+ if (swp_type(entry) != type)
1999
+ continue;
2000
+
2001
+ offset = swp_offset(entry);
2002
+ if (frontswap && !frontswap_test(si, offset))
2003
+ continue;
2004
+
2005
+ pte_unmap(pte);
2006
+ swap_map = &si->swap_map[offset];
2007
+ page = lookup_swap_cache(entry, vma, addr);
2008
+ if (!page) {
2009
+ struct vm_fault vmf = {
2010
+ .vma = vma,
2011
+ .address = addr,
2012
+ .pmd = pmd,
2013
+ };
2014
+
2015
+ page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
2016
+ &vmf);
18522017 }
2018
+ if (!page) {
2019
+ if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD)
2020
+ goto try_next;
2021
+ return -ENOMEM;
2022
+ }
2023
+
2024
+ lock_page(page);
2025
+ wait_on_page_writeback(page);
2026
+ ret = unuse_pte(vma, pmd, addr, entry, page);
2027
+ if (ret < 0) {
2028
+ unlock_page(page);
2029
+ put_page(page);
2030
+ goto out;
2031
+ }
2032
+
2033
+ try_to_free_swap(page);
2034
+ trace_android_vh_unuse_swap_page(si, page);
2035
+ unlock_page(page);
2036
+ put_page(page);
2037
+
2038
+ if (*fs_pages_to_unuse && !--(*fs_pages_to_unuse)) {
2039
+ ret = FRONTSWAP_PAGES_UNUSED;
2040
+ goto out;
2041
+ }
2042
+try_next:
2043
+ pte = pte_offset_map(pmd, addr);
18532044 } while (pte++, addr += PAGE_SIZE, addr != end);
18542045 pte_unmap(pte - 1);
2046
+
2047
+ ret = 0;
18552048 out:
18562049 return ret;
18572050 }
18582051
18592052 static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
18602053 unsigned long addr, unsigned long end,
1861
- swp_entry_t entry, struct page *page)
2054
+ unsigned int type, bool frontswap,
2055
+ unsigned long *fs_pages_to_unuse)
18622056 {
18632057 pmd_t *pmd;
18642058 unsigned long next;
....@@ -1870,7 +2064,8 @@
18702064 next = pmd_addr_end(addr, end);
18712065 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
18722066 continue;
1873
- ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
2067
+ ret = unuse_pte_range(vma, pmd, addr, next, type,
2068
+ frontswap, fs_pages_to_unuse);
18742069 if (ret)
18752070 return ret;
18762071 } while (pmd++, addr = next, addr != end);
....@@ -1879,7 +2074,8 @@
18792074
18802075 static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
18812076 unsigned long addr, unsigned long end,
1882
- swp_entry_t entry, struct page *page)
2077
+ unsigned int type, bool frontswap,
2078
+ unsigned long *fs_pages_to_unuse)
18832079 {
18842080 pud_t *pud;
18852081 unsigned long next;
....@@ -1890,7 +2086,8 @@
18902086 next = pud_addr_end(addr, end);
18912087 if (pud_none_or_clear_bad(pud))
18922088 continue;
1893
- ret = unuse_pmd_range(vma, pud, addr, next, entry, page);
2089
+ ret = unuse_pmd_range(vma, pud, addr, next, type,
2090
+ frontswap, fs_pages_to_unuse);
18942091 if (ret)
18952092 return ret;
18962093 } while (pud++, addr = next, addr != end);
....@@ -1899,7 +2096,8 @@
18992096
19002097 static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
19012098 unsigned long addr, unsigned long end,
1902
- swp_entry_t entry, struct page *page)
2099
+ unsigned int type, bool frontswap,
2100
+ unsigned long *fs_pages_to_unuse)
19032101 {
19042102 p4d_t *p4d;
19052103 unsigned long next;
....@@ -1910,78 +2108,66 @@
19102108 next = p4d_addr_end(addr, end);
19112109 if (p4d_none_or_clear_bad(p4d))
19122110 continue;
1913
- ret = unuse_pud_range(vma, p4d, addr, next, entry, page);
2111
+ ret = unuse_pud_range(vma, p4d, addr, next, type,
2112
+ frontswap, fs_pages_to_unuse);
19142113 if (ret)
19152114 return ret;
19162115 } while (p4d++, addr = next, addr != end);
19172116 return 0;
19182117 }
19192118
1920
-static int unuse_vma(struct vm_area_struct *vma,
1921
- swp_entry_t entry, struct page *page)
2119
+static int unuse_vma(struct vm_area_struct *vma, unsigned int type,
2120
+ bool frontswap, unsigned long *fs_pages_to_unuse)
19222121 {
19232122 pgd_t *pgd;
19242123 unsigned long addr, end, next;
19252124 int ret;
19262125
1927
- if (page_anon_vma(page)) {
1928
- addr = page_address_in_vma(page, vma);
1929
- if (addr == -EFAULT)
1930
- return 0;
1931
- else
1932
- end = addr + PAGE_SIZE;
1933
- } else {
1934
- addr = vma->vm_start;
1935
- end = vma->vm_end;
1936
- }
2126
+ addr = vma->vm_start;
2127
+ end = vma->vm_end;
19372128
19382129 pgd = pgd_offset(vma->vm_mm, addr);
19392130 do {
19402131 next = pgd_addr_end(addr, end);
19412132 if (pgd_none_or_clear_bad(pgd))
19422133 continue;
1943
- ret = unuse_p4d_range(vma, pgd, addr, next, entry, page);
2134
+ ret = unuse_p4d_range(vma, pgd, addr, next, type,
2135
+ frontswap, fs_pages_to_unuse);
19442136 if (ret)
19452137 return ret;
19462138 } while (pgd++, addr = next, addr != end);
19472139 return 0;
19482140 }
19492141
1950
-static int unuse_mm(struct mm_struct *mm,
1951
- swp_entry_t entry, struct page *page)
2142
+static int unuse_mm(struct mm_struct *mm, unsigned int type,
2143
+ bool frontswap, unsigned long *fs_pages_to_unuse)
19522144 {
19532145 struct vm_area_struct *vma;
19542146 int ret = 0;
19552147
1956
- if (!down_read_trylock(&mm->mmap_sem)) {
1957
- /*
1958
- * Activate page so shrink_inactive_list is unlikely to unmap
1959
- * its ptes while lock is dropped, so swapoff can make progress.
1960
- */
1961
- activate_page(page);
1962
- unlock_page(page);
1963
- down_read(&mm->mmap_sem);
1964
- lock_page(page);
1965
- }
2148
+ mmap_read_lock(mm);
19662149 for (vma = mm->mmap; vma; vma = vma->vm_next) {
1967
- if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
1968
- break;
2150
+ if (vma->anon_vma) {
2151
+ ret = unuse_vma(vma, type, frontswap,
2152
+ fs_pages_to_unuse);
2153
+ if (ret)
2154
+ break;
2155
+ }
19692156 cond_resched();
19702157 }
1971
- up_read(&mm->mmap_sem);
1972
- return (ret < 0)? ret: 0;
2158
+ mmap_read_unlock(mm);
2159
+ return ret;
19732160 }
19742161
19752162 /*
19762163 * Scan swap_map (or frontswap_map if frontswap parameter is true)
1977
- * from current position to next entry still in use.
1978
- * Recycle to start on reaching the end, returning 0 when empty.
2164
+ * from current position to next entry still in use. Return 0
2165
+ * if there are no inuse entries after prev till end of the map.
19792166 */
19802167 static unsigned int find_next_to_unuse(struct swap_info_struct *si,
19812168 unsigned int prev, bool frontswap)
19822169 {
1983
- unsigned int max = si->max;
1984
- unsigned int i = prev;
2170
+ unsigned int i;
19852171 unsigned char count;
19862172
19872173 /*
....@@ -1990,20 +2176,7 @@
19902176 * hits are okay, and sys_swapoff() has already prevented new
19912177 * allocations from this area (while holding swap_lock).
19922178 */
1993
- for (;;) {
1994
- if (++i >= max) {
1995
- if (!prev) {
1996
- i = 0;
1997
- break;
1998
- }
1999
- /*
2000
- * No entries in use at top of swap_map,
2001
- * loop back to start and recheck there.
2002
- */
2003
- max = prev + 1;
2004
- prev = 0;
2005
- i = 1;
2006
- }
2179
+ for (i = prev + 1; i < si->max; i++) {
20072180 count = READ_ONCE(si->swap_map[i]);
20082181 if (count && swap_count(count) != SWAP_MAP_BAD)
20092182 if (!frontswap || frontswap_test(si, i))
....@@ -2011,240 +2184,124 @@
20112184 if ((i % LATENCY_LIMIT) == 0)
20122185 cond_resched();
20132186 }
2187
+
2188
+ if (i == si->max)
2189
+ i = 0;
2190
+
20142191 return i;
20152192 }
20162193
20172194 /*
2018
- * We completely avoid races by reading each swap page in advance,
2019
- * and then search for the process using it. All the necessary
2020
- * page table adjustments can then be made atomically.
2021
- *
2022
- * if the boolean frontswap is true, only unuse pages_to_unuse pages;
2195
+ * If the boolean frontswap is true, only unuse pages_to_unuse pages;
20232196 * pages_to_unuse==0 means all pages; ignored if frontswap is false
20242197 */
20252198 int try_to_unuse(unsigned int type, bool frontswap,
20262199 unsigned long pages_to_unuse)
20272200 {
2201
+ struct mm_struct *prev_mm;
2202
+ struct mm_struct *mm;
2203
+ struct list_head *p;
2204
+ int retval = 0;
20282205 struct swap_info_struct *si = swap_info[type];
2029
- struct mm_struct *start_mm;
2030
- volatile unsigned char *swap_map; /* swap_map is accessed without
2031
- * locking. Mark it as volatile
2032
- * to prevent compiler doing
2033
- * something odd.
2034
- */
2035
- unsigned char swcount;
20362206 struct page *page;
20372207 swp_entry_t entry;
2038
- unsigned int i = 0;
2039
- int retval = 0;
2208
+ unsigned int i;
20402209
2041
- /*
2042
- * When searching mms for an entry, a good strategy is to
2043
- * start at the first mm we freed the previous entry from
2044
- * (though actually we don't notice whether we or coincidence
2045
- * freed the entry). Initialize this start_mm with a hold.
2046
- *
2047
- * A simpler strategy would be to start at the last mm we
2048
- * freed the previous entry from; but that would take less
2049
- * advantage of mmlist ordering, which clusters forked mms
2050
- * together, child after parent. If we race with dup_mmap(), we
2051
- * prefer to resolve parent before child, lest we miss entries
2052
- * duplicated after we scanned child: using last mm would invert
2053
- * that.
2054
- */
2055
- start_mm = &init_mm;
2056
- mmget(&init_mm);
2210
+ if (!READ_ONCE(si->inuse_pages))
2211
+ return 0;
20572212
2058
- /*
2059
- * Keep on scanning until all entries have gone. Usually,
2060
- * one pass through swap_map is enough, but not necessarily:
2061
- * there are races when an instance of an entry might be missed.
2062
- */
2063
- while ((i = find_next_to_unuse(si, i, frontswap)) != 0) {
2064
- if (signal_pending(current)) {
2065
- retval = -EINTR;
2066
- break;
2067
- }
2213
+ if (!frontswap)
2214
+ pages_to_unuse = 0;
20682215
2069
- /*
2070
- * Get a page for the entry, using the existing swap
2071
- * cache page if there is one. Otherwise, get a clean
2072
- * page and read the swap into it.
2073
- */
2074
- swap_map = &si->swap_map[i];
2075
- entry = swp_entry(type, i);
2076
- page = read_swap_cache_async(entry,
2077
- GFP_HIGHUSER_MOVABLE, NULL, 0, false);
2078
- if (!page) {
2079
- /*
2080
- * Either swap_duplicate() failed because entry
2081
- * has been freed independently, and will not be
2082
- * reused since sys_swapoff() already disabled
2083
- * allocation from here, or alloc_page() failed.
2084
- */
2085
- swcount = *swap_map;
2086
- /*
2087
- * We don't hold lock here, so the swap entry could be
2088
- * SWAP_MAP_BAD (when the cluster is discarding).
2089
- * Instead of fail out, We can just skip the swap
2090
- * entry because swapoff will wait for discarding
2091
- * finish anyway.
2092
- */
2093
- if (!swcount || swcount == SWAP_MAP_BAD)
2094
- continue;
2095
- retval = -ENOMEM;
2096
- break;
2097
- }
2216
+retry:
2217
+ retval = shmem_unuse(type, frontswap, &pages_to_unuse);
2218
+ if (retval)
2219
+ goto out;
20982220
2099
- /*
2100
- * Don't hold on to start_mm if it looks like exiting.
2101
- */
2102
- if (atomic_read(&start_mm->mm_users) == 1) {
2103
- mmput(start_mm);
2104
- start_mm = &init_mm;
2105
- mmget(&init_mm);
2106
- }
2221
+ prev_mm = &init_mm;
2222
+ mmget(prev_mm);
21072223
2108
- /*
2109
- * Wait for and lock page. When do_swap_page races with
2110
- * try_to_unuse, do_swap_page can handle the fault much
2111
- * faster than try_to_unuse can locate the entry. This
2112
- * apparently redundant "wait_on_page_locked" lets try_to_unuse
2113
- * defer to do_swap_page in such a case - in some tests,
2114
- * do_swap_page and try_to_unuse repeatedly compete.
2115
- */
2116
- wait_on_page_locked(page);
2117
- wait_on_page_writeback(page);
2118
- lock_page(page);
2119
- wait_on_page_writeback(page);
2224
+ spin_lock(&mmlist_lock);
2225
+ p = &init_mm.mmlist;
2226
+ while (READ_ONCE(si->inuse_pages) &&
2227
+ !signal_pending(current) &&
2228
+ (p = p->next) != &init_mm.mmlist) {
21202229
2121
- /*
2122
- * Remove all references to entry.
2123
- */
2124
- swcount = *swap_map;
2125
- if (swap_count(swcount) == SWAP_MAP_SHMEM) {
2126
- retval = shmem_unuse(entry, page);
2127
- /* page has already been unlocked and released */
2128
- if (retval < 0)
2129
- break;
2230
+ mm = list_entry(p, struct mm_struct, mmlist);
2231
+ if (!mmget_not_zero(mm))
21302232 continue;
2131
- }
2132
- if (swap_count(swcount) && start_mm != &init_mm)
2133
- retval = unuse_mm(start_mm, entry, page);
2233
+ spin_unlock(&mmlist_lock);
2234
+ mmput(prev_mm);
2235
+ prev_mm = mm;
2236
+ retval = unuse_mm(mm, type, frontswap, &pages_to_unuse);
21342237
2135
- if (swap_count(*swap_map)) {
2136
- int set_start_mm = (*swap_map >= swcount);
2137
- struct list_head *p = &start_mm->mmlist;
2138
- struct mm_struct *new_start_mm = start_mm;
2139
- struct mm_struct *prev_mm = start_mm;
2140
- struct mm_struct *mm;
2141
-
2142
- mmget(new_start_mm);
2143
- mmget(prev_mm);
2144
- spin_lock(&mmlist_lock);
2145
- while (swap_count(*swap_map) && !retval &&
2146
- (p = p->next) != &start_mm->mmlist) {
2147
- mm = list_entry(p, struct mm_struct, mmlist);
2148
- if (!mmget_not_zero(mm))
2149
- continue;
2150
- spin_unlock(&mmlist_lock);
2151
- mmput(prev_mm);
2152
- prev_mm = mm;
2153
-
2154
- cond_resched();
2155
-
2156
- swcount = *swap_map;
2157
- if (!swap_count(swcount)) /* any usage ? */
2158
- ;
2159
- else if (mm == &init_mm)
2160
- set_start_mm = 1;
2161
- else
2162
- retval = unuse_mm(mm, entry, page);
2163
-
2164
- if (set_start_mm && *swap_map < swcount) {
2165
- mmput(new_start_mm);
2166
- mmget(mm);
2167
- new_start_mm = mm;
2168
- set_start_mm = 0;
2169
- }
2170
- spin_lock(&mmlist_lock);
2171
- }
2172
- spin_unlock(&mmlist_lock);
2173
- mmput(prev_mm);
2174
- mmput(start_mm);
2175
- start_mm = new_start_mm;
2176
- }
21772238 if (retval) {
2178
- unlock_page(page);
2179
- put_page(page);
2180
- break;
2239
+ mmput(prev_mm);
2240
+ goto out;
21812241 }
2182
-
2183
- /*
2184
- * If a reference remains (rare), we would like to leave
2185
- * the page in the swap cache; but try_to_unmap could
2186
- * then re-duplicate the entry once we drop page lock,
2187
- * so we might loop indefinitely; also, that page could
2188
- * not be swapped out to other storage meanwhile. So:
2189
- * delete from cache even if there's another reference,
2190
- * after ensuring that the data has been saved to disk -
2191
- * since if the reference remains (rarer), it will be
2192
- * read from disk into another page. Splitting into two
2193
- * pages would be incorrect if swap supported "shared
2194
- * private" pages, but they are handled by tmpfs files.
2195
- *
2196
- * Given how unuse_vma() targets one particular offset
2197
- * in an anon_vma, once the anon_vma has been determined,
2198
- * this splitting happens to be just what is needed to
2199
- * handle where KSM pages have been swapped out: re-reading
2200
- * is unnecessarily slow, but we can fix that later on.
2201
- */
2202
- if (swap_count(*swap_map) &&
2203
- PageDirty(page) && PageSwapCache(page)) {
2204
- struct writeback_control wbc = {
2205
- .sync_mode = WB_SYNC_NONE,
2206
- };
2207
-
2208
- swap_writepage(compound_head(page), &wbc);
2209
- lock_page(page);
2210
- wait_on_page_writeback(page);
2211
- }
2212
-
2213
- /*
2214
- * It is conceivable that a racing task removed this page from
2215
- * swap cache just before we acquired the page lock at the top,
2216
- * or while we dropped it in unuse_mm(). The page might even
2217
- * be back in swap cache on another swap area: that we must not
2218
- * delete, since it may not have been written out to swap yet.
2219
- */
2220
- if (PageSwapCache(page) &&
2221
- likely(page_private(page) == entry.val) &&
2222
- (!PageTransCompound(page) ||
2223
- !swap_page_trans_huge_swapped(si, entry)))
2224
- delete_from_swap_cache(compound_head(page));
2225
-
2226
- /*
2227
- * So we could skip searching mms once swap count went
2228
- * to 1, we did not mark any present ptes as dirty: must
2229
- * mark page dirty so shrink_page_list will preserve it.
2230
- */
2231
- SetPageDirty(page);
2232
- unlock_page(page);
2233
- put_page(page);
22342242
22352243 /*
22362244 * Make sure that we aren't completely killing
22372245 * interactive performance.
22382246 */
22392247 cond_resched();
2240
- if (frontswap && pages_to_unuse > 0) {
2241
- if (!--pages_to_unuse)
2242
- break;
2243
- }
2248
+ spin_lock(&mmlist_lock);
2249
+ }
2250
+ spin_unlock(&mmlist_lock);
2251
+
2252
+ mmput(prev_mm);
2253
+
2254
+ i = 0;
2255
+ while (READ_ONCE(si->inuse_pages) &&
2256
+ !signal_pending(current) &&
2257
+ (i = find_next_to_unuse(si, i, frontswap)) != 0) {
2258
+
2259
+ entry = swp_entry(type, i);
2260
+ page = find_get_page(swap_address_space(entry), i);
2261
+ if (!page)
2262
+ continue;
2263
+
2264
+ /*
2265
+ * It is conceivable that a racing task removed this page from
2266
+ * swap cache just before we acquired the page lock. The page
2267
+ * might even be back in swap cache on another swap area. But
2268
+ * that is okay, try_to_free_swap() only removes stale pages.
2269
+ */
2270
+ lock_page(page);
2271
+ wait_on_page_writeback(page);
2272
+ try_to_free_swap(page);
2273
+ trace_android_vh_unuse_swap_page(si, page);
2274
+ unlock_page(page);
2275
+ put_page(page);
2276
+
2277
+ /*
2278
+ * For frontswap, we just need to unuse pages_to_unuse, if
2279
+ * it was specified. Need not check frontswap again here as
2280
+ * we already zeroed out pages_to_unuse if not frontswap.
2281
+ */
2282
+ if (pages_to_unuse && --pages_to_unuse == 0)
2283
+ goto out;
22442284 }
22452285
2246
- mmput(start_mm);
2247
- return retval;
2286
+ /*
2287
+ * Lets check again to see if there are still swap entries in the map.
2288
+ * If yes, we would need to do retry the unuse logic again.
2289
+ * Under global memory pressure, swap entries can be reinserted back
2290
+ * into process space after the mmlist loop above passes over them.
2291
+ *
2292
+ * Limit the number of retries? No: when mmget_not_zero() above fails,
2293
+ * that mm is likely to be freeing swap from exit_mmap(), which proceeds
2294
+ * at its own independent pace; and even shmem_writepage() could have
2295
+ * been preempted after get_swap_page(), temporarily hiding that swap.
2296
+ * It's easy and robust (though cpu-intensive) just to keep retrying.
2297
+ */
2298
+ if (READ_ONCE(si->inuse_pages)) {
2299
+ if (!signal_pending(current))
2300
+ goto retry;
2301
+ retval = -EINTR;
2302
+ }
2303
+out:
2304
+ return (retval == FRONTSWAP_PAGES_UNUSED) ? 0 : retval;
22482305 }
22492306
22502307 /*
....@@ -2276,7 +2333,6 @@
22762333 static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
22772334 {
22782335 struct swap_info_struct *sis;
2279
- struct swap_extent *start_se;
22802336 struct swap_extent *se;
22812337 pgoff_t offset;
22822338
....@@ -2284,18 +2340,8 @@
22842340 *bdev = sis->bdev;
22852341
22862342 offset = swp_offset(entry);
2287
- start_se = sis->curr_swap_extent;
2288
- se = start_se;
2289
-
2290
- for ( ; ; ) {
2291
- if (se->start_page <= offset &&
2292
- offset < (se->start_page + se->nr_pages)) {
2293
- return se->start_block + (offset - se->start_page);
2294
- }
2295
- se = list_next_entry(se, list);
2296
- sis->curr_swap_extent = se;
2297
- BUG_ON(se == start_se); /* It *must* be present */
2298
- }
2343
+ se = offset_to_swap_extent(sis, offset);
2344
+ return se->start_block + (offset - se->start_page);
22992345 }
23002346
23012347 /*
....@@ -2305,7 +2351,7 @@
23052351 {
23062352 swp_entry_t entry;
23072353 entry.val = page_private(page);
2308
- return map_swap_entry(entry, bdev) << (PAGE_SHIFT - 9);
2354
+ return map_swap_entry(entry, bdev);
23092355 }
23102356
23112357 /*
....@@ -2313,27 +2359,27 @@
23132359 */
23142360 static void destroy_swap_extents(struct swap_info_struct *sis)
23152361 {
2316
- while (!list_empty(&sis->first_swap_extent.list)) {
2317
- struct swap_extent *se;
2362
+ while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) {
2363
+ struct rb_node *rb = sis->swap_extent_root.rb_node;
2364
+ struct swap_extent *se = rb_entry(rb, struct swap_extent, rb_node);
23182365
2319
- se = list_first_entry(&sis->first_swap_extent.list,
2320
- struct swap_extent, list);
2321
- list_del(&se->list);
2366
+ rb_erase(rb, &sis->swap_extent_root);
23222367 kfree(se);
23232368 }
23242369
2325
- if (sis->flags & SWP_FILE) {
2370
+ if (sis->flags & SWP_ACTIVATED) {
23262371 struct file *swap_file = sis->swap_file;
23272372 struct address_space *mapping = swap_file->f_mapping;
23282373
2329
- sis->flags &= ~SWP_FILE;
2330
- mapping->a_ops->swap_deactivate(swap_file);
2374
+ sis->flags &= ~SWP_ACTIVATED;
2375
+ if (mapping->a_ops->swap_deactivate)
2376
+ mapping->a_ops->swap_deactivate(swap_file);
23312377 }
23322378 }
23332379
23342380 /*
23352381 * Add a block range (and the corresponding page range) into this swapdev's
2336
- * extent list. The extent list is kept sorted in page order.
2382
+ * extent tree.
23372383 *
23382384 * This function rather assumes that it is called in ascending page order.
23392385 */
....@@ -2341,20 +2387,21 @@
23412387 add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
23422388 unsigned long nr_pages, sector_t start_block)
23432389 {
2390
+ struct rb_node **link = &sis->swap_extent_root.rb_node, *parent = NULL;
23442391 struct swap_extent *se;
23452392 struct swap_extent *new_se;
2346
- struct list_head *lh;
23472393
2348
- if (start_page == 0) {
2349
- se = &sis->first_swap_extent;
2350
- sis->curr_swap_extent = se;
2351
- se->start_page = 0;
2352
- se->nr_pages = nr_pages;
2353
- se->start_block = start_block;
2354
- return 1;
2355
- } else {
2356
- lh = sis->first_swap_extent.list.prev; /* Highest extent */
2357
- se = list_entry(lh, struct swap_extent, list);
2394
+ /*
2395
+ * place the new node at the right most since the
2396
+ * function is called in ascending page order.
2397
+ */
2398
+ while (*link) {
2399
+ parent = *link;
2400
+ link = &parent->rb_right;
2401
+ }
2402
+
2403
+ if (parent) {
2404
+ se = rb_entry(parent, struct swap_extent, rb_node);
23582405 BUG_ON(se->start_page + se->nr_pages != start_page);
23592406 if (se->start_block + se->nr_pages == start_block) {
23602407 /* Merge it */
....@@ -2363,9 +2410,7 @@
23632410 }
23642411 }
23652412
2366
- /*
2367
- * No merge. Insert a new extent, preserving ordering.
2368
- */
2413
+ /* No merge, insert a new extent. */
23692414 new_se = kmalloc(sizeof(*se), GFP_KERNEL);
23702415 if (new_se == NULL)
23712416 return -ENOMEM;
....@@ -2373,7 +2418,8 @@
23732418 new_se->nr_pages = nr_pages;
23742419 new_se->start_block = start_block;
23752420
2376
- list_add_tail(&new_se->list, &sis->first_swap_extent.list);
2421
+ rb_link_node(&new_se->rb_node, parent, link);
2422
+ rb_insert_color(&new_se->rb_node, &sis->swap_extent_root);
23772423 return 1;
23782424 }
23792425 EXPORT_SYMBOL_GPL(add_swap_extent);
....@@ -2423,8 +2469,10 @@
24232469
24242470 if (mapping->a_ops->swap_activate) {
24252471 ret = mapping->a_ops->swap_activate(sis, swap_file, span);
2472
+ if (ret >= 0)
2473
+ sis->flags |= SWP_ACTIVATED;
24262474 if (!ret) {
2427
- sis->flags |= SWP_FILE;
2475
+ sis->flags |= SWP_FS_OPS;
24282476 ret = add_swap_extent(sis, 0, sis->max, 0);
24292477 *span = sis->pages;
24302478 }
....@@ -2446,9 +2494,9 @@
24462494 return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
24472495 }
24482496
2449
-static void _enable_swap_info(struct swap_info_struct *p, int prio,
2450
- unsigned char *swap_map,
2451
- struct swap_cluster_info *cluster_info)
2497
+static void setup_swap_info(struct swap_info_struct *p, int prio,
2498
+ unsigned char *swap_map,
2499
+ struct swap_cluster_info *cluster_info)
24522500 {
24532501 int i;
24542502
....@@ -2473,10 +2521,18 @@
24732521 }
24742522 p->swap_map = swap_map;
24752523 p->cluster_info = cluster_info;
2476
- p->flags |= SWP_WRITEOK;
2477
- atomic_long_add(p->pages, &nr_swap_pages);
2478
- total_swap_pages += p->pages;
2524
+}
24792525
2526
+static void _enable_swap_info(struct swap_info_struct *p)
2527
+{
2528
+ bool skip = false;
2529
+
2530
+ p->flags |= SWP_WRITEOK | SWP_VALID;
2531
+ trace_android_vh_account_swap_pages(p, &skip);
2532
+ if (!skip) {
2533
+ atomic_long_add(p->pages, &nr_swap_pages);
2534
+ total_swap_pages += p->pages;
2535
+ }
24802536 assert_spin_locked(&swap_lock);
24812537 /*
24822538 * both lists are plists, and thus priority ordered.
....@@ -2500,7 +2556,17 @@
25002556 frontswap_init(p->type, frontswap_map);
25012557 spin_lock(&swap_lock);
25022558 spin_lock(&p->lock);
2503
- _enable_swap_info(p, prio, swap_map, cluster_info);
2559
+ setup_swap_info(p, prio, swap_map, cluster_info);
2560
+ spin_unlock(&p->lock);
2561
+ spin_unlock(&swap_lock);
2562
+ /*
2563
+ * Guarantee swap_map, cluster_info, etc. fields are valid
2564
+ * between get/put_swap_device() if SWP_VALID bit is set
2565
+ */
2566
+ synchronize_rcu();
2567
+ spin_lock(&swap_lock);
2568
+ spin_lock(&p->lock);
2569
+ _enable_swap_info(p);
25042570 spin_unlock(&p->lock);
25052571 spin_unlock(&swap_lock);
25062572 }
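Splitting _enable_swap_info() lets enable_swap_info() publish the metadata in two steps with a synchronize_rcu() in between: after the grace period, any reader that subsequently observes SWP_VALID is also guaranteed to observe the swap_map/cluster_info stores. A stripped-down sketch of that writer-side ordering; demo_swapdev, demo_publish and the boolean valid field are illustrative stand-ins, and d->lock is assumed to be initialized elsewhere.

#include <linux/rcupdate.h>
#include <linux/spinlock.h>

struct demo_swapdev {
	spinlock_t lock;
	unsigned char *swap_map;
	bool valid;			/* stands in for the SWP_VALID flag bit */
};

static void demo_publish(struct demo_swapdev *d, unsigned char *map)
{
	/* Step 1: install the data structures under the lock. */
	spin_lock(&d->lock);
	d->swap_map = map;
	spin_unlock(&d->lock);

	/*
	 * Step 2: the grace period orders the store above against the
	 * 'valid' store below for every RCU read-side critical section
	 * that starts afterwards.
	 */
	synchronize_rcu();

	/* Step 3: only now advertise the device as usable. */
	spin_lock(&d->lock);
	d->valid = true;
	spin_unlock(&d->lock);
}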
....@@ -2509,7 +2575,8 @@
25092575 {
25102576 spin_lock(&swap_lock);
25112577 spin_lock(&p->lock);
2512
- _enable_swap_info(p, p->prio, p->swap_map, p->cluster_info);
2578
+ setup_swap_info(p, p->prio, p->swap_map, p->cluster_info);
2579
+ _enable_swap_info(p);
25132580 spin_unlock(&p->lock);
25142581 spin_unlock(&swap_lock);
25152582 }
....@@ -2537,6 +2604,7 @@
25372604 struct filename *pathname;
25382605 int err, found = 0;
25392606 unsigned int old_block_size;
2607
+ bool skip = false;
25402608
25412609 if (!capable(CAP_SYS_ADMIN))
25422610 return -EPERM;
....@@ -2574,8 +2642,8 @@
25742642 spin_unlock(&swap_lock);
25752643 goto out_dput;
25762644 }
2577
- del_from_avail_list(p);
25782645 spin_lock(&p->lock);
2646
+ del_from_avail_list(p);
25792647 if (p->prio < 0) {
25802648 struct swap_info_struct *si = p;
25812649 int nid;
....@@ -2591,8 +2659,11 @@
25912659 least_priority++;
25922660 }
25932661 plist_del(&p->list, &swap_active_head);
2594
- atomic_long_sub(p->pages, &nr_swap_pages);
2595
- total_swap_pages -= p->pages;
2662
+ trace_android_vh_account_swap_pages(p, &skip);
2663
+ if (!skip) {
2664
+ atomic_long_sub(p->pages, &nr_swap_pages);
2665
+ total_swap_pages -= p->pages;
2666
+ }
25962667 p->flags &= ~SWP_WRITEOK;
25972668 spin_unlock(&p->lock);
25982669 spin_unlock(&swap_lock);
....@@ -2611,6 +2682,17 @@
26112682 }
26122683
26132684 reenable_swap_slots_cache_unlock();
2685
+
2686
+ spin_lock(&swap_lock);
2687
+ spin_lock(&p->lock);
2688
+ p->flags &= ~SWP_VALID; /* mark swap device as invalid */
2689
+ spin_unlock(&p->lock);
2690
+ spin_unlock(&swap_lock);
2691
+ /*
2692
+ * wait for swap operations protected by get/put_swap_device()
2693
+ * to complete
2694
+ */
2695
+ synchronize_rcu();
26142696
26152697 flush_work(&p->discard_work);
26162698
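This is the swapoff half of the same protocol: SWP_VALID is cleared under the locks, then synchronize_rcu() waits out every reader that may still be inside an RCU read-side critical section before swap_map and friends are torn down further below. The reader side it pairs with is get_swap_device()/put_swap_device(); a hedged sketch of its shape, reusing the illustrative demo_swapdev type from the previous sketch:

#include <linux/rcupdate.h>

/*
 * Everything between rcu_read_lock() and rcu_read_unlock() may safely
 * dereference d->swap_map, because the teardown path clears 'valid' and
 * then waits in synchronize_rcu() before freeing the array.
 */
static unsigned char demo_peek(struct demo_swapdev *d, unsigned long offset)
{
	unsigned char count = 0;

	rcu_read_lock();
	if (d->valid)			/* stands in for the SWP_VALID check */
		count = READ_ONCE(d->swap_map[offset]);
	rcu_read_unlock();

	return count;
}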
....@@ -2647,11 +2729,14 @@
26472729 frontswap_map = frontswap_map_get(p);
26482730 spin_unlock(&p->lock);
26492731 spin_unlock(&swap_lock);
2732
+ arch_swap_invalidate_area(p->type);
26502733 frontswap_invalidate_area(p->type);
26512734 frontswap_map_set(p, NULL);
26522735 mutex_unlock(&swapon_mutex);
26532736 free_percpu(p->percpu_cluster);
26542737 p->percpu_cluster = NULL;
2738
+ free_percpu(p->cluster_next_cpu);
2739
+ p->cluster_next_cpu = NULL;
26552740 vfree(swap_map);
26562741 kvfree(cluster_info);
26572742 kvfree(frontswap_map);
....@@ -2759,20 +2844,24 @@
27592844 struct swap_info_struct *si = v;
27602845 struct file *file;
27612846 int len;
2847
+ unsigned int bytes, inuse;
27622848
27632849 if (si == SEQ_START_TOKEN) {
2764
- seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
2850
+ seq_puts(swap,"Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n");
27652851 return 0;
27662852 }
27672853
2854
+ bytes = si->pages << (PAGE_SHIFT - 10);
2855
+ inuse = si->inuse_pages << (PAGE_SHIFT - 10);
2856
+
27682857 file = si->swap_file;
27692858 len = seq_file_path(swap, file, " \t\n\\");
2770
- seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
2859
+ seq_printf(swap, "%*s%s\t%u\t%s%u\t%s%d\n",
27712860 len < 40 ? 40 - len : 1, " ",
27722861 S_ISBLK(file_inode(file)->i_mode) ?
27732862 "partition" : "file\t",
2774
- si->pages << (PAGE_SHIFT - 10),
2775
- si->inuse_pages << (PAGE_SHIFT - 10),
2863
+ bytes, bytes < 10000000 ? "\t" : "",
2864
+ inuse, inuse < 10000000 ? "\t" : "",
27762865 si->prio);
27772866 return 0;
27782867 }
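The /proc/swaps change precomputes the Size/Used values in KiB and pads short values with an extra tab, so a large swap area no longer pushes the Priority column out of line. A small sketch of the same arithmetic and padding rule; it is plain userspace C with PAGE_SHIFT assumed to be 12 (4 KiB pages) purely for illustration.

#include <stdio.h>

#define DEMO_PAGE_SHIFT 12	/* assumed 4 KiB pages, for the example only */

static void demo_show_row(const char *name, unsigned long pages,
			  unsigned long inuse_pages, int prio)
{
	/* pages -> KiB: shift by (PAGE_SHIFT - 10), since 1 KiB == 1 << 10 bytes */
	unsigned int size = pages << (DEMO_PAGE_SHIFT - 10);
	unsigned int inuse = inuse_pages << (DEMO_PAGE_SHIFT - 10);

	/* values below eight digits get one extra tab to keep the columns aligned */
	printf("%-40s\t%u\t%s%u\t%s%d\n", name,
	       size, size < 10000000 ? "\t" : "",
	       inuse, inuse < 10000000 ? "\t" : "",
	       prio);
}

int main(void)
{
	demo_show_row("/dev/sda2", 2097152, 1024, -2);	/* an 8 GiB swap partition */
	return 0;
}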
....@@ -2798,17 +2887,18 @@
27982887 return 0;
27992888 }
28002889
2801
-static const struct file_operations proc_swaps_operations = {
2802
- .open = swaps_open,
2803
- .read = seq_read,
2804
- .llseek = seq_lseek,
2805
- .release = seq_release,
2806
- .poll = swaps_poll,
2890
+static const struct proc_ops swaps_proc_ops = {
2891
+ .proc_flags = PROC_ENTRY_PERMANENT,
2892
+ .proc_open = swaps_open,
2893
+ .proc_read = seq_read,
2894
+ .proc_lseek = seq_lseek,
2895
+ .proc_release = seq_release,
2896
+ .proc_poll = swaps_poll,
28072897 };
28082898
28092899 static int __init procswaps_init(void)
28102900 {
2811
- proc_create("swaps", 0, NULL, &proc_swaps_operations);
2901
+ proc_create("swaps", 0, NULL, &swaps_proc_ops);
28122902 return 0;
28132903 }
28142904 __initcall(procswaps_init);
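Switching from file_operations to proc_ops follows the procfs interface change that gives /proc entries their own, smaller ops table; PROC_ENTRY_PERMANENT additionally tells procfs the entry is never removed, so it can skip some per-call refcounting. A minimal, hedged sketch of registering a seq_file-backed entry through the new interface; every demo_* identifier is illustrative.

#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int demo_show(struct seq_file *m, void *v)
{
	seq_puts(m, "hello from /proc/demo\n");
	return 0;
}

static int demo_open(struct inode *inode, struct file *file)
{
	return single_open(file, demo_show, NULL);
}

static const struct proc_ops demo_proc_ops = {
	.proc_open	= demo_open,
	.proc_read	= seq_read,
	.proc_lseek	= seq_lseek,
	.proc_release	= single_release,
};

static int __init demo_proc_init(void)
{
	proc_create("demo", 0444, NULL, &demo_proc_ops);
	return 0;
}
__initcall(demo_proc_init);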
....@@ -2825,13 +2915,16 @@
28252915
28262916 static struct swap_info_struct *alloc_swap_info(void)
28272917 {
2828
- struct swap_info_struct *p;
2918
+ struct swap_info_struct *p = NULL;
28292919 struct swap_info_struct *defer = NULL;
28302920 unsigned int type;
28312921 int i;
2832
- int size = sizeof(*p) + nr_node_ids * sizeof(struct plist_node);
2922
+ bool skip = false;
28332923
2834
- p = kvzalloc(size, GFP_KERNEL);
2924
+ trace_android_rvh_alloc_si(&p, &skip);
2925
+ trace_android_vh_alloc_si(&p, &skip);
2926
+ if (!skip)
2927
+ p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL);
28352928 if (!p)
28362929 return ERR_PTR(-ENOMEM);
28372930
....@@ -2863,7 +2956,7 @@
28632956 * would be relying on p->type to remain valid.
28642957 */
28652958 }
2866
- INIT_LIST_HEAD(&p->first_swap_extent.list);
2959
+ p->swap_extent_root = RB_ROOT;
28672960 plist_node_init(&p->list, 0);
28682961 for_each_node(i)
28692962 plist_node_init(&p->avail_lists[i], 0);
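alloc_swap_info() now sizes the allocation with struct_size(), the overflow-checked helper for structures that end in a flexible array (here the per-node avail_lists[]). A hedged sketch of the idiom on a hypothetical structure; demo_device and demo_alloc are illustrative.

#include <linux/mm.h>
#include <linux/overflow.h>
#include <linux/slab.h>

struct demo_per_node {
	int nid;
	unsigned long weight;
};

struct demo_device {
	int type;
	unsigned long pages;
	struct demo_per_node per_node[];	/* flexible array member */
};

static struct demo_device *demo_alloc(unsigned int nr_nodes)
{
	struct demo_device *d;

	/*
	 * struct_size(d, per_node, nr_nodes) ==
	 *     sizeof(*d) + nr_nodes * sizeof(d->per_node[0]),
	 * saturating at SIZE_MAX on overflow so the allocation fails
	 * instead of being silently undersized. Only sizeof is applied
	 * to 'd', so using it before assignment is fine.
	 */
	d = kvzalloc(struct_size(d, per_node, nr_nodes), GFP_KERNEL);
	return d;
}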
....@@ -2881,10 +2974,10 @@
28812974 int error;
28822975
28832976 if (S_ISBLK(inode->i_mode)) {
2884
- p->bdev = bdgrab(I_BDEV(inode));
2885
- error = blkdev_get(p->bdev,
2977
+ p->bdev = blkdev_get_by_dev(inode->i_rdev,
28862978 FMODE_READ | FMODE_WRITE | FMODE_EXCL, p);
2887
- if (error < 0) {
2979
+ if (IS_ERR(p->bdev)) {
2980
+ error = PTR_ERR(p->bdev);
28882981 p->bdev = NULL;
28892982 return error;
28902983 }
....@@ -2892,6 +2985,13 @@
28922985 error = set_blocksize(p->bdev, PAGE_SIZE);
28932986 if (error < 0)
28942987 return error;
2988
+ /*
2989
+ * Zoned block devices contain zones that have a sequential
2990
+ * write only restriction. Hence zoned block devices are not
2991
+ * suitable for swapping. Disallow them here.
2992
+ */
2993
+ if (blk_queue_is_zoned(p->bdev->bd_disk->queue))
2994
+ return -EINVAL;
28952995 p->flags |= SWP_BLKDEV;
28962996 } else if (S_ISREG(inode->i_mode)) {
28972997 p->bdev = inode->i_sb->s_bdev;
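Two separate changes land in these claim_swapfile() hunks: bdgrab()+blkdev_get() collapse into blkdev_get_by_dev(), which returns the block_device or an ERR_PTR() rather than an int, and zoned block devices are rejected because their zones only accept sequential writes. A hedged sketch of that open-and-validate pattern in isolation; demo_open_swap_bdev is illustrative and, unlike the patch, releases the device itself on the zoned error path so the snippet is self-contained.

#include <linux/blkdev.h>
#include <linux/err.h>
#include <linux/fs.h>

static struct block_device *demo_open_swap_bdev(dev_t dev, void *holder)
{
	struct block_device *bdev;

	bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE | FMODE_EXCL,
				 holder);
	if (IS_ERR(bdev))
		return bdev;			/* propagate the ERR_PTR as-is */

	/* Zoned devices enforce sequential writes per zone: unusable for swap. */
	if (blk_queue_is_zoned(bdev->bd_disk->queue)) {
		blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
		return ERR_PTR(-EINVAL);
	}

	return bdev;
}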
....@@ -3188,10 +3288,10 @@
31883288 goto bad_swap_unlock_inode;
31893289 }
31903290
3191
- if (bdi_cap_stable_pages_required(inode_to_bdi(inode)))
3291
+ if (p->bdev && blk_queue_stable_writes(p->bdev->bd_disk->queue))
31923292 p->flags |= SWP_STABLE_WRITES;
31933293
3194
- if (bdi_cap_synchronous_io(inode_to_bdi(inode)))
3294
+ if (p->bdev && p->bdev->bd_disk->fops->rw_page)
31953295 p->flags |= SWP_SYNCHRONOUS_IO;
31963296
31973297 if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) {
....@@ -3199,11 +3299,19 @@
31993299 unsigned long ci, nr_cluster;
32003300
32013301 p->flags |= SWP_SOLIDSTATE;
3302
+ p->cluster_next_cpu = alloc_percpu(unsigned int);
3303
+ if (!p->cluster_next_cpu) {
3304
+ error = -ENOMEM;
3305
+ goto bad_swap_unlock_inode;
3306
+ }
32023307 /*
32033308 * select a random position to start with to help wear leveling
32043309 * SSD
32053310 */
3206
- p->cluster_next = 1 + (prandom_u32() % p->highest_bit);
3311
+ for_each_possible_cpu(cpu) {
3312
+ per_cpu(*p->cluster_next_cpu, cpu) =
3313
+ 1 + prandom_u32_max(p->highest_bit);
3314
+ }
32073315 nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
32083316
32093317 cluster_info = kvcalloc(nr_cluster, sizeof(*cluster_info),
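cluster_next becomes a per-CPU value so that concurrent allocations on an SSD start scanning from different offsets, and each CPU's starting point is randomized for wear leveling. A hedged sketch of the allocate-and-seed step; alloc_percpu(), per_cpu() and prandom_u32_max() are the real kernel interfaces, while demo_cluster_next and demo_init_cluster_next are illustrative.

#include <linux/errno.h>
#include <linux/percpu.h>
#include <linux/random.h>

static unsigned int __percpu *demo_cluster_next;

static int demo_init_cluster_next(unsigned int highest_bit)
{
	int cpu;

	demo_cluster_next = alloc_percpu(unsigned int);
	if (!demo_cluster_next)
		return -ENOMEM;

	/* Seed each CPU with its own offset in [1, highest_bit]. */
	for_each_possible_cpu(cpu)
		per_cpu(*demo_cluster_next, cpu) = 1 + prandom_u32_max(highest_bit);

	return 0;
}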
....@@ -3289,7 +3397,7 @@
32893397 error = inode_drain_writes(inode);
32903398 if (error) {
32913399 inode->i_flags &= ~S_SWAPFILE;
3292
- goto bad_swap_unlock_inode;
3400
+ goto free_swap_address_space;
32933401 }
32943402
32953403 mutex_lock(&swapon_mutex);
....@@ -3297,8 +3405,11 @@
32973405 if (swap_flags & SWAP_FLAG_PREFER)
32983406 prio =
32993407 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
3408
+
3409
+ trace_android_vh_swap_avail_heads_init(swap_avail_heads);
33003410 enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map);
33013411
3412
+ trace_android_vh_init_swap_info_struct(p, swap_avail_heads);
33023413 pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",
33033414 p->pages<<(PAGE_SHIFT-10), name->name, p->prio,
33043415 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
....@@ -3314,11 +3425,15 @@
33143425
33153426 error = 0;
33163427 goto out;
3428
+free_swap_address_space:
3429
+ exit_swap_address_space(p->type);
33173430 bad_swap_unlock_inode:
33183431 inode_unlock(inode);
33193432 bad_swap:
33203433 free_percpu(p->percpu_cluster);
33213434 p->percpu_cluster = NULL;
3435
+ free_percpu(p->cluster_next_cpu);
3436
+ p->cluster_next_cpu = NULL;
33223437 if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
33233438 set_blocksize(p->bdev, p->old_block_size);
33243439 blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
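The error-path hunks mirror the setup changes: cluster_next_cpu is freed alongside percpu_cluster, and the new free_swap_address_space label makes a failure in inode_drain_writes() also unwind the swap address space that was already set up, keeping teardown in strict reverse order of construction. As a hedged illustration of that goto-unwind shape (all demo_* functions are stubs invented for the sketch):

#include <linux/errno.h>

static int demo_step_a(void) { return 0; }	/* e.g. init_swap_address_space() */
static void demo_undo_a(void) { }		/* e.g. exit_swap_address_space() */
static int demo_step_b(void) { return -EIO; }	/* e.g. inode_drain_writes() */

static int demo_setup(void)
{
	int err;

	err = demo_step_a();
	if (err)
		return err;

	err = demo_step_b();
	if (err)
		goto undo_a;	/* undo step A before reporting the failure */

	return 0;

undo_a:
	demo_undo_a();
	return err;
}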
....@@ -3359,14 +3474,17 @@
33593474 spin_lock(&swap_lock);
33603475 for (type = 0; type < nr_swapfiles; type++) {
33613476 struct swap_info_struct *si = swap_info[type];
3477
+ bool skip = false;
33623478
3363
- if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
3479
+ trace_android_vh_si_swapinfo(si, &skip);
3480
+ if (!skip && (si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
33643481 nr_to_be_unused += si->inuse_pages;
33653482 }
33663483 val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused;
33673484 val->totalswap = total_swap_pages + nr_to_be_unused;
33683485 spin_unlock(&swap_lock);
33693486 }
3487
+EXPORT_SYMBOL_GPL(si_swapinfo);
33703488
33713489 /*
33723490 * Verify that a swap entry is valid and increment its swap map count.
....@@ -3388,17 +3506,11 @@
33883506 unsigned char has_cache;
33893507 int err = -EINVAL;
33903508
3391
- if (non_swap_entry(entry))
3392
- goto out;
3393
-
3394
- p = swp_swap_info(entry);
3509
+ p = get_swap_device(entry);
33953510 if (!p)
3396
- goto bad_file;
3511
+ goto out;
33973512
33983513 offset = swp_offset(entry);
3399
- if (unlikely(offset >= p->max))
3400
- goto out;
3401
-
34023514 ci = lock_cluster_or_swap_info(p, offset);
34033515
34043516 count = p->swap_map[offset];
....@@ -3439,16 +3551,14 @@
34393551 } else
34403552 err = -ENOENT; /* unused swap entry */
34413553
3442
- p->swap_map[offset] = count | has_cache;
3554
+ WRITE_ONCE(p->swap_map[offset], count | has_cache);
34433555
34443556 unlock_out:
34453557 unlock_cluster_or_swap_info(p, ci);
34463558 out:
3559
+ if (p)
3560
+ put_swap_device(p);
34473561 return err;
3448
-
3449
-bad_file:
3450
- pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val);
3451
- goto out;
34523562 }
34533563
34543564 /*
....@@ -3481,7 +3591,7 @@
34813591 *
34823592 * Called when allocating swap cache for existing swap entry,
34833593 * This can return error codes. Returns 0 at success.
3484
- * -EBUSY means there is a swap cache.
3594
+ * -EEXIST means there is a swap cache.
34853595 * Note: return code is different from swap_duplicate().
34863596 */
34873597 int swapcache_prepare(swp_entry_t entry)
....@@ -3493,6 +3603,7 @@
34933603 {
34943604 return swap_type_to_swap_info(swp_type(entry));
34953605 }
3606
+EXPORT_SYMBOL_GPL(swp_swap_info);
34963607
34973608 struct swap_info_struct *page_swap_info(struct page *page)
34983609 {
....@@ -3540,6 +3651,7 @@
35403651 struct page *list_page;
35413652 pgoff_t offset;
35423653 unsigned char count;
3654
+ int ret = 0;
35433655
35443656 /*
35453657 * When debugging, it's easier to use __GFP_ZERO here; but it's better
....@@ -3547,15 +3659,15 @@
35473659 */
35483660 page = alloc_page(gfp_mask | __GFP_HIGHMEM);
35493661
3550
- si = swap_info_get(entry);
3662
+ si = get_swap_device(entry);
35513663 if (!si) {
35523664 /*
35533665 * An acceptable race has occurred since the failing
3554
- * __swap_duplicate(): the swap entry has been freed,
3555
- * perhaps even the whole swap_map cleared for swapoff.
3666
+ * __swap_duplicate(): the swap device may have been swapped off
35563667 */
35573668 goto outer;
35583669 }
3670
+ spin_lock(&si->lock);
35593671
35603672 offset = swp_offset(entry);
35613673
....@@ -3573,9 +3685,8 @@
35733685 }
35743686
35753687 if (!page) {
3576
- unlock_cluster(ci);
3577
- spin_unlock(&si->lock);
3578
- return -ENOMEM;
3688
+ ret = -ENOMEM;
3689
+ goto out;
35793690 }
35803691
35813692 /*
....@@ -3627,10 +3738,11 @@
36273738 out:
36283739 unlock_cluster(ci);
36293740 spin_unlock(&si->lock);
3741
+ put_swap_device(si);
36303742 outer:
36313743 if (page)
36323744 __free_page(page);
3633
- return 0;
3745
+ return ret;
36343746 }
36353747
36363748 /*
....@@ -3658,7 +3770,7 @@
36583770
36593771 spin_lock(&si->cont_lock);
36603772 offset &= ~PAGE_MASK;
3661
- page = list_entry(head->lru.next, struct page, lru);
3773
+ page = list_next_entry(head, lru);
36623774 map = kmap_atomic(page) + offset;
36633775
36643776 if (count == SWAP_MAP_MAX) /* initial increment from swap_map */
....@@ -3670,13 +3782,13 @@
36703782 */
36713783 while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
36723784 kunmap_atomic(map);
3673
- page = list_entry(page->lru.next, struct page, lru);
3785
+ page = list_next_entry(page, lru);
36743786 BUG_ON(page == head);
36753787 map = kmap_atomic(page) + offset;
36763788 }
36773789 if (*map == SWAP_CONT_MAX) {
36783790 kunmap_atomic(map);
3679
- page = list_entry(page->lru.next, struct page, lru);
3791
+ page = list_next_entry(page, lru);
36803792 if (page == head) {
36813793 ret = false; /* add count continuation */
36823794 goto out;
....@@ -3686,12 +3798,10 @@
36863798 }
36873799 *map += 1;
36883800 kunmap_atomic(map);
3689
- page = list_entry(page->lru.prev, struct page, lru);
3690
- while (page != head) {
3801
+ while ((page = list_prev_entry(page, lru)) != head) {
36913802 map = kmap_atomic(page) + offset;
36923803 *map = COUNT_CONTINUED;
36933804 kunmap_atomic(map);
3694
- page = list_entry(page->lru.prev, struct page, lru);
36953805 }
36963806 ret = true; /* incremented */
36973807
....@@ -3702,7 +3812,7 @@
37023812 BUG_ON(count != COUNT_CONTINUED);
37033813 while (*map == COUNT_CONTINUED) {
37043814 kunmap_atomic(map);
3705
- page = list_entry(page->lru.next, struct page, lru);
3815
+ page = list_next_entry(page, lru);
37063816 BUG_ON(page == head);
37073817 map = kmap_atomic(page) + offset;
37083818 }
....@@ -3711,13 +3821,11 @@
37113821 if (*map == 0)
37123822 count = 0;
37133823 kunmap_atomic(map);
3714
- page = list_entry(page->lru.prev, struct page, lru);
3715
- while (page != head) {
3824
+ while ((page = list_prev_entry(page, lru)) != head) {
37163825 map = kmap_atomic(page) + offset;
37173826 *map = SWAP_CONT_MAX | count;
37183827 count = COUNT_CONTINUED;
37193828 kunmap_atomic(map);
3720
- page = list_entry(page->lru.prev, struct page, lru);
37213829 }
37223830 ret = count == COUNT_CONTINUED;
37233831 }
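The continuation-page walks switch from open-coded list_entry(page->lru.next, ...) hops to list_next_entry()/list_prev_entry(), which name the intent directly: the neighbouring element of the same type on the same list. A small hedged sketch of the two helpers on an illustrative demo_item type:

#include <linux/list.h>

struct demo_item {
	struct list_head lru;
	int value;
};

/* Return the value of the element that follows @item on its list. */
static int demo_peek_next(struct demo_item *item)
{
	return list_next_entry(item, lru)->value;
}

/* Walk backwards from @item until @head is reached again, clearing values. */
static void demo_clear_backwards(struct demo_item *item, struct demo_item *head)
{
	while ((item = list_prev_entry(item, lru)) != head)
		item->value = 0;
}

The while form in demo_clear_backwards() is the same shape the hunks above adopt for the backward COUNT_CONTINUED fix-ups.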
....@@ -3749,11 +3857,13 @@
37493857 }
37503858
37513859 #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
3752
-void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node,
3753
- gfp_t gfp_mask)
3860
+void __cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
37543861 {
37553862 struct swap_info_struct *si, *next;
3756
- if (!(gfp_mask & __GFP_IO) || !memcg)
3863
+ int nid = page_to_nid(page);
3864
+ bool skip = false;
3865
+
3866
+ if (!(gfp_mask & __GFP_IO))
37573867 return;
37583868
37593869 if (!blk_cgroup_congested())
....@@ -3766,12 +3876,15 @@
37663876 if (current->throttle_queue)
37673877 return;
37683878
3879
+ trace_android_vh___cgroup_throttle_swaprate(nid, &skip);
3880
+ if (skip)
3881
+ return;
3882
+
37693883 spin_lock(&swap_avail_lock);
3770
- plist_for_each_entry_safe(si, next, &swap_avail_heads[node],
3771
- avail_lists[node]) {
3884
+ plist_for_each_entry_safe(si, next, &swap_avail_heads[nid],
3885
+ avail_lists[nid]) {
37723886 if (si->bdev) {
3773
- blkcg_schedule_throttle(bdev_get_queue(si->bdev),
3774
- true);
3887
+ blkcg_schedule_throttle(bdev_get_queue(si->bdev), true);
37753888 break;
37763889 }
37773890 }
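__cgroup_throttle_swaprate() now derives the NUMA node from the page itself and walks that node's swap_avail_heads plist until it finds a block-backed device whose queue can be throttled. A hedged sketch of that priority-ordered walk; plist_for_each_entry_safe() is the real API, while demo_throttle_dev and its fields are illustrative stand-ins for swap_info_struct.

#include <linux/plist.h>

struct demo_throttle_dev {
	struct plist_node avail_node;	/* stand-in for avail_lists[nid] */
	void *bdev;			/* non-NULL for block-backed swap */
};

/* Pick the highest-priority block-backed device on one node's avail list. */
static struct demo_throttle_dev *demo_pick_bdev(struct plist_head *avail_head)
{
	struct demo_throttle_dev *si, *next;

	plist_for_each_entry_safe(si, next, avail_head, avail_node) {
		if (si->bdev)
			return si;	/* plists are priority ordered: first hit wins */
	}
	return NULL;
}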