.. | .. |
---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-only |
---|
1 | 2 | /* |
---|
2 | 3 | * linux/mm/swapfile.c |
---|
3 | 4 | * |
---|
.. | .. |
---|
39 | 40 | #include <linux/swap_slots.h> |
---|
40 | 41 | #include <linux/sort.h> |
---|
41 | 42 | |
---|
42 | | -#include <asm/pgtable.h> |
---|
43 | 43 | #include <asm/tlbflush.h> |
---|
44 | 44 | #include <linux/swapops.h> |
---|
45 | 45 | #include <linux/swap_cgroup.h> |
---|
| 46 | +#include <trace/hooks/mm.h> |
---|
46 | 47 | |
---|
47 | 48 | static bool swap_count_continued(struct swap_info_struct *, pgoff_t, |
---|
48 | 49 | unsigned char); |
---|
.. | .. |
---|
98 | 99 | |
---|
99 | 100 | atomic_t nr_rotate_swap = ATOMIC_INIT(0); |
---|
100 | 101 | |
---|
101 | | -static struct swap_info_struct *swap_type_to_swap_info(int type) |
---|
| 102 | +struct swap_info_struct *swap_type_to_swap_info(int type) |
---|
102 | 103 | { |
---|
103 | 104 | if (type >= READ_ONCE(nr_swapfiles)) |
---|
104 | 105 | return NULL; |
---|
.. | .. |
---|
106 | 107 | smp_rmb(); /* Pairs with smp_wmb in alloc_swap_info. */ |
---|
107 | 108 | return READ_ONCE(swap_info[type]); |
---|
108 | 109 | } |
---|
| 110 | +EXPORT_SYMBOL_GPL(swap_type_to_swap_info); |
---|
109 | 111 | |
---|
110 | 112 | static inline unsigned char swap_count(unsigned char ent) |
---|
111 | 113 | { |
---|
112 | 114 | return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */ |
---|
113 | 115 | } |
---|
114 | 116 | |
---|
| 117 | +/* Reclaim the swap entry anyway if possible */ |
---|
| 118 | +#define TTRS_ANYWAY 0x1 |
---|
| 119 | +/* |
---|
| 120 | + * Reclaim the swap entry if there are no more mappings of the |
---|
| 121 | + * corresponding page |
---|
| 122 | + */ |
---|
| 123 | +#define TTRS_UNMAPPED 0x2 |
---|
| 124 | +/* Reclaim the swap entry if swap is getting full */ |
---|
| 125 | +#define TTRS_FULL 0x4 |
---|
| 126 | + |
---|
115 | 127 | /* returns 1 if swap entry is freed */ |
---|
116 | | -static int |
---|
117 | | -__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) |
---|
| 128 | +static int __try_to_reclaim_swap(struct swap_info_struct *si, |
---|
| 129 | + unsigned long offset, unsigned long flags) |
---|
118 | 130 | { |
---|
119 | 131 | swp_entry_t entry = swp_entry(si->type, offset); |
---|
120 | 132 | struct page *page; |
---|
121 | 133 | int ret = 0; |
---|
122 | 134 | |
---|
123 | | - page = find_get_page(swap_address_space(entry), swp_offset(entry)); |
---|
| 135 | + page = find_get_page(swap_address_space(entry), offset); |
---|
124 | 136 | if (!page) |
---|
125 | 137 | return 0; |
---|
126 | 138 | /* |
---|
127 | | - * This function is called from scan_swap_map() and it's called |
---|
128 | | - * by vmscan.c at reclaiming pages. So, we hold a lock on a page, here. |
---|
129 | | - * We have to use trylock for avoiding deadlock. This is a special |
---|
| 139 | + * This function is called from scan_swap_map_slots() and is also |
---|
| 140 | + * called by vmscan.c when reclaiming pages, so we hold a lock on a page |
---|
| 141 | + * here. We have to use trylock to avoid deadlock. This is a special |
---|
130 | 142 | * case and you should use try_to_free_swap() with explicit lock_page() |
---|
131 | 143 | * in usual operations. |
---|
132 | 144 | */ |
---|
133 | 145 | if (trylock_page(page)) { |
---|
134 | | - ret = try_to_free_swap(page); |
---|
| 146 | + if ((flags & TTRS_ANYWAY) || |
---|
| 147 | + ((flags & TTRS_UNMAPPED) && !page_mapped(page)) || |
---|
| 148 | + ((flags & TTRS_FULL) && mem_cgroup_swap_full(page))) |
---|
| 149 | + ret = try_to_free_swap(page); |
---|
135 | 150 | unlock_page(page); |
---|
136 | 151 | } |
---|
137 | 152 | put_page(page); |
---|
138 | 153 | return ret; |
---|
| 154 | +} |
---|
| 155 | + |
---|
| 156 | +static inline struct swap_extent *first_se(struct swap_info_struct *sis) |
---|
| 157 | +{ |
---|
| 158 | + struct rb_node *rb = rb_first(&sis->swap_extent_root); |
---|
| 159 | + return rb_entry(rb, struct swap_extent, rb_node); |
---|
| 160 | +} |
---|
| 161 | + |
---|
| 162 | +static inline struct swap_extent *next_se(struct swap_extent *se) |
---|
| 163 | +{ |
---|
| 164 | + struct rb_node *rb = rb_next(&se->rb_node); |
---|
| 165 | + return rb ? rb_entry(rb, struct swap_extent, rb_node) : NULL; |
---|
139 | 166 | } |
---|
140 | 167 | |
---|
141 | 168 | /* |
---|
.. | .. |
---|
150 | 177 | int err = 0; |
---|
151 | 178 | |
---|
152 | 179 | /* Do not discard the swap header page! */ |
---|
153 | | - se = &si->first_swap_extent; |
---|
| 180 | + se = first_se(si); |
---|
154 | 181 | start_block = (se->start_block + 1) << (PAGE_SHIFT - 9); |
---|
155 | 182 | nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); |
---|
156 | 183 | if (nr_blocks) { |
---|
.. | .. |
---|
161 | 188 | cond_resched(); |
---|
162 | 189 | } |
---|
163 | 190 | |
---|
164 | | - list_for_each_entry(se, &si->first_swap_extent.list, list) { |
---|
| 191 | + for (se = next_se(se); se; se = next_se(se)) { |
---|
165 | 192 | start_block = se->start_block << (PAGE_SHIFT - 9); |
---|
166 | 193 | nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); |
---|
167 | 194 | |
---|
.. | .. |
---|
175 | 202 | return err; /* That will often be -EOPNOTSUPP */ |
---|
176 | 203 | } |
---|
177 | 204 | |
---|
| 205 | +static struct swap_extent * |
---|
| 206 | +offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset) |
---|
| 207 | +{ |
---|
| 208 | + struct swap_extent *se; |
---|
| 209 | + struct rb_node *rb; |
---|
| 210 | + |
---|
| 211 | + rb = sis->swap_extent_root.rb_node; |
---|
| 212 | + while (rb) { |
---|
| 213 | + se = rb_entry(rb, struct swap_extent, rb_node); |
---|
| 214 | + if (offset < se->start_page) |
---|
| 215 | + rb = rb->rb_left; |
---|
| 216 | + else if (offset >= se->start_page + se->nr_pages) |
---|
| 217 | + rb = rb->rb_right; |
---|
| 218 | + else |
---|
| 219 | + return se; |
---|
| 220 | + } |
---|
| 221 | + /* It *must* be present */ |
---|
| 222 | + BUG(); |
---|
| 223 | +} |
---|
| 224 | + |
---|
| 225 | +sector_t swap_page_sector(struct page *page) |
---|
| 226 | +{ |
---|
| 227 | + struct swap_info_struct *sis = page_swap_info(page); |
---|
| 228 | + struct swap_extent *se; |
---|
| 229 | + sector_t sector; |
---|
| 230 | + pgoff_t offset; |
---|
| 231 | + |
---|
| 232 | + offset = __page_file_index(page); |
---|
| 233 | + se = offset_to_swap_extent(sis, offset); |
---|
| 234 | + sector = se->start_block + (offset - se->start_page); |
---|
| 235 | + return sector << (PAGE_SHIFT - 9); |
---|
| 236 | +} |
---|
| 237 | + |
---|
178 | 238 | /* |
---|
179 | 239 | * swap allocation tell device that a cluster of swap can now be discarded, |
---|
180 | 240 | * to allow the swap device to optimize its wear-levelling. |
---|
.. | .. |
---|
182 | 242 | static void discard_swap_cluster(struct swap_info_struct *si, |
---|
183 | 243 | pgoff_t start_page, pgoff_t nr_pages) |
---|
184 | 244 | { |
---|
185 | | - struct swap_extent *se = si->curr_swap_extent; |
---|
186 | | - int found_extent = 0; |
---|
| 245 | + struct swap_extent *se = offset_to_swap_extent(si, start_page); |
---|
187 | 246 | |
---|
188 | 247 | while (nr_pages) { |
---|
189 | | - if (se->start_page <= start_page && |
---|
190 | | - start_page < se->start_page + se->nr_pages) { |
---|
191 | | - pgoff_t offset = start_page - se->start_page; |
---|
192 | | - sector_t start_block = se->start_block + offset; |
---|
193 | | - sector_t nr_blocks = se->nr_pages - offset; |
---|
| 248 | + pgoff_t offset = start_page - se->start_page; |
---|
| 249 | + sector_t start_block = se->start_block + offset; |
---|
| 250 | + sector_t nr_blocks = se->nr_pages - offset; |
---|
194 | 251 | |
---|
195 | | - if (nr_blocks > nr_pages) |
---|
196 | | - nr_blocks = nr_pages; |
---|
197 | | - start_page += nr_blocks; |
---|
198 | | - nr_pages -= nr_blocks; |
---|
| 252 | + if (nr_blocks > nr_pages) |
---|
| 253 | + nr_blocks = nr_pages; |
---|
| 254 | + start_page += nr_blocks; |
---|
| 255 | + nr_pages -= nr_blocks; |
---|
199 | 256 | |
---|
200 | | - if (!found_extent++) |
---|
201 | | - si->curr_swap_extent = se; |
---|
| 257 | + start_block <<= PAGE_SHIFT - 9; |
---|
| 258 | + nr_blocks <<= PAGE_SHIFT - 9; |
---|
| 259 | + if (blkdev_issue_discard(si->bdev, start_block, |
---|
| 260 | + nr_blocks, GFP_NOIO, 0)) |
---|
| 261 | + break; |
---|
202 | 262 | |
---|
203 | | - start_block <<= PAGE_SHIFT - 9; |
---|
204 | | - nr_blocks <<= PAGE_SHIFT - 9; |
---|
205 | | - if (blkdev_issue_discard(si->bdev, start_block, |
---|
206 | | - nr_blocks, GFP_NOIO, 0)) |
---|
207 | | - break; |
---|
208 | | - } |
---|
209 | | - |
---|
210 | | - se = list_next_entry(se, list); |
---|
| 263 | + se = next_se(se); |
---|
211 | 264 | } |
---|
212 | 265 | } |
---|
213 | 266 | |
---|
.. | .. |
---|
562 | 615 | { |
---|
563 | 616 | struct percpu_cluster *cluster; |
---|
564 | 617 | struct swap_cluster_info *ci; |
---|
565 | | - bool found_free; |
---|
566 | 618 | unsigned long tmp, max; |
---|
567 | 619 | |
---|
568 | 620 | new_cluster: |
---|
.. | .. |
---|
575 | 627 | } else if (!cluster_list_empty(&si->discard_clusters)) { |
---|
576 | 628 | /* |
---|
577 | 629 | * we don't have free cluster but have some clusters in |
---|
578 | | - * discarding, do discard now and reclaim them |
---|
| 630 | + * discarding, do discard now and reclaim them, then |
---|
| 631 | + * reread cluster_next_cpu since we dropped si->lock |
---|
579 | 632 | */ |
---|
580 | 633 | swap_do_scheduled_discard(si); |
---|
581 | | - *scan_base = *offset = si->cluster_next; |
---|
| 634 | + *scan_base = this_cpu_read(*si->cluster_next_cpu); |
---|
| 635 | + *offset = *scan_base; |
---|
582 | 636 | goto new_cluster; |
---|
583 | 637 | } else |
---|
584 | 638 | return false; |
---|
585 | 639 | } |
---|
586 | | - |
---|
587 | | - found_free = false; |
---|
588 | 640 | |
---|
589 | 641 | /* |
---|
590 | 642 | * Other CPUs can use our cluster if they can't find a free cluster, |
---|
.. | .. |
---|
593 | 645 | tmp = cluster->next; |
---|
594 | 646 | max = min_t(unsigned long, si->max, |
---|
595 | 647 | (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER); |
---|
596 | | - if (tmp >= max) { |
---|
597 | | - cluster_set_null(&cluster->index); |
---|
598 | | - goto new_cluster; |
---|
599 | | - } |
---|
600 | | - ci = lock_cluster(si, tmp); |
---|
601 | | - while (tmp < max) { |
---|
602 | | - if (!si->swap_map[tmp]) { |
---|
603 | | - found_free = true; |
---|
604 | | - break; |
---|
| 648 | + if (tmp < max) { |
---|
| 649 | + ci = lock_cluster(si, tmp); |
---|
| 650 | + while (tmp < max) { |
---|
| 651 | + if (!si->swap_map[tmp]) |
---|
| 652 | + break; |
---|
| 653 | + tmp++; |
---|
605 | 654 | } |
---|
606 | | - tmp++; |
---|
| 655 | + unlock_cluster(ci); |
---|
607 | 656 | } |
---|
608 | | - unlock_cluster(ci); |
---|
609 | | - if (!found_free) { |
---|
| 657 | + if (tmp >= max) { |
---|
610 | 658 | cluster_set_null(&cluster->index); |
---|
611 | 659 | goto new_cluster; |
---|
612 | 660 | } |
---|
613 | 661 | cluster->next = tmp + 1; |
---|
614 | 662 | *offset = tmp; |
---|
615 | 663 | *scan_base = tmp; |
---|
616 | | - return found_free; |
---|
| 664 | + return true; |
---|
617 | 665 | } |
---|
618 | 666 | |
---|
619 | 667 | static void __del_from_avail_list(struct swap_info_struct *p) |
---|
620 | 668 | { |
---|
621 | 669 | int nid; |
---|
622 | 670 | |
---|
| 671 | + assert_spin_locked(&p->lock); |
---|
623 | 672 | for_each_node(nid) |
---|
624 | 673 | plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]); |
---|
625 | 674 | } |
---|
626 | 675 | |
---|
627 | 676 | static void del_from_avail_list(struct swap_info_struct *p) |
---|
628 | 677 | { |
---|
| 678 | + bool skip = false; |
---|
| 679 | + |
---|
| 680 | + trace_android_vh_del_from_avail_list(p, &skip); |
---|
| 681 | + if (skip) |
---|
| 682 | + return; |
---|
| 683 | + |
---|
629 | 684 | spin_lock(&swap_avail_lock); |
---|
630 | 685 | __del_from_avail_list(p); |
---|
631 | 686 | spin_unlock(&swap_avail_lock); |
---|
.. | .. |
---|
639 | 694 | if (offset == si->lowest_bit) |
---|
640 | 695 | si->lowest_bit += nr_entries; |
---|
641 | 696 | if (end == si->highest_bit) |
---|
642 | | - si->highest_bit -= nr_entries; |
---|
| 697 | + WRITE_ONCE(si->highest_bit, si->highest_bit - nr_entries); |
---|
643 | 698 | si->inuse_pages += nr_entries; |
---|
644 | 699 | if (si->inuse_pages == si->pages) { |
---|
645 | 700 | si->lowest_bit = si->max; |
---|
.. | .. |
---|
651 | 706 | static void add_to_avail_list(struct swap_info_struct *p) |
---|
652 | 707 | { |
---|
653 | 708 | int nid; |
---|
| 709 | + bool skip = false; |
---|
| 710 | + |
---|
| 711 | + trace_android_vh_add_to_avail_list(p, &skip); |
---|
| 712 | + if (skip) |
---|
| 713 | + return; |
---|
654 | 714 | |
---|
655 | 715 | spin_lock(&swap_avail_lock); |
---|
656 | 716 | for_each_node(nid) { |
---|
.. | .. |
---|
663 | 723 | static void swap_range_free(struct swap_info_struct *si, unsigned long offset, |
---|
664 | 724 | unsigned int nr_entries) |
---|
665 | 725 | { |
---|
| 726 | + unsigned long begin = offset; |
---|
666 | 727 | unsigned long end = offset + nr_entries - 1; |
---|
667 | 728 | void (*swap_slot_free_notify)(struct block_device *, unsigned long); |
---|
| 729 | + bool skip = false; |
---|
668 | 730 | |
---|
669 | 731 | if (offset < si->lowest_bit) |
---|
670 | 732 | si->lowest_bit = offset; |
---|
671 | 733 | if (end > si->highest_bit) { |
---|
672 | 734 | bool was_full = !si->highest_bit; |
---|
673 | 735 | |
---|
674 | | - si->highest_bit = end; |
---|
| 736 | + WRITE_ONCE(si->highest_bit, end); |
---|
675 | 737 | if (was_full && (si->flags & SWP_WRITEOK)) |
---|
676 | 738 | add_to_avail_list(si); |
---|
677 | 739 | } |
---|
678 | | - atomic_long_add(nr_entries, &nr_swap_pages); |
---|
| 740 | + trace_android_vh_account_swap_pages(si, &skip); |
---|
| 741 | + if (!skip) |
---|
| 742 | + atomic_long_add(nr_entries, &nr_swap_pages); |
---|
679 | 743 | si->inuse_pages -= nr_entries; |
---|
680 | 744 | if (si->flags & SWP_BLKDEV) |
---|
681 | 745 | swap_slot_free_notify = |
---|
.. | .. |
---|
683 | 747 | else |
---|
684 | 748 | swap_slot_free_notify = NULL; |
---|
685 | 749 | while (offset <= end) { |
---|
| 750 | + arch_swap_invalidate_page(si->type, offset); |
---|
686 | 751 | frontswap_invalidate_page(si->type, offset); |
---|
687 | 752 | if (swap_slot_free_notify) |
---|
688 | 753 | swap_slot_free_notify(si->bdev, offset); |
---|
689 | 754 | offset++; |
---|
690 | 755 | } |
---|
| 756 | + clear_shadow_from_swap_cache(si->type, begin, end); |
---|
691 | 757 | } |
---|
692 | 758 | |
---|
693 | | -static int scan_swap_map_slots(struct swap_info_struct *si, |
---|
| 759 | +static void set_cluster_next(struct swap_info_struct *si, unsigned long next) |
---|
| 760 | +{ |
---|
| 761 | + unsigned long prev; |
---|
| 762 | + |
---|
| 763 | + if (!(si->flags & SWP_SOLIDSTATE)) { |
---|
| 764 | + si->cluster_next = next; |
---|
| 765 | + return; |
---|
| 766 | + } |
---|
| 767 | + |
---|
| 768 | + prev = this_cpu_read(*si->cluster_next_cpu); |
---|
| 769 | + /* |
---|
| 770 | + * When crossing a swap-address-space-size aligned trunk, choose |
---|
| 771 | + * another trunk randomly to reduce lock contention on the swap |
---|
| 772 | + * address space, if possible. |
---|
| 773 | + */ |
---|
| 774 | + if ((prev >> SWAP_ADDRESS_SPACE_SHIFT) != |
---|
| 775 | + (next >> SWAP_ADDRESS_SPACE_SHIFT)) { |
---|
| 776 | + /* No free swap slots available */ |
---|
| 777 | + if (si->highest_bit <= si->lowest_bit) |
---|
| 778 | + return; |
---|
| 779 | + next = si->lowest_bit + |
---|
| 780 | + prandom_u32_max(si->highest_bit - si->lowest_bit + 1); |
---|
| 781 | + next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES); |
---|
| 782 | + next = max_t(unsigned int, next, si->lowest_bit); |
---|
| 783 | + } |
---|
| 784 | + this_cpu_write(*si->cluster_next_cpu, next); |
---|
| 785 | +} |
---|
| 786 | + |
---|
| 787 | +int scan_swap_map_slots(struct swap_info_struct *si, |
---|
694 | 788 | unsigned char usage, int nr, |
---|
695 | 789 | swp_entry_t slots[]) |
---|
696 | 790 | { |
---|
.. | .. |
---|
700 | 794 | unsigned long last_in_cluster = 0; |
---|
701 | 795 | int latency_ration = LATENCY_LIMIT; |
---|
702 | 796 | int n_ret = 0; |
---|
703 | | - |
---|
704 | | - if (nr > SWAP_BATCH) |
---|
705 | | - nr = SWAP_BATCH; |
---|
| 797 | + bool scanned_many = false; |
---|
706 | 798 | |
---|
707 | 799 | /* |
---|
708 | 800 | * We try to cluster swap pages by allocating them sequentially |
---|
.. | .. |
---|
716 | 808 | */ |
---|
717 | 809 | |
---|
718 | 810 | si->flags += SWP_SCANNING; |
---|
719 | | - scan_base = offset = si->cluster_next; |
---|
| 811 | + /* |
---|
| 812 | + * Use percpu scan base for SSD to reduce lock contention on |
---|
| 813 | + * cluster and swap cache. For HDD, sequential access is more |
---|
| 814 | + * important. |
---|
| 815 | + */ |
---|
| 816 | + if (si->flags & SWP_SOLIDSTATE) |
---|
| 817 | + scan_base = this_cpu_read(*si->cluster_next_cpu); |
---|
| 818 | + else |
---|
| 819 | + scan_base = si->cluster_next; |
---|
| 820 | + offset = scan_base; |
---|
720 | 821 | |
---|
721 | 822 | /* SSD algorithm */ |
---|
722 | 823 | if (si->cluster_info) { |
---|
723 | | - if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) |
---|
724 | | - goto checks; |
---|
725 | | - else |
---|
| 824 | + if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) |
---|
726 | 825 | goto scan; |
---|
727 | | - } |
---|
728 | | - |
---|
729 | | - if (unlikely(!si->cluster_nr--)) { |
---|
| 826 | + } else if (unlikely(!si->cluster_nr--)) { |
---|
730 | 827 | if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { |
---|
731 | 828 | si->cluster_nr = SWAPFILE_CLUSTER - 1; |
---|
732 | 829 | goto checks; |
---|
.. | .. |
---|
789 | 886 | int swap_was_freed; |
---|
790 | 887 | unlock_cluster(ci); |
---|
791 | 888 | spin_unlock(&si->lock); |
---|
792 | | - swap_was_freed = __try_to_reclaim_swap(si, offset); |
---|
| 889 | + swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); |
---|
793 | 890 | spin_lock(&si->lock); |
---|
794 | 891 | /* entry was freed successfully, try to use this again */ |
---|
795 | 892 | if (swap_was_freed) |
---|
.. | .. |
---|
804 | 901 | else |
---|
805 | 902 | goto done; |
---|
806 | 903 | } |
---|
807 | | - si->swap_map[offset] = usage; |
---|
| 904 | + WRITE_ONCE(si->swap_map[offset], usage); |
---|
808 | 905 | inc_cluster_info_page(si, si->cluster_info, offset); |
---|
809 | 906 | unlock_cluster(ci); |
---|
810 | 907 | |
---|
811 | 908 | swap_range_alloc(si, offset, 1); |
---|
812 | | - si->cluster_next = offset + 1; |
---|
813 | 909 | slots[n_ret++] = swp_entry(si->type, offset); |
---|
814 | 910 | |
---|
815 | 911 | /* got enough slots or reach max slots? */ |
---|
.. | .. |
---|
832 | 928 | if (si->cluster_info) { |
---|
833 | 929 | if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) |
---|
834 | 930 | goto checks; |
---|
835 | | - else |
---|
836 | | - goto done; |
---|
837 | | - } |
---|
838 | | - /* non-ssd case */ |
---|
839 | | - ++offset; |
---|
840 | | - |
---|
841 | | - /* non-ssd case, still more slots in cluster? */ |
---|
842 | | - if (si->cluster_nr && !si->swap_map[offset]) { |
---|
| 931 | + } else if (si->cluster_nr && !si->swap_map[++offset]) { |
---|
| 932 | + /* non-ssd case, still more slots in cluster? */ |
---|
843 | 933 | --si->cluster_nr; |
---|
844 | 934 | goto checks; |
---|
845 | 935 | } |
---|
846 | 936 | |
---|
| 937 | + /* |
---|
| 938 | + * Even if there are no free clusters available (fragmented), |
---|
| 939 | + * try to scan a little more quickly with lock held unless we |
---|
| 940 | + * have scanned too many slots already. |
---|
| 941 | + */ |
---|
| 942 | + if (!scanned_many) { |
---|
| 943 | + unsigned long scan_limit; |
---|
| 944 | + |
---|
| 945 | + if (offset < scan_base) |
---|
| 946 | + scan_limit = scan_base; |
---|
| 947 | + else |
---|
| 948 | + scan_limit = si->highest_bit; |
---|
| 949 | + for (; offset <= scan_limit && --latency_ration > 0; |
---|
| 950 | + offset++) { |
---|
| 951 | + if (!si->swap_map[offset]) |
---|
| 952 | + goto checks; |
---|
| 953 | + } |
---|
| 954 | + } |
---|
| 955 | + |
---|
847 | 956 | done: |
---|
| 957 | + set_cluster_next(si, offset + 1); |
---|
848 | 958 | si->flags -= SWP_SCANNING; |
---|
849 | 959 | return n_ret; |
---|
850 | 960 | |
---|
851 | 961 | scan: |
---|
852 | 962 | spin_unlock(&si->lock); |
---|
853 | | - while (++offset <= si->highest_bit) { |
---|
854 | | - if (!si->swap_map[offset]) { |
---|
| 963 | + while (++offset <= READ_ONCE(si->highest_bit)) { |
---|
| 964 | + if (data_race(!si->swap_map[offset])) { |
---|
855 | 965 | spin_lock(&si->lock); |
---|
856 | 966 | goto checks; |
---|
857 | 967 | } |
---|
858 | | - if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { |
---|
| 968 | + if (vm_swap_full() && |
---|
| 969 | + READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) { |
---|
859 | 970 | spin_lock(&si->lock); |
---|
860 | 971 | goto checks; |
---|
861 | 972 | } |
---|
862 | 973 | if (unlikely(--latency_ration < 0)) { |
---|
863 | 974 | cond_resched(); |
---|
864 | 975 | latency_ration = LATENCY_LIMIT; |
---|
| 976 | + scanned_many = true; |
---|
865 | 977 | } |
---|
866 | 978 | } |
---|
867 | 979 | offset = si->lowest_bit; |
---|
868 | 980 | while (offset < scan_base) { |
---|
869 | | - if (!si->swap_map[offset]) { |
---|
| 981 | + if (data_race(!si->swap_map[offset])) { |
---|
870 | 982 | spin_lock(&si->lock); |
---|
871 | 983 | goto checks; |
---|
872 | 984 | } |
---|
873 | | - if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { |
---|
| 985 | + if (vm_swap_full() && |
---|
| 986 | + READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) { |
---|
874 | 987 | spin_lock(&si->lock); |
---|
875 | 988 | goto checks; |
---|
876 | 989 | } |
---|
877 | 990 | if (unlikely(--latency_ration < 0)) { |
---|
878 | 991 | cond_resched(); |
---|
879 | 992 | latency_ration = LATENCY_LIMIT; |
---|
| 993 | + scanned_many = true; |
---|
880 | 994 | } |
---|
881 | 995 | offset++; |
---|
882 | 996 | } |
---|
.. | .. |
---|
886 | 1000 | si->flags -= SWP_SCANNING; |
---|
887 | 1001 | return n_ret; |
---|
888 | 1002 | } |
---|
| 1003 | +EXPORT_SYMBOL_GPL(scan_swap_map_slots); |
---|
889 | 1004 | |
---|
890 | | -static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot) |
---|
| 1005 | +int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot) |
---|
891 | 1006 | { |
---|
892 | 1007 | unsigned long idx; |
---|
893 | 1008 | struct swap_cluster_info *ci; |
---|
.. | .. |
---|
921 | 1036 | |
---|
922 | 1037 | return 1; |
---|
923 | 1038 | } |
---|
| 1039 | +EXPORT_SYMBOL_GPL(swap_alloc_cluster); |
---|
924 | 1040 | |
---|
925 | 1041 | static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) |
---|
926 | 1042 | { |
---|
.. | .. |
---|
928 | 1044 | struct swap_cluster_info *ci; |
---|
929 | 1045 | |
---|
930 | 1046 | ci = lock_cluster(si, offset); |
---|
| 1047 | + memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER); |
---|
931 | 1048 | cluster_set_count_flag(ci, 0, 0); |
---|
932 | 1049 | free_cluster(si, idx); |
---|
933 | 1050 | unlock_cluster(ci); |
---|
.. | .. |
---|
960 | 1077 | /* Only single cluster request supported */ |
---|
961 | 1078 | WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER); |
---|
962 | 1079 | |
---|
| 1080 | + spin_lock(&swap_avail_lock); |
---|
| 1081 | + |
---|
963 | 1082 | avail_pgs = atomic_long_read(&nr_swap_pages) / size; |
---|
964 | | - if (avail_pgs <= 0) |
---|
| 1083 | + if (avail_pgs <= 0) { |
---|
| 1084 | + spin_unlock(&swap_avail_lock); |
---|
965 | 1085 | goto noswap; |
---|
| 1086 | + } |
---|
966 | 1087 | |
---|
967 | | - if (n_goal > SWAP_BATCH) |
---|
968 | | - n_goal = SWAP_BATCH; |
---|
969 | | - |
---|
970 | | - if (n_goal > avail_pgs) |
---|
971 | | - n_goal = avail_pgs; |
---|
| 1088 | + n_goal = min3((long)n_goal, (long)SWAP_BATCH, avail_pgs); |
---|
972 | 1089 | |
---|
973 | 1090 | atomic_long_sub(n_goal * size, &nr_swap_pages); |
---|
974 | | - |
---|
975 | | - spin_lock(&swap_avail_lock); |
---|
976 | 1091 | |
---|
977 | 1092 | start_over: |
---|
978 | 1093 | node = numa_node_id(); |
---|
.. | .. |
---|
1008 | 1123 | goto check_out; |
---|
1009 | 1124 | pr_debug("scan_swap_map of si %d failed to find offset\n", |
---|
1010 | 1125 | si->type); |
---|
| 1126 | + cond_resched(); |
---|
1011 | 1127 | |
---|
1012 | 1128 | spin_lock(&swap_avail_lock); |
---|
1013 | 1129 | nextsi: |
---|
.. | .. |
---|
1041 | 1157 | { |
---|
1042 | 1158 | struct swap_info_struct *si = swap_type_to_swap_info(type); |
---|
1043 | 1159 | pgoff_t offset; |
---|
| 1160 | + bool skip = false; |
---|
1044 | 1161 | |
---|
1045 | 1162 | if (!si) |
---|
1046 | 1163 | goto fail; |
---|
1047 | 1164 | |
---|
1048 | 1165 | spin_lock(&si->lock); |
---|
1049 | 1166 | if (si->flags & SWP_WRITEOK) { |
---|
1050 | | - atomic_long_dec(&nr_swap_pages); |
---|
1051 | 1167 | /* This is called for allocating swap entry, not cache */ |
---|
1052 | 1168 | offset = scan_swap_map(si, 1); |
---|
1053 | 1169 | if (offset) { |
---|
| 1170 | + trace_android_vh_account_swap_pages(si, &skip); |
---|
| 1171 | + if (!skip) |
---|
| 1172 | + atomic_long_dec(&nr_swap_pages); |
---|
1054 | 1173 | spin_unlock(&si->lock); |
---|
1055 | 1174 | return swp_entry(type, offset); |
---|
1056 | 1175 | } |
---|
1057 | | - atomic_long_inc(&nr_swap_pages); |
---|
1058 | 1176 | } |
---|
1059 | 1177 | spin_unlock(&si->lock); |
---|
1060 | 1178 | fail: |
---|
.. | .. |
---|
1064 | 1182 | static struct swap_info_struct *__swap_info_get(swp_entry_t entry) |
---|
1065 | 1183 | { |
---|
1066 | 1184 | struct swap_info_struct *p; |
---|
1067 | | - unsigned long offset, type; |
---|
| 1185 | + unsigned long offset; |
---|
1068 | 1186 | |
---|
1069 | 1187 | if (!entry.val) |
---|
1070 | 1188 | goto out; |
---|
1071 | | - type = swp_type(entry); |
---|
1072 | | - p = swap_type_to_swap_info(type); |
---|
| 1189 | + p = swp_swap_info(entry); |
---|
1073 | 1190 | if (!p) |
---|
1074 | 1191 | goto bad_nofile; |
---|
1075 | | - if (!(p->flags & SWP_USED)) |
---|
| 1192 | + if (data_race(!(p->flags & SWP_USED))) |
---|
1076 | 1193 | goto bad_device; |
---|
1077 | 1194 | offset = swp_offset(entry); |
---|
1078 | 1195 | if (offset >= p->max) |
---|
.. | .. |
---|
1098 | 1215 | p = __swap_info_get(entry); |
---|
1099 | 1216 | if (!p) |
---|
1100 | 1217 | goto out; |
---|
1101 | | - if (!p->swap_map[swp_offset(entry)]) |
---|
| 1218 | + if (data_race(!p->swap_map[swp_offset(entry)])) |
---|
1102 | 1219 | goto bad_free; |
---|
1103 | 1220 | return p; |
---|
1104 | 1221 | |
---|
1105 | 1222 | bad_free: |
---|
1106 | 1223 | pr_err("swap_info_get: %s%08lx\n", Unused_offset, entry.val); |
---|
1107 | | - goto out; |
---|
1108 | 1224 | out: |
---|
1109 | 1225 | return NULL; |
---|
1110 | 1226 | } |
---|
.. | .. |
---|
1167 | 1283 | } |
---|
1168 | 1284 | |
---|
1169 | 1285 | usage = count | has_cache; |
---|
1170 | | - p->swap_map[offset] = usage ? : SWAP_HAS_CACHE; |
---|
| 1286 | + if (usage) |
---|
| 1287 | + WRITE_ONCE(p->swap_map[offset], usage); |
---|
| 1288 | + else |
---|
| 1289 | + WRITE_ONCE(p->swap_map[offset], SWAP_HAS_CACHE); |
---|
1171 | 1290 | |
---|
1172 | 1291 | return usage; |
---|
1173 | 1292 | } |
---|
1174 | 1293 | |
---|
| 1294 | +/* |
---|
| 1295 | + * Check whether swap entry is valid in the swap device. If so, |
---|
| 1296 | + * return pointer to swap_info_struct, and keep the swap entry valid |
---|
| 1297 | + * by preventing the swap device from being swapped off, until |
---|
| 1298 | + * put_swap_device() is called. Otherwise return NULL. |
---|
| 1299 | + * |
---|
| 1300 | + * The entirety of the RCU read critical section must come before the |
---|
| 1301 | + * return from or after the call to synchronize_rcu() in |
---|
| 1302 | + * enable_swap_info() or swapoff(). So if "si->flags & SWP_VALID" is |
---|
| 1303 | + * true, the si->map, si->cluster_info, etc. must be valid in the |
---|
| 1304 | + * critical section. |
---|
| 1305 | + * |
---|
| 1306 | + * Notice that swapoff or swapoff+swapon can still happen before the |
---|
| 1307 | + * rcu_read_lock() in get_swap_device() or after the rcu_read_unlock() |
---|
| 1308 | + * in put_swap_device() if there isn't any other way to prevent |
---|
| 1309 | + * swapoff, such as page lock, page table lock, etc. The caller must |
---|
| 1310 | + * be prepared for that. For example, the following situation is |
---|
| 1311 | + * possible. |
---|
| 1312 | + * |
---|
| 1313 | + * CPU1 CPU2 |
---|
| 1314 | + * do_swap_page() |
---|
| 1315 | + * ... swapoff+swapon |
---|
| 1316 | + * __read_swap_cache_async() |
---|
| 1317 | + * swapcache_prepare() |
---|
| 1318 | + * __swap_duplicate() |
---|
| 1319 | + * // check swap_map |
---|
| 1320 | + * // verify PTE not changed |
---|
| 1321 | + * |
---|
| 1322 | + * In __swap_duplicate(), the swap_map needs to be checked before |
---|
| 1323 | + * changing it, partly because the specified swap entry may be for another |
---|
| 1324 | + * swap device which has been swapped off. And in do_swap_page(), after |
---|
| 1325 | + * the page is read from the swap device, the PTE is verified not |
---|
| 1326 | + * changed with the page table locked to check whether the swap device |
---|
| 1327 | + * has been swapoff or swapoff+swapon. |
---|
| 1328 | + */ |
---|
| 1329 | +struct swap_info_struct *get_swap_device(swp_entry_t entry) |
---|
| 1330 | +{ |
---|
| 1331 | + struct swap_info_struct *si; |
---|
| 1332 | + unsigned long offset; |
---|
| 1333 | + |
---|
| 1334 | + if (!entry.val) |
---|
| 1335 | + goto out; |
---|
| 1336 | + si = swp_swap_info(entry); |
---|
| 1337 | + if (!si) |
---|
| 1338 | + goto bad_nofile; |
---|
| 1339 | + |
---|
| 1340 | + rcu_read_lock(); |
---|
| 1341 | + if (data_race(!(si->flags & SWP_VALID))) |
---|
| 1342 | + goto unlock_out; |
---|
| 1343 | + offset = swp_offset(entry); |
---|
| 1344 | + if (offset >= si->max) |
---|
| 1345 | + goto unlock_out; |
---|
| 1346 | + |
---|
| 1347 | + return si; |
---|
| 1348 | +bad_nofile: |
---|
| 1349 | + pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val); |
---|
| 1350 | +out: |
---|
| 1351 | + return NULL; |
---|
| 1352 | +unlock_out: |
---|
| 1353 | + rcu_read_unlock(); |
---|
| 1354 | + return NULL; |
---|
| 1355 | +} |
---|
| 1356 | + |
---|
1175 | 1357 | static unsigned char __swap_entry_free(struct swap_info_struct *p, |
---|
1176 | | - swp_entry_t entry, unsigned char usage) |
---|
| 1358 | + swp_entry_t entry) |
---|
1177 | 1359 | { |
---|
1178 | 1360 | struct swap_cluster_info *ci; |
---|
1179 | 1361 | unsigned long offset = swp_offset(entry); |
---|
| 1362 | + unsigned char usage; |
---|
1180 | 1363 | |
---|
1181 | 1364 | ci = lock_cluster_or_swap_info(p, offset); |
---|
1182 | | - usage = __swap_entry_free_locked(p, offset, usage); |
---|
| 1365 | + usage = __swap_entry_free_locked(p, offset, 1); |
---|
1183 | 1366 | unlock_cluster_or_swap_info(p, ci); |
---|
| 1367 | + if (!usage) |
---|
| 1368 | + free_swap_slot(entry); |
---|
1184 | 1369 | |
---|
1185 | 1370 | return usage; |
---|
1186 | 1371 | } |
---|
.. | .. |
---|
1211 | 1396 | struct swap_info_struct *p; |
---|
1212 | 1397 | |
---|
1213 | 1398 | p = _swap_info_get(entry); |
---|
1214 | | - if (p) { |
---|
1215 | | - if (!__swap_entry_free(p, entry, 1)) |
---|
1216 | | - free_swap_slot(entry); |
---|
1217 | | - } |
---|
| 1399 | + if (p) |
---|
| 1400 | + __swap_entry_free(p, entry); |
---|
1218 | 1401 | } |
---|
1219 | 1402 | |
---|
1220 | 1403 | /* |
---|
.. | .. |
---|
1229 | 1412 | unsigned char *map; |
---|
1230 | 1413 | unsigned int i, free_entries = 0; |
---|
1231 | 1414 | unsigned char val; |
---|
1232 | | - int size = swap_entry_size(hpage_nr_pages(page)); |
---|
| 1415 | + int size = swap_entry_size(thp_nr_pages(page)); |
---|
1233 | 1416 | |
---|
1234 | 1417 | si = _swap_info_get(entry); |
---|
1235 | 1418 | if (!si) |
---|
.. | .. |
---|
1249 | 1432 | if (free_entries == SWAPFILE_CLUSTER) { |
---|
1250 | 1433 | unlock_cluster_or_swap_info(si, ci); |
---|
1251 | 1434 | spin_lock(&si->lock); |
---|
1252 | | - ci = lock_cluster(si, offset); |
---|
1253 | | - memset(map, 0, SWAPFILE_CLUSTER); |
---|
1254 | | - unlock_cluster(ci); |
---|
1255 | 1435 | mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER); |
---|
1256 | 1436 | swap_free_cluster(si, idx); |
---|
1257 | 1437 | spin_unlock(&si->lock); |
---|
.. | .. |
---|
1321 | 1501 | if (p) |
---|
1322 | 1502 | spin_unlock(&p->lock); |
---|
1323 | 1503 | } |
---|
| 1504 | +EXPORT_SYMBOL_GPL(swapcache_free_entries); |
---|
1324 | 1505 | |
---|
1325 | 1506 | /* |
---|
1326 | 1507 | * How many references to page are currently swapped out? |
---|
.. | .. |
---|
1346 | 1527 | return count; |
---|
1347 | 1528 | } |
---|
1348 | 1529 | |
---|
1349 | | -int __swap_count(struct swap_info_struct *si, swp_entry_t entry) |
---|
| 1530 | +int __swap_count(swp_entry_t entry) |
---|
1350 | 1531 | { |
---|
| 1532 | + struct swap_info_struct *si; |
---|
1351 | 1533 | pgoff_t offset = swp_offset(entry); |
---|
| 1534 | + int count = 0; |
---|
1352 | 1535 | |
---|
1353 | | - return swap_count(si->swap_map[offset]); |
---|
| 1536 | + si = get_swap_device(entry); |
---|
| 1537 | + if (si) { |
---|
| 1538 | + count = swap_count(si->swap_map[offset]); |
---|
| 1539 | + put_swap_device(si); |
---|
| 1540 | + } |
---|
| 1541 | + return count; |
---|
1354 | 1542 | } |
---|
1355 | 1543 | |
---|
1356 | 1544 | static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) |
---|
.. | .. |
---|
1375 | 1563 | int count = 0; |
---|
1376 | 1564 | struct swap_info_struct *si; |
---|
1377 | 1565 | |
---|
1378 | | - si = __swap_info_get(entry); |
---|
1379 | | - if (si) |
---|
| 1566 | + si = get_swap_device(entry); |
---|
| 1567 | + if (si) { |
---|
1380 | 1568 | count = swap_swapcount(si, entry); |
---|
| 1569 | + put_swap_device(si); |
---|
| 1570 | + } |
---|
1381 | 1571 | return count; |
---|
1382 | 1572 | } |
---|
1383 | 1573 | |
---|
.. | .. |
---|
1624 | 1814 | int free_swap_and_cache(swp_entry_t entry) |
---|
1625 | 1815 | { |
---|
1626 | 1816 | struct swap_info_struct *p; |
---|
1627 | | - struct page *page = NULL; |
---|
1628 | 1817 | unsigned char count; |
---|
1629 | 1818 | |
---|
1630 | 1819 | if (non_swap_entry(entry)) |
---|
.. | .. |
---|
1632 | 1821 | |
---|
1633 | 1822 | p = _swap_info_get(entry); |
---|
1634 | 1823 | if (p) { |
---|
1635 | | - count = __swap_entry_free(p, entry, 1); |
---|
| 1824 | + count = __swap_entry_free(p, entry); |
---|
1636 | 1825 | if (count == SWAP_HAS_CACHE && |
---|
1637 | | - !swap_page_trans_huge_swapped(p, entry)) { |
---|
1638 | | - page = find_get_page(swap_address_space(entry), |
---|
1639 | | - swp_offset(entry)); |
---|
1640 | | - if (page && !trylock_page(page)) { |
---|
1641 | | - put_page(page); |
---|
1642 | | - page = NULL; |
---|
1643 | | - } |
---|
1644 | | - } else if (!count) |
---|
1645 | | - free_swap_slot(entry); |
---|
1646 | | - } |
---|
1647 | | - if (page) { |
---|
1648 | | - /* |
---|
1649 | | - * Not mapped elsewhere, or swap space full? Free it! |
---|
1650 | | - * Also recheck PageSwapCache now page is locked (above). |
---|
1651 | | - */ |
---|
1652 | | - if (PageSwapCache(page) && !PageWriteback(page) && |
---|
1653 | | - (!page_mapped(page) || mem_cgroup_swap_full(page)) && |
---|
1654 | | - !swap_page_trans_huge_swapped(p, entry)) { |
---|
1655 | | - page = compound_head(page); |
---|
1656 | | - delete_from_swap_cache(page); |
---|
1657 | | - SetPageDirty(page); |
---|
1658 | | - } |
---|
1659 | | - unlock_page(page); |
---|
1660 | | - put_page(page); |
---|
| 1826 | + !swap_page_trans_huge_swapped(p, entry)) |
---|
| 1827 | + __try_to_reclaim_swap(p, swp_offset(entry), |
---|
| 1828 | + TTRS_UNMAPPED | TTRS_FULL); |
---|
1661 | 1829 | } |
---|
1662 | 1830 | return p != NULL; |
---|
1663 | 1831 | } |
---|
.. | .. |
---|
1671 | 1839 | * |
---|
1672 | 1840 | * This is needed for the suspend to disk (aka swsusp). |
---|
1673 | 1841 | */ |
---|
1674 | | -int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) |
---|
| 1842 | +int swap_type_of(dev_t device, sector_t offset) |
---|
1675 | 1843 | { |
---|
1676 | | - struct block_device *bdev = NULL; |
---|
1677 | 1844 | int type; |
---|
1678 | 1845 | |
---|
1679 | | - if (device) |
---|
1680 | | - bdev = bdget(device); |
---|
| 1846 | + if (!device) |
---|
| 1847 | + return -1; |
---|
1681 | 1848 | |
---|
1682 | 1849 | spin_lock(&swap_lock); |
---|
1683 | 1850 | for (type = 0; type < nr_swapfiles; type++) { |
---|
.. | .. |
---|
1686 | 1853 | if (!(sis->flags & SWP_WRITEOK)) |
---|
1687 | 1854 | continue; |
---|
1688 | 1855 | |
---|
1689 | | - if (!bdev) { |
---|
1690 | | - if (bdev_p) |
---|
1691 | | - *bdev_p = bdgrab(sis->bdev); |
---|
1692 | | - |
---|
1693 | | - spin_unlock(&swap_lock); |
---|
1694 | | - return type; |
---|
1695 | | - } |
---|
1696 | | - if (bdev == sis->bdev) { |
---|
1697 | | - struct swap_extent *se = &sis->first_swap_extent; |
---|
| 1856 | + if (device == sis->bdev->bd_dev) { |
---|
| 1857 | + struct swap_extent *se = first_se(sis); |
---|
1698 | 1858 | |
---|
1699 | 1859 | if (se->start_block == offset) { |
---|
1700 | | - if (bdev_p) |
---|
1701 | | - *bdev_p = bdgrab(sis->bdev); |
---|
1702 | | - |
---|
1703 | 1860 | spin_unlock(&swap_lock); |
---|
1704 | | - bdput(bdev); |
---|
1705 | 1861 | return type; |
---|
1706 | 1862 | } |
---|
1707 | 1863 | } |
---|
1708 | 1864 | } |
---|
1709 | 1865 | spin_unlock(&swap_lock); |
---|
1710 | | - if (bdev) |
---|
1711 | | - bdput(bdev); |
---|
| 1866 | + return -ENODEV; |
---|
| 1867 | +} |
---|
1712 | 1868 | |
---|
| 1869 | +int find_first_swap(dev_t *device) |
---|
| 1870 | +{ |
---|
| 1871 | + int type; |
---|
| 1872 | + |
---|
| 1873 | + spin_lock(&swap_lock); |
---|
| 1874 | + for (type = 0; type < nr_swapfiles; type++) { |
---|
| 1875 | + struct swap_info_struct *sis = swap_info[type]; |
---|
| 1876 | + |
---|
| 1877 | + if (!(sis->flags & SWP_WRITEOK)) |
---|
| 1878 | + continue; |
---|
| 1879 | + *device = sis->bdev->bd_dev; |
---|
| 1880 | + spin_unlock(&swap_lock); |
---|
| 1881 | + return type; |
---|
| 1882 | + } |
---|
| 1883 | + spin_unlock(&swap_lock); |
---|
1713 | 1884 | return -ENODEV; |
---|
1714 | 1885 | } |
---|
1715 | 1886 | |
---|
.. | .. |
---|
1756 | 1927 | |
---|
1757 | 1928 | static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte) |
---|
1758 | 1929 | { |
---|
1759 | | - return pte_same(pte_swp_clear_soft_dirty(pte), swp_pte); |
---|
| 1930 | + return pte_same(pte_swp_clear_flags(pte), swp_pte); |
---|
1760 | 1931 | } |
---|
1761 | 1932 | |
---|
1762 | 1933 | /* |
---|
.. | .. |
---|
1768 | 1939 | unsigned long addr, swp_entry_t entry, struct page *page) |
---|
1769 | 1940 | { |
---|
1770 | 1941 | struct page *swapcache; |
---|
1771 | | - struct mem_cgroup *memcg; |
---|
1772 | 1942 | spinlock_t *ptl; |
---|
1773 | 1943 | pte_t *pte; |
---|
1774 | 1944 | int ret = 1; |
---|
.. | .. |
---|
1778 | 1948 | if (unlikely(!page)) |
---|
1779 | 1949 | return -ENOMEM; |
---|
1780 | 1950 | |
---|
1781 | | - if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, |
---|
1782 | | - &memcg, false)) { |
---|
1783 | | - ret = -ENOMEM; |
---|
1784 | | - goto out_nolock; |
---|
1785 | | - } |
---|
1786 | | - |
---|
1787 | 1951 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
---|
1788 | 1952 | if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) { |
---|
1789 | | - mem_cgroup_cancel_charge(page, memcg, false); |
---|
1790 | 1953 | ret = 0; |
---|
1791 | 1954 | goto out; |
---|
1792 | 1955 | } |
---|
.. | .. |
---|
1798 | 1961 | pte_mkold(mk_pte(page, vma->vm_page_prot))); |
---|
1799 | 1962 | if (page == swapcache) { |
---|
1800 | 1963 | page_add_anon_rmap(page, vma, addr, false); |
---|
1801 | | - mem_cgroup_commit_charge(page, memcg, true, false); |
---|
1802 | 1964 | } else { /* ksm created a completely new copy */ |
---|
1803 | 1965 | page_add_new_anon_rmap(page, vma, addr, false); |
---|
1804 | | - mem_cgroup_commit_charge(page, memcg, false, false); |
---|
1805 | | - lru_cache_add_active_or_unevictable(page, vma); |
---|
| 1966 | + lru_cache_add_inactive_or_unevictable(page, vma); |
---|
1806 | 1967 | } |
---|
1807 | 1968 | swap_free(entry); |
---|
1808 | | - /* |
---|
1809 | | - * Move the page to the active list so it is not |
---|
1810 | | - * immediately swapped out again after swapon. |
---|
1811 | | - */ |
---|
1812 | | - activate_page(page); |
---|
1813 | 1969 | out: |
---|
1814 | 1970 | pte_unmap_unlock(pte, ptl); |
---|
1815 | | -out_nolock: |
---|
1816 | 1971 | if (page != swapcache) { |
---|
1817 | 1972 | unlock_page(page); |
---|
1818 | 1973 | put_page(page); |
---|
.. | .. |
---|
1821 | 1976 | } |
---|
1822 | 1977 | |
---|
1823 | 1978 | static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, |
---|
1824 | | - unsigned long addr, unsigned long end, |
---|
1825 | | - swp_entry_t entry, struct page *page) |
---|
| 1979 | + unsigned long addr, unsigned long end, |
---|
| 1980 | + unsigned int type, bool frontswap, |
---|
| 1981 | + unsigned long *fs_pages_to_unuse) |
---|
1826 | 1982 | { |
---|
1827 | | - pte_t swp_pte = swp_entry_to_pte(entry); |
---|
| 1983 | + struct page *page; |
---|
| 1984 | + swp_entry_t entry; |
---|
1828 | 1985 | pte_t *pte; |
---|
| 1986 | + struct swap_info_struct *si; |
---|
| 1987 | + unsigned long offset; |
---|
1829 | 1988 | int ret = 0; |
---|
| 1989 | + volatile unsigned char *swap_map; |
---|
1830 | 1990 | |
---|
1831 | | - /* |
---|
1832 | | - * We don't actually need pte lock while scanning for swp_pte: since |
---|
1833 | | - * we hold page lock and mmap_sem, swp_pte cannot be inserted into the |
---|
1834 | | - * page table while we're scanning; though it could get zapped, and on |
---|
1835 | | - * some architectures (e.g. x86_32 with PAE) we might catch a glimpse |
---|
1836 | | - * of unmatched parts which look like swp_pte, so unuse_pte must |
---|
1837 | | - * recheck under pte lock. Scanning without pte lock lets it be |
---|
1838 | | - * preemptable whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE. |
---|
1839 | | - */ |
---|
| 1991 | + si = swap_info[type]; |
---|
1840 | 1992 | pte = pte_offset_map(pmd, addr); |
---|
1841 | 1993 | do { |
---|
1842 | | - /* |
---|
1843 | | - * swapoff spends a _lot_ of time in this loop! |
---|
1844 | | - * Test inline before going to call unuse_pte. |
---|
1845 | | - */ |
---|
1846 | | - if (unlikely(pte_same_as_swp(*pte, swp_pte))) { |
---|
1847 | | - pte_unmap(pte); |
---|
1848 | | - ret = unuse_pte(vma, pmd, addr, entry, page); |
---|
1849 | | - if (ret) |
---|
1850 | | - goto out; |
---|
1851 | | - pte = pte_offset_map(pmd, addr); |
---|
| 1994 | + if (!is_swap_pte(*pte)) |
---|
| 1995 | + continue; |
---|
| 1996 | + |
---|
| 1997 | + entry = pte_to_swp_entry(*pte); |
---|
| 1998 | + if (swp_type(entry) != type) |
---|
| 1999 | + continue; |
---|
| 2000 | + |
---|
| 2001 | + offset = swp_offset(entry); |
---|
| 2002 | + if (frontswap && !frontswap_test(si, offset)) |
---|
| 2003 | + continue; |
---|
| 2004 | + |
---|
| 2005 | + pte_unmap(pte); |
---|
| 2006 | + swap_map = &si->swap_map[offset]; |
---|
| 2007 | + page = lookup_swap_cache(entry, vma, addr); |
---|
| 2008 | + if (!page) { |
---|
| 2009 | + struct vm_fault vmf = { |
---|
| 2010 | + .vma = vma, |
---|
| 2011 | + .address = addr, |
---|
| 2012 | + .pmd = pmd, |
---|
| 2013 | + }; |
---|
| 2014 | + |
---|
| 2015 | + page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, |
---|
| 2016 | + &vmf); |
---|
1852 | 2017 | } |
---|
| 2018 | + if (!page) { |
---|
| 2019 | + if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD) |
---|
| 2020 | + goto try_next; |
---|
| 2021 | + return -ENOMEM; |
---|
| 2022 | + } |
---|
| 2023 | + |
---|
| 2024 | + lock_page(page); |
---|
| 2025 | + wait_on_page_writeback(page); |
---|
| 2026 | + ret = unuse_pte(vma, pmd, addr, entry, page); |
---|
| 2027 | + if (ret < 0) { |
---|
| 2028 | + unlock_page(page); |
---|
| 2029 | + put_page(page); |
---|
| 2030 | + goto out; |
---|
| 2031 | + } |
---|
| 2032 | + |
---|
| 2033 | + try_to_free_swap(page); |
---|
| 2034 | + trace_android_vh_unuse_swap_page(si, page); |
---|
| 2035 | + unlock_page(page); |
---|
| 2036 | + put_page(page); |
---|
| 2037 | + |
---|
| 2038 | + if (*fs_pages_to_unuse && !--(*fs_pages_to_unuse)) { |
---|
| 2039 | + ret = FRONTSWAP_PAGES_UNUSED; |
---|
| 2040 | + goto out; |
---|
| 2041 | + } |
---|
| 2042 | +try_next: |
---|
| 2043 | + pte = pte_offset_map(pmd, addr); |
---|
1853 | 2044 | } while (pte++, addr += PAGE_SIZE, addr != end); |
---|
1854 | 2045 | pte_unmap(pte - 1); |
---|
| 2046 | + |
---|
| 2047 | + ret = 0; |
---|
1855 | 2048 | out: |
---|
1856 | 2049 | return ret; |
---|
1857 | 2050 | } |
---|
1858 | 2051 | |
---|
1859 | 2052 | static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, |
---|
1860 | 2053 | unsigned long addr, unsigned long end, |
---|
1861 | | - swp_entry_t entry, struct page *page) |
---|
| 2054 | + unsigned int type, bool frontswap, |
---|
| 2055 | + unsigned long *fs_pages_to_unuse) |
---|
1862 | 2056 | { |
---|
1863 | 2057 | pmd_t *pmd; |
---|
1864 | 2058 | unsigned long next; |
---|
.. | .. |
---|
1870 | 2064 | next = pmd_addr_end(addr, end); |
---|
1871 | 2065 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) |
---|
1872 | 2066 | continue; |
---|
1873 | | - ret = unuse_pte_range(vma, pmd, addr, next, entry, page); |
---|
| 2067 | + ret = unuse_pte_range(vma, pmd, addr, next, type, |
---|
| 2068 | + frontswap, fs_pages_to_unuse); |
---|
1874 | 2069 | if (ret) |
---|
1875 | 2070 | return ret; |
---|
1876 | 2071 | } while (pmd++, addr = next, addr != end); |
---|
.. | .. |
---|
1879 | 2074 | |
---|
1880 | 2075 | static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d, |
---|
1881 | 2076 | unsigned long addr, unsigned long end, |
---|
1882 | | - swp_entry_t entry, struct page *page) |
---|
| 2077 | + unsigned int type, bool frontswap, |
---|
| 2078 | + unsigned long *fs_pages_to_unuse) |
---|
1883 | 2079 | { |
---|
1884 | 2080 | pud_t *pud; |
---|
1885 | 2081 | unsigned long next; |
---|
.. | .. |
---|
1890 | 2086 | next = pud_addr_end(addr, end); |
---|
1891 | 2087 | if (pud_none_or_clear_bad(pud)) |
---|
1892 | 2088 | continue; |
---|
1893 | | - ret = unuse_pmd_range(vma, pud, addr, next, entry, page); |
---|
| 2089 | + ret = unuse_pmd_range(vma, pud, addr, next, type, |
---|
| 2090 | + frontswap, fs_pages_to_unuse); |
---|
1894 | 2091 | if (ret) |
---|
1895 | 2092 | return ret; |
---|
1896 | 2093 | } while (pud++, addr = next, addr != end); |
---|
.. | .. |
---|
1899 | 2096 | |
---|
1900 | 2097 | static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd, |
---|
1901 | 2098 | unsigned long addr, unsigned long end, |
---|
1902 | | - swp_entry_t entry, struct page *page) |
---|
| 2099 | + unsigned int type, bool frontswap, |
---|
| 2100 | + unsigned long *fs_pages_to_unuse) |
---|
1903 | 2101 | { |
---|
1904 | 2102 | p4d_t *p4d; |
---|
1905 | 2103 | unsigned long next; |
---|
.. | .. |
---|
1910 | 2108 | next = p4d_addr_end(addr, end); |
---|
1911 | 2109 | if (p4d_none_or_clear_bad(p4d)) |
---|
1912 | 2110 | continue; |
---|
1913 | | - ret = unuse_pud_range(vma, p4d, addr, next, entry, page); |
---|
| 2111 | + ret = unuse_pud_range(vma, p4d, addr, next, type, |
---|
| 2112 | + frontswap, fs_pages_to_unuse); |
---|
1914 | 2113 | if (ret) |
---|
1915 | 2114 | return ret; |
---|
1916 | 2115 | } while (p4d++, addr = next, addr != end); |
---|
1917 | 2116 | return 0; |
---|
1918 | 2117 | } |
---|
1919 | 2118 | |
---|
1920 | | -static int unuse_vma(struct vm_area_struct *vma, |
---|
1921 | | - swp_entry_t entry, struct page *page) |
---|
| 2119 | +static int unuse_vma(struct vm_area_struct *vma, unsigned int type, |
---|
| 2120 | + bool frontswap, unsigned long *fs_pages_to_unuse) |
---|
1922 | 2121 | { |
---|
1923 | 2122 | pgd_t *pgd; |
---|
1924 | 2123 | unsigned long addr, end, next; |
---|
1925 | 2124 | int ret; |
---|
1926 | 2125 | |
---|
1927 | | - if (page_anon_vma(page)) { |
---|
1928 | | - addr = page_address_in_vma(page, vma); |
---|
1929 | | - if (addr == -EFAULT) |
---|
1930 | | - return 0; |
---|
1931 | | - else |
---|
1932 | | - end = addr + PAGE_SIZE; |
---|
1933 | | - } else { |
---|
1934 | | - addr = vma->vm_start; |
---|
1935 | | - end = vma->vm_end; |
---|
1936 | | - } |
---|
| 2126 | + addr = vma->vm_start; |
---|
| 2127 | + end = vma->vm_end; |
---|
1937 | 2128 | |
---|
1938 | 2129 | pgd = pgd_offset(vma->vm_mm, addr); |
---|
1939 | 2130 | do { |
---|
1940 | 2131 | next = pgd_addr_end(addr, end); |
---|
1941 | 2132 | if (pgd_none_or_clear_bad(pgd)) |
---|
1942 | 2133 | continue; |
---|
1943 | | - ret = unuse_p4d_range(vma, pgd, addr, next, entry, page); |
---|
| 2134 | + ret = unuse_p4d_range(vma, pgd, addr, next, type, |
---|
| 2135 | + frontswap, fs_pages_to_unuse); |
---|
1944 | 2136 | if (ret) |
---|
1945 | 2137 | return ret; |
---|
1946 | 2138 | } while (pgd++, addr = next, addr != end); |
---|
1947 | 2139 | return 0; |
---|
1948 | 2140 | } |
---|
1949 | 2141 | |
---|
1950 | | -static int unuse_mm(struct mm_struct *mm, |
---|
1951 | | - swp_entry_t entry, struct page *page) |
---|
| 2142 | +static int unuse_mm(struct mm_struct *mm, unsigned int type, |
---|
| 2143 | + bool frontswap, unsigned long *fs_pages_to_unuse) |
---|
1952 | 2144 | { |
---|
1953 | 2145 | struct vm_area_struct *vma; |
---|
1954 | 2146 | int ret = 0; |
---|
1955 | 2147 | |
---|
1956 | | - if (!down_read_trylock(&mm->mmap_sem)) { |
---|
1957 | | - /* |
---|
1958 | | - * Activate page so shrink_inactive_list is unlikely to unmap |
---|
1959 | | - * its ptes while lock is dropped, so swapoff can make progress. |
---|
1960 | | - */ |
---|
1961 | | - activate_page(page); |
---|
1962 | | - unlock_page(page); |
---|
1963 | | - down_read(&mm->mmap_sem); |
---|
1964 | | - lock_page(page); |
---|
1965 | | - } |
---|
| 2148 | + mmap_read_lock(mm); |
---|
1966 | 2149 | for (vma = mm->mmap; vma; vma = vma->vm_next) { |
---|
1967 | | - if (vma->anon_vma && (ret = unuse_vma(vma, entry, page))) |
---|
1968 | | - break; |
---|
| 2150 | + if (vma->anon_vma) { |
---|
| 2151 | + ret = unuse_vma(vma, type, frontswap, |
---|
| 2152 | + fs_pages_to_unuse); |
---|
| 2153 | + if (ret) |
---|
| 2154 | + break; |
---|
| 2155 | + } |
---|
1969 | 2156 | cond_resched(); |
---|
1970 | 2157 | } |
---|
1971 | | - up_read(&mm->mmap_sem); |
---|
1972 | | - return (ret < 0)? ret: 0; |
---|
| 2158 | + mmap_read_unlock(mm); |
---|
| 2159 | + return ret; |
---|
1973 | 2160 | } |
---|
1974 | 2161 | |
---|
1975 | 2162 | /* |
---|
1976 | 2163 | * Scan swap_map (or frontswap_map if frontswap parameter is true) |
---|
1977 | | - * from current position to next entry still in use. |
---|
1978 | | - * Recycle to start on reaching the end, returning 0 when empty. |
---|
| 2164 | + * from current position to next entry still in use. Return 0 |
---|
| 2165 | + * if there are no inuse entries after prev till end of the map. |
---|
1979 | 2166 | */ |
---|
1980 | 2167 | static unsigned int find_next_to_unuse(struct swap_info_struct *si, |
---|
1981 | 2168 | unsigned int prev, bool frontswap) |
---|
1982 | 2169 | { |
---|
1983 | | - unsigned int max = si->max; |
---|
1984 | | - unsigned int i = prev; |
---|
| 2170 | + unsigned int i; |
---|
1985 | 2171 | unsigned char count; |
---|
1986 | 2172 | |
---|
1987 | 2173 | /* |
---|
.. | .. |
---|
1990 | 2176 | * hits are okay, and sys_swapoff() has already prevented new |
---|
1991 | 2177 | * allocations from this area (while holding swap_lock). |
---|
1992 | 2178 | */ |
---|
1993 | | - for (;;) { |
---|
1994 | | - if (++i >= max) { |
---|
1995 | | - if (!prev) { |
---|
1996 | | - i = 0; |
---|
1997 | | - break; |
---|
1998 | | - } |
---|
1999 | | - /* |
---|
2000 | | - * No entries in use at top of swap_map, |
---|
2001 | | - * loop back to start and recheck there. |
---|
2002 | | - */ |
---|
2003 | | - max = prev + 1; |
---|
2004 | | - prev = 0; |
---|
2005 | | - i = 1; |
---|
2006 | | - } |
---|
| 2179 | + for (i = prev + 1; i < si->max; i++) { |
---|
2007 | 2180 | count = READ_ONCE(si->swap_map[i]); |
---|
2008 | 2181 | if (count && swap_count(count) != SWAP_MAP_BAD) |
---|
2009 | 2182 | if (!frontswap || frontswap_test(si, i)) |
---|
.. | .. |
---|
2011 | 2184 | if ((i % LATENCY_LIMIT) == 0) |
---|
2012 | 2185 | cond_resched(); |
---|
2013 | 2186 | } |
---|
| 2187 | + |
---|
| 2188 | + if (i == si->max) |
---|
| 2189 | + i = 0; |
---|
| 2190 | + |
---|
2014 | 2191 | return i; |
---|
2015 | 2192 | } |
---|
2016 | 2193 | |
---|
2017 | 2194 | /* |
---|
2018 | | - * We completely avoid races by reading each swap page in advance, |
---|
2019 | | - * and then search for the process using it. All the necessary |
---|
2020 | | - * page table adjustments can then be made atomically. |
---|
2021 | | - * |
---|
2022 | | - * if the boolean frontswap is true, only unuse pages_to_unuse pages; |
---|
| 2195 | + * If the boolean frontswap is true, only unuse pages_to_unuse pages; |
---|
2023 | 2196 | * pages_to_unuse==0 means all pages; ignored if frontswap is false |
---|
2024 | 2197 | */ |
---|
2025 | 2198 | int try_to_unuse(unsigned int type, bool frontswap, |
---|
2026 | 2199 | unsigned long pages_to_unuse) |
---|
2027 | 2200 | { |
---|
| 2201 | + struct mm_struct *prev_mm; |
---|
| 2202 | + struct mm_struct *mm; |
---|
| 2203 | + struct list_head *p; |
---|
| 2204 | + int retval = 0; |
---|
2028 | 2205 | struct swap_info_struct *si = swap_info[type]; |
---|
2029 | | - struct mm_struct *start_mm; |
---|
2030 | | - volatile unsigned char *swap_map; /* swap_map is accessed without |
---|
2031 | | - * locking. Mark it as volatile |
---|
2032 | | - * to prevent compiler doing |
---|
2033 | | - * something odd. |
---|
2034 | | - */ |
---|
2035 | | - unsigned char swcount; |
---|
2036 | 2206 | struct page *page; |
---|
2037 | 2207 | swp_entry_t entry; |
---|
2038 | | - unsigned int i = 0; |
---|
2039 | | - int retval = 0; |
---|
| 2208 | + unsigned int i; |
---|
2040 | 2209 | |
---|
2041 | | - /* |
---|
2042 | | - * When searching mms for an entry, a good strategy is to |
---|
2043 | | - * start at the first mm we freed the previous entry from |
---|
2044 | | - * (though actually we don't notice whether we or coincidence |
---|
2045 | | - * freed the entry). Initialize this start_mm with a hold. |
---|
2046 | | - * |
---|
2047 | | - * A simpler strategy would be to start at the last mm we |
---|
2048 | | - * freed the previous entry from; but that would take less |
---|
2049 | | - * advantage of mmlist ordering, which clusters forked mms |
---|
2050 | | - * together, child after parent. If we race with dup_mmap(), we |
---|
2051 | | - * prefer to resolve parent before child, lest we miss entries |
---|
2052 | | - * duplicated after we scanned child: using last mm would invert |
---|
2053 | | - * that. |
---|
2054 | | - */ |
---|
2055 | | - start_mm = &init_mm; |
---|
2056 | | - mmget(&init_mm); |
---|
| 2210 | + if (!READ_ONCE(si->inuse_pages)) |
---|
| 2211 | + return 0; |
---|
2057 | 2212 | |
---|
2058 | | - /* |
---|
2059 | | - * Keep on scanning until all entries have gone. Usually, |
---|
2060 | | - * one pass through swap_map is enough, but not necessarily: |
---|
2061 | | - * there are races when an instance of an entry might be missed. |
---|
2062 | | - */ |
---|
2063 | | - while ((i = find_next_to_unuse(si, i, frontswap)) != 0) { |
---|
2064 | | - if (signal_pending(current)) { |
---|
2065 | | - retval = -EINTR; |
---|
2066 | | - break; |
---|
2067 | | - } |
---|
| 2213 | + if (!frontswap) |
---|
| 2214 | + pages_to_unuse = 0; |
---|
2068 | 2215 | |
---|
2069 | | - /* |
---|
2070 | | - * Get a page for the entry, using the existing swap |
---|
2071 | | - * cache page if there is one. Otherwise, get a clean |
---|
2072 | | - * page and read the swap into it. |
---|
2073 | | - */ |
---|
2074 | | - swap_map = &si->swap_map[i]; |
---|
2075 | | - entry = swp_entry(type, i); |
---|
2076 | | - page = read_swap_cache_async(entry, |
---|
2077 | | - GFP_HIGHUSER_MOVABLE, NULL, 0, false); |
---|
2078 | | - if (!page) { |
---|
2079 | | - /* |
---|
2080 | | - * Either swap_duplicate() failed because entry |
---|
2081 | | - * has been freed independently, and will not be |
---|
2082 | | - * reused since sys_swapoff() already disabled |
---|
2083 | | - * allocation from here, or alloc_page() failed. |
---|
2084 | | - */ |
---|
2085 | | - swcount = *swap_map; |
---|
2086 | | - /* |
---|
2087 | | - * We don't hold lock here, so the swap entry could be |
---|
2088 | | - * SWAP_MAP_BAD (when the cluster is discarding). |
---|
2089 | | - * Instead of fail out, We can just skip the swap |
---|
2090 | | - * entry because swapoff will wait for discarding |
---|
2091 | | - * finish anyway. |
---|
2092 | | - */ |
---|
2093 | | - if (!swcount || swcount == SWAP_MAP_BAD) |
---|
2094 | | - continue; |
---|
2095 | | - retval = -ENOMEM; |
---|
2096 | | - break; |
---|
2097 | | - } |
---|
| 2216 | +retry: |
---|
| 2217 | + retval = shmem_unuse(type, frontswap, &pages_to_unuse); |
---|
| 2218 | + if (retval) |
---|
| 2219 | + goto out; |
---|
2098 | 2220 | |
---|
2099 | | - /* |
---|
2100 | | - * Don't hold on to start_mm if it looks like exiting. |
---|
2101 | | - */ |
---|
2102 | | - if (atomic_read(&start_mm->mm_users) == 1) { |
---|
2103 | | - mmput(start_mm); |
---|
2104 | | - start_mm = &init_mm; |
---|
2105 | | - mmget(&init_mm); |
---|
2106 | | - } |
---|
| 2221 | + prev_mm = &init_mm; |
---|
| 2222 | + mmget(prev_mm); |
---|
2107 | 2223 | |
---|
2108 | | - /* |
---|
2109 | | - * Wait for and lock page. When do_swap_page races with |
---|
2110 | | - * try_to_unuse, do_swap_page can handle the fault much |
---|
2111 | | - * faster than try_to_unuse can locate the entry. This |
---|
2112 | | - * apparently redundant "wait_on_page_locked" lets try_to_unuse |
---|
2113 | | - * defer to do_swap_page in such a case - in some tests, |
---|
2114 | | - * do_swap_page and try_to_unuse repeatedly compete. |
---|
2115 | | - */ |
---|
2116 | | - wait_on_page_locked(page); |
---|
2117 | | - wait_on_page_writeback(page); |
---|
2118 | | - lock_page(page); |
---|
2119 | | - wait_on_page_writeback(page); |
---|
| 2224 | + spin_lock(&mmlist_lock); |
---|
| 2225 | + p = &init_mm.mmlist; |
---|
| 2226 | + while (READ_ONCE(si->inuse_pages) && |
---|
| 2227 | + !signal_pending(current) && |
---|
| 2228 | + (p = p->next) != &init_mm.mmlist) { |
---|
2120 | 2229 | |
---|
2121 | | - /* |
---|
2122 | | - * Remove all references to entry. |
---|
2123 | | - */ |
---|
2124 | | - swcount = *swap_map; |
---|
2125 | | - if (swap_count(swcount) == SWAP_MAP_SHMEM) { |
---|
2126 | | - retval = shmem_unuse(entry, page); |
---|
2127 | | - /* page has already been unlocked and released */ |
---|
2128 | | - if (retval < 0) |
---|
2129 | | - break; |
---|
| 2230 | + mm = list_entry(p, struct mm_struct, mmlist); |
---|
| 2231 | + if (!mmget_not_zero(mm)) |
---|
2130 | 2232 | continue; |
---|
2131 | | - } |
---|
2132 | | - if (swap_count(swcount) && start_mm != &init_mm) |
---|
2133 | | - retval = unuse_mm(start_mm, entry, page); |
---|
| 2233 | + spin_unlock(&mmlist_lock); |
---|
| 2234 | + mmput(prev_mm); |
---|
| 2235 | + prev_mm = mm; |
---|
| 2236 | + retval = unuse_mm(mm, type, frontswap, &pages_to_unuse); |
---|
2134 | 2237 | |
---|
2135 | | - if (swap_count(*swap_map)) { |
---|
2136 | | - int set_start_mm = (*swap_map >= swcount); |
---|
2137 | | - struct list_head *p = &start_mm->mmlist; |
---|
2138 | | - struct mm_struct *new_start_mm = start_mm; |
---|
2139 | | - struct mm_struct *prev_mm = start_mm; |
---|
2140 | | - struct mm_struct *mm; |
---|
2141 | | - |
---|
2142 | | - mmget(new_start_mm); |
---|
2143 | | - mmget(prev_mm); |
---|
2144 | | - spin_lock(&mmlist_lock); |
---|
2145 | | - while (swap_count(*swap_map) && !retval && |
---|
2146 | | - (p = p->next) != &start_mm->mmlist) { |
---|
2147 | | - mm = list_entry(p, struct mm_struct, mmlist); |
---|
2148 | | - if (!mmget_not_zero(mm)) |
---|
2149 | | - continue; |
---|
2150 | | - spin_unlock(&mmlist_lock); |
---|
2151 | | - mmput(prev_mm); |
---|
2152 | | - prev_mm = mm; |
---|
2153 | | - |
---|
2154 | | - cond_resched(); |
---|
2155 | | - |
---|
2156 | | - swcount = *swap_map; |
---|
2157 | | - if (!swap_count(swcount)) /* any usage ? */ |
---|
2158 | | - ; |
---|
2159 | | - else if (mm == &init_mm) |
---|
2160 | | - set_start_mm = 1; |
---|
2161 | | - else |
---|
2162 | | - retval = unuse_mm(mm, entry, page); |
---|
2163 | | - |
---|
2164 | | - if (set_start_mm && *swap_map < swcount) { |
---|
2165 | | - mmput(new_start_mm); |
---|
2166 | | - mmget(mm); |
---|
2167 | | - new_start_mm = mm; |
---|
2168 | | - set_start_mm = 0; |
---|
2169 | | - } |
---|
2170 | | - spin_lock(&mmlist_lock); |
---|
2171 | | - } |
---|
2172 | | - spin_unlock(&mmlist_lock); |
---|
2173 | | - mmput(prev_mm); |
---|
2174 | | - mmput(start_mm); |
---|
2175 | | - start_mm = new_start_mm; |
---|
2176 | | - } |
---|
2177 | 2238 | if (retval) { |
---|
2178 | | - unlock_page(page); |
---|
2179 | | - put_page(page); |
---|
2180 | | - break; |
---|
| 2239 | + mmput(prev_mm); |
---|
| 2240 | + goto out; |
---|
2181 | 2241 | } |
---|
2182 | | - |
---|
2183 | | - /* |
---|
2184 | | - * If a reference remains (rare), we would like to leave |
---|
2185 | | - * the page in the swap cache; but try_to_unmap could |
---|
2186 | | - * then re-duplicate the entry once we drop page lock, |
---|
2187 | | - * so we might loop indefinitely; also, that page could |
---|
2188 | | - * not be swapped out to other storage meanwhile. So: |
---|
2189 | | - * delete from cache even if there's another reference, |
---|
2190 | | - * after ensuring that the data has been saved to disk - |
---|
2191 | | - * since if the reference remains (rarer), it will be |
---|
2192 | | - * read from disk into another page. Splitting into two |
---|
2193 | | - * pages would be incorrect if swap supported "shared |
---|
2194 | | - * private" pages, but they are handled by tmpfs files. |
---|
2195 | | - * |
---|
2196 | | - * Given how unuse_vma() targets one particular offset |
---|
2197 | | - * in an anon_vma, once the anon_vma has been determined, |
---|
2198 | | - * this splitting happens to be just what is needed to |
---|
2199 | | - * handle where KSM pages have been swapped out: re-reading |
---|
2200 | | - * is unnecessarily slow, but we can fix that later on. |
---|
2201 | | - */ |
---|
2202 | | - if (swap_count(*swap_map) && |
---|
2203 | | - PageDirty(page) && PageSwapCache(page)) { |
---|
2204 | | - struct writeback_control wbc = { |
---|
2205 | | - .sync_mode = WB_SYNC_NONE, |
---|
2206 | | - }; |
---|
2207 | | - |
---|
2208 | | - swap_writepage(compound_head(page), &wbc); |
---|
2209 | | - lock_page(page); |
---|
2210 | | - wait_on_page_writeback(page); |
---|
2211 | | - } |
---|
2212 | | - |
---|
2213 | | - /* |
---|
2214 | | - * It is conceivable that a racing task removed this page from |
---|
2215 | | - * swap cache just before we acquired the page lock at the top, |
---|
2216 | | - * or while we dropped it in unuse_mm(). The page might even |
---|
2217 | | - * be back in swap cache on another swap area: that we must not |
---|
2218 | | - * delete, since it may not have been written out to swap yet. |
---|
2219 | | - */ |
---|
2220 | | - if (PageSwapCache(page) && |
---|
2221 | | - likely(page_private(page) == entry.val) && |
---|
2222 | | - (!PageTransCompound(page) || |
---|
2223 | | - !swap_page_trans_huge_swapped(si, entry))) |
---|
2224 | | - delete_from_swap_cache(compound_head(page)); |
---|
2225 | | - |
---|
2226 | | - /* |
---|
2227 | | - * So we could skip searching mms once swap count went |
---|
2228 | | - * to 1, we did not mark any present ptes as dirty: must |
---|
2229 | | - * mark page dirty so shrink_page_list will preserve it. |
---|
2230 | | - */ |
---|
2231 | | - SetPageDirty(page); |
---|
2232 | | - unlock_page(page); |
---|
2233 | | - put_page(page); |
---|
2234 | 2242 | |
---|
2235 | 2243 | /* |
---|
2236 | 2244 | * Make sure that we aren't completely killing |
---|
2237 | 2245 | * interactive performance. |
---|
2238 | 2246 | */ |
---|
2239 | 2247 | cond_resched(); |
---|
2240 | | - if (frontswap && pages_to_unuse > 0) { |
---|
2241 | | - if (!--pages_to_unuse) |
---|
2242 | | - break; |
---|
2243 | | - } |
---|
| 2248 | + spin_lock(&mmlist_lock); |
---|
| 2249 | + } |
---|
| 2250 | + spin_unlock(&mmlist_lock); |
---|
| 2251 | + |
---|
| 2252 | + mmput(prev_mm); |
---|
| 2253 | + |
---|
| 2254 | + i = 0; |
---|
| 2255 | + while (READ_ONCE(si->inuse_pages) && |
---|
| 2256 | + !signal_pending(current) && |
---|
| 2257 | + (i = find_next_to_unuse(si, i, frontswap)) != 0) { |
---|
| 2258 | + |
---|
| 2259 | + entry = swp_entry(type, i); |
---|
| 2260 | + page = find_get_page(swap_address_space(entry), i); |
---|
| 2261 | + if (!page) |
---|
| 2262 | + continue; |
---|
| 2263 | + |
---|
| 2264 | + /* |
---|
| 2265 | + * It is conceivable that a racing task removed this page from |
---|
| 2266 | + * swap cache just before we acquired the page lock. The page |
---|
| 2267 | + * might even be back in swap cache on another swap area. But |
---|
| 2268 | + * that is okay, try_to_free_swap() only removes stale pages. |
---|
| 2269 | + */ |
---|
| 2270 | + lock_page(page); |
---|
| 2271 | + wait_on_page_writeback(page); |
---|
| 2272 | + try_to_free_swap(page); |
---|
| 2273 | + trace_android_vh_unuse_swap_page(si, page); |
---|
| 2274 | + unlock_page(page); |
---|
| 2275 | + put_page(page); |
---|
| 2276 | + |
---|
| 2277 | + /* |
---|
| 2278 | + * For frontswap, we just need to unuse pages_to_unuse, if |
---|
| 2279 | + * it was specified. Need not check frontswap again here as |
---|
| 2280 | + * we already zeroed out pages_to_unuse if not frontswap. |
---|
| 2281 | + */ |
---|
| 2282 | + if (pages_to_unuse && --pages_to_unuse == 0) |
---|
| 2283 | + goto out; |
---|
2244 | 2284 | } |
---|
2245 | 2285 | |
---|
2246 | | - mmput(start_mm); |
---|
2247 | | - return retval; |
---|
| 2286 | + /* |
---|
| 2287 | + * Let's check again to see if there are still swap entries in the map.
---|
| 2288 | + * If so, we need to retry the unuse logic.
---|
| 2289 | + * Under global memory pressure, swap entries can be reinserted back |
---|
| 2290 | + * into process space after the mmlist loop above passes over them. |
---|
| 2291 | + * |
---|
| 2292 | + * Limit the number of retries? No: when mmget_not_zero() above fails, |
---|
| 2293 | + * that mm is likely to be freeing swap from exit_mmap(), which proceeds |
---|
| 2294 | + * at its own independent pace; and even shmem_writepage() could have |
---|
| 2295 | + * been preempted after get_swap_page(), temporarily hiding that swap. |
---|
| 2296 | + * It's easy and robust (though cpu-intensive) just to keep retrying. |
---|
| 2297 | + */ |
---|
| 2298 | + if (READ_ONCE(si->inuse_pages)) { |
---|
| 2299 | + if (!signal_pending(current)) |
---|
| 2300 | + goto retry; |
---|
| 2301 | + retval = -EINTR; |
---|
| 2302 | + } |
---|
| 2303 | +out: |
---|
| 2304 | + return (retval == FRONTSWAP_PAGES_UNUSED) ? 0 : retval; |
---|
2248 | 2305 | } |
---|
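
For context on the loop above: it relies on a standard mmlist-walking idiom. The walker keeps a reference on the previously visited mm (prev_mm) so the list position it resumes from cannot disappear while mmlist_lock is dropped, and it skips mms that are already exiting via mmget_not_zero(). A minimal sketch of that idiom, with the per-mm unuse work elided and the helper name (walk_mmlist_sketch) invented for illustration, assuming the usual kernel-internal headers and types:

static void walk_mmlist_sketch(void)
{
	struct mm_struct *prev_mm = &init_mm;
	struct mm_struct *mm;
	struct list_head *p;

	mmget(prev_mm);				/* pin the starting point */
	spin_lock(&mmlist_lock);
	p = &init_mm.mmlist;
	while ((p = p->next) != &init_mm.mmlist) {
		mm = list_entry(p, struct mm_struct, mmlist);
		if (!mmget_not_zero(mm))	/* mm already exiting, skip it */
			continue;
		spin_unlock(&mmlist_lock);	/* safe: we hold a ref on mm */
		mmput(prev_mm);			/* now drop the previous pin */
		prev_mm = mm;

		/* ... per-mm work that may sleep goes here ... */

		spin_lock(&mmlist_lock);	/* mm is pinned, so still linked */
	}
	spin_unlock(&mmlist_lock);
	mmput(prev_mm);
}
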
2249 | 2306 | |
---|
2250 | 2307 | /* |
---|
.. | .. |
---|
2276 | 2333 | static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev) |
---|
2277 | 2334 | { |
---|
2278 | 2335 | struct swap_info_struct *sis; |
---|
2279 | | - struct swap_extent *start_se; |
---|
2280 | 2336 | struct swap_extent *se; |
---|
2281 | 2337 | pgoff_t offset; |
---|
2282 | 2338 | |
---|
.. | .. |
---|
2284 | 2340 | *bdev = sis->bdev; |
---|
2285 | 2341 | |
---|
2286 | 2342 | offset = swp_offset(entry); |
---|
2287 | | - start_se = sis->curr_swap_extent; |
---|
2288 | | - se = start_se; |
---|
2289 | | - |
---|
2290 | | - for ( ; ; ) { |
---|
2291 | | - if (se->start_page <= offset && |
---|
2292 | | - offset < (se->start_page + se->nr_pages)) { |
---|
2293 | | - return se->start_block + (offset - se->start_page); |
---|
2294 | | - } |
---|
2295 | | - se = list_next_entry(se, list); |
---|
2296 | | - sis->curr_swap_extent = se; |
---|
2297 | | - BUG_ON(se == start_se); /* It *must* be present */ |
---|
2298 | | - } |
---|
| 2343 | + se = offset_to_swap_extent(sis, offset); |
---|
| 2344 | + return se->start_block + (offset - se->start_page); |
---|
2299 | 2345 | } |
---|
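
map_swap_entry() now delegates the extent search to offset_to_swap_extent(), which is not part of this hunk. Purely as an illustration of what such a lookup over the new swap_extent_root rb-tree looks like (a hedged sketch with an invented name, not the function's actual body), it descends the tree comparing the page offset against each extent's [start_page, start_page + nr_pages) range:

static struct swap_extent *
offset_to_swap_extent_sketch(struct swap_info_struct *sis, unsigned long offset)
{
	struct rb_node *rb = sis->swap_extent_root.rb_node;
	struct swap_extent *se;

	while (rb) {
		se = rb_entry(rb, struct swap_extent, rb_node);
		if (offset < se->start_page)
			rb = rb->rb_left;
		else if (offset >= se->start_page + se->nr_pages)
			rb = rb->rb_right;
		else
			return se;	/* offset falls inside this extent */
	}
	return NULL;		/* every valid offset is covered, so this would be a bug */
}
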
2300 | 2346 | |
---|
2301 | 2347 | /* |
---|
.. | .. |
---|
2305 | 2351 | { |
---|
2306 | 2352 | swp_entry_t entry; |
---|
2307 | 2353 | entry.val = page_private(page); |
---|
2308 | | - return map_swap_entry(entry, bdev) << (PAGE_SHIFT - 9); |
---|
| 2354 | + return map_swap_entry(entry, bdev); |
---|
2309 | 2355 | } |
---|
2310 | 2356 | |
---|
2311 | 2357 | /* |
---|
.. | .. |
---|
2313 | 2359 | */ |
---|
2314 | 2360 | static void destroy_swap_extents(struct swap_info_struct *sis) |
---|
2315 | 2361 | { |
---|
2316 | | - while (!list_empty(&sis->first_swap_extent.list)) { |
---|
2317 | | - struct swap_extent *se; |
---|
| 2362 | + while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) { |
---|
| 2363 | + struct rb_node *rb = sis->swap_extent_root.rb_node; |
---|
| 2364 | + struct swap_extent *se = rb_entry(rb, struct swap_extent, rb_node); |
---|
2318 | 2365 | |
---|
2319 | | - se = list_first_entry(&sis->first_swap_extent.list, |
---|
2320 | | - struct swap_extent, list); |
---|
2321 | | - list_del(&se->list); |
---|
| 2366 | + rb_erase(rb, &sis->swap_extent_root); |
---|
2322 | 2367 | kfree(se); |
---|
2323 | 2368 | } |
---|
2324 | 2369 | |
---|
2325 | | - if (sis->flags & SWP_FILE) { |
---|
| 2370 | + if (sis->flags & SWP_ACTIVATED) { |
---|
2326 | 2371 | struct file *swap_file = sis->swap_file; |
---|
2327 | 2372 | struct address_space *mapping = swap_file->f_mapping; |
---|
2328 | 2373 | |
---|
2329 | | - sis->flags &= ~SWP_FILE; |
---|
2330 | | - mapping->a_ops->swap_deactivate(swap_file); |
---|
| 2374 | + sis->flags &= ~SWP_ACTIVATED; |
---|
| 2375 | + if (mapping->a_ops->swap_deactivate) |
---|
| 2376 | + mapping->a_ops->swap_deactivate(swap_file); |
---|
2331 | 2377 | } |
---|
2332 | 2378 | } |
---|
2333 | 2379 | |
---|
2334 | 2380 | /* |
---|
2335 | 2381 | * Add a block range (and the corresponding page range) into this swapdev's |
---|
2336 | | - * extent list. The extent list is kept sorted in page order. |
---|
| 2382 | + * extent tree. |
---|
2337 | 2383 | * |
---|
2338 | 2384 | * This function rather assumes that it is called in ascending page order. |
---|
2339 | 2385 | */ |
---|
.. | .. |
---|
2341 | 2387 | add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, |
---|
2342 | 2388 | unsigned long nr_pages, sector_t start_block) |
---|
2343 | 2389 | { |
---|
| 2390 | + struct rb_node **link = &sis->swap_extent_root.rb_node, *parent = NULL; |
---|
2344 | 2391 | struct swap_extent *se; |
---|
2345 | 2392 | struct swap_extent *new_se; |
---|
2346 | | - struct list_head *lh; |
---|
2347 | 2393 | |
---|
2348 | | - if (start_page == 0) { |
---|
2349 | | - se = &sis->first_swap_extent; |
---|
2350 | | - sis->curr_swap_extent = se; |
---|
2351 | | - se->start_page = 0; |
---|
2352 | | - se->nr_pages = nr_pages; |
---|
2353 | | - se->start_block = start_block; |
---|
2354 | | - return 1; |
---|
2355 | | - } else { |
---|
2356 | | - lh = sis->first_swap_extent.list.prev; /* Highest extent */ |
---|
2357 | | - se = list_entry(lh, struct swap_extent, list); |
---|
| 2394 | + /* |
---|
| 2395 | + * Place the new node at the rightmost position, since this
---|
| 2396 | + * function is called in ascending page order.
---|
| 2397 | + */ |
---|
| 2398 | + while (*link) { |
---|
| 2399 | + parent = *link; |
---|
| 2400 | + link = &parent->rb_right; |
---|
| 2401 | + } |
---|
| 2402 | + |
---|
| 2403 | + if (parent) { |
---|
| 2404 | + se = rb_entry(parent, struct swap_extent, rb_node); |
---|
2358 | 2405 | BUG_ON(se->start_page + se->nr_pages != start_page); |
---|
2359 | 2406 | if (se->start_block + se->nr_pages == start_block) { |
---|
2360 | 2407 | /* Merge it */ |
---|
.. | .. |
---|
2363 | 2410 | } |
---|
2364 | 2411 | } |
---|
2365 | 2412 | |
---|
2366 | | - /* |
---|
2367 | | - * No merge. Insert a new extent, preserving ordering. |
---|
2368 | | - */ |
---|
| 2413 | + /* No merge, insert a new extent. */ |
---|
2369 | 2414 | new_se = kmalloc(sizeof(*se), GFP_KERNEL); |
---|
2370 | 2415 | if (new_se == NULL) |
---|
2371 | 2416 | return -ENOMEM; |
---|
.. | .. |
---|
2373 | 2418 | new_se->nr_pages = nr_pages; |
---|
2374 | 2419 | new_se->start_block = start_block; |
---|
2375 | 2420 | |
---|
2376 | | - list_add_tail(&new_se->list, &sis->first_swap_extent.list); |
---|
| 2421 | + rb_link_node(&new_se->rb_node, parent, link); |
---|
| 2422 | + rb_insert_color(&new_se->rb_node, &sis->swap_extent_root); |
---|
2377 | 2423 | return 1; |
---|
2378 | 2424 | } |
---|
2379 | 2425 | EXPORT_SYMBOL_GPL(add_swap_extent); |
---|
.. | .. |
---|
2423 | 2469 | |
---|
2424 | 2470 | if (mapping->a_ops->swap_activate) { |
---|
2425 | 2471 | ret = mapping->a_ops->swap_activate(sis, swap_file, span); |
---|
| 2472 | + if (ret >= 0) |
---|
| 2473 | + sis->flags |= SWP_ACTIVATED; |
---|
2426 | 2474 | if (!ret) { |
---|
2427 | | - sis->flags |= SWP_FILE; |
---|
| 2475 | + sis->flags |= SWP_FS_OPS; |
---|
2428 | 2476 | ret = add_swap_extent(sis, 0, sis->max, 0); |
---|
2429 | 2477 | *span = sis->pages; |
---|
2430 | 2478 | } |
---|
.. | .. |
---|
2446 | 2494 | return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE; |
---|
2447 | 2495 | } |
---|
2448 | 2496 | |
---|
2449 | | -static void _enable_swap_info(struct swap_info_struct *p, int prio, |
---|
2450 | | - unsigned char *swap_map, |
---|
2451 | | - struct swap_cluster_info *cluster_info) |
---|
| 2497 | +static void setup_swap_info(struct swap_info_struct *p, int prio, |
---|
| 2498 | + unsigned char *swap_map, |
---|
| 2499 | + struct swap_cluster_info *cluster_info) |
---|
2452 | 2500 | { |
---|
2453 | 2501 | int i; |
---|
2454 | 2502 | |
---|
.. | .. |
---|
2473 | 2521 | } |
---|
2474 | 2522 | p->swap_map = swap_map; |
---|
2475 | 2523 | p->cluster_info = cluster_info; |
---|
2476 | | - p->flags |= SWP_WRITEOK; |
---|
2477 | | - atomic_long_add(p->pages, &nr_swap_pages); |
---|
2478 | | - total_swap_pages += p->pages; |
---|
| 2524 | +} |
---|
2479 | 2525 | |
---|
| 2526 | +static void _enable_swap_info(struct swap_info_struct *p) |
---|
| 2527 | +{ |
---|
| 2528 | + bool skip = false; |
---|
| 2529 | + |
---|
| 2530 | + p->flags |= SWP_WRITEOK | SWP_VALID; |
---|
| 2531 | + trace_android_vh_account_swap_pages(p, &skip); |
---|
| 2532 | + if (!skip) { |
---|
| 2533 | + atomic_long_add(p->pages, &nr_swap_pages); |
---|
| 2534 | + total_swap_pages += p->pages; |
---|
| 2535 | + } |
---|
2480 | 2536 | assert_spin_locked(&swap_lock); |
---|
2481 | 2537 | /* |
---|
2482 | 2538 | * both lists are plists, and thus priority ordered. |
---|
.. | .. |
---|
2500 | 2556 | frontswap_init(p->type, frontswap_map); |
---|
2501 | 2557 | spin_lock(&swap_lock); |
---|
2502 | 2558 | spin_lock(&p->lock); |
---|
2503 | | - _enable_swap_info(p, prio, swap_map, cluster_info); |
---|
| 2559 | + setup_swap_info(p, prio, swap_map, cluster_info); |
---|
| 2560 | + spin_unlock(&p->lock); |
---|
| 2561 | + spin_unlock(&swap_lock); |
---|
| 2562 | + /* |
---|
| 2563 | + * Guarantee swap_map, cluster_info, etc. fields are valid |
---|
| 2564 | + * between get/put_swap_device() if SWP_VALID bit is set |
---|
| 2565 | + */ |
---|
| 2566 | + synchronize_rcu(); |
---|
| 2567 | + spin_lock(&swap_lock); |
---|
| 2568 | + spin_lock(&p->lock); |
---|
| 2569 | + _enable_swap_info(p); |
---|
2504 | 2570 | spin_unlock(&p->lock); |
---|
2505 | 2571 | spin_unlock(&swap_lock); |
---|
2506 | 2572 | } |
---|
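
The reason enable_swap_info() is split into setup_swap_info() plus _enable_swap_info() with a synchronize_rcu() in between is the get/put_swap_device() protocol: readers may dereference swap_map, cluster_info and friends without locks as long as they observe SWP_VALID inside an RCU read-side section. A hedged sketch of that reader side (names carry a _sketch suffix because they are illustrative; the real helpers live with the swap code and differ in detail, e.g. in how lockless reads are annotated):

static struct swap_info_struct *get_swap_device_sketch(swp_entry_t entry)
{
	struct swap_info_struct *si;

	rcu_read_lock();
	si = swap_type_to_swap_info(swp_type(entry));
	if (!si || !(si->flags & SWP_VALID))
		goto bad;			/* device absent or being torn down */
	if (swp_offset(entry) >= si->max)
		goto bad;
	return si;				/* caller stays inside the RCU section */
bad:
	rcu_read_unlock();
	return NULL;
}

static void put_swap_device_sketch(struct swap_info_struct *si)
{
	rcu_read_unlock();
}

The matching writer-side ordering shows up below in swapoff: clear SWP_VALID under the locks, synchronize_rcu(), and only then tear the structures down.
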
.. | .. |
---|
2509 | 2575 | { |
---|
2510 | 2576 | spin_lock(&swap_lock); |
---|
2511 | 2577 | spin_lock(&p->lock); |
---|
2512 | | - _enable_swap_info(p, p->prio, p->swap_map, p->cluster_info); |
---|
| 2578 | + setup_swap_info(p, p->prio, p->swap_map, p->cluster_info); |
---|
| 2579 | + _enable_swap_info(p); |
---|
2513 | 2580 | spin_unlock(&p->lock); |
---|
2514 | 2581 | spin_unlock(&swap_lock); |
---|
2515 | 2582 | } |
---|
.. | .. |
---|
2537 | 2604 | struct filename *pathname; |
---|
2538 | 2605 | int err, found = 0; |
---|
2539 | 2606 | unsigned int old_block_size; |
---|
| 2607 | + bool skip = false; |
---|
2540 | 2608 | |
---|
2541 | 2609 | if (!capable(CAP_SYS_ADMIN)) |
---|
2542 | 2610 | return -EPERM; |
---|
.. | .. |
---|
2574 | 2642 | spin_unlock(&swap_lock); |
---|
2575 | 2643 | goto out_dput; |
---|
2576 | 2644 | } |
---|
2577 | | - del_from_avail_list(p); |
---|
2578 | 2645 | spin_lock(&p->lock); |
---|
| 2646 | + del_from_avail_list(p); |
---|
2579 | 2647 | if (p->prio < 0) { |
---|
2580 | 2648 | struct swap_info_struct *si = p; |
---|
2581 | 2649 | int nid; |
---|
.. | .. |
---|
2591 | 2659 | least_priority++; |
---|
2592 | 2660 | } |
---|
2593 | 2661 | plist_del(&p->list, &swap_active_head); |
---|
2594 | | - atomic_long_sub(p->pages, &nr_swap_pages); |
---|
2595 | | - total_swap_pages -= p->pages; |
---|
| 2662 | + trace_android_vh_account_swap_pages(p, &skip); |
---|
| 2663 | + if (!skip) { |
---|
| 2664 | + atomic_long_sub(p->pages, &nr_swap_pages); |
---|
| 2665 | + total_swap_pages -= p->pages; |
---|
| 2666 | + } |
---|
2596 | 2667 | p->flags &= ~SWP_WRITEOK; |
---|
2597 | 2668 | spin_unlock(&p->lock); |
---|
2598 | 2669 | spin_unlock(&swap_lock); |
---|
.. | .. |
---|
2611 | 2682 | } |
---|
2612 | 2683 | |
---|
2613 | 2684 | reenable_swap_slots_cache_unlock(); |
---|
| 2685 | + |
---|
| 2686 | + spin_lock(&swap_lock); |
---|
| 2687 | + spin_lock(&p->lock); |
---|
| 2688 | + p->flags &= ~SWP_VALID; /* mark swap device as invalid */ |
---|
| 2689 | + spin_unlock(&p->lock); |
---|
| 2690 | + spin_unlock(&swap_lock); |
---|
| 2691 | + /* |
---|
| 2692 | + * wait for swap operations protected by get/put_swap_device() |
---|
| 2693 | + * to complete |
---|
| 2694 | + */ |
---|
| 2695 | + synchronize_rcu(); |
---|
2614 | 2696 | |
---|
2615 | 2697 | flush_work(&p->discard_work); |
---|
2616 | 2698 | |
---|
.. | .. |
---|
2647 | 2729 | frontswap_map = frontswap_map_get(p); |
---|
2648 | 2730 | spin_unlock(&p->lock); |
---|
2649 | 2731 | spin_unlock(&swap_lock); |
---|
| 2732 | + arch_swap_invalidate_area(p->type); |
---|
2650 | 2733 | frontswap_invalidate_area(p->type); |
---|
2651 | 2734 | frontswap_map_set(p, NULL); |
---|
2652 | 2735 | mutex_unlock(&swapon_mutex); |
---|
2653 | 2736 | free_percpu(p->percpu_cluster); |
---|
2654 | 2737 | p->percpu_cluster = NULL; |
---|
| 2738 | + free_percpu(p->cluster_next_cpu); |
---|
| 2739 | + p->cluster_next_cpu = NULL; |
---|
2655 | 2740 | vfree(swap_map); |
---|
2656 | 2741 | kvfree(cluster_info); |
---|
2657 | 2742 | kvfree(frontswap_map); |
---|
.. | .. |
---|
2759 | 2844 | struct swap_info_struct *si = v; |
---|
2760 | 2845 | struct file *file; |
---|
2761 | 2846 | int len; |
---|
| 2847 | + unsigned int bytes, inuse; |
---|
2762 | 2848 | |
---|
2763 | 2849 | if (si == SEQ_START_TOKEN) { |
---|
2764 | | - seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); |
---|
| 2850 | + seq_puts(swap,"Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n"); |
---|
2765 | 2851 | return 0; |
---|
2766 | 2852 | } |
---|
2767 | 2853 | |
---|
| 2854 | + bytes = si->pages << (PAGE_SHIFT - 10); |
---|
| 2855 | + inuse = si->inuse_pages << (PAGE_SHIFT - 10); |
---|
| 2856 | + |
---|
2768 | 2857 | file = si->swap_file; |
---|
2769 | 2858 | len = seq_file_path(swap, file, " \t\n\\"); |
---|
2770 | | - seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", |
---|
| 2859 | + seq_printf(swap, "%*s%s\t%u\t%s%u\t%s%d\n", |
---|
2771 | 2860 | len < 40 ? 40 - len : 1, " ", |
---|
2772 | 2861 | S_ISBLK(file_inode(file)->i_mode) ? |
---|
2773 | 2862 | "partition" : "file\t", |
---|
2774 | | - si->pages << (PAGE_SHIFT - 10), |
---|
2775 | | - si->inuse_pages << (PAGE_SHIFT - 10), |
---|
| 2863 | + bytes, bytes < 10000000 ? "\t" : "", |
---|
| 2864 | + inuse, inuse < 10000000 ? "\t" : "", |
---|
2776 | 2865 | si->prio); |
---|
2777 | 2866 | return 0; |
---|
2778 | 2867 | } |
---|
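
With the wider columns, an entry whose size or usage is below eight digits gets an extra tab so values keep lining up under the header. A made-up example of how /proc/swaps might render after this change (hypothetical devices and numbers, whitespace approximated with spaces, purely for illustration):

Filename                                Type            Size            Used            Priority
/dev/vda2                               partition       8388604         5264            -2
/swapfile                               file            2097148         0               -3
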
.. | .. |
---|
2798 | 2887 | return 0; |
---|
2799 | 2888 | } |
---|
2800 | 2889 | |
---|
2801 | | -static const struct file_operations proc_swaps_operations = { |
---|
2802 | | - .open = swaps_open, |
---|
2803 | | - .read = seq_read, |
---|
2804 | | - .llseek = seq_lseek, |
---|
2805 | | - .release = seq_release, |
---|
2806 | | - .poll = swaps_poll, |
---|
| 2890 | +static const struct proc_ops swaps_proc_ops = { |
---|
| 2891 | + .proc_flags = PROC_ENTRY_PERMANENT, |
---|
| 2892 | + .proc_open = swaps_open, |
---|
| 2893 | + .proc_read = seq_read, |
---|
| 2894 | + .proc_lseek = seq_lseek, |
---|
| 2895 | + .proc_release = seq_release, |
---|
| 2896 | + .proc_poll = swaps_poll, |
---|
2807 | 2897 | }; |
---|
2808 | 2898 | |
---|
2809 | 2899 | static int __init procswaps_init(void) |
---|
2810 | 2900 | { |
---|
2811 | | - proc_create("swaps", 0, NULL, &proc_swaps_operations); |
---|
| 2901 | + proc_create("swaps", 0, NULL, &swaps_proc_ops); |
---|
2812 | 2902 | return 0; |
---|
2813 | 2903 | } |
---|
2814 | 2904 | __initcall(procswaps_init); |
---|
.. | .. |
---|
2825 | 2915 | |
---|
2826 | 2916 | static struct swap_info_struct *alloc_swap_info(void) |
---|
2827 | 2917 | { |
---|
2828 | | - struct swap_info_struct *p; |
---|
| 2918 | + struct swap_info_struct *p = NULL; |
---|
2829 | 2919 | struct swap_info_struct *defer = NULL; |
---|
2830 | 2920 | unsigned int type; |
---|
2831 | 2921 | int i; |
---|
2832 | | - int size = sizeof(*p) + nr_node_ids * sizeof(struct plist_node); |
---|
| 2922 | + bool skip = false; |
---|
2833 | 2923 | |
---|
2834 | | - p = kvzalloc(size, GFP_KERNEL); |
---|
| 2924 | + trace_android_rvh_alloc_si(&p, &skip); |
---|
| 2925 | + trace_android_vh_alloc_si(&p, &skip); |
---|
| 2926 | + if (!skip) |
---|
| 2927 | + p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL); |
---|
2835 | 2928 | if (!p) |
---|
2836 | 2929 | return ERR_PTR(-ENOMEM); |
---|
2837 | 2930 | |
---|
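
The allocation above now uses struct_size() from <linux/overflow.h> instead of an open-coded sizeof sum, because swap_info_struct ends in a per-node flexible array of plist nodes. struct_size(p, avail_lists, nr_node_ids) evaluates to sizeof(*p) + nr_node_ids * sizeof(p->avail_lists[0]), but saturates to SIZE_MAX on overflow so kvzalloc() fails cleanly rather than returning a short buffer. A minimal sketch of the idiom (the _sketch name is invented; p is only used inside sizeof, so it is never dereferenced while uninitialized):

#include <linux/overflow.h>

static struct swap_info_struct *alloc_si_sketch(void)
{
	struct swap_info_struct *p;

	/* overflow-checked: sizeof(*p) + nr_node_ids * sizeof(p->avail_lists[0]) */
	p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL);
	return p;	/* NULL on allocation failure (or on size overflow) */
}
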
.. | .. |
---|
2863 | 2956 | * would be relying on p->type to remain valid. |
---|
2864 | 2957 | */ |
---|
2865 | 2958 | } |
---|
2866 | | - INIT_LIST_HEAD(&p->first_swap_extent.list); |
---|
| 2959 | + p->swap_extent_root = RB_ROOT; |
---|
2867 | 2960 | plist_node_init(&p->list, 0); |
---|
2868 | 2961 | for_each_node(i) |
---|
2869 | 2962 | plist_node_init(&p->avail_lists[i], 0); |
---|
.. | .. |
---|
2881 | 2974 | int error; |
---|
2882 | 2975 | |
---|
2883 | 2976 | if (S_ISBLK(inode->i_mode)) { |
---|
2884 | | - p->bdev = bdgrab(I_BDEV(inode)); |
---|
2885 | | - error = blkdev_get(p->bdev, |
---|
| 2977 | + p->bdev = blkdev_get_by_dev(inode->i_rdev, |
---|
2886 | 2978 | FMODE_READ | FMODE_WRITE | FMODE_EXCL, p); |
---|
2887 | | - if (error < 0) { |
---|
| 2979 | + if (IS_ERR(p->bdev)) { |
---|
| 2980 | + error = PTR_ERR(p->bdev); |
---|
2888 | 2981 | p->bdev = NULL; |
---|
2889 | 2982 | return error; |
---|
2890 | 2983 | } |
---|
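
The open-coded bdgrab()+blkdev_get() pair is replaced by blkdev_get_by_dev(), which reports failure through the returned pointer rather than a separate error code. A small sketch of that ERR_PTR convention as used above (the _sketch helper is invented for illustration; the error value is encoded into the pointer and recovered with PTR_ERR()):

#include <linux/err.h>
#include <linux/blkdev.h>

static int claim_swap_bdev_sketch(struct swap_info_struct *p, dev_t dev)
{
	struct block_device *bdev;

	bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, p);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);	/* e.g. -EBUSY if someone else holds it */
	p->bdev = bdev;
	return 0;
}
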
.. | .. |
---|
2892 | 2985 | error = set_blocksize(p->bdev, PAGE_SIZE); |
---|
2893 | 2986 | if (error < 0) |
---|
2894 | 2987 | return error; |
---|
| 2988 | + /* |
---|
| 2989 | + * Zoned block devices contain zones that can only be written
---|
| 2990 | + * sequentially. Hence zoned block devices are not suitable
---|
| 2991 | + * for swapping. Disallow them here.
---|
| 2992 | + */ |
---|
| 2993 | + if (blk_queue_is_zoned(p->bdev->bd_disk->queue)) |
---|
| 2994 | + return -EINVAL; |
---|
2895 | 2995 | p->flags |= SWP_BLKDEV; |
---|
2896 | 2996 | } else if (S_ISREG(inode->i_mode)) { |
---|
2897 | 2997 | p->bdev = inode->i_sb->s_bdev; |
---|
.. | .. |
---|
3188 | 3288 | goto bad_swap_unlock_inode; |
---|
3189 | 3289 | } |
---|
3190 | 3290 | |
---|
3191 | | - if (bdi_cap_stable_pages_required(inode_to_bdi(inode))) |
---|
| 3291 | + if (p->bdev && blk_queue_stable_writes(p->bdev->bd_disk->queue)) |
---|
3192 | 3292 | p->flags |= SWP_STABLE_WRITES; |
---|
3193 | 3293 | |
---|
3194 | | - if (bdi_cap_synchronous_io(inode_to_bdi(inode))) |
---|
| 3294 | + if (p->bdev && p->bdev->bd_disk->fops->rw_page) |
---|
3195 | 3295 | p->flags |= SWP_SYNCHRONOUS_IO; |
---|
3196 | 3296 | |
---|
3197 | 3297 | if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) { |
---|
.. | .. |
---|
3199 | 3299 | unsigned long ci, nr_cluster; |
---|
3200 | 3300 | |
---|
3201 | 3301 | p->flags |= SWP_SOLIDSTATE; |
---|
| 3302 | + p->cluster_next_cpu = alloc_percpu(unsigned int); |
---|
| 3303 | + if (!p->cluster_next_cpu) { |
---|
| 3304 | + error = -ENOMEM; |
---|
| 3305 | + goto bad_swap_unlock_inode; |
---|
| 3306 | + } |
---|
3202 | 3307 | /* |
---|
3203 | 3308 | * select a random position to start with to help wear leveling |
---|
3204 | 3309 | * SSD |
---|
3205 | 3310 | */ |
---|
3206 | | - p->cluster_next = 1 + (prandom_u32() % p->highest_bit); |
---|
| 3311 | + for_each_possible_cpu(cpu) { |
---|
| 3312 | + per_cpu(*p->cluster_next_cpu, cpu) = |
---|
| 3313 | + 1 + prandom_u32_max(p->highest_bit); |
---|
| 3314 | + } |
---|
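
Each possible CPU now gets its own cluster_next hint, seeded at a random offset for SSD wear leveling. On the allocation side, the fast path would consult the hint of the local CPU roughly as below (a hedged sketch; the real logic in scan_swap_map_slots() has more cases, such as wrapping and writing the hint back after a successful allocation):

static unsigned long next_scan_base_sketch(struct swap_info_struct *si)
{
	unsigned long scan_base;

	if (si->flags & SWP_SOLIDSTATE)
		scan_base = this_cpu_read(*si->cluster_next_cpu);
	else
		scan_base = si->cluster_next;	/* rotational: single shared hint */
	return scan_base;
}
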
3207 | 3315 | nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); |
---|
3208 | 3316 | |
---|
3209 | 3317 | cluster_info = kvcalloc(nr_cluster, sizeof(*cluster_info), |
---|
.. | .. |
---|
3289 | 3397 | error = inode_drain_writes(inode); |
---|
3290 | 3398 | if (error) { |
---|
3291 | 3399 | inode->i_flags &= ~S_SWAPFILE; |
---|
3292 | | - goto bad_swap_unlock_inode; |
---|
| 3400 | + goto free_swap_address_space; |
---|
3293 | 3401 | } |
---|
3294 | 3402 | |
---|
3295 | 3403 | mutex_lock(&swapon_mutex); |
---|
.. | .. |
---|
3297 | 3405 | if (swap_flags & SWAP_FLAG_PREFER) |
---|
3298 | 3406 | prio = |
---|
3299 | 3407 | (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; |
---|
| 3408 | + |
---|
| 3409 | + trace_android_vh_swap_avail_heads_init(swap_avail_heads); |
---|
3300 | 3410 | enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map); |
---|
3301 | 3411 | |
---|
| 3412 | + trace_android_vh_init_swap_info_struct(p, swap_avail_heads); |
---|
3302 | 3413 | pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s%s\n", |
---|
3303 | 3414 | p->pages<<(PAGE_SHIFT-10), name->name, p->prio, |
---|
3304 | 3415 | nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), |
---|
.. | .. |
---|
3314 | 3425 | |
---|
3315 | 3426 | error = 0; |
---|
3316 | 3427 | goto out; |
---|
| 3428 | +free_swap_address_space: |
---|
| 3429 | + exit_swap_address_space(p->type); |
---|
3317 | 3430 | bad_swap_unlock_inode: |
---|
3318 | 3431 | inode_unlock(inode); |
---|
3319 | 3432 | bad_swap: |
---|
3320 | 3433 | free_percpu(p->percpu_cluster); |
---|
3321 | 3434 | p->percpu_cluster = NULL; |
---|
| 3435 | + free_percpu(p->cluster_next_cpu); |
---|
| 3436 | + p->cluster_next_cpu = NULL; |
---|
3322 | 3437 | if (inode && S_ISBLK(inode->i_mode) && p->bdev) { |
---|
3323 | 3438 | set_blocksize(p->bdev, p->old_block_size); |
---|
3324 | 3439 | blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); |
---|
.. | .. |
---|
3359 | 3474 | spin_lock(&swap_lock); |
---|
3360 | 3475 | for (type = 0; type < nr_swapfiles; type++) { |
---|
3361 | 3476 | struct swap_info_struct *si = swap_info[type]; |
---|
| 3477 | + bool skip = false; |
---|
3362 | 3478 | |
---|
3363 | | - if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) |
---|
| 3479 | + trace_android_vh_si_swapinfo(si, &skip); |
---|
| 3480 | + if (!skip && (si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) |
---|
3364 | 3481 | nr_to_be_unused += si->inuse_pages; |
---|
3365 | 3482 | } |
---|
3366 | 3483 | val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused; |
---|
3367 | 3484 | val->totalswap = total_swap_pages + nr_to_be_unused; |
---|
3368 | 3485 | spin_unlock(&swap_lock); |
---|
3369 | 3486 | } |
---|
| 3487 | +EXPORT_SYMBOL_GPL(si_swapinfo); |
---|
3370 | 3488 | |
---|
3371 | 3489 | /* |
---|
3372 | 3490 | * Verify that a swap entry is valid and increment its swap map count. |
---|
.. | .. |
---|
3388 | 3506 | unsigned char has_cache; |
---|
3389 | 3507 | int err = -EINVAL; |
---|
3390 | 3508 | |
---|
3391 | | - if (non_swap_entry(entry)) |
---|
3392 | | - goto out; |
---|
3393 | | - |
---|
3394 | | - p = swp_swap_info(entry); |
---|
| 3509 | + p = get_swap_device(entry); |
---|
3395 | 3510 | if (!p) |
---|
3396 | | - goto bad_file; |
---|
| 3511 | + goto out; |
---|
3397 | 3512 | |
---|
3398 | 3513 | offset = swp_offset(entry); |
---|
3399 | | - if (unlikely(offset >= p->max)) |
---|
3400 | | - goto out; |
---|
3401 | | - |
---|
3402 | 3514 | ci = lock_cluster_or_swap_info(p, offset); |
---|
3403 | 3515 | |
---|
3404 | 3516 | count = p->swap_map[offset]; |
---|
.. | .. |
---|
3439 | 3551 | } else |
---|
3440 | 3552 | err = -ENOENT; /* unused swap entry */ |
---|
3441 | 3553 | |
---|
3442 | | - p->swap_map[offset] = count | has_cache; |
---|
| 3554 | + WRITE_ONCE(p->swap_map[offset], count | has_cache); |
---|
3443 | 3555 | |
---|
3444 | 3556 | unlock_out: |
---|
3445 | 3557 | unlock_cluster_or_swap_info(p, ci); |
---|
3446 | 3558 | out: |
---|
| 3559 | + if (p) |
---|
| 3560 | + put_swap_device(p); |
---|
3447 | 3561 | return err; |
---|
3448 | | - |
---|
3449 | | -bad_file: |
---|
3450 | | - pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val); |
---|
3451 | | - goto out; |
---|
3452 | 3562 | } |
---|
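
The plain store to the swap_map slot becomes WRITE_ONCE() so it pairs with lockless READ_ONCE() readers elsewhere in the swap code, ruling out torn or fused accesses by the compiler. A generic illustration of the pairing (not swap-specific; names invented):

#include <linux/compiler.h>

static void publish_count_sketch(unsigned char *slot, unsigned char val)
{
	/* writer, under the appropriate lock: one untorn byte store */
	WRITE_ONCE(*slot, val);
}

static unsigned char snapshot_count_sketch(unsigned char *slot)
{
	/* lockless reader: one untorn byte load */
	return READ_ONCE(*slot);
}
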
3453 | 3563 | |
---|
3454 | 3564 | /* |
---|
.. | .. |
---|
3481 | 3591 | * |
---|
3482 | 3592 | * Called when allocating swap cache for existing swap entry, |
---|
3483 | 3593 | * This can return error codes. Returns 0 at success. |
---|
3484 | | - * -EBUSY means there is a swap cache. |
---|
| 3594 | + * -EEXIST means there is a swap cache. |
---|
3485 | 3595 | * Note: return code is different from swap_duplicate(). |
---|
3486 | 3596 | */ |
---|
3487 | 3597 | int swapcache_prepare(swp_entry_t entry) |
---|
.. | .. |
---|
3493 | 3603 | { |
---|
3494 | 3604 | return swap_type_to_swap_info(swp_type(entry)); |
---|
3495 | 3605 | } |
---|
| 3606 | +EXPORT_SYMBOL_GPL(swp_swap_info); |
---|
3496 | 3607 | |
---|
3497 | 3608 | struct swap_info_struct *page_swap_info(struct page *page) |
---|
3498 | 3609 | { |
---|
.. | .. |
---|
3540 | 3651 | struct page *list_page; |
---|
3541 | 3652 | pgoff_t offset; |
---|
3542 | 3653 | unsigned char count; |
---|
| 3654 | + int ret = 0; |
---|
3543 | 3655 | |
---|
3544 | 3656 | /* |
---|
3545 | 3657 | * When debugging, it's easier to use __GFP_ZERO here; but it's better |
---|
.. | .. |
---|
3547 | 3659 | */ |
---|
3548 | 3660 | page = alloc_page(gfp_mask | __GFP_HIGHMEM); |
---|
3549 | 3661 | |
---|
3550 | | - si = swap_info_get(entry); |
---|
| 3662 | + si = get_swap_device(entry); |
---|
3551 | 3663 | if (!si) { |
---|
3552 | 3664 | /* |
---|
3553 | 3665 | * An acceptable race has occurred since the failing |
---|
3554 | | - * __swap_duplicate(): the swap entry has been freed, |
---|
3555 | | - * perhaps even the whole swap_map cleared for swapoff. |
---|
| 3666 | + * __swap_duplicate(): the swap device may be swapoff |
---|
3556 | 3667 | */ |
---|
3557 | 3668 | goto outer; |
---|
3558 | 3669 | } |
---|
| 3670 | + spin_lock(&si->lock); |
---|
3559 | 3671 | |
---|
3560 | 3672 | offset = swp_offset(entry); |
---|
3561 | 3673 | |
---|
.. | .. |
---|
3573 | 3685 | } |
---|
3574 | 3686 | |
---|
3575 | 3687 | if (!page) { |
---|
3576 | | - unlock_cluster(ci); |
---|
3577 | | - spin_unlock(&si->lock); |
---|
3578 | | - return -ENOMEM; |
---|
| 3688 | + ret = -ENOMEM; |
---|
| 3689 | + goto out; |
---|
3579 | 3690 | } |
---|
3580 | 3691 | |
---|
3581 | 3692 | /* |
---|
.. | .. |
---|
3627 | 3738 | out: |
---|
3628 | 3739 | unlock_cluster(ci); |
---|
3629 | 3740 | spin_unlock(&si->lock); |
---|
| 3741 | + put_swap_device(si); |
---|
3630 | 3742 | outer: |
---|
3631 | 3743 | if (page) |
---|
3632 | 3744 | __free_page(page); |
---|
3633 | | - return 0; |
---|
| 3745 | + return ret; |
---|
3634 | 3746 | } |
---|
3635 | 3747 | |
---|
3636 | 3748 | /* |
---|
.. | .. |
---|
3658 | 3770 | |
---|
3659 | 3771 | spin_lock(&si->cont_lock); |
---|
3660 | 3772 | offset &= ~PAGE_MASK; |
---|
3661 | | - page = list_entry(head->lru.next, struct page, lru); |
---|
| 3773 | + page = list_next_entry(head, lru); |
---|
3662 | 3774 | map = kmap_atomic(page) + offset; |
---|
3663 | 3775 | |
---|
3664 | 3776 | if (count == SWAP_MAP_MAX) /* initial increment from swap_map */ |
---|
.. | .. |
---|
3670 | 3782 | */ |
---|
3671 | 3783 | while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) { |
---|
3672 | 3784 | kunmap_atomic(map); |
---|
3673 | | - page = list_entry(page->lru.next, struct page, lru); |
---|
| 3785 | + page = list_next_entry(page, lru); |
---|
3674 | 3786 | BUG_ON(page == head); |
---|
3675 | 3787 | map = kmap_atomic(page) + offset; |
---|
3676 | 3788 | } |
---|
3677 | 3789 | if (*map == SWAP_CONT_MAX) { |
---|
3678 | 3790 | kunmap_atomic(map); |
---|
3679 | | - page = list_entry(page->lru.next, struct page, lru); |
---|
| 3791 | + page = list_next_entry(page, lru); |
---|
3680 | 3792 | if (page == head) { |
---|
3681 | 3793 | ret = false; /* add count continuation */ |
---|
3682 | 3794 | goto out; |
---|
.. | .. |
---|
3686 | 3798 | } |
---|
3687 | 3799 | *map += 1; |
---|
3688 | 3800 | kunmap_atomic(map); |
---|
3689 | | - page = list_entry(page->lru.prev, struct page, lru); |
---|
3690 | | - while (page != head) { |
---|
| 3801 | + while ((page = list_prev_entry(page, lru)) != head) { |
---|
3691 | 3802 | map = kmap_atomic(page) + offset; |
---|
3692 | 3803 | *map = COUNT_CONTINUED; |
---|
3693 | 3804 | kunmap_atomic(map); |
---|
3694 | | - page = list_entry(page->lru.prev, struct page, lru); |
---|
3695 | 3805 | } |
---|
3696 | 3806 | ret = true; /* incremented */ |
---|
3697 | 3807 | |
---|
.. | .. |
---|
3702 | 3812 | BUG_ON(count != COUNT_CONTINUED); |
---|
3703 | 3813 | while (*map == COUNT_CONTINUED) { |
---|
3704 | 3814 | kunmap_atomic(map); |
---|
3705 | | - page = list_entry(page->lru.next, struct page, lru); |
---|
| 3815 | + page = list_next_entry(page, lru); |
---|
3706 | 3816 | BUG_ON(page == head); |
---|
3707 | 3817 | map = kmap_atomic(page) + offset; |
---|
3708 | 3818 | } |
---|
.. | .. |
---|
3711 | 3821 | if (*map == 0) |
---|
3712 | 3822 | count = 0; |
---|
3713 | 3823 | kunmap_atomic(map); |
---|
3714 | | - page = list_entry(page->lru.prev, struct page, lru); |
---|
3715 | | - while (page != head) { |
---|
| 3824 | + while ((page = list_prev_entry(page, lru)) != head) { |
---|
3716 | 3825 | map = kmap_atomic(page) + offset; |
---|
3717 | 3826 | *map = SWAP_CONT_MAX | count; |
---|
3718 | 3827 | count = COUNT_CONTINUED; |
---|
3719 | 3828 | kunmap_atomic(map); |
---|
3720 | | - page = list_entry(page->lru.prev, struct page, lru); |
---|
3721 | 3829 | } |
---|
3722 | 3830 | ret = count == COUNT_CONTINUED; |
---|
3723 | 3831 | } |
---|
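
The open-coded list_entry(page->lru.next, struct page, lru) and ->prev hops above are replaced by the generic helpers, which <linux/list.h> defines essentially as:

#define list_next_entry(pos, member) \
	list_entry((pos)->member.next, typeof(*(pos)), member)

#define list_prev_entry(pos, member) \
	list_entry((pos)->member.prev, typeof(*(pos)), member)

The while ((page = list_prev_entry(page, lru)) != head) form also lets each loop drop the separate step-back statement at the bottom of the iteration.
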
.. | .. |
---|
3749 | 3857 | } |
---|
3750 | 3858 | |
---|
3751 | 3859 | #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) |
---|
3752 | | -void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node, |
---|
3753 | | - gfp_t gfp_mask) |
---|
| 3860 | +void __cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) |
---|
3754 | 3861 | { |
---|
3755 | 3862 | struct swap_info_struct *si, *next; |
---|
3756 | | - if (!(gfp_mask & __GFP_IO) || !memcg) |
---|
| 3863 | + int nid = page_to_nid(page); |
---|
| 3864 | + bool skip = false; |
---|
| 3865 | + |
---|
| 3866 | + if (!(gfp_mask & __GFP_IO)) |
---|
3757 | 3867 | return; |
---|
3758 | 3868 | |
---|
3759 | 3869 | if (!blk_cgroup_congested()) |
---|
.. | .. |
---|
3766 | 3876 | if (current->throttle_queue) |
---|
3767 | 3877 | return; |
---|
3768 | 3878 | |
---|
| 3879 | + trace_android_vh___cgroup_throttle_swaprate(nid, &skip); |
---|
| 3880 | + if (skip) |
---|
| 3881 | + return; |
---|
| 3882 | + |
---|
3769 | 3883 | spin_lock(&swap_avail_lock); |
---|
3770 | | - plist_for_each_entry_safe(si, next, &swap_avail_heads[node], |
---|
3771 | | - avail_lists[node]) { |
---|
| 3884 | + plist_for_each_entry_safe(si, next, &swap_avail_heads[nid], |
---|
| 3885 | + avail_lists[nid]) { |
---|
3772 | 3886 | if (si->bdev) { |
---|
3773 | | - blkcg_schedule_throttle(bdev_get_queue(si->bdev), |
---|
3774 | | - true); |
---|
| 3887 | + blkcg_schedule_throttle(bdev_get_queue(si->bdev), true); |
---|
3775 | 3888 | break; |
---|
3776 | 3889 | } |
---|
3777 | 3890 | } |
---|