| .. | .. |
|---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-only |
|---|
| 1 | 2 | /* |
|---|
| 2 | 3 | * linux/mm/swapfile.c |
|---|
| 3 | 4 | * |
|---|
| .. | .. |
|---|
| 39 | 40 | #include <linux/swap_slots.h> |
|---|
| 40 | 41 | #include <linux/sort.h> |
|---|
| 41 | 42 | |
|---|
| 42 | | -#include <asm/pgtable.h> |
|---|
| 43 | 43 | #include <asm/tlbflush.h> |
|---|
| 44 | 44 | #include <linux/swapops.h> |
|---|
| 45 | 45 | #include <linux/swap_cgroup.h> |
|---|
| 46 | +#include <trace/hooks/mm.h> |
|---|
| 46 | 47 | |
|---|
| 47 | 48 | static bool swap_count_continued(struct swap_info_struct *, pgoff_t, |
|---|
| 48 | 49 | unsigned char); |
|---|
| .. | .. |
|---|
| 98 | 99 | |
|---|
| 99 | 100 | atomic_t nr_rotate_swap = ATOMIC_INIT(0); |
|---|
| 100 | 101 | |
|---|
| 101 | | -static struct swap_info_struct *swap_type_to_swap_info(int type) |
|---|
| 102 | +struct swap_info_struct *swap_type_to_swap_info(int type) |
|---|
| 102 | 103 | { |
|---|
| 103 | 104 | if (type >= READ_ONCE(nr_swapfiles)) |
|---|
| 104 | 105 | return NULL; |
|---|
| .. | .. |
|---|
| 106 | 107 | smp_rmb(); /* Pairs with smp_wmb in alloc_swap_info. */ |
|---|
| 107 | 108 | return READ_ONCE(swap_info[type]); |
|---|
| 108 | 109 | } |
|---|
| 110 | +EXPORT_SYMBOL_GPL(swap_type_to_swap_info); |
|---|
| 109 | 111 | |
|---|
| 110 | 112 | static inline unsigned char swap_count(unsigned char ent) |
|---|
| 111 | 113 | { |
|---|
| 112 | 114 | return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */ |
|---|
| 113 | 115 | } |
|---|
| 114 | 116 | |
|---|
| 117 | +/* Reclaim the swap entry anyway if possible */ |
|---|
| 118 | +#define TTRS_ANYWAY 0x1 |
|---|
| 119 | +/* |
|---|
| 120 | + * Reclaim the swap entry if there are no more mappings of the |
|---|
| 121 | + * corresponding page |
|---|
| 122 | + */ |
|---|
| 123 | +#define TTRS_UNMAPPED 0x2 |
|---|
| 124 | +/* Reclaim the swap entry if swap is getting full*/ |
|---|
| 125 | +#define TTRS_FULL 0x4 |
|---|
| 126 | + |
|---|
| 115 | 127 | /* returns 1 if swap entry is freed */ |
|---|
| 116 | | -static int |
|---|
| 117 | | -__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) |
|---|
| 128 | +static int __try_to_reclaim_swap(struct swap_info_struct *si, |
|---|
| 129 | + unsigned long offset, unsigned long flags) |
|---|
| 118 | 130 | { |
|---|
| 119 | 131 | swp_entry_t entry = swp_entry(si->type, offset); |
|---|
| 120 | 132 | struct page *page; |
|---|
| 121 | 133 | int ret = 0; |
|---|
| 122 | 134 | |
|---|
| 123 | | - page = find_get_page(swap_address_space(entry), swp_offset(entry)); |
|---|
| 135 | + page = find_get_page(swap_address_space(entry), offset); |
|---|
| 124 | 136 | if (!page) |
|---|
| 125 | 137 | return 0; |
|---|
| 126 | 138 | /* |
|---|
| 127 | | - * This function is called from scan_swap_map() and it's called |
|---|
| 128 | | - * by vmscan.c at reclaiming pages. So, we hold a lock on a page, here. |
|---|
| 129 | | - * We have to use trylock for avoiding deadlock. This is a special |
|---|
| 139 | + * When this function is called from scan_swap_map_slots() and it's |
|---|
| 140 | + * called by vmscan.c at reclaiming pages. So, we hold a lock on a page, |
|---|
| 141 | + * here. We have to use trylock for avoiding deadlock. This is a special |
|---|
| 130 | 142 | * case and you should use try_to_free_swap() with explicit lock_page() |
|---|
| 131 | 143 | * in usual operations. |
|---|
| 132 | 144 | */ |
|---|
| 133 | 145 | if (trylock_page(page)) { |
|---|
| 134 | | - ret = try_to_free_swap(page); |
|---|
| 146 | + if ((flags & TTRS_ANYWAY) || |
|---|
| 147 | + ((flags & TTRS_UNMAPPED) && !page_mapped(page)) || |
|---|
| 148 | + ((flags & TTRS_FULL) && mem_cgroup_swap_full(page))) |
|---|
| 149 | + ret = try_to_free_swap(page); |
|---|
| 135 | 150 | unlock_page(page); |
|---|
| 136 | 151 | } |
|---|
| 137 | 152 | put_page(page); |
|---|
| 138 | 153 | return ret; |
|---|
| 154 | +} |
|---|
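
Editor's note: the new `TTRS_*` flags turn `__try_to_reclaim_swap()` into a policy-driven helper; the three bits gate the same decision that the `trylock_page()` branch above makes. Below is a minimal userspace model of that gating (hypothetical `should_reclaim()` name, not kernel code), kept as a sketch rather than an implementation.

```c
/* Minimal userspace model of the TTRS_* gating logic (not kernel code). */
#include <stdbool.h>
#include <stdio.h>

#define TTRS_ANYWAY   0x1  /* reclaim unconditionally */
#define TTRS_UNMAPPED 0x2  /* reclaim only if the page has no remaining mappings */
#define TTRS_FULL     0x4  /* reclaim only if swap for this cgroup is getting full */

/* Mirrors the condition guarding try_to_free_swap() in the hunk above. */
static bool should_reclaim(unsigned long flags, bool page_mapped, bool swap_full)
{
	return (flags & TTRS_ANYWAY) ||
	       ((flags & TTRS_UNMAPPED) && !page_mapped) ||
	       ((flags & TTRS_FULL) && swap_full);
}

int main(void)
{
	/* free_swap_and_cache() passes TTRS_UNMAPPED | TTRS_FULL later in this diff. */
	printf("%d\n", should_reclaim(TTRS_UNMAPPED | TTRS_FULL, false, false)); /* 1 */
	printf("%d\n", should_reclaim(TTRS_UNMAPPED | TTRS_FULL, true, false));  /* 0 */
	return 0;
}
```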
| 155 | + |
|---|
| 156 | +static inline struct swap_extent *first_se(struct swap_info_struct *sis) |
|---|
| 157 | +{ |
|---|
| 158 | + struct rb_node *rb = rb_first(&sis->swap_extent_root); |
|---|
| 159 | + return rb_entry(rb, struct swap_extent, rb_node); |
|---|
| 160 | +} |
|---|
| 161 | + |
|---|
| 162 | +static inline struct swap_extent *next_se(struct swap_extent *se) |
|---|
| 163 | +{ |
|---|
| 164 | + struct rb_node *rb = rb_next(&se->rb_node); |
|---|
| 165 | + return rb ? rb_entry(rb, struct swap_extent, rb_node) : NULL; |
|---|
| 139 | 166 | } |
|---|
| 140 | 167 | |
|---|
| 141 | 168 | /* |
|---|
| .. | .. |
|---|
| 150 | 177 | int err = 0; |
|---|
| 151 | 178 | |
|---|
| 152 | 179 | /* Do not discard the swap header page! */ |
|---|
| 153 | | - se = &si->first_swap_extent; |
|---|
| 180 | + se = first_se(si); |
|---|
| 154 | 181 | start_block = (se->start_block + 1) << (PAGE_SHIFT - 9); |
|---|
| 155 | 182 | nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); |
|---|
| 156 | 183 | if (nr_blocks) { |
|---|
| .. | .. |
|---|
| 161 | 188 | cond_resched(); |
|---|
| 162 | 189 | } |
|---|
| 163 | 190 | |
|---|
| 164 | | - list_for_each_entry(se, &si->first_swap_extent.list, list) { |
|---|
| 191 | + for (se = next_se(se); se; se = next_se(se)) { |
|---|
| 165 | 192 | start_block = se->start_block << (PAGE_SHIFT - 9); |
|---|
| 166 | 193 | nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); |
|---|
| 167 | 194 | |
|---|
| .. | .. |
|---|
| 175 | 202 | return err; /* That will often be -EOPNOTSUPP */ |
|---|
| 176 | 203 | } |
|---|
| 177 | 204 | |
|---|
| 205 | +static struct swap_extent * |
|---|
| 206 | +offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset) |
|---|
| 207 | +{ |
|---|
| 208 | + struct swap_extent *se; |
|---|
| 209 | + struct rb_node *rb; |
|---|
| 210 | + |
|---|
| 211 | + rb = sis->swap_extent_root.rb_node; |
|---|
| 212 | + while (rb) { |
|---|
| 213 | + se = rb_entry(rb, struct swap_extent, rb_node); |
|---|
| 214 | + if (offset < se->start_page) |
|---|
| 215 | + rb = rb->rb_left; |
|---|
| 216 | + else if (offset >= se->start_page + se->nr_pages) |
|---|
| 217 | + rb = rb->rb_right; |
|---|
| 218 | + else |
|---|
| 219 | + return se; |
|---|
| 220 | + } |
|---|
| 221 | + /* It *must* be present */ |
|---|
| 222 | + BUG(); |
|---|
| 223 | +} |
|---|
| 224 | + |
|---|
| 225 | +sector_t swap_page_sector(struct page *page) |
|---|
| 226 | +{ |
|---|
| 227 | + struct swap_info_struct *sis = page_swap_info(page); |
|---|
| 228 | + struct swap_extent *se; |
|---|
| 229 | + sector_t sector; |
|---|
| 230 | + pgoff_t offset; |
|---|
| 231 | + |
|---|
| 232 | + offset = __page_file_index(page); |
|---|
| 233 | + se = offset_to_swap_extent(sis, offset); |
|---|
| 234 | + sector = se->start_block + (offset - se->start_page); |
|---|
| 235 | + return sector << (PAGE_SHIFT - 9); |
|---|
| 236 | +} |
|---|
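
Editor's note: `swap_page_sector()` maps a swap page offset to a 512-byte sector through the extent that covers it. The standalone sketch below reproduces only that arithmetic; the struct and helper names are illustrative, and 4 KiB pages are assumed.

```c
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12 /* assumption: 4 KiB pages */

struct extent { uint64_t start_page, nr_pages, start_block; };

/* Same arithmetic as swap_page_sector(): offset into the extent, then pages -> 512B sectors. */
static uint64_t page_offset_to_sector(const struct extent *se, uint64_t offset)
{
	uint64_t sector = se->start_block + (offset - se->start_page);
	return sector << (PAGE_SHIFT - 9);
}

int main(void)
{
	struct extent se = { .start_page = 100, .nr_pages = 50, .start_block = 2048 };
	/* swap offset 103 lands 3 pages into the extent: block 2051, i.e. sector 2051 * 8. */
	printf("%llu\n", (unsigned long long)page_offset_to_sector(&se, 103)); /* 16408 */
	return 0;
}
```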
| 237 | + |
|---|
| 178 | 238 | /* |
|---|
| 179 | 239 | * swap allocation tell device that a cluster of swap can now be discarded, |
|---|
| 180 | 240 | * to allow the swap device to optimize its wear-levelling. |
|---|
| .. | .. |
|---|
| 182 | 242 | static void discard_swap_cluster(struct swap_info_struct *si, |
|---|
| 183 | 243 | pgoff_t start_page, pgoff_t nr_pages) |
|---|
| 184 | 244 | { |
|---|
| 185 | | - struct swap_extent *se = si->curr_swap_extent; |
|---|
| 186 | | - int found_extent = 0; |
|---|
| 245 | + struct swap_extent *se = offset_to_swap_extent(si, start_page); |
|---|
| 187 | 246 | |
|---|
| 188 | 247 | while (nr_pages) { |
|---|
| 189 | | - if (se->start_page <= start_page && |
|---|
| 190 | | - start_page < se->start_page + se->nr_pages) { |
|---|
| 191 | | - pgoff_t offset = start_page - se->start_page; |
|---|
| 192 | | - sector_t start_block = se->start_block + offset; |
|---|
| 193 | | - sector_t nr_blocks = se->nr_pages - offset; |
|---|
| 248 | + pgoff_t offset = start_page - se->start_page; |
|---|
| 249 | + sector_t start_block = se->start_block + offset; |
|---|
| 250 | + sector_t nr_blocks = se->nr_pages - offset; |
|---|
| 194 | 251 | |
|---|
| 195 | | - if (nr_blocks > nr_pages) |
|---|
| 196 | | - nr_blocks = nr_pages; |
|---|
| 197 | | - start_page += nr_blocks; |
|---|
| 198 | | - nr_pages -= nr_blocks; |
|---|
| 252 | + if (nr_blocks > nr_pages) |
|---|
| 253 | + nr_blocks = nr_pages; |
|---|
| 254 | + start_page += nr_blocks; |
|---|
| 255 | + nr_pages -= nr_blocks; |
|---|
| 199 | 256 | |
|---|
| 200 | | - if (!found_extent++) |
|---|
| 201 | | - si->curr_swap_extent = se; |
|---|
| 257 | + start_block <<= PAGE_SHIFT - 9; |
|---|
| 258 | + nr_blocks <<= PAGE_SHIFT - 9; |
|---|
| 259 | + if (blkdev_issue_discard(si->bdev, start_block, |
|---|
| 260 | + nr_blocks, GFP_NOIO, 0)) |
|---|
| 261 | + break; |
|---|
| 202 | 262 | |
|---|
| 203 | | - start_block <<= PAGE_SHIFT - 9; |
|---|
| 204 | | - nr_blocks <<= PAGE_SHIFT - 9; |
|---|
| 205 | | - if (blkdev_issue_discard(si->bdev, start_block, |
|---|
| 206 | | - nr_blocks, GFP_NOIO, 0)) |
|---|
| 207 | | - break; |
|---|
| 208 | | - } |
|---|
| 209 | | - |
|---|
| 210 | | - se = list_next_entry(se, list); |
|---|
| 263 | + se = next_se(se); |
|---|
| 211 | 264 | } |
|---|
| 212 | 265 | } |
|---|
| 213 | 266 | |
|---|
| .. | .. |
|---|
| 562 | 615 | { |
|---|
| 563 | 616 | struct percpu_cluster *cluster; |
|---|
| 564 | 617 | struct swap_cluster_info *ci; |
|---|
| 565 | | - bool found_free; |
|---|
| 566 | 618 | unsigned long tmp, max; |
|---|
| 567 | 619 | |
|---|
| 568 | 620 | new_cluster: |
|---|
| .. | .. |
|---|
| 575 | 627 | } else if (!cluster_list_empty(&si->discard_clusters)) { |
|---|
| 576 | 628 | /* |
|---|
| 577 | 629 | * we don't have free cluster but have some clusters in |
|---|
| 578 | | - * discarding, do discard now and reclaim them |
|---|
| 630 | + * discarding, do discard now and reclaim them, then |
|---|
| 631 | + * reread cluster_next_cpu since we dropped si->lock |
|---|
| 579 | 632 | */ |
|---|
| 580 | 633 | swap_do_scheduled_discard(si); |
|---|
| 581 | | - *scan_base = *offset = si->cluster_next; |
|---|
| 634 | + *scan_base = this_cpu_read(*si->cluster_next_cpu); |
|---|
| 635 | + *offset = *scan_base; |
|---|
| 582 | 636 | goto new_cluster; |
|---|
| 583 | 637 | } else |
|---|
| 584 | 638 | return false; |
|---|
| 585 | 639 | } |
|---|
| 586 | | - |
|---|
| 587 | | - found_free = false; |
|---|
| 588 | 640 | |
|---|
| 589 | 641 | /* |
|---|
| 590 | 642 | * Other CPUs can use our cluster if they can't find a free cluster, |
|---|
| .. | .. |
|---|
| 593 | 645 | tmp = cluster->next; |
|---|
| 594 | 646 | max = min_t(unsigned long, si->max, |
|---|
| 595 | 647 | (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER); |
|---|
| 596 | | - if (tmp >= max) { |
|---|
| 597 | | - cluster_set_null(&cluster->index); |
|---|
| 598 | | - goto new_cluster; |
|---|
| 599 | | - } |
|---|
| 600 | | - ci = lock_cluster(si, tmp); |
|---|
| 601 | | - while (tmp < max) { |
|---|
| 602 | | - if (!si->swap_map[tmp]) { |
|---|
| 603 | | - found_free = true; |
|---|
| 604 | | - break; |
|---|
| 648 | + if (tmp < max) { |
|---|
| 649 | + ci = lock_cluster(si, tmp); |
|---|
| 650 | + while (tmp < max) { |
|---|
| 651 | + if (!si->swap_map[tmp]) |
|---|
| 652 | + break; |
|---|
| 653 | + tmp++; |
|---|
| 605 | 654 | } |
|---|
| 606 | | - tmp++; |
|---|
| 655 | + unlock_cluster(ci); |
|---|
| 607 | 656 | } |
|---|
| 608 | | - unlock_cluster(ci); |
|---|
| 609 | | - if (!found_free) { |
|---|
| 657 | + if (tmp >= max) { |
|---|
| 610 | 658 | cluster_set_null(&cluster->index); |
|---|
| 611 | 659 | goto new_cluster; |
|---|
| 612 | 660 | } |
|---|
| 613 | 661 | cluster->next = tmp + 1; |
|---|
| 614 | 662 | *offset = tmp; |
|---|
| 615 | 663 | *scan_base = tmp; |
|---|
| 616 | | - return found_free; |
|---|
| 664 | + return true; |
|---|
| 617 | 665 | } |
|---|
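
Editor's note: the reworked tail of `scan_swap_map_try_ssd_cluster()` scans the per-CPU window `[tmp, max)` under the cluster lock and only falls back to `cluster_set_null()` + `new_cluster` once the window is exhausted. The sketch below models just that scan-or-retry shape with plain arrays (illustrative names, no locking).

```c
#include <stdbool.h>
#include <stdio.h>

/* Model of the window scan: stop at the first free slot in [tmp, max);
 * returning false corresponds to the "null the cluster, goto new_cluster" path. */
static bool scan_window(const unsigned char *map, unsigned long *tmp, unsigned long max)
{
	while (*tmp < max && map[*tmp])
		(*tmp)++;
	return *tmp < max;
}

int main(void)
{
	unsigned char map[8] = { 1, 1, 0, 1, 1, 1, 1, 1 };
	unsigned long tmp = 0;

	if (scan_window(map, &tmp, 8))
		printf("free slot at offset %lu\n", tmp);       /* offset 2 */
	tmp = 3;
	if (!scan_window(map, &tmp, 8))
		printf("window exhausted, need a new cluster\n");
	return 0;
}
```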
| 618 | 666 | |
|---|
| 619 | 667 | static void __del_from_avail_list(struct swap_info_struct *p) |
|---|
| 620 | 668 | { |
|---|
| 621 | 669 | int nid; |
|---|
| 622 | 670 | |
|---|
| 671 | + assert_spin_locked(&p->lock); |
|---|
| 623 | 672 | for_each_node(nid) |
|---|
| 624 | 673 | plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]); |
|---|
| 625 | 674 | } |
|---|
| 626 | 675 | |
|---|
| 627 | 676 | static void del_from_avail_list(struct swap_info_struct *p) |
|---|
| 628 | 677 | { |
|---|
| 678 | + bool skip = false; |
|---|
| 679 | + |
|---|
| 680 | + trace_android_vh_del_from_avail_list(p, &skip); |
|---|
| 681 | + if (skip) |
|---|
| 682 | + return; |
|---|
| 683 | + |
|---|
| 629 | 684 | spin_lock(&swap_avail_lock); |
|---|
| 630 | 685 | __del_from_avail_list(p); |
|---|
| 631 | 686 | spin_unlock(&swap_avail_lock); |
|---|
| .. | .. |
|---|
| 639 | 694 | if (offset == si->lowest_bit) |
|---|
| 640 | 695 | si->lowest_bit += nr_entries; |
|---|
| 641 | 696 | if (end == si->highest_bit) |
|---|
| 642 | | - si->highest_bit -= nr_entries; |
|---|
| 697 | + WRITE_ONCE(si->highest_bit, si->highest_bit - nr_entries); |
|---|
| 643 | 698 | si->inuse_pages += nr_entries; |
|---|
| 644 | 699 | if (si->inuse_pages == si->pages) { |
|---|
| 645 | 700 | si->lowest_bit = si->max; |
|---|
| .. | .. |
|---|
| 651 | 706 | static void add_to_avail_list(struct swap_info_struct *p) |
|---|
| 652 | 707 | { |
|---|
| 653 | 708 | int nid; |
|---|
| 709 | + bool skip = false; |
|---|
| 710 | + |
|---|
| 711 | + trace_android_vh_add_to_avail_list(p, &skip); |
|---|
| 712 | + if (skip) |
|---|
| 713 | + return; |
|---|
| 654 | 714 | |
|---|
| 655 | 715 | spin_lock(&swap_avail_lock); |
|---|
| 656 | 716 | for_each_node(nid) { |
|---|
| .. | .. |
|---|
| 663 | 723 | static void swap_range_free(struct swap_info_struct *si, unsigned long offset, |
|---|
| 664 | 724 | unsigned int nr_entries) |
|---|
| 665 | 725 | { |
|---|
| 726 | + unsigned long begin = offset; |
|---|
| 666 | 727 | unsigned long end = offset + nr_entries - 1; |
|---|
| 667 | 728 | void (*swap_slot_free_notify)(struct block_device *, unsigned long); |
|---|
| 729 | + bool skip = false; |
|---|
| 668 | 730 | |
|---|
| 669 | 731 | if (offset < si->lowest_bit) |
|---|
| 670 | 732 | si->lowest_bit = offset; |
|---|
| 671 | 733 | if (end > si->highest_bit) { |
|---|
| 672 | 734 | bool was_full = !si->highest_bit; |
|---|
| 673 | 735 | |
|---|
| 674 | | - si->highest_bit = end; |
|---|
| 736 | + WRITE_ONCE(si->highest_bit, end); |
|---|
| 675 | 737 | if (was_full && (si->flags & SWP_WRITEOK)) |
|---|
| 676 | 738 | add_to_avail_list(si); |
|---|
| 677 | 739 | } |
|---|
| 678 | | - atomic_long_add(nr_entries, &nr_swap_pages); |
|---|
| 740 | + trace_android_vh_account_swap_pages(si, &skip); |
|---|
| 741 | + if (!skip) |
|---|
| 742 | + atomic_long_add(nr_entries, &nr_swap_pages); |
|---|
| 679 | 743 | si->inuse_pages -= nr_entries; |
|---|
| 680 | 744 | if (si->flags & SWP_BLKDEV) |
|---|
| 681 | 745 | swap_slot_free_notify = |
|---|
| .. | .. |
|---|
| 683 | 747 | else |
|---|
| 684 | 748 | swap_slot_free_notify = NULL; |
|---|
| 685 | 749 | while (offset <= end) { |
|---|
| 750 | + arch_swap_invalidate_page(si->type, offset); |
|---|
| 686 | 751 | frontswap_invalidate_page(si->type, offset); |
|---|
| 687 | 752 | if (swap_slot_free_notify) |
|---|
| 688 | 753 | swap_slot_free_notify(si->bdev, offset); |
|---|
| 689 | 754 | offset++; |
|---|
| 690 | 755 | } |
|---|
| 756 | + clear_shadow_from_swap_cache(si->type, begin, end); |
|---|
| 691 | 757 | } |
|---|
| 692 | 758 | |
|---|
| 693 | | -static int scan_swap_map_slots(struct swap_info_struct *si, |
|---|
| 759 | +static void set_cluster_next(struct swap_info_struct *si, unsigned long next) |
|---|
| 760 | +{ |
|---|
| 761 | + unsigned long prev; |
|---|
| 762 | + |
|---|
| 763 | + if (!(si->flags & SWP_SOLIDSTATE)) { |
|---|
| 764 | + si->cluster_next = next; |
|---|
| 765 | + return; |
|---|
| 766 | + } |
|---|
| 767 | + |
|---|
| 768 | + prev = this_cpu_read(*si->cluster_next_cpu); |
|---|
| 769 | + /* |
|---|
| 770 | + * Cross the swap address space size aligned trunk, choose |
|---|
| 771 | + * another trunk randomly to avoid lock contention on swap |
|---|
| 772 | + * address space if possible. |
|---|
| 773 | + */ |
|---|
| 774 | + if ((prev >> SWAP_ADDRESS_SPACE_SHIFT) != |
|---|
| 775 | + (next >> SWAP_ADDRESS_SPACE_SHIFT)) { |
|---|
| 776 | + /* No free swap slots available */ |
|---|
| 777 | + if (si->highest_bit <= si->lowest_bit) |
|---|
| 778 | + return; |
|---|
| 779 | + next = si->lowest_bit + |
|---|
| 780 | + prandom_u32_max(si->highest_bit - si->lowest_bit + 1); |
|---|
| 781 | + next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES); |
|---|
| 782 | + next = max_t(unsigned int, next, si->lowest_bit); |
|---|
| 783 | + } |
|---|
| 784 | + this_cpu_write(*si->cluster_next_cpu, next); |
|---|
| 785 | +} |
|---|
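
Editor's note: `set_cluster_next()` keeps a per-CPU allocation cursor for SSDs and, when the cursor would cross a swap-address-space "trunk" boundary, re-seeds it at a random trunk to spread lock contention across swap cache address spaces. The userspace sketch below models only the boundary check and re-seed; `SWAP_ADDRESS_SPACE_SHIFT = 14` is assumed from the mainline definition, and `prandom_u32_max()` is replaced by `rand()`.

```c
#include <stdio.h>
#include <stdlib.h>

#define SWAP_ADDRESS_SPACE_SHIFT 14                        /* assumption: mainline value */
#define SWAP_ADDRESS_SPACE_PAGES (1UL << SWAP_ADDRESS_SPACE_SHIFT)
#define ALIGN_DOWN(x, a) ((x) & ~((a) - 1))

/* Model of the trunk-crossing branch in set_cluster_next(). */
static unsigned long pick_next(unsigned long prev, unsigned long next,
			       unsigned long lowest, unsigned long highest)
{
	if ((prev >> SWAP_ADDRESS_SPACE_SHIFT) != (next >> SWAP_ADDRESS_SPACE_SHIFT)) {
		if (highest <= lowest)          /* no free slots: keep the old cursor */
			return prev;
		next = lowest + rand() % (highest - lowest + 1);
		next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES);
		if (next < lowest)
			next = lowest;
	}
	return next;
}

int main(void)
{
	/* Crossing from trunk 0 into trunk 1 triggers the randomized re-seed. */
	printf("%lu\n", pick_next(SWAP_ADDRESS_SPACE_PAGES - 1, SWAP_ADDRESS_SPACE_PAGES,
				  8, 10 * SWAP_ADDRESS_SPACE_PAGES));
	return 0;
}
```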
| 786 | + |
|---|
| 787 | +int scan_swap_map_slots(struct swap_info_struct *si, |
|---|
| 694 | 788 | unsigned char usage, int nr, |
|---|
| 695 | 789 | swp_entry_t slots[]) |
|---|
| 696 | 790 | { |
|---|
| .. | .. |
|---|
| 700 | 794 | unsigned long last_in_cluster = 0; |
|---|
| 701 | 795 | int latency_ration = LATENCY_LIMIT; |
|---|
| 702 | 796 | int n_ret = 0; |
|---|
| 703 | | - |
|---|
| 704 | | - if (nr > SWAP_BATCH) |
|---|
| 705 | | - nr = SWAP_BATCH; |
|---|
| 797 | + bool scanned_many = false; |
|---|
| 706 | 798 | |
|---|
| 707 | 799 | /* |
|---|
| 708 | 800 | * We try to cluster swap pages by allocating them sequentially |
|---|
| .. | .. |
|---|
| 716 | 808 | */ |
|---|
| 717 | 809 | |
|---|
| 718 | 810 | si->flags += SWP_SCANNING; |
|---|
| 719 | | - scan_base = offset = si->cluster_next; |
|---|
| 811 | + /* |
|---|
| 812 | + * Use percpu scan base for SSD to reduce lock contention on |
|---|
| 813 | + * cluster and swap cache. For HDD, sequential access is more |
|---|
| 814 | + * important. |
|---|
| 815 | + */ |
|---|
| 816 | + if (si->flags & SWP_SOLIDSTATE) |
|---|
| 817 | + scan_base = this_cpu_read(*si->cluster_next_cpu); |
|---|
| 818 | + else |
|---|
| 819 | + scan_base = si->cluster_next; |
|---|
| 820 | + offset = scan_base; |
|---|
| 720 | 821 | |
|---|
| 721 | 822 | /* SSD algorithm */ |
|---|
| 722 | 823 | if (si->cluster_info) { |
|---|
| 723 | | - if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) |
|---|
| 724 | | - goto checks; |
|---|
| 725 | | - else |
|---|
| 824 | + if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) |
|---|
| 726 | 825 | goto scan; |
|---|
| 727 | | - } |
|---|
| 728 | | - |
|---|
| 729 | | - if (unlikely(!si->cluster_nr--)) { |
|---|
| 826 | + } else if (unlikely(!si->cluster_nr--)) { |
|---|
| 730 | 827 | if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { |
|---|
| 731 | 828 | si->cluster_nr = SWAPFILE_CLUSTER - 1; |
|---|
| 732 | 829 | goto checks; |
|---|
| .. | .. |
|---|
| 789 | 886 | int swap_was_freed; |
|---|
| 790 | 887 | unlock_cluster(ci); |
|---|
| 791 | 888 | spin_unlock(&si->lock); |
|---|
| 792 | | - swap_was_freed = __try_to_reclaim_swap(si, offset); |
|---|
| 889 | + swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); |
|---|
| 793 | 890 | spin_lock(&si->lock); |
|---|
| 794 | 891 | /* entry was freed successfully, try to use this again */ |
|---|
| 795 | 892 | if (swap_was_freed) |
|---|
| .. | .. |
|---|
| 804 | 901 | else |
|---|
| 805 | 902 | goto done; |
|---|
| 806 | 903 | } |
|---|
| 807 | | - si->swap_map[offset] = usage; |
|---|
| 904 | + WRITE_ONCE(si->swap_map[offset], usage); |
|---|
| 808 | 905 | inc_cluster_info_page(si, si->cluster_info, offset); |
|---|
| 809 | 906 | unlock_cluster(ci); |
|---|
| 810 | 907 | |
|---|
| 811 | 908 | swap_range_alloc(si, offset, 1); |
|---|
| 812 | | - si->cluster_next = offset + 1; |
|---|
| 813 | 909 | slots[n_ret++] = swp_entry(si->type, offset); |
|---|
| 814 | 910 | |
|---|
| 815 | 911 | /* got enough slots or reach max slots? */ |
|---|
| .. | .. |
|---|
| 832 | 928 | if (si->cluster_info) { |
|---|
| 833 | 929 | if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) |
|---|
| 834 | 930 | goto checks; |
|---|
| 835 | | - else |
|---|
| 836 | | - goto done; |
|---|
| 837 | | - } |
|---|
| 838 | | - /* non-ssd case */ |
|---|
| 839 | | - ++offset; |
|---|
| 840 | | - |
|---|
| 841 | | - /* non-ssd case, still more slots in cluster? */ |
|---|
| 842 | | - if (si->cluster_nr && !si->swap_map[offset]) { |
|---|
| 931 | + } else if (si->cluster_nr && !si->swap_map[++offset]) { |
|---|
| 932 | + /* non-ssd case, still more slots in cluster? */ |
|---|
| 843 | 933 | --si->cluster_nr; |
|---|
| 844 | 934 | goto checks; |
|---|
| 845 | 935 | } |
|---|
| 846 | 936 | |
|---|
| 937 | + /* |
|---|
| 938 | + * Even if there's no free clusters available (fragmented), |
|---|
| 939 | + * try to scan a little more quickly with lock held unless we |
|---|
| 940 | + * have scanned too many slots already. |
|---|
| 941 | + */ |
|---|
| 942 | + if (!scanned_many) { |
|---|
| 943 | + unsigned long scan_limit; |
|---|
| 944 | + |
|---|
| 945 | + if (offset < scan_base) |
|---|
| 946 | + scan_limit = scan_base; |
|---|
| 947 | + else |
|---|
| 948 | + scan_limit = si->highest_bit; |
|---|
| 949 | + for (; offset <= scan_limit && --latency_ration > 0; |
|---|
| 950 | + offset++) { |
|---|
| 951 | + if (!si->swap_map[offset]) |
|---|
| 952 | + goto checks; |
|---|
| 953 | + } |
|---|
| 954 | + } |
|---|
| 955 | + |
|---|
| 847 | 956 | done: |
|---|
| 957 | + set_cluster_next(si, offset + 1); |
|---|
| 848 | 958 | si->flags -= SWP_SCANNING; |
|---|
| 849 | 959 | return n_ret; |
|---|
| 850 | 960 | |
|---|
| 851 | 961 | scan: |
|---|
| 852 | 962 | spin_unlock(&si->lock); |
|---|
| 853 | | - while (++offset <= si->highest_bit) { |
|---|
| 854 | | - if (!si->swap_map[offset]) { |
|---|
| 963 | + while (++offset <= READ_ONCE(si->highest_bit)) { |
|---|
| 964 | + if (data_race(!si->swap_map[offset])) { |
|---|
| 855 | 965 | spin_lock(&si->lock); |
|---|
| 856 | 966 | goto checks; |
|---|
| 857 | 967 | } |
|---|
| 858 | | - if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { |
|---|
| 968 | + if (vm_swap_full() && |
|---|
| 969 | + READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) { |
|---|
| 859 | 970 | spin_lock(&si->lock); |
|---|
| 860 | 971 | goto checks; |
|---|
| 861 | 972 | } |
|---|
| 862 | 973 | if (unlikely(--latency_ration < 0)) { |
|---|
| 863 | 974 | cond_resched(); |
|---|
| 864 | 975 | latency_ration = LATENCY_LIMIT; |
|---|
| 976 | + scanned_many = true; |
|---|
| 865 | 977 | } |
|---|
| 866 | 978 | } |
|---|
| 867 | 979 | offset = si->lowest_bit; |
|---|
| 868 | 980 | while (offset < scan_base) { |
|---|
| 869 | | - if (!si->swap_map[offset]) { |
|---|
| 981 | + if (data_race(!si->swap_map[offset])) { |
|---|
| 870 | 982 | spin_lock(&si->lock); |
|---|
| 871 | 983 | goto checks; |
|---|
| 872 | 984 | } |
|---|
| 873 | | - if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { |
|---|
| 985 | + if (vm_swap_full() && |
|---|
| 986 | + READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) { |
|---|
| 874 | 987 | spin_lock(&si->lock); |
|---|
| 875 | 988 | goto checks; |
|---|
| 876 | 989 | } |
|---|
| 877 | 990 | if (unlikely(--latency_ration < 0)) { |
|---|
| 878 | 991 | cond_resched(); |
|---|
| 879 | 992 | latency_ration = LATENCY_LIMIT; |
|---|
| 993 | + scanned_many = true; |
|---|
| 880 | 994 | } |
|---|
| 881 | 995 | offset++; |
|---|
| 882 | 996 | } |
|---|
| .. | .. |
|---|
| 886 | 1000 | si->flags -= SWP_SCANNING; |
|---|
| 887 | 1001 | return n_ret; |
|---|
| 888 | 1002 | } |
|---|
| 1003 | +EXPORT_SYMBOL_GPL(scan_swap_map_slots); |
|---|
| 889 | 1004 | |
|---|
| 890 | | -static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot) |
|---|
| 1005 | +int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot) |
|---|
| 891 | 1006 | { |
|---|
| 892 | 1007 | unsigned long idx; |
|---|
| 893 | 1008 | struct swap_cluster_info *ci; |
|---|
| .. | .. |
|---|
| 921 | 1036 | |
|---|
| 922 | 1037 | return 1; |
|---|
| 923 | 1038 | } |
|---|
| 1039 | +EXPORT_SYMBOL_GPL(swap_alloc_cluster); |
|---|
| 924 | 1040 | |
|---|
| 925 | 1041 | static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) |
|---|
| 926 | 1042 | { |
|---|
| .. | .. |
|---|
| 928 | 1044 | struct swap_cluster_info *ci; |
|---|
| 929 | 1045 | |
|---|
| 930 | 1046 | ci = lock_cluster(si, offset); |
|---|
| 1047 | + memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER); |
|---|
| 931 | 1048 | cluster_set_count_flag(ci, 0, 0); |
|---|
| 932 | 1049 | free_cluster(si, idx); |
|---|
| 933 | 1050 | unlock_cluster(ci); |
|---|
| .. | .. |
|---|
| 960 | 1077 | /* Only single cluster request supported */ |
|---|
| 961 | 1078 | WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER); |
|---|
| 962 | 1079 | |
|---|
| 1080 | + spin_lock(&swap_avail_lock); |
|---|
| 1081 | + |
|---|
| 963 | 1082 | avail_pgs = atomic_long_read(&nr_swap_pages) / size; |
|---|
| 964 | | - if (avail_pgs <= 0) |
|---|
| 1083 | + if (avail_pgs <= 0) { |
|---|
| 1084 | + spin_unlock(&swap_avail_lock); |
|---|
| 965 | 1085 | goto noswap; |
|---|
| 1086 | + } |
|---|
| 966 | 1087 | |
|---|
| 967 | | - if (n_goal > SWAP_BATCH) |
|---|
| 968 | | - n_goal = SWAP_BATCH; |
|---|
| 969 | | - |
|---|
| 970 | | - if (n_goal > avail_pgs) |
|---|
| 971 | | - n_goal = avail_pgs; |
|---|
| 1088 | + n_goal = min3((long)n_goal, (long)SWAP_BATCH, avail_pgs); |
|---|
| 972 | 1089 | |
|---|
| 973 | 1090 | atomic_long_sub(n_goal * size, &nr_swap_pages); |
|---|
| 974 | | - |
|---|
| 975 | | - spin_lock(&swap_avail_lock); |
|---|
| 976 | 1091 | |
|---|
| 977 | 1092 | start_over: |
|---|
| 978 | 1093 | node = numa_node_id(); |
|---|
| .. | .. |
|---|
| 1008 | 1123 | goto check_out; |
|---|
| 1009 | 1124 | pr_debug("scan_swap_map of si %d failed to find offset\n", |
|---|
| 1010 | 1125 | si->type); |
|---|
| 1126 | + cond_resched(); |
|---|
| 1011 | 1127 | |
|---|
| 1012 | 1128 | spin_lock(&swap_avail_lock); |
|---|
| 1013 | 1129 | nextsi: |
|---|
| .. | .. |
|---|
| 1041 | 1157 | { |
|---|
| 1042 | 1158 | struct swap_info_struct *si = swap_type_to_swap_info(type); |
|---|
| 1043 | 1159 | pgoff_t offset; |
|---|
| 1160 | + bool skip = false; |
|---|
| 1044 | 1161 | |
|---|
| 1045 | 1162 | if (!si) |
|---|
| 1046 | 1163 | goto fail; |
|---|
| 1047 | 1164 | |
|---|
| 1048 | 1165 | spin_lock(&si->lock); |
|---|
| 1049 | 1166 | if (si->flags & SWP_WRITEOK) { |
|---|
| 1050 | | - atomic_long_dec(&nr_swap_pages); |
|---|
| 1051 | 1167 | /* This is called for allocating swap entry, not cache */ |
|---|
| 1052 | 1168 | offset = scan_swap_map(si, 1); |
|---|
| 1053 | 1169 | if (offset) { |
|---|
| 1170 | + trace_android_vh_account_swap_pages(si, &skip); |
|---|
| 1171 | + if (!skip) |
|---|
| 1172 | + atomic_long_dec(&nr_swap_pages); |
|---|
| 1054 | 1173 | spin_unlock(&si->lock); |
|---|
| 1055 | 1174 | return swp_entry(type, offset); |
|---|
| 1056 | 1175 | } |
|---|
| 1057 | | - atomic_long_inc(&nr_swap_pages); |
|---|
| 1058 | 1176 | } |
|---|
| 1059 | 1177 | spin_unlock(&si->lock); |
|---|
| 1060 | 1178 | fail: |
|---|
| .. | .. |
|---|
| 1064 | 1182 | static struct swap_info_struct *__swap_info_get(swp_entry_t entry) |
|---|
| 1065 | 1183 | { |
|---|
| 1066 | 1184 | struct swap_info_struct *p; |
|---|
| 1067 | | - unsigned long offset, type; |
|---|
| 1185 | + unsigned long offset; |
|---|
| 1068 | 1186 | |
|---|
| 1069 | 1187 | if (!entry.val) |
|---|
| 1070 | 1188 | goto out; |
|---|
| 1071 | | - type = swp_type(entry); |
|---|
| 1072 | | - p = swap_type_to_swap_info(type); |
|---|
| 1189 | + p = swp_swap_info(entry); |
|---|
| 1073 | 1190 | if (!p) |
|---|
| 1074 | 1191 | goto bad_nofile; |
|---|
| 1075 | | - if (!(p->flags & SWP_USED)) |
|---|
| 1192 | + if (data_race(!(p->flags & SWP_USED))) |
|---|
| 1076 | 1193 | goto bad_device; |
|---|
| 1077 | 1194 | offset = swp_offset(entry); |
|---|
| 1078 | 1195 | if (offset >= p->max) |
|---|
| .. | .. |
|---|
| 1098 | 1215 | p = __swap_info_get(entry); |
|---|
| 1099 | 1216 | if (!p) |
|---|
| 1100 | 1217 | goto out; |
|---|
| 1101 | | - if (!p->swap_map[swp_offset(entry)]) |
|---|
| 1218 | + if (data_race(!p->swap_map[swp_offset(entry)])) |
|---|
| 1102 | 1219 | goto bad_free; |
|---|
| 1103 | 1220 | return p; |
|---|
| 1104 | 1221 | |
|---|
| 1105 | 1222 | bad_free: |
|---|
| 1106 | 1223 | pr_err("swap_info_get: %s%08lx\n", Unused_offset, entry.val); |
|---|
| 1107 | | - goto out; |
|---|
| 1108 | 1224 | out: |
|---|
| 1109 | 1225 | return NULL; |
|---|
| 1110 | 1226 | } |
|---|
| .. | .. |
|---|
| 1167 | 1283 | } |
|---|
| 1168 | 1284 | |
|---|
| 1169 | 1285 | usage = count | has_cache; |
|---|
| 1170 | | - p->swap_map[offset] = usage ? : SWAP_HAS_CACHE; |
|---|
| 1286 | + if (usage) |
|---|
| 1287 | + WRITE_ONCE(p->swap_map[offset], usage); |
|---|
| 1288 | + else |
|---|
| 1289 | + WRITE_ONCE(p->swap_map[offset], SWAP_HAS_CACHE); |
|---|
| 1171 | 1290 | |
|---|
| 1172 | 1291 | return usage; |
|---|
| 1173 | 1292 | } |
|---|
| 1174 | 1293 | |
|---|
| 1294 | +/* |
|---|
| 1295 | + * Check whether swap entry is valid in the swap device. If so, |
|---|
| 1296 | + * return pointer to swap_info_struct, and keep the swap entry valid |
|---|
| 1297 | + * via preventing the swap device from being swapoff, until |
|---|
| 1298 | + * put_swap_device() is called. Otherwise return NULL. |
|---|
| 1299 | + * |
|---|
| 1300 | + * The entirety of the RCU read critical section must come before the |
|---|
| 1301 | + * return from or after the call to synchronize_rcu() in |
|---|
| 1302 | + * enable_swap_info() or swapoff(). So if "si->flags & SWP_VALID" is |
|---|
| 1303 | + * true, the si->map, si->cluster_info, etc. must be valid in the |
|---|
| 1304 | + * critical section. |
|---|
| 1305 | + * |
|---|
| 1306 | + * Notice that swapoff or swapoff+swapon can still happen before the |
|---|
| 1307 | + * rcu_read_lock() in get_swap_device() or after the rcu_read_unlock() |
|---|
| 1308 | + * in put_swap_device() if there isn't any other way to prevent |
|---|
| 1309 | + * swapoff, such as page lock, page table lock, etc. The caller must |
|---|
| 1310 | + * be prepared for that. For example, the following situation is |
|---|
| 1311 | + * possible. |
|---|
| 1312 | + * |
|---|
| 1313 | + * CPU1 CPU2 |
|---|
| 1314 | + * do_swap_page() |
|---|
| 1315 | + * ... swapoff+swapon |
|---|
| 1316 | + * __read_swap_cache_async() |
|---|
| 1317 | + * swapcache_prepare() |
|---|
| 1318 | + * __swap_duplicate() |
|---|
| 1319 | + * // check swap_map |
|---|
| 1320 | + * // verify PTE not changed |
|---|
| 1321 | + * |
|---|
| 1322 | + * In __swap_duplicate(), the swap_map need to be checked before |
|---|
| 1323 | + * changing partly because the specified swap entry may be for another |
|---|
| 1324 | + * swap device which has been swapoff. And in do_swap_page(), after |
|---|
| 1325 | + * the page is read from the swap device, the PTE is verified not |
|---|
| 1326 | + * changed with the page table locked to check whether the swap device |
|---|
| 1327 | + * has been swapoff or swapoff+swapon. |
|---|
| 1328 | + */ |
|---|
| 1329 | +struct swap_info_struct *get_swap_device(swp_entry_t entry) |
|---|
| 1330 | +{ |
|---|
| 1331 | + struct swap_info_struct *si; |
|---|
| 1332 | + unsigned long offset; |
|---|
| 1333 | + |
|---|
| 1334 | + if (!entry.val) |
|---|
| 1335 | + goto out; |
|---|
| 1336 | + si = swp_swap_info(entry); |
|---|
| 1337 | + if (!si) |
|---|
| 1338 | + goto bad_nofile; |
|---|
| 1339 | + |
|---|
| 1340 | + rcu_read_lock(); |
|---|
| 1341 | + if (data_race(!(si->flags & SWP_VALID))) |
|---|
| 1342 | + goto unlock_out; |
|---|
| 1343 | + offset = swp_offset(entry); |
|---|
| 1344 | + if (offset >= si->max) |
|---|
| 1345 | + goto unlock_out; |
|---|
| 1346 | + |
|---|
| 1347 | + return si; |
|---|
| 1348 | +bad_nofile: |
|---|
| 1349 | + pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val); |
|---|
| 1350 | +out: |
|---|
| 1351 | + return NULL; |
|---|
| 1352 | +unlock_out: |
|---|
| 1353 | + rcu_read_unlock(); |
|---|
| 1354 | + return NULL; |
|---|
| 1355 | +} |
|---|
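
Editor's note: the long comment above spells out the contract; the call shape callers are expected to follow is the get/put bracket shown below, the same pattern the `__swap_count()` and `__swp_swapcount()` conversions use later in this diff (in the kernel, `put_swap_device()` is the `rcu_read_unlock()` counterpart in `linux/swap.h`). The snippet is a sketch with stubbed types so it compiles standalone; the stubs are not kernel code.

```c
/* Standalone sketch of the get_swap_device()/put_swap_device() bracket.
 * All types and helpers here are illustrative stubs, not kernel APIs. */
#include <stdio.h>

struct swap_info_struct { unsigned char swap_map[64]; int valid; };
typedef struct { unsigned long val; } swp_entry_t;

static struct swap_info_struct si_stub = { .swap_map = { [3] = 2 }, .valid = 1 };

static struct swap_info_struct *get_swap_device(swp_entry_t entry)
{
	/* Real code: rcu_read_lock(), then SWP_VALID and offset-range checks. */
	if (!si_stub.valid || entry.val >= sizeof(si_stub.swap_map))
		return NULL;
	return &si_stub;
}

static void put_swap_device(struct swap_info_struct *si)
{
	/* Real code: rcu_read_unlock(). */
	(void)si;
}

int main(void)
{
	swp_entry_t entry = { .val = 3 };
	struct swap_info_struct *si = get_swap_device(entry);
	int count = 0;

	if (si) {               /* swap_map is only safe to read inside the bracket */
		count = si->swap_map[entry.val];
		put_swap_device(si);
	}
	printf("%d\n", count);
	return 0;
}
```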
| 1356 | + |
|---|
| 1175 | 1357 | static unsigned char __swap_entry_free(struct swap_info_struct *p, |
|---|
| 1176 | | - swp_entry_t entry, unsigned char usage) |
|---|
| 1358 | + swp_entry_t entry) |
|---|
| 1177 | 1359 | { |
|---|
| 1178 | 1360 | struct swap_cluster_info *ci; |
|---|
| 1179 | 1361 | unsigned long offset = swp_offset(entry); |
|---|
| 1362 | + unsigned char usage; |
|---|
| 1180 | 1363 | |
|---|
| 1181 | 1364 | ci = lock_cluster_or_swap_info(p, offset); |
|---|
| 1182 | | - usage = __swap_entry_free_locked(p, offset, usage); |
|---|
| 1365 | + usage = __swap_entry_free_locked(p, offset, 1); |
|---|
| 1183 | 1366 | unlock_cluster_or_swap_info(p, ci); |
|---|
| 1367 | + if (!usage) |
|---|
| 1368 | + free_swap_slot(entry); |
|---|
| 1184 | 1369 | |
|---|
| 1185 | 1370 | return usage; |
|---|
| 1186 | 1371 | } |
|---|
| .. | .. |
|---|
| 1211 | 1396 | struct swap_info_struct *p; |
|---|
| 1212 | 1397 | |
|---|
| 1213 | 1398 | p = _swap_info_get(entry); |
|---|
| 1214 | | - if (p) { |
|---|
| 1215 | | - if (!__swap_entry_free(p, entry, 1)) |
|---|
| 1216 | | - free_swap_slot(entry); |
|---|
| 1217 | | - } |
|---|
| 1399 | + if (p) |
|---|
| 1400 | + __swap_entry_free(p, entry); |
|---|
| 1218 | 1401 | } |
|---|
| 1219 | 1402 | |
|---|
| 1220 | 1403 | /* |
|---|
| .. | .. |
|---|
| 1229 | 1412 | unsigned char *map; |
|---|
| 1230 | 1413 | unsigned int i, free_entries = 0; |
|---|
| 1231 | 1414 | unsigned char val; |
|---|
| 1232 | | - int size = swap_entry_size(hpage_nr_pages(page)); |
|---|
| 1415 | + int size = swap_entry_size(thp_nr_pages(page)); |
|---|
| 1233 | 1416 | |
|---|
| 1234 | 1417 | si = _swap_info_get(entry); |
|---|
| 1235 | 1418 | if (!si) |
|---|
| .. | .. |
|---|
| 1249 | 1432 | if (free_entries == SWAPFILE_CLUSTER) { |
|---|
| 1250 | 1433 | unlock_cluster_or_swap_info(si, ci); |
|---|
| 1251 | 1434 | spin_lock(&si->lock); |
|---|
| 1252 | | - ci = lock_cluster(si, offset); |
|---|
| 1253 | | - memset(map, 0, SWAPFILE_CLUSTER); |
|---|
| 1254 | | - unlock_cluster(ci); |
|---|
| 1255 | 1435 | mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER); |
|---|
| 1256 | 1436 | swap_free_cluster(si, idx); |
|---|
| 1257 | 1437 | spin_unlock(&si->lock); |
|---|
| .. | .. |
|---|
| 1321 | 1501 | if (p) |
|---|
| 1322 | 1502 | spin_unlock(&p->lock); |
|---|
| 1323 | 1503 | } |
|---|
| 1504 | +EXPORT_SYMBOL_GPL(swapcache_free_entries); |
|---|
| 1324 | 1505 | |
|---|
| 1325 | 1506 | /* |
|---|
| 1326 | 1507 | * How many references to page are currently swapped out? |
|---|
| .. | .. |
|---|
| 1346 | 1527 | return count; |
|---|
| 1347 | 1528 | } |
|---|
| 1348 | 1529 | |
|---|
| 1349 | | -int __swap_count(struct swap_info_struct *si, swp_entry_t entry) |
|---|
| 1530 | +int __swap_count(swp_entry_t entry) |
|---|
| 1350 | 1531 | { |
|---|
| 1532 | + struct swap_info_struct *si; |
|---|
| 1351 | 1533 | pgoff_t offset = swp_offset(entry); |
|---|
| 1534 | + int count = 0; |
|---|
| 1352 | 1535 | |
|---|
| 1353 | | - return swap_count(si->swap_map[offset]); |
|---|
| 1536 | + si = get_swap_device(entry); |
|---|
| 1537 | + if (si) { |
|---|
| 1538 | + count = swap_count(si->swap_map[offset]); |
|---|
| 1539 | + put_swap_device(si); |
|---|
| 1540 | + } |
|---|
| 1541 | + return count; |
|---|
| 1354 | 1542 | } |
|---|
| 1355 | 1543 | |
|---|
| 1356 | 1544 | static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) |
|---|
| .. | .. |
|---|
| 1375 | 1563 | int count = 0; |
|---|
| 1376 | 1564 | struct swap_info_struct *si; |
|---|
| 1377 | 1565 | |
|---|
| 1378 | | - si = __swap_info_get(entry); |
|---|
| 1379 | | - if (si) |
|---|
| 1566 | + si = get_swap_device(entry); |
|---|
| 1567 | + if (si) { |
|---|
| 1380 | 1568 | count = swap_swapcount(si, entry); |
|---|
| 1569 | + put_swap_device(si); |
|---|
| 1570 | + } |
|---|
| 1381 | 1571 | return count; |
|---|
| 1382 | 1572 | } |
|---|
| 1383 | 1573 | |
|---|
| .. | .. |
|---|
| 1624 | 1814 | int free_swap_and_cache(swp_entry_t entry) |
|---|
| 1625 | 1815 | { |
|---|
| 1626 | 1816 | struct swap_info_struct *p; |
|---|
| 1627 | | - struct page *page = NULL; |
|---|
| 1628 | 1817 | unsigned char count; |
|---|
| 1629 | 1818 | |
|---|
| 1630 | 1819 | if (non_swap_entry(entry)) |
|---|
| .. | .. |
|---|
| 1632 | 1821 | |
|---|
| 1633 | 1822 | p = _swap_info_get(entry); |
|---|
| 1634 | 1823 | if (p) { |
|---|
| 1635 | | - count = __swap_entry_free(p, entry, 1); |
|---|
| 1824 | + count = __swap_entry_free(p, entry); |
|---|
| 1636 | 1825 | if (count == SWAP_HAS_CACHE && |
|---|
| 1637 | | - !swap_page_trans_huge_swapped(p, entry)) { |
|---|
| 1638 | | - page = find_get_page(swap_address_space(entry), |
|---|
| 1639 | | - swp_offset(entry)); |
|---|
| 1640 | | - if (page && !trylock_page(page)) { |
|---|
| 1641 | | - put_page(page); |
|---|
| 1642 | | - page = NULL; |
|---|
| 1643 | | - } |
|---|
| 1644 | | - } else if (!count) |
|---|
| 1645 | | - free_swap_slot(entry); |
|---|
| 1646 | | - } |
|---|
| 1647 | | - if (page) { |
|---|
| 1648 | | - /* |
|---|
| 1649 | | - * Not mapped elsewhere, or swap space full? Free it! |
|---|
| 1650 | | - * Also recheck PageSwapCache now page is locked (above). |
|---|
| 1651 | | - */ |
|---|
| 1652 | | - if (PageSwapCache(page) && !PageWriteback(page) && |
|---|
| 1653 | | - (!page_mapped(page) || mem_cgroup_swap_full(page)) && |
|---|
| 1654 | | - !swap_page_trans_huge_swapped(p, entry)) { |
|---|
| 1655 | | - page = compound_head(page); |
|---|
| 1656 | | - delete_from_swap_cache(page); |
|---|
| 1657 | | - SetPageDirty(page); |
|---|
| 1658 | | - } |
|---|
| 1659 | | - unlock_page(page); |
|---|
| 1660 | | - put_page(page); |
|---|
| 1826 | + !swap_page_trans_huge_swapped(p, entry)) |
|---|
| 1827 | + __try_to_reclaim_swap(p, swp_offset(entry), |
|---|
| 1828 | + TTRS_UNMAPPED | TTRS_FULL); |
|---|
| 1661 | 1829 | } |
|---|
| 1662 | 1830 | return p != NULL; |
|---|
| 1663 | 1831 | } |
|---|
| .. | .. |
|---|
| 1671 | 1839 | * |
|---|
| 1672 | 1840 | * This is needed for the suspend to disk (aka swsusp). |
|---|
| 1673 | 1841 | */ |
|---|
| 1674 | | -int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) |
|---|
| 1842 | +int swap_type_of(dev_t device, sector_t offset) |
|---|
| 1675 | 1843 | { |
|---|
| 1676 | | - struct block_device *bdev = NULL; |
|---|
| 1677 | 1844 | int type; |
|---|
| 1678 | 1845 | |
|---|
| 1679 | | - if (device) |
|---|
| 1680 | | - bdev = bdget(device); |
|---|
| 1846 | + if (!device) |
|---|
| 1847 | + return -1; |
|---|
| 1681 | 1848 | |
|---|
| 1682 | 1849 | spin_lock(&swap_lock); |
|---|
| 1683 | 1850 | for (type = 0; type < nr_swapfiles; type++) { |
|---|
| .. | .. |
|---|
| 1686 | 1853 | if (!(sis->flags & SWP_WRITEOK)) |
|---|
| 1687 | 1854 | continue; |
|---|
| 1688 | 1855 | |
|---|
| 1689 | | - if (!bdev) { |
|---|
| 1690 | | - if (bdev_p) |
|---|
| 1691 | | - *bdev_p = bdgrab(sis->bdev); |
|---|
| 1692 | | - |
|---|
| 1693 | | - spin_unlock(&swap_lock); |
|---|
| 1694 | | - return type; |
|---|
| 1695 | | - } |
|---|
| 1696 | | - if (bdev == sis->bdev) { |
|---|
| 1697 | | - struct swap_extent *se = &sis->first_swap_extent; |
|---|
| 1856 | + if (device == sis->bdev->bd_dev) { |
|---|
| 1857 | + struct swap_extent *se = first_se(sis); |
|---|
| 1698 | 1858 | |
|---|
| 1699 | 1859 | if (se->start_block == offset) { |
|---|
| 1700 | | - if (bdev_p) |
|---|
| 1701 | | - *bdev_p = bdgrab(sis->bdev); |
|---|
| 1702 | | - |
|---|
| 1703 | 1860 | spin_unlock(&swap_lock); |
|---|
| 1704 | | - bdput(bdev); |
|---|
| 1705 | 1861 | return type; |
|---|
| 1706 | 1862 | } |
|---|
| 1707 | 1863 | } |
|---|
| 1708 | 1864 | } |
|---|
| 1709 | 1865 | spin_unlock(&swap_lock); |
|---|
| 1710 | | - if (bdev) |
|---|
| 1711 | | - bdput(bdev); |
|---|
| 1866 | + return -ENODEV; |
|---|
| 1867 | +} |
|---|
| 1712 | 1868 | |
|---|
| 1869 | +int find_first_swap(dev_t *device) |
|---|
| 1870 | +{ |
|---|
| 1871 | + int type; |
|---|
| 1872 | + |
|---|
| 1873 | + spin_lock(&swap_lock); |
|---|
| 1874 | + for (type = 0; type < nr_swapfiles; type++) { |
|---|
| 1875 | + struct swap_info_struct *sis = swap_info[type]; |
|---|
| 1876 | + |
|---|
| 1877 | + if (!(sis->flags & SWP_WRITEOK)) |
|---|
| 1878 | + continue; |
|---|
| 1879 | + *device = sis->bdev->bd_dev; |
|---|
| 1880 | + spin_unlock(&swap_lock); |
|---|
| 1881 | + return type; |
|---|
| 1882 | + } |
|---|
| 1883 | + spin_unlock(&swap_lock); |
|---|
| 1713 | 1884 | return -ENODEV; |
|---|
| 1714 | 1885 | } |
|---|
| 1715 | 1886 | |
|---|
| .. | .. |
|---|
| 1756 | 1927 | |
|---|
| 1757 | 1928 | static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte) |
|---|
| 1758 | 1929 | { |
|---|
| 1759 | | - return pte_same(pte_swp_clear_soft_dirty(pte), swp_pte); |
|---|
| 1930 | + return pte_same(pte_swp_clear_flags(pte), swp_pte); |
|---|
| 1760 | 1931 | } |
|---|
| 1761 | 1932 | |
|---|
| 1762 | 1933 | /* |
|---|
| .. | .. |
|---|
| 1768 | 1939 | unsigned long addr, swp_entry_t entry, struct page *page) |
|---|
| 1769 | 1940 | { |
|---|
| 1770 | 1941 | struct page *swapcache; |
|---|
| 1771 | | - struct mem_cgroup *memcg; |
|---|
| 1772 | 1942 | spinlock_t *ptl; |
|---|
| 1773 | 1943 | pte_t *pte; |
|---|
| 1774 | 1944 | int ret = 1; |
|---|
| .. | .. |
|---|
| 1778 | 1948 | if (unlikely(!page)) |
|---|
| 1779 | 1949 | return -ENOMEM; |
|---|
| 1780 | 1950 | |
|---|
| 1781 | | - if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, |
|---|
| 1782 | | - &memcg, false)) { |
|---|
| 1783 | | - ret = -ENOMEM; |
|---|
| 1784 | | - goto out_nolock; |
|---|
| 1785 | | - } |
|---|
| 1786 | | - |
|---|
| 1787 | 1951 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
|---|
| 1788 | 1952 | if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) { |
|---|
| 1789 | | - mem_cgroup_cancel_charge(page, memcg, false); |
|---|
| 1790 | 1953 | ret = 0; |
|---|
| 1791 | 1954 | goto out; |
|---|
| 1792 | 1955 | } |
|---|
| .. | .. |
|---|
| 1798 | 1961 | pte_mkold(mk_pte(page, vma->vm_page_prot))); |
|---|
| 1799 | 1962 | if (page == swapcache) { |
|---|
| 1800 | 1963 | page_add_anon_rmap(page, vma, addr, false); |
|---|
| 1801 | | - mem_cgroup_commit_charge(page, memcg, true, false); |
|---|
| 1802 | 1964 | } else { /* ksm created a completely new copy */ |
|---|
| 1803 | 1965 | page_add_new_anon_rmap(page, vma, addr, false); |
|---|
| 1804 | | - mem_cgroup_commit_charge(page, memcg, false, false); |
|---|
| 1805 | | - lru_cache_add_active_or_unevictable(page, vma); |
|---|
| 1966 | + lru_cache_add_inactive_or_unevictable(page, vma); |
|---|
| 1806 | 1967 | } |
|---|
| 1807 | 1968 | swap_free(entry); |
|---|
| 1808 | | - /* |
|---|
| 1809 | | - * Move the page to the active list so it is not |
|---|
| 1810 | | - * immediately swapped out again after swapon. |
|---|
| 1811 | | - */ |
|---|
| 1812 | | - activate_page(page); |
|---|
| 1813 | 1969 | out: |
|---|
| 1814 | 1970 | pte_unmap_unlock(pte, ptl); |
|---|
| 1815 | | -out_nolock: |
|---|
| 1816 | 1971 | if (page != swapcache) { |
|---|
| 1817 | 1972 | unlock_page(page); |
|---|
| 1818 | 1973 | put_page(page); |
|---|
| .. | .. |
|---|
| 1821 | 1976 | } |
|---|
| 1822 | 1977 | |
|---|
| 1823 | 1978 | static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, |
|---|
| 1824 | | - unsigned long addr, unsigned long end, |
|---|
| 1825 | | - swp_entry_t entry, struct page *page) |
|---|
| 1979 | + unsigned long addr, unsigned long end, |
|---|
| 1980 | + unsigned int type, bool frontswap, |
|---|
| 1981 | + unsigned long *fs_pages_to_unuse) |
|---|
| 1826 | 1982 | { |
|---|
| 1827 | | - pte_t swp_pte = swp_entry_to_pte(entry); |
|---|
| 1983 | + struct page *page; |
|---|
| 1984 | + swp_entry_t entry; |
|---|
| 1828 | 1985 | pte_t *pte; |
|---|
| 1986 | + struct swap_info_struct *si; |
|---|
| 1987 | + unsigned long offset; |
|---|
| 1829 | 1988 | int ret = 0; |
|---|
| 1989 | + volatile unsigned char *swap_map; |
|---|
| 1830 | 1990 | |
|---|
| 1831 | | - /* |
|---|
| 1832 | | - * We don't actually need pte lock while scanning for swp_pte: since |
|---|
| 1833 | | - * we hold page lock and mmap_sem, swp_pte cannot be inserted into the |
|---|
| 1834 | | - * page table while we're scanning; though it could get zapped, and on |
|---|
| 1835 | | - * some architectures (e.g. x86_32 with PAE) we might catch a glimpse |
|---|
| 1836 | | - * of unmatched parts which look like swp_pte, so unuse_pte must |
|---|
| 1837 | | - * recheck under pte lock. Scanning without pte lock lets it be |
|---|
| 1838 | | - * preemptable whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE. |
|---|
| 1839 | | - */ |
|---|
| 1991 | + si = swap_info[type]; |
|---|
| 1840 | 1992 | pte = pte_offset_map(pmd, addr); |
|---|
| 1841 | 1993 | do { |
|---|
| 1842 | | - /* |
|---|
| 1843 | | - * swapoff spends a _lot_ of time in this loop! |
|---|
| 1844 | | - * Test inline before going to call unuse_pte. |
|---|
| 1845 | | - */ |
|---|
| 1846 | | - if (unlikely(pte_same_as_swp(*pte, swp_pte))) { |
|---|
| 1847 | | - pte_unmap(pte); |
|---|
| 1848 | | - ret = unuse_pte(vma, pmd, addr, entry, page); |
|---|
| 1849 | | - if (ret) |
|---|
| 1850 | | - goto out; |
|---|
| 1851 | | - pte = pte_offset_map(pmd, addr); |
|---|
| 1994 | + if (!is_swap_pte(*pte)) |
|---|
| 1995 | + continue; |
|---|
| 1996 | + |
|---|
| 1997 | + entry = pte_to_swp_entry(*pte); |
|---|
| 1998 | + if (swp_type(entry) != type) |
|---|
| 1999 | + continue; |
|---|
| 2000 | + |
|---|
| 2001 | + offset = swp_offset(entry); |
|---|
| 2002 | + if (frontswap && !frontswap_test(si, offset)) |
|---|
| 2003 | + continue; |
|---|
| 2004 | + |
|---|
| 2005 | + pte_unmap(pte); |
|---|
| 2006 | + swap_map = &si->swap_map[offset]; |
|---|
| 2007 | + page = lookup_swap_cache(entry, vma, addr); |
|---|
| 2008 | + if (!page) { |
|---|
| 2009 | + struct vm_fault vmf = { |
|---|
| 2010 | + .vma = vma, |
|---|
| 2011 | + .address = addr, |
|---|
| 2012 | + .pmd = pmd, |
|---|
| 2013 | + }; |
|---|
| 2014 | + |
|---|
| 2015 | + page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, |
|---|
| 2016 | + &vmf); |
|---|
| 1852 | 2017 | } |
|---|
| 2018 | + if (!page) { |
|---|
| 2019 | + if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD) |
|---|
| 2020 | + goto try_next; |
|---|
| 2021 | + return -ENOMEM; |
|---|
| 2022 | + } |
|---|
| 2023 | + |
|---|
| 2024 | + lock_page(page); |
|---|
| 2025 | + wait_on_page_writeback(page); |
|---|
| 2026 | + ret = unuse_pte(vma, pmd, addr, entry, page); |
|---|
| 2027 | + if (ret < 0) { |
|---|
| 2028 | + unlock_page(page); |
|---|
| 2029 | + put_page(page); |
|---|
| 2030 | + goto out; |
|---|
| 2031 | + } |
|---|
| 2032 | + |
|---|
| 2033 | + try_to_free_swap(page); |
|---|
| 2034 | + trace_android_vh_unuse_swap_page(si, page); |
|---|
| 2035 | + unlock_page(page); |
|---|
| 2036 | + put_page(page); |
|---|
| 2037 | + |
|---|
| 2038 | + if (*fs_pages_to_unuse && !--(*fs_pages_to_unuse)) { |
|---|
| 2039 | + ret = FRONTSWAP_PAGES_UNUSED; |
|---|
| 2040 | + goto out; |
|---|
| 2041 | + } |
|---|
| 2042 | +try_next: |
|---|
| 2043 | + pte = pte_offset_map(pmd, addr); |
|---|
| 1853 | 2044 | } while (pte++, addr += PAGE_SIZE, addr != end); |
|---|
| 1854 | 2045 | pte_unmap(pte - 1); |
|---|
| 2046 | + |
|---|
| 2047 | + ret = 0; |
|---|
| 1855 | 2048 | out: |
|---|
| 1856 | 2049 | return ret; |
|---|
| 1857 | 2050 | } |
|---|
| 1858 | 2051 | |
|---|
| 1859 | 2052 | static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, |
|---|
| 1860 | 2053 | unsigned long addr, unsigned long end, |
|---|
| 1861 | | - swp_entry_t entry, struct page *page) |
|---|
| 2054 | + unsigned int type, bool frontswap, |
|---|
| 2055 | + unsigned long *fs_pages_to_unuse) |
|---|
| 1862 | 2056 | { |
|---|
| 1863 | 2057 | pmd_t *pmd; |
|---|
| 1864 | 2058 | unsigned long next; |
|---|
| .. | .. |
|---|
| 1870 | 2064 | next = pmd_addr_end(addr, end); |
|---|
| 1871 | 2065 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) |
|---|
| 1872 | 2066 | continue; |
|---|
| 1873 | | - ret = unuse_pte_range(vma, pmd, addr, next, entry, page); |
|---|
| 2067 | + ret = unuse_pte_range(vma, pmd, addr, next, type, |
|---|
| 2068 | + frontswap, fs_pages_to_unuse); |
|---|
| 1874 | 2069 | if (ret) |
|---|
| 1875 | 2070 | return ret; |
|---|
| 1876 | 2071 | } while (pmd++, addr = next, addr != end); |
|---|
| .. | .. |
|---|
| 1879 | 2074 | |
|---|
| 1880 | 2075 | static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d, |
|---|
| 1881 | 2076 | unsigned long addr, unsigned long end, |
|---|
| 1882 | | - swp_entry_t entry, struct page *page) |
|---|
| 2077 | + unsigned int type, bool frontswap, |
|---|
| 2078 | + unsigned long *fs_pages_to_unuse) |
|---|
| 1883 | 2079 | { |
|---|
| 1884 | 2080 | pud_t *pud; |
|---|
| 1885 | 2081 | unsigned long next; |
|---|
| .. | .. |
|---|
| 1890 | 2086 | next = pud_addr_end(addr, end); |
|---|
| 1891 | 2087 | if (pud_none_or_clear_bad(pud)) |
|---|
| 1892 | 2088 | continue; |
|---|
| 1893 | | - ret = unuse_pmd_range(vma, pud, addr, next, entry, page); |
|---|
| 2089 | + ret = unuse_pmd_range(vma, pud, addr, next, type, |
|---|
| 2090 | + frontswap, fs_pages_to_unuse); |
|---|
| 1894 | 2091 | if (ret) |
|---|
| 1895 | 2092 | return ret; |
|---|
| 1896 | 2093 | } while (pud++, addr = next, addr != end); |
|---|
| .. | .. |
|---|
| 1899 | 2096 | |
|---|
| 1900 | 2097 | static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd, |
|---|
| 1901 | 2098 | unsigned long addr, unsigned long end, |
|---|
| 1902 | | - swp_entry_t entry, struct page *page) |
|---|
| 2099 | + unsigned int type, bool frontswap, |
|---|
| 2100 | + unsigned long *fs_pages_to_unuse) |
|---|
| 1903 | 2101 | { |
|---|
| 1904 | 2102 | p4d_t *p4d; |
|---|
| 1905 | 2103 | unsigned long next; |
|---|
| .. | .. |
|---|
| 1910 | 2108 | next = p4d_addr_end(addr, end); |
|---|
| 1911 | 2109 | if (p4d_none_or_clear_bad(p4d)) |
|---|
| 1912 | 2110 | continue; |
|---|
| 1913 | | - ret = unuse_pud_range(vma, p4d, addr, next, entry, page); |
|---|
| 2111 | + ret = unuse_pud_range(vma, p4d, addr, next, type, |
|---|
| 2112 | + frontswap, fs_pages_to_unuse); |
|---|
| 1914 | 2113 | if (ret) |
|---|
| 1915 | 2114 | return ret; |
|---|
| 1916 | 2115 | } while (p4d++, addr = next, addr != end); |
|---|
| 1917 | 2116 | return 0; |
|---|
| 1918 | 2117 | } |
|---|
| 1919 | 2118 | |
|---|
| 1920 | | -static int unuse_vma(struct vm_area_struct *vma, |
|---|
| 1921 | | - swp_entry_t entry, struct page *page) |
|---|
| 2119 | +static int unuse_vma(struct vm_area_struct *vma, unsigned int type, |
|---|
| 2120 | + bool frontswap, unsigned long *fs_pages_to_unuse) |
|---|
| 1922 | 2121 | { |
|---|
| 1923 | 2122 | pgd_t *pgd; |
|---|
| 1924 | 2123 | unsigned long addr, end, next; |
|---|
| 1925 | 2124 | int ret; |
|---|
| 1926 | 2125 | |
|---|
| 1927 | | - if (page_anon_vma(page)) { |
|---|
| 1928 | | - addr = page_address_in_vma(page, vma); |
|---|
| 1929 | | - if (addr == -EFAULT) |
|---|
| 1930 | | - return 0; |
|---|
| 1931 | | - else |
|---|
| 1932 | | - end = addr + PAGE_SIZE; |
|---|
| 1933 | | - } else { |
|---|
| 1934 | | - addr = vma->vm_start; |
|---|
| 1935 | | - end = vma->vm_end; |
|---|
| 1936 | | - } |
|---|
| 2126 | + addr = vma->vm_start; |
|---|
| 2127 | + end = vma->vm_end; |
|---|
| 1937 | 2128 | |
|---|
| 1938 | 2129 | pgd = pgd_offset(vma->vm_mm, addr); |
|---|
| 1939 | 2130 | do { |
|---|
| 1940 | 2131 | next = pgd_addr_end(addr, end); |
|---|
| 1941 | 2132 | if (pgd_none_or_clear_bad(pgd)) |
|---|
| 1942 | 2133 | continue; |
|---|
| 1943 | | - ret = unuse_p4d_range(vma, pgd, addr, next, entry, page); |
|---|
| 2134 | + ret = unuse_p4d_range(vma, pgd, addr, next, type, |
|---|
| 2135 | + frontswap, fs_pages_to_unuse); |
|---|
| 1944 | 2136 | if (ret) |
|---|
| 1945 | 2137 | return ret; |
|---|
| 1946 | 2138 | } while (pgd++, addr = next, addr != end); |
|---|
| 1947 | 2139 | return 0; |
|---|
| 1948 | 2140 | } |
|---|
| 1949 | 2141 | |
|---|
| 1950 | | -static int unuse_mm(struct mm_struct *mm, |
|---|
| 1951 | | - swp_entry_t entry, struct page *page) |
|---|
| 2142 | +static int unuse_mm(struct mm_struct *mm, unsigned int type, |
|---|
| 2143 | + bool frontswap, unsigned long *fs_pages_to_unuse) |
|---|
| 1952 | 2144 | { |
|---|
| 1953 | 2145 | struct vm_area_struct *vma; |
|---|
| 1954 | 2146 | int ret = 0; |
|---|
| 1955 | 2147 | |
|---|
| 1956 | | - if (!down_read_trylock(&mm->mmap_sem)) { |
|---|
| 1957 | | - /* |
|---|
| 1958 | | - * Activate page so shrink_inactive_list is unlikely to unmap |
|---|
| 1959 | | - * its ptes while lock is dropped, so swapoff can make progress. |
|---|
| 1960 | | - */ |
|---|
| 1961 | | - activate_page(page); |
|---|
| 1962 | | - unlock_page(page); |
|---|
| 1963 | | - down_read(&mm->mmap_sem); |
|---|
| 1964 | | - lock_page(page); |
|---|
| 1965 | | - } |
|---|
| 2148 | + mmap_read_lock(mm); |
|---|
| 1966 | 2149 | for (vma = mm->mmap; vma; vma = vma->vm_next) { |
|---|
| 1967 | | - if (vma->anon_vma && (ret = unuse_vma(vma, entry, page))) |
|---|
| 1968 | | - break; |
|---|
| 2150 | + if (vma->anon_vma) { |
|---|
| 2151 | + ret = unuse_vma(vma, type, frontswap, |
|---|
| 2152 | + fs_pages_to_unuse); |
|---|
| 2153 | + if (ret) |
|---|
| 2154 | + break; |
|---|
| 2155 | + } |
|---|
| 1969 | 2156 | cond_resched(); |
|---|
| 1970 | 2157 | } |
|---|
| 1971 | | - up_read(&mm->mmap_sem); |
|---|
| 1972 | | - return (ret < 0)? ret: 0; |
|---|
| 2158 | + mmap_read_unlock(mm); |
|---|
| 2159 | + return ret; |
|---|
| 1973 | 2160 | } |
|---|
| 1974 | 2161 | |
|---|
| 1975 | 2162 | /* |
|---|
| 1976 | 2163 | * Scan swap_map (or frontswap_map if frontswap parameter is true) |
|---|
| 1977 | | - * from current position to next entry still in use. |
|---|
| 1978 | | - * Recycle to start on reaching the end, returning 0 when empty. |
|---|
| 2164 | + * from current position to next entry still in use. Return 0 |
|---|
| 2165 | + * if there are no inuse entries after prev till end of the map. |
|---|
| 1979 | 2166 | */ |
|---|
| 1980 | 2167 | static unsigned int find_next_to_unuse(struct swap_info_struct *si, |
|---|
| 1981 | 2168 | unsigned int prev, bool frontswap) |
|---|
| 1982 | 2169 | { |
|---|
| 1983 | | - unsigned int max = si->max; |
|---|
| 1984 | | - unsigned int i = prev; |
|---|
| 2170 | + unsigned int i; |
|---|
| 1985 | 2171 | unsigned char count; |
|---|
| 1986 | 2172 | |
|---|
| 1987 | 2173 | /* |
|---|
| .. | .. |
|---|
| 1990 | 2176 | * hits are okay, and sys_swapoff() has already prevented new |
|---|
| 1991 | 2177 | * allocations from this area (while holding swap_lock). |
|---|
| 1992 | 2178 | */ |
|---|
| 1993 | | - for (;;) { |
|---|
| 1994 | | - if (++i >= max) { |
|---|
| 1995 | | - if (!prev) { |
|---|
| 1996 | | - i = 0; |
|---|
| 1997 | | - break; |
|---|
| 1998 | | - } |
|---|
| 1999 | | - /* |
|---|
| 2000 | | - * No entries in use at top of swap_map, |
|---|
| 2001 | | - * loop back to start and recheck there. |
|---|
| 2002 | | - */ |
|---|
| 2003 | | - max = prev + 1; |
|---|
| 2004 | | - prev = 0; |
|---|
| 2005 | | - i = 1; |
|---|
| 2006 | | - } |
|---|
| 2179 | + for (i = prev + 1; i < si->max; i++) { |
|---|
| 2007 | 2180 | count = READ_ONCE(si->swap_map[i]); |
|---|
| 2008 | 2181 | if (count && swap_count(count) != SWAP_MAP_BAD) |
|---|
| 2009 | 2182 | if (!frontswap || frontswap_test(si, i)) |
|---|
| .. | .. |
|---|
| 2011 | 2184 | if ((i % LATENCY_LIMIT) == 0) |
|---|
| 2012 | 2185 | cond_resched(); |
|---|
| 2013 | 2186 | } |
|---|
| 2187 | + |
|---|
| 2188 | + if (i == si->max) |
|---|
| 2189 | + i = 0; |
|---|
| 2190 | + |
|---|
| 2014 | 2191 | return i; |
|---|
| 2015 | 2192 | } |
|---|
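
Editor's note: `find_next_to_unuse()` no longer wraps around; it scans `(prev, si->max)` once and returns 0 when nothing after `prev` is still in use, which is what lets the reworked `try_to_unuse()` loop terminate. A tiny model of the new return convention follows (plain array instead of `si->swap_map`, frontswap filtering and the `SWAP_HAS_CACHE` masking omitted; `SWAP_MAP_BAD` value assumed from `linux/swap.h`).

```c
#include <stdio.h>

#define SWAP_MAP_BAD 0x3f /* assumption: value from linux/swap.h */

/* Model of the simplified find_next_to_unuse(): scan (prev, max), 0 means "none left". */
static unsigned int next_to_unuse(const unsigned char *map, unsigned int max,
				  unsigned int prev)
{
	unsigned int i;

	for (i = prev + 1; i < max; i++)
		if (map[i] && map[i] != SWAP_MAP_BAD)
			return i;
	return 0;
}

int main(void)
{
	unsigned char map[8] = { 0, 0, 1, 0, SWAP_MAP_BAD, 2, 0, 0 };

	printf("%u\n", next_to_unuse(map, 8, 0)); /* 2 */
	printf("%u\n", next_to_unuse(map, 8, 2)); /* 5: slot 4 is SWAP_MAP_BAD */
	printf("%u\n", next_to_unuse(map, 8, 5)); /* 0: nothing in use after 5 */
	return 0;
}
```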
| 2016 | 2193 | |
|---|
| 2017 | 2194 | /* |
|---|
| 2018 | | - * We completely avoid races by reading each swap page in advance, |
|---|
| 2019 | | - * and then search for the process using it. All the necessary |
|---|
| 2020 | | - * page table adjustments can then be made atomically. |
|---|
| 2021 | | - * |
|---|
| 2022 | | - * if the boolean frontswap is true, only unuse pages_to_unuse pages; |
|---|
| 2195 | + * If the boolean frontswap is true, only unuse pages_to_unuse pages; |
|---|
| 2023 | 2196 | * pages_to_unuse==0 means all pages; ignored if frontswap is false |
|---|
| 2024 | 2197 | */ |
|---|
| 2025 | 2198 | int try_to_unuse(unsigned int type, bool frontswap, |
|---|
| 2026 | 2199 | unsigned long pages_to_unuse) |
|---|
| 2027 | 2200 | { |
|---|
| 2201 | + struct mm_struct *prev_mm; |
|---|
| 2202 | + struct mm_struct *mm; |
|---|
| 2203 | + struct list_head *p; |
|---|
| 2204 | + int retval = 0; |
|---|
| 2028 | 2205 | struct swap_info_struct *si = swap_info[type]; |
|---|
| 2029 | | - struct mm_struct *start_mm; |
|---|
| 2030 | | - volatile unsigned char *swap_map; /* swap_map is accessed without |
|---|
| 2031 | | - * locking. Mark it as volatile |
|---|
| 2032 | | - * to prevent compiler doing |
|---|
| 2033 | | - * something odd. |
|---|
| 2034 | | - */ |
|---|
| 2035 | | - unsigned char swcount; |
|---|
| 2036 | 2206 | struct page *page; |
|---|
| 2037 | 2207 | swp_entry_t entry; |
|---|
| 2038 | | - unsigned int i = 0; |
|---|
| 2039 | | - int retval = 0; |
|---|
| 2208 | + unsigned int i; |
|---|
| 2040 | 2209 | |
|---|
| 2041 | | - /* |
|---|
| 2042 | | - * When searching mms for an entry, a good strategy is to |
|---|
| 2043 | | - * start at the first mm we freed the previous entry from |
|---|
| 2044 | | - * (though actually we don't notice whether we or coincidence |
|---|
| 2045 | | - * freed the entry). Initialize this start_mm with a hold. |
|---|
| 2046 | | - * |
|---|
| 2047 | | - * A simpler strategy would be to start at the last mm we |
|---|
| 2048 | | - * freed the previous entry from; but that would take less |
|---|
| 2049 | | - * advantage of mmlist ordering, which clusters forked mms |
|---|
| 2050 | | - * together, child after parent. If we race with dup_mmap(), we |
|---|
| 2051 | | - * prefer to resolve parent before child, lest we miss entries |
|---|
| 2052 | | - * duplicated after we scanned child: using last mm would invert |
|---|
| 2053 | | - * that. |
|---|
| 2054 | | - */ |
|---|
| 2055 | | - start_mm = &init_mm; |
|---|
| 2056 | | - mmget(&init_mm); |
|---|
| 2210 | + if (!READ_ONCE(si->inuse_pages)) |
|---|
| 2211 | + return 0; |
|---|
| 2057 | 2212 | |
|---|
| 2058 | | - /* |
|---|
| 2059 | | - * Keep on scanning until all entries have gone. Usually, |
|---|
| 2060 | | - * one pass through swap_map is enough, but not necessarily: |
|---|
| 2061 | | - * there are races when an instance of an entry might be missed. |
|---|
| 2062 | | - */ |
|---|
| 2063 | | - while ((i = find_next_to_unuse(si, i, frontswap)) != 0) { |
|---|
| 2064 | | - if (signal_pending(current)) { |
|---|
| 2065 | | - retval = -EINTR; |
|---|
| 2066 | | - break; |
|---|
| 2067 | | - } |
|---|
| 2213 | + if (!frontswap) |
|---|
| 2214 | + pages_to_unuse = 0; |
|---|
| 2068 | 2215 | |
|---|
| 2069 | | - /* |
|---|
| 2070 | | - * Get a page for the entry, using the existing swap |
|---|
| 2071 | | - * cache page if there is one. Otherwise, get a clean |
|---|
| 2072 | | - * page and read the swap into it. |
|---|
| 2073 | | - */ |
|---|
| 2074 | | - swap_map = &si->swap_map[i]; |
|---|
| 2075 | | - entry = swp_entry(type, i); |
|---|
| 2076 | | - page = read_swap_cache_async(entry, |
|---|
| 2077 | | - GFP_HIGHUSER_MOVABLE, NULL, 0, false); |
|---|
| 2078 | | - if (!page) { |
|---|
| 2079 | | - /* |
|---|
| 2080 | | - * Either swap_duplicate() failed because entry |
|---|
| 2081 | | - * has been freed independently, and will not be |
|---|
| 2082 | | - * reused since sys_swapoff() already disabled |
|---|
| 2083 | | - * allocation from here, or alloc_page() failed. |
|---|
| 2084 | | - */ |
|---|
| 2085 | | - swcount = *swap_map; |
|---|
| 2086 | | - /* |
|---|
| 2087 | | - * We don't hold lock here, so the swap entry could be |
|---|
| 2088 | | - * SWAP_MAP_BAD (when the cluster is discarding). |
|---|
| 2089 | | - * Instead of fail out, We can just skip the swap |
|---|
| 2090 | | - * entry because swapoff will wait for discarding |
|---|
| 2091 | | - * finish anyway. |
|---|
| 2092 | | - */ |
|---|
| 2093 | | - if (!swcount || swcount == SWAP_MAP_BAD) |
|---|
| 2094 | | - continue; |
|---|
| 2095 | | - retval = -ENOMEM; |
|---|
| 2096 | | - break; |
|---|
| 2097 | | - } |
|---|
| 2216 | +retry: |
|---|
| 2217 | + retval = shmem_unuse(type, frontswap, &pages_to_unuse); |
|---|
| 2218 | + if (retval) |
|---|
| 2219 | + goto out; |
|---|
| 2098 | 2220 | |
|---|
| 2099 | | - /* |
|---|
| 2100 | | - * Don't hold on to start_mm if it looks like exiting. |
|---|
| 2101 | | - */ |
|---|
| 2102 | | - if (atomic_read(&start_mm->mm_users) == 1) { |
|---|
| 2103 | | - mmput(start_mm); |
|---|
| 2104 | | - start_mm = &init_mm; |
|---|
| 2105 | | - mmget(&init_mm); |
|---|
| 2106 | | - } |
|---|
| 2221 | + prev_mm = &init_mm; |
|---|
| 2222 | + mmget(prev_mm); |
|---|
| 2107 | 2223 | |
|---|
| 2108 | | - /* |
|---|
| 2109 | | - * Wait for and lock page. When do_swap_page races with |
|---|
| 2110 | | - * try_to_unuse, do_swap_page can handle the fault much |
|---|
| 2111 | | - * faster than try_to_unuse can locate the entry. This |
|---|
| 2112 | | - * apparently redundant "wait_on_page_locked" lets try_to_unuse |
|---|
| 2113 | | - * defer to do_swap_page in such a case - in some tests, |
|---|
| 2114 | | - * do_swap_page and try_to_unuse repeatedly compete. |
|---|
| 2115 | | - */ |
|---|
| 2116 | | - wait_on_page_locked(page); |
|---|
| 2117 | | - wait_on_page_writeback(page); |
|---|
| 2118 | | - lock_page(page); |
|---|
| 2119 | | - wait_on_page_writeback(page); |
|---|
| 2224 | + spin_lock(&mmlist_lock); |
|---|
| 2225 | + p = &init_mm.mmlist; |
|---|
| 2226 | + while (READ_ONCE(si->inuse_pages) && |
|---|
| 2227 | + !signal_pending(current) && |
|---|
| 2228 | + (p = p->next) != &init_mm.mmlist) { |
|---|
| 2120 | 2229 | |
|---|
| 2121 | | - /* |
|---|
| 2122 | | - * Remove all references to entry. |
|---|
| 2123 | | - */ |
|---|
| 2124 | | - swcount = *swap_map; |
|---|
| 2125 | | - if (swap_count(swcount) == SWAP_MAP_SHMEM) { |
|---|
| 2126 | | - retval = shmem_unuse(entry, page); |
|---|
| 2127 | | - /* page has already been unlocked and released */ |
|---|
| 2128 | | - if (retval < 0) |
|---|
| 2129 | | - break; |
|---|
| 2230 | + mm = list_entry(p, struct mm_struct, mmlist); |
|---|
| 2231 | + if (!mmget_not_zero(mm)) |
|---|
| 2130 | 2232 | continue; |
|---|
| 2131 | | - } |
|---|
| 2132 | | - if (swap_count(swcount) && start_mm != &init_mm) |
|---|
| 2133 | | - retval = unuse_mm(start_mm, entry, page); |
|---|
| 2233 | + spin_unlock(&mmlist_lock); |
|---|
| 2234 | + mmput(prev_mm); |
|---|
| 2235 | + prev_mm = mm; |
|---|
| 2236 | + retval = unuse_mm(mm, type, frontswap, &pages_to_unuse); |
|---|
| 2134 | 2237 | |
|---|
| 2135 | | - if (swap_count(*swap_map)) { |
|---|
| 2136 | | - int set_start_mm = (*swap_map >= swcount); |
|---|
| 2137 | | - struct list_head *p = &start_mm->mmlist; |
|---|
| 2138 | | - struct mm_struct *new_start_mm = start_mm; |
|---|
| 2139 | | - struct mm_struct *prev_mm = start_mm; |
|---|
| 2140 | | - struct mm_struct *mm; |
|---|
| 2141 | | - |
|---|
| 2142 | | - mmget(new_start_mm); |
|---|
| 2143 | | - mmget(prev_mm); |
|---|
| 2144 | | - spin_lock(&mmlist_lock); |
|---|
| 2145 | | - while (swap_count(*swap_map) && !retval && |
|---|
| 2146 | | - (p = p->next) != &start_mm->mmlist) { |
|---|
| 2147 | | - mm = list_entry(p, struct mm_struct, mmlist); |
|---|
| 2148 | | - if (!mmget_not_zero(mm)) |
|---|
| 2149 | | - continue; |
|---|
| 2150 | | - spin_unlock(&mmlist_lock); |
|---|
| 2151 | | - mmput(prev_mm); |
|---|
| 2152 | | - prev_mm = mm; |
|---|
| 2153 | | - |
|---|
| 2154 | | - cond_resched(); |
|---|
| 2155 | | - |
|---|
| 2156 | | - swcount = *swap_map; |
|---|
| 2157 | | - if (!swap_count(swcount)) /* any usage ? */ |
|---|
| 2158 | | - ; |
|---|
| 2159 | | - else if (mm == &init_mm) |
|---|
| 2160 | | - set_start_mm = 1; |
|---|
| 2161 | | - else |
|---|
| 2162 | | - retval = unuse_mm(mm, entry, page); |
|---|
| 2163 | | - |
|---|
| 2164 | | - if (set_start_mm && *swap_map < swcount) { |
|---|
| 2165 | | - mmput(new_start_mm); |
|---|
| 2166 | | - mmget(mm); |
|---|
| 2167 | | - new_start_mm = mm; |
|---|
| 2168 | | - set_start_mm = 0; |
|---|
| 2169 | | - } |
|---|
| 2170 | | - spin_lock(&mmlist_lock); |
|---|
| 2171 | | - } |
|---|
| 2172 | | - spin_unlock(&mmlist_lock); |
|---|
| 2173 | | - mmput(prev_mm); |
|---|
| 2174 | | - mmput(start_mm); |
|---|
| 2175 | | - start_mm = new_start_mm; |
|---|
| 2176 | | - } |
|---|
| 2177 | 2238 | if (retval) { |
|---|
| 2178 | | - unlock_page(page); |
|---|
| 2179 | | - put_page(page); |
|---|
| 2180 | | - break; |
|---|
| 2239 | + mmput(prev_mm); |
|---|
| 2240 | + goto out; |
|---|
| 2181 | 2241 | } |
|---|
| 2182 | | - |
|---|
| 2183 | | - /* |
|---|
| 2184 | | - * If a reference remains (rare), we would like to leave |
|---|
| 2185 | | - * the page in the swap cache; but try_to_unmap could |
|---|
| 2186 | | - * then re-duplicate the entry once we drop page lock, |
|---|
| 2187 | | - * so we might loop indefinitely; also, that page could |
|---|
| 2188 | | - * not be swapped out to other storage meanwhile. So: |
|---|
| 2189 | | - * delete from cache even if there's another reference, |
|---|
| 2190 | | - * after ensuring that the data has been saved to disk - |
|---|
| 2191 | | - * since if the reference remains (rarer), it will be |
|---|
| 2192 | | - * read from disk into another page. Splitting into two |
|---|
| 2193 | | - * pages would be incorrect if swap supported "shared |
|---|
| 2194 | | - * private" pages, but they are handled by tmpfs files. |
|---|
| 2195 | | - * |
|---|
| 2196 | | - * Given how unuse_vma() targets one particular offset |
|---|
| 2197 | | - * in an anon_vma, once the anon_vma has been determined, |
|---|
| 2198 | | - * this splitting happens to be just what is needed to |
|---|
| 2199 | | - * handle where KSM pages have been swapped out: re-reading |
|---|
| 2200 | | - * is unnecessarily slow, but we can fix that later on. |
|---|
| 2201 | | - */ |
|---|
| 2202 | | - if (swap_count(*swap_map) && |
|---|
| 2203 | | - PageDirty(page) && PageSwapCache(page)) { |
|---|
| 2204 | | - struct writeback_control wbc = { |
|---|
| 2205 | | - .sync_mode = WB_SYNC_NONE, |
|---|
| 2206 | | - }; |
|---|
| 2207 | | - |
|---|
| 2208 | | - swap_writepage(compound_head(page), &wbc); |
|---|
| 2209 | | - lock_page(page); |
|---|
| 2210 | | - wait_on_page_writeback(page); |
|---|
| 2211 | | - } |
|---|
| 2212 | | - |
|---|
| 2213 | | - /* |
|---|
| 2214 | | - * It is conceivable that a racing task removed this page from |
|---|
| 2215 | | - * swap cache just before we acquired the page lock at the top, |
|---|
| 2216 | | - * or while we dropped it in unuse_mm(). The page might even |
|---|
| 2217 | | - * be back in swap cache on another swap area: that we must not |
|---|
| 2218 | | - * delete, since it may not have been written out to swap yet. |
|---|
| 2219 | | - */ |
|---|
| 2220 | | - if (PageSwapCache(page) && |
|---|
| 2221 | | - likely(page_private(page) == entry.val) && |
|---|
| 2222 | | - (!PageTransCompound(page) || |
|---|
| 2223 | | - !swap_page_trans_huge_swapped(si, entry))) |
|---|
| 2224 | | - delete_from_swap_cache(compound_head(page)); |
|---|
| 2225 | | - |
|---|
| 2226 | | - /* |
|---|
| 2227 | | - * So we could skip searching mms once swap count went |
|---|
| 2228 | | - * to 1, we did not mark any present ptes as dirty: must |
|---|
| 2229 | | - * mark page dirty so shrink_page_list will preserve it. |
|---|
| 2230 | | - */ |
|---|
| 2231 | | - SetPageDirty(page); |
|---|
| 2232 | | - unlock_page(page); |
|---|
| 2233 | | - put_page(page); |
|---|
| 2234 | 2242 | |
|---|
| 2235 | 2243 | /* |
|---|
| 2236 | 2244 | * Make sure that we aren't completely killing |
|---|
| 2237 | 2245 | * interactive performance. |
|---|
| 2238 | 2246 | */ |
|---|
| 2239 | 2247 | cond_resched(); |
|---|
| 2240 | | - if (frontswap && pages_to_unuse > 0) { |
|---|
| 2241 | | - if (!--pages_to_unuse) |
|---|
| 2242 | | - break; |
|---|
| 2243 | | - } |
|---|
| 2248 | + spin_lock(&mmlist_lock); |
|---|
| 2249 | + } |
|---|
| 2250 | + spin_unlock(&mmlist_lock); |
|---|
| 2251 | + |
|---|
| 2252 | + mmput(prev_mm); |
|---|
| 2253 | + |
|---|
| 2254 | + i = 0; |
|---|
| 2255 | + while (READ_ONCE(si->inuse_pages) && |
|---|
| 2256 | + !signal_pending(current) && |
|---|
| 2257 | + (i = find_next_to_unuse(si, i, frontswap)) != 0) { |
|---|
| 2258 | + |
|---|
| 2259 | + entry = swp_entry(type, i); |
|---|
| 2260 | + page = find_get_page(swap_address_space(entry), i); |
|---|
| 2261 | + if (!page) |
|---|
| 2262 | + continue; |
|---|
| 2263 | + |
|---|
| 2264 | + /* |
|---|
| 2265 | + * It is conceivable that a racing task removed this page from |
|---|
| 2266 | + * swap cache just before we acquired the page lock. The page |
|---|
| 2267 | + * might even be back in swap cache on another swap area. But |
|---|
| 2268 | + * that is okay, try_to_free_swap() only removes stale pages. |
|---|
| 2269 | + */ |
|---|
| 2270 | + lock_page(page); |
|---|
| 2271 | + wait_on_page_writeback(page); |
|---|
| 2272 | + try_to_free_swap(page); |
|---|
| 2273 | + trace_android_vh_unuse_swap_page(si, page); |
|---|
| 2274 | + unlock_page(page); |
|---|
| 2275 | + put_page(page); |
|---|
| 2276 | + |
|---|
| 2277 | + /* |
|---|
| 2278 | + * For frontswap, we just need to unuse pages_to_unuse, if 
|---|
| 2279 | + * it was specified. There is no need to check frontswap again 
|---|
| 2280 | + * here, as pages_to_unuse was already zeroed if not frontswap. 
|---|
| 2281 | + */ |
|---|
| 2282 | + if (pages_to_unuse && --pages_to_unuse == 0) |
|---|
| 2283 | + goto out; |
|---|
| 2244 | 2284 | } |
|---|
| 2245 | 2285 | |
|---|
| 2246 | | - mmput(start_mm); |
|---|
| 2247 | | - return retval; |
|---|
| 2286 | + /* |
|---|
| 2287 | + * Lets check again to see if there are still swap entries in the map. |
|---|
| 2288 | + * If yes, we would need to do retry the unuse logic again. |
|---|
| 2289 | + * Under global memory pressure, swap entries can be reinserted back |
|---|
| 2290 | + * into process space after the mmlist loop above passes over them. |
|---|
| 2291 | + * |
|---|
| 2292 | + * Limit the number of retries? No: when mmget_not_zero() above fails, |
|---|
| 2293 | + * that mm is likely to be freeing swap from exit_mmap(), which proceeds |
|---|
| 2294 | + * at its own independent pace; and even shmem_writepage() could have |
|---|
| 2295 | + * been preempted after get_swap_page(), temporarily hiding that swap. |
|---|
| 2296 | + * It's easy and robust (though cpu-intensive) just to keep retrying. |
|---|
| 2297 | + */ |
|---|
| 2298 | + if (READ_ONCE(si->inuse_pages)) { |
|---|
| 2299 | + if (!signal_pending(current)) |
|---|
| 2300 | + goto retry; |
|---|
| 2301 | + retval = -EINTR; |
|---|
| 2302 | + } |
|---|
| 2303 | +out: |
|---|
| 2304 | + return (retval == FRONTSWAP_PAGES_UNUSED) ? 0 : retval; |
|---|
| 2248 | 2305 | } |
|---|
| 2249 | 2306 | |
|---|
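
The rewritten try_to_unuse() no longer reads each swap page and then hunts for its owner: it drains shmem first via shmem_unuse(), then walks the mmlist calling unuse_mm() once per mm, then sweeps whatever is still sitting in the swap cache, and simply retries the whole sequence until si->inuse_pages drops to zero or a signal arrives. A hedged, self-contained sketch of just that control-flow skeleton follows; every helper in it (drain_shmem, drain_mms, drain_swapcache, entries_in_use) is a hypothetical stub, not a kernel API.

```c
/* Editorial sketch: the three-phase drain-and-retry shape of try_to_unuse(). */
#include <stdio.h>

static int entries_in_use = 3;		/* stand-in for si->inuse_pages */

static void drain_shmem(void)     { if (entries_in_use) entries_in_use--; }
static void drain_mms(void)       { if (entries_in_use) entries_in_use--; }
static void drain_swapcache(void) { if (entries_in_use) entries_in_use--; }

static int toy_try_to_unuse(void)
{
	int passes = 0;

retry:
	passes++;
	drain_shmem();		/* 1. shmem/tmpfs pages first */
	drain_mms();		/* 2. walk every mm on the mmlist */
	drain_swapcache();	/* 3. sweep what is left in the swap cache */

	/* Entries can reappear under memory pressure: just retry. */
	if (entries_in_use)
		goto retry;

	printf("drained after %d pass(es)\n", passes);
	return 0;
}

int main(void)
{
	return toy_try_to_unuse();
}
```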
| 2250 | 2307 | /* |
|---|
| .. | .. |
|---|
| 2276 | 2333 | static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev) |
|---|
| 2277 | 2334 | { |
|---|
| 2278 | 2335 | struct swap_info_struct *sis; |
|---|
| 2279 | | - struct swap_extent *start_se; |
|---|
| 2280 | 2336 | struct swap_extent *se; |
|---|
| 2281 | 2337 | pgoff_t offset; |
|---|
| 2282 | 2338 | |
|---|
| .. | .. |
|---|
| 2284 | 2340 | *bdev = sis->bdev; |
|---|
| 2285 | 2341 | |
|---|
| 2286 | 2342 | offset = swp_offset(entry); |
|---|
| 2287 | | - start_se = sis->curr_swap_extent; |
|---|
| 2288 | | - se = start_se; |
|---|
| 2289 | | - |
|---|
| 2290 | | - for ( ; ; ) { |
|---|
| 2291 | | - if (se->start_page <= offset && |
|---|
| 2292 | | - offset < (se->start_page + se->nr_pages)) { |
|---|
| 2293 | | - return se->start_block + (offset - se->start_page); |
|---|
| 2294 | | - } |
|---|
| 2295 | | - se = list_next_entry(se, list); |
|---|
| 2296 | | - sis->curr_swap_extent = se; |
|---|
| 2297 | | - BUG_ON(se == start_se); /* It *must* be present */ |
|---|
| 2298 | | - } |
|---|
| 2343 | + se = offset_to_swap_extent(sis, offset); |
|---|
| 2344 | + return se->start_block + (offset - se->start_page); |
|---|
| 2299 | 2345 | } |
|---|
| 2300 | 2346 | |
|---|
| 2301 | 2347 | /* |
|---|
| .. | .. |
|---|
| 2305 | 2351 | { |
|---|
| 2306 | 2352 | swp_entry_t entry; |
|---|
| 2307 | 2353 | entry.val = page_private(page); |
|---|
| 2308 | | - return map_swap_entry(entry, bdev) << (PAGE_SHIFT - 9); |
|---|
| 2354 | + return map_swap_entry(entry, bdev); |
|---|
| 2309 | 2355 | } |
|---|
| 2310 | 2356 | |
|---|
| 2311 | 2357 | /* |
|---|
| .. | .. |
|---|
| 2313 | 2359 | */ |
|---|
| 2314 | 2360 | static void destroy_swap_extents(struct swap_info_struct *sis) |
|---|
| 2315 | 2361 | { |
|---|
| 2316 | | - while (!list_empty(&sis->first_swap_extent.list)) { |
|---|
| 2317 | | - struct swap_extent *se; |
|---|
| 2362 | + while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) { |
|---|
| 2363 | + struct rb_node *rb = sis->swap_extent_root.rb_node; |
|---|
| 2364 | + struct swap_extent *se = rb_entry(rb, struct swap_extent, rb_node); |
|---|
| 2318 | 2365 | |
|---|
| 2319 | | - se = list_first_entry(&sis->first_swap_extent.list, |
|---|
| 2320 | | - struct swap_extent, list); |
|---|
| 2321 | | - list_del(&se->list); |
|---|
| 2366 | + rb_erase(rb, &sis->swap_extent_root); |
|---|
| 2322 | 2367 | kfree(se); |
|---|
| 2323 | 2368 | } |
|---|
| 2324 | 2369 | |
|---|
| 2325 | | - if (sis->flags & SWP_FILE) { |
|---|
| 2370 | + if (sis->flags & SWP_ACTIVATED) { |
|---|
| 2326 | 2371 | struct file *swap_file = sis->swap_file; |
|---|
| 2327 | 2372 | struct address_space *mapping = swap_file->f_mapping; |
|---|
| 2328 | 2373 | |
|---|
| 2329 | | - sis->flags &= ~SWP_FILE; |
|---|
| 2330 | | - mapping->a_ops->swap_deactivate(swap_file); |
|---|
| 2374 | + sis->flags &= ~SWP_ACTIVATED; |
|---|
| 2375 | + if (mapping->a_ops->swap_deactivate) |
|---|
| 2376 | + mapping->a_ops->swap_deactivate(swap_file); |
|---|
| 2331 | 2377 | } |
|---|
| 2332 | 2378 | } |
|---|
| 2333 | 2379 | |
|---|
| 2334 | 2380 | /* |
|---|
| 2335 | 2381 | * Add a block range (and the corresponding page range) into this swapdev's |
|---|
| 2336 | | - * extent list. The extent list is kept sorted in page order. |
|---|
| 2382 | + * extent tree. |
|---|
| 2337 | 2383 | * |
|---|
| 2338 | 2384 | * This function rather assumes that it is called in ascending page order. |
|---|
| 2339 | 2385 | */ |
|---|
| .. | .. |
|---|
| 2341 | 2387 | add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, |
|---|
| 2342 | 2388 | unsigned long nr_pages, sector_t start_block) |
|---|
| 2343 | 2389 | { |
|---|
| 2390 | + struct rb_node **link = &sis->swap_extent_root.rb_node, *parent = NULL; |
|---|
| 2344 | 2391 | struct swap_extent *se; |
|---|
| 2345 | 2392 | struct swap_extent *new_se; |
|---|
| 2346 | | - struct list_head *lh; |
|---|
| 2347 | 2393 | |
|---|
| 2348 | | - if (start_page == 0) { |
|---|
| 2349 | | - se = &sis->first_swap_extent; |
|---|
| 2350 | | - sis->curr_swap_extent = se; |
|---|
| 2351 | | - se->start_page = 0; |
|---|
| 2352 | | - se->nr_pages = nr_pages; |
|---|
| 2353 | | - se->start_block = start_block; |
|---|
| 2354 | | - return 1; |
|---|
| 2355 | | - } else { |
|---|
| 2356 | | - lh = sis->first_swap_extent.list.prev; /* Highest extent */ |
|---|
| 2357 | | - se = list_entry(lh, struct swap_extent, list); |
|---|
| 2394 | + /* |
|---|
| 2395 | + * Place the new node at the rightmost position, since this 
|---|
| 2396 | + * function is called in ascending page order. 
|---|
| 2397 | + */ |
|---|
| 2398 | + while (*link) { |
|---|
| 2399 | + parent = *link; |
|---|
| 2400 | + link = &parent->rb_right; |
|---|
| 2401 | + } |
|---|
| 2402 | + |
|---|
| 2403 | + if (parent) { |
|---|
| 2404 | + se = rb_entry(parent, struct swap_extent, rb_node); |
|---|
| 2358 | 2405 | BUG_ON(se->start_page + se->nr_pages != start_page); |
|---|
| 2359 | 2406 | if (se->start_block + se->nr_pages == start_block) { |
|---|
| 2360 | 2407 | /* Merge it */ |
|---|
| .. | .. |
|---|
| 2363 | 2410 | } |
|---|
| 2364 | 2411 | } |
|---|
| 2365 | 2412 | |
|---|
| 2366 | | - /* |
|---|
| 2367 | | - * No merge. Insert a new extent, preserving ordering. |
|---|
| 2368 | | - */ |
|---|
| 2413 | + /* No merge, insert a new extent. */ |
|---|
| 2369 | 2414 | new_se = kmalloc(sizeof(*se), GFP_KERNEL); |
|---|
| 2370 | 2415 | if (new_se == NULL) |
|---|
| 2371 | 2416 | return -ENOMEM; |
|---|
| .. | .. |
|---|
| 2373 | 2418 | new_se->nr_pages = nr_pages; |
|---|
| 2374 | 2419 | new_se->start_block = start_block; |
|---|
| 2375 | 2420 | |
|---|
| 2376 | | - list_add_tail(&new_se->list, &sis->first_swap_extent.list); |
|---|
| 2421 | + rb_link_node(&new_se->rb_node, parent, link); |
|---|
| 2422 | + rb_insert_color(&new_se->rb_node, &sis->swap_extent_root); |
|---|
| 2377 | 2423 | return 1; |
|---|
| 2378 | 2424 | } |
|---|
| 2379 | 2425 | EXPORT_SYMBOL_GPL(add_swap_extent); |
|---|
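
add_swap_extent() now keeps extents in an rbtree, but the merge rule is unchanged: because extents arrive in ascending page order, only the last (rightmost) extent can ever merge, and it merges exactly when its blocks end where the new range's blocks begin. The sketch below (editorial, not part of the patch) shows that merge-or-append decision on a plain array; toy_extent and toy_add_extent are hypothetical names, and the real code of course links the node with rb_link_node()/rb_insert_color() instead.

```c
/* Editorial sketch: the merge-or-append rule for contiguous swap extents. */
#include <stdio.h>

struct toy_extent {
	unsigned long start_page;
	unsigned long nr_pages;
	unsigned long long start_block;
};

/* Extents arrive in ascending page order, so only the last one can merge. */
static int toy_add_extent(struct toy_extent *ext, int n,
			  unsigned long start_page, unsigned long nr_pages,
			  unsigned long long start_block)
{
	if (n && ext[n - 1].start_block + ext[n - 1].nr_pages == start_block) {
		ext[n - 1].nr_pages += nr_pages;	/* blocks contiguous: merge */
		return n;
	}
	ext[n].start_page = start_page;			/* otherwise append a new extent */
	ext[n].nr_pages = nr_pages;
	ext[n].start_block = start_block;
	return n + 1;
}

int main(void)
{
	struct toy_extent ext[4];
	int n = 0;

	n = toy_add_extent(ext, n, 0, 16, 100);		/* new extent       */
	n = toy_add_extent(ext, n, 16, 16, 116);	/* contiguous: merge */
	n = toy_add_extent(ext, n, 32, 8, 500);		/* gap on disk: new  */
	printf("%d extents, first covers %lu pages\n", n, ext[0].nr_pages);
	return 0;
}
```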
| .. | .. |
|---|
| 2423 | 2469 | |
|---|
| 2424 | 2470 | if (mapping->a_ops->swap_activate) { |
|---|
| 2425 | 2471 | ret = mapping->a_ops->swap_activate(sis, swap_file, span); |
|---|
| 2472 | + if (ret >= 0) |
|---|
| 2473 | + sis->flags |= SWP_ACTIVATED; |
|---|
| 2426 | 2474 | if (!ret) { |
|---|
| 2427 | | - sis->flags |= SWP_FILE; |
|---|
| 2475 | + sis->flags |= SWP_FS_OPS; |
|---|
| 2428 | 2476 | ret = add_swap_extent(sis, 0, sis->max, 0); |
|---|
| 2429 | 2477 | *span = sis->pages; |
|---|
| 2430 | 2478 | } |
|---|
| .. | .. |
|---|
| 2446 | 2494 | return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE; |
|---|
| 2447 | 2495 | } |
|---|
| 2448 | 2496 | |
|---|
| 2449 | | -static void _enable_swap_info(struct swap_info_struct *p, int prio, |
|---|
| 2450 | | - unsigned char *swap_map, |
|---|
| 2451 | | - struct swap_cluster_info *cluster_info) |
|---|
| 2497 | +static void setup_swap_info(struct swap_info_struct *p, int prio, |
|---|
| 2498 | + unsigned char *swap_map, |
|---|
| 2499 | + struct swap_cluster_info *cluster_info) |
|---|
| 2452 | 2500 | { |
|---|
| 2453 | 2501 | int i; |
|---|
| 2454 | 2502 | |
|---|
| .. | .. |
|---|
| 2473 | 2521 | } |
|---|
| 2474 | 2522 | p->swap_map = swap_map; |
|---|
| 2475 | 2523 | p->cluster_info = cluster_info; |
|---|
| 2476 | | - p->flags |= SWP_WRITEOK; |
|---|
| 2477 | | - atomic_long_add(p->pages, &nr_swap_pages); |
|---|
| 2478 | | - total_swap_pages += p->pages; |
|---|
| 2524 | +} |
|---|
| 2479 | 2525 | |
|---|
| 2526 | +static void _enable_swap_info(struct swap_info_struct *p) |
|---|
| 2527 | +{ |
|---|
| 2528 | + bool skip = false; |
|---|
| 2529 | + |
|---|
| 2530 | + p->flags |= SWP_WRITEOK | SWP_VALID; |
|---|
| 2531 | + trace_android_vh_account_swap_pages(p, &skip); |
|---|
| 2532 | + if (!skip) { |
|---|
| 2533 | + atomic_long_add(p->pages, &nr_swap_pages); |
|---|
| 2534 | + total_swap_pages += p->pages; |
|---|
| 2535 | + } |
|---|
| 2480 | 2536 | assert_spin_locked(&swap_lock); |
|---|
| 2481 | 2537 | /* |
|---|
| 2482 | 2538 | * both lists are plists, and thus priority ordered. |
|---|
| .. | .. |
|---|
| 2500 | 2556 | frontswap_init(p->type, frontswap_map); |
|---|
| 2501 | 2557 | spin_lock(&swap_lock); |
|---|
| 2502 | 2558 | spin_lock(&p->lock); |
|---|
| 2503 | | - _enable_swap_info(p, prio, swap_map, cluster_info); |
|---|
| 2559 | + setup_swap_info(p, prio, swap_map, cluster_info); |
|---|
| 2560 | + spin_unlock(&p->lock); |
|---|
| 2561 | + spin_unlock(&swap_lock); |
|---|
| 2562 | + /* |
|---|
| 2563 | + * Guarantee swap_map, cluster_info, etc. fields are valid |
|---|
| 2564 | + * between get/put_swap_device() if SWP_VALID bit is set |
|---|
| 2565 | + */ |
|---|
| 2566 | + synchronize_rcu(); |
|---|
| 2567 | + spin_lock(&swap_lock); |
|---|
| 2568 | + spin_lock(&p->lock); |
|---|
| 2569 | + _enable_swap_info(p); |
|---|
| 2504 | 2570 | spin_unlock(&p->lock); |
|---|
| 2505 | 2571 | spin_unlock(&swap_lock); |
|---|
| 2506 | 2572 | } |
|---|
| .. | .. |
|---|
| 2509 | 2575 | { |
|---|
| 2510 | 2576 | spin_lock(&swap_lock); |
|---|
| 2511 | 2577 | spin_lock(&p->lock); |
|---|
| 2512 | | - _enable_swap_info(p, p->prio, p->swap_map, p->cluster_info); |
|---|
| 2578 | + setup_swap_info(p, p->prio, p->swap_map, p->cluster_info); |
|---|
| 2579 | + _enable_swap_info(p); |
|---|
| 2513 | 2580 | spin_unlock(&p->lock); |
|---|
| 2514 | 2581 | spin_unlock(&swap_lock); |
|---|
| 2515 | 2582 | } |
|---|
| .. | .. |
|---|
| 2537 | 2604 | struct filename *pathname; |
|---|
| 2538 | 2605 | int err, found = 0; |
|---|
| 2539 | 2606 | unsigned int old_block_size; |
|---|
| 2607 | + bool skip = false; |
|---|
| 2540 | 2608 | |
|---|
| 2541 | 2609 | if (!capable(CAP_SYS_ADMIN)) |
|---|
| 2542 | 2610 | return -EPERM; |
|---|
| .. | .. |
|---|
| 2574 | 2642 | spin_unlock(&swap_lock); |
|---|
| 2575 | 2643 | goto out_dput; |
|---|
| 2576 | 2644 | } |
|---|
| 2577 | | - del_from_avail_list(p); |
|---|
| 2578 | 2645 | spin_lock(&p->lock); |
|---|
| 2646 | + del_from_avail_list(p); |
|---|
| 2579 | 2647 | if (p->prio < 0) { |
|---|
| 2580 | 2648 | struct swap_info_struct *si = p; |
|---|
| 2581 | 2649 | int nid; |
|---|
| .. | .. |
|---|
| 2591 | 2659 | least_priority++; |
|---|
| 2592 | 2660 | } |
|---|
| 2593 | 2661 | plist_del(&p->list, &swap_active_head); |
|---|
| 2594 | | - atomic_long_sub(p->pages, &nr_swap_pages); |
|---|
| 2595 | | - total_swap_pages -= p->pages; |
|---|
| 2662 | + trace_android_vh_account_swap_pages(p, &skip); |
|---|
| 2663 | + if (!skip) { |
|---|
| 2664 | + atomic_long_sub(p->pages, &nr_swap_pages); |
|---|
| 2665 | + total_swap_pages -= p->pages; |
|---|
| 2666 | + } |
|---|
| 2596 | 2667 | p->flags &= ~SWP_WRITEOK; |
|---|
| 2597 | 2668 | spin_unlock(&p->lock); |
|---|
| 2598 | 2669 | spin_unlock(&swap_lock); |
|---|
| .. | .. |
|---|
| 2611 | 2682 | } |
|---|
| 2612 | 2683 | |
|---|
| 2613 | 2684 | reenable_swap_slots_cache_unlock(); |
|---|
| 2685 | + |
|---|
| 2686 | + spin_lock(&swap_lock); |
|---|
| 2687 | + spin_lock(&p->lock); |
|---|
| 2688 | + p->flags &= ~SWP_VALID; /* mark swap device as invalid */ |
|---|
| 2689 | + spin_unlock(&p->lock); |
|---|
| 2690 | + spin_unlock(&swap_lock); |
|---|
| 2691 | + /* |
|---|
| 2692 | + * wait for swap operations protected by get/put_swap_device() |
|---|
| 2693 | + * to complete |
|---|
| 2694 | + */ |
|---|
| 2695 | + synchronize_rcu(); |
|---|
| 2614 | 2696 | |
|---|
| 2615 | 2697 | flush_work(&p->discard_work); |
|---|
| 2616 | 2698 | |
|---|
| .. | .. |
|---|
| 2647 | 2729 | frontswap_map = frontswap_map_get(p); |
|---|
| 2648 | 2730 | spin_unlock(&p->lock); |
|---|
| 2649 | 2731 | spin_unlock(&swap_lock); |
|---|
| 2732 | + arch_swap_invalidate_area(p->type); |
|---|
| 2650 | 2733 | frontswap_invalidate_area(p->type); |
|---|
| 2651 | 2734 | frontswap_map_set(p, NULL); |
|---|
| 2652 | 2735 | mutex_unlock(&swapon_mutex); |
|---|
| 2653 | 2736 | free_percpu(p->percpu_cluster); |
|---|
| 2654 | 2737 | p->percpu_cluster = NULL; |
|---|
| 2738 | + free_percpu(p->cluster_next_cpu); |
|---|
| 2739 | + p->cluster_next_cpu = NULL; |
|---|
| 2655 | 2740 | vfree(swap_map); |
|---|
| 2656 | 2741 | kvfree(cluster_info); |
|---|
| 2657 | 2742 | kvfree(frontswap_map); |
|---|
| .. | .. |
|---|
| 2759 | 2844 | struct swap_info_struct *si = v; |
|---|
| 2760 | 2845 | struct file *file; |
|---|
| 2761 | 2846 | int len; |
|---|
| 2847 | + unsigned int bytes, inuse; |
|---|
| 2762 | 2848 | |
|---|
| 2763 | 2849 | if (si == SEQ_START_TOKEN) { |
|---|
| 2764 | | - seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); |
|---|
| 2850 | + seq_puts(swap,"Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n"); |
|---|
| 2765 | 2851 | return 0; |
|---|
| 2766 | 2852 | } |
|---|
| 2767 | 2853 | |
|---|
| 2854 | + bytes = si->pages << (PAGE_SHIFT - 10); |
|---|
| 2855 | + inuse = si->inuse_pages << (PAGE_SHIFT - 10); |
|---|
| 2856 | + |
|---|
| 2768 | 2857 | file = si->swap_file; |
|---|
| 2769 | 2858 | len = seq_file_path(swap, file, " \t\n\\"); |
|---|
| 2770 | | - seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", |
|---|
| 2859 | + seq_printf(swap, "%*s%s\t%u\t%s%u\t%s%d\n", |
|---|
| 2771 | 2860 | len < 40 ? 40 - len : 1, " ", |
|---|
| 2772 | 2861 | S_ISBLK(file_inode(file)->i_mode) ? |
|---|
| 2773 | 2862 | "partition" : "file\t", |
|---|
| 2774 | | - si->pages << (PAGE_SHIFT - 10), |
|---|
| 2775 | | - si->inuse_pages << (PAGE_SHIFT - 10), |
|---|
| 2863 | + bytes, bytes < 10000000 ? "\t" : "", |
|---|
| 2864 | + inuse, inuse < 10000000 ? "\t" : "", |
|---|
| 2776 | 2865 | si->prio); |
|---|
| 2777 | 2866 | return 0; |
|---|
| 2778 | 2867 | } |
|---|
| .. | .. |
|---|
| 2798 | 2887 | return 0; |
|---|
| 2799 | 2888 | } |
|---|
| 2800 | 2889 | |
|---|
| 2801 | | -static const struct file_operations proc_swaps_operations = { |
|---|
| 2802 | | - .open = swaps_open, |
|---|
| 2803 | | - .read = seq_read, |
|---|
| 2804 | | - .llseek = seq_lseek, |
|---|
| 2805 | | - .release = seq_release, |
|---|
| 2806 | | - .poll = swaps_poll, |
|---|
| 2890 | +static const struct proc_ops swaps_proc_ops = { |
|---|
| 2891 | + .proc_flags = PROC_ENTRY_PERMANENT, |
|---|
| 2892 | + .proc_open = swaps_open, |
|---|
| 2893 | + .proc_read = seq_read, |
|---|
| 2894 | + .proc_lseek = seq_lseek, |
|---|
| 2895 | + .proc_release = seq_release, |
|---|
| 2896 | + .proc_poll = swaps_poll, |
|---|
| 2807 | 2897 | }; |
|---|
| 2808 | 2898 | |
|---|
| 2809 | 2899 | static int __init procswaps_init(void) |
|---|
| 2810 | 2900 | { |
|---|
| 2811 | | - proc_create("swaps", 0, NULL, &proc_swaps_operations); |
|---|
| 2901 | + proc_create("swaps", 0, NULL, &swaps_proc_ops); |
|---|
| 2812 | 2902 | return 0; |
|---|
| 2813 | 2903 | } |
|---|
| 2814 | 2904 | __initcall(procswaps_init); |
|---|
| .. | .. |
|---|
| 2825 | 2915 | |
|---|
| 2826 | 2916 | static struct swap_info_struct *alloc_swap_info(void) |
|---|
| 2827 | 2917 | { |
|---|
| 2828 | | - struct swap_info_struct *p; |
|---|
| 2918 | + struct swap_info_struct *p = NULL; |
|---|
| 2829 | 2919 | struct swap_info_struct *defer = NULL; |
|---|
| 2830 | 2920 | unsigned int type; |
|---|
| 2831 | 2921 | int i; |
|---|
| 2832 | | - int size = sizeof(*p) + nr_node_ids * sizeof(struct plist_node); |
|---|
| 2922 | + bool skip = false; |
|---|
| 2833 | 2923 | |
|---|
| 2834 | | - p = kvzalloc(size, GFP_KERNEL); |
|---|
| 2924 | + trace_android_rvh_alloc_si(&p, &skip); |
|---|
| 2925 | + trace_android_vh_alloc_si(&p, &skip); |
|---|
| 2926 | + if (!skip) |
|---|
| 2927 | + p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL); |
|---|
| 2835 | 2928 | if (!p) |
|---|
| 2836 | 2929 | return ERR_PTR(-ENOMEM); |
|---|
| 2837 | 2930 | |
|---|
| .. | .. |
|---|
| 2863 | 2956 | * would be relying on p->type to remain valid. |
|---|
| 2864 | 2957 | */ |
|---|
| 2865 | 2958 | } |
|---|
| 2866 | | - INIT_LIST_HEAD(&p->first_swap_extent.list); |
|---|
| 2959 | + p->swap_extent_root = RB_ROOT; |
|---|
| 2867 | 2960 | plist_node_init(&p->list, 0); |
|---|
| 2868 | 2961 | for_each_node(i) |
|---|
| 2869 | 2962 | plist_node_init(&p->avail_lists[i], 0); |
|---|
| .. | .. |
|---|
| 2881 | 2974 | int error; |
|---|
| 2882 | 2975 | |
|---|
| 2883 | 2976 | if (S_ISBLK(inode->i_mode)) { |
|---|
| 2884 | | - p->bdev = bdgrab(I_BDEV(inode)); |
|---|
| 2885 | | - error = blkdev_get(p->bdev, |
|---|
| 2977 | + p->bdev = blkdev_get_by_dev(inode->i_rdev, |
|---|
| 2886 | 2978 | FMODE_READ | FMODE_WRITE | FMODE_EXCL, p); |
|---|
| 2887 | | - if (error < 0) { |
|---|
| 2979 | + if (IS_ERR(p->bdev)) { |
|---|
| 2980 | + error = PTR_ERR(p->bdev); |
|---|
| 2888 | 2981 | p->bdev = NULL; |
|---|
| 2889 | 2982 | return error; |
|---|
| 2890 | 2983 | } |
|---|
| .. | .. |
|---|
| 2892 | 2985 | error = set_blocksize(p->bdev, PAGE_SIZE); |
|---|
| 2893 | 2986 | if (error < 0) |
|---|
| 2894 | 2987 | return error; |
|---|
| 2988 | + /* |
|---|
| 2989 | + * Zoned block devices contain zones that have a sequential |
|---|
| 2990 | + * write only restriction. Hence zoned block devices are not |
|---|
| 2991 | + * suitable for swapping. Disallow them here. |
|---|
| 2992 | + */ |
|---|
| 2993 | + if (blk_queue_is_zoned(p->bdev->bd_disk->queue)) |
|---|
| 2994 | + return -EINVAL; |
|---|
| 2895 | 2995 | p->flags |= SWP_BLKDEV; |
|---|
| 2896 | 2996 | } else if (S_ISREG(inode->i_mode)) { |
|---|
| 2897 | 2997 | p->bdev = inode->i_sb->s_bdev; |
|---|
| .. | .. |
|---|
| 3188 | 3288 | goto bad_swap_unlock_inode; |
|---|
| 3189 | 3289 | } |
|---|
| 3190 | 3290 | |
|---|
| 3191 | | - if (bdi_cap_stable_pages_required(inode_to_bdi(inode))) |
|---|
| 3291 | + if (p->bdev && blk_queue_stable_writes(p->bdev->bd_disk->queue)) |
|---|
| 3192 | 3292 | p->flags |= SWP_STABLE_WRITES; |
|---|
| 3193 | 3293 | |
|---|
| 3194 | | - if (bdi_cap_synchronous_io(inode_to_bdi(inode))) |
|---|
| 3294 | + if (p->bdev && p->bdev->bd_disk->fops->rw_page) |
|---|
| 3195 | 3295 | p->flags |= SWP_SYNCHRONOUS_IO; |
|---|
| 3196 | 3296 | |
|---|
| 3197 | 3297 | if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) { |
|---|
| .. | .. |
|---|
| 3199 | 3299 | unsigned long ci, nr_cluster; |
|---|
| 3200 | 3300 | |
|---|
| 3201 | 3301 | p->flags |= SWP_SOLIDSTATE; |
|---|
| 3302 | + p->cluster_next_cpu = alloc_percpu(unsigned int); |
|---|
| 3303 | + if (!p->cluster_next_cpu) { |
|---|
| 3304 | + error = -ENOMEM; |
|---|
| 3305 | + goto bad_swap_unlock_inode; |
|---|
| 3306 | + } |
|---|
| 3202 | 3307 | /* |
|---|
| 3203 | 3308 | * select a random position to start with to help wear leveling |
|---|
| 3204 | 3309 | * SSD |
|---|
| 3205 | 3310 | */ |
|---|
| 3206 | | - p->cluster_next = 1 + (prandom_u32() % p->highest_bit); |
|---|
| 3311 | + for_each_possible_cpu(cpu) { |
|---|
| 3312 | + per_cpu(*p->cluster_next_cpu, cpu) = |
|---|
| 3313 | + 1 + prandom_u32_max(p->highest_bit); |
|---|
| 3314 | + } |
|---|
| 3207 | 3315 | nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); |
|---|
| 3208 | 3316 | |
|---|
| 3209 | 3317 | cluster_info = kvcalloc(nr_cluster, sizeof(*cluster_info), |
|---|
| .. | .. |
|---|
| 3289 | 3397 | error = inode_drain_writes(inode); |
|---|
| 3290 | 3398 | if (error) { |
|---|
| 3291 | 3399 | inode->i_flags &= ~S_SWAPFILE; |
|---|
| 3292 | | - goto bad_swap_unlock_inode; |
|---|
| 3400 | + goto free_swap_address_space; |
|---|
| 3293 | 3401 | } |
|---|
| 3294 | 3402 | |
|---|
| 3295 | 3403 | mutex_lock(&swapon_mutex); |
|---|
| .. | .. |
|---|
| 3297 | 3405 | if (swap_flags & SWAP_FLAG_PREFER) |
|---|
| 3298 | 3406 | prio = |
|---|
| 3299 | 3407 | (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; |
|---|
| 3408 | + |
|---|
| 3409 | + trace_android_vh_swap_avail_heads_init(swap_avail_heads); |
|---|
| 3300 | 3410 | enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map); |
|---|
| 3301 | 3411 | |
|---|
| 3412 | + trace_android_vh_init_swap_info_struct(p, swap_avail_heads); |
|---|
| 3302 | 3413 | pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s%s\n", |
|---|
| 3303 | 3414 | p->pages<<(PAGE_SHIFT-10), name->name, p->prio, |
|---|
| 3304 | 3415 | nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), |
|---|
| .. | .. |
|---|
| 3314 | 3425 | |
|---|
| 3315 | 3426 | error = 0; |
|---|
| 3316 | 3427 | goto out; |
|---|
| 3428 | +free_swap_address_space: |
|---|
| 3429 | + exit_swap_address_space(p->type); |
|---|
| 3317 | 3430 | bad_swap_unlock_inode: |
|---|
| 3318 | 3431 | inode_unlock(inode); |
|---|
| 3319 | 3432 | bad_swap: |
|---|
| 3320 | 3433 | free_percpu(p->percpu_cluster); |
|---|
| 3321 | 3434 | p->percpu_cluster = NULL; |
|---|
| 3435 | + free_percpu(p->cluster_next_cpu); |
|---|
| 3436 | + p->cluster_next_cpu = NULL; |
|---|
| 3322 | 3437 | if (inode && S_ISBLK(inode->i_mode) && p->bdev) { |
|---|
| 3323 | 3438 | set_blocksize(p->bdev, p->old_block_size); |
|---|
| 3324 | 3439 | blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); |
|---|
| .. | .. |
|---|
| 3359 | 3474 | spin_lock(&swap_lock); |
|---|
| 3360 | 3475 | for (type = 0; type < nr_swapfiles; type++) { |
|---|
| 3361 | 3476 | struct swap_info_struct *si = swap_info[type]; |
|---|
| 3477 | + bool skip = false; |
|---|
| 3362 | 3478 | |
|---|
| 3363 | | - if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) |
|---|
| 3479 | + trace_android_vh_si_swapinfo(si, &skip); |
|---|
| 3480 | + if (!skip && (si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) |
|---|
| 3364 | 3481 | nr_to_be_unused += si->inuse_pages; |
|---|
| 3365 | 3482 | } |
|---|
| 3366 | 3483 | val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused; |
|---|
| 3367 | 3484 | val->totalswap = total_swap_pages + nr_to_be_unused; |
|---|
| 3368 | 3485 | spin_unlock(&swap_lock); |
|---|
| 3369 | 3486 | } |
|---|
| 3487 | +EXPORT_SYMBOL_GPL(si_swapinfo); |
|---|
| 3370 | 3488 | |
|---|
| 3371 | 3489 | /* |
|---|
| 3372 | 3490 | * Verify that a swap entry is valid and increment its swap map count. |
|---|
| .. | .. |
|---|
| 3388 | 3506 | unsigned char has_cache; |
|---|
| 3389 | 3507 | int err = -EINVAL; |
|---|
| 3390 | 3508 | |
|---|
| 3391 | | - if (non_swap_entry(entry)) |
|---|
| 3392 | | - goto out; |
|---|
| 3393 | | - |
|---|
| 3394 | | - p = swp_swap_info(entry); |
|---|
| 3509 | + p = get_swap_device(entry); |
|---|
| 3395 | 3510 | if (!p) |
|---|
| 3396 | | - goto bad_file; |
|---|
| 3511 | + goto out; |
|---|
| 3397 | 3512 | |
|---|
| 3398 | 3513 | offset = swp_offset(entry); |
|---|
| 3399 | | - if (unlikely(offset >= p->max)) |
|---|
| 3400 | | - goto out; |
|---|
| 3401 | | - |
|---|
| 3402 | 3514 | ci = lock_cluster_or_swap_info(p, offset); |
|---|
| 3403 | 3515 | |
|---|
| 3404 | 3516 | count = p->swap_map[offset]; |
|---|
| .. | .. |
|---|
| 3439 | 3551 | } else |
|---|
| 3440 | 3552 | err = -ENOENT; /* unused swap entry */ |
|---|
| 3441 | 3553 | |
|---|
| 3442 | | - p->swap_map[offset] = count | has_cache; |
|---|
| 3554 | + WRITE_ONCE(p->swap_map[offset], count | has_cache); |
|---|
| 3443 | 3555 | |
|---|
| 3444 | 3556 | unlock_out: |
|---|
| 3445 | 3557 | unlock_cluster_or_swap_info(p, ci); |
|---|
| 3446 | 3558 | out: |
|---|
| 3559 | + if (p) |
|---|
| 3560 | + put_swap_device(p); |
|---|
| 3447 | 3561 | return err; |
|---|
| 3448 | | - |
|---|
| 3449 | | -bad_file: |
|---|
| 3450 | | - pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val); |
|---|
| 3451 | | - goto out; |
|---|
| 3452 | 3562 | } |
|---|
| 3453 | 3563 | |
|---|
| 3454 | 3564 | /* |
|---|
| .. | .. |
|---|
| 3481 | 3591 | * |
|---|
| 3482 | 3592 | * Called when allocating swap cache for existing swap entry, |
|---|
| 3483 | 3593 | * This can return error codes. Returns 0 at success. |
|---|
| 3484 | | - * -EBUSY means there is a swap cache. |
|---|
| 3594 | + * -EEXIST means there is a swap cache. |
|---|
| 3485 | 3595 | * Note: return code is different from swap_duplicate(). |
|---|
| 3486 | 3596 | */ |
|---|
| 3487 | 3597 | int swapcache_prepare(swp_entry_t entry) |
|---|
| .. | .. |
|---|
| 3493 | 3603 | { |
|---|
| 3494 | 3604 | return swap_type_to_swap_info(swp_type(entry)); |
|---|
| 3495 | 3605 | } |
|---|
| 3606 | +EXPORT_SYMBOL_GPL(swp_swap_info); |
|---|
| 3496 | 3607 | |
|---|
| 3497 | 3608 | struct swap_info_struct *page_swap_info(struct page *page) |
|---|
| 3498 | 3609 | { |
|---|
| .. | .. |
|---|
| 3540 | 3651 | struct page *list_page; |
|---|
| 3541 | 3652 | pgoff_t offset; |
|---|
| 3542 | 3653 | unsigned char count; |
|---|
| 3654 | + int ret = 0; |
|---|
| 3543 | 3655 | |
|---|
| 3544 | 3656 | /* |
|---|
| 3545 | 3657 | * When debugging, it's easier to use __GFP_ZERO here; but it's better |
|---|
| .. | .. |
|---|
| 3547 | 3659 | */ |
|---|
| 3548 | 3660 | page = alloc_page(gfp_mask | __GFP_HIGHMEM); |
|---|
| 3549 | 3661 | |
|---|
| 3550 | | - si = swap_info_get(entry); |
|---|
| 3662 | + si = get_swap_device(entry); |
|---|
| 3551 | 3663 | if (!si) { |
|---|
| 3552 | 3664 | /* |
|---|
| 3553 | 3665 | * An acceptable race has occurred since the failing |
|---|
| 3554 | | - * __swap_duplicate(): the swap entry has been freed, |
|---|
| 3555 | | - * perhaps even the whole swap_map cleared for swapoff. |
|---|
| 3666 | + * __swap_duplicate(): the swap device may be swapoff |
|---|
| 3556 | 3667 | */ |
|---|
| 3557 | 3668 | goto outer; |
|---|
| 3558 | 3669 | } |
|---|
| 3670 | + spin_lock(&si->lock); |
|---|
| 3559 | 3671 | |
|---|
| 3560 | 3672 | offset = swp_offset(entry); |
|---|
| 3561 | 3673 | |
|---|
| .. | .. |
|---|
| 3573 | 3685 | } |
|---|
| 3574 | 3686 | |
|---|
| 3575 | 3687 | if (!page) { |
|---|
| 3576 | | - unlock_cluster(ci); |
|---|
| 3577 | | - spin_unlock(&si->lock); |
|---|
| 3578 | | - return -ENOMEM; |
|---|
| 3688 | + ret = -ENOMEM; |
|---|
| 3689 | + goto out; |
|---|
| 3579 | 3690 | } |
|---|
| 3580 | 3691 | |
|---|
| 3581 | 3692 | /* |
|---|
| .. | .. |
|---|
| 3627 | 3738 | out: |
|---|
| 3628 | 3739 | unlock_cluster(ci); |
|---|
| 3629 | 3740 | spin_unlock(&si->lock); |
|---|
| 3741 | + put_swap_device(si); |
|---|
| 3630 | 3742 | outer: |
|---|
| 3631 | 3743 | if (page) |
|---|
| 3632 | 3744 | __free_page(page); |
|---|
| 3633 | | - return 0; |
|---|
| 3745 | + return ret; |
|---|
| 3634 | 3746 | } |
|---|
| 3635 | 3747 | |
|---|
| 3636 | 3748 | /* |
|---|
| .. | .. |
|---|
| 3658 | 3770 | |
|---|
| 3659 | 3771 | spin_lock(&si->cont_lock); |
|---|
| 3660 | 3772 | offset &= ~PAGE_MASK; |
|---|
| 3661 | | - page = list_entry(head->lru.next, struct page, lru); |
|---|
| 3773 | + page = list_next_entry(head, lru); |
|---|
| 3662 | 3774 | map = kmap_atomic(page) + offset; |
|---|
| 3663 | 3775 | |
|---|
| 3664 | 3776 | if (count == SWAP_MAP_MAX) /* initial increment from swap_map */ |
|---|
| .. | .. |
|---|
| 3670 | 3782 | */ |
|---|
| 3671 | 3783 | while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) { |
|---|
| 3672 | 3784 | kunmap_atomic(map); |
|---|
| 3673 | | - page = list_entry(page->lru.next, struct page, lru); |
|---|
| 3785 | + page = list_next_entry(page, lru); |
|---|
| 3674 | 3786 | BUG_ON(page == head); |
|---|
| 3675 | 3787 | map = kmap_atomic(page) + offset; |
|---|
| 3676 | 3788 | } |
|---|
| 3677 | 3789 | if (*map == SWAP_CONT_MAX) { |
|---|
| 3678 | 3790 | kunmap_atomic(map); |
|---|
| 3679 | | - page = list_entry(page->lru.next, struct page, lru); |
|---|
| 3791 | + page = list_next_entry(page, lru); |
|---|
| 3680 | 3792 | if (page == head) { |
|---|
| 3681 | 3793 | ret = false; /* add count continuation */ |
|---|
| 3682 | 3794 | goto out; |
|---|
| .. | .. |
|---|
| 3686 | 3798 | } |
|---|
| 3687 | 3799 | *map += 1; |
|---|
| 3688 | 3800 | kunmap_atomic(map); |
|---|
| 3689 | | - page = list_entry(page->lru.prev, struct page, lru); |
|---|
| 3690 | | - while (page != head) { |
|---|
| 3801 | + while ((page = list_prev_entry(page, lru)) != head) { |
|---|
| 3691 | 3802 | map = kmap_atomic(page) + offset; |
|---|
| 3692 | 3803 | *map = COUNT_CONTINUED; |
|---|
| 3693 | 3804 | kunmap_atomic(map); |
|---|
| 3694 | | - page = list_entry(page->lru.prev, struct page, lru); |
|---|
| 3695 | 3805 | } |
|---|
| 3696 | 3806 | ret = true; /* incremented */ |
|---|
| 3697 | 3807 | |
|---|
| .. | .. |
|---|
| 3702 | 3812 | BUG_ON(count != COUNT_CONTINUED); |
|---|
| 3703 | 3813 | while (*map == COUNT_CONTINUED) { |
|---|
| 3704 | 3814 | kunmap_atomic(map); |
|---|
| 3705 | | - page = list_entry(page->lru.next, struct page, lru); |
|---|
| 3815 | + page = list_next_entry(page, lru); |
|---|
| 3706 | 3816 | BUG_ON(page == head); |
|---|
| 3707 | 3817 | map = kmap_atomic(page) + offset; |
|---|
| 3708 | 3818 | } |
|---|
| .. | .. |
|---|
| 3711 | 3821 | if (*map == 0) |
|---|
| 3712 | 3822 | count = 0; |
|---|
| 3713 | 3823 | kunmap_atomic(map); |
|---|
| 3714 | | - page = list_entry(page->lru.prev, struct page, lru); |
|---|
| 3715 | | - while (page != head) { |
|---|
| 3824 | + while ((page = list_prev_entry(page, lru)) != head) { |
|---|
| 3716 | 3825 | map = kmap_atomic(page) + offset; |
|---|
| 3717 | 3826 | *map = SWAP_CONT_MAX | count; |
|---|
| 3718 | 3827 | count = COUNT_CONTINUED; |
|---|
| 3719 | 3828 | kunmap_atomic(map); |
|---|
| 3720 | | - page = list_entry(page->lru.prev, struct page, lru); |
|---|
| 3721 | 3829 | } |
|---|
| 3722 | 3830 | ret = count == COUNT_CONTINUED; |
|---|
| 3723 | 3831 | } |
|---|
| .. | .. |
|---|
| 3749 | 3857 | } |
|---|
| 3750 | 3858 | |
|---|
| 3751 | 3859 | #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) |
|---|
| 3752 | | -void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node, |
|---|
| 3753 | | - gfp_t gfp_mask) |
|---|
| 3860 | +void __cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) |
|---|
| 3754 | 3861 | { |
|---|
| 3755 | 3862 | struct swap_info_struct *si, *next; |
|---|
| 3756 | | - if (!(gfp_mask & __GFP_IO) || !memcg) |
|---|
| 3863 | + int nid = page_to_nid(page); |
|---|
| 3864 | + bool skip = false; |
|---|
| 3865 | + |
|---|
| 3866 | + if (!(gfp_mask & __GFP_IO)) |
|---|
| 3757 | 3867 | return; |
|---|
| 3758 | 3868 | |
|---|
| 3759 | 3869 | if (!blk_cgroup_congested()) |
|---|
| .. | .. |
|---|
| 3766 | 3876 | if (current->throttle_queue) |
|---|
| 3767 | 3877 | return; |
|---|
| 3768 | 3878 | |
|---|
| 3879 | + trace_android_vh___cgroup_throttle_swaprate(nid, &skip); |
|---|
| 3880 | + if (skip) |
|---|
| 3881 | + return; |
|---|
| 3882 | + |
|---|
| 3769 | 3883 | spin_lock(&swap_avail_lock); |
|---|
| 3770 | | - plist_for_each_entry_safe(si, next, &swap_avail_heads[node], |
|---|
| 3771 | | - avail_lists[node]) { |
|---|
| 3884 | + plist_for_each_entry_safe(si, next, &swap_avail_heads[nid], |
|---|
| 3885 | + avail_lists[nid]) { |
|---|
| 3772 | 3886 | if (si->bdev) { |
|---|
| 3773 | | - blkcg_schedule_throttle(bdev_get_queue(si->bdev), |
|---|
| 3774 | | - true); |
|---|
| 3887 | + blkcg_schedule_throttle(bdev_get_queue(si->bdev), true); |
|---|
| 3775 | 3888 | break; |
|---|
| 3776 | 3889 | } |
|---|
| 3777 | 3890 | } |
|---|