| .. | .. |
|---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-only |
|---|
| 1 | 2 | /* |
|---|
| 2 | 3 | * Memory merging support. |
|---|
| 3 | 4 | * |
|---|
| .. | .. |
|---|
| 10 | 11 | * Andrea Arcangeli |
|---|
| 11 | 12 | * Chris Wright |
|---|
| 12 | 13 | * Hugh Dickins |
|---|
| 13 | | - * |
|---|
| 14 | | - * This work is licensed under the terms of the GNU GPL, version 2. |
|---|
| 15 | 14 | */ |
|---|
| 16 | 15 | |
|---|
| 17 | 16 | #include <linux/errno.h> |
|---|
| .. | .. |
|---|
| 25 | 24 | #include <linux/pagemap.h> |
|---|
| 26 | 25 | #include <linux/rmap.h> |
|---|
| 27 | 26 | #include <linux/spinlock.h> |
|---|
| 28 | | -#include <linux/jhash.h> |
|---|
| 27 | +#include <linux/xxhash.h> |
|---|
| 29 | 28 | #include <linux/delay.h> |
|---|
| 30 | 29 | #include <linux/kthread.h> |
|---|
| 31 | 30 | #include <linux/wait.h> |
|---|
| .. | .. |
|---|
| 82 | 81 | * different KSM page copy of that content |
|---|
| 83 | 82 | * |
|---|
| 84 | 83 | * Internally, the regular nodes, "dups" and "chains" are represented |
|---|
| 85 | | - * using the same :c:type:`struct stable_node` structure. |
|---|
| 84 | + * using the same struct stable_node structure. |
|---|
| 86 | 85 | * |
|---|
| 87 | 86 | * In addition to the stable tree, KSM uses a second data structure called the |
|---|
| 88 | 87 | * unstable tree: this tree holds pointers to pages which have been found to |
|---|
| .. | .. |
|---|
| 296 | 295 | static void wait_while_offlining(void); |
|---|
| 297 | 296 | |
|---|
| 298 | 297 | static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait); |
|---|
| 298 | +static DECLARE_WAIT_QUEUE_HEAD(ksm_iter_wait); |
|---|
| 299 | 299 | static DEFINE_MUTEX(ksm_thread_mutex); |
|---|
| 300 | 300 | static DEFINE_SPINLOCK(ksm_mmlist_lock); |
|---|
| 301 | 301 | |
|---|
| .. | .. |
|---|
| 442 | 442 | /* |
|---|
| 443 | 443 | * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's |
|---|
| 444 | 444 | * page tables after it has passed through ksm_exit() - which, if necessary, |
|---|
| 445 | | - * takes mmap_sem briefly to serialize against them. ksm_exit() does not set |
|---|
| 445 | + * takes mmap_lock briefly to serialize against them. ksm_exit() does not set |
|---|
| 446 | 446 | * a special flag: they can just back out as soon as mm_users goes to zero. |
|---|
| 447 | 447 | * ksm_test_exit() is used throughout to make this test for exit: in some |
|---|
| 448 | 448 | * places for correctness, in some places just to avoid unnecessary work. |
|---|
| .. | .. |
|---|
| 455 | 455 | /* |
|---|
| 456 | 456 | * We use break_ksm to break COW on a ksm page: it's a stripped down |
|---|
| 457 | 457 | * |
|---|
| 458 | | - * if (get_user_pages(addr, 1, 1, 1, &page, NULL) == 1) |
|---|
| 458 | + * if (get_user_pages(addr, 1, FOLL_WRITE, &page, NULL) == 1) |
|---|
| 459 | 459 | * put_page(page); |
|---|
| 460 | 460 | * |
|---|
| 461 | 461 | * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma, |
|---|
| .. | .. |
|---|
| 480 | 480 | break; |
|---|
| 481 | 481 | if (PageKsm(page)) |
|---|
| 482 | 482 | ret = handle_mm_fault(vma, addr, |
|---|
| 483 | | - FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE); |
|---|
| 483 | + FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE, |
|---|
| 484 | + NULL); |
|---|
| 484 | 485 | else |
|---|
| 485 | 486 | ret = VM_FAULT_WRITE; |
|---|
| 486 | | - put_page(page); |
|---|
| 487 | + put_user_page(page); |
|---|
| 487 | 488 | } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM))); |
|---|
| 488 | 489 | /* |
|---|
| 489 | 490 | * We must loop because handle_mm_fault() may back out if there's |
|---|
| .. | .. |
|---|
| 542 | 543 | */ |
|---|
| 543 | 544 | put_anon_vma(rmap_item->anon_vma); |
|---|
| 544 | 545 | |
|---|
| 545 | | - down_read(&mm->mmap_sem); |
|---|
| 546 | + mmap_read_lock(mm); |
|---|
| 546 | 547 | vma = find_mergeable_vma(mm, addr); |
|---|
| 547 | 548 | if (vma) |
|---|
| 548 | 549 | break_ksm(vma, addr); |
|---|
| 549 | | - up_read(&mm->mmap_sem); |
|---|
| 550 | + mmap_read_unlock(mm); |
|---|
| 550 | 551 | } |
|---|
| 551 | 552 | |
|---|
| 552 | 553 | static struct page *get_mergeable_page(struct rmap_item *rmap_item) |
|---|
| .. | .. |
|---|
| 556 | 557 | struct vm_area_struct *vma; |
|---|
| 557 | 558 | struct page *page; |
|---|
| 558 | 559 | |
|---|
| 559 | | - down_read(&mm->mmap_sem); |
|---|
| 560 | + mmap_read_lock(mm); |
|---|
| 560 | 561 | vma = find_mergeable_vma(mm, addr); |
|---|
| 561 | 562 | if (!vma) |
|---|
| 562 | 563 | goto out; |
|---|
| .. | .. |
|---|
| 568 | 569 | flush_anon_page(vma, page, addr); |
|---|
| 569 | 570 | flush_dcache_page(page); |
|---|
| 570 | 571 | } else { |
|---|
| 571 | | - put_page(page); |
|---|
| 572 | + put_user_page(page); |
|---|
| 572 | 573 | out: |
|---|
| 573 | 574 | page = NULL; |
|---|
| 574 | 575 | } |
|---|
| 575 | | - up_read(&mm->mmap_sem); |
|---|
| 576 | + mmap_read_unlock(mm); |
|---|
| 576 | 577 | return page; |
|---|
| 577 | 578 | } |
|---|
| 578 | 579 | |
|---|
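
Every `down_read(&mm->mmap_sem)`/`up_read(&mm->mmap_sem)` pair in this file becomes `mmap_read_lock(mm)`/`mmap_read_unlock(mm)`, as in the two hunks above. As a hedged sketch (in mainline the wrappers live in include/linux/mmap_lock.h and the rwsem field itself is renamed to `mmap_lock`), the helpers are thin and behavior-preserving:

```c
/* Sketch of the wrapper API assumed by the renames in this diff; the
 * mmap_lock field is the old mmap_sem rwsem under a new name. */
static inline void mmap_read_lock(struct mm_struct *mm)
{
	down_read(&mm->mmap_lock);
}

static inline void mmap_read_unlock(struct mm_struct *mm)
{
	up_read(&mm->mmap_lock);
}
```

The indirection gives one place to add instrumentation or change the lock implementation later without another tree-wide rename.
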
| .. | .. |
|---|
| 597 | 598 | chain->chain_prune_time = jiffies; |
|---|
| 598 | 599 | chain->rmap_hlist_len = STABLE_NODE_CHAIN; |
|---|
| 599 | 600 | #if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA) |
|---|
| 600 | | - chain->nid = -1; /* debug */ |
|---|
| 601 | + chain->nid = NUMA_NO_NODE; /* debug */ |
|---|
| 601 | 602 | #endif |
|---|
| 602 | 603 | ksm_stable_node_chains++; |
|---|
| 603 | 604 | |
|---|
| .. | .. |
|---|
| 612 | 613 | * Move the old stable node to the second dimension |
|---|
| 613 | 614 | * queued in the hlist_dup. The invariant is that all |
|---|
| 614 | 615 | * dup stable_nodes in the chain->hlist point to pages |
|---|
| 615 | | - * that are wrprotected and have the exact same |
|---|
| 616 | + * that are write protected and have the exact same |
|---|
| 616 | 617 | * content. |
|---|
| 617 | 618 | */ |
|---|
| 618 | 619 | stable_node_chain_add_dup(dup, chain); |
|---|
| .. | .. |
|---|
| 666 | 667 | free_stable_node(stable_node); |
|---|
| 667 | 668 | } |
|---|
| 668 | 669 | |
|---|
| 670 | +enum get_ksm_page_flags { |
|---|
| 671 | + GET_KSM_PAGE_NOLOCK, |
|---|
| 672 | + GET_KSM_PAGE_LOCK, |
|---|
| 673 | + GET_KSM_PAGE_TRYLOCK |
|---|
| 674 | +}; |
|---|
| 675 | + |
|---|
| 669 | 676 | /* |
|---|
| 670 | 677 | * get_ksm_page: checks if the page indicated by the stable node |
|---|
| 671 | 678 | * is still its ksm page, despite having held no reference to it. |
|---|
| .. | .. |
|---|
| 685 | 692 | * a page to put something that might look like our key in page->mapping. |
|---|
| 686 | 693 | * is on its way to being freed; but it is an anomaly to bear in mind. |
|---|
| 687 | 694 | */ |
|---|
| 688 | | -static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it) |
|---|
| 695 | +static struct page *get_ksm_page(struct stable_node *stable_node, |
|---|
| 696 | + enum get_ksm_page_flags flags) |
|---|
| 689 | 697 | { |
|---|
| 690 | 698 | struct page *page; |
|---|
| 691 | 699 | void *expected_mapping; |
|---|
| .. | .. |
|---|
| 705 | 713 | * case this node is no longer referenced, and should be freed; |
|---|
| 706 | 714 | * however, it might mean that the page is under page_ref_freeze(). |
|---|
| 707 | 715 | * The __remove_mapping() case is easy, again the node is now stale; |
|---|
| 708 | | - * but if page is swapcache in migrate_page_move_mapping(), it might |
|---|
| 709 | | - * still be our page, in which case it's essential to keep the node. |
|---|
| 716 | + * the same is in reuse_ksm_page() case; but if page is swapcache |
|---|
| 717 | + * in migrate_page_move_mapping(), it might still be our page, |
|---|
| 718 | + * in which case it's essential to keep the node. |
|---|
| 710 | 719 | */ |
|---|
| 711 | 720 | while (!get_page_unless_zero(page)) { |
|---|
| 712 | 721 | /* |
|---|
| .. | .. |
|---|
| 727 | 736 | goto stale; |
|---|
| 728 | 737 | } |
|---|
| 729 | 738 | |
|---|
| 730 | | - if (lock_it) { |
|---|
| 739 | + if (flags == GET_KSM_PAGE_TRYLOCK) { |
|---|
| 740 | + if (!trylock_page(page)) { |
|---|
| 741 | + put_page(page); |
|---|
| 742 | + return ERR_PTR(-EBUSY); |
|---|
| 743 | + } |
|---|
| 744 | + } else if (flags == GET_KSM_PAGE_LOCK) |
|---|
| 731 | 745 | lock_page(page); |
|---|
| 746 | + |
|---|
| 747 | + if (flags != GET_KSM_PAGE_NOLOCK) { |
|---|
| 732 | 748 | if (READ_ONCE(page->mapping) != expected_mapping) { |
|---|
| 733 | 749 | unlock_page(page); |
|---|
| 734 | 750 | put_page(page); |
|---|
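
`get_ksm_page()` used to take a bool `lock_it`; the new `enum get_ksm_page_flags` adds a third, non-blocking mode. Callers must now distinguish three outcomes, because `GET_KSM_PAGE_TRYLOCK` can return `ERR_PTR(-EBUSY)`, which is neither NULL nor a usable page. A minimal caller sketch (`try_get_stable_page()` is a hypothetical helper, not part of this patch):

```c
/* Hypothetical illustration of the three-way return contract of
 * get_ksm_page() after this change. */
static struct page *try_get_stable_page(struct stable_node *stable_node)
{
	struct page *page;

	page = get_ksm_page(stable_node, GET_KSM_PAGE_TRYLOCK);
	if (PTR_ERR(page) == -EBUSY)
		return page;	/* page lock contended: caller may retry */
	if (!page)
		return NULL;	/* stale node, already removed from tree */
	/* success: page is locked and a reference is held */
	return page;
}
```
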
| .. | .. |
|---|
| 762 | 778 | struct page *page; |
|---|
| 763 | 779 | |
|---|
| 764 | 780 | stable_node = rmap_item->head; |
|---|
| 765 | | - page = get_ksm_page(stable_node, true); |
|---|
| 781 | + page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK); |
|---|
| 766 | 782 | if (!page) |
|---|
| 767 | 783 | goto out; |
|---|
| 768 | 784 | |
|---|
| .. | .. |
|---|
| 817 | 833 | * Though it's very tempting to unmerge rmap_items from stable tree rather |
|---|
| 818 | 834 | * than check every pte of a given vma, the locking doesn't quite work for |
|---|
| 819 | 835 | * that - an rmap_item is assigned to the stable tree after inserting ksm |
|---|
| 820 | | - * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing |
|---|
| 836 | + * page and upping mmap_lock. Nor does it fit with the way we skip dup'ing |
|---|
| 821 | 837 | * rmap_items from parent to child at fork time (so as not to waste time |
|---|
| 822 | 838 | * if exit comes before the next scan reaches it). |
|---|
| 823 | 839 | * |
|---|
| .. | .. |
|---|
| 863 | 879 | struct page *page; |
|---|
| 864 | 880 | int err; |
|---|
| 865 | 881 | |
|---|
| 866 | | - page = get_ksm_page(stable_node, true); |
|---|
| 882 | + page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK); |
|---|
| 867 | 883 | if (!page) { |
|---|
| 868 | 884 | /* |
|---|
| 869 | 885 | * get_ksm_page did remove_node_from_stable_tree itself. |
|---|
| .. | .. |
|---|
| 962 | 978 | for (mm_slot = ksm_scan.mm_slot; |
|---|
| 963 | 979 | mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) { |
|---|
| 964 | 980 | mm = mm_slot->mm; |
|---|
| 965 | | - down_read(&mm->mmap_sem); |
|---|
| 981 | + mmap_read_lock(mm); |
|---|
| 966 | 982 | for (vma = mm->mmap; vma; vma = vma->vm_next) { |
|---|
| 967 | 983 | if (ksm_test_exit(mm)) |
|---|
| 968 | 984 | break; |
|---|
| .. | .. |
|---|
| 975 | 991 | } |
|---|
| 976 | 992 | |
|---|
| 977 | 993 | remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list); |
|---|
| 978 | | - up_read(&mm->mmap_sem); |
|---|
| 994 | + mmap_read_unlock(mm); |
|---|
| 979 | 995 | |
|---|
| 980 | 996 | spin_lock(&ksm_mmlist_lock); |
|---|
| 981 | 997 | ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next, |
|---|
| .. | .. |
|---|
| 998 | 1014 | return 0; |
|---|
| 999 | 1015 | |
|---|
| 1000 | 1016 | error: |
|---|
| 1001 | | - up_read(&mm->mmap_sem); |
|---|
| 1017 | + mmap_read_unlock(mm); |
|---|
| 1002 | 1018 | spin_lock(&ksm_mmlist_lock); |
|---|
| 1003 | 1019 | ksm_scan.mm_slot = &ksm_mm_head; |
|---|
| 1004 | 1020 | spin_unlock(&ksm_mmlist_lock); |
|---|
| .. | .. |
|---|
| 1010 | 1026 | { |
|---|
| 1011 | 1027 | u32 checksum; |
|---|
| 1012 | 1028 | void *addr = kmap_atomic(page); |
|---|
| 1013 | | - checksum = jhash2(addr, PAGE_SIZE / 4, 17); |
|---|
| 1029 | + checksum = xxhash(addr, PAGE_SIZE, 0); |
|---|
| 1014 | 1030 | kunmap_atomic(addr); |
|---|
| 1015 | 1031 | return checksum; |
|---|
| 1016 | | -} |
|---|
| 1017 | | - |
|---|
| 1018 | | -static int memcmp_pages(struct page *page1, struct page *page2) |
|---|
| 1019 | | -{ |
|---|
| 1020 | | - char *addr1, *addr2; |
|---|
| 1021 | | - int ret; |
|---|
| 1022 | | - |
|---|
| 1023 | | - addr1 = kmap_atomic(page1); |
|---|
| 1024 | | - addr2 = kmap_atomic(page2); |
|---|
| 1025 | | - ret = memcmp(addr1, addr2, PAGE_SIZE); |
|---|
| 1026 | | - kunmap_atomic(addr2); |
|---|
| 1027 | | - kunmap_atomic(addr1); |
|---|
| 1028 | | - return ret; |
|---|
| 1029 | | -} |
|---|
| 1030 | | - |
|---|
| 1031 | | -static inline int pages_identical(struct page *page1, struct page *page2) |
|---|
| 1032 | | -{ |
|---|
| 1033 | | - return !memcmp_pages(page1, page2); |
|---|
| 1034 | 1032 | } |
|---|
| 1035 | 1033 | |
|---|
| 1036 | 1034 | static int write_protect_page(struct vm_area_struct *vma, struct page *page, |
|---|
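
Two independent changes in this hunk. First, the checksum only answers "has this page changed since the last scan?", so a fast non-cryptographic hash suffices, and `xxhash()` is considerably faster than `jhash2()` over a 4 KiB page; actual equality is still proven by a full byte compare, never by the checksum. Second, `memcmp_pages()` and `pages_identical()` disappear from ksm.c; hedged note: in mainline they were moved rather than deleted (memcmp_pages() to mm/util.c, pages_identical() as an inline in linux/mm.h) so other users such as uprobes can share them, with bodies unchanged:

```c
/* The relocated helpers, exactly as removed above (destination per the
 * mainline move; hedged). */
int memcmp_pages(struct page *page1, struct page *page2)
{
	char *addr1, *addr2;
	int ret;

	addr1 = kmap_atomic(page1);
	addr2 = kmap_atomic(page2);
	ret = memcmp(addr1, addr2, PAGE_SIZE);
	kunmap_atomic(addr2);
	kunmap_atomic(addr1);
	return ret;
}

static inline int pages_identical(struct page *page1, struct page *page2)
{
	return !memcmp_pages(page1, page2);
}
```
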
| .. | .. |
|---|
| 1043 | 1041 | }; |
|---|
| 1044 | 1042 | int swapped; |
|---|
| 1045 | 1043 | int err = -EFAULT; |
|---|
| 1046 | | - unsigned long mmun_start; /* For mmu_notifiers */ |
|---|
| 1047 | | - unsigned long mmun_end; /* For mmu_notifiers */ |
|---|
| 1044 | + struct mmu_notifier_range range; |
|---|
| 1048 | 1045 | |
|---|
| 1049 | 1046 | pvmw.address = page_address_in_vma(page, vma); |
|---|
| 1050 | 1047 | if (pvmw.address == -EFAULT) |
|---|
| .. | .. |
|---|
| 1052 | 1049 | |
|---|
| 1053 | 1050 | BUG_ON(PageTransCompound(page)); |
|---|
| 1054 | 1051 | |
|---|
| 1055 | | - mmun_start = pvmw.address; |
|---|
| 1056 | | - mmun_end = pvmw.address + PAGE_SIZE; |
|---|
| 1057 | | - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); |
|---|
| 1052 | + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, |
|---|
| 1053 | + pvmw.address, |
|---|
| 1054 | + pvmw.address + PAGE_SIZE); |
|---|
| 1055 | + mmu_notifier_invalidate_range_start(&range); |
|---|
| 1058 | 1056 | |
|---|
| 1059 | 1057 | if (!page_vma_mapped_walk(&pvmw)) |
|---|
| 1060 | 1058 | goto out_mn; |
|---|
| .. | .. |
|---|
| 1106 | 1104 | out_unlock: |
|---|
| 1107 | 1105 | page_vma_mapped_walk_done(&pvmw); |
|---|
| 1108 | 1106 | out_mn: |
|---|
| 1109 | | - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
|---|
| 1107 | + mmu_notifier_invalidate_range_end(&range); |
|---|
| 1110 | 1108 | out: |
|---|
| 1111 | 1109 | return err; |
|---|
| 1112 | 1110 | } |
|---|
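
`write_protect_page()` stops carrying two bare `unsigned long`s for the notifier and instead fills a `struct mmu_notifier_range` once; listeners then see not just the address span but also the event type (`MMU_NOTIFY_CLEAR`) and the vma. The bracketing pattern, sketched with `start` as a placeholder for the address being write-protected:

```c
/* Hedged sketch of the new invalidation pattern used above and in
 * replace_page() below; start is a placeholder address. */
struct mmu_notifier_range range;

mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
			start, start + PAGE_SIZE);
mmu_notifier_invalidate_range_start(&range);
/* ... update the PTE under the page table lock ... */
mmu_notifier_invalidate_range_end(&range);
```
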
| .. | .. |
|---|
| 1130 | 1128 | spinlock_t *ptl; |
|---|
| 1131 | 1129 | unsigned long addr; |
|---|
| 1132 | 1130 | int err = -EFAULT; |
|---|
| 1133 | | - unsigned long mmun_start; /* For mmu_notifiers */ |
|---|
| 1134 | | - unsigned long mmun_end; /* For mmu_notifiers */ |
|---|
| 1131 | + struct mmu_notifier_range range; |
|---|
| 1135 | 1132 | |
|---|
| 1136 | 1133 | addr = page_address_in_vma(page, vma); |
|---|
| 1137 | 1134 | if (addr == -EFAULT) |
|---|
| .. | .. |
|---|
| 1141 | 1138 | if (!pmd) |
|---|
| 1142 | 1139 | goto out; |
|---|
| 1143 | 1140 | |
|---|
| 1144 | | - mmun_start = addr; |
|---|
| 1145 | | - mmun_end = addr + PAGE_SIZE; |
|---|
| 1146 | | - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); |
|---|
| 1141 | + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, |
|---|
| 1142 | + addr + PAGE_SIZE); |
|---|
| 1143 | + mmu_notifier_invalidate_range_start(&range); |
|---|
| 1147 | 1144 | |
|---|
| 1148 | 1145 | ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); |
|---|
| 1149 | 1146 | if (!pte_same(*ptep, orig_pte)) { |
|---|
| .. | .. |
|---|
| 1153 | 1150 | |
|---|
| 1154 | 1151 | /* |
|---|
| 1155 | 1152 | * No need to check ksm_use_zero_pages here: we can only have a |
|---|
| 1156 | | - * zero_page here if ksm_use_zero_pages was enabled alreaady. |
|---|
| 1153 | + * zero_page here if ksm_use_zero_pages was enabled already. |
|---|
| 1157 | 1154 | */ |
|---|
| 1158 | 1155 | if (!is_zero_pfn(page_to_pfn(kpage))) { |
|---|
| 1159 | 1156 | get_page(kpage); |
|---|
| .. | .. |
|---|
| 1189 | 1186 | pte_unmap_unlock(ptep, ptl); |
|---|
| 1190 | 1187 | err = 0; |
|---|
| 1191 | 1188 | out_mn: |
|---|
| 1192 | | - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
|---|
| 1189 | + mmu_notifier_invalidate_range_end(&range); |
|---|
| 1193 | 1190 | out: |
|---|
| 1194 | 1191 | return err; |
|---|
| 1195 | 1192 | } |
|---|
| .. | .. |
|---|
| 1285 | 1282 | struct vm_area_struct *vma; |
|---|
| 1286 | 1283 | int err = -EFAULT; |
|---|
| 1287 | 1284 | |
|---|
| 1288 | | - down_read(&mm->mmap_sem); |
|---|
| 1285 | + mmap_read_lock(mm); |
|---|
| 1289 | 1286 | vma = find_mergeable_vma(mm, rmap_item->address); |
|---|
| 1290 | 1287 | if (!vma) |
|---|
| 1291 | 1288 | goto out; |
|---|
| .. | .. |
|---|
| 1297 | 1294 | /* Unstable nid is in union with stable anon_vma: remove first */ |
|---|
| 1298 | 1295 | remove_rmap_item_from_tree(rmap_item); |
|---|
| 1299 | 1296 | |
|---|
| 1300 | | - /* Must get reference to anon_vma while still holding mmap_sem */ |
|---|
| 1297 | + /* Must get reference to anon_vma while still holding mmap_lock */ |
|---|
| 1301 | 1298 | rmap_item->anon_vma = vma->anon_vma; |
|---|
| 1302 | 1299 | get_anon_vma(vma->anon_vma); |
|---|
| 1303 | 1300 | out: |
|---|
| 1304 | | - up_read(&mm->mmap_sem); |
|---|
| 1301 | + mmap_read_unlock(mm); |
|---|
| 1305 | 1302 | return err; |
|---|
| 1306 | 1303 | } |
|---|
| 1307 | 1304 | |
|---|
| .. | .. |
|---|
| 1388 | 1385 | * stable_node parameter itself will be freed from |
|---|
| 1389 | 1386 | * under us if it returns NULL. |
|---|
| 1390 | 1387 | */ |
|---|
| 1391 | | - _tree_page = get_ksm_page(dup, false); |
|---|
| 1388 | + _tree_page = get_ksm_page(dup, GET_KSM_PAGE_NOLOCK); |
|---|
| 1392 | 1389 | if (!_tree_page) |
|---|
| 1393 | 1390 | continue; |
|---|
| 1394 | 1391 | nr += 1; |
|---|
| .. | .. |
|---|
| 1511 | 1508 | if (!is_stable_node_chain(stable_node)) { |
|---|
| 1512 | 1509 | if (is_page_sharing_candidate(stable_node)) { |
|---|
| 1513 | 1510 | *_stable_node_dup = stable_node; |
|---|
| 1514 | | - return get_ksm_page(stable_node, false); |
|---|
| 1511 | + return get_ksm_page(stable_node, GET_KSM_PAGE_NOLOCK); |
|---|
| 1515 | 1512 | } |
|---|
| 1516 | 1513 | /* |
|---|
| 1517 | 1514 | * _stable_node_dup set to NULL means the stable_node |
|---|
| .. | .. |
|---|
| 1613 | 1610 | * continue. All KSM pages belonging to the |
|---|
| 1614 | 1611 | * stable_node dups in a stable_node chain |
|---|
| 1615 | 1612 | * have the same content and they're |
|---|
| 1616 | | - * wrprotected at all times. Any will work |
|---|
| 1613 | + * write protected at all times. Any will work |
|---|
| 1617 | 1614 | * fine to continue the walk. |
|---|
| 1618 | 1615 | */ |
|---|
| 1619 | | - tree_page = get_ksm_page(stable_node_any, false); |
|---|
| 1616 | + tree_page = get_ksm_page(stable_node_any, |
|---|
| 1617 | + GET_KSM_PAGE_NOLOCK); |
|---|
| 1620 | 1618 | } |
|---|
| 1621 | 1619 | VM_BUG_ON(!stable_node_dup ^ !!stable_node_any); |
|---|
| 1622 | 1620 | if (!tree_page) { |
|---|
| .. | .. |
|---|
| 1676 | 1674 | * It would be more elegant to return stable_node |
|---|
| 1677 | 1675 | * than kpage, but that involves more changes. |
|---|
| 1678 | 1676 | */ |
|---|
| 1679 | | - tree_page = get_ksm_page(stable_node_dup, true); |
|---|
| 1677 | + tree_page = get_ksm_page(stable_node_dup, |
|---|
| 1678 | + GET_KSM_PAGE_TRYLOCK); |
|---|
| 1679 | + |
|---|
| 1680 | + if (PTR_ERR(tree_page) == -EBUSY) |
|---|
| 1681 | + return ERR_PTR(-EBUSY); |
|---|
| 1682 | + |
|---|
| 1680 | 1683 | if (unlikely(!tree_page)) |
|---|
| 1681 | 1684 | /* |
|---|
| 1682 | 1685 | * The tree may have been rebalanced, |
|---|
| .. | .. |
|---|
| 1842 | 1845 | * continue. All KSM pages belonging to the |
|---|
| 1843 | 1846 | * stable_node dups in a stable_node chain |
|---|
| 1844 | 1847 | * have the same content and they're |
|---|
| 1845 | | - * wrprotected at all times. Any will work |
|---|
| 1848 | + * write protected at all times. Any will work |
|---|
| 1846 | 1849 | * fine to continue the walk. |
|---|
| 1847 | 1850 | */ |
|---|
| 1848 | | - tree_page = get_ksm_page(stable_node_any, false); |
|---|
| 1851 | + tree_page = get_ksm_page(stable_node_any, |
|---|
| 1852 | + GET_KSM_PAGE_NOLOCK); |
|---|
| 1849 | 1853 | } |
|---|
| 1850 | 1854 | VM_BUG_ON(!stable_node_dup ^ !!stable_node_any); |
|---|
| 1851 | 1855 | if (!tree_page) { |
|---|
| .. | .. |
|---|
| 1946 | 1950 | * Don't substitute a ksm page for a forked page. |
|---|
| 1947 | 1951 | */ |
|---|
| 1948 | 1952 | if (page == tree_page) { |
|---|
| 1949 | | - put_page(tree_page); |
|---|
| 1953 | + put_user_page(tree_page); |
|---|
| 1950 | 1954 | return NULL; |
|---|
| 1951 | 1955 | } |
|---|
| 1952 | 1956 | |
|---|
| .. | .. |
|---|
| 1954 | 1958 | |
|---|
| 1955 | 1959 | parent = *new; |
|---|
| 1956 | 1960 | if (ret < 0) { |
|---|
| 1957 | | - put_page(tree_page); |
|---|
| 1961 | + put_user_page(tree_page); |
|---|
| 1958 | 1962 | new = &parent->rb_left; |
|---|
| 1959 | 1963 | } else if (ret > 0) { |
|---|
| 1960 | | - put_page(tree_page); |
|---|
| 1964 | + put_user_page(tree_page); |
|---|
| 1961 | 1965 | new = &parent->rb_right; |
|---|
| 1962 | 1966 | } else if (!ksm_merge_across_nodes && |
|---|
| 1963 | 1967 | page_to_nid(tree_page) != nid) { |
|---|
| .. | .. |
|---|
| 1966 | 1970 | * it will be flushed out and put in the right unstable |
|---|
| 1967 | 1971 | * tree next time: only merge with it when across_nodes. |
|---|
| 1968 | 1972 | */ |
|---|
| 1969 | | - put_page(tree_page); |
|---|
| 1973 | + put_user_page(tree_page); |
|---|
| 1970 | 1974 | return NULL; |
|---|
| 1971 | 1975 | } else { |
|---|
| 1972 | 1976 | *tree_pagep = tree_page; |
|---|
| .. | .. |
|---|
| 1999 | 2003 | * duplicate. page_migration could break later if rmap breaks, |
|---|
| 2000 | 2004 | * so we can as well crash here. We really need to check for |
|---|
| 2001 | 2005 | * rmap_hlist_len == STABLE_NODE_CHAIN, but we can as well check |
|---|
| 2002 | | - * for other negative values as an undeflow if detected here |
|---|
| 2006 | + * for other negative values as an underflow if detected here |
|---|
| 2003 | 2007 | * for the first time (and not when decreasing rmap_hlist_len) |
|---|
| 2004 | 2008 | * would be sign of memory corruption in the stable_node. |
|---|
| 2005 | 2009 | */ |
|---|
| .. | .. |
|---|
| 2071 | 2075 | remove_rmap_item_from_tree(rmap_item); |
|---|
| 2072 | 2076 | |
|---|
| 2073 | 2077 | if (kpage) { |
|---|
| 2078 | + if (PTR_ERR(kpage) == -EBUSY) |
|---|
| 2079 | + return; |
|---|
| 2080 | + |
|---|
| 2074 | 2081 | err = try_to_merge_with_ksm_page(rmap_item, page, kpage); |
|---|
| 2075 | 2082 | if (!err) { |
|---|
| 2076 | 2083 | /* |
|---|
| .. | .. |
|---|
| 2105 | 2112 | if (ksm_use_zero_pages && (checksum == zero_checksum)) { |
|---|
| 2106 | 2113 | struct vm_area_struct *vma; |
|---|
| 2107 | 2114 | |
|---|
| 2108 | | - down_read(&mm->mmap_sem); |
|---|
| 2115 | + mmap_read_lock(mm); |
|---|
| 2109 | 2116 | vma = find_mergeable_vma(mm, rmap_item->address); |
|---|
| 2110 | 2117 | if (vma) { |
|---|
| 2111 | 2118 | err = try_to_merge_one_page(vma, page, |
|---|
| .. | .. |
|---|
| 2117 | 2124 | */ |
|---|
| 2118 | 2125 | err = 0; |
|---|
| 2119 | 2126 | } |
|---|
| 2120 | | - up_read(&mm->mmap_sem); |
|---|
| 2127 | + mmap_read_unlock(mm); |
|---|
| 2121 | 2128 | /* |
|---|
| 2122 | 2129 | * In case of failure, the page was not really empty, so we |
|---|
| 2123 | 2130 | * need to continue. Otherwise we're done. |
|---|
| .. | .. |
|---|
| 2144 | 2151 | */ |
|---|
| 2145 | 2152 | split = PageTransCompound(page) |
|---|
| 2146 | 2153 | && compound_head(page) == compound_head(tree_page); |
|---|
| 2147 | | - put_page(tree_page); |
|---|
| 2154 | + put_user_page(tree_page); |
|---|
| 2148 | 2155 | if (kpage) { |
|---|
| 2149 | 2156 | /* |
|---|
| 2150 | 2157 | * The pages were successfully merged: insert new |
|---|
| .. | .. |
|---|
| 2253 | 2260 | |
|---|
| 2254 | 2261 | list_for_each_entry_safe(stable_node, next, |
|---|
| 2255 | 2262 | &migrate_nodes, list) { |
|---|
| 2256 | | - page = get_ksm_page(stable_node, false); |
|---|
| 2263 | + page = get_ksm_page(stable_node, |
|---|
| 2264 | + GET_KSM_PAGE_NOLOCK); |
|---|
| 2257 | 2265 | if (page) |
|---|
| 2258 | 2266 | put_page(page); |
|---|
| 2259 | 2267 | cond_resched(); |
|---|
| .. | .. |
|---|
| 2279 | 2287 | } |
|---|
| 2280 | 2288 | |
|---|
| 2281 | 2289 | mm = slot->mm; |
|---|
| 2282 | | - down_read(&mm->mmap_sem); |
|---|
| 2290 | + mmap_read_lock(mm); |
|---|
| 2283 | 2291 | if (ksm_test_exit(mm)) |
|---|
| 2284 | 2292 | vma = NULL; |
|---|
| 2285 | 2293 | else |
|---|
| .. | .. |
|---|
| 2312 | 2320 | &rmap_item->rmap_list; |
|---|
| 2313 | 2321 | ksm_scan.address += PAGE_SIZE; |
|---|
| 2314 | 2322 | } else |
|---|
| 2315 | | - put_page(*page); |
|---|
| 2316 | | - up_read(&mm->mmap_sem); |
|---|
| 2323 | + put_user_page(*page); |
|---|
| 2324 | + mmap_read_unlock(mm); |
|---|
| 2317 | 2325 | return rmap_item; |
|---|
| 2318 | 2326 | } |
|---|
| 2319 | | - put_page(*page); |
|---|
| 2327 | + put_user_page(*page); |
|---|
| 2320 | 2328 | ksm_scan.address += PAGE_SIZE; |
|---|
| 2321 | 2329 | cond_resched(); |
|---|
| 2322 | 2330 | } |
|---|
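
The `put_page()` → `put_user_page()` conversions in this function follow the pairing rule already quoted in the break_ksm() comment near the top of this diff: a reference taken through the get_user_pages() path must be dropped through its matching release. In sketch form (`addr` is a placeholder user address):

```c
/* Pairing rule behind the put_user_page() conversions (sketch): pages
 * pinned via get_user_pages() are released with put_user_page(), not
 * plain put_page(), so pinned-page accounting stays balanced. */
struct page *page;

if (get_user_pages(addr, 1, FOLL_WRITE, &page, NULL) == 1)
	put_user_page(page);
```
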
| .. | .. |
|---|
| 2337 | 2345 | struct mm_slot, mm_list); |
|---|
| 2338 | 2346 | if (ksm_scan.address == 0) { |
|---|
| 2339 | 2347 | /* |
|---|
| 2340 | | - * We've completed a full scan of all vmas, holding mmap_sem |
|---|
| 2348 | + * We've completed a full scan of all vmas, holding mmap_lock |
|---|
| 2341 | 2349 | * throughout, and found no VM_MERGEABLE: so do the same as |
|---|
| 2342 | 2350 | * __ksm_exit does to remove this mm from all our lists now. |
|---|
| 2343 | 2351 | * This applies either when cleaning up after __ksm_exit |
|---|
| 2344 | 2352 | * (but beware: we can reach here even before __ksm_exit), |
|---|
| 2345 | 2353 | * or when all VM_MERGEABLE areas have been unmapped (and |
|---|
| 2346 | | - * mmap_sem then protects against race with MADV_MERGEABLE). |
|---|
| 2354 | + * mmap_lock then protects against race with MADV_MERGEABLE). |
|---|
| 2347 | 2355 | */ |
|---|
| 2348 | 2356 | hash_del(&slot->link); |
|---|
| 2349 | 2357 | list_del(&slot->mm_list); |
|---|
| .. | .. |
|---|
| 2351 | 2359 | |
|---|
| 2352 | 2360 | free_mm_slot(slot); |
|---|
| 2353 | 2361 | clear_bit(MMF_VM_MERGEABLE, &mm->flags); |
|---|
| 2354 | | - up_read(&mm->mmap_sem); |
|---|
| 2362 | + mmap_read_unlock(mm); |
|---|
| 2355 | 2363 | mmdrop(mm); |
|---|
| 2356 | 2364 | } else { |
|---|
| 2357 | | - up_read(&mm->mmap_sem); |
|---|
| 2365 | + mmap_read_unlock(mm); |
|---|
| 2358 | 2366 | /* |
|---|
| 2359 | | - * up_read(&mm->mmap_sem) first because after |
|---|
| 2367 | + * mmap_read_unlock(mm) first because after |
|---|
| 2360 | 2368 | * spin_unlock(&ksm_mmlist_lock) run, the "mm" may |
|---|
| 2361 | 2369 | * already have been freed under us by __ksm_exit() |
|---|
| 2362 | 2370 | * because the "mm_slot" is still hashed and |
|---|
| .. | .. |
|---|
| 2381 | 2389 | static void ksm_do_scan(unsigned int scan_npages) |
|---|
| 2382 | 2390 | { |
|---|
| 2383 | 2391 | struct rmap_item *rmap_item; |
|---|
| 2384 | | - struct page *uninitialized_var(page); |
|---|
| 2392 | + struct page *page; |
|---|
| 2385 | 2393 | |
|---|
| 2386 | 2394 | while (scan_npages-- && likely(!freezing(current))) { |
|---|
| 2387 | 2395 | cond_resched(); |
|---|
| .. | .. |
|---|
| 2400 | 2408 | |
|---|
| 2401 | 2409 | static int ksm_scan_thread(void *nothing) |
|---|
| 2402 | 2410 | { |
|---|
| 2411 | + unsigned int sleep_ms; |
|---|
| 2412 | + |
|---|
| 2403 | 2413 | set_freezable(); |
|---|
| 2404 | 2414 | set_user_nice(current, 5); |
|---|
| 2405 | 2415 | |
|---|
| .. | .. |
|---|
| 2413 | 2423 | try_to_freeze(); |
|---|
| 2414 | 2424 | |
|---|
| 2415 | 2425 | if (ksmd_should_run()) { |
|---|
| 2416 | | - schedule_timeout_interruptible( |
|---|
| 2417 | | - msecs_to_jiffies(ksm_thread_sleep_millisecs)); |
|---|
| 2426 | + sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs); |
|---|
| 2427 | + wait_event_interruptible_timeout(ksm_iter_wait, |
|---|
| 2428 | + sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs), |
|---|
| 2429 | + msecs_to_jiffies(sleep_ms)); |
|---|
| 2418 | 2430 | } else { |
|---|
| 2419 | 2431 | wait_event_freezable(ksm_thread_wait, |
|---|
| 2420 | 2432 | ksmd_should_run() || kthread_should_stop()); |
|---|
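
Previously ksmd slept in a fixed `schedule_timeout_interruptible()`, so a huge `sleep_millisecs` value left it deaf to sysfs updates until the old interval expired. The rewrite naps on `ksm_iter_wait` with "the interval changed" as the wake condition; the store side (see the `sleep_millisecs_store()` hunk at the end of this diff) wakes the queue after each update. Reduced to its essentials:

```c
/* Self-contained sketch of the responsive-sleep pattern above. */
static DECLARE_WAIT_QUEUE_HEAD(iter_wait);
static unsigned int sleep_ms_tunable = 20;

static void worker_nap(void)
{
	unsigned int sleep_ms = READ_ONCE(sleep_ms_tunable);

	/* Returns early if the tunable changes before the timeout. */
	wait_event_interruptible_timeout(iter_wait,
		sleep_ms != READ_ONCE(sleep_ms_tunable),
		msecs_to_jiffies(sleep_ms));
}

static void tunable_store(unsigned int msecs)
{
	WRITE_ONCE(sleep_ms_tunable, msecs);
	wake_up_interruptible(&iter_wait);	/* kick any napping worker */
}
```
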
| .. | .. |
|---|
| 2476 | 2488 | |
|---|
| 2477 | 2489 | return 0; |
|---|
| 2478 | 2490 | } |
|---|
| 2491 | +EXPORT_SYMBOL_GPL(ksm_madvise); |
|---|
| 2479 | 2492 | |
|---|
| 2480 | 2493 | int __ksm_enter(struct mm_struct *mm) |
|---|
| 2481 | 2494 | { |
|---|
| .. | .. |
|---|
| 2525 | 2538 | * This process is exiting: if it's straightforward (as is the |
|---|
| 2526 | 2539 | * case when ksmd was never running), free mm_slot immediately. |
|---|
| 2527 | 2540 | * But if it's at the cursor or has rmap_items linked to it, use |
|---|
| 2528 | | - * mmap_sem to synchronize with any break_cows before pagetables |
|---|
| 2541 | + * mmap_lock to synchronize with any break_cows before pagetables |
|---|
| 2529 | 2542 | * are freed, and leave the mm_slot on the list for ksmd to free. |
|---|
| 2530 | 2543 | * Beware: ksm may already have noticed it exiting and freed the slot. |
|---|
| 2531 | 2544 | */ |
|---|
| .. | .. |
|---|
| 2549 | 2562 | clear_bit(MMF_VM_MERGEABLE, &mm->flags); |
|---|
| 2550 | 2563 | mmdrop(mm); |
|---|
| 2551 | 2564 | } else if (mm_slot) { |
|---|
| 2552 | | - down_write(&mm->mmap_sem); |
|---|
| 2553 | | - up_write(&mm->mmap_sem); |
|---|
| 2565 | + mmap_write_lock(mm); |
|---|
| 2566 | + mmap_write_unlock(mm); |
|---|
| 2554 | 2567 | } |
|---|
| 2555 | 2568 | } |
|---|
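
The empty `mmap_write_lock(mm); mmap_write_unlock(mm);` pair above is a barrier, not a critical section: the exclusive acquisition cannot succeed until every current read holder (e.g. a break_cow() still walking this mm's page tables) has drained, and nothing needs protecting afterwards, so it is released immediately.

```c
/* Drain-readers barrier (sketch): synchronizes against all current
 * mmap_read_lock() holders without doing any work under the lock. */
mmap_write_lock(mm);	/* blocks until existing readers are gone */
mmap_write_unlock(mm);	/* the wait itself was the point */
```
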
| 2556 | 2569 | |
|---|
| .. | .. |
|---|
| 2574 | 2587 | return page; /* let do_swap_page report the error */ |
|---|
| 2575 | 2588 | |
|---|
| 2576 | 2589 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); |
|---|
| 2590 | + if (new_page && mem_cgroup_charge(new_page, vma->vm_mm, GFP_KERNEL)) { |
|---|
| 2591 | + put_page(new_page); |
|---|
| 2592 | + new_page = NULL; |
|---|
| 2593 | + } |
|---|
| 2577 | 2594 | if (new_page) { |
|---|
| 2578 | 2595 | copy_user_highpage(new_page, page, address, vma); |
|---|
| 2579 | 2596 | |
|---|
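
`ksm_might_need_to_copy()` now charges the replacement page to the mm's memcg at allocation time; if the charge fails, the page is freed on the spot and the caller sees the same NULL it would get from a failed allocation:

```c
/* Hedged restatement of the pattern added above: fold a failed memcg
 * charge into the allocation-failure path. */
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
if (new_page && mem_cgroup_charge(new_page, vma->vm_mm, GFP_KERNEL)) {
	put_page(new_page);	/* give the uncharged page back */
	new_page = NULL;	/* caller treats it as an alloc failure */
}
```
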
| .. | .. |
|---|
| 2609 | 2626 | struct vm_area_struct *vma; |
|---|
| 2610 | 2627 | |
|---|
| 2611 | 2628 | cond_resched(); |
|---|
| 2612 | | - anon_vma_lock_read(anon_vma); |
|---|
| 2629 | + if (!anon_vma_trylock_read(anon_vma)) { |
|---|
| 2630 | + if (rwc->try_lock) { |
|---|
| 2631 | + rwc->contended = true; |
|---|
| 2632 | + return; |
|---|
| 2633 | + } |
|---|
| 2634 | + anon_vma_lock_read(anon_vma); |
|---|
| 2635 | + } |
|---|
| 2613 | 2636 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, |
|---|
| 2614 | 2637 | 0, ULONG_MAX) { |
|---|
| 2615 | 2638 | unsigned long addr; |
|---|
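
rmap_walk_ksm() no longer blocks unconditionally on the anon_vma lock: when the caller sets `try_lock` in its `rmap_walk_control`, a contended lock makes the walk bail out with `contended = true` instead of sleeping, letting latency-sensitive callers (reclaim, for instance) skip the page and come back later. A hypothetical caller sketch:

```c
/* walk_nonblocking() is a hypothetical illustration of the try_lock /
 * contended contract added above. */
static bool walk_nonblocking(struct page *page,
			     struct rmap_walk_control *rwc)
{
	rwc->try_lock = true;
	rwc->contended = false;
	rmap_walk_ksm(page, rwc);
	return !rwc->contended;	/* false: anon_vma lock was busy */
}
```
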
| .. | .. |
|---|
| 2785 | 2808 | */ |
|---|
| 2786 | 2809 | ksm_check_stable_tree(mn->start_pfn, |
|---|
| 2787 | 2810 | mn->start_pfn + mn->nr_pages); |
|---|
| 2788 | | - /* fallthrough */ |
|---|
| 2789 | | - |
|---|
| 2811 | + fallthrough; |
|---|
| 2790 | 2812 | case MEM_CANCEL_OFFLINE: |
|---|
| 2791 | 2813 | mutex_lock(&ksm_thread_mutex); |
|---|
| 2792 | 2814 | ksm_run &= ~KSM_RUN_OFFLINE; |
|---|
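
The `/* fallthrough */` comment gives way to the `fallthrough` pseudo-keyword, which maps to `__attribute__((fallthrough))` on compilers that support it, so intentional case fall-through is machine-checked rather than a convention. Shape of the construct (generic illustration, not this file's callback):

```c
static int hotplug_callback(unsigned long action)
{
	switch (action) {
	case MEM_OFFLINE:
		/* offline-only cleanup ... */
		fallthrough;	/* deliberately run the shared tail too */
	case MEM_CANCEL_OFFLINE:
		/* work shared by both cases ... */
		break;
	}
	return 0;
}
```
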
| .. | .. |
|---|
| 2833 | 2855 | return -EINVAL; |
|---|
| 2834 | 2856 | |
|---|
| 2835 | 2857 | ksm_thread_sleep_millisecs = msecs; |
|---|
| 2858 | + wake_up_interruptible(&ksm_iter_wait); |
|---|
| 2836 | 2859 | |
|---|
| 2837 | 2860 | return count; |
|---|
| 2838 | 2861 | } |
|---|