.. | .. |
| 1 | +// SPDX-License-Identifier: GPL-2.0-only |
1 | 2 | /* |
2 | 3 | * Memory merging support. |
3 | 4 | * |
.. | .. |
10 | 11 | * Andrea Arcangeli |
11 | 12 | * Chris Wright |
12 | 13 | * Hugh Dickins |
13 | | - * |
14 | | - * This work is licensed under the terms of the GNU GPL, version 2. |
15 | 14 | */ |
16 | 15 | |
17 | 16 | #include <linux/errno.h> |
.. | .. |
25 | 24 | #include <linux/pagemap.h> |
26 | 25 | #include <linux/rmap.h> |
27 | 26 | #include <linux/spinlock.h> |
28 | | -#include <linux/jhash.h> |
| 27 | +#include <linux/xxhash.h> |
29 | 28 | #include <linux/delay.h> |
30 | 29 | #include <linux/kthread.h> |
31 | 30 | #include <linux/wait.h> |
.. | .. |
82 | 81 | * different KSM page copy of that content |
83 | 82 | * |
84 | 83 | * Internally, the regular nodes, "dups" and "chains" are represented |
85 | | - * using the same :c:type:`struct stable_node` structure. |
| 84 | + * using the same struct stable_node structure. |
86 | 85 | * |
87 | 86 | * In addition to the stable tree, KSM uses a second data structure called the |
88 | 87 | * unstable tree: this tree holds pointers to pages which have been found to |
.. | .. |
296 | 295 | static void wait_while_offlining(void); |
297 | 296 | |
298 | 297 | static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait); |
| 298 | +static DECLARE_WAIT_QUEUE_HEAD(ksm_iter_wait); |
299 | 299 | static DEFINE_MUTEX(ksm_thread_mutex); |
300 | 300 | static DEFINE_SPINLOCK(ksm_mmlist_lock); |
301 | 301 | |
.. | .. |
442 | 442 | /* |
443 | 443 | * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's |
444 | 444 | * page tables after it has passed through ksm_exit() - which, if necessary, |
445 | | - * takes mmap_sem briefly to serialize against them. ksm_exit() does not set |
| 445 | + * takes mmap_lock briefly to serialize against them. ksm_exit() does not set |
446 | 446 | * a special flag: they can just back out as soon as mm_users goes to zero. |
447 | 447 | * ksm_test_exit() is used throughout to make this test for exit: in some |
448 | 448 | * places for correctness, in some places just to avoid unnecessary work. |
.. | .. |
455 | 455 | /* |
456 | 456 | * We use break_ksm to break COW on a ksm page: it's a stripped down |
457 | 457 | * |
458 | | - * if (get_user_pages(addr, 1, 1, 1, &page, NULL) == 1) |
| 458 | + * if (get_user_pages(addr, 1, FOLL_WRITE, &page, NULL) == 1) |
459 | 459 | * put_page(page); |
460 | 460 | * |
461 | 461 | * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma, |
.. | .. |
480 | 480 | break; |
481 | 481 | if (PageKsm(page)) |
482 | 482 | ret = handle_mm_fault(vma, addr, |
483 | | - FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE); |
| 483 | + FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE, |
| 484 | + NULL); |
484 | 485 | else |
485 | 486 | ret = VM_FAULT_WRITE; |
486 | | - put_page(page); |
| 487 | + put_user_page(page); |
487 | 488 | } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM))); |
488 | 489 | /* |
489 | 490 | * We must loop because handle_mm_fault() may back out if there's |
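
The extra NULL argument passed to handle_mm_fault() above matches a prototype that also takes the faulting register state; a hedged sketch of the assumed declaration (regs is NULL here because break_ksm() faults the page in on behalf of the process, not from a hardware exception):

    vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
                               unsigned int flags, struct pt_regs *regs);
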
.. | .. |
542 | 543 | */ |
543 | 544 | put_anon_vma(rmap_item->anon_vma); |
544 | 545 | |
545 | | - down_read(&mm->mmap_sem); |
| 546 | + mmap_read_lock(mm); |
546 | 547 | vma = find_mergeable_vma(mm, addr); |
547 | 548 | if (vma) |
548 | 549 | break_ksm(vma, addr); |
549 | | - up_read(&mm->mmap_sem); |
| 550 | + mmap_read_unlock(mm); |
550 | 551 | } |
551 | 552 | |
552 | 553 | static struct page *get_mergeable_page(struct rmap_item *rmap_item) |
.. | .. |
556 | 557 | struct vm_area_struct *vma; |
557 | 558 | struct page *page; |
558 | 559 | |
559 | | - down_read(&mm->mmap_sem); |
| 560 | + mmap_read_lock(mm); |
560 | 561 | vma = find_mergeable_vma(mm, addr); |
561 | 562 | if (!vma) |
562 | 563 | goto out; |
.. | .. |
568 | 569 | flush_anon_page(vma, page, addr); |
569 | 570 | flush_dcache_page(page); |
570 | 571 | } else { |
571 | | - put_page(page); |
| 572 | + put_user_page(page); |
572 | 573 | out: |
573 | 574 | page = NULL; |
574 | 575 | } |
575 | | - up_read(&mm->mmap_sem); |
| 576 | + mmap_read_unlock(mm); |
576 | 577 | return page; |
577 | 578 | } |
578 | 579 | |
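
The down_read(&mm->mmap_sem)/up_read() pairs throughout this patch become the mmap locking API. A minimal sketch of what the read-side wrappers are assumed to expand to (the real helpers live in a header and may also emit lock tracepoints):

    static inline void mmap_read_lock(struct mm_struct *mm)
    {
            down_read(&mm->mmap_lock);      /* mmap_sem renamed to mmap_lock */
    }

    static inline void mmap_read_unlock(struct mm_struct *mm)
    {
            up_read(&mm->mmap_lock);
    }
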
.. | .. |
597 | 598 | chain->chain_prune_time = jiffies; |
598 | 599 | chain->rmap_hlist_len = STABLE_NODE_CHAIN; |
599 | 600 | #if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA) |
600 | | - chain->nid = -1; /* debug */ |
| 601 | + chain->nid = NUMA_NO_NODE; /* debug */ |
601 | 602 | #endif |
602 | 603 | ksm_stable_node_chains++; |
603 | 604 | |
.. | .. |
612 | 613 | * Move the old stable node to the second dimension |
613 | 614 | * queued in the hlist_dup. The invariant is that all |
614 | 615 | * dup stable_nodes in the chain->hlist point to pages |
615 | | - * that are wrprotected and have the exact same |
| 616 | + * that are write protected and have the exact same |
616 | 617 | * content. |
617 | 618 | */ |
618 | 619 | stable_node_chain_add_dup(dup, chain); |
.. | .. |
666 | 667 | free_stable_node(stable_node); |
667 | 668 | } |
668 | 669 | |
| 670 | +enum get_ksm_page_flags { |
| 671 | + GET_KSM_PAGE_NOLOCK, |
| 672 | + GET_KSM_PAGE_LOCK, |
| 673 | + GET_KSM_PAGE_TRYLOCK |
| 674 | +}; |
| 675 | + |
669 | 676 | /* |
670 | 677 | * get_ksm_page: checks if the page indicated by the stable node |
671 | 678 | * is still its ksm page, despite having held no reference to it. |
.. | .. |
685 | 692 | * a page to put something that might look like our key in page->mapping. |
686 | 693 | * is on its way to being freed; but it is an anomaly to bear in mind. |
687 | 694 | */ |
688 | | -static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it) |
| 695 | +static struct page *get_ksm_page(struct stable_node *stable_node, |
| 696 | + enum get_ksm_page_flags flags) |
689 | 697 | { |
690 | 698 | struct page *page; |
691 | 699 | void *expected_mapping; |
.. | .. |
705 | 713 | * case this node is no longer referenced, and should be freed; |
706 | 714 | * however, it might mean that the page is under page_ref_freeze(). |
707 | 715 | * The __remove_mapping() case is easy, again the node is now stale; |
708 | | - * but if page is swapcache in migrate_page_move_mapping(), it might |
709 | | - * still be our page, in which case it's essential to keep the node. |
| 716 | + * the same is in reuse_ksm_page() case; but if page is swapcache |
| 717 | + * in migrate_page_move_mapping(), it might still be our page, |
| 718 | + * in which case it's essential to keep the node. |
710 | 719 | */ |
711 | 720 | while (!get_page_unless_zero(page)) { |
712 | 721 | /* |
.. | .. |
727 | 736 | goto stale; |
728 | 737 | } |
729 | 738 | |
730 | | - if (lock_it) { |
| 739 | + if (flags == GET_KSM_PAGE_TRYLOCK) { |
| 740 | + if (!trylock_page(page)) { |
| 741 | + put_page(page); |
| 742 | + return ERR_PTR(-EBUSY); |
| 743 | + } |
| 744 | + } else if (flags == GET_KSM_PAGE_LOCK) |
731 | 745 | lock_page(page); |
| 746 | + |
| 747 | + if (flags != GET_KSM_PAGE_NOLOCK) { |
732 | 748 | if (READ_ONCE(page->mapping) != expected_mapping) { |
733 | 749 | unlock_page(page); |
734 | 750 | put_page(page); |
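
get_ksm_page() now takes a tri-state flag instead of a bool: GET_KSM_PAGE_NOLOCK returns the page unlocked, GET_KSM_PAGE_LOCK sleeps on the page lock, and GET_KSM_PAGE_TRYLOCK only attempts trylock_page() and reports contention with ERR_PTR(-EBUSY). A sketch of the caller-side pattern the trylock mode implies, mirroring how stable_tree_search() uses it further down:

    struct page *page;

    page = get_ksm_page(stable_node, GET_KSM_PAGE_TRYLOCK);
    if (PTR_ERR(page) == -EBUSY)
            return ERR_PTR(-EBUSY);   /* page lock contended: let the caller retry */
    if (!page)
            return NULL;              /* stable node was stale and has been removed */
    /* ... inspect the locked ksm page ... */
    unlock_page(page);
    put_page(page);
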
.. | .. |
762 | 778 | struct page *page; |
763 | 779 | |
764 | 780 | stable_node = rmap_item->head; |
765 | | - page = get_ksm_page(stable_node, true); |
| 781 | + page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK); |
766 | 782 | if (!page) |
767 | 783 | goto out; |
768 | 784 | |
.. | .. |
817 | 833 | * Though it's very tempting to unmerge rmap_items from stable tree rather |
818 | 834 | * than check every pte of a given vma, the locking doesn't quite work for |
819 | 835 | * that - an rmap_item is assigned to the stable tree after inserting ksm |
820 | | - * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing |
| 836 | + * page and upping mmap_lock. Nor does it fit with the way we skip dup'ing |
821 | 837 | * rmap_items from parent to child at fork time (so as not to waste time |
822 | 838 | * if exit comes before the next scan reaches it). |
823 | 839 | * |
.. | .. |
863 | 879 | struct page *page; |
864 | 880 | int err; |
865 | 881 | |
866 | | - page = get_ksm_page(stable_node, true); |
| 882 | + page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK); |
867 | 883 | if (!page) { |
868 | 884 | /* |
869 | 885 | * get_ksm_page did remove_node_from_stable_tree itself. |
.. | .. |
962 | 978 | for (mm_slot = ksm_scan.mm_slot; |
963 | 979 | mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) { |
964 | 980 | mm = mm_slot->mm; |
965 | | - down_read(&mm->mmap_sem); |
| 981 | + mmap_read_lock(mm); |
966 | 982 | for (vma = mm->mmap; vma; vma = vma->vm_next) { |
967 | 983 | if (ksm_test_exit(mm)) |
968 | 984 | break; |
.. | .. |
975 | 991 | } |
976 | 992 | |
977 | 993 | remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list); |
978 | | - up_read(&mm->mmap_sem); |
| 994 | + mmap_read_unlock(mm); |
979 | 995 | |
980 | 996 | spin_lock(&ksm_mmlist_lock); |
981 | 997 | ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next, |
.. | .. |
998 | 1014 | return 0; |
999 | 1015 | |
1000 | 1016 | error: |
1001 | | - up_read(&mm->mmap_sem); |
| 1017 | + mmap_read_unlock(mm); |
1002 | 1018 | spin_lock(&ksm_mmlist_lock); |
1003 | 1019 | ksm_scan.mm_slot = &ksm_mm_head; |
1004 | 1020 | spin_unlock(&ksm_mmlist_lock); |
.. | .. |
1010 | 1026 | { |
1011 | 1027 | u32 checksum; |
1012 | 1028 | void *addr = kmap_atomic(page); |
1013 | | - checksum = jhash2(addr, PAGE_SIZE / 4, 17); |
| 1029 | + checksum = xxhash(addr, PAGE_SIZE, 0); |
1014 | 1030 | kunmap_atomic(addr); |
1015 | 1031 | return checksum; |
1016 | | -} |
1017 | | - |
1018 | | -static int memcmp_pages(struct page *page1, struct page *page2) |
1019 | | -{ |
1020 | | - char *addr1, *addr2; |
1021 | | - int ret; |
1022 | | - |
1023 | | - addr1 = kmap_atomic(page1); |
1024 | | - addr2 = kmap_atomic(page2); |
1025 | | - ret = memcmp(addr1, addr2, PAGE_SIZE); |
1026 | | - kunmap_atomic(addr2); |
1027 | | - kunmap_atomic(addr1); |
1028 | | - return ret; |
1029 | | -} |
1030 | | - |
1031 | | -static inline int pages_identical(struct page *page1, struct page *page2) |
1032 | | -{ |
1033 | | - return !memcmp_pages(page1, page2); |
1034 | 1032 | } |
1035 | 1033 | |
1036 | 1034 | static int write_protect_page(struct vm_area_struct *vma, struct page *page, |
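
calc_checksum() switches from jhash2(), which hashes an array of u32s, to xxhash(), which hashes the page as a plain byte buffer with a zero seed. The xxhash() helper from <linux/xxhash.h> is assumed to reduce to roughly the following, picking the faster variant for the machine's word size:

    static inline unsigned long xxhash(const void *input, size_t length,
                                       uint64_t seed)
    {
    #if BITS_PER_LONG == 64
            return xxh64(input, length, seed);
    #else
            return xxh32(input, length, seed);
    #endif
    }
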
.. | .. |
1043 | 1041 | }; |
1044 | 1042 | int swapped; |
1045 | 1043 | int err = -EFAULT; |
1046 | | - unsigned long mmun_start; /* For mmu_notifiers */ |
1047 | | - unsigned long mmun_end; /* For mmu_notifiers */ |
| 1044 | + struct mmu_notifier_range range; |
1048 | 1045 | |
1049 | 1046 | pvmw.address = page_address_in_vma(page, vma); |
1050 | 1047 | if (pvmw.address == -EFAULT) |
.. | .. |
1052 | 1049 | |
1053 | 1050 | BUG_ON(PageTransCompound(page)); |
1054 | 1051 | |
1055 | | - mmun_start = pvmw.address; |
1056 | | - mmun_end = pvmw.address + PAGE_SIZE; |
1057 | | - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); |
| 1052 | + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, |
| 1053 | + pvmw.address, |
| 1054 | + pvmw.address + PAGE_SIZE); |
| 1055 | + mmu_notifier_invalidate_range_start(&range); |
1058 | 1056 | |
1059 | 1057 | if (!page_vma_mapped_walk(&pvmw)) |
1060 | 1058 | goto out_mn; |
.. | .. |
1106 | 1104 | out_unlock: |
1107 | 1105 | page_vma_mapped_walk_done(&pvmw); |
1108 | 1106 | out_mn: |
1109 | | - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
| 1107 | + mmu_notifier_invalidate_range_end(&range); |
1110 | 1108 | out: |
1111 | 1109 | return err; |
1112 | 1110 | } |
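
write_protect_page() above (and replace_page() below) drop the open-coded mmun_start/mmun_end pair in favour of a struct mmu_notifier_range that is initialised once and handed to both notifier calls. The single-page invalidation pattern used in both places looks like this sketch, with MMU_NOTIFY_CLEAR describing why the mapping is being changed:

    struct mmu_notifier_range range;

    mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
                            addr, addr + PAGE_SIZE);
    mmu_notifier_invalidate_range_start(&range);

    /* ... change the pte under the page table lock ... */

    mmu_notifier_invalidate_range_end(&range);
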
.. | .. |
1130 | 1128 | spinlock_t *ptl; |
1131 | 1129 | unsigned long addr; |
1132 | 1130 | int err = -EFAULT; |
1133 | | - unsigned long mmun_start; /* For mmu_notifiers */ |
1134 | | - unsigned long mmun_end; /* For mmu_notifiers */ |
| 1131 | + struct mmu_notifier_range range; |
1135 | 1132 | |
1136 | 1133 | addr = page_address_in_vma(page, vma); |
1137 | 1134 | if (addr == -EFAULT) |
.. | .. |
1141 | 1138 | if (!pmd) |
1142 | 1139 | goto out; |
1143 | 1140 | |
1144 | | - mmun_start = addr; |
1145 | | - mmun_end = addr + PAGE_SIZE; |
1146 | | - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); |
| 1141 | + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, |
| 1142 | + addr + PAGE_SIZE); |
| 1143 | + mmu_notifier_invalidate_range_start(&range); |
1147 | 1144 | |
1148 | 1145 | ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); |
1149 | 1146 | if (!pte_same(*ptep, orig_pte)) { |
.. | .. |
1153 | 1150 | |
1154 | 1151 | /* |
1155 | 1152 | * No need to check ksm_use_zero_pages here: we can only have a |
1156 | | - * zero_page here if ksm_use_zero_pages was enabled alreaady. |
| 1153 | + * zero_page here if ksm_use_zero_pages was enabled already. |
1157 | 1154 | */ |
1158 | 1155 | if (!is_zero_pfn(page_to_pfn(kpage))) { |
1159 | 1156 | get_page(kpage); |
.. | .. |
1189 | 1186 | pte_unmap_unlock(ptep, ptl); |
1190 | 1187 | err = 0; |
1191 | 1188 | out_mn: |
1192 | | - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
| 1189 | + mmu_notifier_invalidate_range_end(&range); |
1193 | 1190 | out: |
1194 | 1191 | return err; |
1195 | 1192 | } |
.. | .. |
1285 | 1282 | struct vm_area_struct *vma; |
1286 | 1283 | int err = -EFAULT; |
1287 | 1284 | |
1288 | | - down_read(&mm->mmap_sem); |
| 1285 | + mmap_read_lock(mm); |
1289 | 1286 | vma = find_mergeable_vma(mm, rmap_item->address); |
1290 | 1287 | if (!vma) |
1291 | 1288 | goto out; |
.. | .. |
1297 | 1294 | /* Unstable nid is in union with stable anon_vma: remove first */ |
1298 | 1295 | remove_rmap_item_from_tree(rmap_item); |
1299 | 1296 | |
1300 | | - /* Must get reference to anon_vma while still holding mmap_sem */ |
| 1297 | + /* Must get reference to anon_vma while still holding mmap_lock */ |
1301 | 1298 | rmap_item->anon_vma = vma->anon_vma; |
1302 | 1299 | get_anon_vma(vma->anon_vma); |
1303 | 1300 | out: |
1304 | | - up_read(&mm->mmap_sem); |
| 1301 | + mmap_read_unlock(mm); |
1305 | 1302 | return err; |
1306 | 1303 | } |
1307 | 1304 | |
.. | .. |
1388 | 1385 | * stable_node parameter itself will be freed from |
1389 | 1386 | * under us if it returns NULL. |
1390 | 1387 | */ |
1391 | | - _tree_page = get_ksm_page(dup, false); |
| 1388 | + _tree_page = get_ksm_page(dup, GET_KSM_PAGE_NOLOCK); |
1392 | 1389 | if (!_tree_page) |
1393 | 1390 | continue; |
1394 | 1391 | nr += 1; |
.. | .. |
1511 | 1508 | if (!is_stable_node_chain(stable_node)) { |
1512 | 1509 | if (is_page_sharing_candidate(stable_node)) { |
1513 | 1510 | *_stable_node_dup = stable_node; |
1514 | | - return get_ksm_page(stable_node, false); |
| 1511 | + return get_ksm_page(stable_node, GET_KSM_PAGE_NOLOCK); |
1515 | 1512 | } |
1516 | 1513 | /* |
1517 | 1514 | * _stable_node_dup set to NULL means the stable_node |
.. | .. |
1613 | 1610 | * continue. All KSM pages belonging to the |
1614 | 1611 | * stable_node dups in a stable_node chain |
1615 | 1612 | * have the same content and they're |
1616 | | - * wrprotected at all times. Any will work |
| 1613 | + * write protected at all times. Any will work |
1617 | 1614 | * fine to continue the walk. |
1618 | 1615 | */ |
1619 | | - tree_page = get_ksm_page(stable_node_any, false); |
| 1616 | + tree_page = get_ksm_page(stable_node_any, |
| 1617 | + GET_KSM_PAGE_NOLOCK); |
1620 | 1618 | } |
1621 | 1619 | VM_BUG_ON(!stable_node_dup ^ !!stable_node_any); |
1622 | 1620 | if (!tree_page) { |
.. | .. |
1676 | 1674 | * It would be more elegant to return stable_node |
1677 | 1675 | * than kpage, but that involves more changes. |
1678 | 1676 | */ |
1679 | | - tree_page = get_ksm_page(stable_node_dup, true); |
| 1677 | + tree_page = get_ksm_page(stable_node_dup, |
| 1678 | + GET_KSM_PAGE_TRYLOCK); |
| 1679 | + |
| 1680 | + if (PTR_ERR(tree_page) == -EBUSY) |
| 1681 | + return ERR_PTR(-EBUSY); |
| 1682 | + |
1680 | 1683 | if (unlikely(!tree_page)) |
1681 | 1684 | /* |
1682 | 1685 | * The tree may have been rebalanced, |
.. | .. |
1842 | 1845 | * continue. All KSM pages belonging to the |
1843 | 1846 | * stable_node dups in a stable_node chain |
1844 | 1847 | * have the same content and they're |
1845 | | - * wrprotected at all times. Any will work |
| 1848 | + * write protected at all times. Any will work |
1846 | 1849 | * fine to continue the walk. |
1847 | 1850 | */ |
1848 | | - tree_page = get_ksm_page(stable_node_any, false); |
| 1851 | + tree_page = get_ksm_page(stable_node_any, |
| 1852 | + GET_KSM_PAGE_NOLOCK); |
1849 | 1853 | } |
1850 | 1854 | VM_BUG_ON(!stable_node_dup ^ !!stable_node_any); |
1851 | 1855 | if (!tree_page) { |
.. | .. |
1946 | 1950 | * Don't substitute a ksm page for a forked page. |
1947 | 1951 | */ |
1948 | 1952 | if (page == tree_page) { |
1949 | | - put_page(tree_page); |
| 1953 | + put_user_page(tree_page); |
1950 | 1954 | return NULL; |
1951 | 1955 | } |
1952 | 1956 | |
.. | .. |
1954 | 1958 | |
1955 | 1959 | parent = *new; |
1956 | 1960 | if (ret < 0) { |
1957 | | - put_page(tree_page); |
| 1961 | + put_user_page(tree_page); |
1958 | 1962 | new = &parent->rb_left; |
1959 | 1963 | } else if (ret > 0) { |
1960 | | - put_page(tree_page); |
| 1964 | + put_user_page(tree_page); |
1961 | 1965 | new = &parent->rb_right; |
1962 | 1966 | } else if (!ksm_merge_across_nodes && |
1963 | 1967 | page_to_nid(tree_page) != nid) { |
.. | .. |
1966 | 1970 | * it will be flushed out and put in the right unstable |
1967 | 1971 | * tree next time: only merge with it when across_nodes. |
1968 | 1972 | */ |
1969 | | - put_page(tree_page); |
| 1973 | + put_user_page(tree_page); |
1970 | 1974 | return NULL; |
1971 | 1975 | } else { |
1972 | 1976 | *tree_pagep = tree_page; |
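
Several put_page() calls on pages obtained via follow_page()/get_user_pages() become put_user_page() in this diff, which looks like part of the get_user_pages() reference-tracking work. A hypothetical minimal form of such a helper is sketched below; the real one in that series may additionally account for the page having been a GUP target:

    /* hypothetical sketch, not the series' actual definition */
    static inline void put_user_page(struct page *page)
    {
            put_page(page);
    }
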
.. | .. |
1999 | 2003 | * duplicate. page_migration could break later if rmap breaks, |
2000 | 2004 | * so we can as well crash here. We really need to check for |
2001 | 2005 | * rmap_hlist_len == STABLE_NODE_CHAIN, but we can as well check |
2002 | | - * for other negative values as an undeflow if detected here |
| 2006 | + * for other negative values as an underflow if detected here |
2003 | 2007 | * for the first time (and not when decreasing rmap_hlist_len) |
2004 | 2008 | * would be sign of memory corruption in the stable_node. |
2005 | 2009 | */ |
.. | .. |
2071 | 2075 | remove_rmap_item_from_tree(rmap_item); |
2072 | 2076 | |
2073 | 2077 | if (kpage) { |
| 2078 | + if (PTR_ERR(kpage) == -EBUSY) |
| 2079 | + return; |
| 2080 | + |
2074 | 2081 | err = try_to_merge_with_ksm_page(rmap_item, page, kpage); |
2075 | 2082 | if (!err) { |
2076 | 2083 | /* |
.. | .. |
2105 | 2112 | if (ksm_use_zero_pages && (checksum == zero_checksum)) { |
2106 | 2113 | struct vm_area_struct *vma; |
2107 | 2114 | |
2108 | | - down_read(&mm->mmap_sem); |
| 2115 | + mmap_read_lock(mm); |
2109 | 2116 | vma = find_mergeable_vma(mm, rmap_item->address); |
2110 | 2117 | if (vma) { |
2111 | 2118 | err = try_to_merge_one_page(vma, page, |
.. | .. |
2117 | 2124 | */ |
2118 | 2125 | err = 0; |
2119 | 2126 | } |
2120 | | - up_read(&mm->mmap_sem); |
| 2127 | + mmap_read_unlock(mm); |
2121 | 2128 | /* |
2122 | 2129 | * In case of failure, the page was not really empty, so we |
2123 | 2130 | * need to continue. Otherwise we're done. |
.. | .. |
2144 | 2151 | */ |
2145 | 2152 | split = PageTransCompound(page) |
2146 | 2153 | && compound_head(page) == compound_head(tree_page); |
2147 | | - put_page(tree_page); |
| 2154 | + put_user_page(tree_page); |
2148 | 2155 | if (kpage) { |
2149 | 2156 | /* |
2150 | 2157 | * The pages were successfully merged: insert new |
.. | .. |
2253 | 2260 | |
2254 | 2261 | list_for_each_entry_safe(stable_node, next, |
2255 | 2262 | &migrate_nodes, list) { |
2256 | | - page = get_ksm_page(stable_node, false); |
| 2263 | + page = get_ksm_page(stable_node, |
| 2264 | + GET_KSM_PAGE_NOLOCK); |
2257 | 2265 | if (page) |
2258 | 2266 | put_page(page); |
2259 | 2267 | cond_resched(); |
.. | .. |
2279 | 2287 | } |
2280 | 2288 | |
2281 | 2289 | mm = slot->mm; |
2282 | | - down_read(&mm->mmap_sem); |
| 2290 | + mmap_read_lock(mm); |
2283 | 2291 | if (ksm_test_exit(mm)) |
2284 | 2292 | vma = NULL; |
2285 | 2293 | else |
.. | .. |
2312 | 2320 | &rmap_item->rmap_list; |
2313 | 2321 | ksm_scan.address += PAGE_SIZE; |
2314 | 2322 | } else |
2315 | | - put_page(*page); |
2316 | | - up_read(&mm->mmap_sem); |
| 2323 | + put_user_page(*page); |
| 2324 | + mmap_read_unlock(mm); |
2317 | 2325 | return rmap_item; |
2318 | 2326 | } |
2319 | | - put_page(*page); |
| 2327 | + put_user_page(*page); |
2320 | 2328 | ksm_scan.address += PAGE_SIZE; |
2321 | 2329 | cond_resched(); |
2322 | 2330 | } |
.. | .. |
2337 | 2345 | struct mm_slot, mm_list); |
2338 | 2346 | if (ksm_scan.address == 0) { |
2339 | 2347 | /* |
2340 | | - * We've completed a full scan of all vmas, holding mmap_sem |
| 2348 | + * We've completed a full scan of all vmas, holding mmap_lock |
2341 | 2349 | * throughout, and found no VM_MERGEABLE: so do the same as |
2342 | 2350 | * __ksm_exit does to remove this mm from all our lists now. |
2343 | 2351 | * This applies either when cleaning up after __ksm_exit |
2344 | 2352 | * (but beware: we can reach here even before __ksm_exit), |
2345 | 2353 | * or when all VM_MERGEABLE areas have been unmapped (and |
2346 | | - * mmap_sem then protects against race with MADV_MERGEABLE). |
| 2354 | + * mmap_lock then protects against race with MADV_MERGEABLE). |
2347 | 2355 | */ |
2348 | 2356 | hash_del(&slot->link); |
2349 | 2357 | list_del(&slot->mm_list); |
.. | .. |
2351 | 2359 | |
2352 | 2360 | free_mm_slot(slot); |
2353 | 2361 | clear_bit(MMF_VM_MERGEABLE, &mm->flags); |
2354 | | - up_read(&mm->mmap_sem); |
| 2362 | + mmap_read_unlock(mm); |
2355 | 2363 | mmdrop(mm); |
2356 | 2364 | } else { |
2357 | | - up_read(&mm->mmap_sem); |
| 2365 | + mmap_read_unlock(mm); |
2358 | 2366 | /* |
2359 | | - * up_read(&mm->mmap_sem) first because after |
| 2367 | + * mmap_read_unlock(mm) first because after |
2360 | 2368 | * spin_unlock(&ksm_mmlist_lock) run, the "mm" may |
2361 | 2369 | * already have been freed under us by __ksm_exit() |
2362 | 2370 | * because the "mm_slot" is still hashed and |
.. | .. |
2381 | 2389 | static void ksm_do_scan(unsigned int scan_npages) |
2382 | 2390 | { |
2383 | 2391 | struct rmap_item *rmap_item; |
2384 | | - struct page *uninitialized_var(page); |
| 2392 | + struct page *page; |
2385 | 2393 | |
2386 | 2394 | while (scan_npages-- && likely(!freezing(current))) { |
2387 | 2395 | cond_resched(); |
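
The uninitialized_var() annotation disappears from ksm_do_scan(): the macro existed only to silence false-positive "may be used uninitialized" warnings and is assumed to have been defined roughly as below, so dropping it does not change the generated code in any meaningful way:

    #define uninitialized_var(x) x = x      /* old warning-suppression idiom */
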
.. | .. |
2400 | 2408 | |
2401 | 2409 | static int ksm_scan_thread(void *nothing) |
2402 | 2410 | { |
| 2411 | + unsigned int sleep_ms; |
| 2412 | + |
2403 | 2413 | set_freezable(); |
2404 | 2414 | set_user_nice(current, 5); |
2405 | 2415 | |
.. | .. |
2413 | 2423 | try_to_freeze(); |
2414 | 2424 | |
2415 | 2425 | if (ksmd_should_run()) { |
2416 | | - schedule_timeout_interruptible( |
2417 | | - msecs_to_jiffies(ksm_thread_sleep_millisecs)); |
| 2426 | + sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs); |
| 2427 | + wait_event_interruptible_timeout(ksm_iter_wait, |
| 2428 | + sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs), |
| 2429 | + msecs_to_jiffies(sleep_ms)); |
2418 | 2430 | } else { |
2419 | 2431 | wait_event_freezable(ksm_thread_wait, |
2420 | 2432 | ksmd_should_run() || kthread_should_stop()); |
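
Instead of an unconditional schedule_timeout_interruptible(), ksmd now sleeps on ksm_iter_wait and wakes early if the sleep interval changes; the sysfs store in the final hunk provides the matching wake-up, so a newly written, shorter interval takes effect immediately. The two halves of that handshake, using the names from this diff:

    /* scanner side: time out, or wake as soon as the interval is retuned */
    sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs);
    wait_event_interruptible_timeout(ksm_iter_wait,
            sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs),
            msecs_to_jiffies(sleep_ms));

    /* sysfs store side: publish the new value, then kick the waiter */
    ksm_thread_sleep_millisecs = msecs;
    wake_up_interruptible(&ksm_iter_wait);
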
.. | .. |
2476 | 2488 | |
2477 | 2489 | return 0; |
2478 | 2490 | } |
| 2491 | +EXPORT_SYMBOL_GPL(ksm_madvise); |
2479 | 2492 | |
2480 | 2493 | int __ksm_enter(struct mm_struct *mm) |
2481 | 2494 | { |
.. | .. |
2525 | 2538 | * This process is exiting: if it's straightforward (as is the |
2526 | 2539 | * case when ksmd was never running), free mm_slot immediately. |
2527 | 2540 | * But if it's at the cursor or has rmap_items linked to it, use |
2528 | | - * mmap_sem to synchronize with any break_cows before pagetables |
| 2541 | + * mmap_lock to synchronize with any break_cows before pagetables |
2529 | 2542 | * are freed, and leave the mm_slot on the list for ksmd to free. |
2530 | 2543 | * Beware: ksm may already have noticed it exiting and freed the slot. |
2531 | 2544 | */ |
.. | .. |
2549 | 2562 | clear_bit(MMF_VM_MERGEABLE, &mm->flags); |
2550 | 2563 | mmdrop(mm); |
2551 | 2564 | } else if (mm_slot) { |
2552 | | - down_write(&mm->mmap_sem); |
2553 | | - up_write(&mm->mmap_sem); |
| 2565 | + mmap_write_lock(mm); |
| 2566 | + mmap_write_unlock(mm); |
2554 | 2567 | } |
2555 | 2568 | } |
2556 | 2569 | |
.. | .. |
2574 | 2587 | return page; /* let do_swap_page report the error */ |
2575 | 2588 | |
2576 | 2589 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); |
| 2590 | + if (new_page && mem_cgroup_charge(new_page, vma->vm_mm, GFP_KERNEL)) { |
| 2591 | + put_page(new_page); |
| 2592 | + new_page = NULL; |
| 2593 | + } |
2577 | 2594 | if (new_page) { |
2578 | 2595 | copy_user_highpage(new_page, page, address, vma); |
2579 | 2596 | |
.. | .. |
2609 | 2626 | struct vm_area_struct *vma; |
2610 | 2627 | |
2611 | 2628 | cond_resched(); |
2612 | | - anon_vma_lock_read(anon_vma); |
| 2629 | + if (!anon_vma_trylock_read(anon_vma)) { |
| 2630 | + if (rwc->try_lock) { |
| 2631 | + rwc->contended = true; |
| 2632 | + return; |
| 2633 | + } |
| 2634 | + anon_vma_lock_read(anon_vma); |
| 2635 | + } |
2613 | 2636 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, |
2614 | 2637 | 0, ULONG_MAX) { |
2615 | 2638 | unsigned long addr; |
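
rmap_walk_ksm() gains a non-blocking mode: when the walk control has try_lock set, a contended anon_vma lock marks the walk as contended and returns instead of sleeping. A hedged sketch of how a caller could use those two fields (field names are taken from this hunk; the callback name is hypothetical):

    struct rmap_walk_control rwc = {
            .rmap_one = my_rmap_one,        /* hypothetical per-mapping callback */
            .try_lock = true,               /* don't sleep on anon_vma locks */
    };

    rmap_walk(page, &rwc);
    if (rwc.contended) {
            /* a lock was busy: retry later or repeat with a blocking walk */
    }
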
.. | .. |
2785 | 2808 | */ |
2786 | 2809 | ksm_check_stable_tree(mn->start_pfn, |
2787 | 2810 | mn->start_pfn + mn->nr_pages); |
2788 | | - /* fallthrough */ |
2789 | | - |
| 2811 | + fallthrough; |
2790 | 2812 | case MEM_CANCEL_OFFLINE: |
2791 | 2813 | mutex_lock(&ksm_thread_mutex); |
2792 | 2814 | ksm_run &= ~KSM_RUN_OFFLINE; |
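
The /* fallthrough */ comment becomes the fallthrough pseudo-keyword, which the compiler can verify where the attribute is supported. It is assumed to be defined along these lines in the compiler headers:

    #if __has_attribute(__fallthrough__)
    # define fallthrough    __attribute__((__fallthrough__))
    #else
    # define fallthrough    do {} while (0) /* fallthrough */
    #endif
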
.. | .. |
2833 | 2855 | return -EINVAL; |
2834 | 2856 | |
2835 | 2857 | ksm_thread_sleep_millisecs = msecs; |
| 2858 | + wake_up_interruptible(&ksm_iter_wait); |
2836 | 2859 | |
2837 | 2860 | return count; |
2838 | 2861 | } |