From d2ccde1c8e90d38cee87a1b0309ad2827f3fd30d Mon Sep 17 00:00:00 2001 From: hc <hc@nodka.com> Date: Mon, 11 Dec 2023 02:45:28 +0000 Subject: [PATCH] mm/ksm: backport upstream KSM updates (xxhash checksum, mmap_lock API, get_ksm_page flags) --- kernel/mm/ksm.c | 211 +++++++++++++++++++++++++++++----------------------- 1 file changed, 117 insertions(+), 94 deletions(-) diff --git a/kernel/mm/ksm.c b/kernel/mm/ksm.c index 87a541a..2695ddb 100644 --- a/kernel/mm/ksm.c +++ b/kernel/mm/ksm.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Memory merging support. * @@ -10,8 +11,6 @@ * Andrea Arcangeli * Chris Wright * Hugh Dickins - * - * This work is licensed under the terms of the GNU GPL, version 2. */ #include <linux/errno.h> @@ -25,7 +24,7 @@ #include <linux/pagemap.h> #include <linux/rmap.h> #include <linux/spinlock.h> -#include <linux/jhash.h> +#include <linux/xxhash.h> #include <linux/delay.h> #include <linux/kthread.h> #include <linux/wait.h> @@ -82,7 +81,7 @@ * different KSM page copy of that content * * Internally, the regular nodes, "dups" and "chains" are represented - * using the same :c:type:`struct stable_node` structure. + * using the same struct stable_node structure. * * In addition to the stable tree, KSM uses a second data structure called the * unstable tree: this tree holds pointers to pages which have been found to @@ -296,6 +295,7 @@ static void wait_while_offlining(void); static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait); +static DECLARE_WAIT_QUEUE_HEAD(ksm_iter_wait); static DEFINE_MUTEX(ksm_thread_mutex); static DEFINE_SPINLOCK(ksm_mmlist_lock); @@ -442,7 +442,7 @@ /* * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's * page tables after it has passed through ksm_exit() - which, if necessary, - * takes mmap_sem briefly to serialize against them. ksm_exit() does not set + * takes mmap_lock briefly to serialize against them. ksm_exit() does not set * a special flag: they can just back out as soon as mm_users goes to zero. * ksm_test_exit() is used throughout to make this test for exit: in some * places for correctness, in some places just to avoid unnecessary work.
@@ -455,7 +455,7 @@ /* * We use break_ksm to break COW on a ksm page: it's a stripped down * - * if (get_user_pages(addr, 1, 1, 1, &page, NULL) == 1) + * if (get_user_pages(addr, 1, FOLL_WRITE, &page, NULL) == 1) * put_page(page); * * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma, @@ -480,10 +480,11 @@ break; if (PageKsm(page)) ret = handle_mm_fault(vma, addr, - FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE); + FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE, + NULL); else ret = VM_FAULT_WRITE; - put_page(page); + put_user_page(page); } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM))); /* * We must loop because handle_mm_fault() may back out if there's @@ -542,11 +543,11 @@ */ put_anon_vma(rmap_item->anon_vma); - down_read(&mm->mmap_sem); + mmap_read_lock(mm); vma = find_mergeable_vma(mm, addr); if (vma) break_ksm(vma, addr); - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); } static struct page *get_mergeable_page(struct rmap_item *rmap_item) @@ -556,7 +557,7 @@ struct vm_area_struct *vma; struct page *page; - down_read(&mm->mmap_sem); + mmap_read_lock(mm); vma = find_mergeable_vma(mm, addr); if (!vma) goto out; @@ -568,11 +569,11 @@ flush_anon_page(vma, page, addr); flush_dcache_page(page); } else { - put_page(page); + put_user_page(page); out: page = NULL; } - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); return page; } @@ -597,7 +598,7 @@ chain->chain_prune_time = jiffies; chain->rmap_hlist_len = STABLE_NODE_CHAIN; #if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA) - chain->nid = -1; /* debug */ + chain->nid = NUMA_NO_NODE; /* debug */ #endif ksm_stable_node_chains++; @@ -612,7 +613,7 @@ * Move the old stable node to the second dimension * queued in the hlist_dup. The invariant is that all * dup stable_nodes in the chain->hlist point to pages - * that are wrprotected and have the exact same + * that are write protected and have the exact same * content. */ stable_node_chain_add_dup(dup, chain); @@ -666,6 +667,12 @@ free_stable_node(stable_node); } +enum get_ksm_page_flags { + GET_KSM_PAGE_NOLOCK, + GET_KSM_PAGE_LOCK, + GET_KSM_PAGE_TRYLOCK +}; + /* * get_ksm_page: checks if the page indicated by the stable node * is still its ksm page, despite having held no reference to it. @@ -685,7 +692,8 @@ * a page to put something that might look like our key in page->mapping. * is on its way to being freed; but it is an anomaly to bear in mind. */ -static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it) +static struct page *get_ksm_page(struct stable_node *stable_node, + enum get_ksm_page_flags flags) { struct page *page; void *expected_mapping; @@ -705,8 +713,9 @@ * case this node is no longer referenced, and should be freed; * however, it might mean that the page is under page_ref_freeze(). * The __remove_mapping() case is easy, again the node is now stale; - * but if page is swapcache in migrate_page_move_mapping(), it might - * still be our page, in which case it's essential to keep the node. + * the same is in reuse_ksm_page() case; but if page is swapcache + * in migrate_page_move_mapping(), it might still be our page, + * in which case it's essential to keep the node. 
*/ while (!get_page_unless_zero(page)) { /* @@ -727,8 +736,15 @@ goto stale; } - if (lock_it) { + if (flags == GET_KSM_PAGE_TRYLOCK) { + if (!trylock_page(page)) { + put_page(page); + return ERR_PTR(-EBUSY); + } + } else if (flags == GET_KSM_PAGE_LOCK) lock_page(page); + + if (flags != GET_KSM_PAGE_NOLOCK) { if (READ_ONCE(page->mapping) != expected_mapping) { unlock_page(page); put_page(page); @@ -762,7 +778,7 @@ struct page *page; stable_node = rmap_item->head; - page = get_ksm_page(stable_node, true); + page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK); if (!page) goto out; @@ -817,7 +833,7 @@ * Though it's very tempting to unmerge rmap_items from stable tree rather * than check every pte of a given vma, the locking doesn't quite work for * that - an rmap_item is assigned to the stable tree after inserting ksm - * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing + * page and upping mmap_lock. Nor does it fit with the way we skip dup'ing * rmap_items from parent to child at fork time (so as not to waste time * if exit comes before the next scan reaches it). * @@ -863,7 +879,7 @@ struct page *page; int err; - page = get_ksm_page(stable_node, true); + page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK); if (!page) { /* * get_ksm_page did remove_node_from_stable_tree itself. @@ -962,7 +978,7 @@ for (mm_slot = ksm_scan.mm_slot; mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) { mm = mm_slot->mm; - down_read(&mm->mmap_sem); + mmap_read_lock(mm); for (vma = mm->mmap; vma; vma = vma->vm_next) { if (ksm_test_exit(mm)) break; @@ -975,7 +991,7 @@ } remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list); - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); spin_lock(&ksm_mmlist_lock); ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next, @@ -998,7 +1014,7 @@ return 0; error: - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); spin_lock(&ksm_mmlist_lock); ksm_scan.mm_slot = &ksm_mm_head; spin_unlock(&ksm_mmlist_lock); @@ -1010,27 +1026,9 @@ { u32 checksum; void *addr = kmap_atomic(page); - checksum = jhash2(addr, PAGE_SIZE / 4, 17); + checksum = xxhash(addr, PAGE_SIZE, 0); kunmap_atomic(addr); return checksum; -} - -static int memcmp_pages(struct page *page1, struct page *page2) -{ - char *addr1, *addr2; - int ret; - - addr1 = kmap_atomic(page1); - addr2 = kmap_atomic(page2); - ret = memcmp(addr1, addr2, PAGE_SIZE); - kunmap_atomic(addr2); - kunmap_atomic(addr1); - return ret; -} - -static inline int pages_identical(struct page *page1, struct page *page2) -{ - return !memcmp_pages(page1, page2); } static int write_protect_page(struct vm_area_struct *vma, struct page *page, @@ -1043,8 +1041,7 @@ }; int swapped; int err = -EFAULT; - unsigned long mmun_start; /* For mmu_notifiers */ - unsigned long mmun_end; /* For mmu_notifiers */ + struct mmu_notifier_range range; pvmw.address = page_address_in_vma(page, vma); if (pvmw.address == -EFAULT) @@ -1052,9 +1049,10 @@ BUG_ON(PageTransCompound(page)); - mmun_start = pvmw.address; - mmun_end = pvmw.address + PAGE_SIZE; - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, + pvmw.address, + pvmw.address + PAGE_SIZE); + mmu_notifier_invalidate_range_start(&range); if (!page_vma_mapped_walk(&pvmw)) goto out_mn; @@ -1106,7 +1104,7 @@ out_unlock: page_vma_mapped_walk_done(&pvmw); out_mn: - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + mmu_notifier_invalidate_range_end(&range); out: return err; } @@ -1130,8 +1128,7 @@ spinlock_t *ptl; unsigned long 
addr; int err = -EFAULT; - unsigned long mmun_start; /* For mmu_notifiers */ - unsigned long mmun_end; /* For mmu_notifiers */ + struct mmu_notifier_range range; addr = page_address_in_vma(page, vma); if (addr == -EFAULT) @@ -1141,9 +1138,9 @@ if (!pmd) goto out; - mmun_start = addr; - mmun_end = addr + PAGE_SIZE; - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, + addr + PAGE_SIZE); + mmu_notifier_invalidate_range_start(&range); ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); if (!pte_same(*ptep, orig_pte)) { @@ -1153,7 +1150,7 @@ /* * No need to check ksm_use_zero_pages here: we can only have a - * zero_page here if ksm_use_zero_pages was enabled alreaady. + * zero_page here if ksm_use_zero_pages was enabled already. */ if (!is_zero_pfn(page_to_pfn(kpage))) { get_page(kpage); @@ -1189,7 +1186,7 @@ pte_unmap_unlock(ptep, ptl); err = 0; out_mn: - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + mmu_notifier_invalidate_range_end(&range); out: return err; } @@ -1285,7 +1282,7 @@ struct vm_area_struct *vma; int err = -EFAULT; - down_read(&mm->mmap_sem); + mmap_read_lock(mm); vma = find_mergeable_vma(mm, rmap_item->address); if (!vma) goto out; @@ -1297,11 +1294,11 @@ /* Unstable nid is in union with stable anon_vma: remove first */ remove_rmap_item_from_tree(rmap_item); - /* Must get reference to anon_vma while still holding mmap_sem */ + /* Must get reference to anon_vma while still holding mmap_lock */ rmap_item->anon_vma = vma->anon_vma; get_anon_vma(vma->anon_vma); out: - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); return err; } @@ -1388,7 +1385,7 @@ * stable_node parameter itself will be freed from * under us if it returns NULL. */ - _tree_page = get_ksm_page(dup, false); + _tree_page = get_ksm_page(dup, GET_KSM_PAGE_NOLOCK); if (!_tree_page) continue; nr += 1; @@ -1511,7 +1508,7 @@ if (!is_stable_node_chain(stable_node)) { if (is_page_sharing_candidate(stable_node)) { *_stable_node_dup = stable_node; - return get_ksm_page(stable_node, false); + return get_ksm_page(stable_node, GET_KSM_PAGE_NOLOCK); } /* * _stable_node_dup set to NULL means the stable_node @@ -1613,10 +1610,11 @@ * continue. All KSM pages belonging to the * stable_node dups in a stable_node chain * have the same content and they're - * wrprotected at all times. Any will work + * write protected at all times. Any will work * fine to continue the walk. */ - tree_page = get_ksm_page(stable_node_any, false); + tree_page = get_ksm_page(stable_node_any, + GET_KSM_PAGE_NOLOCK); } VM_BUG_ON(!stable_node_dup ^ !!stable_node_any); if (!tree_page) { @@ -1676,7 +1674,12 @@ * It would be more elegant to return stable_node * than kpage, but that involves more changes. */ - tree_page = get_ksm_page(stable_node_dup, true); + tree_page = get_ksm_page(stable_node_dup, + GET_KSM_PAGE_TRYLOCK); + + if (PTR_ERR(tree_page) == -EBUSY) + return ERR_PTR(-EBUSY); + if (unlikely(!tree_page)) /* * The tree may have been rebalanced, @@ -1842,10 +1845,11 @@ * continue. All KSM pages belonging to the * stable_node dups in a stable_node chain * have the same content and they're - * wrprotected at all times. Any will work + * write protected at all times. Any will work * fine to continue the walk. 
*/ - tree_page = get_ksm_page(stable_node_any, false); + tree_page = get_ksm_page(stable_node_any, + GET_KSM_PAGE_NOLOCK); } VM_BUG_ON(!stable_node_dup ^ !!stable_node_any); if (!tree_page) { @@ -1946,7 +1950,7 @@ * Don't substitute a ksm page for a forked page. */ if (page == tree_page) { - put_page(tree_page); + put_user_page(tree_page); return NULL; } @@ -1954,10 +1958,10 @@ parent = *new; if (ret < 0) { - put_page(tree_page); + put_user_page(tree_page); new = &parent->rb_left; } else if (ret > 0) { - put_page(tree_page); + put_user_page(tree_page); new = &parent->rb_right; } else if (!ksm_merge_across_nodes && page_to_nid(tree_page) != nid) { @@ -1966,7 +1970,7 @@ * it will be flushed out and put in the right unstable * tree next time: only merge with it when across_nodes. */ - put_page(tree_page); + put_user_page(tree_page); return NULL; } else { *tree_pagep = tree_page; @@ -1999,7 +2003,7 @@ * duplicate. page_migration could break later if rmap breaks, * so we can as well crash here. We really need to check for * rmap_hlist_len == STABLE_NODE_CHAIN, but we can as well check - * for other negative values as an undeflow if detected here + * for other negative values as an underflow if detected here * for the first time (and not when decreasing rmap_hlist_len) * would be sign of memory corruption in the stable_node. */ @@ -2071,6 +2075,9 @@ remove_rmap_item_from_tree(rmap_item); if (kpage) { + if (PTR_ERR(kpage) == -EBUSY) + return; + err = try_to_merge_with_ksm_page(rmap_item, page, kpage); if (!err) { /* @@ -2105,7 +2112,7 @@ if (ksm_use_zero_pages && (checksum == zero_checksum)) { struct vm_area_struct *vma; - down_read(&mm->mmap_sem); + mmap_read_lock(mm); vma = find_mergeable_vma(mm, rmap_item->address); if (vma) { err = try_to_merge_one_page(vma, page, @@ -2117,7 +2124,7 @@ */ err = 0; } - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); /* * In case of failure, the page was not really empty, so we * need to continue. Otherwise we're done. @@ -2144,7 +2151,7 @@ */ split = PageTransCompound(page) && compound_head(page) == compound_head(tree_page); - put_page(tree_page); + put_user_page(tree_page); if (kpage) { /* * The pages were successfully merged: insert new @@ -2253,7 +2260,8 @@ list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) { - page = get_ksm_page(stable_node, false); + page = get_ksm_page(stable_node, + GET_KSM_PAGE_NOLOCK); if (page) put_page(page); cond_resched(); @@ -2279,7 +2287,7 @@ } mm = slot->mm; - down_read(&mm->mmap_sem); + mmap_read_lock(mm); if (ksm_test_exit(mm)) vma = NULL; else @@ -2312,11 +2320,11 @@ &rmap_item->rmap_list; ksm_scan.address += PAGE_SIZE; } else - put_page(*page); - up_read(&mm->mmap_sem); + put_user_page(*page); + mmap_read_unlock(mm); return rmap_item; } - put_page(*page); + put_user_page(*page); ksm_scan.address += PAGE_SIZE; cond_resched(); } @@ -2337,13 +2345,13 @@ struct mm_slot, mm_list); if (ksm_scan.address == 0) { /* - * We've completed a full scan of all vmas, holding mmap_sem + * We've completed a full scan of all vmas, holding mmap_lock * throughout, and found no VM_MERGEABLE: so do the same as * __ksm_exit does to remove this mm from all our lists now. * This applies either when cleaning up after __ksm_exit * (but beware: we can reach here even before __ksm_exit), * or when all VM_MERGEABLE areas have been unmapped (and - * mmap_sem then protects against race with MADV_MERGEABLE). + * mmap_lock then protects against race with MADV_MERGEABLE). 
*/ hash_del(&slot->link); list_del(&slot->mm_list); @@ -2351,12 +2359,12 @@ free_mm_slot(slot); clear_bit(MMF_VM_MERGEABLE, &mm->flags); - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); mmdrop(mm); } else { - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); /* - * up_read(&mm->mmap_sem) first because after + * mmap_read_unlock(mm) first because after * spin_unlock(&ksm_mmlist_lock) run, the "mm" may * already have been freed under us by __ksm_exit() * because the "mm_slot" is still hashed and @@ -2381,7 +2389,7 @@ static void ksm_do_scan(unsigned int scan_npages) { struct rmap_item *rmap_item; - struct page *uninitialized_var(page); + struct page *page; while (scan_npages-- && likely(!freezing(current))) { cond_resched(); @@ -2400,6 +2408,8 @@ static int ksm_scan_thread(void *nothing) { + unsigned int sleep_ms; + set_freezable(); set_user_nice(current, 5); @@ -2413,8 +2423,10 @@ try_to_freeze(); if (ksmd_should_run()) { - schedule_timeout_interruptible( - msecs_to_jiffies(ksm_thread_sleep_millisecs)); + sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs); + wait_event_interruptible_timeout(ksm_iter_wait, + sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs), + msecs_to_jiffies(sleep_ms)); } else { wait_event_freezable(ksm_thread_wait, ksmd_should_run() || kthread_should_stop()); } @@ -2476,6 +2488,7 @@ return 0; } +EXPORT_SYMBOL_GPL(ksm_madvise); int __ksm_enter(struct mm_struct *mm) { @@ -2525,7 +2538,7 @@ * This process is exiting: if it's straightforward (as is the * case when ksmd was never running), free mm_slot immediately. * But if it's at the cursor or has rmap_items linked to it, use - * mmap_sem to synchronize with any break_cows before pagetables + * mmap_lock to synchronize with any break_cows before pagetables * are freed, and leave the mm_slot on the list for ksmd to free. * Beware: ksm may already have noticed it exiting and freed the slot. */ @@ -2549,8 +2562,8 @@ clear_bit(MMF_VM_MERGEABLE, &mm->flags); mmdrop(mm); } else if (mm_slot) { - down_write(&mm->mmap_sem); - up_write(&mm->mmap_sem); + mmap_write_lock(mm); + mmap_write_unlock(mm); } } @@ -2574,6 +2587,10 @@ return page; /* let do_swap_page report the error */ new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); + if (new_page && mem_cgroup_charge(new_page, vma->vm_mm, GFP_KERNEL)) { + put_page(new_page); + new_page = NULL; + } if (new_page) { copy_user_highpage(new_page, page, address, vma); @@ -2609,7 +2626,13 @@ struct vm_area_struct *vma; cond_resched(); - anon_vma_lock_read(anon_vma); + if (!anon_vma_trylock_read(anon_vma)) { + if (rwc->try_lock) { + rwc->contended = true; + return; + } + anon_vma_lock_read(anon_vma); + } anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { unsigned long addr; @@ -2785,8 +2808,7 @@ */ ksm_check_stable_tree(mn->start_pfn, mn->start_pfn + mn->nr_pages); - /* fallthrough */ - + fallthrough; case MEM_CANCEL_OFFLINE: mutex_lock(&ksm_thread_mutex); ksm_run &= ~KSM_RUN_OFFLINE; @@ -2833,6 +2855,7 @@ return -EINVAL; ksm_thread_sleep_millisecs = msecs; + wake_up_interruptible(&ksm_iter_wait); return count; }
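The calc_checksum() hunk swaps jhash2() for xxhash(). KSM uses this checksum only to notice that a page's contents have stayed stable between scan passes; any candidate merge is still confirmed by a full byte comparison (pages_identical(), which this patch deletes from ksm.c, presumably because upstream moved it into common mm code), so a faster non-cryptographic hash is safe here. Below is a minimal userspace sketch of the same call shape, assuming libxxhash is installed (build with: cc xxhash_demo.c -lxxhash); the demo names and the fixed PAGE_SIZE are illustrative, not kernel code.

/*
 * Userspace analogue of KSM's calc_checksum() after this patch: hash a
 * whole page with xxHash. On 64-bit kernels the in-kernel call
 * xxhash(addr, PAGE_SIZE, 0) resolves to the 64-bit variant, so
 * XXH64() is the closest userspace equivalent.
 */
#include <stdio.h>
#include <string.h>
#include <xxhash.h>

#define PAGE_SIZE 4096	/* illustrative; the kernel uses its real page size */

static unsigned long long calc_checksum(const void *page)
{
	/* Collisions are harmless: KSM re-verifies with a full memcmp. */
	return XXH64(page, PAGE_SIZE, 0);
}

int main(void)
{
	static char page_a[PAGE_SIZE], page_b[PAGE_SIZE];

	memset(page_a, 0x5a, sizeof(page_a));
	memset(page_b, 0x5a, sizeof(page_b));
	page_b[123] ^= 1;	/* one-byte difference, different checksum */

	printf("page_a: %016llx\n", calc_checksum(page_a));
	printf("page_b: %016llx\n", calc_checksum(page_b));
	return 0;
}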
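The new ksm_iter_wait queue changes how ksmd sleeps between scan batches: instead of a plain schedule_timeout_interruptible() that always slept out the full old interval, ksm_scan_thread() now waits on ksm_iter_wait with that timeout, and the sleep_millisecs sysfs store calls wake_up_interruptible(&ksm_iter_wait), so a newly written (for example much shorter) interval takes effect immediately. Below is a pthread condition-variable sketch of the same wake-on-reconfigure pattern, assuming nothing beyond POSIX (build with: cc sleep_demo.c -lpthread); all names are illustrative, not kernel API.

/*
 * Userspace sketch of the ksm_iter_wait pattern: the worker sleeps for
 * the configured interval, but a writer that changes the interval
 * signals the condvar so the new value takes effect immediately.
 */
#include <pthread.h>
#include <stdio.h>
#include <time.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t iter_wait = PTHREAD_COND_INITIALIZER;
static unsigned int sleep_ms = 2000;	/* like ksm_thread_sleep_millisecs */

static void *scan_thread(void *unused)
{
	(void)unused;
	for (int pass = 0; pass < 3; pass++) {
		/* the real ksmd would run ksm_do_scan() here */
		pthread_mutex_lock(&lock);
		unsigned int seen = sleep_ms;	/* like READ_ONCE() */
		struct timespec deadline;
		clock_gettime(CLOCK_REALTIME, &deadline);
		deadline.tv_sec += seen / 1000;
		deadline.tv_nsec += (seen % 1000) * 1000000L;
		if (deadline.tv_nsec >= 1000000000L) {
			deadline.tv_sec++;
			deadline.tv_nsec -= 1000000000L;
		}
		/* Wake early only if the interval changed, mirroring
		 * wait_event_interruptible_timeout(ksm_iter_wait, ...). */
		while (sleep_ms == seen &&
		       pthread_cond_timedwait(&iter_wait, &lock,
					      &deadline) == 0)
			;
		pthread_mutex_unlock(&lock);
		printf("scan pass %d finished\n", pass);
	}
	return NULL;
}

int main(void)
{
	pthread_t tid;
	pthread_create(&tid, NULL, scan_thread, NULL);

	struct timespec half = { .tv_sec = 0, .tv_nsec = 500000000L };
	nanosleep(&half, NULL);	/* let the worker start its long sleep */

	/* sysfs store analogue: update the interval and wake the sleeper */
	pthread_mutex_lock(&lock);
	sleep_ms = 100;
	pthread_cond_broadcast(&iter_wait);
	pthread_mutex_unlock(&lock);

	pthread_join(tid, NULL);
	return 0;
}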
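Taken together, this file implements the kernel side of madvise(MADV_MERGEABLE), and the patch additionally exports ksm_madvise() to GPL modules. Below is a small runnable demo of that userspace interface; it assumes a kernel built with CONFIG_KSM and ksmd enabled (echo 1 > /sys/kernel/mm/ksm/run), and the 30-second sleep is just an arbitrary window for ksmd to complete a few scan passes.

/*
 * Mark a region MADV_MERGEABLE and let ksmd deduplicate the identical
 * pages. With KSM running, /sys/kernel/mm/ksm/pages_sharing should
 * grow while this program sleeps.
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
	size_t page = (size_t)sysconf(_SC_PAGESIZE);
	size_t len = 256 * page;	/* 256 identical pages */

	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	memset(buf, 0x42, len);		/* make every page identical */

	if (madvise(buf, len, MADV_MERGEABLE)) {
		perror("madvise(MADV_MERGEABLE)");	/* EINVAL if !CONFIG_KSM */
		return 1;
	}

	puts("region marked mergeable; sleeping while ksmd scans...");
	sleep(30);

	/* Writing to a merged page forces the copy-on-write break that
	 * break_ksm() in this file handles. */
	buf[0] = 0x43;
	puts("wrote one byte; KSM should have broken COW for that page");
	return 0;
}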