From ea08eeccae9297f7aabd2ef7f0c2517ac4549acc Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Tue, 20 Feb 2024 01:18:26 +0000
Subject: [PATCH] mm/ksm: sync with upstream KSM updates
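
This appears to squash a number of KSM updates from upstream kernels
into this tree:

- Add the SPDX license identifier and drop the free-form license
  notice from the header comment.
- Compute the page checksum with xxhash() instead of jhash2():

	checksum = jhash2(addr, PAGE_SIZE / 4, 17);	/* before */
	checksum = xxhash(addr, PAGE_SIZE, 0);		/* after  */

- Convert mmap_sem locking to the mmap_lock API:
  down_read(&mm->mmap_sem) becomes mmap_read_lock(mm), up_read()
  becomes mmap_read_unlock(), and the write lock/unlock pair in
  __ksm_exit() is converted likewise.
- Pass a struct mmu_notifier_range to the mmu_notifier invalidate
  calls instead of separate mmun_start/mmun_end addresses.
- Replace get_ksm_page()'s "bool lock_it" argument with enum
  get_ksm_page_flags, adding a GET_KSM_PAGE_TRYLOCK mode that returns
  ERR_PTR(-EBUSY) when the page lock cannot be taken immediately;
  stable_tree_search() and cmp_and_merge_page() now check for -EBUSY.
- Make ksmd react to sleep_millisecs updates without waiting out the
  old period, by sleeping on the new ksm_iter_wait queue that the
  sysfs store handler wakes.
- Use put_user_page() for pages obtained via get_user_pages()-style
  lookups, drop ksm.c's private memcmp_pages()/pages_identical()
  definitions, charge newly allocated copy pages with
  mem_cgroup_charge(), try anon_vma_trylock_read() first in
  rmap_walk_ksm() and honour rwc->try_lock, use NUMA_NO_NODE and the
  fallthrough keyword, drop uninitialized_var(), and export
  ksm_madvise() for modular users.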
---
kernel/mm/ksm.c | 211 +++++++++++++++++++++++++++++-----------------------
1 file changed, 117 insertions(+), 94 deletions(-)
diff --git a/kernel/mm/ksm.c b/kernel/mm/ksm.c
index 87a541a..2695ddb 100644
--- a/kernel/mm/ksm.c
+++ b/kernel/mm/ksm.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Memory merging support.
*
@@ -10,8 +11,6 @@
* Andrea Arcangeli
* Chris Wright
* Hugh Dickins
- *
- * This work is licensed under the terms of the GNU GPL, version 2.
*/
#include <linux/errno.h>
@@ -25,7 +24,7 @@
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/spinlock.h>
-#include <linux/jhash.h>
+#include <linux/xxhash.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/wait.h>
@@ -82,7 +81,7 @@
* different KSM page copy of that content
*
* Internally, the regular nodes, "dups" and "chains" are represented
- * using the same :c:type:`struct stable_node` structure.
+ * using the same struct stable_node structure.
*
* In addition to the stable tree, KSM uses a second data structure called the
* unstable tree: this tree holds pointers to pages which have been found to
@@ -296,6 +295,7 @@
static void wait_while_offlining(void);
static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
+static DECLARE_WAIT_QUEUE_HEAD(ksm_iter_wait);
static DEFINE_MUTEX(ksm_thread_mutex);
static DEFINE_SPINLOCK(ksm_mmlist_lock);
@@ -442,7 +442,7 @@
/*
* ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
* page tables after it has passed through ksm_exit() - which, if necessary,
- * takes mmap_sem briefly to serialize against them. ksm_exit() does not set
+ * takes mmap_lock briefly to serialize against them. ksm_exit() does not set
* a special flag: they can just back out as soon as mm_users goes to zero.
* ksm_test_exit() is used throughout to make this test for exit: in some
* places for correctness, in some places just to avoid unnecessary work.
@@ -455,7 +455,7 @@
/*
* We use break_ksm to break COW on a ksm page: it's a stripped down
*
- * if (get_user_pages(addr, 1, 1, 1, &page, NULL) == 1)
+ * if (get_user_pages(addr, 1, FOLL_WRITE, &page, NULL) == 1)
* put_page(page);
*
* but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
@@ -480,10 +480,11 @@
break;
if (PageKsm(page))
ret = handle_mm_fault(vma, addr,
- FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE);
+ FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE,
+ NULL);
else
ret = VM_FAULT_WRITE;
- put_page(page);
+ put_user_page(page);
} while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
/*
* We must loop because handle_mm_fault() may back out if there's
@@ -542,11 +543,11 @@
*/
put_anon_vma(rmap_item->anon_vma);
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
vma = find_mergeable_vma(mm, addr);
if (vma)
break_ksm(vma, addr);
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
}
static struct page *get_mergeable_page(struct rmap_item *rmap_item)
@@ -556,7 +557,7 @@
struct vm_area_struct *vma;
struct page *page;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
vma = find_mergeable_vma(mm, addr);
if (!vma)
goto out;
@@ -568,11 +569,11 @@
flush_anon_page(vma, page, addr);
flush_dcache_page(page);
} else {
- put_page(page);
+ put_user_page(page);
out:
page = NULL;
}
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
return page;
}
@@ -597,7 +598,7 @@
chain->chain_prune_time = jiffies;
chain->rmap_hlist_len = STABLE_NODE_CHAIN;
#if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA)
- chain->nid = -1; /* debug */
+ chain->nid = NUMA_NO_NODE; /* debug */
#endif
ksm_stable_node_chains++;
@@ -612,7 +613,7 @@
* Move the old stable node to the second dimension
* queued in the hlist_dup. The invariant is that all
* dup stable_nodes in the chain->hlist point to pages
- * that are wrprotected and have the exact same
+ * that are write protected and have the exact same
* content.
*/
stable_node_chain_add_dup(dup, chain);
@@ -666,6 +667,12 @@
free_stable_node(stable_node);
}
+enum get_ksm_page_flags {
+ GET_KSM_PAGE_NOLOCK,
+ GET_KSM_PAGE_LOCK,
+ GET_KSM_PAGE_TRYLOCK
+};
+
/*
* get_ksm_page: checks if the page indicated by the stable node
* is still its ksm page, despite having held no reference to it.
@@ -685,7 +692,8 @@
* a page to put something that might look like our key in page->mapping.
* is on its way to being freed; but it is an anomaly to bear in mind.
*/
-static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it)
+static struct page *get_ksm_page(struct stable_node *stable_node,
+ enum get_ksm_page_flags flags)
{
struct page *page;
void *expected_mapping;
@@ -705,8 +713,9 @@
* case this node is no longer referenced, and should be freed;
* however, it might mean that the page is under page_ref_freeze().
* The __remove_mapping() case is easy, again the node is now stale;
- * but if page is swapcache in migrate_page_move_mapping(), it might
- * still be our page, in which case it's essential to keep the node.
+ * the same is in reuse_ksm_page() case; but if page is swapcache
+ * in migrate_page_move_mapping(), it might still be our page,
+ * in which case it's essential to keep the node.
*/
while (!get_page_unless_zero(page)) {
/*
@@ -727,8 +736,15 @@
goto stale;
}
- if (lock_it) {
+ if (flags == GET_KSM_PAGE_TRYLOCK) {
+ if (!trylock_page(page)) {
+ put_page(page);
+ return ERR_PTR(-EBUSY);
+ }
+ } else if (flags == GET_KSM_PAGE_LOCK)
lock_page(page);
+
+ if (flags != GET_KSM_PAGE_NOLOCK) {
if (READ_ONCE(page->mapping) != expected_mapping) {
unlock_page(page);
put_page(page);
@@ -762,7 +778,7 @@
struct page *page;
stable_node = rmap_item->head;
- page = get_ksm_page(stable_node, true);
+ page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
if (!page)
goto out;
@@ -817,7 +833,7 @@
* Though it's very tempting to unmerge rmap_items from stable tree rather
* than check every pte of a given vma, the locking doesn't quite work for
* that - an rmap_item is assigned to the stable tree after inserting ksm
- * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing
+ * page and upping mmap_lock. Nor does it fit with the way we skip dup'ing
* rmap_items from parent to child at fork time (so as not to waste time
* if exit comes before the next scan reaches it).
*
@@ -863,7 +879,7 @@
struct page *page;
int err;
- page = get_ksm_page(stable_node, true);
+ page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
if (!page) {
/*
* get_ksm_page did remove_node_from_stable_tree itself.
@@ -962,7 +978,7 @@
for (mm_slot = ksm_scan.mm_slot;
mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
mm = mm_slot->mm;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
if (ksm_test_exit(mm))
break;
@@ -975,7 +991,7 @@
}
remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list);
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
spin_lock(&ksm_mmlist_lock);
ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
@@ -998,7 +1014,7 @@
return 0;
error:
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
spin_lock(&ksm_mmlist_lock);
ksm_scan.mm_slot = &ksm_mm_head;
spin_unlock(&ksm_mmlist_lock);
@@ -1010,27 +1026,9 @@
{
u32 checksum;
void *addr = kmap_atomic(page);
- checksum = jhash2(addr, PAGE_SIZE / 4, 17);
+ checksum = xxhash(addr, PAGE_SIZE, 0);
kunmap_atomic(addr);
return checksum;
-}
-
-static int memcmp_pages(struct page *page1, struct page *page2)
-{
- char *addr1, *addr2;
- int ret;
-
- addr1 = kmap_atomic(page1);
- addr2 = kmap_atomic(page2);
- ret = memcmp(addr1, addr2, PAGE_SIZE);
- kunmap_atomic(addr2);
- kunmap_atomic(addr1);
- return ret;
-}
-
-static inline int pages_identical(struct page *page1, struct page *page2)
-{
- return !memcmp_pages(page1, page2);
}
static int write_protect_page(struct vm_area_struct *vma, struct page *page,
@@ -1043,8 +1041,7 @@
};
int swapped;
int err = -EFAULT;
- unsigned long mmun_start; /* For mmu_notifiers */
- unsigned long mmun_end; /* For mmu_notifiers */
+ struct mmu_notifier_range range;
pvmw.address = page_address_in_vma(page, vma);
if (pvmw.address == -EFAULT)
@@ -1052,9 +1049,10 @@
BUG_ON(PageTransCompound(page));
- mmun_start = pvmw.address;
- mmun_end = pvmw.address + PAGE_SIZE;
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
+ pvmw.address,
+ pvmw.address + PAGE_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
if (!page_vma_mapped_walk(&pvmw))
goto out_mn;
@@ -1106,7 +1104,7 @@
out_unlock:
page_vma_mapped_walk_done(&pvmw);
out_mn:
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ mmu_notifier_invalidate_range_end(&range);
out:
return err;
}
@@ -1130,8 +1128,7 @@
spinlock_t *ptl;
unsigned long addr;
int err = -EFAULT;
- unsigned long mmun_start; /* For mmu_notifiers */
- unsigned long mmun_end; /* For mmu_notifiers */
+ struct mmu_notifier_range range;
addr = page_address_in_vma(page, vma);
if (addr == -EFAULT)
@@ -1141,9 +1138,9 @@
if (!pmd)
goto out;
- mmun_start = addr;
- mmun_end = addr + PAGE_SIZE;
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
+ addr + PAGE_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
if (!pte_same(*ptep, orig_pte)) {
@@ -1153,7 +1150,7 @@
/*
* No need to check ksm_use_zero_pages here: we can only have a
- * zero_page here if ksm_use_zero_pages was enabled alreaady.
+ * zero_page here if ksm_use_zero_pages was enabled already.
*/
if (!is_zero_pfn(page_to_pfn(kpage))) {
get_page(kpage);
@@ -1189,7 +1186,7 @@
pte_unmap_unlock(ptep, ptl);
err = 0;
out_mn:
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ mmu_notifier_invalidate_range_end(&range);
out:
return err;
}
@@ -1285,7 +1282,7 @@
struct vm_area_struct *vma;
int err = -EFAULT;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
vma = find_mergeable_vma(mm, rmap_item->address);
if (!vma)
goto out;
@@ -1297,11 +1294,11 @@
/* Unstable nid is in union with stable anon_vma: remove first */
remove_rmap_item_from_tree(rmap_item);
- /* Must get reference to anon_vma while still holding mmap_sem */
+ /* Must get reference to anon_vma while still holding mmap_lock */
rmap_item->anon_vma = vma->anon_vma;
get_anon_vma(vma->anon_vma);
out:
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
return err;
}
@@ -1388,7 +1385,7 @@
* stable_node parameter itself will be freed from
* under us if it returns NULL.
*/
- _tree_page = get_ksm_page(dup, false);
+ _tree_page = get_ksm_page(dup, GET_KSM_PAGE_NOLOCK);
if (!_tree_page)
continue;
nr += 1;
@@ -1511,7 +1508,7 @@
if (!is_stable_node_chain(stable_node)) {
if (is_page_sharing_candidate(stable_node)) {
*_stable_node_dup = stable_node;
- return get_ksm_page(stable_node, false);
+ return get_ksm_page(stable_node, GET_KSM_PAGE_NOLOCK);
}
/*
* _stable_node_dup set to NULL means the stable_node
@@ -1613,10 +1610,11 @@
* continue. All KSM pages belonging to the
* stable_node dups in a stable_node chain
* have the same content and they're
- * wrprotected at all times. Any will work
+ * write protected at all times. Any will work
* fine to continue the walk.
*/
- tree_page = get_ksm_page(stable_node_any, false);
+ tree_page = get_ksm_page(stable_node_any,
+ GET_KSM_PAGE_NOLOCK);
}
VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
if (!tree_page) {
@@ -1676,7 +1674,12 @@
* It would be more elegant to return stable_node
* than kpage, but that involves more changes.
*/
- tree_page = get_ksm_page(stable_node_dup, true);
+ tree_page = get_ksm_page(stable_node_dup,
+ GET_KSM_PAGE_TRYLOCK);
+
+ if (PTR_ERR(tree_page) == -EBUSY)
+ return ERR_PTR(-EBUSY);
+
if (unlikely(!tree_page))
/*
* The tree may have been rebalanced,
@@ -1842,10 +1845,11 @@
* continue. All KSM pages belonging to the
* stable_node dups in a stable_node chain
* have the same content and they're
- * wrprotected at all times. Any will work
+ * write protected at all times. Any will work
* fine to continue the walk.
*/
- tree_page = get_ksm_page(stable_node_any, false);
+ tree_page = get_ksm_page(stable_node_any,
+ GET_KSM_PAGE_NOLOCK);
}
VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
if (!tree_page) {
@@ -1946,7 +1950,7 @@
* Don't substitute a ksm page for a forked page.
*/
if (page == tree_page) {
- put_page(tree_page);
+ put_user_page(tree_page);
return NULL;
}
@@ -1954,10 +1958,10 @@
parent = *new;
if (ret < 0) {
- put_page(tree_page);
+ put_user_page(tree_page);
new = &parent->rb_left;
} else if (ret > 0) {
- put_page(tree_page);
+ put_user_page(tree_page);
new = &parent->rb_right;
} else if (!ksm_merge_across_nodes &&
page_to_nid(tree_page) != nid) {
@@ -1966,7 +1970,7 @@
* it will be flushed out and put in the right unstable
* tree next time: only merge with it when across_nodes.
*/
- put_page(tree_page);
+ put_user_page(tree_page);
return NULL;
} else {
*tree_pagep = tree_page;
@@ -1999,7 +2003,7 @@
* duplicate. page_migration could break later if rmap breaks,
* so we can as well crash here. We really need to check for
* rmap_hlist_len == STABLE_NODE_CHAIN, but we can as well check
- * for other negative values as an undeflow if detected here
+ * for other negative values as an underflow if detected here
* for the first time (and not when decreasing rmap_hlist_len)
* would be sign of memory corruption in the stable_node.
*/
@@ -2071,6 +2075,9 @@
remove_rmap_item_from_tree(rmap_item);
if (kpage) {
+ if (PTR_ERR(kpage) == -EBUSY)
+ return;
+
err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
if (!err) {
/*
@@ -2105,7 +2112,7 @@
if (ksm_use_zero_pages && (checksum == zero_checksum)) {
struct vm_area_struct *vma;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
vma = find_mergeable_vma(mm, rmap_item->address);
if (vma) {
err = try_to_merge_one_page(vma, page,
@@ -2117,7 +2124,7 @@
*/
err = 0;
}
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
/*
* In case of failure, the page was not really empty, so we
* need to continue. Otherwise we're done.
@@ -2144,7 +2151,7 @@
*/
split = PageTransCompound(page)
&& compound_head(page) == compound_head(tree_page);
- put_page(tree_page);
+ put_user_page(tree_page);
if (kpage) {
/*
* The pages were successfully merged: insert new
@@ -2253,7 +2260,8 @@
list_for_each_entry_safe(stable_node, next,
&migrate_nodes, list) {
- page = get_ksm_page(stable_node, false);
+ page = get_ksm_page(stable_node,
+ GET_KSM_PAGE_NOLOCK);
if (page)
put_page(page);
cond_resched();
@@ -2279,7 +2287,7 @@
}
mm = slot->mm;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
if (ksm_test_exit(mm))
vma = NULL;
else
@@ -2312,11 +2320,11 @@
&rmap_item->rmap_list;
ksm_scan.address += PAGE_SIZE;
} else
- put_page(*page);
- up_read(&mm->mmap_sem);
+ put_user_page(*page);
+ mmap_read_unlock(mm);
return rmap_item;
}
- put_page(*page);
+ put_user_page(*page);
ksm_scan.address += PAGE_SIZE;
cond_resched();
}
@@ -2337,13 +2345,13 @@
struct mm_slot, mm_list);
if (ksm_scan.address == 0) {
/*
- * We've completed a full scan of all vmas, holding mmap_sem
+ * We've completed a full scan of all vmas, holding mmap_lock
* throughout, and found no VM_MERGEABLE: so do the same as
* __ksm_exit does to remove this mm from all our lists now.
* This applies either when cleaning up after __ksm_exit
* (but beware: we can reach here even before __ksm_exit),
* or when all VM_MERGEABLE areas have been unmapped (and
- * mmap_sem then protects against race with MADV_MERGEABLE).
+ * mmap_lock then protects against race with MADV_MERGEABLE).
*/
hash_del(&slot->link);
list_del(&slot->mm_list);
@@ -2351,12 +2359,12 @@
free_mm_slot(slot);
clear_bit(MMF_VM_MERGEABLE, &mm->flags);
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
mmdrop(mm);
} else {
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
/*
- * up_read(&mm->mmap_sem) first because after
+ * mmap_read_unlock(mm) first because after
* spin_unlock(&ksm_mmlist_lock) run, the "mm" may
* already have been freed under us by __ksm_exit()
* because the "mm_slot" is still hashed and
@@ -2381,7 +2389,7 @@
static void ksm_do_scan(unsigned int scan_npages)
{
struct rmap_item *rmap_item;
- struct page *uninitialized_var(page);
+ struct page *page;
while (scan_npages-- && likely(!freezing(current))) {
cond_resched();
@@ -2400,6 +2408,8 @@
static int ksm_scan_thread(void *nothing)
{
+ unsigned int sleep_ms;
+
set_freezable();
set_user_nice(current, 5);
@@ -2413,8 +2423,10 @@
try_to_freeze();
if (ksmd_should_run()) {
- schedule_timeout_interruptible(
- msecs_to_jiffies(ksm_thread_sleep_millisecs));
+ sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs);
+ wait_event_interruptible_timeout(ksm_iter_wait,
+ sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs),
+ msecs_to_jiffies(sleep_ms));
} else {
wait_event_freezable(ksm_thread_wait,
ksmd_should_run() || kthread_should_stop());
@@ -2476,6 +2488,7 @@
return 0;
}
+EXPORT_SYMBOL_GPL(ksm_madvise);
int __ksm_enter(struct mm_struct *mm)
{
@@ -2525,7 +2538,7 @@
* This process is exiting: if it's straightforward (as is the
* case when ksmd was never running), free mm_slot immediately.
* But if it's at the cursor or has rmap_items linked to it, use
- * mmap_sem to synchronize with any break_cows before pagetables
+ * mmap_lock to synchronize with any break_cows before pagetables
* are freed, and leave the mm_slot on the list for ksmd to free.
* Beware: ksm may already have noticed it exiting and freed the slot.
*/
@@ -2549,8 +2562,8 @@
clear_bit(MMF_VM_MERGEABLE, &mm->flags);
mmdrop(mm);
} else if (mm_slot) {
- down_write(&mm->mmap_sem);
- up_write(&mm->mmap_sem);
+ mmap_write_lock(mm);
+ mmap_write_unlock(mm);
}
}
@@ -2574,6 +2587,10 @@
return page; /* let do_swap_page report the error */
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+ if (new_page && mem_cgroup_charge(new_page, vma->vm_mm, GFP_KERNEL)) {
+ put_page(new_page);
+ new_page = NULL;
+ }
if (new_page) {
copy_user_highpage(new_page, page, address, vma);
@@ -2609,7 +2626,13 @@
struct vm_area_struct *vma;
cond_resched();
- anon_vma_lock_read(anon_vma);
+ if (!anon_vma_trylock_read(anon_vma)) {
+ if (rwc->try_lock) {
+ rwc->contended = true;
+ return;
+ }
+ anon_vma_lock_read(anon_vma);
+ }
anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
0, ULONG_MAX) {
unsigned long addr;
@@ -2785,8 +2808,7 @@
*/
ksm_check_stable_tree(mn->start_pfn,
mn->start_pfn + mn->nr_pages);
- /* fallthrough */
-
+ fallthrough;
case MEM_CANCEL_OFFLINE:
mutex_lock(&ksm_thread_mutex);
ksm_run &= ~KSM_RUN_OFFLINE;
@@ -2833,6 +2855,7 @@
return -EINVAL;
ksm_thread_sleep_millisecs = msecs;
+ wake_up_interruptible(&ksm_iter_wait);
return count;
}
--
Gitblit v1.6.2