From 982b8cc116118b3463d3f332581945625722acd8 Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Tue, 09 Jan 2024 02:10:54 +0000
Subject: [PATCH] add poweroff command
---
kernel/mm/huge_memory.c | 907 ++++++++++++++++++++++++++++----------------------------
1 files changed, 450 insertions(+), 457 deletions(-)
diff --git a/kernel/mm/huge_memory.c b/kernel/mm/huge_memory.c
index f79002a..14bc799 100644
--- a/kernel/mm/huge_memory.c
+++ b/kernel/mm/huge_memory.c
@@ -1,8 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2009 Red Hat, Inc.
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -33,8 +31,9 @@
#include <linux/page_idle.h>
#include <linux/shmem_fs.h>
#include <linux/oom.h>
+#include <linux/numa.h>
#include <linux/page_owner.h>
-
+#include <trace/hooks/mm.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"
@@ -64,12 +63,26 @@
struct page *huge_zero_page __read_mostly;
unsigned long huge_zero_pfn __read_mostly = ~0UL;
-bool transparent_hugepage_enabled(struct vm_area_struct *vma)
+static inline bool file_thp_enabled(struct vm_area_struct *vma)
{
+ return transhuge_vma_enabled(vma, vma->vm_flags) && vma->vm_file &&
+ !inode_is_open_for_write(vma->vm_file->f_inode) &&
+ (vma->vm_flags & VM_EXEC);
+}
+
+bool transparent_hugepage_active(struct vm_area_struct *vma)
+{
+ /* The addr is used to check if the vma size fits */
+ unsigned long addr = (vma->vm_end & HPAGE_PMD_MASK) - HPAGE_PMD_SIZE;
+
+ if (!transhuge_vma_suitable(vma, addr))
+ return false;
if (vma_is_anonymous(vma))
return __transparent_hugepage_enabled(vma);
- if (vma_is_shmem(vma) && shmem_huge_enabled(vma))
- return __transparent_hugepage_enabled(vma);
+ if (vma_is_shmem(vma))
+ return shmem_huge_enabled(vma);
+ if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS))
+ return file_thp_enabled(vma);
return false;
}
@@ -302,34 +315,13 @@
static struct kobj_attribute hpage_pmd_size_attr =
__ATTR_RO(hpage_pmd_size);
-#ifdef CONFIG_DEBUG_VM
-static ssize_t debug_cow_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- return single_hugepage_flag_show(kobj, attr, buf,
- TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
-}
-static ssize_t debug_cow_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t count)
-{
- return single_hugepage_flag_store(kobj, attr, buf, count,
- TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
-}
-static struct kobj_attribute debug_cow_attr =
- __ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store);
-#endif /* CONFIG_DEBUG_VM */
-
static struct attribute *hugepage_attr[] = {
&enabled_attr.attr,
&defrag_attr.attr,
&use_zero_page_attr.attr,
&hpage_pmd_size_attr.attr,
-#if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE)
+#ifdef CONFIG_SHMEM
&shmem_enabled_attr.attr,
-#endif
-#ifdef CONFIG_DEBUG_VM
- &debug_cow_attr.attr,
#endif
NULL,
};
@@ -392,7 +384,11 @@
struct kobject *hugepage_kobj;
if (!has_transparent_hugepage()) {
- transparent_hugepage_flags = 0;
+ /*
+ * Hardware doesn't support hugepages, hence disable
+ * DAX PMD support.
+ */
+ transparent_hugepage_flags = 1 << TRANSPARENT_HUGEPAGE_NEVER_DAX;
return -EINVAL;
}
@@ -426,7 +422,7 @@
* where the extra memory used could hurt more than TLB overhead
* is likely to save. The admin can still enable it through /sys.
*/
- if (totalram_pages < (512 << (20 - PAGE_SHIFT))) {
+ if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) {
transparent_hugepage_flags = 0;
return 0;
}
@@ -487,11 +483,25 @@
return pmd;
}
-static inline struct list_head *page_deferred_list(struct page *page)
+#ifdef CONFIG_MEMCG
+static inline struct deferred_split *get_deferred_split_queue(struct page *page)
{
- /* ->lru in the tail pages is occupied by compound_head. */
- return &page[2].deferred_list;
+ struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;
+ struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
+
+ if (memcg)
+ return &memcg->deferred_split_queue;
+ else
+ return &pgdat->deferred_split_queue;
}
+#else
+static inline struct deferred_split *get_deferred_split_queue(struct page *page)
+{
+ struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
+
+ return &pgdat->deferred_split_queue;
+}
+#endif
void prep_transhuge_page(struct page *page)
{
@@ -503,6 +513,17 @@
INIT_LIST_HEAD(page_deferred_list(page));
set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
}
+
+bool is_transparent_hugepage(struct page *page)
+{
+ if (!PageCompound(page))
+ return false;
+
+ page = compound_head(page);
+ return is_huge_zero_page(page) ||
+ page[1].compound_dtor == TRANSHUGE_PAGE_DTOR;
+}
+EXPORT_SYMBOL_GPL(is_transparent_hugepage);
static unsigned long __thp_get_unmapped_area(struct file *filp,
unsigned long addr, unsigned long len,
@@ -561,20 +582,21 @@
struct page *page, gfp_t gfp)
{
struct vm_area_struct *vma = vmf->vma;
- struct mem_cgroup *memcg;
pgtable_t pgtable;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
vm_fault_t ret = 0;
VM_BUG_ON_PAGE(!PageCompound(page), page);
- if (mem_cgroup_try_charge_delay(page, vma->vm_mm, gfp, &memcg, true)) {
+ if (mem_cgroup_charge(page, vma->vm_mm, gfp)) {
put_page(page);
count_vm_event(THP_FAULT_FALLBACK);
+ count_vm_event(THP_FAULT_FALLBACK_CHARGE);
return VM_FAULT_FALLBACK;
}
+ cgroup_throttle_swaprate(page, gfp);
- pgtable = pte_alloc_one(vma->vm_mm, haddr);
+ pgtable = pte_alloc_one(vma->vm_mm);
if (unlikely(!pgtable)) {
ret = VM_FAULT_OOM;
goto release;
@@ -603,7 +625,6 @@
vm_fault_t ret2;
spin_unlock(vmf->ptl);
- mem_cgroup_cancel_charge(page, memcg, true);
put_page(page);
pte_free(vma->vm_mm, pgtable);
ret2 = handle_userfault(vmf, VM_UFFD_MISSING);
@@ -614,14 +635,14 @@
entry = mk_huge_pmd(page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
page_add_new_anon_rmap(page, vma, haddr, true);
- mem_cgroup_commit_charge(page, memcg, false, true);
- lru_cache_add_active_or_unevictable(page, vma);
+ lru_cache_add_inactive_or_unevictable(page, vma);
pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
mm_inc_nr_ptes(vma->vm_mm);
spin_unlock(vmf->ptl);
count_vm_event(THP_FAULT_ALLOC);
+ count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
}
return 0;
@@ -630,7 +651,6 @@
release:
if (pgtable)
pte_free(vma->vm_mm, pgtable);
- mem_cgroup_cancel_charge(page, memcg, true);
put_page(page);
return ret;
@@ -649,16 +669,25 @@
{
const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
+ /* Always do synchronous compaction */
if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
+
+ /* Kick kcompactd and fail quickly */
if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
+
+ /* Synchronous compaction if madvised, otherwise kick kcompactd */
if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
- return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
- __GFP_KSWAPD_RECLAIM);
+ return GFP_TRANSHUGE_LIGHT |
+ (vma_madvised ? __GFP_DIRECT_RECLAIM :
+ __GFP_KSWAPD_RECLAIM);
+
+ /* Only do synchronous compaction if madvised */
if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
- return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
- 0);
+ return GFP_TRANSHUGE_LIGHT |
+ (vma_madvised ? __GFP_DIRECT_RECLAIM : 0);
+
return GFP_TRANSHUGE_LIGHT;
}
@@ -686,7 +715,7 @@
struct page *page;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
- if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
+ if (!transhuge_vma_suitable(vma, haddr))
return VM_FAULT_FALLBACK;
if (unlikely(anon_vma_prepare(vma)))
return VM_FAULT_OOM;
@@ -698,7 +727,7 @@
pgtable_t pgtable;
struct page *zero_page;
vm_fault_t ret;
- pgtable = pte_alloc_one(vma->vm_mm, haddr);
+ pgtable = pte_alloc_one(vma->vm_mm);
if (unlikely(!pgtable))
return VM_FAULT_OOM;
zero_page = mm_get_huge_zero_page(vma->vm_mm);
@@ -787,11 +816,24 @@
pte_free(mm, pgtable);
}
-vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
+/**
+ * vmf_insert_pfn_pmd_prot - insert a pmd size pfn
+ * @vmf: Structure describing the fault
+ * @pfn: pfn to insert
+ * @pgprot: page protection to use
+ * @write: whether it's a write fault
+ *
+ * Insert a pmd size pfn. See vmf_insert_pfn() for additional info and
+ * also consult the vmf_insert_mixed_prot() documentation when
+ * @pgprot != @vmf->vma->vm_page_prot.
+ *
+ * Return: vm_fault_t value.
+ */
+vm_fault_t vmf_insert_pfn_pmd_prot(struct vm_fault *vmf, pfn_t pfn,
+ pgprot_t pgprot, bool write)
{
unsigned long addr = vmf->address & PMD_MASK;
struct vm_area_struct *vma = vmf->vma;
- pgprot_t pgprot = vma->vm_page_prot;
pgtable_t pgtable = NULL;
/*
@@ -809,7 +851,7 @@
return VM_FAULT_SIGBUS;
if (arch_needs_pgtable_deposit()) {
- pgtable = pte_alloc_one(vma->vm_mm, addr);
+ pgtable = pte_alloc_one(vma->vm_mm);
if (!pgtable)
return VM_FAULT_OOM;
}
@@ -819,7 +861,7 @@
insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable);
return VM_FAULT_NOPAGE;
}
-EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
+EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd_prot);
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
@@ -865,11 +907,24 @@
spin_unlock(ptl);
}
-vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
+/**
+ * vmf_insert_pfn_pud_prot - insert a pud size pfn
+ * @vmf: Structure describing the fault
+ * @pfn: pfn to insert
+ * @pgprot: page protection to use
+ * @write: whether it's a write fault
+ *
+ * Insert a pud size pfn. See vmf_insert_pfn() for additional info and
+ * also consult the vmf_insert_mixed_prot() documentation when
+ * @pgprot != @vmf->vma->vm_page_prot.
+ *
+ * Return: vm_fault_t value.
+ */
+vm_fault_t vmf_insert_pfn_pud_prot(struct vm_fault *vmf, pfn_t pfn,
+ pgprot_t pgprot, bool write)
{
unsigned long addr = vmf->address & PUD_MASK;
struct vm_area_struct *vma = vmf->vma;
- pgprot_t pgprot = vma->vm_page_prot;
/*
* If we had pud_special, we could avoid all these restrictions,
@@ -890,7 +945,7 @@
insert_pfn_pud(vma, addr, vmf->pud, pfn, pgprot, write);
return VM_FAULT_NOPAGE;
}
-EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
+EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud_prot);
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
@@ -907,11 +962,10 @@
}
struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmd, int flags)
+ pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
{
unsigned long pfn = pmd_pfn(*pmd);
struct mm_struct *mm = vma->vm_mm;
- struct dev_pagemap *pgmap;
struct page *page;
assert_spin_locked(pmd_lockptr(mm, pmd));
@@ -921,6 +975,11 @@
* not be in this function with `flags & FOLL_COW` set.
*/
WARN_ONCE(flags & FOLL_COW, "mm: In follow_devmap_pmd with FOLL_COW set");
+
+ /* FOLL_GET and FOLL_PIN are mutually exclusive. */
+ if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
+ (FOLL_PIN | FOLL_GET)))
+ return NULL;
if (flags & FOLL_WRITE && !pmd_write(*pmd))
return NULL;
@@ -937,23 +996,23 @@
* device mapped pages can only be returned if the
* caller will manage the page reference count.
*/
- if (!(flags & FOLL_GET))
+ if (!(flags & (FOLL_GET | FOLL_PIN)))
return ERR_PTR(-EEXIST);
pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
- pgmap = get_dev_pagemap(pfn, NULL);
- if (!pgmap)
+ *pgmap = get_dev_pagemap(pfn, *pgmap);
+ if (!*pgmap)
return ERR_PTR(-EFAULT);
page = pfn_to_page(pfn);
- get_page(page);
- put_dev_pagemap(pgmap);
+ if (!try_grab_page(page, flags))
+ page = ERR_PTR(-ENOMEM);
return page;
}
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
- struct vm_area_struct *vma)
+ struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
spinlock_t *dst_ptl, *src_ptl;
struct page *src_page;
@@ -962,10 +1021,10 @@
int ret = -ENOMEM;
/* Skip if can be re-fill on fault */
- if (!vma_is_anonymous(vma))
+ if (!vma_is_anonymous(dst_vma))
return 0;
- pgtable = pte_alloc_one(dst_mm, addr);
+ pgtable = pte_alloc_one(dst_mm);
if (unlikely(!pgtable))
goto out;
@@ -986,11 +1045,15 @@
pmd = swp_entry_to_pmd(entry);
if (pmd_swp_soft_dirty(*src_pmd))
pmd = pmd_swp_mksoft_dirty(pmd);
+ if (pmd_swp_uffd_wp(*src_pmd))
+ pmd = pmd_swp_mkuffd_wp(pmd);
set_pmd_at(src_mm, addr, src_pmd, pmd);
}
add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
mm_inc_nr_ptes(dst_mm);
pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
+ if (!userfaultfd_wp(dst_vma))
+ pmd = pmd_swp_clear_uffd_wp(pmd);
set_pmd_at(dst_mm, addr, dst_pmd, pmd);
ret = 0;
goto out_unlock;
@@ -1007,28 +1070,44 @@
* a page table.
*/
if (is_huge_zero_pmd(pmd)) {
- struct page *zero_page;
/*
* get_huge_zero_page() will never allocate a new page here,
* since we already have a zero page to copy. It just takes a
* reference.
*/
- zero_page = mm_get_huge_zero_page(dst_mm);
- set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
- zero_page);
- ret = 0;
- goto out_unlock;
+ mm_get_huge_zero_page(dst_mm);
+ goto out_zero_page;
}
src_page = pmd_page(pmd);
VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
+
+ /*
+ * If this page is a potentially pinned page, split and retry the fault
+ * with smaller page size. Normally this should not happen because the
+ * userspace should use MADV_DONTFORK upon pinned regions. This is a
+ * best effort that the pinned pages won't be replaced by another
+ * random page during the coming copy-on-write.
+ */
+ if (unlikely(is_cow_mapping(src_vma->vm_flags) &&
+ atomic_read(&src_mm->has_pinned) &&
+ page_maybe_dma_pinned(src_page))) {
+ pte_free(dst_mm, pgtable);
+ spin_unlock(src_ptl);
+ spin_unlock(dst_ptl);
+ __split_huge_pmd(src_vma, src_pmd, addr, false, NULL);
+ return -EAGAIN;
+ }
+
get_page(src_page);
page_dup_rmap(src_page, true);
add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+out_zero_page:
mm_inc_nr_ptes(dst_mm);
pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
-
pmdp_set_wrprotect(src_mm, addr, src_pmd);
+ if (!userfaultfd_wp(dst_vma))
+ pmd = pmd_clear_uffd_wp(pmd);
pmd = pmd_mkold(pmd_wrprotect(pmd));
set_pmd_at(dst_mm, addr, dst_pmd, pmd);
@@ -1055,16 +1134,20 @@
}
struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
- pud_t *pud, int flags)
+ pud_t *pud, int flags, struct dev_pagemap **pgmap)
{
unsigned long pfn = pud_pfn(*pud);
struct mm_struct *mm = vma->vm_mm;
- struct dev_pagemap *pgmap;
struct page *page;
assert_spin_locked(pud_lockptr(mm, pud));
if (flags & FOLL_WRITE && !pud_write(*pud))
+ return NULL;
+
+ /* FOLL_GET and FOLL_PIN are mutually exclusive. */
+ if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
+ (FOLL_PIN | FOLL_GET)))
return NULL;
if (pud_present(*pud) && pud_devmap(*pud))
@@ -1078,17 +1161,19 @@
/*
* device mapped pages can only be returned if the
* caller will manage the page reference count.
+ *
+ * At least one of FOLL_GET | FOLL_PIN must be set, so assert that here:
*/
- if (!(flags & FOLL_GET))
+ if (!(flags & (FOLL_GET | FOLL_PIN)))
return ERR_PTR(-EEXIST);
pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
- pgmap = get_dev_pagemap(pfn, NULL);
- if (!pgmap)
+ *pgmap = get_dev_pagemap(pfn, *pgmap);
+ if (!*pgmap)
return ERR_PTR(-EFAULT);
page = pfn_to_page(pfn);
- get_page(page);
- put_dev_pagemap(pgmap);
+ if (!try_grab_page(page, flags))
+ page = ERR_PTR(-ENOMEM);
return page;
}
@@ -1117,6 +1202,16 @@
*/
if (is_huge_zero_pud(pud)) {
/* No huge zero pud yet */
+ }
+
+ /* Please refer to comments in copy_huge_pmd() */
+ if (unlikely(is_cow_mapping(vma->vm_flags) &&
+ atomic_read(&src_mm->has_pinned) &&
+ page_maybe_dma_pinned(pud_page(pud)))) {
+ spin_unlock(src_ptl);
+ spin_unlock(dst_ptl);
+ __split_huge_pud(vma, src_pud, addr);
+ return -EAGAIN;
}
pudp_set_wrprotect(src_mm, addr, src_pud);
@@ -1173,274 +1268,73 @@
spin_unlock(vmf->ptl);
}
-static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
- pmd_t orig_pmd, struct page *page)
-{
- struct vm_area_struct *vma = vmf->vma;
- unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
- struct mem_cgroup *memcg;
- pgtable_t pgtable;
- pmd_t _pmd;
- int i;
- vm_fault_t ret = 0;
- struct page **pages;
- unsigned long mmun_start; /* For mmu_notifiers */
- unsigned long mmun_end; /* For mmu_notifiers */
-
- pages = kmalloc_array(HPAGE_PMD_NR, sizeof(struct page *),
- GFP_KERNEL);
- if (unlikely(!pages)) {
- ret |= VM_FAULT_OOM;
- goto out;
- }
-
- for (i = 0; i < HPAGE_PMD_NR; i++) {
- pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma,
- vmf->address, page_to_nid(page));
- if (unlikely(!pages[i] ||
- mem_cgroup_try_charge_delay(pages[i], vma->vm_mm,
- GFP_KERNEL, &memcg, false))) {
- if (pages[i])
- put_page(pages[i]);
- while (--i >= 0) {
- memcg = (void *)page_private(pages[i]);
- set_page_private(pages[i], 0);
- mem_cgroup_cancel_charge(pages[i], memcg,
- false);
- put_page(pages[i]);
- }
- kfree(pages);
- ret |= VM_FAULT_OOM;
- goto out;
- }
- set_page_private(pages[i], (unsigned long)memcg);
- }
-
- for (i = 0; i < HPAGE_PMD_NR; i++) {
- copy_user_highpage(pages[i], page + i,
- haddr + PAGE_SIZE * i, vma);
- __SetPageUptodate(pages[i]);
- cond_resched();
- }
-
- mmun_start = haddr;
- mmun_end = haddr + HPAGE_PMD_SIZE;
- mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
-
- vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
- if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
- goto out_free_pages;
- VM_BUG_ON_PAGE(!PageHead(page), page);
-
- /*
- * Leave pmd empty until pte is filled note we must notify here as
- * concurrent CPU thread might write to new page before the call to
- * mmu_notifier_invalidate_range_end() happens which can lead to a
- * device seeing memory write in different order than CPU.
- *
- * See Documentation/vm/mmu_notifier.rst
- */
- pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
-
- pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd);
- pmd_populate(vma->vm_mm, &_pmd, pgtable);
-
- for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
- pte_t entry;
- entry = mk_pte(pages[i], vma->vm_page_prot);
- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
- memcg = (void *)page_private(pages[i]);
- set_page_private(pages[i], 0);
- page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false);
- mem_cgroup_commit_charge(pages[i], memcg, false, false);
- lru_cache_add_active_or_unevictable(pages[i], vma);
- vmf->pte = pte_offset_map(&_pmd, haddr);
- VM_BUG_ON(!pte_none(*vmf->pte));
- set_pte_at(vma->vm_mm, haddr, vmf->pte, entry);
- pte_unmap(vmf->pte);
- }
- kfree(pages);
-
- smp_wmb(); /* make pte visible before pmd */
- pmd_populate(vma->vm_mm, vmf->pmd, pgtable);
- page_remove_rmap(page, true);
- spin_unlock(vmf->ptl);
-
- /*
- * No need to double call mmu_notifier->invalidate_range() callback as
- * the above pmdp_huge_clear_flush_notify() did already call it.
- */
- mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start,
- mmun_end);
-
- ret |= VM_FAULT_WRITE;
- put_page(page);
-
-out:
- return ret;
-
-out_free_pages:
- spin_unlock(vmf->ptl);
- mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
- for (i = 0; i < HPAGE_PMD_NR; i++) {
- memcg = (void *)page_private(pages[i]);
- set_page_private(pages[i], 0);
- mem_cgroup_cancel_charge(pages[i], memcg, false);
- put_page(pages[i]);
- }
- kfree(pages);
- goto out;
-}
-
vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
{
struct vm_area_struct *vma = vmf->vma;
- struct page *page = NULL, *new_page;
- struct mem_cgroup *memcg;
+ struct page *page;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
- unsigned long mmun_start; /* For mmu_notifiers */
- unsigned long mmun_end; /* For mmu_notifiers */
- gfp_t huge_gfp; /* for allocation and charge */
- vm_fault_t ret = 0;
vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
VM_BUG_ON_VMA(!vma->anon_vma, vma);
+
if (is_huge_zero_pmd(orig_pmd))
- goto alloc;
+ goto fallback;
+
spin_lock(vmf->ptl);
- if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
- goto out_unlock;
+
+ if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
+ spin_unlock(vmf->ptl);
+ return 0;
+ }
page = pmd_page(orig_pmd);
VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
- /*
- * We can only reuse the page if nobody else maps the huge page or it's
- * part.
- */
+
+ /* Lock page for reuse_swap_page() */
if (!trylock_page(page)) {
get_page(page);
spin_unlock(vmf->ptl);
lock_page(page);
spin_lock(vmf->ptl);
if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
+ spin_unlock(vmf->ptl);
unlock_page(page);
put_page(page);
- goto out_unlock;
+ return 0;
}
put_page(page);
}
+
+ /*
+ * We can only reuse the page if nobody else maps the huge page or it's
+ * part.
+ */
if (reuse_swap_page(page, NULL)) {
pmd_t entry;
entry = pmd_mkyoung(orig_pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
+ if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
- ret |= VM_FAULT_WRITE;
unlock_page(page);
- goto out_unlock;
- }
- unlock_page(page);
- get_page(page);
- spin_unlock(vmf->ptl);
-alloc:
- if (__transparent_hugepage_enabled(vma) &&
- !transparent_hugepage_debug_cow()) {
- huge_gfp = alloc_hugepage_direct_gfpmask(vma);
- new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
- } else
- new_page = NULL;
-
- if (likely(new_page)) {
- prep_transhuge_page(new_page);
- } else {
- if (!page) {
- split_huge_pmd(vma, vmf->pmd, vmf->address);
- ret |= VM_FAULT_FALLBACK;
- } else {
- ret = do_huge_pmd_wp_page_fallback(vmf, orig_pmd, page);
- if (ret & VM_FAULT_OOM) {
- split_huge_pmd(vma, vmf->pmd, vmf->address);
- ret |= VM_FAULT_FALLBACK;
- }
- put_page(page);
- }
- count_vm_event(THP_FAULT_FALLBACK);
- goto out;
- }
-
- if (unlikely(mem_cgroup_try_charge_delay(new_page, vma->vm_mm,
- huge_gfp, &memcg, true))) {
- put_page(new_page);
- split_huge_pmd(vma, vmf->pmd, vmf->address);
- if (page)
- put_page(page);
- ret |= VM_FAULT_FALLBACK;
- count_vm_event(THP_FAULT_FALLBACK);
- goto out;
- }
-
- count_vm_event(THP_FAULT_ALLOC);
-
- if (!page)
- clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR);
- else
- copy_user_huge_page(new_page, page, vmf->address,
- vma, HPAGE_PMD_NR);
- __SetPageUptodate(new_page);
-
- mmun_start = haddr;
- mmun_end = haddr + HPAGE_PMD_SIZE;
- mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
-
- spin_lock(vmf->ptl);
- if (page)
- put_page(page);
- if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
spin_unlock(vmf->ptl);
- mem_cgroup_cancel_charge(new_page, memcg, true);
- put_page(new_page);
- goto out_mn;
- } else {
- pmd_t entry;
- entry = mk_huge_pmd(new_page, vma->vm_page_prot);
- entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
- page_add_new_anon_rmap(new_page, vma, haddr, true);
- mem_cgroup_commit_charge(new_page, memcg, false, true);
- lru_cache_add_active_or_unevictable(new_page, vma);
- set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
- update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
- if (!page) {
- add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
- } else {
- VM_BUG_ON_PAGE(!PageHead(page), page);
- page_remove_rmap(page, true);
- put_page(page);
- }
- ret |= VM_FAULT_WRITE;
+ return VM_FAULT_WRITE;
}
+
+ unlock_page(page);
spin_unlock(vmf->ptl);
-out_mn:
- /*
- * No need to double call mmu_notifier->invalidate_range() callback as
- * the above pmdp_huge_clear_flush_notify() did already call it.
- */
- mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start,
- mmun_end);
-out:
- return ret;
-out_unlock:
- spin_unlock(vmf->ptl);
- return ret;
+fallback:
+ __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
+ return VM_FAULT_FALLBACK;
}
/*
- * FOLL_FORCE or a forced COW break can write even to unwritable pmd's,
- * but only after we've gone through a COW cycle and they are dirty.
+ * FOLL_FORCE can write to even unwritable pmd's, but only
+ * after we've gone through a COW cycle and they are dirty.
*/
static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags)
{
- return pmd_write(pmd) || ((flags & FOLL_COW) && pmd_dirty(pmd));
+ return pmd_write(pmd) ||
+ ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd));
}
struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
@@ -1466,8 +1360,13 @@
page = pmd_page(*pmd);
VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
+
+ if (!try_grab_page(page, flags))
+ return ERR_PTR(-ENOMEM);
+
if (flags & FOLL_TOUCH)
touch_pmd(vma, addr, pmd, flags);
+
if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
/*
* We don't mlock() pte-mapped THPs. This way we can avoid
@@ -1496,7 +1395,6 @@
goto skip_mlock;
if (!trylock_page(page))
goto skip_mlock;
- lru_add_drain();
if (page->mapping && !PageDoubleMap(page))
mlock_vma_page(page);
unlock_page(page);
@@ -1504,8 +1402,6 @@
skip_mlock:
page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);
- if (flags & FOLL_GET)
- get_page(page);
out:
return page;
@@ -1518,7 +1414,7 @@
struct anon_vma *anon_vma = NULL;
struct page *page;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
- int page_nid = -1, this_nid = numa_node_id();
+ int page_nid = NUMA_NO_NODE, this_nid = numa_node_id();
int target_nid, last_cpupid = -1;
bool page_locked;
bool migrated = false;
@@ -1539,8 +1435,7 @@
if (!get_page_unless_zero(page))
goto out_unlock;
spin_unlock(vmf->ptl);
- wait_on_page_locked(page);
- put_page(page);
+ put_and_wait_on_page_locked(page);
goto out;
}
@@ -1564,7 +1459,7 @@
*/
page_locked = trylock_page(page);
target_nid = mpol_misplaced(page, vma, haddr);
- if (target_nid == -1) {
+ if (target_nid == NUMA_NO_NODE) {
/* If the page was locked, there are no parallel migrations */
if (page_locked)
goto clear_pmdnuma;
@@ -1572,12 +1467,11 @@
/* Migration could have started since the pmd_trans_migrating check */
if (!page_locked) {
- page_nid = -1;
+ page_nid = NUMA_NO_NODE;
if (!get_page_unless_zero(page))
goto out_unlock;
spin_unlock(vmf->ptl);
- wait_on_page_locked(page);
- put_page(page);
+ put_and_wait_on_page_locked(page);
goto out;
}
@@ -1587,21 +1481,21 @@
*/
get_page(page);
spin_unlock(vmf->ptl);
- anon_vma = page_lock_anon_vma_read(page);
+ anon_vma = page_lock_anon_vma_read(page, NULL);
/* Confirm the PMD did not change while page_table_lock was released */
spin_lock(vmf->ptl);
if (unlikely(!pmd_same(pmd, *vmf->pmd))) {
unlock_page(page);
put_page(page);
- page_nid = -1;
+ page_nid = NUMA_NO_NODE;
goto out_unlock;
}
/* Bail if we fail to protect against THP splits for any reason */
if (unlikely(!anon_vma)) {
put_page(page);
- page_nid = -1;
+ page_nid = NUMA_NO_NODE;
goto clear_pmdnuma;
}
@@ -1616,8 +1510,20 @@
* We are not sure a pending tlb flush here is for a huge page
* mapping or not. Hence use the tlb range variant
*/
- if (mm_tlb_flush_pending(vma->vm_mm))
+ if (mm_tlb_flush_pending(vma->vm_mm)) {
flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
+ /*
+ * change_huge_pmd() released the pmd lock before
+ * invalidating the secondary MMUs sharing the primary
+ * MMU pagetables (with ->invalidate_range()). The
+ * mmu_notifier_invalidate_range_end() (which
+ * internally calls ->invalidate_range()) in
+ * change_pmd_range() will run after us, so we can't
+ * rely on it here and we need an explicit invalidate.
+ */
+ mmu_notifier_invalidate_range(vma->vm_mm, haddr,
+ haddr + HPAGE_PMD_SIZE);
+ }
/*
* Migrate the THP to the requested node, returns with page unlocked
@@ -1651,7 +1557,7 @@
if (anon_vma)
page_unlock_anon_vma_read(anon_vma);
- if (page_nid != -1)
+ if (page_nid != NUMA_NO_NODE)
task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR,
flags);
@@ -1671,7 +1577,7 @@
struct mm_struct *mm = tlb->mm;
bool ret = false;
- tlb_remove_check_page_size_change(tlb, HPAGE_PMD_SIZE);
+ tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
ptl = pmd_trans_huge_lock(pmd, vma);
if (!ptl)
@@ -1747,7 +1653,7 @@
pmd_t orig_pmd;
spinlock_t *ptl;
- tlb_remove_check_page_size_change(tlb, HPAGE_PMD_SIZE);
+ tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
ptl = __pmd_trans_huge_lock(pmd, vma);
if (!ptl)
@@ -1758,10 +1664,10 @@
* pgtable_trans_huge_withdraw after finishing pmdp related
* operations.
*/
- orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
- tlb->fullmm);
+ orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd,
+ tlb->fullmm);
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
- if (vma_is_dax(vma)) {
+ if (vma_is_special_huge(vma)) {
if (arch_needs_pgtable_deposit())
zap_deposited_table(tlb->mm, pmd);
spin_unlock(ptl);
@@ -1785,7 +1691,7 @@
VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
entry = pmd_to_swp_entry(orig_pmd);
- page = pfn_to_page(swp_offset(entry));
+ page = migration_entry_to_page(entry);
flush_needed = 0;
} else
WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
@@ -1833,18 +1739,12 @@
}
bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
- unsigned long new_addr, unsigned long old_end,
- pmd_t *old_pmd, pmd_t *new_pmd)
+ unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
{
spinlock_t *old_ptl, *new_ptl;
pmd_t pmd;
struct mm_struct *mm = vma->vm_mm;
bool force_flush = false;
-
- if ((old_addr & ~HPAGE_PMD_MASK) ||
- (new_addr & ~HPAGE_PMD_MASK) ||
- old_end - old_addr < HPAGE_PMD_SIZE)
- return false;
/*
* The destination pmd shouldn't be established, free_pgtables()
@@ -1857,7 +1757,7 @@
/*
* We don't have to worry about the ordering of src and dst
- * ptlocks because exclusive mmap_sem prevents deadlock.
+ * ptlocks because exclusive mmap_lock prevents deadlock.
*/
old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
if (old_ptl) {
@@ -1893,13 +1793,16 @@
* - HPAGE_PMD_NR is protections changed and TLB flush necessary
*/
int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long addr, pgprot_t newprot, int prot_numa)
+ unsigned long addr, pgprot_t newprot, unsigned long cp_flags)
{
struct mm_struct *mm = vma->vm_mm;
spinlock_t *ptl;
pmd_t entry;
bool preserve_write;
int ret;
+ bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
+ bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
+ bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
ptl = __pmd_trans_huge_lock(pmd, vma);
if (!ptl)
@@ -1923,6 +1826,8 @@
newpmd = swp_entry_to_pmd(entry);
if (pmd_swp_soft_dirty(*pmd))
newpmd = pmd_swp_mksoft_dirty(newpmd);
+ if (pmd_swp_uffd_wp(*pmd))
+ newpmd = pmd_swp_mkuffd_wp(newpmd);
set_pmd_at(mm, addr, pmd, newpmd);
}
goto unlock;
@@ -1941,9 +1846,9 @@
goto unlock;
/*
- * In case prot_numa, we are under down_read(mmap_sem). It's critical
+ * In case prot_numa, we are under mmap_read_lock(mm). It's critical
* to not clear pmd intermittently to avoid race with MADV_DONTNEED
- * which is also under down_read(mmap_sem):
+ * which is also under mmap_read_lock(mm):
*
* CPU0: CPU1:
* change_huge_pmd(prot_numa=1)
@@ -1966,6 +1871,17 @@
entry = pmd_modify(entry, newprot);
if (preserve_write)
entry = pmd_mk_savedwrite(entry);
+ if (uffd_wp) {
+ entry = pmd_wrprotect(entry);
+ entry = pmd_mkuffd_wp(entry);
+ } else if (uffd_wp_resolve) {
+ /*
+ * Leave the write bit to be handled by PF interrupt
+ * handler, then things like COW could be properly
+ * handled.
+ */
+ entry = pmd_clear_uffd_wp(entry);
+ }
ret = HPAGE_PMD_NR;
set_pmd_at(mm, addr, pmd, entry);
BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry));
@@ -2012,7 +1928,6 @@
int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
pud_t *pud, unsigned long addr)
{
- pud_t orig_pud;
spinlock_t *ptl;
ptl = __pud_trans_huge_lock(pud, vma);
@@ -2024,10 +1939,9 @@
* pgtable_trans_huge_withdraw after finishing pudp related
* operations.
*/
- orig_pud = pudp_huge_get_and_clear_full(tlb->mm, addr, pud,
- tlb->fullmm);
+ pudp_huge_get_and_clear_full(tlb->mm, addr, pud, tlb->fullmm);
tlb_remove_pud_tlb_entry(tlb, pud, addr);
- if (vma_is_dax(vma)) {
+ if (vma_is_special_huge(vma)) {
spin_unlock(ptl);
/* No zero page support yet */
} else {
@@ -2054,14 +1968,16 @@
unsigned long address)
{
spinlock_t *ptl;
- struct mm_struct *mm = vma->vm_mm;
- unsigned long haddr = address & HPAGE_PUD_MASK;
+ struct mmu_notifier_range range;
- mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PUD_SIZE);
- ptl = pud_lock(mm, pud);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ address & HPAGE_PUD_MASK,
+ (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
+ ptl = pud_lock(vma->vm_mm, pud);
if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud)))
goto out;
- __split_huge_pud_locked(vma, pud, haddr);
+ __split_huge_pud_locked(vma, pud, range.start);
out:
spin_unlock(ptl);
@@ -2069,8 +1985,7 @@
* No need to double call mmu_notifier->invalidate_range() callback as
* the above pudp_huge_clear_flush_notify() did already call it.
*/
- mmu_notifier_invalidate_range_only_end(mm, haddr, haddr +
- HPAGE_PUD_SIZE);
+ mmu_notifier_invalidate_range_only_end(&range);
}
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
@@ -2079,7 +1994,7 @@
{
struct mm_struct *mm = vma->vm_mm;
pgtable_t pgtable;
- pmd_t _pmd;
+ pmd_t _pmd, old_pmd;
int i;
/*
@@ -2090,7 +2005,7 @@
*
* See Documentation/vm/mmu_notifier.rst
*/
- pmdp_huge_clear_flush(vma, haddr, pmd);
+ old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
pgtable = pgtable_trans_huge_withdraw(mm, pmd);
pmd_populate(mm, &_pmd, pgtable);
@@ -2099,6 +2014,8 @@
pte_t *pte, entry;
entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
entry = pte_mkspecial(entry);
+ if (pmd_uffd_wp(old_pmd))
+ entry = pte_mkuffd_wp(entry);
pte = pte_offset_map(&_pmd, haddr);
VM_BUG_ON(!pte_none(*pte));
set_pte_at(mm, haddr, pte, entry);
@@ -2115,9 +2032,10 @@
struct page *page;
pgtable_t pgtable;
pmd_t old_pmd, _pmd;
- bool young, write, soft_dirty, pmd_migration = false;
+ bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
unsigned long addr;
int i;
+ bool success = false;
VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
@@ -2135,7 +2053,7 @@
*/
if (arch_needs_pgtable_deposit())
zap_deposited_table(mm, pmd);
- if (vma_is_dax(vma))
+ if (vma_is_special_huge(vma))
return;
if (unlikely(is_pmd_migration_entry(old_pmd))) {
swp_entry_t entry;
@@ -2176,8 +2094,8 @@
* free), userland could trigger a small page size TLB miss on the
* small sized TLB while the hugepage TLB entry is still established in
* the huge TLB. Some CPU doesn't like that.
- * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum
- * 383 on page 93. Intel should be safe but is also warns that it's
+ * See http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum
+ * 383 on page 105. Intel should be safe but is also warns that it's
* only safe if the permission and cache attributes of the two entries
* loaded in the two TLB is identical (which should be the case here).
* But it is generally safer to never allow small and huge TLB entries
@@ -2195,10 +2113,11 @@
swp_entry_t entry;
entry = pmd_to_swp_entry(old_pmd);
- page = pfn_to_page(swp_offset(entry));
+ page = migration_entry_to_page(entry);
write = is_write_migration_entry(entry);
young = false;
soft_dirty = pmd_swp_soft_dirty(old_pmd);
+ uffd_wp = pmd_swp_uffd_wp(old_pmd);
} else {
page = pmd_page(old_pmd);
if (pmd_dirty(old_pmd))
@@ -2206,6 +2125,7 @@
write = pmd_write(old_pmd);
young = pmd_young(old_pmd);
soft_dirty = pmd_soft_dirty(old_pmd);
+ uffd_wp = pmd_uffd_wp(old_pmd);
}
VM_BUG_ON_PAGE(!page_count(page), page);
page_ref_add(page, HPAGE_PMD_NR - 1);
@@ -2230,21 +2150,29 @@
entry = swp_entry_to_pte(swp_entry);
if (soft_dirty)
entry = pte_swp_mksoft_dirty(entry);
+ if (uffd_wp)
+ entry = pte_swp_mkuffd_wp(entry);
} else {
entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
- entry = maybe_mkwrite(entry, vma);
+ entry = maybe_mkwrite(entry, vma->vm_flags);
if (!write)
entry = pte_wrprotect(entry);
if (!young)
entry = pte_mkold(entry);
if (soft_dirty)
entry = pte_mksoft_dirty(entry);
+ if (uffd_wp)
+ entry = pte_mkuffd_wp(entry);
}
pte = pte_offset_map(&_pmd, addr);
BUG_ON(!pte_none(*pte));
set_pte_at(mm, addr, pte, entry);
- if (!pmd_migration)
- atomic_inc(&page[i]._mapcount);
+ if (!pmd_migration) {
+ trace_android_vh_update_page_mapcount(&page[i], true,
+ false, NULL, &success);
+ if (!success)
+ atomic_inc(&page[i]._mapcount);
+ }
pte_unmap(pte);
}
@@ -2255,8 +2183,12 @@
*/
if (compound_mapcount(page) > 1 &&
!TestSetPageDoubleMap(page)) {
- for (i = 0; i < HPAGE_PMD_NR; i++)
- atomic_inc(&page[i]._mapcount);
+ for (i = 0; i < HPAGE_PMD_NR; i++) {
+ trace_android_vh_update_page_mapcount(&page[i], true,
+ false, NULL, &success);
+ if (!success)
+ atomic_inc(&page[i]._mapcount);
+ }
}
lock_page_memcg(page);
@@ -2265,8 +2197,12 @@
__dec_lruvec_page_state(page, NR_ANON_THPS);
if (TestClearPageDoubleMap(page)) {
/* No need in mapcount reference anymore */
- for (i = 0; i < HPAGE_PMD_NR; i++)
- atomic_dec(&page[i]._mapcount);
+ for (i = 0; i < HPAGE_PMD_NR; i++) {
+ trace_android_vh_update_page_mapcount(&page[i],
+ false, false, NULL, &success);
+ if (!success)
+ atomic_dec(&page[i]._mapcount);
+ }
}
}
unlock_page_memcg(page);
@@ -2287,13 +2223,15 @@
unsigned long address, bool freeze, struct page *page)
{
spinlock_t *ptl;
- struct mm_struct *mm = vma->vm_mm;
- unsigned long haddr = address & HPAGE_PMD_MASK;
+ struct mmu_notifier_range range;
bool do_unlock_page = false;
pmd_t _pmd;
- mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE);
- ptl = pmd_lock(mm, pmd);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ address & HPAGE_PMD_MASK,
+ (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
+ ptl = pmd_lock(vma->vm_mm, pmd);
/*
* If caller asks to setup a migration entries, we need a page to check
@@ -2339,7 +2277,7 @@
clear_page_mlock(page);
} else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd)))
goto out;
- __split_huge_pmd_locked(vma, pmd, haddr, freeze);
+ __split_huge_pmd_locked(vma, pmd, range.start, freeze);
out:
spin_unlock(ptl);
if (do_unlock_page)
@@ -2357,8 +2295,7 @@
* any further changes to individual pte will notify. So no need
* to call mmu_notifier->invalidate_range()
*/
- mmu_notifier_invalidate_range_only_end(mm, haddr, haddr +
- HPAGE_PMD_SIZE);
+ mmu_notifier_invalidate_range_only_end(&range);
}
void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
@@ -2413,13 +2350,13 @@
/*
* If we're also updating the vma->vm_next->vm_start, if the new
- * vm_next->vm_start isn't page aligned and it could previously
+ * vm_next->vm_start isn't hpage aligned and it could previously
* contain an hugepage: check if we need to split an huge pmd.
*/
if (adjust_next > 0) {
struct vm_area_struct *next = vma->vm_next;
unsigned long nstart = next->vm_start;
- nstart += adjust_next << PAGE_SHIFT;
+ nstart += adjust_next;
if (nstart & ~HPAGE_PMD_MASK &&
(nstart & HPAGE_PMD_MASK) >= next->vm_start &&
(nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
@@ -2429,8 +2366,8 @@
static void unmap_page(struct page *page)
{
- enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS |
- TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD | TTU_SYNC;
+ enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_SYNC |
+ TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD;
VM_BUG_ON_PAGE(!PageHead(page), page);
@@ -2442,13 +2379,13 @@
VM_WARN_ON_ONCE_PAGE(page_mapped(page), page);
}
-static void remap_page(struct page *page)
+static void remap_page(struct page *page, unsigned int nr)
{
int i;
if (PageTransHuge(page)) {
remove_migration_ptes(page, page, true);
} else {
- for (i = 0; i < HPAGE_PMD_NR; i++)
+ for (i = 0; i < nr; i++)
remove_migration_ptes(page + i, page + i, true);
}
}
@@ -2477,6 +2414,9 @@
(1L << PG_workingset) |
(1L << PG_locked) |
(1L << PG_unevictable) |
+#ifdef CONFIG_64BIT
+ (1L << PG_arch_2) |
+#endif
(1L << PG_dirty)));
/* ->mapping in first tail page is compound_mapcount */
@@ -2519,16 +2459,27 @@
pgoff_t end, unsigned long flags)
{
struct page *head = compound_head(page);
- struct zone *zone = page_zone(head);
+ pg_data_t *pgdat = page_pgdat(head);
struct lruvec *lruvec;
+ struct address_space *swap_cache = NULL;
+ unsigned long offset = 0;
+ unsigned int nr = thp_nr_pages(head);
int i;
- lruvec = mem_cgroup_page_lruvec(head, zone->zone_pgdat);
+ lruvec = mem_cgroup_page_lruvec(head, pgdat);
/* complete memcg works before add pages to LRU */
- mem_cgroup_split_huge_fixup(head);
+ split_page_memcg(head, nr);
- for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
+ if (PageAnon(head) && PageSwapCache(head)) {
+ swp_entry_t entry = { .val = page_private(head) };
+
+ offset = swp_offset(entry);
+ swap_cache = swap_address_space(entry);
+ xa_lock(&swap_cache->i_pages);
+ }
+
+ for (i = nr - 1; i >= 1; i--) {
__split_huge_page_tail(head, i, lruvec, list);
/* Some pages can be beyond i_size: drop them from page cache */
if (head[i].index >= end) {
@@ -2537,31 +2488,45 @@
if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head))
shmem_uncharge(head->mapping->host, 1);
put_page(head + i);
+ } else if (!PageAnon(page)) {
+ __xa_store(&head->mapping->i_pages, head[i].index,
+ head + i, 0);
+ } else if (swap_cache) {
+ __xa_store(&swap_cache->i_pages, offset + i,
+ head + i, 0);
}
}
ClearPageCompound(head);
- split_page_owner(head, HPAGE_PMD_ORDER);
+ split_page_owner(head, nr);
/* See comment in __split_huge_page_tail() */
if (PageAnon(head)) {
- /* Additional pin to radix tree of swap cache */
- if (PageSwapCache(head))
+ /* Additional pin to swap cache */
+ if (PageSwapCache(head)) {
page_ref_add(head, 2);
- else
+ xa_unlock(&swap_cache->i_pages);
+ } else {
page_ref_inc(head);
+ }
} else {
- /* Additional pin to radix tree */
+ /* Additional pin to page cache */
page_ref_add(head, 2);
xa_unlock(&head->mapping->i_pages);
}
- spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
+ spin_unlock_irqrestore(&pgdat->lru_lock, flags);
- remap_page(head);
+ remap_page(head, nr);
- for (i = 0; i < HPAGE_PMD_NR; i++) {
+ if (PageSwapCache(head)) {
+ swp_entry_t entry = { .val = page_private(head) };
+
+ split_swap_cluster(entry);
+ }
+
+ for (i = 0; i < nr; i++) {
struct page *subpage = head + i;
if (subpage == page)
continue;
@@ -2580,7 +2545,7 @@
int total_mapcount(struct page *page)
{
- int i, compound, ret;
+ int i, compound, nr, ret;
VM_BUG_ON_PAGE(PageTail(page), page);
@@ -2588,16 +2553,17 @@
return atomic_read(&page->_mapcount) + 1;
compound = compound_mapcount(page);
+ nr = compound_nr(page);
if (PageHuge(page))
return compound;
ret = compound;
- for (i = 0; i < HPAGE_PMD_NR; i++)
+ for (i = 0; i < nr; i++)
ret += atomic_read(&page[i]._mapcount) + 1;
/* File pages has compound_mapcount included in _mapcount */
if (!PageAnon(page))
- return ret - compound * HPAGE_PMD_NR;
+ return ret - compound * nr;
if (PageDoubleMap(page))
- ret -= HPAGE_PMD_NR;
+ ret -= nr;
return ret;
}
@@ -2642,14 +2608,14 @@
page = compound_head(page);
_total_mapcount = ret = 0;
- for (i = 0; i < HPAGE_PMD_NR; i++) {
+ for (i = 0; i < thp_nr_pages(page); i++) {
mapcount = atomic_read(&page[i]._mapcount) + 1;
ret = max(ret, mapcount);
_total_mapcount += mapcount;
}
if (PageDoubleMap(page)) {
ret -= 1;
- _total_mapcount -= HPAGE_PMD_NR;
+ _total_mapcount -= thp_nr_pages(page);
}
mapcount = compound_mapcount(page);
ret += mapcount;
@@ -2664,11 +2630,11 @@
{
int extra_pins;
- /* Additional pins from radix tree */
+ /* Additional pins from page cache */
if (PageAnon(page))
- extra_pins = PageSwapCache(page) ? HPAGE_PMD_NR : 0;
+ extra_pins = PageSwapCache(page) ? thp_nr_pages(page) : 0;
else
- extra_pins = HPAGE_PMD_NR;
+ extra_pins = thp_nr_pages(page);
if (pextra_pins)
*pextra_pins = extra_pins;
return total_mapcount(page) == page_count(page) - extra_pins - 1;
@@ -2697,23 +2663,23 @@
{
struct page *head = compound_head(page);
struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
+ struct deferred_split *ds_queue = get_deferred_split_queue(head);
struct anon_vma *anon_vma = NULL;
struct address_space *mapping = NULL;
int extra_pins, ret;
- bool mlocked;
unsigned long flags;
pgoff_t end;
VM_BUG_ON_PAGE(is_huge_zero_page(head), head);
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(!PageCompound(page), page);
+ VM_BUG_ON_PAGE(!PageLocked(head), head);
+ VM_BUG_ON_PAGE(!PageCompound(head), head);
- if (PageWriteback(page))
+ if (PageWriteback(head))
return -EBUSY;
if (PageAnon(head)) {
/*
- * The caller does not necessarily hold an mmap_sem that would
+ * The caller does not necessarily hold an mmap_lock that would
* prevent the anon_vma disappearing so we first we take a
* reference to it and then lock the anon_vma for write. This
* is similar to page_lock_anon_vma_read except the write lock
@@ -2759,55 +2725,47 @@
goto out_unlock;
}
- mlocked = PageMlocked(page);
unmap_page(head);
- /* Make sure the page is not on per-CPU pagevec as it takes pin */
- if (mlocked)
- lru_add_drain();
-
/* prevent PageLRU to go away from under us, and freeze lru stats */
- spin_lock_irqsave(zone_lru_lock(page_zone(head)), flags);
+ spin_lock_irqsave(&pgdata->lru_lock, flags);
if (mapping) {
- void **pslot;
+ XA_STATE(xas, &mapping->i_pages, page_index(head));
- xa_lock(&mapping->i_pages);
- pslot = radix_tree_lookup_slot(&mapping->i_pages,
- page_index(head));
/*
- * Check if the head page is present in radix tree.
+ * Check if the head page is present in page cache.
* We assume all tail are present too, if head is there.
*/
- if (radix_tree_deref_slot_protected(pslot,
- &mapping->i_pages.xa_lock) != head)
+ xa_lock(&mapping->i_pages);
+ if (xas_load(&xas) != head)
goto fail;
}
/* Prevent deferred_split_scan() touching ->_refcount */
- spin_lock(&pgdata->split_queue_lock);
+ spin_lock(&ds_queue->split_queue_lock);
if (page_ref_freeze(head, 1 + extra_pins)) {
if (!list_empty(page_deferred_list(head))) {
- pgdata->split_queue_len--;
+ ds_queue->split_queue_len--;
list_del(page_deferred_list(head));
}
- if (mapping)
- __dec_node_page_state(page, NR_SHMEM_THPS);
- spin_unlock(&pgdata->split_queue_lock);
- __split_huge_page(page, list, end, flags);
- if (PageSwapCache(head)) {
- swp_entry_t entry = { .val = page_private(head) };
+ spin_unlock(&ds_queue->split_queue_lock);
+ if (mapping) {
+ if (PageSwapBacked(head))
+ __dec_node_page_state(head, NR_SHMEM_THPS);
+ else
+ __dec_node_page_state(head, NR_FILE_THPS);
+ }
- ret = split_swap_cluster(entry);
- } else
- ret = 0;
+ __split_huge_page(page, list, end, flags);
+ ret = 0;
} else {
- spin_unlock(&pgdata->split_queue_lock);
+ spin_unlock(&ds_queue->split_queue_lock);
fail:
if (mapping)
xa_unlock(&mapping->i_pages);
- spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
- remap_page(head);
+ spin_unlock_irqrestore(&pgdata->lru_lock, flags);
+ remap_page(head, thp_nr_pages(head));
ret = -EBUSY;
}
@@ -2825,53 +2783,89 @@
void free_transhuge_page(struct page *page)
{
- struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
+ struct deferred_split *ds_queue = get_deferred_split_queue(page);
unsigned long flags;
- spin_lock_irqsave(&pgdata->split_queue_lock, flags);
+ spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
if (!list_empty(page_deferred_list(page))) {
- pgdata->split_queue_len--;
+ ds_queue->split_queue_len--;
list_del(page_deferred_list(page));
}
- spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
+ spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
free_compound_page(page);
}
void deferred_split_huge_page(struct page *page)
{
- struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
+ struct deferred_split *ds_queue = get_deferred_split_queue(page);
+#ifdef CONFIG_MEMCG
+ struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;
+#endif
unsigned long flags;
VM_BUG_ON_PAGE(!PageTransHuge(page), page);
- spin_lock_irqsave(&pgdata->split_queue_lock, flags);
+ /*
+ * The try_to_unmap() in page reclaim path might reach here too,
+ * this may cause a race condition to corrupt deferred split queue.
+ * And, if page reclaim is already handling the same page, it is
+ * unnecessary to handle it again in shrinker.
+ *
+ * Check PageSwapCache to determine if the page is being
+ * handled by page reclaim since THP swap would add the page into
+ * swap cache before calling try_to_unmap().
+ */
+ if (PageSwapCache(page))
+ return;
+
+ if (!list_empty(page_deferred_list(page)))
+ return;
+
+ spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
if (list_empty(page_deferred_list(page))) {
count_vm_event(THP_DEFERRED_SPLIT_PAGE);
- list_add_tail(page_deferred_list(page), &pgdata->split_queue);
- pgdata->split_queue_len++;
+ list_add_tail(page_deferred_list(page), &ds_queue->split_queue);
+ ds_queue->split_queue_len++;
+#ifdef CONFIG_MEMCG
+ if (memcg)
+ memcg_set_shrinker_bit(memcg, page_to_nid(page),
+ deferred_split_shrinker.id);
+#endif
}
- spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
+ spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
}
static unsigned long deferred_split_count(struct shrinker *shrink,
struct shrink_control *sc)
{
struct pglist_data *pgdata = NODE_DATA(sc->nid);
- return READ_ONCE(pgdata->split_queue_len);
+ struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
+
+#ifdef CONFIG_MEMCG
+ if (sc->memcg)
+ ds_queue = &sc->memcg->deferred_split_queue;
+#endif
+ return READ_ONCE(ds_queue->split_queue_len);
}
static unsigned long deferred_split_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
struct pglist_data *pgdata = NODE_DATA(sc->nid);
+ struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
unsigned long flags;
LIST_HEAD(list), *pos, *next;
struct page *page;
int split = 0;
- spin_lock_irqsave(&pgdata->split_queue_lock, flags);
+#ifdef CONFIG_MEMCG
+ if (sc->memcg)
+ ds_queue = &sc->memcg->deferred_split_queue;
+#endif
+
+ spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
/* Take pin on all head pages to avoid freeing them under us */
- list_for_each_safe(pos, next, &pgdata->split_queue) {
+ list_for_each_safe(pos, next, &ds_queue->split_queue) {
page = list_entry((void *)pos, struct page, mapping);
page = compound_head(page);
if (get_page_unless_zero(page)) {
@@ -2879,12 +2873,12 @@
} else {
/* We lost race with put_compound_page() */
list_del_init(page_deferred_list(page));
- pgdata->split_queue_len--;
+ ds_queue->split_queue_len--;
}
if (!--sc->nr_to_scan)
break;
}
- spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
+ spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
list_for_each_safe(pos, next, &list) {
page = list_entry((void *)pos, struct page, mapping);
@@ -2898,15 +2892,15 @@
put_page(page);
}
- spin_lock_irqsave(&pgdata->split_queue_lock, flags);
- list_splice_tail(&list, &pgdata->split_queue);
- spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
+ spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
+ list_splice_tail(&list, &ds_queue->split_queue);
+ spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
/*
* Stop shrinker if we didn't split any page, but the queue is empty.
* This can happen if pages were freed under us.
*/
- if (!split && list_empty(&pgdata->split_queue))
+ if (!split && list_empty(&ds_queue->split_queue))
return SHRINK_STOP;
return split;
}
@@ -2915,7 +2909,8 @@
.count_objects = deferred_split_count,
.scan_objects = deferred_split_scan,
.seeks = DEFAULT_SEEKS,
- .flags = SHRINKER_NUMA_AWARE,
+ .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE |
+ SHRINKER_NONSLAB,
};
#ifdef CONFIG_DEBUG_FS
@@ -2959,17 +2954,13 @@
return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set,
+DEFINE_DEBUGFS_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set,
"%llu\n");
static int __init split_huge_pages_debugfs(void)
{
- void *ret;
-
- ret = debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
- &split_huge_pages_fops);
- if (!ret)
- pr_warn("Failed to create split_huge_pages in debugfs");
+ debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
+ &split_huge_pages_fops);
return 0;
}
late_initcall(split_huge_pages_debugfs);
@@ -3021,6 +3012,8 @@
pmde = pmd_mksoft_dirty(pmde);
if (is_write_migration_entry(entry))
pmde = maybe_pmd_mkwrite(pmde, vma);
+ if (pmd_swp_uffd_wp(*pvmw->pmd))
+ pmde = pmd_wrprotect(pmd_mkuffd_wp(pmde));
flush_cache_range(vma, mmun_start, mmun_start + HPAGE_PMD_SIZE);
if (PageAnon(new))
--
Gitblit v1.6.2