From 102a0743326a03cd1a1202ceda21e175b7d3575c Mon Sep 17 00:00:00 2001 From: hc <hc@nodka.com> Date: Tue, 20 Feb 2024 01:20:52 +0000 Subject: [PATCH] add new system file --- kernel/mm/huge_memory.c | 907 ++++++++++++++++++++++++++++---------------------------- 1 files changed, 450 insertions(+), 457 deletions(-) diff --git a/kernel/mm/huge_memory.c b/kernel/mm/huge_memory.c index f79002a..14bc799 100644 --- a/kernel/mm/huge_memory.c +++ b/kernel/mm/huge_memory.c @@ -1,8 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2009 Red Hat, Inc. - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -33,8 +31,9 @@ #include <linux/page_idle.h> #include <linux/shmem_fs.h> #include <linux/oom.h> +#include <linux/numa.h> #include <linux/page_owner.h> - +#include <trace/hooks/mm.h> #include <asm/tlb.h> #include <asm/pgalloc.h> #include "internal.h" @@ -64,12 +63,26 @@ struct page *huge_zero_page __read_mostly; unsigned long huge_zero_pfn __read_mostly = ~0UL; -bool transparent_hugepage_enabled(struct vm_area_struct *vma) +static inline bool file_thp_enabled(struct vm_area_struct *vma) { + return transhuge_vma_enabled(vma, vma->vm_flags) && vma->vm_file && + !inode_is_open_for_write(vma->vm_file->f_inode) && + (vma->vm_flags & VM_EXEC); +} + +bool transparent_hugepage_active(struct vm_area_struct *vma) +{ + /* The addr is used to check if the vma size fits */ + unsigned long addr = (vma->vm_end & HPAGE_PMD_MASK) - HPAGE_PMD_SIZE; + + if (!transhuge_vma_suitable(vma, addr)) + return false; if (vma_is_anonymous(vma)) return __transparent_hugepage_enabled(vma); - if (vma_is_shmem(vma) && shmem_huge_enabled(vma)) - return __transparent_hugepage_enabled(vma); + if (vma_is_shmem(vma)) + return shmem_huge_enabled(vma); + if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS)) + return file_thp_enabled(vma); return false; } @@ -302,34 +315,13 @@ static struct kobj_attribute hpage_pmd_size_attr = __ATTR_RO(hpage_pmd_size); -#ifdef CONFIG_DEBUG_VM -static ssize_t debug_cow_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return single_hugepage_flag_show(kobj, attr, buf, - TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG); -} -static ssize_t debug_cow_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - return single_hugepage_flag_store(kobj, attr, buf, count, - TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG); -} -static struct kobj_attribute debug_cow_attr = - __ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store); -#endif /* CONFIG_DEBUG_VM */ - static struct attribute *hugepage_attr[] = { &enabled_attr.attr, &defrag_attr.attr, &use_zero_page_attr.attr, &hpage_pmd_size_attr.attr, -#if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) +#ifdef CONFIG_SHMEM &shmem_enabled_attr.attr, -#endif -#ifdef CONFIG_DEBUG_VM - &debug_cow_attr.attr, #endif NULL, }; @@ -392,7 +384,11 @@ struct kobject *hugepage_kobj; if (!has_transparent_hugepage()) { - transparent_hugepage_flags = 0; + /* + * Hardware doesn't support hugepages, hence disable + * DAX PMD support. + */ + transparent_hugepage_flags = 1 << TRANSPARENT_HUGEPAGE_NEVER_DAX; return -EINVAL; } @@ -426,7 +422,7 @@ * where the extra memory used could hurt more than TLB overhead * is likely to save. The admin can still enable it through /sys. */ - if (totalram_pages < (512 << (20 - PAGE_SHIFT))) { + if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) { transparent_hugepage_flags = 0; return 0; } @@ -487,11 +483,25 @@ return pmd; } -static inline struct list_head *page_deferred_list(struct page *page) +#ifdef CONFIG_MEMCG +static inline struct deferred_split *get_deferred_split_queue(struct page *page) { - /* ->lru in the tail pages is occupied by compound_head. */ - return &page[2].deferred_list; + struct mem_cgroup *memcg = compound_head(page)->mem_cgroup; + struct pglist_data *pgdat = NODE_DATA(page_to_nid(page)); + + if (memcg) + return &memcg->deferred_split_queue; + else + return &pgdat->deferred_split_queue; } +#else +static inline struct deferred_split *get_deferred_split_queue(struct page *page) +{ + struct pglist_data *pgdat = NODE_DATA(page_to_nid(page)); + + return &pgdat->deferred_split_queue; +} +#endif void prep_transhuge_page(struct page *page) { @@ -503,6 +513,17 @@ INIT_LIST_HEAD(page_deferred_list(page)); set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR); } + +bool is_transparent_hugepage(struct page *page) +{ + if (!PageCompound(page)) + return false; + + page = compound_head(page); + return is_huge_zero_page(page) || + page[1].compound_dtor == TRANSHUGE_PAGE_DTOR; +} +EXPORT_SYMBOL_GPL(is_transparent_hugepage); static unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, @@ -561,20 +582,21 @@ struct page *page, gfp_t gfp) { struct vm_area_struct *vma = vmf->vma; - struct mem_cgroup *memcg; pgtable_t pgtable; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; vm_fault_t ret = 0; VM_BUG_ON_PAGE(!PageCompound(page), page); - if (mem_cgroup_try_charge_delay(page, vma->vm_mm, gfp, &memcg, true)) { + if (mem_cgroup_charge(page, vma->vm_mm, gfp)) { put_page(page); count_vm_event(THP_FAULT_FALLBACK); + count_vm_event(THP_FAULT_FALLBACK_CHARGE); return VM_FAULT_FALLBACK; } + cgroup_throttle_swaprate(page, gfp); - pgtable = pte_alloc_one(vma->vm_mm, haddr); + pgtable = pte_alloc_one(vma->vm_mm); if (unlikely(!pgtable)) { ret = VM_FAULT_OOM; goto release; @@ -603,7 +625,6 @@ vm_fault_t ret2; spin_unlock(vmf->ptl); - mem_cgroup_cancel_charge(page, memcg, true); put_page(page); pte_free(vma->vm_mm, pgtable); ret2 = handle_userfault(vmf, VM_UFFD_MISSING); @@ -614,14 +635,14 @@ entry = mk_huge_pmd(page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); page_add_new_anon_rmap(page, vma, haddr, true); - mem_cgroup_commit_charge(page, memcg, false, true); - lru_cache_add_active_or_unevictable(page, vma); + lru_cache_add_inactive_or_unevictable(page, vma); pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); mm_inc_nr_ptes(vma->vm_mm); spin_unlock(vmf->ptl); count_vm_event(THP_FAULT_ALLOC); + count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); } return 0; @@ -630,7 +651,6 @@ release: if (pgtable) pte_free(vma->vm_mm, pgtable); - mem_cgroup_cancel_charge(page, memcg, true); put_page(page); return ret; @@ -649,16 +669,25 @@ { const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE); + /* Always do synchronous compaction */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY); + + /* Kick kcompactd and fail quickly */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM; + + /* Synchronous compaction if madvised, otherwise kick kcompactd */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags)) - return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM : - __GFP_KSWAPD_RECLAIM); + return GFP_TRANSHUGE_LIGHT | + (vma_madvised ? __GFP_DIRECT_RECLAIM : + __GFP_KSWAPD_RECLAIM); + + /* Only do synchronous compaction if madvised */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) - return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM : - 0); + return GFP_TRANSHUGE_LIGHT | + (vma_madvised ? __GFP_DIRECT_RECLAIM : 0); + return GFP_TRANSHUGE_LIGHT; } @@ -686,7 +715,7 @@ struct page *page; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; - if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) + if (!transhuge_vma_suitable(vma, haddr)) return VM_FAULT_FALLBACK; if (unlikely(anon_vma_prepare(vma))) return VM_FAULT_OOM; @@ -698,7 +727,7 @@ pgtable_t pgtable; struct page *zero_page; vm_fault_t ret; - pgtable = pte_alloc_one(vma->vm_mm, haddr); + pgtable = pte_alloc_one(vma->vm_mm); if (unlikely(!pgtable)) return VM_FAULT_OOM; zero_page = mm_get_huge_zero_page(vma->vm_mm); @@ -787,11 +816,24 @@ pte_free(mm, pgtable); } -vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write) +/** + * vmf_insert_pfn_pmd_prot - insert a pmd size pfn + * @vmf: Structure describing the fault + * @pfn: pfn to insert + * @pgprot: page protection to use + * @write: whether it's a write fault + * + * Insert a pmd size pfn. See vmf_insert_pfn() for additional info and + * also consult the vmf_insert_mixed_prot() documentation when + * @pgprot != @vmf->vma->vm_page_prot. + * + * Return: vm_fault_t value. + */ +vm_fault_t vmf_insert_pfn_pmd_prot(struct vm_fault *vmf, pfn_t pfn, + pgprot_t pgprot, bool write) { unsigned long addr = vmf->address & PMD_MASK; struct vm_area_struct *vma = vmf->vma; - pgprot_t pgprot = vma->vm_page_prot; pgtable_t pgtable = NULL; /* @@ -809,7 +851,7 @@ return VM_FAULT_SIGBUS; if (arch_needs_pgtable_deposit()) { - pgtable = pte_alloc_one(vma->vm_mm, addr); + pgtable = pte_alloc_one(vma->vm_mm); if (!pgtable) return VM_FAULT_OOM; } @@ -819,7 +861,7 @@ insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable); return VM_FAULT_NOPAGE; } -EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd); +EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd_prot); #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma) @@ -865,11 +907,24 @@ spin_unlock(ptl); } -vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write) +/** + * vmf_insert_pfn_pud_prot - insert a pud size pfn + * @vmf: Structure describing the fault + * @pfn: pfn to insert + * @pgprot: page protection to use + * @write: whether it's a write fault + * + * Insert a pud size pfn. See vmf_insert_pfn() for additional info and + * also consult the vmf_insert_mixed_prot() documentation when + * @pgprot != @vmf->vma->vm_page_prot. + * + * Return: vm_fault_t value. + */ +vm_fault_t vmf_insert_pfn_pud_prot(struct vm_fault *vmf, pfn_t pfn, + pgprot_t pgprot, bool write) { unsigned long addr = vmf->address & PUD_MASK; struct vm_area_struct *vma = vmf->vma; - pgprot_t pgprot = vma->vm_page_prot; /* * If we had pud_special, we could avoid all these restrictions, @@ -890,7 +945,7 @@ insert_pfn_pud(vma, addr, vmf->pud, pfn, pgprot, write); return VM_FAULT_NOPAGE; } -EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud); +EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud_prot); #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ static void touch_pmd(struct vm_area_struct *vma, unsigned long addr, @@ -907,11 +962,10 @@ } struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd, int flags) + pmd_t *pmd, int flags, struct dev_pagemap **pgmap) { unsigned long pfn = pmd_pfn(*pmd); struct mm_struct *mm = vma->vm_mm; - struct dev_pagemap *pgmap; struct page *page; assert_spin_locked(pmd_lockptr(mm, pmd)); @@ -921,6 +975,11 @@ * not be in this function with `flags & FOLL_COW` set. */ WARN_ONCE(flags & FOLL_COW, "mm: In follow_devmap_pmd with FOLL_COW set"); + + /* FOLL_GET and FOLL_PIN are mutually exclusive. */ + if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == + (FOLL_PIN | FOLL_GET))) + return NULL; if (flags & FOLL_WRITE && !pmd_write(*pmd)) return NULL; @@ -937,23 +996,23 @@ * device mapped pages can only be returned if the * caller will manage the page reference count. */ - if (!(flags & FOLL_GET)) + if (!(flags & (FOLL_GET | FOLL_PIN))) return ERR_PTR(-EEXIST); pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT; - pgmap = get_dev_pagemap(pfn, NULL); - if (!pgmap) + *pgmap = get_dev_pagemap(pfn, *pgmap); + if (!*pgmap) return ERR_PTR(-EFAULT); page = pfn_to_page(pfn); - get_page(page); - put_dev_pagemap(pgmap); + if (!try_grab_page(page, flags)) + page = ERR_PTR(-ENOMEM); return page; } int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, - struct vm_area_struct *vma) + struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { spinlock_t *dst_ptl, *src_ptl; struct page *src_page; @@ -962,10 +1021,10 @@ int ret = -ENOMEM; /* Skip if can be re-fill on fault */ - if (!vma_is_anonymous(vma)) + if (!vma_is_anonymous(dst_vma)) return 0; - pgtable = pte_alloc_one(dst_mm, addr); + pgtable = pte_alloc_one(dst_mm); if (unlikely(!pgtable)) goto out; @@ -986,11 +1045,15 @@ pmd = swp_entry_to_pmd(entry); if (pmd_swp_soft_dirty(*src_pmd)) pmd = pmd_swp_mksoft_dirty(pmd); + if (pmd_swp_uffd_wp(*src_pmd)) + pmd = pmd_swp_mkuffd_wp(pmd); set_pmd_at(src_mm, addr, src_pmd, pmd); } add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); mm_inc_nr_ptes(dst_mm); pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); + if (!userfaultfd_wp(dst_vma)) + pmd = pmd_swp_clear_uffd_wp(pmd); set_pmd_at(dst_mm, addr, dst_pmd, pmd); ret = 0; goto out_unlock; @@ -1007,28 +1070,44 @@ * a page table. */ if (is_huge_zero_pmd(pmd)) { - struct page *zero_page; /* * get_huge_zero_page() will never allocate a new page here, * since we already have a zero page to copy. It just takes a * reference. */ - zero_page = mm_get_huge_zero_page(dst_mm); - set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd, - zero_page); - ret = 0; - goto out_unlock; + mm_get_huge_zero_page(dst_mm); + goto out_zero_page; } src_page = pmd_page(pmd); VM_BUG_ON_PAGE(!PageHead(src_page), src_page); + + /* + * If this page is a potentially pinned page, split and retry the fault + * with smaller page size. Normally this should not happen because the + * userspace should use MADV_DONTFORK upon pinned regions. This is a + * best effort that the pinned pages won't be replaced by another + * random page during the coming copy-on-write. + */ + if (unlikely(is_cow_mapping(src_vma->vm_flags) && + atomic_read(&src_mm->has_pinned) && + page_maybe_dma_pinned(src_page))) { + pte_free(dst_mm, pgtable); + spin_unlock(src_ptl); + spin_unlock(dst_ptl); + __split_huge_pmd(src_vma, src_pmd, addr, false, NULL); + return -EAGAIN; + } + get_page(src_page); page_dup_rmap(src_page, true); add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); +out_zero_page: mm_inc_nr_ptes(dst_mm); pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); - pmdp_set_wrprotect(src_mm, addr, src_pmd); + if (!userfaultfd_wp(dst_vma)) + pmd = pmd_clear_uffd_wp(pmd); pmd = pmd_mkold(pmd_wrprotect(pmd)); set_pmd_at(dst_mm, addr, dst_pmd, pmd); @@ -1055,16 +1134,20 @@ } struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, - pud_t *pud, int flags) + pud_t *pud, int flags, struct dev_pagemap **pgmap) { unsigned long pfn = pud_pfn(*pud); struct mm_struct *mm = vma->vm_mm; - struct dev_pagemap *pgmap; struct page *page; assert_spin_locked(pud_lockptr(mm, pud)); if (flags & FOLL_WRITE && !pud_write(*pud)) + return NULL; + + /* FOLL_GET and FOLL_PIN are mutually exclusive. */ + if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == + (FOLL_PIN | FOLL_GET))) return NULL; if (pud_present(*pud) && pud_devmap(*pud)) @@ -1078,17 +1161,19 @@ /* * device mapped pages can only be returned if the * caller will manage the page reference count. + * + * At least one of FOLL_GET | FOLL_PIN must be set, so assert that here: */ - if (!(flags & FOLL_GET)) + if (!(flags & (FOLL_GET | FOLL_PIN))) return ERR_PTR(-EEXIST); pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT; - pgmap = get_dev_pagemap(pfn, NULL); - if (!pgmap) + *pgmap = get_dev_pagemap(pfn, *pgmap); + if (!*pgmap) return ERR_PTR(-EFAULT); page = pfn_to_page(pfn); - get_page(page); - put_dev_pagemap(pgmap); + if (!try_grab_page(page, flags)) + page = ERR_PTR(-ENOMEM); return page; } @@ -1117,6 +1202,16 @@ */ if (is_huge_zero_pud(pud)) { /* No huge zero pud yet */ + } + + /* Please refer to comments in copy_huge_pmd() */ + if (unlikely(is_cow_mapping(vma->vm_flags) && + atomic_read(&src_mm->has_pinned) && + page_maybe_dma_pinned(pud_page(pud)))) { + spin_unlock(src_ptl); + spin_unlock(dst_ptl); + __split_huge_pud(vma, src_pud, addr); + return -EAGAIN; } pudp_set_wrprotect(src_mm, addr, src_pud); @@ -1173,274 +1268,73 @@ spin_unlock(vmf->ptl); } -static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, - pmd_t orig_pmd, struct page *page) -{ - struct vm_area_struct *vma = vmf->vma; - unsigned long haddr = vmf->address & HPAGE_PMD_MASK; - struct mem_cgroup *memcg; - pgtable_t pgtable; - pmd_t _pmd; - int i; - vm_fault_t ret = 0; - struct page **pages; - unsigned long mmun_start; /* For mmu_notifiers */ - unsigned long mmun_end; /* For mmu_notifiers */ - - pages = kmalloc_array(HPAGE_PMD_NR, sizeof(struct page *), - GFP_KERNEL); - if (unlikely(!pages)) { - ret |= VM_FAULT_OOM; - goto out; - } - - for (i = 0; i < HPAGE_PMD_NR; i++) { - pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma, - vmf->address, page_to_nid(page)); - if (unlikely(!pages[i] || - mem_cgroup_try_charge_delay(pages[i], vma->vm_mm, - GFP_KERNEL, &memcg, false))) { - if (pages[i]) - put_page(pages[i]); - while (--i >= 0) { - memcg = (void *)page_private(pages[i]); - set_page_private(pages[i], 0); - mem_cgroup_cancel_charge(pages[i], memcg, - false); - put_page(pages[i]); - } - kfree(pages); - ret |= VM_FAULT_OOM; - goto out; - } - set_page_private(pages[i], (unsigned long)memcg); - } - - for (i = 0; i < HPAGE_PMD_NR; i++) { - copy_user_highpage(pages[i], page + i, - haddr + PAGE_SIZE * i, vma); - __SetPageUptodate(pages[i]); - cond_resched(); - } - - mmun_start = haddr; - mmun_end = haddr + HPAGE_PMD_SIZE; - mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); - - vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); - if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) - goto out_free_pages; - VM_BUG_ON_PAGE(!PageHead(page), page); - - /* - * Leave pmd empty until pte is filled note we must notify here as - * concurrent CPU thread might write to new page before the call to - * mmu_notifier_invalidate_range_end() happens which can lead to a - * device seeing memory write in different order than CPU. - * - * See Documentation/vm/mmu_notifier.rst - */ - pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd); - - pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd); - pmd_populate(vma->vm_mm, &_pmd, pgtable); - - for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { - pte_t entry; - entry = mk_pte(pages[i], vma->vm_page_prot); - entry = maybe_mkwrite(pte_mkdirty(entry), vma); - memcg = (void *)page_private(pages[i]); - set_page_private(pages[i], 0); - page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false); - mem_cgroup_commit_charge(pages[i], memcg, false, false); - lru_cache_add_active_or_unevictable(pages[i], vma); - vmf->pte = pte_offset_map(&_pmd, haddr); - VM_BUG_ON(!pte_none(*vmf->pte)); - set_pte_at(vma->vm_mm, haddr, vmf->pte, entry); - pte_unmap(vmf->pte); - } - kfree(pages); - - smp_wmb(); /* make pte visible before pmd */ - pmd_populate(vma->vm_mm, vmf->pmd, pgtable); - page_remove_rmap(page, true); - spin_unlock(vmf->ptl); - - /* - * No need to double call mmu_notifier->invalidate_range() callback as - * the above pmdp_huge_clear_flush_notify() did already call it. - */ - mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start, - mmun_end); - - ret |= VM_FAULT_WRITE; - put_page(page); - -out: - return ret; - -out_free_pages: - spin_unlock(vmf->ptl); - mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); - for (i = 0; i < HPAGE_PMD_NR; i++) { - memcg = (void *)page_private(pages[i]); - set_page_private(pages[i], 0); - mem_cgroup_cancel_charge(pages[i], memcg, false); - put_page(pages[i]); - } - kfree(pages); - goto out; -} - vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) { struct vm_area_struct *vma = vmf->vma; - struct page *page = NULL, *new_page; - struct mem_cgroup *memcg; + struct page *page; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; - unsigned long mmun_start; /* For mmu_notifiers */ - unsigned long mmun_end; /* For mmu_notifiers */ - gfp_t huge_gfp; /* for allocation and charge */ - vm_fault_t ret = 0; vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd); VM_BUG_ON_VMA(!vma->anon_vma, vma); + if (is_huge_zero_pmd(orig_pmd)) - goto alloc; + goto fallback; + spin_lock(vmf->ptl); - if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) - goto out_unlock; + + if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) { + spin_unlock(vmf->ptl); + return 0; + } page = pmd_page(orig_pmd); VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page); - /* - * We can only reuse the page if nobody else maps the huge page or it's - * part. - */ + + /* Lock page for reuse_swap_page() */ if (!trylock_page(page)) { get_page(page); spin_unlock(vmf->ptl); lock_page(page); spin_lock(vmf->ptl); if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) { + spin_unlock(vmf->ptl); unlock_page(page); put_page(page); - goto out_unlock; + return 0; } put_page(page); } + + /* + * We can only reuse the page if nobody else maps the huge page or it's + * part. + */ if (reuse_swap_page(page, NULL)) { pmd_t entry; entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1)) + if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1)) update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); - ret |= VM_FAULT_WRITE; unlock_page(page); - goto out_unlock; - } - unlock_page(page); - get_page(page); - spin_unlock(vmf->ptl); -alloc: - if (__transparent_hugepage_enabled(vma) && - !transparent_hugepage_debug_cow()) { - huge_gfp = alloc_hugepage_direct_gfpmask(vma); - new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER); - } else - new_page = NULL; - - if (likely(new_page)) { - prep_transhuge_page(new_page); - } else { - if (!page) { - split_huge_pmd(vma, vmf->pmd, vmf->address); - ret |= VM_FAULT_FALLBACK; - } else { - ret = do_huge_pmd_wp_page_fallback(vmf, orig_pmd, page); - if (ret & VM_FAULT_OOM) { - split_huge_pmd(vma, vmf->pmd, vmf->address); - ret |= VM_FAULT_FALLBACK; - } - put_page(page); - } - count_vm_event(THP_FAULT_FALLBACK); - goto out; - } - - if (unlikely(mem_cgroup_try_charge_delay(new_page, vma->vm_mm, - huge_gfp, &memcg, true))) { - put_page(new_page); - split_huge_pmd(vma, vmf->pmd, vmf->address); - if (page) - put_page(page); - ret |= VM_FAULT_FALLBACK; - count_vm_event(THP_FAULT_FALLBACK); - goto out; - } - - count_vm_event(THP_FAULT_ALLOC); - - if (!page) - clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR); - else - copy_user_huge_page(new_page, page, vmf->address, - vma, HPAGE_PMD_NR); - __SetPageUptodate(new_page); - - mmun_start = haddr; - mmun_end = haddr + HPAGE_PMD_SIZE; - mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); - - spin_lock(vmf->ptl); - if (page) - put_page(page); - if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) { spin_unlock(vmf->ptl); - mem_cgroup_cancel_charge(new_page, memcg, true); - put_page(new_page); - goto out_mn; - } else { - pmd_t entry; - entry = mk_huge_pmd(new_page, vma->vm_page_prot); - entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd); - page_add_new_anon_rmap(new_page, vma, haddr, true); - mem_cgroup_commit_charge(new_page, memcg, false, true); - lru_cache_add_active_or_unevictable(new_page, vma); - set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); - update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); - if (!page) { - add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); - } else { - VM_BUG_ON_PAGE(!PageHead(page), page); - page_remove_rmap(page, true); - put_page(page); - } - ret |= VM_FAULT_WRITE; + return VM_FAULT_WRITE; } + + unlock_page(page); spin_unlock(vmf->ptl); -out_mn: - /* - * No need to double call mmu_notifier->invalidate_range() callback as - * the above pmdp_huge_clear_flush_notify() did already call it. - */ - mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start, - mmun_end); -out: - return ret; -out_unlock: - spin_unlock(vmf->ptl); - return ret; +fallback: + __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL); + return VM_FAULT_FALLBACK; } /* - * FOLL_FORCE or a forced COW break can write even to unwritable pmd's, - * but only after we've gone through a COW cycle and they are dirty. + * FOLL_FORCE can write to even unwritable pmd's, but only + * after we've gone through a COW cycle and they are dirty. */ static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags) { - return pmd_write(pmd) || ((flags & FOLL_COW) && pmd_dirty(pmd)); + return pmd_write(pmd) || + ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd)); } struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, @@ -1466,8 +1360,13 @@ page = pmd_page(*pmd); VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page); + + if (!try_grab_page(page, flags)) + return ERR_PTR(-ENOMEM); + if (flags & FOLL_TOUCH) touch_pmd(vma, addr, pmd, flags); + if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { /* * We don't mlock() pte-mapped THPs. This way we can avoid @@ -1496,7 +1395,6 @@ goto skip_mlock; if (!trylock_page(page)) goto skip_mlock; - lru_add_drain(); if (page->mapping && !PageDoubleMap(page)) mlock_vma_page(page); unlock_page(page); @@ -1504,8 +1402,6 @@ skip_mlock: page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page); - if (flags & FOLL_GET) - get_page(page); out: return page; @@ -1518,7 +1414,7 @@ struct anon_vma *anon_vma = NULL; struct page *page; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; - int page_nid = -1, this_nid = numa_node_id(); + int page_nid = NUMA_NO_NODE, this_nid = numa_node_id(); int target_nid, last_cpupid = -1; bool page_locked; bool migrated = false; @@ -1539,8 +1435,7 @@ if (!get_page_unless_zero(page)) goto out_unlock; spin_unlock(vmf->ptl); - wait_on_page_locked(page); - put_page(page); + put_and_wait_on_page_locked(page); goto out; } @@ -1564,7 +1459,7 @@ */ page_locked = trylock_page(page); target_nid = mpol_misplaced(page, vma, haddr); - if (target_nid == -1) { + if (target_nid == NUMA_NO_NODE) { /* If the page was locked, there are no parallel migrations */ if (page_locked) goto clear_pmdnuma; @@ -1572,12 +1467,11 @@ /* Migration could have started since the pmd_trans_migrating check */ if (!page_locked) { - page_nid = -1; + page_nid = NUMA_NO_NODE; if (!get_page_unless_zero(page)) goto out_unlock; spin_unlock(vmf->ptl); - wait_on_page_locked(page); - put_page(page); + put_and_wait_on_page_locked(page); goto out; } @@ -1587,21 +1481,21 @@ */ get_page(page); spin_unlock(vmf->ptl); - anon_vma = page_lock_anon_vma_read(page); + anon_vma = page_lock_anon_vma_read(page, NULL); /* Confirm the PMD did not change while page_table_lock was released */ spin_lock(vmf->ptl); if (unlikely(!pmd_same(pmd, *vmf->pmd))) { unlock_page(page); put_page(page); - page_nid = -1; + page_nid = NUMA_NO_NODE; goto out_unlock; } /* Bail if we fail to protect against THP splits for any reason */ if (unlikely(!anon_vma)) { put_page(page); - page_nid = -1; + page_nid = NUMA_NO_NODE; goto clear_pmdnuma; } @@ -1616,8 +1510,20 @@ * We are not sure a pending tlb flush here is for a huge page * mapping or not. Hence use the tlb range variant */ - if (mm_tlb_flush_pending(vma->vm_mm)) + if (mm_tlb_flush_pending(vma->vm_mm)) { flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE); + /* + * change_huge_pmd() released the pmd lock before + * invalidating the secondary MMUs sharing the primary + * MMU pagetables (with ->invalidate_range()). The + * mmu_notifier_invalidate_range_end() (which + * internally calls ->invalidate_range()) in + * change_pmd_range() will run after us, so we can't + * rely on it here and we need an explicit invalidate. + */ + mmu_notifier_invalidate_range(vma->vm_mm, haddr, + haddr + HPAGE_PMD_SIZE); + } /* * Migrate the THP to the requested node, returns with page unlocked @@ -1651,7 +1557,7 @@ if (anon_vma) page_unlock_anon_vma_read(anon_vma); - if (page_nid != -1) + if (page_nid != NUMA_NO_NODE) task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, flags); @@ -1671,7 +1577,7 @@ struct mm_struct *mm = tlb->mm; bool ret = false; - tlb_remove_check_page_size_change(tlb, HPAGE_PMD_SIZE); + tlb_change_page_size(tlb, HPAGE_PMD_SIZE); ptl = pmd_trans_huge_lock(pmd, vma); if (!ptl) @@ -1747,7 +1653,7 @@ pmd_t orig_pmd; spinlock_t *ptl; - tlb_remove_check_page_size_change(tlb, HPAGE_PMD_SIZE); + tlb_change_page_size(tlb, HPAGE_PMD_SIZE); ptl = __pmd_trans_huge_lock(pmd, vma); if (!ptl) @@ -1758,10 +1664,10 @@ * pgtable_trans_huge_withdraw after finishing pmdp related * operations. */ - orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd, - tlb->fullmm); + orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd, + tlb->fullmm); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); - if (vma_is_dax(vma)) { + if (vma_is_special_huge(vma)) { if (arch_needs_pgtable_deposit()) zap_deposited_table(tlb->mm, pmd); spin_unlock(ptl); @@ -1785,7 +1691,7 @@ VM_BUG_ON(!is_pmd_migration_entry(orig_pmd)); entry = pmd_to_swp_entry(orig_pmd); - page = pfn_to_page(swp_offset(entry)); + page = migration_entry_to_page(entry); flush_needed = 0; } else WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); @@ -1833,18 +1739,12 @@ } bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, - unsigned long new_addr, unsigned long old_end, - pmd_t *old_pmd, pmd_t *new_pmd) + unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd) { spinlock_t *old_ptl, *new_ptl; pmd_t pmd; struct mm_struct *mm = vma->vm_mm; bool force_flush = false; - - if ((old_addr & ~HPAGE_PMD_MASK) || - (new_addr & ~HPAGE_PMD_MASK) || - old_end - old_addr < HPAGE_PMD_SIZE) - return false; /* * The destination pmd shouldn't be established, free_pgtables() @@ -1857,7 +1757,7 @@ /* * We don't have to worry about the ordering of src and dst - * ptlocks because exclusive mmap_sem prevents deadlock. + * ptlocks because exclusive mmap_lock prevents deadlock. */ old_ptl = __pmd_trans_huge_lock(old_pmd, vma); if (old_ptl) { @@ -1893,13 +1793,16 @@ * - HPAGE_PMD_NR is protections changed and TLB flush necessary */ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, pgprot_t newprot, int prot_numa) + unsigned long addr, pgprot_t newprot, unsigned long cp_flags) { struct mm_struct *mm = vma->vm_mm; spinlock_t *ptl; pmd_t entry; bool preserve_write; int ret; + bool prot_numa = cp_flags & MM_CP_PROT_NUMA; + bool uffd_wp = cp_flags & MM_CP_UFFD_WP; + bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; ptl = __pmd_trans_huge_lock(pmd, vma); if (!ptl) @@ -1923,6 +1826,8 @@ newpmd = swp_entry_to_pmd(entry); if (pmd_swp_soft_dirty(*pmd)) newpmd = pmd_swp_mksoft_dirty(newpmd); + if (pmd_swp_uffd_wp(*pmd)) + newpmd = pmd_swp_mkuffd_wp(newpmd); set_pmd_at(mm, addr, pmd, newpmd); } goto unlock; @@ -1941,9 +1846,9 @@ goto unlock; /* - * In case prot_numa, we are under down_read(mmap_sem). It's critical + * In case prot_numa, we are under mmap_read_lock(mm). It's critical * to not clear pmd intermittently to avoid race with MADV_DONTNEED - * which is also under down_read(mmap_sem): + * which is also under mmap_read_lock(mm): * * CPU0: CPU1: * change_huge_pmd(prot_numa=1) @@ -1966,6 +1871,17 @@ entry = pmd_modify(entry, newprot); if (preserve_write) entry = pmd_mk_savedwrite(entry); + if (uffd_wp) { + entry = pmd_wrprotect(entry); + entry = pmd_mkuffd_wp(entry); + } else if (uffd_wp_resolve) { + /* + * Leave the write bit to be handled by PF interrupt + * handler, then things like COW could be properly + * handled. + */ + entry = pmd_clear_uffd_wp(entry); + } ret = HPAGE_PMD_NR; set_pmd_at(mm, addr, pmd, entry); BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry)); @@ -2012,7 +1928,6 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, unsigned long addr) { - pud_t orig_pud; spinlock_t *ptl; ptl = __pud_trans_huge_lock(pud, vma); @@ -2024,10 +1939,9 @@ * pgtable_trans_huge_withdraw after finishing pudp related * operations. */ - orig_pud = pudp_huge_get_and_clear_full(tlb->mm, addr, pud, - tlb->fullmm); + pudp_huge_get_and_clear_full(tlb->mm, addr, pud, tlb->fullmm); tlb_remove_pud_tlb_entry(tlb, pud, addr); - if (vma_is_dax(vma)) { + if (vma_is_special_huge(vma)) { spin_unlock(ptl); /* No zero page support yet */ } else { @@ -2054,14 +1968,16 @@ unsigned long address) { spinlock_t *ptl; - struct mm_struct *mm = vma->vm_mm; - unsigned long haddr = address & HPAGE_PUD_MASK; + struct mmu_notifier_range range; - mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PUD_SIZE); - ptl = pud_lock(mm, pud); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + address & HPAGE_PUD_MASK, + (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE); + mmu_notifier_invalidate_range_start(&range); + ptl = pud_lock(vma->vm_mm, pud); if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud))) goto out; - __split_huge_pud_locked(vma, pud, haddr); + __split_huge_pud_locked(vma, pud, range.start); out: spin_unlock(ptl); @@ -2069,8 +1985,7 @@ * No need to double call mmu_notifier->invalidate_range() callback as * the above pudp_huge_clear_flush_notify() did already call it. */ - mmu_notifier_invalidate_range_only_end(mm, haddr, haddr + - HPAGE_PUD_SIZE); + mmu_notifier_invalidate_range_only_end(&range); } #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ @@ -2079,7 +1994,7 @@ { struct mm_struct *mm = vma->vm_mm; pgtable_t pgtable; - pmd_t _pmd; + pmd_t _pmd, old_pmd; int i; /* @@ -2090,7 +2005,7 @@ * * See Documentation/vm/mmu_notifier.rst */ - pmdp_huge_clear_flush(vma, haddr, pmd); + old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd); pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); @@ -2099,6 +2014,8 @@ pte_t *pte, entry; entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot); entry = pte_mkspecial(entry); + if (pmd_uffd_wp(old_pmd)) + entry = pte_mkuffd_wp(entry); pte = pte_offset_map(&_pmd, haddr); VM_BUG_ON(!pte_none(*pte)); set_pte_at(mm, haddr, pte, entry); @@ -2115,9 +2032,10 @@ struct page *page; pgtable_t pgtable; pmd_t old_pmd, _pmd; - bool young, write, soft_dirty, pmd_migration = false; + bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false; unsigned long addr; int i; + bool success = false; VM_BUG_ON(haddr & ~HPAGE_PMD_MASK); VM_BUG_ON_VMA(vma->vm_start > haddr, vma); @@ -2135,7 +2053,7 @@ */ if (arch_needs_pgtable_deposit()) zap_deposited_table(mm, pmd); - if (vma_is_dax(vma)) + if (vma_is_special_huge(vma)) return; if (unlikely(is_pmd_migration_entry(old_pmd))) { swp_entry_t entry; @@ -2176,8 +2094,8 @@ * free), userland could trigger a small page size TLB miss on the * small sized TLB while the hugepage TLB entry is still established in * the huge TLB. Some CPU doesn't like that. - * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum - * 383 on page 93. Intel should be safe but is also warns that it's + * See http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum + * 383 on page 105. Intel should be safe but is also warns that it's * only safe if the permission and cache attributes of the two entries * loaded in the two TLB is identical (which should be the case here). * But it is generally safer to never allow small and huge TLB entries @@ -2195,10 +2113,11 @@ swp_entry_t entry; entry = pmd_to_swp_entry(old_pmd); - page = pfn_to_page(swp_offset(entry)); + page = migration_entry_to_page(entry); write = is_write_migration_entry(entry); young = false; soft_dirty = pmd_swp_soft_dirty(old_pmd); + uffd_wp = pmd_swp_uffd_wp(old_pmd); } else { page = pmd_page(old_pmd); if (pmd_dirty(old_pmd)) @@ -2206,6 +2125,7 @@ write = pmd_write(old_pmd); young = pmd_young(old_pmd); soft_dirty = pmd_soft_dirty(old_pmd); + uffd_wp = pmd_uffd_wp(old_pmd); } VM_BUG_ON_PAGE(!page_count(page), page); page_ref_add(page, HPAGE_PMD_NR - 1); @@ -2230,21 +2150,29 @@ entry = swp_entry_to_pte(swp_entry); if (soft_dirty) entry = pte_swp_mksoft_dirty(entry); + if (uffd_wp) + entry = pte_swp_mkuffd_wp(entry); } else { entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot)); - entry = maybe_mkwrite(entry, vma); + entry = maybe_mkwrite(entry, vma->vm_flags); if (!write) entry = pte_wrprotect(entry); if (!young) entry = pte_mkold(entry); if (soft_dirty) entry = pte_mksoft_dirty(entry); + if (uffd_wp) + entry = pte_mkuffd_wp(entry); } pte = pte_offset_map(&_pmd, addr); BUG_ON(!pte_none(*pte)); set_pte_at(mm, addr, pte, entry); - if (!pmd_migration) - atomic_inc(&page[i]._mapcount); + if (!pmd_migration) { + trace_android_vh_update_page_mapcount(&page[i], true, + false, NULL, &success); + if (!success) + atomic_inc(&page[i]._mapcount); + } pte_unmap(pte); } @@ -2255,8 +2183,12 @@ */ if (compound_mapcount(page) > 1 && !TestSetPageDoubleMap(page)) { - for (i = 0; i < HPAGE_PMD_NR; i++) - atomic_inc(&page[i]._mapcount); + for (i = 0; i < HPAGE_PMD_NR; i++) { + trace_android_vh_update_page_mapcount(&page[i], true, + false, NULL, &success); + if (!success) + atomic_inc(&page[i]._mapcount); + } } lock_page_memcg(page); @@ -2265,8 +2197,12 @@ __dec_lruvec_page_state(page, NR_ANON_THPS); if (TestClearPageDoubleMap(page)) { /* No need in mapcount reference anymore */ - for (i = 0; i < HPAGE_PMD_NR; i++) - atomic_dec(&page[i]._mapcount); + for (i = 0; i < HPAGE_PMD_NR; i++) { + trace_android_vh_update_page_mapcount(&page[i], + false, false, NULL, &success); + if (!success) + atomic_dec(&page[i]._mapcount); + } } } unlock_page_memcg(page); @@ -2287,13 +2223,15 @@ unsigned long address, bool freeze, struct page *page) { spinlock_t *ptl; - struct mm_struct *mm = vma->vm_mm; - unsigned long haddr = address & HPAGE_PMD_MASK; + struct mmu_notifier_range range; bool do_unlock_page = false; pmd_t _pmd; - mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE); - ptl = pmd_lock(mm, pmd); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + address & HPAGE_PMD_MASK, + (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE); + mmu_notifier_invalidate_range_start(&range); + ptl = pmd_lock(vma->vm_mm, pmd); /* * If caller asks to setup a migration entries, we need a page to check @@ -2339,7 +2277,7 @@ clear_page_mlock(page); } else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd))) goto out; - __split_huge_pmd_locked(vma, pmd, haddr, freeze); + __split_huge_pmd_locked(vma, pmd, range.start, freeze); out: spin_unlock(ptl); if (do_unlock_page) @@ -2357,8 +2295,7 @@ * any further changes to individual pte will notify. So no need * to call mmu_notifier->invalidate_range() */ - mmu_notifier_invalidate_range_only_end(mm, haddr, haddr + - HPAGE_PMD_SIZE); + mmu_notifier_invalidate_range_only_end(&range); } void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, @@ -2413,13 +2350,13 @@ /* * If we're also updating the vma->vm_next->vm_start, if the new - * vm_next->vm_start isn't page aligned and it could previously + * vm_next->vm_start isn't hpage aligned and it could previously * contain an hugepage: check if we need to split an huge pmd. */ if (adjust_next > 0) { struct vm_area_struct *next = vma->vm_next; unsigned long nstart = next->vm_start; - nstart += adjust_next << PAGE_SHIFT; + nstart += adjust_next; if (nstart & ~HPAGE_PMD_MASK && (nstart & HPAGE_PMD_MASK) >= next->vm_start && (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end) @@ -2429,8 +2366,8 @@ static void unmap_page(struct page *page) { - enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS | - TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD | TTU_SYNC; + enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_SYNC | + TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD; VM_BUG_ON_PAGE(!PageHead(page), page); @@ -2442,13 +2379,13 @@ VM_WARN_ON_ONCE_PAGE(page_mapped(page), page); } -static void remap_page(struct page *page) +static void remap_page(struct page *page, unsigned int nr) { int i; if (PageTransHuge(page)) { remove_migration_ptes(page, page, true); } else { - for (i = 0; i < HPAGE_PMD_NR; i++) + for (i = 0; i < nr; i++) remove_migration_ptes(page + i, page + i, true); } } @@ -2477,6 +2414,9 @@ (1L << PG_workingset) | (1L << PG_locked) | (1L << PG_unevictable) | +#ifdef CONFIG_64BIT + (1L << PG_arch_2) | +#endif (1L << PG_dirty))); /* ->mapping in first tail page is compound_mapcount */ @@ -2519,16 +2459,27 @@ pgoff_t end, unsigned long flags) { struct page *head = compound_head(page); - struct zone *zone = page_zone(head); + pg_data_t *pgdat = page_pgdat(head); struct lruvec *lruvec; + struct address_space *swap_cache = NULL; + unsigned long offset = 0; + unsigned int nr = thp_nr_pages(head); int i; - lruvec = mem_cgroup_page_lruvec(head, zone->zone_pgdat); + lruvec = mem_cgroup_page_lruvec(head, pgdat); /* complete memcg works before add pages to LRU */ - mem_cgroup_split_huge_fixup(head); + split_page_memcg(head, nr); - for (i = HPAGE_PMD_NR - 1; i >= 1; i--) { + if (PageAnon(head) && PageSwapCache(head)) { + swp_entry_t entry = { .val = page_private(head) }; + + offset = swp_offset(entry); + swap_cache = swap_address_space(entry); + xa_lock(&swap_cache->i_pages); + } + + for (i = nr - 1; i >= 1; i--) { __split_huge_page_tail(head, i, lruvec, list); /* Some pages can be beyond i_size: drop them from page cache */ if (head[i].index >= end) { @@ -2537,31 +2488,45 @@ if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head)) shmem_uncharge(head->mapping->host, 1); put_page(head + i); + } else if (!PageAnon(page)) { + __xa_store(&head->mapping->i_pages, head[i].index, + head + i, 0); + } else if (swap_cache) { + __xa_store(&swap_cache->i_pages, offset + i, + head + i, 0); } } ClearPageCompound(head); - split_page_owner(head, HPAGE_PMD_ORDER); + split_page_owner(head, nr); /* See comment in __split_huge_page_tail() */ if (PageAnon(head)) { - /* Additional pin to radix tree of swap cache */ - if (PageSwapCache(head)) + /* Additional pin to swap cache */ + if (PageSwapCache(head)) { page_ref_add(head, 2); - else + xa_unlock(&swap_cache->i_pages); + } else { page_ref_inc(head); + } } else { - /* Additional pin to radix tree */ + /* Additional pin to page cache */ page_ref_add(head, 2); xa_unlock(&head->mapping->i_pages); } - spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags); + spin_unlock_irqrestore(&pgdat->lru_lock, flags); - remap_page(head); + remap_page(head, nr); - for (i = 0; i < HPAGE_PMD_NR; i++) { + if (PageSwapCache(head)) { + swp_entry_t entry = { .val = page_private(head) }; + + split_swap_cluster(entry); + } + + for (i = 0; i < nr; i++) { struct page *subpage = head + i; if (subpage == page) continue; @@ -2580,7 +2545,7 @@ int total_mapcount(struct page *page) { - int i, compound, ret; + int i, compound, nr, ret; VM_BUG_ON_PAGE(PageTail(page), page); @@ -2588,16 +2553,17 @@ return atomic_read(&page->_mapcount) + 1; compound = compound_mapcount(page); + nr = compound_nr(page); if (PageHuge(page)) return compound; ret = compound; - for (i = 0; i < HPAGE_PMD_NR; i++) + for (i = 0; i < nr; i++) ret += atomic_read(&page[i]._mapcount) + 1; /* File pages has compound_mapcount included in _mapcount */ if (!PageAnon(page)) - return ret - compound * HPAGE_PMD_NR; + return ret - compound * nr; if (PageDoubleMap(page)) - ret -= HPAGE_PMD_NR; + ret -= nr; return ret; } @@ -2642,14 +2608,14 @@ page = compound_head(page); _total_mapcount = ret = 0; - for (i = 0; i < HPAGE_PMD_NR; i++) { + for (i = 0; i < thp_nr_pages(page); i++) { mapcount = atomic_read(&page[i]._mapcount) + 1; ret = max(ret, mapcount); _total_mapcount += mapcount; } if (PageDoubleMap(page)) { ret -= 1; - _total_mapcount -= HPAGE_PMD_NR; + _total_mapcount -= thp_nr_pages(page); } mapcount = compound_mapcount(page); ret += mapcount; @@ -2664,11 +2630,11 @@ { int extra_pins; - /* Additional pins from radix tree */ + /* Additional pins from page cache */ if (PageAnon(page)) - extra_pins = PageSwapCache(page) ? HPAGE_PMD_NR : 0; + extra_pins = PageSwapCache(page) ? thp_nr_pages(page) : 0; else - extra_pins = HPAGE_PMD_NR; + extra_pins = thp_nr_pages(page); if (pextra_pins) *pextra_pins = extra_pins; return total_mapcount(page) == page_count(page) - extra_pins - 1; @@ -2697,23 +2663,23 @@ { struct page *head = compound_head(page); struct pglist_data *pgdata = NODE_DATA(page_to_nid(head)); + struct deferred_split *ds_queue = get_deferred_split_queue(head); struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; int extra_pins, ret; - bool mlocked; unsigned long flags; pgoff_t end; VM_BUG_ON_PAGE(is_huge_zero_page(head), head); - VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(!PageCompound(page), page); + VM_BUG_ON_PAGE(!PageLocked(head), head); + VM_BUG_ON_PAGE(!PageCompound(head), head); - if (PageWriteback(page)) + if (PageWriteback(head)) return -EBUSY; if (PageAnon(head)) { /* - * The caller does not necessarily hold an mmap_sem that would + * The caller does not necessarily hold an mmap_lock that would * prevent the anon_vma disappearing so we first we take a * reference to it and then lock the anon_vma for write. This * is similar to page_lock_anon_vma_read except the write lock @@ -2759,55 +2725,47 @@ goto out_unlock; } - mlocked = PageMlocked(page); unmap_page(head); - /* Make sure the page is not on per-CPU pagevec as it takes pin */ - if (mlocked) - lru_add_drain(); - /* prevent PageLRU to go away from under us, and freeze lru stats */ - spin_lock_irqsave(zone_lru_lock(page_zone(head)), flags); + spin_lock_irqsave(&pgdata->lru_lock, flags); if (mapping) { - void **pslot; + XA_STATE(xas, &mapping->i_pages, page_index(head)); - xa_lock(&mapping->i_pages); - pslot = radix_tree_lookup_slot(&mapping->i_pages, - page_index(head)); /* - * Check if the head page is present in radix tree. + * Check if the head page is present in page cache. * We assume all tail are present too, if head is there. */ - if (radix_tree_deref_slot_protected(pslot, - &mapping->i_pages.xa_lock) != head) + xa_lock(&mapping->i_pages); + if (xas_load(&xas) != head) goto fail; } /* Prevent deferred_split_scan() touching ->_refcount */ - spin_lock(&pgdata->split_queue_lock); + spin_lock(&ds_queue->split_queue_lock); if (page_ref_freeze(head, 1 + extra_pins)) { if (!list_empty(page_deferred_list(head))) { - pgdata->split_queue_len--; + ds_queue->split_queue_len--; list_del(page_deferred_list(head)); } - if (mapping) - __dec_node_page_state(page, NR_SHMEM_THPS); - spin_unlock(&pgdata->split_queue_lock); - __split_huge_page(page, list, end, flags); - if (PageSwapCache(head)) { - swp_entry_t entry = { .val = page_private(head) }; + spin_unlock(&ds_queue->split_queue_lock); + if (mapping) { + if (PageSwapBacked(head)) + __dec_node_page_state(head, NR_SHMEM_THPS); + else + __dec_node_page_state(head, NR_FILE_THPS); + } - ret = split_swap_cluster(entry); - } else - ret = 0; + __split_huge_page(page, list, end, flags); + ret = 0; } else { - spin_unlock(&pgdata->split_queue_lock); + spin_unlock(&ds_queue->split_queue_lock); fail: if (mapping) xa_unlock(&mapping->i_pages); - spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags); - remap_page(head); + spin_unlock_irqrestore(&pgdata->lru_lock, flags); + remap_page(head, thp_nr_pages(head)); ret = -EBUSY; } @@ -2825,53 +2783,89 @@ void free_transhuge_page(struct page *page) { - struct pglist_data *pgdata = NODE_DATA(page_to_nid(page)); + struct deferred_split *ds_queue = get_deferred_split_queue(page); unsigned long flags; - spin_lock_irqsave(&pgdata->split_queue_lock, flags); + spin_lock_irqsave(&ds_queue->split_queue_lock, flags); if (!list_empty(page_deferred_list(page))) { - pgdata->split_queue_len--; + ds_queue->split_queue_len--; list_del(page_deferred_list(page)); } - spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); + spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); free_compound_page(page); } void deferred_split_huge_page(struct page *page) { - struct pglist_data *pgdata = NODE_DATA(page_to_nid(page)); + struct deferred_split *ds_queue = get_deferred_split_queue(page); +#ifdef CONFIG_MEMCG + struct mem_cgroup *memcg = compound_head(page)->mem_cgroup; +#endif unsigned long flags; VM_BUG_ON_PAGE(!PageTransHuge(page), page); - spin_lock_irqsave(&pgdata->split_queue_lock, flags); + /* + * The try_to_unmap() in page reclaim path might reach here too, + * this may cause a race condition to corrupt deferred split queue. + * And, if page reclaim is already handling the same page, it is + * unnecessary to handle it again in shrinker. + * + * Check PageSwapCache to determine if the page is being + * handled by page reclaim since THP swap would add the page into + * swap cache before calling try_to_unmap(). + */ + if (PageSwapCache(page)) + return; + + if (!list_empty(page_deferred_list(page))) + return; + + spin_lock_irqsave(&ds_queue->split_queue_lock, flags); if (list_empty(page_deferred_list(page))) { count_vm_event(THP_DEFERRED_SPLIT_PAGE); - list_add_tail(page_deferred_list(page), &pgdata->split_queue); - pgdata->split_queue_len++; + list_add_tail(page_deferred_list(page), &ds_queue->split_queue); + ds_queue->split_queue_len++; +#ifdef CONFIG_MEMCG + if (memcg) + memcg_set_shrinker_bit(memcg, page_to_nid(page), + deferred_split_shrinker.id); +#endif } - spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); + spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); } static unsigned long deferred_split_count(struct shrinker *shrink, struct shrink_control *sc) { struct pglist_data *pgdata = NODE_DATA(sc->nid); - return READ_ONCE(pgdata->split_queue_len); + struct deferred_split *ds_queue = &pgdata->deferred_split_queue; + +#ifdef CONFIG_MEMCG + if (sc->memcg) + ds_queue = &sc->memcg->deferred_split_queue; +#endif + return READ_ONCE(ds_queue->split_queue_len); } static unsigned long deferred_split_scan(struct shrinker *shrink, struct shrink_control *sc) { struct pglist_data *pgdata = NODE_DATA(sc->nid); + struct deferred_split *ds_queue = &pgdata->deferred_split_queue; unsigned long flags; LIST_HEAD(list), *pos, *next; struct page *page; int split = 0; - spin_lock_irqsave(&pgdata->split_queue_lock, flags); +#ifdef CONFIG_MEMCG + if (sc->memcg) + ds_queue = &sc->memcg->deferred_split_queue; +#endif + + spin_lock_irqsave(&ds_queue->split_queue_lock, flags); /* Take pin on all head pages to avoid freeing them under us */ - list_for_each_safe(pos, next, &pgdata->split_queue) { + list_for_each_safe(pos, next, &ds_queue->split_queue) { page = list_entry((void *)pos, struct page, mapping); page = compound_head(page); if (get_page_unless_zero(page)) { @@ -2879,12 +2873,12 @@ } else { /* We lost race with put_compound_page() */ list_del_init(page_deferred_list(page)); - pgdata->split_queue_len--; + ds_queue->split_queue_len--; } if (!--sc->nr_to_scan) break; } - spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); + spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); list_for_each_safe(pos, next, &list) { page = list_entry((void *)pos, struct page, mapping); @@ -2898,15 +2892,15 @@ put_page(page); } - spin_lock_irqsave(&pgdata->split_queue_lock, flags); - list_splice_tail(&list, &pgdata->split_queue); - spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); + spin_lock_irqsave(&ds_queue->split_queue_lock, flags); + list_splice_tail(&list, &ds_queue->split_queue); + spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); /* * Stop shrinker if we didn't split any page, but the queue is empty. * This can happen if pages were freed under us. */ - if (!split && list_empty(&pgdata->split_queue)) + if (!split && list_empty(&ds_queue->split_queue)) return SHRINK_STOP; return split; } @@ -2915,7 +2909,8 @@ .count_objects = deferred_split_count, .scan_objects = deferred_split_scan, .seeks = DEFAULT_SEEKS, - .flags = SHRINKER_NUMA_AWARE, + .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE | + SHRINKER_NONSLAB, }; #ifdef CONFIG_DEBUG_FS @@ -2959,17 +2954,13 @@ return 0; } -DEFINE_SIMPLE_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set, +DEFINE_DEBUGFS_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set, "%llu\n"); static int __init split_huge_pages_debugfs(void) { - void *ret; - - ret = debugfs_create_file("split_huge_pages", 0200, NULL, NULL, - &split_huge_pages_fops); - if (!ret) - pr_warn("Failed to create split_huge_pages in debugfs"); + debugfs_create_file("split_huge_pages", 0200, NULL, NULL, + &split_huge_pages_fops); return 0; } late_initcall(split_huge_pages_debugfs); @@ -3021,6 +3012,8 @@ pmde = pmd_mksoft_dirty(pmde); if (is_write_migration_entry(entry)) pmde = maybe_pmd_mkwrite(pmde, vma); + if (pmd_swp_uffd_wp(*pvmw->pmd)) + pmde = pmd_wrprotect(pmd_mkuffd_wp(pmde)); flush_cache_range(vma, mmun_start, mmun_start + HPAGE_PMD_SIZE); if (PageAnon(new)) -- Gitblit v1.6.2