From 244b2c5ca8b14627e4a17755e5922221e121c771 Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Wed, 09 Oct 2024 06:15:07 +0000
Subject: [PATCH] change system file
---
kernel/mm/userfaultfd.c | 400 ++++++++++++++++++++++++++++++++++++++++-----------------
1 files changed, 280 insertions(+), 120 deletions(-)
diff --git a/kernel/mm/userfaultfd.c b/kernel/mm/userfaultfd.c
index 93a12cc..fa707e5 100644
--- a/kernel/mm/userfaultfd.c
+++ b/kernel/mm/userfaultfd.c
@@ -1,10 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* mm/userfaultfd.c
*
* Copyright (C) 2015 Red Hat, Inc.
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
*/
#include <linux/mm.h>
@@ -20,21 +18,119 @@
#include <asm/tlbflush.h>
#include "internal.h"
+static __always_inline
+struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
+ unsigned long dst_start,
+ unsigned long len)
+{
+ /*
+ * Make sure that the dst range is both valid and fully within a
+ * single existing vma.
+ */
+ struct vm_area_struct *dst_vma;
+
+ dst_vma = find_vma(dst_mm, dst_start);
+ if (!dst_vma)
+ return NULL;
+
+ if (dst_start < dst_vma->vm_start ||
+ dst_start + len > dst_vma->vm_end)
+ return NULL;
+
+ /*
+ * Check the vma is registered in uffd, this is required to
+ * enforce the VM_MAYWRITE check done at uffd registration
+ * time.
+ */
+ if (!dst_vma->vm_userfaultfd_ctx.ctx)
+ return NULL;
+
+ return dst_vma;
+}
+
+/*
+ * Install PTEs, to map dst_addr (within dst_vma) to page.
+ *
+ * This function handles both MCOPY_ATOMIC_NORMAL and _CONTINUE for both shmem
+ * and anon, and for both shared and private VMAs.
+ */
+int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr, struct page *page,
+ bool newly_allocated, bool wp_copy)
+{
+ int ret;
+ pte_t _dst_pte, *dst_pte;
+ bool writable = dst_vma->vm_flags & VM_WRITE;
+ bool vm_shared = dst_vma->vm_flags & VM_SHARED;
+ bool page_in_cache = page_mapping(page);
+ spinlock_t *ptl;
+ struct inode *inode;
+ pgoff_t offset, max_off;
+
+ _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
+ if (page_in_cache && !vm_shared)
+ writable = false;
+ if (writable || !page_in_cache)
+ _dst_pte = pte_mkdirty(_dst_pte);
+ if (writable) {
+ if (wp_copy)
+ _dst_pte = pte_mkuffd_wp(_dst_pte);
+ else
+ _dst_pte = pte_mkwrite(_dst_pte);
+ }
+
+ dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
+
+ if (vma_is_shmem(dst_vma)) {
+ /* serialize against truncate with the page table lock */
+ inode = dst_vma->vm_file->f_inode;
+ offset = linear_page_index(dst_vma, dst_addr);
+ max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+ ret = -EFAULT;
+ if (unlikely(offset >= max_off))
+ goto out_unlock;
+ }
+
+ ret = -EEXIST;
+ if (!pte_none(*dst_pte))
+ goto out_unlock;
+
+ if (page_in_cache)
+ page_add_file_rmap(page, false);
+ else
+ page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
+
+ /*
+ * Must happen after rmap, as mm_counter() checks mapping (via
+ * PageAnon()), which is set by __page_set_anon_rmap().
+ */
+ inc_mm_counter(dst_mm, mm_counter(page));
+
+ if (newly_allocated)
+ lru_cache_add_inactive_or_unevictable(page, dst_vma);
+
+ set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
+
+ /* No need to invalidate - it was non-present before */
+ update_mmu_cache(dst_vma, dst_addr, dst_pte);
+ ret = 0;
+out_unlock:
+ pte_unmap_unlock(dst_pte, ptl);
+ return ret;
+}
+
static int mcopy_atomic_pte(struct mm_struct *dst_mm,
pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
unsigned long src_addr,
- struct page **pagep)
+ struct page **pagep,
+ bool wp_copy)
{
- struct mem_cgroup *memcg;
- pte_t _dst_pte, *dst_pte;
- spinlock_t *ptl;
void *page_kaddr;
int ret;
struct page *page;
- pgoff_t offset, max_off;
- struct inode *inode;
if (!*pagep) {
ret = -ENOMEM;
@@ -48,13 +144,15 @@
PAGE_SIZE);
kunmap_atomic(page_kaddr);
- /* fallback to copy_from_user outside mmap_sem */
+ /* fallback to copy_from_user outside mmap_lock */
if (unlikely(ret)) {
ret = -ENOENT;
*pagep = page;
/* don't free the page */
goto out;
}
+
+ flush_dcache_page(page);
} else {
page = *pagep;
*pagep = NULL;
@@ -62,50 +160,21 @@
/*
* The memory barrier inside __SetPageUptodate makes sure that
- * preceeding stores to the page contents become visible before
+ * preceding stores to the page contents become visible before
* the set_pte_at() write.
*/
__SetPageUptodate(page);
ret = -ENOMEM;
- if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg, false))
+ if (mem_cgroup_charge(page, dst_mm, GFP_KERNEL))
goto out_release;
- _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
- if (dst_vma->vm_flags & VM_WRITE)
- _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));
-
- dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
- if (dst_vma->vm_file) {
- /* the shmem MAP_PRIVATE case requires checking the i_size */
- inode = dst_vma->vm_file->f_inode;
- offset = linear_page_index(dst_vma, dst_addr);
- max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
- ret = -EFAULT;
- if (unlikely(offset >= max_off))
- goto out_release_uncharge_unlock;
- }
- ret = -EEXIST;
- if (!pte_none(*dst_pte))
- goto out_release_uncharge_unlock;
-
- inc_mm_counter(dst_mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
- mem_cgroup_commit_charge(page, memcg, false, false);
- lru_cache_add_active_or_unevictable(page, dst_vma);
-
- set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
-
- /* No need to invalidate - it was non-present before */
- update_mmu_cache(dst_vma, dst_addr, dst_pte);
-
- pte_unmap_unlock(dst_pte, ptl);
- ret = 0;
+ ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
+ page, true, wp_copy);
+ if (ret)
+ goto out_release;
out:
return ret;
-out_release_uncharge_unlock:
- pte_unmap_unlock(dst_pte, ptl);
- mem_cgroup_cancel_charge(page, memcg, false);
out_release:
put_page(page);
goto out;
@@ -146,6 +215,41 @@
return ret;
}
+/* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */
+static int mcontinue_atomic_pte(struct mm_struct *dst_mm,
+ pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr,
+ bool wp_copy)
+{
+ struct inode *inode = file_inode(dst_vma->vm_file);
+ pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
+ struct page *page;
+ int ret;
+
+ ret = shmem_getpage(inode, pgoff, &page, SGP_READ);
+ if (ret)
+ goto out;
+ if (!page) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
+ page, false, wp_copy);
+ if (ret)
+ goto out_release;
+
+ unlock_page(page);
+ ret = 0;
+out:
+ return ret;
+out_release:
+ unlock_page(page);
+ put_page(page);
+ goto out;
+}
+
static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
{
pgd_t *pgd;
@@ -170,14 +274,14 @@
#ifdef CONFIG_HUGETLB_PAGE
/*
* __mcopy_atomic processing for HUGETLB vmas. Note that this routine is
- * called with mmap_sem held, it will release mmap_sem before returning.
+ * called with mmap_lock held, it will release mmap_lock before returning.
*/
static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
struct vm_area_struct *dst_vma,
unsigned long dst_start,
unsigned long src_start,
unsigned long len,
- bool zeropage)
+ enum mcopy_atomic_mode mode)
{
int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED;
int vm_shared = dst_vma->vm_flags & VM_SHARED;
@@ -186,7 +290,6 @@
unsigned long src_addr, dst_addr;
long copied;
struct page *page;
- struct hstate *h;
unsigned long vma_hpagesize;
pgoff_t idx;
u32 hash;
@@ -198,8 +301,8 @@
* by THP. Since we can not reliably insert a zero page, this
* feature is not supported.
*/
- if (zeropage) {
- up_read(&dst_mm->mmap_sem);
+ if (mode == MCOPY_ATOMIC_ZEROPAGE) {
+ mmap_read_unlock(dst_mm);
return -EINVAL;
}
@@ -218,24 +321,13 @@
retry:
/*
- * On routine entry dst_vma is set. If we had to drop mmap_sem and
+ * On routine entry dst_vma is set. If we had to drop mmap_lock and
* retry, dst_vma will be set to NULL and we must lookup again.
*/
if (!dst_vma) {
err = -ENOENT;
- dst_vma = find_vma(dst_mm, dst_start);
+ dst_vma = find_dst_vma(dst_mm, dst_start, len);
if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
- goto out_unlock;
- /*
- * Check the vma is registered in uffd, this is
- * required to enforce the VM_MAYWRITE check done at
- * uffd registration time.
- */
- if (!dst_vma->vm_userfaultfd_ctx.ctx)
- goto out_unlock;
-
- if (dst_start < dst_vma->vm_start ||
- dst_start + len > dst_vma->vm_end)
goto out_unlock;
err = -EINVAL;
@@ -244,10 +336,6 @@
vm_shared = dst_vma->vm_flags & VM_SHARED;
}
-
- if (WARN_ON(dst_addr & (vma_hpagesize - 1) ||
- (len - copied) & (vma_hpagesize - 1)))
- goto out_unlock;
/*
* If not shared, ensure the dst_vma has a anon_vma.
@@ -258,56 +346,59 @@
goto out_unlock;
}
- h = hstate_vma(dst_vma);
-
while (src_addr < src_start + len) {
- pte_t dst_pteval;
-
BUG_ON(dst_addr >= dst_start + len);
- VM_BUG_ON(dst_addr & ~huge_page_mask(h));
/*
- * Serialize via hugetlb_fault_mutex
+ * Serialize via i_mmap_rwsem and hugetlb_fault_mutex.
+ * i_mmap_rwsem ensures the dst_pte remains valid even
+ * in the case of shared pmds. fault mutex prevents
+ * races with other faulting threads.
*/
- idx = linear_page_index(dst_vma, dst_addr);
mapping = dst_vma->vm_file->f_mapping;
- hash = hugetlb_fault_mutex_hash(h, mapping, idx);
+ i_mmap_lock_read(mapping);
+ idx = linear_page_index(dst_vma, dst_addr);
+ hash = hugetlb_fault_mutex_hash(mapping, idx);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
err = -ENOMEM;
- dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h));
+ dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize);
if (!dst_pte) {
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ i_mmap_unlock_read(mapping);
goto out_unlock;
}
- err = -EEXIST;
- dst_pteval = huge_ptep_get(dst_pte);
- if (!huge_pte_none(dst_pteval)) {
+ if (mode != MCOPY_ATOMIC_CONTINUE &&
+ !huge_pte_none(huge_ptep_get(dst_pte))) {
+ err = -EEXIST;
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ i_mmap_unlock_read(mapping);
goto out_unlock;
}
err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma,
- dst_addr, src_addr, &page);
+ dst_addr, src_addr, mode, &page);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ i_mmap_unlock_read(mapping);
vm_alloc_shared = vm_shared;
cond_resched();
if (unlikely(err == -ENOENT)) {
- up_read(&dst_mm->mmap_sem);
+ mmap_read_unlock(dst_mm);
BUG_ON(!page);
err = copy_huge_page_from_user(page,
(const void __user *)src_addr,
- pages_per_huge_page(h), true);
+ vma_hpagesize / PAGE_SIZE,
+ true);
if (unlikely(err)) {
err = -EFAULT;
goto out;
}
- down_read(&dst_mm->mmap_sem);
+ mmap_read_lock(dst_mm);
dst_vma = NULL;
goto retry;
@@ -327,7 +418,7 @@
}
out_unlock:
- up_read(&dst_mm->mmap_sem);
+ mmap_read_unlock(dst_mm);
out:
if (page) {
/*
@@ -338,7 +429,7 @@
* private and shared mappings. See the routine
* restore_reserve_on_error for details. Unfortunately, we
* can not call restore_reserve_on_error now as it would
- * require holding mmap_sem.
+ * require holding mmap_lock.
*
* If a reservation for the page existed in the reservation
* map of a private mapping, the map was modified to indicate
@@ -389,7 +480,7 @@
unsigned long dst_start,
unsigned long src_start,
unsigned long len,
- bool zeropage);
+ enum mcopy_atomic_mode mode);
#endif /* CONFIG_HUGETLB_PAGE */
static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
@@ -398,9 +489,15 @@
unsigned long dst_addr,
unsigned long src_addr,
struct page **page,
- bool zeropage)
+ enum mcopy_atomic_mode mode,
+ bool wp_copy)
{
ssize_t err;
+
+ if (mode == MCOPY_ATOMIC_CONTINUE) {
+ return mcontinue_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
+ wp_copy);
+ }
/*
* The normal page fault path for a shmem will invoke the
@@ -413,20 +510,19 @@
* and not in the radix tree.
*/
if (!(dst_vma->vm_flags & VM_SHARED)) {
- if (!zeropage)
+ if (mode == MCOPY_ATOMIC_NORMAL)
err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
- dst_addr, src_addr, page);
+ dst_addr, src_addr, page,
+ wp_copy);
else
err = mfill_zeropage_pte(dst_mm, dst_pmd,
dst_vma, dst_addr);
} else {
- if (!zeropage)
- err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd,
- dst_vma, dst_addr,
- src_addr, page);
- else
- err = shmem_mfill_zeropage_pte(dst_mm, dst_pmd,
- dst_vma, dst_addr);
+ VM_WARN_ON_ONCE(wp_copy);
+ err = shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
+ dst_addr, src_addr,
+ mode != MCOPY_ATOMIC_NORMAL,
+ page);
}
return err;
@@ -436,8 +532,9 @@
unsigned long dst_start,
unsigned long src_start,
unsigned long len,
- bool zeropage,
- bool *mmap_changing)
+ enum mcopy_atomic_mode mcopy_mode,
+ bool *mmap_changing,
+ __u64 mode)
{
struct vm_area_struct *dst_vma;
ssize_t err;
@@ -445,6 +542,7 @@
unsigned long src_addr, dst_addr;
long copied;
struct page *page;
+ bool wp_copy;
/*
* Sanitize the command parameters:
@@ -461,7 +559,7 @@
copied = 0;
page = NULL;
retry:
- down_read(&dst_mm->mmap_sem);
+ mmap_read_lock(dst_mm);
/*
* If memory mappings are changing because of non-cooperative
@@ -477,19 +575,8 @@
* both valid and fully within a single existing vma.
*/
err = -ENOENT;
- dst_vma = find_vma(dst_mm, dst_start);
+ dst_vma = find_dst_vma(dst_mm, dst_start, len);
if (!dst_vma)
- goto out_unlock;
- /*
- * Check the vma is registered in uffd, this is required to
- * enforce the VM_MAYWRITE check done at uffd registration
- * time.
- */
- if (!dst_vma->vm_userfaultfd_ctx.ctx)
- goto out_unlock;
-
- if (dst_start < dst_vma->vm_start ||
- dst_start + len > dst_vma->vm_end)
goto out_unlock;
err = -EINVAL;
@@ -502,13 +589,23 @@
goto out_unlock;
/*
+ * validate 'mode' now that we know the dst_vma: don't allow
+ * a wrprotect copy if the userfaultfd didn't register as WP.
+ */
+ wp_copy = mode & UFFDIO_COPY_MODE_WP;
+ if (wp_copy && !(dst_vma->vm_flags & VM_UFFD_WP))
+ goto out_unlock;
+
+ /*
* If this is a HUGETLB vma, pass off to appropriate routine
*/
if (is_vm_hugetlb_page(dst_vma))
return __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start,
- src_start, len, zeropage);
+ src_start, len, mcopy_mode);
if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
+ goto out_unlock;
+ if (!vma_is_shmem(dst_vma) && mcopy_mode == MCOPY_ATOMIC_CONTINUE)
goto out_unlock;
/*
@@ -542,7 +639,7 @@
break;
}
if (unlikely(pmd_none(dst_pmdval)) &&
- unlikely(__pte_alloc(dst_mm, dst_pmd, dst_addr))) {
+ unlikely(__pte_alloc(dst_mm, dst_pmd))) {
err = -ENOMEM;
break;
}
@@ -556,13 +653,13 @@
BUG_ON(pmd_trans_huge(*dst_pmd));
err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
- src_addr, &page, zeropage);
+ src_addr, &page, mcopy_mode, wp_copy);
cond_resched();
if (unlikely(err == -ENOENT)) {
void *page_kaddr;
- up_read(&dst_mm->mmap_sem);
+ mmap_read_unlock(dst_mm);
BUG_ON(!page);
page_kaddr = kmap(page);
@@ -574,6 +671,7 @@
err = -EFAULT;
goto out;
}
+ flush_dcache_page(page);
goto retry;
} else
BUG_ON(page);
@@ -591,7 +689,7 @@
}
out_unlock:
- up_read(&dst_mm->mmap_sem);
+ mmap_read_unlock(dst_mm);
out:
if (page)
put_page(page);
@@ -603,14 +701,76 @@
ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
unsigned long src_start, unsigned long len,
- bool *mmap_changing)
+ bool *mmap_changing, __u64 mode)
{
- return __mcopy_atomic(dst_mm, dst_start, src_start, len, false,
- mmap_changing);
+ return __mcopy_atomic(dst_mm, dst_start, src_start, len,
+ MCOPY_ATOMIC_NORMAL, mmap_changing, mode);
}
ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
unsigned long len, bool *mmap_changing)
{
- return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing);
+ return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_ZEROPAGE,
+ mmap_changing, 0);
+}
+
+ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start,
+ unsigned long len, bool *mmap_changing)
+{
+ return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_CONTINUE,
+ mmap_changing, 0);
+}
+
+int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
+ unsigned long len, bool enable_wp, bool *mmap_changing)
+{
+ struct vm_area_struct *dst_vma;
+ pgprot_t newprot;
+ int err;
+
+ /*
+ * Sanitize the command parameters:
+ */
+ BUG_ON(start & ~PAGE_MASK);
+ BUG_ON(len & ~PAGE_MASK);
+
+ /* Does the address range wrap, or is the span zero-sized? */
+ BUG_ON(start + len <= start);
+
+ mmap_read_lock(dst_mm);
+
+ /*
+ * If memory mappings are changing because of non-cooperative
+ * operation (e.g. mremap) running in parallel, bail out and
+ * request the user to retry later
+ */
+ err = -EAGAIN;
+ if (mmap_changing && READ_ONCE(*mmap_changing))
+ goto out_unlock;
+
+ err = -ENOENT;
+ dst_vma = find_dst_vma(dst_mm, start, len);
+ /*
+ * Make sure the vma is not shared, that the dst range is
+ * both valid and fully within a single existing vma.
+ */
+ if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
+ goto out_unlock;
+ if (!userfaultfd_wp(dst_vma))
+ goto out_unlock;
+ if (!vma_is_anonymous(dst_vma))
+ goto out_unlock;
+
+ if (enable_wp)
+ newprot = vm_get_page_prot(dst_vma->vm_flags & ~(VM_WRITE));
+ else
+ newprot = vm_get_page_prot(dst_vma->vm_flags);
+
+ change_protection(dst_vma, start, start + len, newprot,
+ enable_wp ? MM_CP_UFFD_WP : MM_CP_UFFD_WP_RESOLVE);
+
+ err = 0;
+out_unlock:
+ mmap_read_unlock(dst_mm);
+ return err;
}
--
Gitblit v1.6.2