From 102a0743326a03cd1a1202ceda21e175b7d3575c Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Tue, 20 Feb 2024 01:20:52 +0000
Subject: [PATCH] add new system file

---
 kernel/mm/mprotect.c | 172 +++++++++++++++++++++++++++++++++-----------------
 1 files changed, 101 insertions(+), 71 deletions(-)

diff --git a/kernel/mm/mprotect.c b/kernel/mm/mprotect.c
index 5c175d4..c1c3315 100644
--- a/kernel/mm/mprotect.c
+++ b/kernel/mm/mprotect.c
@@ -9,7 +9,7 @@
  * (C) Copyright 2002 Red Hat Inc, All Rights Reserved
  */
 
-#include <linux/mm.h>
+#include <linux/pagewalk.h>
 #include <linux/hugetlb.h>
 #include <linux/shm.h>
 #include <linux/mman.h>
@@ -28,7 +28,7 @@
 #include <linux/ksm.h>
 #include <linux/uaccess.h>
 #include <linux/mm_inline.h>
-#include <asm/pgtable.h>
+#include <linux/pgtable.h>
 #include <asm/cacheflush.h>
 #include <asm/mmu_context.h>
 #include <asm/tlbflush.h>
@@ -37,16 +37,19 @@
 
 static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		unsigned long addr, unsigned long end, pgprot_t newprot,
-		int dirty_accountable, int prot_numa)
+		unsigned long cp_flags)
 {
-	struct mm_struct *mm = vma->vm_mm;
 	pte_t *pte, oldpte;
 	spinlock_t *ptl;
 	unsigned long pages = 0;
 	int target_node = NUMA_NO_NODE;
+	bool dirty_accountable = cp_flags & MM_CP_DIRTY_ACCT;
+	bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
+	bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
+	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
 
 	/*
-	 * Can be called with only the mmap_sem for reading by
+	 * Can be called with only the mmap_lock for reading by
 	 * prot_numa so we must check the pmd isn't constantly
 	 * changing from under us from pmd_none to pmd_trans_huge
 	 * and/or the other way around.
@@ -56,7 +59,7 @@
 
 	/*
 	 * The pmd points to a regular pte so the pmd can't change
-	 * from under us even if the mmap_sem is only hold for
+	 * from under us even if the mmap_lock is only hold for
 	 * reading.
 	 */
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
@@ -81,13 +84,17 @@
 			if (prot_numa) {
 				struct page *page;
 
+				/* Avoid TLB flush if possible */
+				if (pte_protnone(oldpte))
+					continue;
+
 				page = vm_normal_page(vma, addr, oldpte);
 				if (!page || PageKsm(page))
 					continue;
 
 				/* Also skip shared copy-on-write pages */
 				if (is_cow_mapping(vma->vm_flags) &&
-				    page_mapcount(page) != 1)
+				    page_count(page) != 1)
 					continue;
 
 				/*
@@ -95,11 +102,7 @@
 				 * it cannot move them all from MIGRATE_ASYNC
 				 * context.
 				 */
-				if (page_is_file_cache(page) && PageDirty(page))
-					continue;
-
-				/* Avoid TLB flush if possible */
-				if (pte_protnone(oldpte))
+				if (page_is_file_lru(page) && PageDirty(page))
 					continue;
 
 				/*
@@ -110,10 +113,23 @@
 					continue;
 			}
 
-			ptent = ptep_modify_prot_start(mm, addr, pte);
-			ptent = pte_modify(ptent, newprot);
+			oldpte = ptep_modify_prot_start(vma, addr, pte);
+			ptent = pte_modify(oldpte, newprot);
 			if (preserve_write)
 				ptent = pte_mk_savedwrite(ptent);
+
+			if (uffd_wp) {
+				ptent = pte_wrprotect(ptent);
+				ptent = pte_mkuffd_wp(ptent);
+			} else if (uffd_wp_resolve) {
+				/*
+				 * Leave the write bit to be handled
+				 * by PF interrupt handler, then
+				 * things like COW could be properly
+				 * handled.
+				 */
+				ptent = pte_clear_uffd_wp(ptent);
+			}
 
 			/* Avoid taking write faults for known dirty pages */
 			if (dirty_accountable && pte_dirty(ptent) &&
@@ -121,13 +137,13 @@
 					 !(vma->vm_flags & VM_SOFTDIRTY))) {
 				ptent = pte_mkwrite(ptent);
 			}
-			ptep_modify_prot_commit(mm, addr, pte, ptent);
+			ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent);
 			pages++;
-		} else if (IS_ENABLED(CONFIG_MIGRATION)) {
+		} else if (is_swap_pte(oldpte)) {
 			swp_entry_t entry = pte_to_swp_entry(oldpte);
+			pte_t newpte;
 
 			if (is_write_migration_entry(entry)) {
-				pte_t newpte;
 				/*
 				 * A protection check is difficult so
 				 * just be safe and disable write
@@ -136,22 +152,28 @@
 				newpte = swp_entry_to_pte(entry);
 				if (pte_swp_soft_dirty(oldpte))
 					newpte = pte_swp_mksoft_dirty(newpte);
-				set_pte_at(mm, addr, pte, newpte);
-
-				pages++;
-			}
-
-			if (is_write_device_private_entry(entry)) {
-				pte_t newpte;
-
+				if (pte_swp_uffd_wp(oldpte))
+					newpte = pte_swp_mkuffd_wp(newpte);
+			} else if (is_write_device_private_entry(entry)) {
 				/*
 				 * We do not preserve soft-dirtiness. See
 				 * copy_one_pte() for explanation.
 				 */
 				make_device_private_entry_read(&entry);
 				newpte = swp_entry_to_pte(entry);
-				set_pte_at(mm, addr, pte, newpte);
-
+				if (pte_swp_uffd_wp(oldpte))
+					newpte = pte_swp_mkuffd_wp(newpte);
+			} else {
+				newpte = oldpte;
+			}
+
+			if (uffd_wp)
+				newpte = pte_swp_mkuffd_wp(newpte);
+			else if (uffd_wp_resolve)
+				newpte = pte_swp_clear_uffd_wp(newpte);
+
+			if (!pte_same(oldpte, newpte)) {
+				set_pte_at(vma->vm_mm, addr, pte, newpte);
 				pages++;
 			}
 		}
@@ -189,14 +211,15 @@
 
 static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
 		pud_t *pud, unsigned long addr, unsigned long end,
-		pgprot_t newprot, int dirty_accountable, int prot_numa)
+		pgprot_t newprot, unsigned long cp_flags)
 {
 	pmd_t *pmd;
-	struct mm_struct *mm = vma->vm_mm;
 	unsigned long next;
 	unsigned long pages = 0;
 	unsigned long nr_huge_updates = 0;
-	unsigned long mni_start = 0;
+	struct mmu_notifier_range range;
+
+	range.start = 0;
 
 	pmd = pmd_offset(pud, addr);
 	do {
@@ -205,7 +228,7 @@
 		next = pmd_addr_end(addr, end);
 
 		/*
-		 * Automatic NUMA balancing walks the tables with mmap_sem
+		 * Automatic NUMA balancing walks the tables with mmap_lock
 		 * held for read. It's possible a parallel update to occur
 		 * between pmd_trans_huge() and a pmd_none_or_clear_bad()
 		 * check leading to a false positive and clearing.
@@ -217,9 +240,11 @@
 			goto next;
 
 		/* invoke the mmu notifier if the pmd is populated */
-		if (!mni_start) {
-			mni_start = addr;
-			mmu_notifier_invalidate_range_start(mm, mni_start, end);
+		if (!range.start) {
+			mmu_notifier_range_init(&range,
+				MMU_NOTIFY_PROTECTION_VMA, 0,
+				vma, vma->vm_mm, addr, end);
+			mmu_notifier_invalidate_range_start(&range);
 		}
 
 		if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
@@ -227,7 +252,7 @@
 				__split_huge_pmd(vma, pmd, addr, false, NULL);
 			} else {
 				int nr_ptes = change_huge_pmd(vma, pmd, addr,
-						newprot, prot_numa);
+						newprot, cp_flags);
 
 				if (nr_ptes) {
 					if (nr_ptes == HPAGE_PMD_NR) {
@@ -242,14 +267,14 @@
 			/* fall through, the trans huge pmd just split */
 		}
 		this_pages = change_pte_range(vma, pmd, addr, next, newprot,
-				dirty_accountable, prot_numa);
+				cp_flags);
 		pages += this_pages;
 next:
 		cond_resched();
 	} while (pmd++, addr = next, addr != end);
 
-	if (mni_start)
-		mmu_notifier_invalidate_range_end(mm, mni_start, end);
+	if (range.start)
+		mmu_notifier_invalidate_range_end(&range);
 
 	if (nr_huge_updates)
 		count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates);
@@ -258,7 +283,7 @@
 
 static inline unsigned long change_pud_range(struct vm_area_struct *vma,
 		p4d_t *p4d, unsigned long addr, unsigned long end,
-		pgprot_t newprot, int dirty_accountable, int prot_numa)
+		pgprot_t newprot, unsigned long cp_flags)
 {
 	pud_t *pud;
 	unsigned long next;
@@ -270,7 +295,7 @@
 		if (pud_none_or_clear_bad(pud))
 			continue;
 		pages += change_pmd_range(vma, pud, addr, next, newprot,
-				dirty_accountable, prot_numa);
+				cp_flags);
 	} while (pud++, addr = next, addr != end);
 
 	return pages;
@@ -278,7 +303,7 @@
 
 static inline unsigned long change_p4d_range(struct vm_area_struct *vma,
 		pgd_t *pgd, unsigned long addr, unsigned long end,
-		pgprot_t newprot, int dirty_accountable, int prot_numa)
+		pgprot_t newprot, unsigned long cp_flags)
 {
 	p4d_t *p4d;
 	unsigned long next;
@@ -290,7 +315,7 @@
 		if (p4d_none_or_clear_bad(p4d))
 			continue;
 		pages += change_pud_range(vma, p4d, addr, next, newprot,
-				dirty_accountable, prot_numa);
+				cp_flags);
 	} while (p4d++, addr = next, addr != end);
 
 	return pages;
@@ -298,7 +323,7 @@
 
 static unsigned long change_protection_range(struct vm_area_struct *vma,
 		unsigned long addr, unsigned long end, pgprot_t newprot,
-		int dirty_accountable, int prot_numa)
+		unsigned long cp_flags)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	pgd_t *pgd;
@@ -315,7 +340,7 @@
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
 		pages += change_p4d_range(vma, pgd, addr, next, newprot,
-				dirty_accountable, prot_numa);
+				cp_flags);
 	} while (pgd++, addr = next, addr != end);
 
 	/* Only flush the TLB if we actually modified any entries: */
@@ -328,14 +353,17 @@
 
 unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
 		unsigned long end, pgprot_t newprot,
-		int dirty_accountable, int prot_numa)
+		unsigned long cp_flags)
 {
 	unsigned long pages;
+
+	BUG_ON((cp_flags & MM_CP_UFFD_WP_ALL) == MM_CP_UFFD_WP_ALL);
 
 	if (is_vm_hugetlb_page(vma))
 		pages = hugetlb_change_protection(vma, start, end, newprot);
 	else
-		pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa);
+		pages = change_protection_range(vma, start, end, newprot,
+						cp_flags);
 
 	return pages;
 }
@@ -361,20 +389,11 @@
 	return 0;
 }
 
-static int prot_none_walk(struct vm_area_struct *vma, unsigned long start,
-			   unsigned long end, unsigned long newflags)
-{
-	pgprot_t new_pgprot = vm_get_page_prot(newflags);
-	struct mm_walk prot_none_walk = {
-		.pte_entry = prot_none_pte_entry,
-		.hugetlb_entry = prot_none_hugetlb_entry,
-		.test_walk = prot_none_test,
-		.mm = current->mm,
-		.private = &new_pgprot,
-	};
-
-	return walk_page_range(start, end, &prot_none_walk);
-}
+static const struct mm_walk_ops prot_none_walk_ops = {
+	.pte_entry = prot_none_pte_entry,
+	.hugetlb_entry = prot_none_hugetlb_entry,
+	.test_walk = prot_none_test,
+};
 
 int
 mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
@@ -400,8 +419,11 @@
 	 */
 	if (arch_has_pfn_modify_check() &&
 	    (vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
-	    (newflags & (VM_READ|VM_WRITE|VM_EXEC)) == 0) {
-		error = prot_none_walk(vma, start, end, newflags);
+	    (newflags & VM_ACCESS_FLAGS) == 0) {
+		pgprot_t new_pgprot = vm_get_page_prot(newflags);
+
+		error = walk_page_range(current->mm, start, end,
+				&prot_none_walk_ops, &new_pgprot);
 		if (error)
 			return error;
 	}
@@ -455,15 +477,17 @@
 
 success:
 	/*
-	 * vm_flags and vm_page_prot are protected by the mmap_sem
+	 * vm_flags and vm_page_prot are protected by the mmap_lock
 	 * held in write mode.
 	 */
-	vma->vm_flags = newflags;
+	vm_write_begin(vma);
+	WRITE_ONCE(vma->vm_flags, newflags);
 	dirty_accountable = vma_wants_writenotify(vma, vma->vm_page_prot);
 	vma_set_page_prot(vma);
 
 	change_protection(vma, start, end, vma->vm_page_prot,
-			  dirty_accountable, 0);
+			  dirty_accountable ? MM_CP_DIRTY_ACCT : 0);
+	vm_write_end(vma);
 
 	/*
 	 * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major
@@ -516,7 +540,7 @@
 
 	reqprot = prot;
 
-	if (down_write_killable(&current->mm->mmap_sem))
+	if (mmap_write_lock_killable(current->mm))
 		return -EINTR;
 
 	/*
@@ -576,8 +600,14 @@
 		newflags |= (vma->vm_flags & ~mask_off_old_flags);
 
 		/* newflags >> 4 shift VM_MAY% in place of VM_% */
-		if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) {
+		if ((newflags & ~(newflags >> 4)) & VM_ACCESS_FLAGS) {
 			error = -EACCES;
+			goto out;
+		}
+
+		/* Allow architectures to sanity-check the new flags */
+		if (!arch_validate_flags(newflags)) {
+			error = -EINVAL;
 			goto out;
 		}
 
@@ -606,7 +636,7 @@
 		prot = reqprot;
 	}
 out:
-	up_write(&current->mm->mmap_sem);
+	mmap_write_unlock(current->mm);
 	return error;
 }
 
@@ -636,7 +666,7 @@
 	if (init_val & ~PKEY_ACCESS_MASK)
 		return -EINVAL;
 
-	down_write(&current->mm->mmap_sem);
+	mmap_write_lock(current->mm);
 	pkey = mm_pkey_alloc(current->mm);
 
 	ret = -ENOSPC;
@@ -650,7 +680,7 @@
 	}
 	ret = pkey;
 out:
-	up_write(&current->mm->mmap_sem);
+	mmap_write_unlock(current->mm);
 	return ret;
 }
 
@@ -658,9 +688,9 @@
 {
 	int ret;
 
-	down_write(&current->mm->mmap_sem);
+	mmap_write_lock(current->mm);
 	ret = mm_pkey_free(current->mm, pkey);
-	up_write(&current->mm->mmap_sem);
+	mmap_write_unlock(current->mm);
 
 	/*
 	 * We could provie warnings or errors if any VMA still
-- 
Gitblit v1.6.2
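
Not part of the patch itself: below is a minimal user-space sketch for reviewers who want to exercise the syscall paths this file implements (mprotect() plus the pkey_alloc()/pkey_mprotect()/pkey_free() path behind do_mprotect_pkey()). It assumes glibc 2.27 or newer for the pkey_* wrappers; on hardware without memory protection keys pkey_alloc() fails (typically ENOSPC or EINVAL) and the sketch falls back to a plain mprotect() call.

/* pkey_demo.c - hypothetical test helper, not part of the kernel tree */
#define _GNU_SOURCE
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);

	/* Map one anonymous page read-write and dirty it. */
	char *buf = mmap(NULL, page, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	buf[0] = 42;

	/* Try the pkey path first; fall back to plain mprotect(). */
	int pkey = pkey_alloc(0, 0);
	if (pkey >= 0) {
		/* Tag the page with the key and drop write permission. */
		if (pkey_mprotect(buf, page, PROT_READ, pkey))
			perror("pkey_mprotect");
		printf("byte after pkey_mprotect: %d\n", buf[0]);
		pkey_free(pkey);
	} else {
		fprintf(stderr, "pkey_alloc: %s, using mprotect instead\n",
			strerror(errno));
		if (mprotect(buf, page, PROT_READ))
			perror("mprotect");
		printf("byte after mprotect: %d\n", buf[0]);
	}

	munmap(buf, page);
	return 0;
}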