From 102a0743326a03cd1a1202ceda21e175b7d3575c Mon Sep 17 00:00:00 2001 From: hc <hc@nodka.com> Date: Tue, 20 Feb 2024 01:20:52 +0000 Subject: [PATCH] add new system file --- kernel/include/linux/mm.h | 1455 ++++++++++++++++++++++++++++++++++++++------------------- 1 files changed, 969 insertions(+), 486 deletions(-) diff --git a/kernel/include/linux/mm.h b/kernel/include/linux/mm.h index f79d114..dfefcfa 100644 --- a/kernel/include/linux/mm.h +++ b/kernel/include/linux/mm.h @@ -15,6 +15,7 @@ #include <linux/atomic.h> #include <linux/debug_locks.h> #include <linux/mm_types.h> +#include <linux/mmap_lock.h> #include <linux/range.h> #include <linux/pfn.h> #include <linux/percpu-refcount.h> @@ -23,9 +24,15 @@ #include <linux/resource.h> #include <linux/page_ext.h> #include <linux/err.h> +#include <linux/page-flags.h> #include <linux/page_ref.h> #include <linux/memremap.h> #include <linux/overflow.h> +#include <linux/sizes.h> +#include <linux/sched.h> +#include <linux/pgtable.h> +#include <linux/kasan.h> +#include <linux/page_pinner.h> #include <linux/android_kabi.h> struct mempolicy; @@ -35,6 +42,9 @@ struct user_struct; struct writeback_control; struct bdi_writeback; +struct pt_regs; + +extern int sysctl_page_lock_unfairness; void init_mm_internals(void); @@ -49,7 +59,27 @@ static inline void set_max_mapnr(unsigned long limit) { } #endif -extern unsigned long totalram_pages; +extern atomic_long_t _totalram_pages; +static inline unsigned long totalram_pages(void) +{ + return (unsigned long)atomic_long_read(&_totalram_pages); +} + +static inline void totalram_pages_inc(void) +{ + atomic_long_inc(&_totalram_pages); +} + +static inline void totalram_pages_dec(void) +{ + atomic_long_dec(&_totalram_pages); +} + +static inline void totalram_pages_add(long count) +{ + atomic_long_add(count, &_totalram_pages); +} + extern void * high_memory; extern int page_cluster; @@ -71,7 +101,6 @@ #endif #include <asm/page.h> -#include <asm/pgtable.h> #include <asm/processor.h> /* @@ -87,6 +116,14 @@ #ifndef __pa_symbol #define __pa_symbol(x) __pa(RELOC_HIDE((unsigned long)(x), 0)) +#endif + +#ifndef __va_function +#define __va_function(x) (x) +#endif + +#ifndef __pa_function +#define __pa_function(x) __pa_symbol(x) #endif #ifndef page_to_virt @@ -110,10 +147,48 @@ /* * On some architectures it is expensive to call memset() for small sizes. - * Those architectures should provide their own implementation of "struct page" - * zeroing by defining this macro in <asm/pgtable.h>. + * If an architecture decides to implement their own version of + * mm_zero_struct_page they should wrap the defines below in a #ifndef and + * define their own version of this macro in <asm/pgtable.h> */ -#ifndef mm_zero_struct_page +#if BITS_PER_LONG == 64 +/* This function must be updated when the size of struct page grows above 80 + * or reduces below 56. The idea that compiler optimizes out switch() + * statement, and only leaves move/store instructions. Also the compiler can + * combine write statments if they are both assignments and can be reordered, + * this can result in several of the writes here being dropped. 
+ */ +#define mm_zero_struct_page(pp) __mm_zero_struct_page(pp) +static inline void __mm_zero_struct_page(struct page *page) +{ + unsigned long *_pp = (void *)page; + + /* Check that struct page is either 56, 64, 72, or 80 bytes */ + BUILD_BUG_ON(sizeof(struct page) & 7); + BUILD_BUG_ON(sizeof(struct page) < 56); + BUILD_BUG_ON(sizeof(struct page) > 80); + + switch (sizeof(struct page)) { + case 80: + _pp[9] = 0; + fallthrough; + case 72: + _pp[8] = 0; + fallthrough; + case 64: + _pp[7] = 0; + fallthrough; + case 56: + _pp[6] = 0; + _pp[5] = 0; + _pp[4] = 0; + _pp[3] = 0; + _pp[2] = 0; + _pp[1] = 0; + _pp[0] = 0; + } +} +#else #define mm_zero_struct_page(pp) ((void)memset((pp), 0, sizeof(struct page))) #endif @@ -145,10 +220,12 @@ extern int sysctl_overcommit_ratio; extern unsigned long sysctl_overcommit_kbytes; -extern int overcommit_ratio_handler(struct ctl_table *, int, void __user *, - size_t *, loff_t *); -extern int overcommit_kbytes_handler(struct ctl_table *, int, void __user *, - size_t *, loff_t *); +int overcommit_ratio_handler(struct ctl_table *, int, void *, size_t *, + loff_t *); +int overcommit_kbytes_handler(struct ctl_table *, int, void *, size_t *, + loff_t *); +int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *, + loff_t *); #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) @@ -157,6 +234,8 @@ /* test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE */ #define PAGE_ALIGNED(addr) IS_ALIGNED((unsigned long)(addr), PAGE_SIZE) + +#define lru_to_page(head) (list_entry((head)->prev, struct page, lru)) /* * Linux kernel virtual memory manager primitives. @@ -267,23 +346,48 @@ #elif defined(CONFIG_SPARC64) # define VM_SPARC_ADI VM_ARCH_1 /* Uses ADI tag for access control */ # define VM_ARCH_CLEAR VM_SPARC_ADI +#elif defined(CONFIG_ARM64) +# define VM_ARM64_BTI VM_ARCH_1 /* BTI guarded page, a.k.a. GP bit */ +# define VM_ARCH_CLEAR VM_ARM64_BTI #elif !defined(CONFIG_MMU) # define VM_MAPPED_COPY VM_ARCH_1 /* T if mapped copy of data (nommu mmap) */ #endif -#if defined(CONFIG_X86_INTEL_MPX) -/* MPX specific bounds table or bounds directory */ -# define VM_MPX VM_HIGH_ARCH_4 +#if defined(CONFIG_ARM64_MTE) +# define VM_MTE VM_HIGH_ARCH_0 /* Use Tagged memory for access control */ +# define VM_MTE_ALLOWED VM_HIGH_ARCH_1 /* Tagged memory permitted */ #else -# define VM_MPX VM_NONE +# define VM_MTE VM_NONE +# define VM_MTE_ALLOWED VM_NONE #endif #ifndef VM_GROWSUP # define VM_GROWSUP VM_NONE #endif +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR +# define VM_UFFD_MINOR_BIT 37 +# define VM_UFFD_MINOR BIT(VM_UFFD_MINOR_BIT) /* UFFD minor faults */ +#else /* !CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ +# define VM_UFFD_MINOR VM_NONE +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ + /* Bits set in the VMA until the stack is in its final location */ #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ) + +#define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? 
VM_EXEC : 0) + +/* Common data flag combinations */ +#define VM_DATA_FLAGS_TSK_EXEC (VM_READ | VM_WRITE | TASK_EXEC | \ + VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#define VM_DATA_FLAGS_NON_EXEC (VM_READ | VM_WRITE | VM_MAYREAD | \ + VM_MAYWRITE | VM_MAYEXEC) +#define VM_DATA_FLAGS_EXEC (VM_READ | VM_WRITE | VM_EXEC | \ + VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) + +#ifndef VM_DATA_DEFAULT_FLAGS /* arch can override this */ +#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_EXEC +#endif #ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS @@ -297,11 +401,17 @@ #define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) +/* VMA basic access permission flags */ +#define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) + + /* * Special vmas that are non-mergable, non-mlock()able. - * Note: mm/huge_memory.c VM_NO_THP depends on this definition. */ #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP) + +/* This mask prevents VMA from being scanned with khugepaged */ +#define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB) /* This mask defines which mm->def_flags a process can inherit its parent */ #define VM_INIT_DEF_MASK VM_NOHUGEPAGE @@ -321,15 +431,77 @@ */ extern pgprot_t protection_map[16]; -#define FAULT_FLAG_WRITE 0x01 /* Fault was a write access */ -#define FAULT_FLAG_MKWRITE 0x02 /* Fault was mkwrite of existing pte */ -#define FAULT_FLAG_ALLOW_RETRY 0x04 /* Retry fault if blocking */ -#define FAULT_FLAG_RETRY_NOWAIT 0x08 /* Don't drop mmap_sem and wait when retrying */ -#define FAULT_FLAG_KILLABLE 0x10 /* The fault task is in SIGKILL killable region */ -#define FAULT_FLAG_TRIED 0x20 /* Second try */ -#define FAULT_FLAG_USER 0x40 /* The fault originated in userspace */ -#define FAULT_FLAG_REMOTE 0x80 /* faulting for non current tsk/mm */ -#define FAULT_FLAG_INSTRUCTION 0x100 /* The fault was during an instruction fetch */ +/** + * Fault flag definitions. + * + * @FAULT_FLAG_WRITE: Fault was a write fault. + * @FAULT_FLAG_MKWRITE: Fault was mkwrite of existing PTE. + * @FAULT_FLAG_ALLOW_RETRY: Allow to retry the fault if blocked. + * @FAULT_FLAG_RETRY_NOWAIT: Don't drop mmap_lock and wait when retrying. + * @FAULT_FLAG_KILLABLE: The fault task is in SIGKILL killable region. + * @FAULT_FLAG_TRIED: The fault has been tried once. + * @FAULT_FLAG_USER: The fault originated in userspace. + * @FAULT_FLAG_REMOTE: The fault is not for current task/mm. + * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch. + * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals. + * + * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify + * whether we would allow page faults to retry by specifying these two + * fault flags correctly. Currently there can be three legal combinations: + * + * (a) ALLOW_RETRY and !TRIED: this means the page fault allows retry, and + * this is the first try + * + * (b) ALLOW_RETRY and TRIED: this means the page fault allows retry, and + * we've already tried at least once + * + * (c) !ALLOW_RETRY and !TRIED: this means the page fault does not allow retry + * + * The unlisted combination (!ALLOW_RETRY && TRIED) is illegal and should never + * be used. Note that page faults can be allowed to retry for multiple times, + * in which case we'll have an initial fault with flags (a) then later on + * continuous faults with flags (b). 
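As a rough, hedged sketch of how an architecture fault handler is expected to drive these flags (user faults only, stack expansion and error reporting trimmed; demo_arch_user_fault is an illustrative name, the helpers called are the ones declared in this header and in <linux/mmap_lock.h>):

#include <linux/mm.h>
#include <linux/mmap_lock.h>
#include <linux/sched/signal.h>

static vm_fault_t demo_arch_user_fault(struct pt_regs *regs,
				       unsigned long addr, bool write)
{
	struct mm_struct *mm = current->mm;
	unsigned int flags = FAULT_FLAG_DEFAULT | FAULT_FLAG_USER;
	struct vm_area_struct *vma;
	vm_fault_t fault;

	if (write)
		flags |= FAULT_FLAG_WRITE;

	mmap_read_lock(mm);
retry:
	vma = find_vma(mm, addr);
	if (!vma || vma->vm_start > addr) {
		mmap_read_unlock(mm);
		return VM_FAULT_SIGSEGV;
	}

	fault = handle_mm_fault(vma, addr, flags, regs);

	if (fault & VM_FAULT_RETRY) {
		/* mmap_lock was dropped for us: combination (a) -> (b) */
		if (fatal_signal_pending(current))
			return fault;
		flags |= FAULT_FLAG_TRIED;
		mmap_read_lock(mm);
		goto retry;
	}

	mmap_read_unlock(mm);
	return fault;
}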
We should always try to detect pending + * signals before a retry to make sure the continuous page faults can still be + * interrupted if necessary. + */ +#define FAULT_FLAG_WRITE 0x01 +#define FAULT_FLAG_MKWRITE 0x02 +#define FAULT_FLAG_ALLOW_RETRY 0x04 +#define FAULT_FLAG_RETRY_NOWAIT 0x08 +#define FAULT_FLAG_KILLABLE 0x10 +#define FAULT_FLAG_TRIED 0x20 +#define FAULT_FLAG_USER 0x40 +#define FAULT_FLAG_REMOTE 0x80 +#define FAULT_FLAG_INSTRUCTION 0x100 +#define FAULT_FLAG_INTERRUPTIBLE 0x200 +/* Speculative fault, not holding mmap_sem */ +#define FAULT_FLAG_SPECULATIVE 0x400 + +/* + * The default fault flags that should be used by most of the + * arch-specific page fault handlers. + */ +#define FAULT_FLAG_DEFAULT (FAULT_FLAG_ALLOW_RETRY | \ + FAULT_FLAG_KILLABLE | \ + FAULT_FLAG_INTERRUPTIBLE) + +/** + * fault_flag_allow_retry_first - check ALLOW_RETRY the first time + * + * This is mostly used for places where we want to try to avoid taking + * the mmap_lock for too long a time when waiting for another condition + * to change, in which case we can try to be polite to release the + * mmap_lock in the first round to avoid potential starvation of other + * processes that would also want the mmap_lock. + * + * Return: true if the page fault allows retry and this is the first + * attempt of the fault handling; false otherwise. + */ +static inline bool fault_flag_allow_retry_first(unsigned int flags) +{ + return (flags & FAULT_FLAG_ALLOW_RETRY) && + (!(flags & FAULT_FLAG_TRIED)); +} #define FAULT_FLAG_TRACE \ { FAULT_FLAG_WRITE, "WRITE" }, \ @@ -340,10 +512,11 @@ { FAULT_FLAG_TRIED, "TRIED" }, \ { FAULT_FLAG_USER, "USER" }, \ { FAULT_FLAG_REMOTE, "REMOTE" }, \ - { FAULT_FLAG_INSTRUCTION, "INSTRUCTION" } + { FAULT_FLAG_INSTRUCTION, "INSTRUCTION" }, \ + { FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" } /* - * vm_fault is filled by the the pagefault handler and passed to the vma's + * vm_fault is filled by the pagefault handler and passed to the vma's * ->fault function. The vma's ->fault is responsible for returning a bitmask * of VM_FAULT_xxx flags that give details about how the fault was handled. * @@ -353,11 +526,18 @@ * pgoff should be used in favour of virtual_address, if possible. */ struct vm_fault { - struct vm_area_struct *vma; /* Target VMA */ - unsigned int flags; /* FAULT_FLAG_xxx flags */ - gfp_t gfp_mask; /* gfp mask to be used for allocations */ - pgoff_t pgoff; /* Logical page offset based on vma */ - unsigned long address; /* Faulting virtual address */ +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT + unsigned int sequence; + pmd_t orig_pmd; /* value of PMD at the time of fault */ +#endif + const struct { + struct vm_area_struct *vma; /* Target VMA */ + gfp_t gfp_mask; /* gfp mask to be used for allocations */ + pgoff_t pgoff; /* Logical page offset based on vma */ + unsigned long address; /* Faulting virtual address */ + }; + unsigned int flags; /* FAULT_FLAG_xxx flags + * XXX: should really be 'const' */ pmd_t *pmd; /* Pointer to pmd entry matching * the 'address' */ pud_t *pud; /* Pointer to pud entry matching @@ -366,7 +546,6 @@ pte_t orig_pte; /* Value of PTE at the time of fault */ struct page *cow_page; /* Page handler may use for COW fault */ - struct mem_cgroup *memcg; /* Cgroup cow_page belongs to */ struct page *page; /* ->fault handlers should return a * page here, unless VM_FAULT_NOPAGE * is set (which is also implied by @@ -382,12 +561,19 @@ * is not NULL, otherwise pmd. */ pgtable_t prealloc_pte; /* Pre-allocated pte page table. 
- * vm_ops->map_pages() calls - * alloc_set_pte() from atomic context. + * vm_ops->map_pages() sets up a page + * table from atomic context. * do_fault_around() pre-allocates * page table to avoid allocation from * atomic context. */ + /* + * These entries are required when handling speculative page fault. + * This way the page handling is done using consistent field values. + */ + unsigned long vma_flags; + pgprot_t vma_page_prot; + ANDROID_OEM_DATA_ARRAY(1, 2); }; /* page entry size for vm->huge_fault() */ @@ -410,7 +596,7 @@ vm_fault_t (*fault)(struct vm_fault *vmf); vm_fault_t (*huge_fault)(struct vm_fault *vmf, enum page_entry_size pe_size); - void (*map_pages)(struct vm_fault *vmf, + vm_fault_t (*map_pages)(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); unsigned long (*pagesize)(struct vm_area_struct * area); @@ -447,7 +633,7 @@ * (vma,addr) marked as MPOL_SHARED. The shared policy infrastructure * in mm/mempolicy.c will do this automatically. * get_policy() must NOT add a ref if the policy at (vma,addr) is not - * marked as MPOL_SHARED. vma policies are protected by the mmap_sem. + * marked as MPOL_SHARED. vma policies are protected by the mmap_lock. * If no [shared/vma] mempolicy exists at the addr, get_policy() op * must return NULL--i.e., do not "fallback" to task or system default * policy. @@ -463,11 +649,24 @@ struct page *(*find_special_page)(struct vm_area_struct *vma, unsigned long addr); +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT + bool (*allow_speculation)(void); +#endif + ANDROID_KABI_RESERVE(1); ANDROID_KABI_RESERVE(2); ANDROID_KABI_RESERVE(3); ANDROID_KABI_RESERVE(4); }; + +static inline void INIT_VMA(struct vm_area_struct *vma) +{ + INIT_LIST_HEAD(&vma->anon_vma_chain); +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT + seqcount_init(&vma->vm_sequence); + atomic_set(&vma->vm_ref_count, 1); +#endif +} static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) { @@ -476,7 +675,7 @@ memset(vma, 0, sizeof(*vma)); vma->vm_mm = mm; vma->vm_ops = &dummy_vm_ops; - INIT_LIST_HEAD(&vma->anon_vma_chain); + INIT_VMA(vma); } static inline void vma_set_anonymous(struct vm_area_struct *vma) @@ -484,35 +683,59 @@ vma->vm_ops = NULL; } +static inline bool vma_is_anonymous(struct vm_area_struct *vma) +{ + return !vma->vm_ops; +} + +static inline bool vma_is_temporary_stack(struct vm_area_struct *vma) +{ + int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); + + if (!maybe_stack) + return false; + + if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) == + VM_STACK_INCOMPLETE_SETUP) + return true; + + return false; +} + +static inline bool vma_is_foreign(struct vm_area_struct *vma) +{ + if (!current->mm) + return true; + + if (current->mm != vma->vm_mm) + return true; + + return false; +} + +static inline bool vma_is_accessible(struct vm_area_struct *vma) +{ + return vma->vm_flags & VM_ACCESS_FLAGS; +} + +#ifdef CONFIG_SHMEM +/* + * The vma_is_shmem is not inline because it is used only by slow + * paths in userfault. 
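For the ->fault/->map_pages changes above, a minimal sketch of a driver-style vm_operations_struct user; the demo_* names and the preallocated buffer are purely illustrative, and real page-cache backed mappings would use filemap_fault()/filemap_map_pages() instead:

#include <linux/mm.h>

struct demo_buf {
	struct page **pages;		/* preallocated by the driver */
	unsigned long nr_pages;
};

static vm_fault_t demo_fault(struct vm_fault *vmf)
{
	struct demo_buf *buf = vmf->vma->vm_private_data;

	if (vmf->pgoff >= buf->nr_pages)
		return VM_FAULT_SIGBUS;

	vmf->page = buf->pages[vmf->pgoff];
	get_page(vmf->page);		/* reference handed to the fault core */
	return 0;
}

static const struct vm_operations_struct demo_vm_ops = {
	.fault		= demo_fault,
	/* .map_pages now returns vm_fault_t; filemap_map_pages() is the
	 * reference implementation for page-cache backed mappings. */
};

The driver's mmap() handler would set vma->vm_ops = &demo_vm_ops and vma->vm_private_data = buf.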
+ */ +bool vma_is_shmem(struct vm_area_struct *vma); +#else +static inline bool vma_is_shmem(struct vm_area_struct *vma) { return false; } +#endif + +int vma_is_stack_for_current(struct vm_area_struct *vma); + /* flush_tlb_range() takes a vma, not a mm, and can care about flags */ #define TLB_FLUSH_VMA(mm,flags) { .vm_mm = (mm), .vm_flags = (flags) } struct mmu_gather; struct inode; -#define page_private(page) ((page)->private) -#define set_page_private(page, v) ((page)->private = (v)) - -#if !defined(__HAVE_ARCH_PTE_DEVMAP) || !defined(CONFIG_TRANSPARENT_HUGEPAGE) -static inline int pmd_devmap(pmd_t pmd) -{ - return 0; -} -static inline int pud_devmap(pud_t pud) -{ - return 0; -} -static inline int pgd_devmap(pgd_t pgd) -{ - return 0; -} -#endif - -/* - * FIXME: take this include out, include page-flags.h in - * files which need it (119 of them) - */ -#include <linux/page-flags.h> #include <linux/huge_mm.h> /* @@ -533,8 +756,13 @@ */ static inline int put_page_testzero(struct page *page) { + int ret; + VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); - return page_ref_dec_and_test(page); + ret = page_ref_dec_and_test(page); + page_pinner_put_page(page); + + return ret; } /* @@ -569,19 +797,19 @@ * On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there * is no special casing required. */ -static inline bool is_vmalloc_addr(const void *x) -{ -#ifdef CONFIG_MMU - unsigned long addr = (unsigned long)x; - return addr >= VMALLOC_START && addr < VMALLOC_END; -#else - return false; +#ifndef is_ioremap_addr +#define is_ioremap_addr(x) is_vmalloc_addr(x) #endif -} + #ifdef CONFIG_MMU +extern bool is_vmalloc_addr(const void *x); extern int is_vmalloc_or_module_addr(const void *x); #else +static inline bool is_vmalloc_addr(const void *x) +{ + return false; +} static inline int is_vmalloc_or_module_addr(const void *x) { return 0; @@ -617,8 +845,15 @@ return kvmalloc_array(n, size, flags | __GFP_ZERO); } +extern void *kvrealloc(const void *p, size_t oldsize, size_t newsize, + gfp_t flags); extern void kvfree(const void *addr); extern void kvfree_sensitive(const void *addr, size_t len); + +static inline int head_compound_mapcount(struct page *head) +{ + return atomic_read(compound_mapcount_ptr(head)) + 1; +} /* * Mapcount of compound page as a whole, does not include mapped sub-pages. @@ -629,7 +864,7 @@ { VM_BUG_ON_PAGE(!PageCompound(page), page); page = compound_head(page); - return atomic_read(compound_mapcount_ptr(page)) + 1; + return head_compound_mapcount(page); } /* @@ -709,7 +944,7 @@ #endif NR_COMPOUND_DTORS, }; -extern compound_page_dtor * const compound_page_dtors[]; +extern compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS]; static inline void set_compound_page_dtor(struct page *page, enum compound_dtor_id compound_dtor) @@ -718,10 +953,10 @@ page[1].compound_dtor = compound_dtor; } -static inline compound_page_dtor *get_compound_page_dtor(struct page *page) +static inline void destroy_compound_page(struct page *page) { VM_BUG_ON_PAGE(page[1].compound_dtor >= NR_COMPOUND_DTORS, page); - return compound_page_dtors[page[1].compound_dtor]; + compound_page_dtors[page[1].compound_dtor](page); } static inline unsigned int compound_order(struct page *page) @@ -731,15 +966,53 @@ return page[1].compound_order; } +static inline bool hpage_pincount_available(struct page *page) +{ + /* + * Can the page->hpage_pinned_refcount field be used? That field is in + * the 3rd page of the compound page, so the smallest (2-page) compound + * pages cannot support it. 
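The kvrealloc() declared above needs the old size because a kv allocation may live in either slab or vmalloc space; a minimal, hedged usage sketch (demo_grow_buffer and the buffer fields are illustrative):

#include <linux/mm.h>
#include <linux/errno.h>

static int demo_grow_buffer(void **bufp, size_t *sizep, size_t new_size)
{
	void *tmp = kvrealloc(*bufp, *sizep, new_size, GFP_KERNEL);

	if (!tmp)
		return -ENOMEM;		/* old buffer is left untouched */

	*bufp = tmp;			/* may or may not have moved */
	*sizep = new_size;
	return 0;
}

The result is still released with kvfree() (or kvfree_sensitive() for secrets).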
+ */ + page = compound_head(page); + return PageCompound(page) && compound_order(page) > 1; +} + +static inline int head_compound_pincount(struct page *head) +{ + return atomic_read(compound_pincount_ptr(head)); +} + +static inline int compound_pincount(struct page *page) +{ + VM_BUG_ON_PAGE(!hpage_pincount_available(page), page); + page = compound_head(page); + return head_compound_pincount(page); +} + static inline void set_compound_order(struct page *page, unsigned int order) { page[1].compound_order = order; + page[1].compound_nr = 1U << order; +} + +/* Returns the number of pages in this potentially compound page. */ +static inline unsigned long compound_nr(struct page *page) +{ + if (!PageHead(page)) + return 1; + return page[1].compound_nr; } /* Returns the number of bytes in this potentially compound page. */ static inline unsigned long page_size(struct page *page) { return PAGE_SIZE << compound_order(page); +} + +/* Returns the number of bits needed for the number of bytes in a page */ +static inline unsigned int page_shift(struct page *page) +{ + return PAGE_SHIFT + compound_order(page); } void free_compound_page(struct page *page); @@ -751,15 +1024,16 @@ * pte_mkwrite. But get_user_pages can cause write faults for mappings * that do not have writing enabled, when used by access_process_vm. */ -static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) +static inline pte_t maybe_mkwrite(pte_t pte, unsigned long vma_flags) { - if (likely(vma->vm_flags & VM_WRITE)) + if (likely(vma_flags & VM_WRITE)) pte = pte_mkwrite(pte); return pte; } -vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, - struct page *page); +vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page); +void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr); + vm_fault_t finish_fault(struct vm_fault *vmf); vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf); #endif @@ -860,10 +1134,6 @@ #define ZONEID_PGSHIFT (ZONEID_PGOFF * (ZONEID_SHIFT != 0)) -#if SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS -#error SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS -#endif - #define ZONES_MASK ((1UL << ZONES_WIDTH) - 1) #define NODES_MASK ((1UL << NODES_WIDTH) - 1) #define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1) @@ -873,6 +1143,7 @@ static inline enum zone_type page_zonenum(const struct page *page) { + ASSERT_EXCLUSIVE_BITS(page->flags, ZONES_MASK << ZONES_PGSHIFT); return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; } @@ -881,6 +1152,8 @@ { return page_zonenum(page) == ZONE_DEVICE; } +extern void memmap_init_zone_device(struct zone *, unsigned long, + unsigned long, struct dev_pagemap *); #else static inline bool is_zone_device_page(const struct page *page) { @@ -889,11 +1162,10 @@ #endif #ifdef CONFIG_DEV_PAGEMAP_OPS -void dev_pagemap_get_ops(void); -void dev_pagemap_put_ops(void); -void __put_devmap_managed_page(struct page *page); +void free_devmap_managed_page(struct page *page); DECLARE_STATIC_KEY_FALSE(devmap_managed_key); -static inline bool put_devmap_managed_page(struct page *page) + +static inline bool page_is_devmap_managed(struct page *page) { if (!static_branch_unlikely(&devmap_managed_key)) return false; @@ -901,9 +1173,7 @@ return false; switch (page->pgmap->type) { case MEMORY_DEVICE_PRIVATE: - case MEMORY_DEVICE_PUBLIC: case MEMORY_DEVICE_FS_DAX: - __put_devmap_managed_page(page); return true; default: break; @@ -911,42 +1181,34 @@ return false; } +void put_devmap_managed_page(struct page *page); + 
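A small, hedged sanity sketch tying together the compound-page helpers introduced above; the WARN_ONs only restate the definitions, and demo_compound_geometry is an illustrative name:

#include <linux/mm.h>
#include <linux/printk.h>

static void demo_compound_geometry(struct page *page)
{
	struct page *head = compound_head(page);
	unsigned int order = compound_order(head);

	/* These agree by construction (see set_compound_order()). */
	WARN_ON(compound_nr(head) != 1UL << order);
	WARN_ON(page_size(head) != PAGE_SIZE << order);
	WARN_ON(page_shift(head) != PAGE_SHIFT + order);

	/* hpage_pinned_refcount lives in page[2], so order 0/1 can't use it. */
	if (hpage_pincount_available(head))
		pr_info("pincount=%d\n", compound_pincount(head));
}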
+#else /* CONFIG_DEV_PAGEMAP_OPS */ +static inline bool page_is_devmap_managed(struct page *page) +{ + return false; +} + +static inline void put_devmap_managed_page(struct page *page) +{ +} +#endif /* CONFIG_DEV_PAGEMAP_OPS */ + static inline bool is_device_private_page(const struct page *page) { - return is_zone_device_page(page) && + return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) && + IS_ENABLED(CONFIG_DEVICE_PRIVATE) && + is_zone_device_page(page) && page->pgmap->type == MEMORY_DEVICE_PRIVATE; } -static inline bool is_device_public_page(const struct page *page) +static inline bool is_pci_p2pdma_page(const struct page *page) { - return is_zone_device_page(page) && - page->pgmap->type == MEMORY_DEVICE_PUBLIC; + return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) && + IS_ENABLED(CONFIG_PCI_P2PDMA) && + is_zone_device_page(page) && + page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA; } - -#else /* CONFIG_DEV_PAGEMAP_OPS */ -static inline void dev_pagemap_get_ops(void) -{ -} - -static inline void dev_pagemap_put_ops(void) -{ -} - -static inline bool put_devmap_managed_page(struct page *page) -{ - return false; -} - -static inline bool is_device_private_page(const struct page *page) -{ - return false; -} - -static inline bool is_device_public_page(const struct page *page) -{ - return false; -} -#endif /* CONFIG_DEV_PAGEMAP_OPS */ /* 127: arbitrary random number, small enough to assemble well */ #define page_ref_zero_or_close_to_overflow(page) \ @@ -962,6 +1224,8 @@ VM_BUG_ON_PAGE(page_ref_zero_or_close_to_overflow(page), page); page_ref_inc(page); } + +bool __must_check try_grab_page(struct page *page, unsigned int flags); static inline __must_check bool try_get_page(struct page *page) { @@ -982,11 +1246,95 @@ * need to inform the device driver through callback. See * include/linux/memremap.h and HMM for details. */ - if (put_devmap_managed_page(page)) + if (page_is_devmap_managed(page)) { + put_devmap_managed_page(page); return; + } if (put_page_testzero(page)) __put_page(page); +} + +/* + * GUP_PIN_COUNTING_BIAS, and the associated functions that use it, overload + * the page's refcount so that two separate items are tracked: the original page + * reference count, and also a new count of how many pin_user_pages() calls were + * made against the page. ("gup-pinned" is another term for the latter). + * + * With this scheme, pin_user_pages() becomes special: such pages are marked as + * distinct from normal pages. As such, the unpin_user_page() call (and its + * variants) must be used in order to release gup-pinned pages. + * + * Choice of value: + * + * By making GUP_PIN_COUNTING_BIAS a power of two, debugging of page reference + * counts with respect to pin_user_pages() and unpin_user_page() becomes + * simpler, due to the fact that adding an even power of two to the page + * refcount has the effect of using only the upper N bits, for the code that + * counts up using the bias value. This means that the lower bits are left for + * the exclusive use of the original code that increments and decrements by one + * (or at least, by much smaller values than the bias value). + * + * Of course, once the lower bits overflow into the upper bits (and this is + * OK, because subtraction recovers the original values), then visual inspection + * no longer suffices to directly view the separate counts. However, for normal + * applications that don't have huge page reference counts, this won't be an + * issue. 
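A hedged sketch of the intended caller-side pairing (a driver-style DMA path; demo_dma_to_user is illustrative, pin_user_pages_fast() and unpin_user_pages_dirty_lock() are declared further down in this header):

#include <linux/mm.h>

static int demo_dma_to_user(unsigned long uaddr, int nr_pages, bool write)
{
	struct page **pages;
	int pinned, ret = 0;

	pages = kvmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	pinned = pin_user_pages_fast(uaddr, nr_pages,
				     write ? FOLL_WRITE : 0, pages);
	if (pinned < 0) {
		ret = pinned;
		goto out;
	}

	/* ... program the device and wait for the transfer ...
	 * (pinned may be fewer than nr_pages; partial handling elided) */

	/* Pins must be dropped via unpin_user_*(), never via put_page(). */
	unpin_user_pages_dirty_lock(pages, pinned, write);
out:
	kvfree(pages);
	return ret;
}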
+ * + * Locking: the lockless algorithm described in page_cache_get_speculative() + * and page_cache_gup_pin_speculative() provides safe operation for + * get_user_pages and page_mkclean and other calls that race to set up page + * table entries. + */ +#define GUP_PIN_COUNTING_BIAS (1U << 10) + +void put_user_page(struct page *page); +void unpin_user_page(struct page *page); +void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages, + bool make_dirty); +void unpin_user_pages(struct page **pages, unsigned long npages); + +/** + * page_maybe_dma_pinned() - report if a page is pinned for DMA. + * + * This function checks if a page has been pinned via a call to + * pin_user_pages*(). + * + * For non-huge pages, the return value is partially fuzzy: false is not fuzzy, + * because it means "definitely not pinned for DMA", but true means "probably + * pinned for DMA, but possibly a false positive due to having at least + * GUP_PIN_COUNTING_BIAS worth of normal page references". + * + * False positives are OK, because: a) it's unlikely for a page to get that many + * refcounts, and b) all the callers of this routine are expected to be able to + * deal gracefully with a false positive. + * + * For huge pages, the result will be exactly correct. That's because we have + * more tracking data available: the 3rd struct page in the compound page is + * used to track the pincount (instead using of the GUP_PIN_COUNTING_BIAS + * scheme). + * + * For more information, please see Documentation/core-api/pin_user_pages.rst. + * + * @page: pointer to page to be queried. + * @Return: True, if it is likely that the page has been "dma-pinned". + * False, if the page is definitely not dma-pinned. + */ +static inline bool page_maybe_dma_pinned(struct page *page) +{ + if (hpage_pincount_available(page)) + return compound_pincount(page) > 0; + + /* + * page_ref_count() is signed. If that refcount overflows, then + * page_ref_count() returns a negative value, and callers will avoid + * further incrementing the refcount. + * + * Here, for that overflow case, use the signed bit to count a little + * bit higher via unsigned math, and thus still get an accurate result. + */ + return ((unsigned int)page_ref_count(compound_head(page))) >= + GUP_PIN_COUNTING_BIAS; } #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) @@ -1114,7 +1462,7 @@ static inline bool cpupid_pid_unset(int cpupid) { - return 1; + return true; } static inline void page_cpupid_reset_last(struct page *page) @@ -1127,23 +1475,43 @@ } #endif /* CONFIG_NUMA_BALANCING */ -#ifdef CONFIG_KASAN_SW_TAGS +#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) + +/* + * KASAN per-page tags are stored xor'ed with 0xff. This allows to avoid + * setting tags for all pages to native kernel tag value 0xff, as the default + * value 0x00 maps to 0xff. 
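A short, hedged illustration of the xor-with-0xff storage described above (0xab is an arbitrary example tag; the WARN_ONs only restate the expected round trip):

#include <linux/mm.h>

static void demo_page_tag_roundtrip(struct page *page)
{
	if (!kasan_enabled())
		return;

	/* A zeroed flags field reads back as the native kernel tag 0xff. */
	page_kasan_tag_reset(page);
	WARN_ON(page_kasan_tag(page) != 0xff);

	/* The xor is applied on both the set and the get path. */
	page_kasan_tag_set(page, 0xab);
	WARN_ON(page_kasan_tag(page) != 0xab);
}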
+ */ + static inline u8 page_kasan_tag(const struct page *page) { - return (page->flags >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK; + u8 tag = 0xff; + + if (kasan_enabled()) { + tag = (page->flags >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK; + tag ^= 0xff; + } + + return tag; } static inline void page_kasan_tag_set(struct page *page, u8 tag) { - page->flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT); - page->flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT; + if (kasan_enabled()) { + tag ^= 0xff; + page->flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT); + page->flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT; + } } static inline void page_kasan_tag_reset(struct page *page) { - page_kasan_tag_set(page, 0xff); + if (kasan_enabled()) + page_kasan_tag_set(page, 0xff); } -#else + +#else /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ + static inline u8 page_kasan_tag(const struct page *page) { return 0xff; @@ -1151,7 +1519,8 @@ static inline void page_kasan_tag_set(struct page *page, u8 tag) { } static inline void page_kasan_tag_reset(struct page *page) { } -#endif + +#endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ static inline struct zone *page_zone(const struct page *page) { @@ -1319,57 +1688,12 @@ } /* - * Different kinds of faults, as returned by handle_mm_fault(). - * Used to decide whether a process gets delivered SIGBUS or - * just gets major/minor fault counters bumped up. - */ - -#define VM_FAULT_OOM 0x0001 -#define VM_FAULT_SIGBUS 0x0002 -#define VM_FAULT_MAJOR 0x0004 -#define VM_FAULT_WRITE 0x0008 /* Special case for get_user_pages */ -#define VM_FAULT_HWPOISON 0x0010 /* Hit poisoned small page */ -#define VM_FAULT_HWPOISON_LARGE 0x0020 /* Hit poisoned large page. Index encoded in upper bits */ -#define VM_FAULT_SIGSEGV 0x0040 - -#define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */ -#define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */ -#define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */ -#define VM_FAULT_FALLBACK 0x0800 /* huge page fault failed, fall back to small */ -#define VM_FAULT_DONE_COW 0x1000 /* ->fault has fully handled COW */ -#define VM_FAULT_NEEDDSYNC 0x2000 /* ->fault did not modify page tables - * and needs fsync() to complete (for - * synchronous page faults in DAX) */ - -#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | \ - VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE | \ - VM_FAULT_FALLBACK) - -#define VM_FAULT_RESULT_TRACE \ - { VM_FAULT_OOM, "OOM" }, \ - { VM_FAULT_SIGBUS, "SIGBUS" }, \ - { VM_FAULT_MAJOR, "MAJOR" }, \ - { VM_FAULT_WRITE, "WRITE" }, \ - { VM_FAULT_HWPOISON, "HWPOISON" }, \ - { VM_FAULT_HWPOISON_LARGE, "HWPOISON_LARGE" }, \ - { VM_FAULT_SIGSEGV, "SIGSEGV" }, \ - { VM_FAULT_NOPAGE, "NOPAGE" }, \ - { VM_FAULT_LOCKED, "LOCKED" }, \ - { VM_FAULT_RETRY, "RETRY" }, \ - { VM_FAULT_FALLBACK, "FALLBACK" }, \ - { VM_FAULT_DONE_COW, "DONE_COW" }, \ - { VM_FAULT_NEEDDSYNC, "NEEDDSYNC" } - -/* Encode hstate index for a hwpoisoned large page */ -#define VM_FAULT_SET_HINDEX(x) ((x) << 12) -#define VM_FAULT_GET_HINDEX(x) (((x) >> 12) & 0xf) - -/* * Can be called by the pagefault handler when it gets a VM_FAULT_OOM. 
*/ extern void pagefault_out_of_memory(void); #define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK) +#define offset_in_thp(page, p) ((unsigned long)(p) & (thp_size(page) - 1)) /* * Flags passed to show_mem() and show_free_areas() to suppress output in @@ -1379,7 +1703,11 @@ extern void show_free_areas(unsigned int flags, nodemask_t *nodemask); +#ifdef CONFIG_MMU extern bool can_do_mlock(void); +#else +static inline bool can_do_mlock(void) { return false; } +#endif extern int user_shm_lock(size_t, struct user_struct *); extern void user_shm_unlock(size_t, struct user_struct *); @@ -1394,8 +1722,12 @@ }; struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr, - pte_t pte, bool with_public_device); -#define vm_normal_page(vma, addr, pte) _vm_normal_page(vma, addr, pte, false) + pte_t pte, unsigned long vma_flags); +static inline struct page *vm_normal_page(struct vm_area_struct *vma, + unsigned long addr, pte_t pte) +{ + return _vm_normal_page(vma, addr, pte, vma->vm_flags); +} struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd); @@ -1407,65 +1739,51 @@ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long start, unsigned long end); -/** - * mm_walk - callbacks for walk_page_range - * @pud_entry: if set, called for each non-empty PUD (2nd-level) entry - * this handler should only handle pud_trans_huge() puds. - * the pmd_entry or pte_entry callbacks will be used for - * regular PUDs. - * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry - * this handler is required to be able to handle - * pmd_trans_huge() pmds. They may simply choose to - * split_huge_page() instead of handling it explicitly. - * @pte_entry: if set, called for each non-empty PTE (4th-level) entry - * @pte_hole: if set, called for each hole at all levels - * @hugetlb_entry: if set, called for each hugetlb entry - * @test_walk: caller specific callback function to determine whether - * we walk over the current vma or not. Returning 0 - * value means "do page table walk over the current vma," - * and a negative one means "abort current page table walk - * right now." 1 means "skip the current vma." 
- * @mm: mm_struct representing the target process of page table walk - * @vma: vma currently walked (NULL if walking outside vmas) - * @private: private data for callbacks' usage - * - * (see the comment on walk_page_range() for more details) - */ -struct mm_walk { - int (*pud_entry)(pud_t *pud, unsigned long addr, - unsigned long next, struct mm_walk *walk); - int (*pmd_entry)(pmd_t *pmd, unsigned long addr, - unsigned long next, struct mm_walk *walk); - int (*pte_entry)(pte_t *pte, unsigned long addr, - unsigned long next, struct mm_walk *walk); - int (*pte_hole)(unsigned long addr, unsigned long next, - struct mm_walk *walk); - int (*hugetlb_entry)(pte_t *pte, unsigned long hmask, - unsigned long addr, unsigned long next, - struct mm_walk *walk); - int (*test_walk)(unsigned long addr, unsigned long next, - struct mm_walk *walk); - struct mm_struct *mm; - struct vm_area_struct *vma; - void *private; -}; +struct mmu_notifier_range; -int walk_page_range(unsigned long addr, unsigned long end, - struct mm_walk *walk); -int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk); void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); -int copy_page_range(struct mm_struct *dst, struct mm_struct *src, - struct vm_area_struct *vma); -int follow_pte_pmd(struct mm_struct *mm, unsigned long address, - unsigned long *start, unsigned long *end, - pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp); +int +copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); +int follow_invalidate_pte(struct mm_struct *mm, unsigned long address, + struct mmu_notifier_range *range, pte_t **ptepp, + pmd_t **pmdpp, spinlock_t **ptlp); +int follow_pte(struct mm_struct *mm, unsigned long address, + pte_t **ptepp, spinlock_t **ptlp); int follow_pfn(struct vm_area_struct *vma, unsigned long address, unsigned long *pfn); int follow_phys(struct vm_area_struct *vma, unsigned long address, unsigned int flags, unsigned long *prot, resource_size_t *phys); int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write); + +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT +static inline void vm_write_begin(struct vm_area_struct *vma) +{ + /* + * Isolated vma might be freed without exclusive mmap_lock but + * speculative page fault handler still needs to know it was changed. + */ + if (!RB_EMPTY_NODE(&vma->vm_rb)) + mmap_assert_write_locked(vma->vm_mm); + /* + * The reads never spins and preemption + * disablement is not required. 
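Very roughly, the protocol these helpers implement looks as follows. The writer side is vm_write_begin()/vm_write_end() as defined here; the reader side below only illustrates the usual seqcount snapshot-and-retry pattern (the real speculative handler, __handle_speculative_fault(), does considerably more). The sketch assumes CONFIG_SPECULATIVE_PAGE_FAULT and the demo_* names are illustrative:

#include <linux/mm.h>

/* Writer: any path that modifies a VMA under mmap_lock. */
static void demo_update_vma(struct vm_area_struct *vma, unsigned long newflags)
{
	vm_write_begin(vma);
	WRITE_ONCE(vma->vm_flags, newflags);
	/* ... vm_page_prot, rbtree links, etc. ... */
	vm_write_end(vma);
}

/* Reader: snapshot the VMA without mmap_lock and validate afterwards. */
static bool demo_speculative_snapshot(struct vm_area_struct *vma,
				      unsigned long *flags)
{
	unsigned int seq = raw_read_seqcount(&vma->vm_sequence);

	if (seq & 1)			/* writer in progress, fall back */
		return false;

	*flags = READ_ONCE(vma->vm_flags);
	/* ... handle the fault using only the snapshot ... */

	return !read_seqcount_retry(&vma->vm_sequence, seq);
}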
+ */ + raw_write_seqcount_begin(&vma->vm_sequence); +} +static inline void vm_write_end(struct vm_area_struct *vma) +{ + raw_write_seqcount_end(&vma->vm_sequence); +} +#else +static inline void vm_write_begin(struct vm_area_struct *vma) +{ +} +static inline void vm_write_end(struct vm_area_struct *vma) +{ +} +#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */ extern void truncate_pagecache(struct inode *inode, loff_t new); extern void truncate_setsize(struct inode *inode, loff_t newsize); @@ -1477,10 +1795,51 @@ #ifdef CONFIG_MMU extern vm_fault_t handle_mm_fault(struct vm_area_struct *vma, - unsigned long address, unsigned int flags); -extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, + unsigned long address, unsigned int flags, + struct pt_regs *regs); +extern int fixup_user_fault(struct mm_struct *mm, unsigned long address, unsigned int fault_flags, bool *unlocked); + +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT +extern vm_fault_t __handle_speculative_fault(struct mm_struct *mm, + unsigned long address, + unsigned int flags, + struct vm_area_struct **vma, + struct pt_regs *regs); +static inline vm_fault_t handle_speculative_fault(struct mm_struct *mm, + unsigned long address, + unsigned int flags, + struct vm_area_struct **vma, + struct pt_regs *regs) +{ + /* + * Try speculative page fault for multithreaded user space task only. + */ + if (!(flags & FAULT_FLAG_USER) || atomic_read(&mm->mm_users) == 1) { + *vma = NULL; + return VM_FAULT_RETRY; + } + return __handle_speculative_fault(mm, address, flags, vma, regs); +} +extern bool can_reuse_spf_vma(struct vm_area_struct *vma, + unsigned long address); +#else +static inline vm_fault_t handle_speculative_fault(struct mm_struct *mm, + unsigned long address, + unsigned int flags, + struct vm_area_struct **vma, + struct pt_regs *regs) +{ + return VM_FAULT_RETRY; +} +static inline bool can_reuse_spf_vma(struct vm_area_struct *vma, + unsigned long address) +{ + return false; +} +#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */ + void unmap_mapping_page(struct page *page); void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t nr, bool even_cows); @@ -1488,14 +1847,14 @@ loff_t const holebegin, loff_t const holelen, int even_cows); #else static inline vm_fault_t handle_mm_fault(struct vm_area_struct *vma, - unsigned long address, unsigned int flags) + unsigned long address, unsigned int flags, + struct pt_regs *regs) { /* should never happen if there's no MMU */ BUG(); return VM_FAULT_SIGBUS; } -static inline int fixup_user_fault(struct task_struct *tsk, - struct mm_struct *mm, unsigned long address, +static inline int fixup_user_fault(struct mm_struct *mm, unsigned long address, unsigned int fault_flags, bool *unlocked) { /* should never happen if there's no MMU */ @@ -1522,32 +1881,37 @@ extern int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags); -long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, +long get_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *locked); +long pin_user_pages_remote(struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + struct vm_area_struct **vmas, int *locked); long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas); +long 
pin_user_pages(unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + struct vm_area_struct **vmas); long get_user_pages_locked(unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, int *locked); +long pin_user_pages_locked(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); -#ifdef CONFIG_FS_DAX -long get_user_pages_longterm(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas); -#else -static inline long get_user_pages_longterm(unsigned long start, - unsigned long nr_pages, unsigned int gup_flags, - struct page **pages, struct vm_area_struct **vmas) -{ - return get_user_pages(start, nr_pages, gup_flags, pages, vmas); -} -#endif /* CONFIG_FS_DAX */ +long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, + struct page **pages, unsigned int gup_flags); -int get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages); +int get_user_pages_fast(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages); +int pin_user_pages_fast(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages); + +int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc); +int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc, + struct task_struct *task, bool bypass_rlim); /* Container for pinned pfns / pages */ struct frame_vector { @@ -1555,7 +1919,7 @@ unsigned int nr_frames; /* Number of frames stored in ptrs array */ bool got_ref; /* Did we pin pages by getting page ref? */ bool is_pfns; /* Does array contain pages or pfns? */ - void *ptrs[0]; /* Array of pinned pfns / pages. Use + void *ptrs[]; /* Array of pinned pfns / pages. Use * pfns_vector_pages() or pfns_vector_pfns() * for access */ }; @@ -1622,30 +1986,30 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen); -static inline bool vma_is_anonymous(struct vm_area_struct *vma) -{ - return !vma->vm_ops; -} - -#ifdef CONFIG_SHMEM -/* - * The vma_is_shmem is not inline because it is used only by slow - * paths in userfault. - */ -bool vma_is_shmem(struct vm_area_struct *vma); -#else -static inline bool vma_is_shmem(struct vm_area_struct *vma) { return false; } -#endif - -int vma_is_stack_for_current(struct vm_area_struct *vma); - extern unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long len, bool need_rmap_locks); + +/* + * Flags used by change_protection(). For now we make it a bitmap so + * that we can pass in multiple flags just like parameters. However + * for now all the callers are only use one of the flags at the same + * time. 
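As a hedged sketch of how callers combine these bits (defined just below) with the new change_protection() signature, mirroring the mprotect and NUMA-hinting paths (demo_reprotect is illustrative):

#include <linux/mm.h>

static unsigned long demo_reprotect(struct vm_area_struct *vma,
				    unsigned long start, unsigned long end,
				    bool dirty_accountable)
{
	/* mprotect()-style update: optionally allow dirty bit accounting. */
	return change_protection(vma, start, end, vma->vm_page_prot,
				 dirty_accountable ? MM_CP_DIRTY_ACCT : 0);
}

/*
 * The NUMA-hinting path instead passes PAGE_NONE together with
 * MM_CP_PROT_NUMA, and userfaultfd write protection uses MM_CP_UFFD_WP /
 * MM_CP_UFFD_WP_RESOLVE (or MM_CP_UFFD_WP_ALL).
 */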
+ */ +/* Whether we should allow dirty bit accounting */ +#define MM_CP_DIRTY_ACCT (1UL << 0) +/* Whether this protection change is for NUMA hints */ +#define MM_CP_PROT_NUMA (1UL << 1) +/* Whether this change is for write protecting */ +#define MM_CP_UFFD_WP (1UL << 2) /* do wp */ +#define MM_CP_UFFD_WP_RESOLVE (1UL << 3) /* Resolve wp */ +#define MM_CP_UFFD_WP_ALL (MM_CP_UFFD_WP | \ + MM_CP_UFFD_WP_RESOLVE) + extern unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, unsigned long end, pgprot_t newprot, - int dirty_accountable, int prot_numa); + unsigned long cp_flags); extern int mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, unsigned long start, unsigned long end, unsigned long newflags); @@ -1653,8 +2017,16 @@ /* * doesn't attempt to fault and will return short. */ -int __get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages); +int get_user_pages_fast_only(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages); +int pin_user_pages_fast_only(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages); + +static inline bool get_user_page_fast_only(unsigned long addr, + unsigned int gup_flags, struct page **pagep) +{ + return get_user_pages_fast_only(addr, 1, gup_flags, pagep) == 1; +} /* * per-process(per-mm_struct) statistics. */ @@ -1765,7 +2137,19 @@ } #endif -#ifndef __HAVE_ARCH_PTE_DEVMAP +#ifndef CONFIG_ARCH_HAS_PTE_SPECIAL +static inline int pte_special(pte_t pte) +{ + return 0; +} + +static inline pte_t pte_mkspecial(pte_t pte) +{ + return pte; +} +#endif + +#ifndef CONFIG_ARCH_HAS_PTE_DEVMAP static inline int pte_devmap(pte_t pte) { return 0; @@ -1881,16 +2265,11 @@ static inline void mm_dec_nr_ptes(struct mm_struct *mm) {} #endif -int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address); -int __pte_alloc_kernel(pmd_t *pmd, unsigned long address); +int __pte_alloc(struct mm_struct *mm, pmd_t *pmd); +int __pte_alloc_kernel(pmd_t *pmd); -/* - * The following ifdef needed to get the 4level-fixup.h header to work. - * Remove it when 4level-fixup.h has been removed. - */ -#if defined(CONFIG_MMU) && !defined(__ARCH_HAS_4LEVEL_HACK) +#if defined(CONFIG_MMU) -#ifndef __ARCH_HAS_5LEVEL_HACK static inline p4d_t *p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) { @@ -1904,14 +2283,13 @@ return (unlikely(p4d_none(*p4d)) && __pud_alloc(mm, p4d, address)) ? NULL : pud_offset(p4d, address); } -#endif /* !__ARCH_HAS_5LEVEL_HACK */ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) { return (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))? NULL: pmd_offset(pud, address); } -#endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */ +#endif /* CONFIG_MMU */ #if USE_SPLIT_PTE_PTLOCKS #if ALLOC_SPLIT_PTLOCKS @@ -1964,13 +2342,6 @@ return true; } -/* Reset page->mapping so free_pages_check won't complain. */ -static inline void pte_lock_deinit(struct page *page) -{ - page->mapping = NULL; - ptlock_free(page); -} - #else /* !USE_SPLIT_PTE_PTLOCKS */ /* * We use mm->page_table_lock to guard all pagetable pages of the mm. 
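A minimal, hedged sketch of the pattern both lock flavours serve: callers go through pte_offset_map_lock(), which resolves to either the split per-page ptlock or mm->page_table_lock, so the walking code is identical either way (demo_count_present_ptes is illustrative; addr..end must be non-empty and stay within one PMD):

#include <linux/mm.h>

static int demo_count_present_ptes(struct mm_struct *mm, pmd_t *pmd,
				   unsigned long addr, unsigned long end)
{
	spinlock_t *ptl;
	pte_t *pte;
	int present = 0;

	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	for (; addr < end; addr += PAGE_SIZE, pte++)
		if (pte_present(*pte))
			present++;
	pte_unmap_unlock(pte - 1, ptl);

	return present;
}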
@@ -1981,7 +2352,7 @@ } static inline void ptlock_cache_init(void) {} static inline bool ptlock_init(struct page *page) { return true; } -static inline void pte_lock_deinit(struct page *page) {} +static inline void ptlock_free(struct page *page) {} #endif /* USE_SPLIT_PTE_PTLOCKS */ static inline void pgtable_init(void) @@ -1990,7 +2361,7 @@ pgtable_cache_init(); } -static inline bool pgtable_page_ctor(struct page *page) +static inline bool pgtable_pte_page_ctor(struct page *page) { if (!ptlock_init(page)) return false; @@ -1999,9 +2370,9 @@ return true; } -static inline void pgtable_page_dtor(struct page *page) +static inline void pgtable_pte_page_dtor(struct page *page) { - pte_lock_deinit(page); + ptlock_free(page); __ClearPageTable(page); dec_zone_page_state(page, NR_PAGETABLE); } @@ -2020,18 +2391,17 @@ pte_unmap(pte); \ } while (0) -#define pte_alloc(mm, pmd, address) \ - (unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, pmd, address)) +#define pte_alloc(mm, pmd) (unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, pmd)) #define pte_alloc_map(mm, pmd, address) \ - (pte_alloc(mm, pmd, address) ? NULL : pte_offset_map(pmd, address)) + (pte_alloc(mm, pmd) ? NULL : pte_offset_map(pmd, address)) #define pte_alloc_map_lock(mm, pmd, address, ptlp) \ - (pte_alloc(mm, pmd, address) ? \ + (pte_alloc(mm, pmd) ? \ NULL : pte_offset_map_lock(mm, pmd, address, ptlp)) #define pte_alloc_kernel(pmd, address) \ - ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd, address))? \ + ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd))? \ NULL: pte_offset_kernel(pmd, address)) #if USE_SPLIT_PMD_PTLOCKS @@ -2047,7 +2417,7 @@ return ptlock_ptr(pmd_to_page(pmd)); } -static inline bool pgtable_pmd_page_ctor(struct page *page) +static inline bool pmd_ptlock_init(struct page *page) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE page->pmd_huge_pte = NULL; @@ -2055,7 +2425,7 @@ return ptlock_init(page); } -static inline void pgtable_pmd_page_dtor(struct page *page) +static inline void pmd_ptlock_free(struct page *page) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE VM_BUG_ON_PAGE(page->pmd_huge_pte, page); @@ -2072,8 +2442,8 @@ return &mm->page_table_lock; } -static inline bool pgtable_pmd_page_ctor(struct page *page) { return true; } -static inline void pgtable_pmd_page_dtor(struct page *page) {} +static inline bool pmd_ptlock_init(struct page *page) { return true; } +static inline void pmd_ptlock_free(struct page *page) {} #define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte) @@ -2084,6 +2454,22 @@ spinlock_t *ptl = pmd_lockptr(mm, pmd); spin_lock(ptl); return ptl; +} + +static inline bool pgtable_pmd_page_ctor(struct page *page) +{ + if (!pmd_ptlock_init(page)) + return false; + __SetPageTable(page); + inc_zone_page_state(page, NR_PAGETABLE); + return true; +} + +static inline void pgtable_pmd_page_dtor(struct page *page) +{ + pmd_ptlock_free(page); + __ClearPageTable(page); + dec_zone_page_state(page, NR_PAGETABLE); } /* @@ -2106,9 +2492,7 @@ } extern void __init pagecache_init(void); -extern void free_area_init(unsigned long * zones_size); -extern void __init free_area_init_node(int nid, unsigned long * zones_size, - unsigned long zone_start_pfn, unsigned long *zholes_size); +extern void __init free_area_init_memoryless_node(int nid); extern void free_initmem(void); /* @@ -2118,7 +2502,7 @@ * Return pages freed into the buddy system. 
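The ctor/dtor pair above is meant to be called from the arch (or generic) page-table allocators; a hedged sketch of that pairing, assuming pgtable_t is struct page * as on most architectures (the demo_* names are illustrative):

#include <linux/mm.h>
#include <linux/gfp.h>

static pgtable_t demo_pte_alloc_one(struct mm_struct *mm)
{
	struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO);

	if (!page)
		return NULL;
	if (!pgtable_pte_page_ctor(page)) {	/* ptlock init + PageTable + stats */
		__free_page(page);
		return NULL;
	}
	return page;
}

static void demo_pte_free(struct mm_struct *mm, pgtable_t page)
{
	pgtable_pte_page_dtor(page);		/* must run before the page is freed */
	__free_page(page);
}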
*/ extern unsigned long free_reserved_area(void *start, void *end, - int poison, char *s); + int poison, const char *s); #ifdef CONFIG_HIGHMEM /* @@ -2178,34 +2562,23 @@ return phys_pages; } -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP /* - * With CONFIG_HAVE_MEMBLOCK_NODE_MAP set, an architecture may initialise its - * zones, allocate the backing mem_map and account for memory holes in a more - * architecture independent manner. This is a substitute for creating the - * zone_sizes[] and zholes_size[] arrays and passing them to - * free_area_init_node() + * Using memblock node mappings, an architecture may initialise its + * zones, allocate the backing mem_map and account for memory holes in an + * architecture independent manner. * * An architecture is expected to register range of page frames backed by * physical memory with memblock_add[_node]() before calling - * free_area_init_nodes() passing in the PFN each zone ends at. At a basic + * free_area_init() passing in the PFN each zone ends at. At a basic * usage, an architecture is expected to do something like * * unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn, * max_highmem_pfn}; * for_each_valid_physical_page_range() * memblock_add_node(base, size, nid) - * free_area_init_nodes(max_zone_pfns); - * - * free_bootmem_with_active_regions() calls free_bootmem_node() for each - * registered physical page range. Similarly - * sparse_memory_present_with_active_regions() calls memory_present() for - * each range when SPARSEMEM is enabled. - * - * See mm/page_alloc.c for more information on each function exposed by - * CONFIG_HAVE_MEMBLOCK_NODE_MAP. + * free_area_init(max_zone_pfns); */ -extern void free_area_init_nodes(unsigned long *max_zone_pfn); +void free_area_init(unsigned long *max_zone_pfn); unsigned long node_map_pfn_alignment(void); unsigned long __absent_pages_in_range(int nid, unsigned long start_pfn, unsigned long end_pfn); @@ -2214,16 +2587,9 @@ extern void get_pfn_range_for_nid(unsigned int nid, unsigned long *start_pfn, unsigned long *end_pfn); extern unsigned long find_min_pfn_with_active_regions(void); -extern void free_bootmem_with_active_regions(int nid, - unsigned long max_low_pfn); -extern void sparse_memory_present_with_active_regions(int nid); -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ - -#if !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) && \ - !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) -static inline int __early_pfn_to_nid(unsigned long pfn, - struct mminit_pfnnid_cache *state) +#ifndef CONFIG_NEED_MULTIPLE_NODES +static inline int early_pfn_to_nid(unsigned long pfn) { return 0; } @@ -2235,15 +2601,10 @@ struct mminit_pfnnid_cache *state); #endif -#if defined(CONFIG_HAVE_MEMBLOCK) && !defined(CONFIG_FLAT_NODE_MEM_MAP) -void zero_resv_unavail(void); -#else -static inline void zero_resv_unavail(void) {} -#endif - extern void set_dma_reserve(unsigned long new_dma_reserve); -extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long, - enum meminit_context, struct vmem_altmap *); +extern void memmap_init_zone(unsigned long, int, unsigned long, + unsigned long, unsigned long, enum meminit_context, + struct vmem_altmap *, int migratetype); extern void setup_per_zone_wmarks(void); extern int __meminit init_per_zone_wmark_min(void); extern void mem_init(void); @@ -2261,12 +2622,11 @@ extern void setup_per_cpu_pageset(void); -extern void zone_pcp_update(struct zone *zone); -extern void zone_pcp_reset(struct zone *zone); - /* page_alloc.c */ extern int min_free_kbytes; +extern int 
watermark_boost_factor; extern int watermark_scale_factor; +extern bool arch_has_descending_max_zone_pfns(void); /* nommu.c */ extern atomic_long_t mmap_pages_allocated; @@ -2310,16 +2670,29 @@ extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin); extern int __vma_adjust(struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert, - struct vm_area_struct *expand); + struct vm_area_struct *expand, bool keep_locked); static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) { - return __vma_adjust(vma, start, end, pgoff, insert, NULL); + return __vma_adjust(vma, start, end, pgoff, insert, NULL, false); } -extern struct vm_area_struct *vma_merge(struct mm_struct *, + +extern struct vm_area_struct *__vma_merge(struct mm_struct *mm, struct vm_area_struct *prev, unsigned long addr, unsigned long end, - unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, - struct mempolicy *, struct vm_userfaultfd_ctx, const char __user *); + unsigned long vm_flags, struct anon_vma *anon, struct file *file, + pgoff_t pgoff, struct mempolicy *mpol, struct vm_userfaultfd_ctx uff, + const char __user *user, bool keep_locked); + +static inline struct vm_area_struct *vma_merge(struct mm_struct *mm, + struct vm_area_struct *prev, unsigned long addr, unsigned long end, + unsigned long vm_flags, struct anon_vma *anon, struct file *file, + pgoff_t off, struct mempolicy *pol, struct vm_userfaultfd_ctx uff, + const char __user *user) +{ + return __vma_merge(mm, prev, addr, end, vm_flags, anon, file, off, + pol, uff, user, false); +} + extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); extern int __split_vma(struct mm_struct *, struct vm_area_struct *, unsigned long addr, int new_below); @@ -2369,6 +2742,9 @@ unsigned long addr, unsigned long len, unsigned long flags, struct page **pages); +unsigned long randomize_stack_top(unsigned long stack_top); +unsigned long randomize_page(unsigned long start, unsigned long range); + extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); extern unsigned long mmap_region(struct file *file, unsigned long addr, @@ -2376,19 +2752,12 @@ struct list_head *uf); extern unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, - vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate, - struct list_head *uf); + unsigned long pgoff, unsigned long *populate, struct list_head *uf); +extern int __do_munmap(struct mm_struct *, unsigned long, size_t, + struct list_head *uf, bool downgrade); extern int do_munmap(struct mm_struct *, unsigned long, size_t, struct list_head *uf); - -static inline unsigned long -do_mmap_pgoff(struct file *file, unsigned long addr, - unsigned long len, unsigned long prot, unsigned long flags, - unsigned long pgoff, unsigned long *populate, - struct list_head *uf) -{ - return do_mmap(file, addr, len, prot, flags, 0, pgoff, populate, uf); -} +extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior); #ifdef CONFIG_MMU extern int __mm_populate(unsigned long addr, unsigned long len, @@ -2420,26 +2789,7 @@ unsigned long align_offset; }; -extern unsigned long unmapped_area(struct vm_unmapped_area_info *info); -extern unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info); - -/* - * Search for 
an unmapped address range. - * - * We are looking for a range that: - * - does not intersect with any VMA; - * - is contained within the [low_limit, high_limit) interval; - * - is at least the desired size. - * - satisfies (begin_addr & align_mask) == (align_offset & align_mask) - */ -static inline unsigned long -vm_unmapped_area(struct vm_unmapped_area_info *info) -{ - if (info->flags & VM_UNMAPPED_AREA_TOPDOWN) - return unmapped_area_topdown(info); - else - return unmapped_area(info); -} +extern unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info); /* truncate.c */ extern void truncate_inode_pages(struct address_space *, loff_t); @@ -2449,39 +2799,22 @@ /* generic vm_area_ops exported for stackable file systems */ extern vm_fault_t filemap_fault(struct vm_fault *vmf); -extern void filemap_map_pages(struct vm_fault *vmf, +extern vm_fault_t filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf); +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT +extern bool filemap_allow_speculation(void); +#endif /* mm/page-writeback.c */ int __must_check write_one_page(struct page *page); void task_dirty_inc(struct task_struct *tsk); -/* readahead.c */ -#define VM_MAX_READAHEAD 128 /* kbytes */ -#define VM_MIN_READAHEAD 16 /* kbytes (includes current page) */ - -int force_page_cache_readahead(struct address_space *mapping, struct file *filp, - pgoff_t offset, unsigned long nr_to_read); - -void page_cache_sync_readahead(struct address_space *mapping, - struct file_ra_state *ra, - struct file *filp, - pgoff_t offset, - unsigned long size); - -void page_cache_async_readahead(struct address_space *mapping, - struct file_ra_state *ra, - struct file *filp, - struct page *pg, - pgoff_t offset, - unsigned long size); - extern unsigned long stack_guard_gap; /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */ extern int expand_stack(struct vm_area_struct *vma, unsigned long address); -/* CONFIG_STACK_GROWSUP still needs to to grow downwards at some places */ +/* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */ extern int expand_downwards(struct vm_area_struct *vma, unsigned long address); #if VM_GROWSUP @@ -2576,12 +2909,20 @@ int remap_pfn_range(struct vm_area_struct *, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t); int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); -int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, +int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, + struct page **pages, unsigned long *num); +int vm_map_pages(struct vm_area_struct *vma, struct page **pages, + unsigned long num); +int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages, + unsigned long num); +vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn); -int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, +vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, pgprot_t pgprot); -int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, +vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn); +vm_fault_t vmf_insert_mixed_prot(struct vm_area_struct *vma, unsigned long addr, + pfn_t pfn, pgprot_t pgprot); vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn); int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, 
unsigned long len); @@ -2590,32 +2931,6 @@ unsigned long addr, struct page *page) { int err = vm_insert_page(vma, addr, page); - - if (err == -ENOMEM) - return VM_FAULT_OOM; - if (err < 0 && err != -EBUSY) - return VM_FAULT_SIGBUS; - - return VM_FAULT_NOPAGE; -} - -static inline vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, - unsigned long addr, pfn_t pfn) -{ - int err = vm_insert_mixed(vma, addr, pfn); - - if (err == -ENOMEM) - return VM_FAULT_OOM; - if (err < 0 && err != -EBUSY) - return VM_FAULT_SIGBUS; - - return VM_FAULT_NOPAGE; -} - -static inline vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, - unsigned long addr, unsigned long pfn) -{ - int err = vm_insert_pfn(vma, addr, pfn); if (err == -ENOMEM) return VM_FAULT_OOM; @@ -2641,16 +2956,8 @@ return VM_FAULT_SIGBUS; } -struct page *follow_page_mask(struct vm_area_struct *vma, - unsigned long address, unsigned int foll_flags, - unsigned int *page_mask); - -static inline struct page *follow_page(struct vm_area_struct *vma, - unsigned long address, unsigned int foll_flags) -{ - unsigned int unused_page_mask; - return follow_page_mask(vma, address, foll_flags, &unused_page_mask); -} +struct page *follow_page(struct vm_area_struct *vma, unsigned long address, + unsigned int foll_flags); #define FOLL_WRITE 0x01 /* check pte is writable */ #define FOLL_TOUCH 0x02 /* mark page accessed */ @@ -2669,6 +2976,66 @@ #define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */ #define FOLL_COW 0x4000 /* internal GUP flag */ #define FOLL_ANON 0x8000 /* don't do file mappings */ +#define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite: see below */ +#define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */ +#define FOLL_PIN 0x40000 /* pages must be released via unpin_user_page */ +#define FOLL_FAST_ONLY 0x80000 /* gup_fast: prevent fall-back to slow gup */ + +/* + * FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each + * other. Here is what they mean, and how to use them: + * + * FOLL_LONGTERM indicates that the page will be held for an indefinite time + * period _often_ under userspace control. This is in contrast to + * iov_iter_get_pages(), whose usages are transient. + * + * FIXME: For pages which are part of a filesystem, mappings are subject to the + * lifetime enforced by the filesystem and we need guarantees that longterm + * users like RDMA and V4L2 only establish mappings which coordinate usage with + * the filesystem. Ideas for this coordination include revoking the longterm + * pin, delaying writeback, bounce buffer page writeback, etc. As FS DAX was + * added after the problem with filesystems was found FS DAX VMAs are + * specifically failed. Filesystem pages are still subject to bugs and use of + * FOLL_LONGTERM should be avoided on those pages. + * + * FIXME: Also NOTE that FOLL_LONGTERM is not supported in every GUP call. + * Currently only get_user_pages() and get_user_pages_fast() support this flag + * and calls to get_user_pages_[un]locked are specifically not allowed. This + * is due to an incompatibility with the FS DAX check and + * FAULT_FLAG_ALLOW_RETRY. + * + * In the CMA case: long term pins in a CMA region would unnecessarily fragment + * that region. And so, CMA attempts to migrate the page before pinning, when + * FOLL_LONGTERM is specified. + * + * FOLL_PIN indicates that a special kind of tracking (not just page->_refcount, + * but an additional pin counting system) will be invoked. 
This is intended for + * anything that gets a page reference and then touches page data (for example, + * Direct IO). This lets the filesystem know that some non-file-system entity is + * potentially changing the pages' data. In contrast to FOLL_GET (whose pages + * are released via put_page()), FOLL_PIN pages must be released, ultimately, by + * a call to unpin_user_page(). + * + * FOLL_PIN is similar to FOLL_GET: both of these pin pages. They use different + * and separate refcounting mechanisms, however, and that means that each has + * its own acquire and release mechanisms: + * + * FOLL_GET: get_user_pages*() to acquire, and put_page() to release. + * + * FOLL_PIN: pin_user_pages*() to acquire, and unpin_user_pages to release. + * + * FOLL_PIN and FOLL_GET are mutually exclusive for a given function call. + * (The underlying pages may experience both FOLL_GET-based and FOLL_PIN-based + * calls applied to them, and that's perfectly OK. This is a constraint on the + * callers, not on the pages.) + * + * FOLL_PIN should be set internally by the pin_user_pages*() APIs, never + * directly by the caller. That's in order to help avoid mismatches when + * releasing pages: get_user_pages*() pages must be released via put_page(), + * while pin_user_pages*() pages must be released via unpin_user_page(). + * + * Please see Documentation/core-api/pin_user_pages.rst for more information. + */ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) { @@ -2681,76 +3048,121 @@ return 0; } -typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, - void *data); +typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data); extern int apply_to_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, pte_fn_t fn, void *data); +extern int apply_to_existing_page_range(struct mm_struct *mm, + unsigned long address, unsigned long size, + pte_fn_t fn, void *data); - +extern void init_mem_debugging_and_hardening(void); #ifdef CONFIG_PAGE_POISONING -extern bool page_poisoning_enabled(void); -extern void kernel_poison_pages(struct page *page, int numpages, int enable); +extern void __kernel_poison_pages(struct page *page, int numpages); +extern void __kernel_unpoison_pages(struct page *page, int numpages); +extern bool _page_poisoning_enabled_early; +DECLARE_STATIC_KEY_FALSE(_page_poisoning_enabled); +static inline bool page_poisoning_enabled(void) +{ + return _page_poisoning_enabled_early; +} +/* + * For use in fast paths after init_mem_debugging() has run, or when a + * false negative result is not harmful when called too early. 
+ */ +static inline bool page_poisoning_enabled_static(void) +{ + return static_branch_unlikely(&_page_poisoning_enabled); +} +static inline void kernel_poison_pages(struct page *page, int numpages) +{ + if (page_poisoning_enabled_static()) + __kernel_poison_pages(page, numpages); +} +static inline void kernel_unpoison_pages(struct page *page, int numpages) +{ + if (page_poisoning_enabled_static()) + __kernel_unpoison_pages(page, numpages); +} #else static inline bool page_poisoning_enabled(void) { return false; } -static inline void kernel_poison_pages(struct page *page, int numpages, - int enable) { } +static inline bool page_poisoning_enabled_static(void) { return false; } +static inline void __kernel_poison_pages(struct page *page, int nunmpages) { } +static inline void kernel_poison_pages(struct page *page, int numpages) { } +static inline void kernel_unpoison_pages(struct page *page, int numpages) { } #endif -#ifdef CONFIG_INIT_ON_ALLOC_DEFAULT_ON -DECLARE_STATIC_KEY_TRUE(init_on_alloc); -#else DECLARE_STATIC_KEY_FALSE(init_on_alloc); -#endif static inline bool want_init_on_alloc(gfp_t flags) { - if (static_branch_unlikely(&init_on_alloc) && - !page_poisoning_enabled()) + if (static_branch_unlikely(&init_on_alloc)) return true; return flags & __GFP_ZERO; } -#ifdef CONFIG_INIT_ON_FREE_DEFAULT_ON -DECLARE_STATIC_KEY_TRUE(init_on_free); -#else DECLARE_STATIC_KEY_FALSE(init_on_free); -#endif static inline bool want_init_on_free(void) { - return static_branch_unlikely(&init_on_free) && - !page_poisoning_enabled(); + return static_branch_unlikely(&init_on_free); } -#ifdef CONFIG_DEBUG_PAGEALLOC -extern bool _debug_pagealloc_enabled; -extern void __kernel_map_pages(struct page *page, int numpages, int enable); +extern bool _debug_pagealloc_enabled_early; +DECLARE_STATIC_KEY_FALSE(_debug_pagealloc_enabled); static inline bool debug_pagealloc_enabled(void) { - return _debug_pagealloc_enabled; + return IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) && + _debug_pagealloc_enabled_early; } +/* + * For use in fast paths after init_debug_pagealloc() has run, or when a + * false negative result is not harmful when called too early. 
+ */ +static inline bool debug_pagealloc_enabled_static(void) +{ + if (!IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) + return false; + + return static_branch_unlikely(&_debug_pagealloc_enabled); +} + +#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_ARCH_HAS_SET_DIRECT_MAP) +extern void __kernel_map_pages(struct page *page, int numpages, int enable); + +/* + * When called in DEBUG_PAGEALLOC context, the call should most likely be + * guarded by debug_pagealloc_enabled() or debug_pagealloc_enabled_static() + */ static inline void kernel_map_pages(struct page *page, int numpages, int enable) { - if (!debug_pagealloc_enabled()) - return; - __kernel_map_pages(page, numpages, enable); } + +static inline void debug_pagealloc_map_pages(struct page *page, int numpages) +{ + if (debug_pagealloc_enabled_static()) + __kernel_map_pages(page, numpages, 1); +} + +static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) +{ + if (debug_pagealloc_enabled_static()) + __kernel_map_pages(page, numpages, 0); +} + #ifdef CONFIG_HIBERNATION extern bool kernel_page_present(struct page *page); #endif /* CONFIG_HIBERNATION */ -#else /* CONFIG_DEBUG_PAGEALLOC */ +#else /* CONFIG_DEBUG_PAGEALLOC || CONFIG_ARCH_HAS_SET_DIRECT_MAP */ static inline void kernel_map_pages(struct page *page, int numpages, int enable) {} +static inline void debug_pagealloc_map_pages(struct page *page, int numpages) {} +static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) {} #ifdef CONFIG_HIBERNATION static inline bool kernel_page_present(struct page *page) { return true; } #endif /* CONFIG_HIBERNATION */ -static inline bool debug_pagealloc_enabled(void) -{ - return false; -} -#endif /* CONFIG_DEBUG_PAGEALLOC */ +#endif /* CONFIG_DEBUG_PAGEALLOC || CONFIG_ARCH_HAS_SET_DIRECT_MAP */ #ifdef __HAVE_ARCH_GATE_AREA extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm); @@ -2772,8 +3184,8 @@ #ifdef CONFIG_SYSCTL extern int sysctl_drop_caches; -int drop_caches_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); +int drop_caches_sysctl_handler(struct ctl_table *, int, void *, size_t *, + loff_t *); #endif void drop_slab(void); @@ -2786,23 +3198,30 @@ #endif const char * arch_vma_name(struct vm_area_struct *vma); +#ifdef CONFIG_MMU void print_vma_addr(char *prefix, unsigned long rip); +#else +static inline void print_vma_addr(char *prefix, unsigned long rip) +{ +} +#endif void *sparse_buffer_alloc(unsigned long size); -struct page *sparse_mem_map_populate(unsigned long pnum, int nid, - struct vmem_altmap *altmap); +struct page * __populate_section_memmap(unsigned long pfn, + unsigned long nr_pages, int nid, struct vmem_altmap *altmap); pgd_t *vmemmap_pgd_populate(unsigned long addr, int node); p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node); pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node); pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node); -pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node); +pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node, + struct vmem_altmap *altmap); void *vmemmap_alloc_block(unsigned long size, int node); struct vmem_altmap; -void *vmemmap_alloc_block_buf(unsigned long size, int node); -void *altmap_alloc_block_buf(unsigned long size, struct vmem_altmap *altmap); +void *vmemmap_alloc_block_buf(unsigned long size, int node, + struct vmem_altmap *altmap); void vmemmap_verify(pte_t *, int, unsigned long, unsigned long); int 
vmemmap_populate_basepages(unsigned long start, unsigned long end, - int node); + int node, struct vmem_altmap *altmap); int vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); void vmemmap_populate_print_last(void); @@ -2821,14 +3240,13 @@ }; extern int memory_failure(unsigned long pfn, int flags); extern void memory_failure_queue(unsigned long pfn, int flags); +extern void memory_failure_queue_kick(int cpu); extern int unpoison_memory(unsigned long pfn); -extern int get_hwpoison_page(struct page *page); -#define put_hwpoison_page(page) put_page(page) extern int sysctl_memory_failure_early_kill; extern int sysctl_memory_failure_recovery; extern void shake_page(struct page *p, int access); extern atomic_long_t num_poisoned_pages __read_mostly; -extern int soft_offline_page(struct page *page, int flags); +extern int soft_offline_page(unsigned long pfn, int flags); /* @@ -2863,6 +3281,7 @@ MF_MSG_BUDDY, MF_MSG_BUDDY_2ND, MF_MSG_DAX, + MF_MSG_UNSPLIT_THP, MF_MSG_UNKNOWN, }; @@ -2878,13 +3297,28 @@ const void __user *usr_src, unsigned int pages_per_huge_page, bool allow_pagefault); -#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ -extern struct page_ext_operations debug_guardpage_ops; +/** + * vma_is_special_huge - Are transhuge page-table entries considered special? + * @vma: Pointer to the struct vm_area_struct to consider + * + * Whether transhuge page-table entries are considered "special" following + * the definition in vm_normal_page(). + * + * Return: true if transhuge page-table entries should be considered special, + * false otherwise. + */ +static inline bool vma_is_special_huge(const struct vm_area_struct *vma) +{ + return vma_is_dax(vma) || (vma->vm_file && + (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))); +} + +#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ #ifdef CONFIG_DEBUG_PAGEALLOC extern unsigned int _debug_guardpage_minorder; -extern bool _debug_guardpage_enabled; +DECLARE_STATIC_KEY_FALSE(_debug_guardpage_enabled); static inline unsigned int debug_guardpage_minorder(void) { @@ -2893,21 +3327,15 @@ static inline bool debug_guardpage_enabled(void) { - return _debug_guardpage_enabled; + return static_branch_unlikely(&_debug_guardpage_enabled); } static inline bool page_is_guard(struct page *page) { - struct page_ext *page_ext; - if (!debug_guardpage_enabled()) return false; - page_ext = lookup_page_ext(page); - if (unlikely(!page_ext)) - return false; - - return test_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); + return PageGuard(page); } #else static inline unsigned int debug_guardpage_minorder(void) { return 0; } @@ -2921,5 +3349,60 @@ static inline void setup_nr_node_ids(void) {} #endif +extern int memcmp_pages(struct page *page1, struct page *page2); + +static inline int pages_identical(struct page *page1, struct page *page2) +{ + return !memcmp_pages(page1, page2); +} + +#ifdef CONFIG_MAPPING_DIRTY_HELPERS +unsigned long clean_record_shared_mapping_range(struct address_space *mapping, + pgoff_t first_index, pgoff_t nr, + pgoff_t bitmap_pgoff, + unsigned long *bitmap, + pgoff_t *start, + pgoff_t *end); + +unsigned long wp_shared_mapping_range(struct address_space *mapping, + pgoff_t first_index, pgoff_t nr); +#endif + +extern int sysctl_nr_trim_pages; +extern bool pte_map_lock_addr(struct vm_fault *vmf, unsigned long addr); +extern int reclaim_shmem_address_space(struct address_space *mapping); + +/** + * seal_check_future_write - Check for F_SEAL_FUTURE_WRITE flag and handle it + * @seals: the seals to 
check
+ * @vma: the vma to operate on
+ *
+ * Check whether F_SEAL_FUTURE_WRITE is set; if so, do proper check/handling on
+ * the vma flags. Return 0 if check pass, or <0 for errors.
+ */
+static inline int seal_check_future_write(int seals, struct vm_area_struct *vma)
+{
+	if (seals & F_SEAL_FUTURE_WRITE) {
+		/*
+		 * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
+		 * "future write" seal active.
+		 */
+		if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
+			return -EPERM;
+
+		/*
+		 * Since an F_SEAL_FUTURE_WRITE sealed memfd can be mapped as
+		 * MAP_SHARED and read-only, take care to not allow mprotect to
+		 * revert protections on such mappings. Do this only for shared
+		 * mappings. For private mappings, don't need to mask
+		 * VM_MAYWRITE as we still want them to be COW-writable.
+		 */
+		if (vma->vm_flags & VM_SHARED)
+			vma->vm_flags &= ~(VM_MAYWRITE);
+	}
+
+	return 0;
+}
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
--
Gitblit v1.6.2
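The rewritten memblock comment in this patch replaces free_area_init_nodes() with free_area_init(). Below is a minimal sketch of the arch-side sequence that comment describes; the function name example_arch_paging_init and the particular zone limits are assumptions for illustration, not taken from the patch.

#include <linux/init.h>
#include <linux/memblock.h>
#include <linux/mm.h>

void __init example_arch_paging_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES] = { 0 };

	/*
	 * Physical ranges are assumed to have been registered earlier with
	 * memblock_add()/memblock_add_node(); free_area_init() then sizes
	 * each zone from the per-zone PFN limits filled in below.
	 */
	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
#ifdef CONFIG_HIGHMEM
	max_zone_pfns[ZONE_HIGHMEM] = max_pfn;
#endif

	free_area_init(max_zone_pfns);
}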
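vm_unmapped_area() turns from the inline topdown/bottom-up dispatcher removed above into a single out-of-line function; callers still describe the search with struct vm_unmapped_area_info. A hedged sketch of a get_unmapped_area-style caller follows; the chosen limits and the example_ name are illustrative only.

#include <linux/mm.h>
#include <linux/sched.h>

static unsigned long example_get_unmapped_area(struct file *filp,
					       unsigned long addr,
					       unsigned long len,
					       unsigned long pgoff,
					       unsigned long flags)
{
	struct vm_unmapped_area_info info = {
		/* top-down search; the dispatch now lives inside vm_unmapped_area() */
		.flags		= VM_UNMAPPED_AREA_TOPDOWN,
		.length		= len,
		.low_limit	= PAGE_SIZE,
		.high_limit	= current->mm->mmap_base,
		.align_mask	= 0,
		.align_offset	= 0,
	};

	return vm_unmapped_area(&info);
}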
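The int-returning vm_insert_pfn()/vm_insert_mixed() helpers are replaced by vm_fault_t-returning vmf_insert_pfn()/vmf_insert_mixed() (plus the new _prot variants), so a fault handler can return the result directly instead of translating errno values by hand. A minimal sketch of a PFN-mapping fault handler using the new API; example_base_pfn and the pgoff arithmetic are assumptions standing in for real driver state.

#include <linux/mm.h>

/* Stand-in for however a real driver locates the backing PFN. */
static unsigned long example_base_pfn;

static vm_fault_t example_vm_fault(struct vm_fault *vmf)
{
	unsigned long pfn = example_base_pfn + vmf->pgoff;

	/*
	 * vmf_insert_pfn() already folds the old -ENOMEM/-EBUSY handling
	 * into a vm_fault_t, so no manual conversion is needed.
	 */
	return vmf_insert_pfn(vmf->vma, vmf->address, pfn);
}

static const struct vm_operations_struct example_vm_ops = {
	.fault = example_vm_fault,
};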
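The FOLL_PIN/FOLL_LONGTERM comment block added here pairs with the pin_user_pages*()/unpin_user_page*() API. The sketch below shows the long-term pin pattern that comment describes; the example_ function names and the assumption that pin_user_pages_fast() takes (start, nr_pages, gup_flags, pages) are mine, not part of this hunk.

#include <linux/mm.h>

static int example_pin_user_buffer(unsigned long uaddr, int nr_pages,
				   struct page **pages)
{
	int pinned;

	/* FOLL_LONGTERM: the pin may be held indefinitely (e.g. RDMA). */
	pinned = pin_user_pages_fast(uaddr, nr_pages,
				     FOLL_WRITE | FOLL_LONGTERM, pages);
	if (pinned < 0)
		return pinned;
	if (pinned != nr_pages) {
		/* partial pin: release what we got with the matching API */
		unpin_user_pages(pages, pinned);
		return -EFAULT;
	}
	return 0;
}

static void example_unpin_user_buffer(struct page **pages, int nr_pages)
{
	/* FOLL_PIN pages must go back via unpin_user_page*(), never put_page() */
	unpin_user_pages(pages, nr_pages);
}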
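pte_fn_t loses its pgtable_t token argument, and apply_to_existing_page_range() is added alongside apply_to_page_range(). A small sketch of a callback with the new (pte, addr, data) signature; counting present PTEs is an arbitrary example, not something this patch does.

#include <linux/mm.h>
#include <linux/pgtable.h>

static int example_count_present(pte_t *pte, unsigned long addr, void *data)
{
	unsigned long *count = data;

	if (pte_present(*pte))
		(*count)++;
	return 0;
}

static unsigned long example_count_present_ptes(struct mm_struct *mm,
						unsigned long addr,
						unsigned long size)
{
	unsigned long count = 0;

	/*
	 * The _existing_ variant walks only page tables that are already
	 * populated; apply_to_page_range() would allocate missing levels.
	 */
	if (apply_to_existing_page_range(mm, addr, size,
					 example_count_present, &count))
		return 0;
	return count;
}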
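Callers of kernel_map_pages() used to open-code the debug_pagealloc_enabled() check; the new debug_pagealloc_map_pages()/debug_pagealloc_unmap_pages() and kernel_poison_pages()/kernel_unpoison_pages() wrappers fold the static-key checks in and compile to no-ops when the features are off. An illustrative allocator-side pairing; the hook names are invented for the example.

#include <linux/mm.h>

static void example_prep_new_pages(struct page *page, unsigned int order)
{
	/* re-establish the direct mapping if DEBUG_PAGEALLOC removed it */
	debug_pagealloc_map_pages(page, 1 << order);
	/* verify the poison pattern written when the pages were freed */
	kernel_unpoison_pages(page, 1 << order);
}

static void example_free_pages_hook(struct page *page, unsigned int order)
{
	kernel_poison_pages(page, 1 << order);
	debug_pagealloc_unmap_pages(page, 1 << order);
}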
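seal_check_future_write(), added at the end of the header in the hunk above, is intended to be called from the ->mmap handler of a memfd-capable filesystem before the mapping is set up. A hedged sketch of such a call site; example_get_seals() is a placeholder for however the filesystem stores its seals (shmem keeps them in its per-inode info), and example_fs_mmap is not a real handler.

#include <linux/fs.h>
#include <linux/mm.h>

static int example_get_seals(struct inode *inode)
{
	/* placeholder: a real filesystem reads the seals from its inode info */
	return 0;
}

static int example_fs_mmap(struct file *file, struct vm_area_struct *vma)
{
	int ret = seal_check_future_write(example_get_seals(file_inode(file)), vma);

	if (ret)
		return ret;	/* -EPERM: shared writable mapping of a sealed memfd */

	file_accessed(file);
	return 0;
}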