From 102a0743326a03cd1a1202ceda21e175b7d3575c Mon Sep 17 00:00:00 2001 From: hc <hc@nodka.com> Date: Tue, 20 Feb 2024 01:20:52 +0000 Subject: [PATCH] add new system file --- kernel/mm/util.c | 480 ++++++++++++++++++++++++++++++++++++++++++++--------------- 1 files changed, 355 insertions(+), 125 deletions(-) diff --git a/kernel/mm/util.c b/kernel/mm/util.c index 65b4ea6..6ed9861 100644 --- a/kernel/mm/util.c +++ b/kernel/mm/util.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only #include <linux/mm.h> #include <linux/slab.h> #include <linux/string.h> @@ -6,6 +7,7 @@ #include <linux/err.h> #include <linux/sched.h> #include <linux/sched/mm.h> +#include <linux/sched/signal.h> #include <linux/sched/task_stack.h> #include <linux/security.h> #include <linux/swap.h> @@ -14,17 +16,21 @@ #include <linux/hugetlb.h> #include <linux/vmalloc.h> #include <linux/userfaultfd_k.h> +#include <linux/elf.h> +#include <linux/elf-randomize.h> +#include <linux/personality.h> +#include <linux/random.h> +#include <linux/processor.h> +#include <linux/sizes.h> +#include <linux/compat.h> -#include <asm/sections.h> #include <linux/uaccess.h> #include "internal.h" - -static inline int is_kernel_rodata(unsigned long addr) -{ - return addr >= (unsigned long)__start_rodata && - addr < (unsigned long)__end_rodata; -} +#ifndef __GENKSYMS__ +#include <trace/hooks/syscall_check.h> +#include <trace/hooks/mm.h> +#endif /** * kfree_const - conditionally free memory @@ -43,6 +49,8 @@ * kstrdup - allocate space for and copy an existing string * @s: the string to duplicate * @gfp: the GFP mask used in the kmalloc() call when allocating memory + * + * Return: newly allocated copy of @s or %NULL in case of error */ char *kstrdup(const char *s, gfp_t gfp) { @@ -65,9 +73,11 @@ * @s: the string to duplicate * @gfp: the GFP mask used in the kmalloc() call when allocating memory * - * Function returns source string if it is in .rodata section otherwise it - * fallbacks to kstrdup. - * Strings allocated by kstrdup_const should be freed by kfree_const. + * Note: Strings allocated by kstrdup_const should be freed by kfree_const and + * must not be passed to krealloc(). + * + * Return: source string if it is in .rodata section otherwise + * fallback to kstrdup. */ const char *kstrdup_const(const char *s, gfp_t gfp) { @@ -85,6 +95,8 @@ * @gfp: the GFP mask used in the kmalloc() call when allocating memory * * Note: Use kmemdup_nul() instead if the size is known exactly. + * + * Return: newly allocated copy of @s or %NULL in case of error */ char *kstrndup(const char *s, size_t max, gfp_t gfp) { @@ -110,6 +122,8 @@ * @src: memory region to duplicate * @len: memory region length * @gfp: GFP mask to use + * + * Return: newly allocated copy of @src or %NULL in case of error */ void *kmemdup(const void *src, size_t len, gfp_t gfp) { @@ -127,6 +141,9 @@ * @s: The data to stringify * @len: The size of the data * @gfp: the GFP mask used in the kmalloc() call when allocating memory + * + * Return: newly allocated copy of @s with NUL-termination or %NULL in + * case of error */ char *kmemdup_nul(const char *s, size_t len, gfp_t gfp) { @@ -150,14 +167,14 @@ * @src: source address in user space * @len: number of bytes to copy * - * Returns an ERR_PTR() on failure. Result is physically + * Return: an ERR_PTR() on failure. Result is physically * contiguous, to be freed by kfree(). 
*/ void *memdup_user(const void __user *src, size_t len) { void *p; - p = kmalloc_track_caller(len, GFP_USER); + p = kmalloc_track_caller(len, GFP_USER | __GFP_NOWARN); if (!p) return ERR_PTR(-ENOMEM); @@ -176,7 +193,7 @@ * @src: source address in user space * @len: number of bytes to copy * - * Returns an ERR_PTR() on failure. Result may be not + * Return: an ERR_PTR() on failure. Result may be not * physically contiguous. Use kvfree() to free. */ void *vmemdup_user(const void __user *src, size_t len) @@ -200,6 +217,8 @@ * strndup_user - duplicate an existing string from user space * @s: The string to duplicate * @n: Maximum number of bytes to copy, including the trailing NUL. + * + * Return: newly allocated copy of @s or an ERR_PTR() in case of error */ char *strndup_user(const char __user *s, long n) { @@ -231,7 +250,7 @@ * @src: source address in user space * @len: number of bytes to copy * - * Returns an ERR_PTR() on failure. + * Return: an ERR_PTR() on failure. */ void *memdup_user_nul(const void __user *src, size_t len) { @@ -257,7 +276,7 @@ EXPORT_SYMBOL(memdup_user_nul); void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, - struct vm_area_struct *prev, struct rb_node *rb_parent) + struct vm_area_struct *prev) { struct vm_area_struct *next; @@ -266,16 +285,26 @@ next = prev->vm_next; prev->vm_next = vma; } else { + next = mm->mmap; mm->mmap = vma; - if (rb_parent) - next = rb_entry(rb_parent, - struct vm_area_struct, vm_rb); - else - next = NULL; } vma->vm_next = next; if (next) next->vm_prev = vma; +} + +void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma) +{ + struct vm_area_struct *prev, *next; + + next = vma->vm_next; + prev = vma->vm_prev; + if (prev) + prev->vm_next = next; + else + mm->mmap = next; + if (next) + next->vm_prev = prev; } /* Check if the vma is being used as a stack by this task */ @@ -286,7 +315,138 @@ return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t)); } -#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) +#ifndef STACK_RND_MASK +#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */ +#endif + +unsigned long randomize_stack_top(unsigned long stack_top) +{ + unsigned long random_variable = 0; + + if (current->flags & PF_RANDOMIZE) { + random_variable = get_random_long(); + random_variable &= STACK_RND_MASK; + random_variable <<= PAGE_SHIFT; + } +#ifdef CONFIG_STACK_GROWSUP + return PAGE_ALIGN(stack_top) + random_variable; +#else + return PAGE_ALIGN(stack_top) - random_variable; +#endif +} + +/** + * randomize_page - Generate a random, page aligned address + * @start: The smallest acceptable address the caller will take. + * @range: The size of the area, starting at @start, within which the + * random address must fall. + * + * If @start + @range would overflow, @range is capped. + * + * NOTE: Historical use of randomize_range, which this replaces, presumed that + * @start was already page aligned. We now align it regardless. + * + * Return: A page aligned address within [start, start + range). On error, + * @start is returned. 
+ */ +unsigned long randomize_page(unsigned long start, unsigned long range) +{ + if (!PAGE_ALIGNED(start)) { + range -= PAGE_ALIGN(start) - start; + start = PAGE_ALIGN(start); + } + + if (start > ULONG_MAX - range) + range = ULONG_MAX - start; + + range >>= PAGE_SHIFT; + + if (range == 0) + return start; + + return start + (get_random_long() % range << PAGE_SHIFT); +} + +#ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT +unsigned long arch_randomize_brk(struct mm_struct *mm) +{ + /* Is the current task 32bit ? */ + if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task()) + return randomize_page(mm->brk, SZ_32M); + + return randomize_page(mm->brk, SZ_1G); +} + +unsigned long arch_mmap_rnd(void) +{ + unsigned long rnd; + +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS + if (is_compat_task()) + rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1); + else +#endif /* CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS */ + rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1); + + return rnd << PAGE_SHIFT; +} +EXPORT_SYMBOL_GPL(arch_mmap_rnd); + +static int mmap_is_legacy(struct rlimit *rlim_stack) +{ + if (current->personality & ADDR_COMPAT_LAYOUT) + return 1; + + if (rlim_stack->rlim_cur == RLIM_INFINITY) + return 1; + + return sysctl_legacy_va_layout; +} + +/* + * Leave enough space between the mmap area and the stack to honour ulimit in + * the face of randomisation. + */ +#define MIN_GAP (SZ_128M) +#define MAX_GAP (STACK_TOP / 6 * 5) + +static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) +{ + unsigned long gap = rlim_stack->rlim_cur; + unsigned long pad = stack_guard_gap; + + /* Account for stack randomization if necessary */ + if (current->flags & PF_RANDOMIZE) + pad += (STACK_RND_MASK << PAGE_SHIFT); + + /* Values close to RLIM_INFINITY can overflow. */ + if (gap + pad > gap) + gap += pad; + + if (gap < MIN_GAP) + gap = MIN_GAP; + else if (gap > MAX_GAP) + gap = MAX_GAP; + + return PAGE_ALIGN(STACK_TOP - gap - rnd); +} + +void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) +{ + unsigned long random_factor = 0UL; + + if (current->flags & PF_RANDOMIZE) + random_factor = arch_mmap_rnd(); + + if (mmap_is_legacy(rlim_stack)) { + mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; + mm->get_unmapped_area = arch_get_unmapped_area; + } else { + mm->mmap_base = mmap_base(random_factor, rlim_stack); + mm->get_unmapped_area = arch_get_unmapped_area_topdown; + } +} +#elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { mm->mmap_base = TASK_UNMAPPED_BASE; @@ -294,52 +454,79 @@ } #endif -/* - * Like get_user_pages_fast() except its IRQ-safe in that it won't fall - * back to the regular GUP. - * Note a difference with get_user_pages_fast: this always returns the - * number of pages pinned, 0 if no pages were pinned. - * If the architecture does not support this function, simply return with no - * pages pinned. +/** + * __account_locked_vm - account locked pages to an mm's locked_vm + * @mm: mm to account against + * @pages: number of pages to account + * @inc: %true if @pages should be considered positive, %false if not + * @task: task used to check RLIMIT_MEMLOCK + * @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped + * + * Assumes @task and @mm are valid (i.e. at least one reference on each), and + * that mmap_lock is held as writer. + * + * Return: + * * 0 on success + * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded. 
*/ -int __weak __get_user_pages_fast(unsigned long start, - int nr_pages, int write, struct page **pages) +int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc, + struct task_struct *task, bool bypass_rlim) { - return 0; + unsigned long locked_vm, limit; + int ret = 0; + + mmap_assert_write_locked(mm); + + locked_vm = mm->locked_vm; + if (inc) { + if (!bypass_rlim) { + limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT; + if (locked_vm + pages > limit) + ret = -ENOMEM; + } + if (!ret) + mm->locked_vm = locked_vm + pages; + } else { + WARN_ON_ONCE(pages > locked_vm); + mm->locked_vm = locked_vm - pages; + } + + pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__, task->pid, + (void *)_RET_IP_, (inc) ? '+' : '-', pages << PAGE_SHIFT, + locked_vm << PAGE_SHIFT, task_rlimit(task, RLIMIT_MEMLOCK), + ret ? " - exceeded" : ""); + + return ret; } -EXPORT_SYMBOL_GPL(__get_user_pages_fast); +EXPORT_SYMBOL_GPL(__account_locked_vm); /** - * get_user_pages_fast() - pin user pages in memory - * @start: starting user address - * @nr_pages: number of pages from start to pin - * @write: whether pages will be written to - * @pages: array that receives pointers to the pages pinned. - * Should be at least nr_pages long. + * account_locked_vm - account locked pages to an mm's locked_vm + * @mm: mm to account against, may be NULL + * @pages: number of pages to account + * @inc: %true if @pages should be considered positive, %false if not * - * Returns number of pages pinned. This may be fewer than the number - * requested. If nr_pages is 0 or negative, returns 0. If no pages - * were pinned, returns -errno. + * Assumes a non-NULL @mm is valid (i.e. at least one reference on it). * - * get_user_pages_fast provides equivalent functionality to get_user_pages, - * operating on current and current->mm, with force=0 and vma=NULL. However - * unlike get_user_pages, it must be called without mmap_sem held. - * - * get_user_pages_fast may take mmap_sem and page table locks, so no - * assumptions can be made about lack of locking. get_user_pages_fast is to be - * implemented in a way that is advantageous (vs get_user_pages()) when the - * user memory area is already faulted in and present in ptes. However if the - * pages have to be faulted in, it may turn out to be slightly slower so - * callers need to carefully consider what to use. On many architectures, - * get_user_pages_fast simply falls back to get_user_pages. + * Return: + * * 0 on success, or if mm is NULL + * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded. */ -int __weak get_user_pages_fast(unsigned long start, - int nr_pages, int write, struct page **pages) +int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc) { - return get_user_pages_unlocked(start, nr_pages, pages, - write ? 
FOLL_WRITE : 0); + int ret; + + if (pages == 0 || !mm) + return 0; + + mmap_write_lock(mm); + ret = __account_locked_vm(mm, pages, inc, current, + capable(CAP_IPC_LOCK)); + mmap_write_unlock(mm); + + return ret; } -EXPORT_SYMBOL_GPL(get_user_pages_fast); +EXPORT_SYMBOL_GPL(account_locked_vm); unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, @@ -352,15 +539,16 @@ ret = security_mmap_file(file, prot, flag); if (!ret) { - if (down_write_killable(&mm->mmap_sem)) + if (mmap_write_lock_killable(mm)) return -EINTR; - ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff, - &populate, &uf); - up_write(&mm->mmap_sem); + ret = do_mmap(file, addr, len, prot, flag, pgoff, &populate, + &uf); + mmap_write_unlock(mm); userfaultfd_unmap_complete(mm, &uf); if (populate) mm_populate(ret, populate); } + trace_android_vh_check_mmap_file(file, prot, flag, ret); return ret; } @@ -393,11 +581,14 @@ * * Please note that any use of gfp flags outside of GFP_KERNEL is careful to not * fall back to vmalloc. + * + * Return: pointer to the allocated memory of %NULL in case of failure */ void *kvmalloc_node(size_t size, gfp_t flags, int node) { gfp_t kmalloc_flags = flags; void *ret; + bool use_vmalloc = false; /* * vmalloc uses GFP_KERNEL for some internal allocations (e.g page tables) @@ -405,6 +596,10 @@ */ if ((flags & GFP_KERNEL) != GFP_KERNEL) return kmalloc_node(size, flags, node); + + trace_android_vh_kvmalloc_node_use_vmalloc(size, &kmalloc_flags, &use_vmalloc); + if (use_vmalloc) + goto use_vmalloc_node; /* * We want to attempt a large physically contiguous block first because @@ -429,7 +624,14 @@ if (ret || size <= PAGE_SIZE) return ret; - return __vmalloc_node_flags_caller(size, node, flags, + /* Don't even allow crazy sizes */ + if (unlikely(size > INT_MAX)) { + WARN_ON_ONCE(!(flags & __GFP_NOWARN)); + return NULL; + } + +use_vmalloc_node: + return __vmalloc_node(size, 1, flags, node, __builtin_return_address(0)); } EXPORT_SYMBOL(kvmalloc_node); @@ -442,7 +644,7 @@ * It is slightly more efficient to use kfree() or vfree() if you are certain * that you know which one to use. * - * Context: Any context except NMI. + * Context: Either preemptible task context or not-NMI interrupt. 
*/ void kvfree(const void *addr) { @@ -470,6 +672,21 @@ } } EXPORT_SYMBOL(kvfree_sensitive); + +void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags) +{ + void *newp; + + if (oldsize >= newsize) + return (void *)p; + newp = kvmalloc(newsize, flags); + if (!newp) + return NULL; + memcpy(newp, p, oldsize); + kvfree(p); + return newp; +} +EXPORT_SYMBOL(kvrealloc); static inline void *__page_rmapping(struct page *page) { @@ -503,7 +720,7 @@ return true; if (PageHuge(page)) return false; - for (i = 0; i < (1 << compound_order(page)); i++) { + for (i = 0; i < compound_nr(page); i++) { if (atomic_read(&page[i]._mapcount) >= 0) return true; } @@ -584,9 +801,8 @@ unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ -int overcommit_ratio_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +int overcommit_ratio_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int ret; @@ -596,9 +812,49 @@ return ret; } -int overcommit_kbytes_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +static void sync_overcommit_as(struct work_struct *dummy) +{ + percpu_counter_sync(&vm_committed_as); +} + +int overcommit_policy_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) +{ + struct ctl_table t; + int new_policy = -1; + int ret; + + /* + * The deviation of sync_overcommit_as could be big with loose policy + * like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. When changing policy to + * strict OVERCOMMIT_NEVER, we need to reduce the deviation to comply + * with the strict "NEVER", and to avoid possible race condtion (even + * though user usually won't too frequently do the switching to policy + * OVERCOMMIT_NEVER), the switch is done in the following order: + * 1. changing the batch + * 2. sync percpu count on each CPU + * 3. switch the policy + */ + if (write) { + t = *table; + t.data = &new_policy; + ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); + if (ret || new_policy == -1) + return ret; + + mm_compute_batch(new_policy); + if (new_policy == OVERCOMMIT_NEVER) + schedule_on_each_cpu(sync_overcommit_as); + sysctl_overcommit_memory = new_policy; + } else { + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + } + + return ret; +} + +int overcommit_kbytes_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int ret; @@ -618,7 +874,7 @@ if (sysctl_overcommit_kbytes) allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10); else - allowed = ((totalram_pages - hugetlb_total_pages()) + allowed = ((totalram_pages() - hugetlb_total_pages()) * sysctl_overcommit_ratio / 100); allowed += total_swap_pages; @@ -638,10 +894,15 @@ * balancing memory across competing virtual machines that are hosted. * Several metrics drive this policy engine including the guest reported * memory commitment. + * + * The time cost of this is very low for small platforms, and for big + * platform like a 2S/36C/72T Skylake server, in worst case where + * vm_committed_as's spinlock is under severe contention, the time cost + * could be about 30~40 microseconds. 
*/ unsigned long vm_memory_committed(void) { - return percpu_counter_read_positive(&vm_committed_as); + return percpu_counter_sum_positive(&vm_committed_as); } EXPORT_SYMBOL_GPL(vm_memory_committed); @@ -663,11 +924,7 @@ */ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) { - long free, allowed, reserve; - - VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) < - -(s64)vm_committed_as_batch * num_online_cpus(), - "memory commitment underflow"); + long allowed; vm_acct_memory(pages); @@ -678,51 +935,9 @@ return 0; if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { - free = global_zone_page_state(NR_FREE_PAGES); - free += global_node_page_state(NR_FILE_PAGES); - - /* - * shmem pages shouldn't be counted as free in this - * case, they can't be purged, only swapped out, and - * that won't affect the overall amount of available - * memory in the system. - */ - free -= global_node_page_state(NR_SHMEM); - - free += get_nr_swap_pages(); - - /* - * Any slabs which are created with the - * SLAB_RECLAIM_ACCOUNT flag claim to have contents - * which are reclaimable, under pressure. The dentry - * cache and most inode caches should fall into this - */ - free += global_node_page_state(NR_SLAB_RECLAIMABLE); - - /* - * Part of the kernel memory, which can be released - * under memory pressure. - */ - free += global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE); - - /* - * Leave reserved pages. The pages are not for anonymous pages. - */ - if (free <= totalreserve_pages) + if (pages > totalram_pages() + total_swap_pages) goto error; - else - free -= totalreserve_pages; - - /* - * Reserve some for root - */ - if (!cap_sys_admin) - free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); - - if (free > pages) - return 0; - - goto error; + return 0; } allowed = vm_commit_limit(); @@ -736,7 +951,8 @@ * Don't let a single process grow so big a user can't recover */ if (mm) { - reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); + long reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); + allowed -= min_t(long, mm->total_vm / 32, reserve); } @@ -754,7 +970,8 @@ * @buffer: the buffer to copy to. * @buflen: the length of the buffer. Larger cmdline values are truncated * to this length. - * Returns the size of the cmdline field copied. Note that the copy does + * + * Return: the size of the cmdline field copied. Note that the copy does * not guarantee an ending NULL byte. */ int get_cmdline(struct task_struct *task, char *buffer, int buflen) @@ -768,12 +985,12 @@ if (!mm->arg_end) goto out_mm; /* Shh! No looking before we're done */ - down_read(&mm->mmap_sem); + spin_lock(&mm->arg_lock); arg_start = mm->arg_start; arg_end = mm->arg_end; env_start = mm->env_start; env_end = mm->env_end; - up_read(&mm->mmap_sem); + spin_unlock(&mm->arg_lock); len = arg_end - arg_start; @@ -805,3 +1022,16 @@ out: return res; } + +int __weak memcmp_pages(struct page *page1, struct page *page2) +{ + char *addr1, *addr2; + int ret; + + addr1 = kmap_atomic(page1); + addr2 = kmap_atomic(page2); + ret = memcmp(addr1, addr2, PAGE_SIZE); + kunmap_atomic(addr2); + kunmap_atomic(addr1); + return ret; +} -- Gitblit v1.6.2
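
Reviewer note, appended after the patch and not part of it: among the hunks above, this change introduces and exports two general-purpose helpers, kvrealloc() and account_locked_vm(). As a quick orientation for prospective callers, the following is a minimal, hypothetical usage sketch; the demo_* function names are invented for illustration, and it assumes a tree that already carries this patch (so both helpers exist with the signatures shown above).

/*
 * Illustrative only -- not part of the patch.  Shows the intended calling
 * pattern for kvrealloc() and account_locked_vm() as added above.
 */
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/slab.h>

/* Grow a buffer that may live in either slab or vmalloc space. */
static void *demo_grow_buf(void *buf, size_t oldsize, size_t newsize)
{
	/*
	 * kvrealloc() allocates the new buffer with kvmalloc(), copies
	 * @oldsize bytes and frees the old buffer with kvfree(), so the
	 * caller only ever owns a single pointer.  On failure the old
	 * buffer is left untouched and NULL is returned.
	 */
	return kvrealloc(buf, oldsize, newsize, GFP_KERNEL);
}

/* Charge @npages of long-term pinned pages against RLIMIT_MEMLOCK. */
static int demo_charge_pinned(unsigned long npages)
{
	struct mm_struct *mm = current->mm;
	int ret;

	/*
	 * account_locked_vm() takes mmap_lock itself and skips the limit
	 * check when the caller has CAP_IPC_LOCK; it returns -ENOMEM if
	 * the new total would exceed RLIMIT_MEMLOCK.
	 */
	ret = account_locked_vm(mm, npages, true);
	if (ret)
		return ret;

	/* ... pin the pages here ... */

	/* On teardown, undo the accounting by passing inc == false. */
	account_locked_vm(mm, npages, false);
	return 0;
}

The split between account_locked_vm() and __account_locked_vm() in the patch mirrors the usual kernel pattern: the former is self-locking for simple callers, while the latter is for callers that already hold mmap_lock for writing or need to charge against a task other than current.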