From 102a0743326a03cd1a1202ceda21e175b7d3575c Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Tue, 20 Feb 2024 01:20:52 +0000
Subject: [PATCH] add new system file

---
 kernel/mm/madvise.c | 663 +++++++++++++++++++++++++++++++++++++++++++-----------
 1 files changed, 522 insertions(+), 141 deletions(-)

diff --git a/kernel/mm/madvise.c b/kernel/mm/madvise.c
index 68a9626..7071c11 100644
--- a/kernel/mm/madvise.c
+++ b/kernel/mm/madvise.c
@@ -11,27 +11,39 @@
 #include <linux/syscalls.h>
 #include <linux/mempolicy.h>
 #include <linux/page-isolation.h>
+#include <linux/page_idle.h>
 #include <linux/userfaultfd_k.h>
 #include <linux/hugetlb.h>
 #include <linux/falloc.h>
+#include <linux/fadvise.h>
 #include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/uio.h>
 #include <linux/ksm.h>
 #include <linux/fs.h>
 #include <linux/file.h>
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
+#include <linux/pagewalk.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
 #include <linux/shmem_fs.h>
 #include <linux/mmu_notifier.h>
+#include <trace/hooks/mm.h>
 
 #include <asm/tlb.h>
 
 #include "internal.h"
 
+struct madvise_walk_private {
+	struct mmu_gather *tlb;
+	bool pageout;
+	bool can_pageout_file;
+};
+
 /*
  * Any behaviour which results in changes to the vma->vm_flags needs to
- * take mmap_sem for writing. Others, which simply traverse vmas, need
+ * take mmap_lock for writing. Others, which simply traverse vmas, need
  * to only take it for reading.
  */
 static int madvise_need_mmap_write(int behavior)
@@ -40,6 +52,8 @@
 	case MADV_REMOVE:
 	case MADV_WILLNEED:
 	case MADV_DONTNEED:
+	case MADV_COLD:
+	case MADV_PAGEOUT:
 	case MADV_FREE:
 		return 0;
 	default:
@@ -105,28 +119,14 @@
 	case MADV_MERGEABLE:
 	case MADV_UNMERGEABLE:
 		error = ksm_madvise(vma, start, end, behavior, &new_flags);
-		if (error) {
-			/*
-			 * madvise() returns EAGAIN if kernel resources, such as
-			 * slab, are temporarily unavailable.
-			 */
-			if (error == -ENOMEM)
-				error = -EAGAIN;
-			goto out;
-		}
+		if (error)
+			goto out_convert_errno;
 		break;
 	case MADV_HUGEPAGE:
 	case MADV_NOHUGEPAGE:
 		error = hugepage_madvise(vma, &new_flags, behavior);
-		if (error) {
-			/*
-			 * madvise() returns EAGAIN if kernel resources, such as
-			 * slab, are temporarily unavailable.
-			 */
-			if (error == -ENOMEM)
-				error = -EAGAIN;
-			goto out;
-		}
+		if (error)
+			goto out_convert_errno;
 		break;
 	}
 
@@ -152,15 +152,8 @@
 			goto out;
 		}
 		error = __split_vma(mm, vma, start, 1);
-		if (error) {
-			/*
-			 * madvise() returns EAGAIN if kernel resources, such as
-			 * slab, are temporarily unavailable.
-			 */
-			if (error == -ENOMEM)
-				error = -EAGAIN;
-			goto out;
-		}
+		if (error)
+			goto out_convert_errno;
 	}
 
 	if (end != vma->vm_end) {
@@ -169,22 +162,25 @@
 			goto out;
 		}
 		error = __split_vma(mm, vma, end, 0);
-		if (error) {
-			/*
-			 * madvise() returns EAGAIN if kernel resources, such as
-			 * slab, are temporarily unavailable.
-			 */
-			if (error == -ENOMEM)
-				error = -EAGAIN;
-			goto out;
-		}
+		if (error)
+			goto out_convert_errno;
 	}
 
 success:
 	/*
-	 * vm_flags is protected by the mmap_sem held in write mode.
+	 * vm_flags is protected by the mmap_lock held in write mode.
 	 */
-	vma->vm_flags = new_flags;
+	vm_write_begin(vma);
+	WRITE_ONCE(vma->vm_flags, new_flags);
+	vm_write_end(vma);
+
+out_convert_errno:
+	/*
+	 * madvise() returns EAGAIN if kernel resources, such as
+	 * slab, are temporarily unavailable.
+	 */
+	if (error == -ENOMEM)
+		error = -EAGAIN;
 out:
 	return error;
 }
@@ -225,43 +221,36 @@
 	return 0;
 }
 
-static void force_swapin_readahead(struct vm_area_struct *vma,
-		unsigned long start, unsigned long end)
-{
-	struct mm_walk walk = {
-		.mm = vma->vm_mm,
-		.pmd_entry = swapin_walk_pmd_entry,
-		.private = vma,
-	};
-
-	walk_page_range(start, end, &walk);
-
-	lru_add_drain();	/* Push any new pages onto the LRU now */
-}
+static const struct mm_walk_ops swapin_walk_ops = {
+	.pmd_entry = swapin_walk_pmd_entry,
+};
 
 static void force_shm_swapin_readahead(struct vm_area_struct *vma,
 		unsigned long start, unsigned long end,
 		struct address_space *mapping)
 {
-	pgoff_t index;
+	XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
+	pgoff_t end_index = linear_page_index(vma, end + PAGE_SIZE - 1);
 	struct page *page;
-	swp_entry_t swap;
 
-	for (; start < end; start += PAGE_SIZE) {
-		index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+	rcu_read_lock();
+	xas_for_each(&xas, page, end_index) {
+		swp_entry_t swap;
 
-		page = find_get_entry(mapping, index);
-		if (!radix_tree_exceptional_entry(page)) {
-			if (page)
-				put_page(page);
+		if (!xa_is_value(page))
 			continue;
-		}
+		xas_pause(&xas);
+		rcu_read_unlock();
+
 		swap = radix_to_swp_entry(page);
 		page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
 							NULL, 0, false);
 		if (page)
 			put_page(page);
+
+		rcu_read_lock();
 	}
+	rcu_read_unlock();
 
 	lru_add_drain();	/* Push any new pages onto the LRU now */
 }
@@ -274,12 +263,15 @@
 			     struct vm_area_struct **prev,
 			     unsigned long start, unsigned long end)
 {
+	struct mm_struct *mm = vma->vm_mm;
 	struct file *file = vma->vm_file;
+	loff_t offset;
 
 	*prev = vma;
 #ifdef CONFIG_SWAP
 	if (!file) {
-		force_swapin_readahead(vma, start, end);
+		walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
+		lru_add_drain(); /* Push any new pages onto the LRU now */
 		return 0;
 	}
 
@@ -298,12 +290,311 @@
 		return 0;
 	}
 
-	start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
-	if (end > vma->vm_end)
-		end = vma->vm_end;
-	end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+	/*
+	 * Filesystem's fadvise may need to take various locks. We need to
+	 * explicitly grab a reference because the vma (and hence the
+	 * vma's reference to the file) can go away as soon as we drop
+	 * mmap_lock.
+	 */
+	*prev = NULL;	/* tell sys_madvise we drop mmap_lock */
+	get_file(file);
+	offset = (loff_t)(start - vma->vm_start)
+			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
+	mmap_read_unlock(mm);
+	vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
+	fput(file);
+	mmap_read_lock(mm);
+	return 0;
+}
 
-	force_page_cache_readahead(file->f_mapping, file, start, end - start);
+static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
+				unsigned long addr, unsigned long end,
+				struct mm_walk *walk)
+{
+	struct madvise_walk_private *private = walk->private;
+	struct mmu_gather *tlb = private->tlb;
+	bool pageout = private->pageout;
+	bool pageout_anon_only = pageout && !private->can_pageout_file;
+	struct mm_struct *mm = tlb->mm;
+	struct vm_area_struct *vma = walk->vma;
+	pte_t *orig_pte, *pte, ptent;
+	spinlock_t *ptl;
+	struct page *page = NULL;
+	LIST_HEAD(page_list);
+	bool allow_shared = false;
+	bool abort_madvise = false;
+	bool skip = false;
+
+	trace_android_vh_madvise_cold_or_pageout_abort(vma, &abort_madvise);
+	if (fatal_signal_pending(current) || abort_madvise)
+		return -EINTR;
+
+	trace_android_vh_madvise_cold_or_pageout(vma, &allow_shared);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	if (pmd_trans_huge(*pmd)) {
+		pmd_t orig_pmd;
+		unsigned long next = pmd_addr_end(addr, end);
+
+		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
+		ptl = pmd_trans_huge_lock(pmd, vma);
+		if (!ptl)
+			return 0;
+
+		orig_pmd = *pmd;
+		if (is_huge_zero_pmd(orig_pmd))
+			goto huge_unlock;
+
+		if (unlikely(!pmd_present(orig_pmd))) {
+			VM_BUG_ON(thp_migration_supported() &&
+					!is_pmd_migration_entry(orig_pmd));
+			goto huge_unlock;
+		}
+
+		page = pmd_page(orig_pmd);
+
+		/* Do not interfere with other mappings of this page */
+		if (page_mapcount(page) != 1)
+			goto huge_unlock;
+
+		if (pageout_anon_only && !PageAnon(page))
+			goto huge_unlock;
+
+		if (next - addr != HPAGE_PMD_SIZE) {
+			int err;
+
+			get_page(page);
+			spin_unlock(ptl);
+			lock_page(page);
+			err = split_huge_page(page);
+			unlock_page(page);
+			put_page(page);
+			if (!err)
+				goto regular_page;
+			return 0;
+		}
+
+		if (pmd_young(orig_pmd)) {
+			pmdp_invalidate(vma, addr, pmd);
+			orig_pmd = pmd_mkold(orig_pmd);
+
+			set_pmd_at(mm, addr, pmd, orig_pmd);
+			tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+		}
+
+		ClearPageReferenced(page);
+		test_and_clear_page_young(page);
+		if (pageout) {
+			if (!isolate_lru_page(page)) {
+				if (PageUnevictable(page))
+					putback_lru_page(page);
+				else
+					list_add(&page->lru, &page_list);
+			}
+		} else
+			deactivate_page(page);
+huge_unlock:
+		spin_unlock(ptl);
+		if (pageout)
+			reclaim_pages(&page_list);
+		return 0;
+	}
+
+regular_page:
+	if (pmd_trans_unstable(pmd))
+		return 0;
+#endif
+	tlb_change_page_size(tlb, PAGE_SIZE);
+	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+	flush_tlb_batched_pending(mm);
+	arch_enter_lazy_mmu_mode();
+	for (; addr < end; pte++, addr += PAGE_SIZE) {
+		ptent = *pte;
+
+		if (pte_none(ptent))
+			continue;
+
+		if (!pte_present(ptent))
+			continue;
+
+		page = vm_normal_page(vma, addr, ptent);
+		if (!page)
+			continue;
+
+		trace_android_vh_should_end_madvise(mm, &skip, &pageout);
+		if (skip)
+			break;
+
+		/*
+		 * Creating a THP page is expensive so split it only if we
+		 * are sure it's worth it. Split it if we are the only owner.
+		 */
+		if (PageTransCompound(page)) {
+			if (page_mapcount(page) != 1)
+				break;
+			if (pageout_anon_only && !PageAnon(page))
+				break;
+			get_page(page);
+			if (!trylock_page(page)) {
+				put_page(page);
+				break;
+			}
+			pte_unmap_unlock(orig_pte, ptl);
+			if (split_huge_page(page)) {
+				unlock_page(page);
+				put_page(page);
+				pte_offset_map_lock(mm, pmd, addr, &ptl);
+				break;
+			}
+			unlock_page(page);
+			put_page(page);
+			pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+			pte--;
+			addr -= PAGE_SIZE;
+			continue;
+		}
+
+		/*
+		 * Do not interfere with other mappings of this page and
+		 * non-LRU page.
+		 */
+		if (!allow_shared && (!PageLRU(page) || page_mapcount(page) != 1))
+			continue;
+
+		if (pageout_anon_only && !PageAnon(page))
+			continue;
+
+		VM_BUG_ON_PAGE(PageTransCompound(page), page);
+
+		if (pte_young(ptent)) {
+			ptent = ptep_get_and_clear_full(mm, addr, pte,
+							tlb->fullmm);
+			ptent = pte_mkold(ptent);
+			set_pte_at(mm, addr, pte, ptent);
+			tlb_remove_tlb_entry(tlb, pte, addr);
+		}
+
+		/*
+		 * We are deactivating a page for accelerating reclaiming.
+		 * VM couldn't reclaim the page unless we clear PG_young.
+		 * As a side effect, it confuses idle-page tracking,
+		 * which will miss the recent referenced history.
+		 */
+		ClearPageReferenced(page);
+		test_and_clear_page_young(page);
+		if (pageout) {
+			if (!isolate_lru_page(page)) {
+				if (PageUnevictable(page))
+					putback_lru_page(page);
+				else {
+					list_add(&page->lru, &page_list);
+					trace_android_vh_page_isolated_for_reclaim(mm, page);
+				}
+			}
+		} else
+			deactivate_page(page);
+	}
+
+	arch_leave_lazy_mmu_mode();
+	pte_unmap_unlock(orig_pte, ptl);
+	if (pageout)
+		reclaim_pages(&page_list);
+	cond_resched();
+
+	return 0;
+}
+
+static const struct mm_walk_ops cold_walk_ops = {
+	.pmd_entry = madvise_cold_or_pageout_pte_range,
+};
+
+static void madvise_cold_page_range(struct mmu_gather *tlb,
+			     struct vm_area_struct *vma,
+			     unsigned long addr, unsigned long end)
+{
+	struct madvise_walk_private walk_private = {
+		.pageout = false,
+		.tlb = tlb,
+	};
+
+	tlb_start_vma(tlb, vma);
+	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
+	tlb_end_vma(tlb, vma);
+}
+
+static long madvise_cold(struct vm_area_struct *vma,
+			struct vm_area_struct **prev,
+			unsigned long start_addr, unsigned long end_addr)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	struct mmu_gather tlb;
+
+	*prev = vma;
+	if (!can_madv_lru_vma(vma))
+		return -EINVAL;
+
+	lru_add_drain();
+	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
+	madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
+	tlb_finish_mmu(&tlb, start_addr, end_addr);
+
+	return 0;
+}
+
+static void madvise_pageout_page_range(struct mmu_gather *tlb,
+			     struct vm_area_struct *vma,
+			     unsigned long addr, unsigned long end,
+			     bool can_pageout_file)
+{
+	struct madvise_walk_private walk_private = {
+		.pageout = true,
+		.tlb = tlb,
+		.can_pageout_file = can_pageout_file,
+	};
+
+	tlb_start_vma(tlb, vma);
+	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
+	tlb_end_vma(tlb, vma);
+}
+
+static inline bool can_do_file_pageout(struct vm_area_struct *vma)
+{
+	if (!vma->vm_file)
+		return false;
+	/*
+	 * paging out pagecache only for non-anonymous mappings that correspond
+	 * to the files the calling process could (if it tried) open for writing;
+	 * otherwise we'd be including shared non-exclusive mappings, which
+	 * opens a side channel.
+	 */
+	return inode_owner_or_capable(file_inode(vma->vm_file)) ||
+		inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
+}
+
+static long madvise_pageout(struct vm_area_struct *vma,
+			struct vm_area_struct **prev,
+			unsigned long start_addr, unsigned long end_addr)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	struct mmu_gather tlb;
+	bool can_pageout_file;
+
+	*prev = vma;
+	if (!can_madv_lru_vma(vma))
+		return -EINVAL;
+
+	/*
+	 * If the VMA belongs to a private file mapping, there can be private
+	 * dirty pages which can be paged out even if this process is neither
+	 * owner nor write capable of the file. Cache the file access check
+	 * here and use it later during page walk.
+	 */
+	can_pageout_file = can_do_file_pageout(vma);
+
+	lru_add_drain();
+	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
+	madvise_pageout_page_range(&tlb, vma, start_addr, end_addr, can_pageout_file);
+	tlb_finish_mmu(&tlb, start_addr, end_addr);
+
 	return 0;
 }
@@ -328,7 +619,7 @@
 	if (pmd_trans_unstable(pmd))
 		return 0;
 
-	tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
+	tlb_change_page_size(tlb, PAGE_SIZE);
 	orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 	flush_tlb_batched_pending(mm);
 	arch_enter_lazy_mmu_mode();
@@ -354,7 +645,7 @@
 			continue;
 		}
 
-		page = _vm_normal_page(vma, addr, ptent, true);
+		page = vm_normal_page(vma, addr, ptent);
 		if (!page)
 			continue;
 
@@ -440,47 +731,41 @@
 	return 0;
 }
 
-static void madvise_free_page_range(struct mmu_gather *tlb,
-			     struct vm_area_struct *vma,
-			     unsigned long addr, unsigned long end)
-{
-	struct mm_walk free_walk = {
-		.pmd_entry = madvise_free_pte_range,
-		.mm = vma->vm_mm,
-		.private = tlb,
-	};
-
-	tlb_start_vma(tlb, vma);
-	walk_page_range(addr, end, &free_walk);
-	tlb_end_vma(tlb, vma);
-}
+static const struct mm_walk_ops madvise_free_walk_ops = {
+	.pmd_entry = madvise_free_pte_range,
+};
 
 static int madvise_free_single_vma(struct vm_area_struct *vma,
 			unsigned long start_addr, unsigned long end_addr)
 {
-	unsigned long start, end;
 	struct mm_struct *mm = vma->vm_mm;
+	struct mmu_notifier_range range;
 	struct mmu_gather tlb;
 
	/* MADV_FREE works for only anon vma at the moment */
 	if (!vma_is_anonymous(vma))
 		return -EINVAL;
 
-	start = max(vma->vm_start, start_addr);
-	if (start >= vma->vm_end)
+	range.start = max(vma->vm_start, start_addr);
+	if (range.start >= vma->vm_end)
 		return -EINVAL;
-	end = min(vma->vm_end, end_addr);
-	if (end <= vma->vm_start)
+	range.end = min(vma->vm_end, end_addr);
+	if (range.end <= vma->vm_start)
 		return -EINVAL;
+	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
+				range.start, range.end);
 
 	lru_add_drain();
-	tlb_gather_mmu(&tlb, mm, start, end);
+	tlb_gather_mmu(&tlb, mm, range.start, range.end);
 	update_hiwater_rss(mm);
 
-	mmu_notifier_invalidate_range_start(mm, start, end);
-	madvise_free_page_range(&tlb, vma, start, end);
-	mmu_notifier_invalidate_range_end(mm, start, end);
-	tlb_finish_mmu(&tlb, start, end);
+	mmu_notifier_invalidate_range_start(&range);
+	tlb_start_vma(&tlb, vma);
+	walk_page_range(vma->vm_mm, range.start, range.end,
+			&madvise_free_walk_ops, &tlb);
+	tlb_end_vma(&tlb, vma);
+	mmu_notifier_invalidate_range_end(&range);
+	tlb_finish_mmu(&tlb, range.start, range.end);
 
 	return 0;
 }
@@ -516,15 +801,17 @@
 				  unsigned long start, unsigned long end,
 				  int behavior)
 {
+	struct mm_struct *mm = vma->vm_mm;
+
 	*prev = vma;
-	if (!can_madv_dontneed_vma(vma))
+	if (!can_madv_lru_vma(vma))
 		return -EINVAL;
 
 	if (!userfaultfd_remove(vma, start, end)) {
-		*prev = NULL; /* mmap_sem has been dropped, prev is stale */
+		*prev = NULL; /* mmap_lock has been dropped, prev is stale */
 
-		down_read(&current->mm->mmap_sem);
-		vma = find_vma(current->mm, start);
+		mmap_read_lock(mm);
+		vma = find_vma(mm, start);
 		if (!vma)
 			return -ENOMEM;
 		if (start < vma->vm_start) {
@@ -539,12 +826,12 @@
 			 */
 			return -ENOMEM;
 		}
-		if (!can_madv_dontneed_vma(vma))
+		if (!can_madv_lru_vma(vma))
 			return -EINVAL;
 		if (end > vma->vm_end) {
 			/*
 			 * Don't fail if end > vma->vm_end. If the old
-			 * vma was splitted while the mmap_sem was
+			 * vma was split while the mmap_lock was
 			 * released the effect of the concurrent
 			 * operation may not cause madvise() to
 			 * have an undefined result. There may be an
@@ -578,8 +865,9 @@
 	loff_t offset;
 	int error;
 	struct file *f;
+	struct mm_struct *mm = vma->vm_mm;
 
-	*prev = NULL;	/* tell sys_madvise we drop mmap_sem */
+	*prev = NULL;	/* tell sys_madvise we drop mmap_lock */
 
 	if (vma->vm_flags & VM_LOCKED)
 		return -EINVAL;
@@ -600,18 +888,18 @@
 	 * Filesystem's fallocate may need to take i_mutex. We need to
 	 * explicitly grab a reference because the vma (and hence the
 	 * vma's reference to the file) can go away as soon as we drop
-	 * mmap_sem.
+	 * mmap_lock.
 	 */
 	get_file(f);
 	if (userfaultfd_remove(vma, start, end)) {
-		/* mmap_sem was not released by userfaultfd_remove() */
-		up_read(&current->mm->mmap_sem);
+		/* mmap_lock was not released by userfaultfd_remove() */
+		mmap_read_unlock(mm);
 	}
 	error = vfs_fallocate(f,
 				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
 				offset, end - start);
 	fput(f);
-	down_read(&current->mm->mmap_sem);
+	mmap_read_lock(mm);
 	return error;
 }
@@ -622,16 +910,16 @@
 static int madvise_inject_error(int behavior,
 		unsigned long start, unsigned long end)
 {
-	struct page *page;
 	struct zone *zone;
-	unsigned int order;
+	unsigned long size;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	for (; start < end; start += PAGE_SIZE << order) {
+	for (; start < end; start += size) {
 		unsigned long pfn;
+		struct page *page;
 		int ret;
 
 		ret = get_user_pages_fast(start, 1, 0, &page);
@@ -642,36 +930,20 @@
 		/*
 		 * When soft offlining hugepages, after migrating the page
 		 * we dissolve it, therefore in the second loop "page" will
-		 * no longer be a compound page, and order will be 0.
+		 * no longer be a compound page.
 		 */
-		order = compound_order(compound_head(page));
-
-		if (PageHWPoison(page)) {
-			put_page(page);
-			continue;
-		}
+		size = page_size(compound_head(page));
 
 		if (behavior == MADV_SOFT_OFFLINE) {
 			pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
-					pfn, start);
-
-			ret = soft_offline_page(page, MF_COUNT_INCREASED);
-			if (ret)
-				return ret;
-			continue;
+				 pfn, start);
+			ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
+		} else {
+			pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
+				 pfn, start);
+			ret = memory_failure(pfn, MF_COUNT_INCREASED);
 		}
-		pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
-			pfn, start);
-
-		/*
-		 * Drop the page reference taken by get_user_pages_fast(). In
-		 * the absence of MF_COUNT_INCREASED the memory_failure()
-		 * routine is responsible for pinning the page to prevent it
-		 * from being released back to the page allocator.
-		 */
-		put_page(page);
-		ret = memory_failure(pfn, 0);
 		if (ret)
 			return ret;
 	}
@@ -693,6 +965,10 @@
 		return madvise_remove(vma, prev, start, end);
 	case MADV_WILLNEED:
 		return madvise_willneed(vma, prev, start, end);
+	case MADV_COLD:
+		return madvise_cold(vma, prev, start, end);
+	case MADV_PAGEOUT:
+		return madvise_pageout(vma, prev, start, end);
 	case MADV_FREE:
 	case MADV_DONTNEED:
 		return madvise_dontneed_free(vma, prev, start, end, behavior);
@@ -714,6 +990,8 @@
 	case MADV_WILLNEED:
 	case MADV_DONTNEED:
 	case MADV_FREE:
+	case MADV_COLD:
+	case MADV_PAGEOUT:
 #ifdef CONFIG_KSM
 	case MADV_MERGEABLE:
 	case MADV_UNMERGEABLE:
@@ -732,6 +1010,19 @@
 #endif
 		return true;
 
+	default:
+		return false;
+	}
+}
+
+static bool
+process_madvise_behavior_valid(int behavior)
+{
+	switch (behavior) {
+	case MADV_COLD:
+	case MADV_PAGEOUT:
+	case MADV_WILLNEED:
+		return true;
 	default:
 		return false;
 	}
 }
@@ -784,6 +1075,11 @@
 *  MADV_DONTDUMP - the application wants to prevent pages in the given range
 *		from being included in its core dump.
 *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
+ *  MADV_COLD - the application is not expected to use this memory soon,
+ *		deactivate pages in this range so that they can be reclaimed
+ *		easily if memory pressure happens.
+ *  MADV_PAGEOUT - the application is not expected to use this memory soon,
+ *		page out the pages in this range immediately.
 *
 * return values:
 *  zero    - success
@@ -798,7 +1094,7 @@
 *  -EBADF  - map exists, but area maps something that isn't a file.
 *  -EAGAIN - a kernel resource was temporarily unavailable.
 */
-SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
+int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
 {
 	unsigned long end, tmp;
 	struct vm_area_struct *vma, *prev;
@@ -813,9 +1109,9 @@
 	if (!madvise_behavior_valid(behavior))
 		return error;
 
-	if (start & ~PAGE_MASK)
+	if (!PAGE_ALIGNED(start))
 		return error;
-	len = (len_in + ~PAGE_MASK) & PAGE_MASK;
+	len = PAGE_ALIGN(len_in);
 
 	/* Check to see whether len was rounded up from small -ve to zero */
 	if (len_in && !len)
@@ -836,10 +1132,10 @@
 
 	write = madvise_need_mmap_write(behavior);
 	if (write) {
-		if (down_write_killable(&current->mm->mmap_sem))
+		if (mmap_write_lock_killable(mm))
 			return -EINTR;
 	} else {
-		down_read(&current->mm->mmap_sem);
+		mmap_read_lock(mm);
 	}
 
 	/*
 	 * If the interval [start,end) covers some unmapped address
 	 * ranges, just ignore them, but return -ENOMEM at the end.
 	 * - different from the way of handling in mlock etc.
 	 */
-	vma = find_vma_prev(current->mm, start, &prev);
+	vma = find_vma_prev(mm, start, &prev);
 	if (vma && start > vma->vm_start)
 		prev = vma;
 
@@ -883,15 +1179,100 @@
 			goto out;
 		if (prev)
 			vma = prev->vm_next;
-		else	/* madvise_remove dropped mmap_sem */
-			vma = find_vma(current->mm, start);
+		else	/* madvise_remove dropped mmap_lock */
+			vma = find_vma(mm, start);
 	}
 out:
 	blk_finish_plug(&plug);
 	if (write)
-		up_write(&current->mm->mmap_sem);
+		mmap_write_unlock(mm);
 	else
-		up_read(&current->mm->mmap_sem);
+		mmap_read_unlock(mm);
 
 	return error;
 }
+
+SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
+{
+	return do_madvise(current->mm, start, len_in, behavior);
+}
+
+SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
+		size_t, vlen, int, behavior, unsigned int, flags)
+{
+	ssize_t ret;
+	struct iovec iovstack[UIO_FASTIOV], iovec;
+	struct iovec *iov = iovstack;
+	struct iov_iter iter;
+	struct pid *pid;
+	struct task_struct *task;
+	struct mm_struct *mm;
+	size_t total_len;
+	unsigned int f_flags;
+
+	if (flags != 0) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
+	if (ret < 0)
+		goto out;
+
+	pid = pidfd_get_pid(pidfd, &f_flags);
+	if (IS_ERR(pid)) {
+		ret = PTR_ERR(pid);
+		goto free_iov;
+	}
+
+	task = get_pid_task(pid, PIDTYPE_PID);
+	if (!task) {
+		ret = -ESRCH;
+		goto put_pid;
+	}
+
+	if (!process_madvise_behavior_valid(behavior)) {
+		ret = -EINVAL;
+		goto release_task;
+	}
+
+	/* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
+	mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
+	if (IS_ERR_OR_NULL(mm)) {
+		ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
+		goto release_task;
+	}
+
+	/*
+	 * Require CAP_SYS_NICE for influencing process performance. Note that
+	 * only non-destructive hints are currently supported.
+	 */
+	if (!capable(CAP_SYS_NICE)) {
+		ret = -EPERM;
+		goto release_mm;
+	}
+
+	total_len = iov_iter_count(&iter);
+
+	while (iov_iter_count(&iter)) {
+		iovec = iov_iter_iovec(&iter);
+		ret = do_madvise(mm, (unsigned long)iovec.iov_base,
+					iovec.iov_len, behavior);
+		if (ret < 0)
+			break;
+		iov_iter_advance(&iter, iovec.iov_len);
+	}
+
+	ret = (total_len - iov_iter_count(&iter)) ? : ret;
+
+release_mm:
+	mmput(mm);
+release_task:
+	put_task_struct(task);
+put_pid:
+	put_pid(pid);
+free_iov:
+	kfree(iov);
+out:
+	return ret;
+}
-- 
Gitblit v1.6.2
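
Usage note (not part of the patch): the sketch below illustrates, from userspace, the difference between the two hints this patch wires up. MADV_COLD only deactivates the pages, so they stay resident until memory pressure reclaims them; MADV_PAGEOUT reclaims the range immediately (swap for anonymous pages, and file pages only when can_do_file_pageout() above allows it). The MADV_COLD/MADV_PAGEOUT values are assumed from <asm-generic/mman-common.h> and are defined here only as a fallback for older libc headers.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_COLD
#define MADV_COLD 20		/* assumed uapi value: deactivate pages */
#endif
#ifndef MADV_PAGEOUT
#define MADV_PAGEOUT 21		/* assumed uapi value: reclaim pages now */
#endif

int main(void)
{
	size_t len = 64 << 20;	/* 64 MiB scratch buffer */
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(buf, 0xaa, len);	/* fault the pages in */

	/* Weak hint: mark the range cold so reclaim prefers it later. */
	if (madvise(buf, len, MADV_COLD))
		perror("madvise(MADV_COLD)");

	/* Strong hint: page the range out right now. */
	if (madvise(buf, len, MADV_PAGEOUT))
		perror("madvise(MADV_PAGEOUT)");

	/* Contents are preserved; touching the buffer just refaults it. */
	int ok = (buf[0] == (char)0xaa);
	munmap(buf, len);
	return !ok;
}

Unlike MADV_DONTNEED or MADV_FREE, both hints are non-destructive, which is why the patch allows them in process_madvise_behavior_valid() for cross-process use.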
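
Usage note (not part of the patch): process_madvise() applies the same hints to another process's address space, the intended caller being a userspace memory manager. Per the checks in the syscall above, the caller needs PTRACE_MODE_READ access to the target plus CAP_SYS_NICE. A minimal sketch follows; it assumes the asm-generic syscall numbers (pidfd_open = 434, process_madvise = 440) and a kernel carrying this patch, and uses raw syscall(2) since a libc wrapper may not be available.

#include <stdio.h>
#include <stdlib.h>
#include <sys/uio.h>
#include <unistd.h>

#ifndef MADV_PAGEOUT
#define MADV_PAGEOUT 21		/* assumed uapi value */
#endif

int main(int argc, char *argv[])
{
	if (argc < 4) {
		fprintf(stderr, "usage: %s <pid> <addr> <len>\n", argv[0]);
		return 1;
	}
	pid_t pid = (pid_t)atoi(argv[1]);
	void *addr = (void *)strtoul(argv[2], NULL, 0);
	size_t len = strtoul(argv[3], NULL, 0);
	struct iovec iov = { .iov_base = addr, .iov_len = len };

	/* pidfd_open(2): obtain a pidfd for the target process. */
	long pidfd = syscall(434, pid, 0);
	if (pidfd < 0) {
		perror("pidfd_open");
		return 1;
	}

	/* Ask the kernel to reclaim the target's range immediately. */
	long n = syscall(440, (int)pidfd, &iov, 1, MADV_PAGEOUT, 0);
	if (n < 0)
		perror("process_madvise");
	else
		printf("advised %ld bytes\n", n);

	close((int)pidfd);
	return n < 0;
}

As in the kernel loop above, the return value is the number of bytes successfully advised, which can be shorter than the total iovec length if an intermediate do_madvise() call fails.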