From 102a0743326a03cd1a1202ceda21e175b7d3575c Mon Sep 17 00:00:00 2001 From: hc <hc@nodka.com> Date: Tue, 20 Feb 2024 01:20:52 +0000 Subject: [PATCH] add new system file --- kernel/fs/userfaultfd.c | 503 +++++++++++++++++++++++++++++++++++-------------------- 1 files changed, 316 insertions(+), 187 deletions(-) diff --git a/kernel/fs/userfaultfd.c b/kernel/fs/userfaultfd.c index c927ade..e11d1a6 100644 --- a/kernel/fs/userfaultfd.c +++ b/kernel/fs/userfaultfd.c @@ -1,12 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * fs/userfaultfd.c * * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org> * Copyright (C) 2008-2009 Red Hat, Inc. * Copyright (C) 2015 Red Hat, Inc. - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. * * Some part derived from fs/eventfd.c (anon inode setup) and * mm/ksm.c (mm hashing). @@ -17,6 +15,7 @@ #include <linux/sched/signal.h> #include <linux/sched/mm.h> #include <linux/mm.h> +#include <linux/mmu_notifier.h> #include <linux/poll.h> #include <linux/slab.h> #include <linux/seq_file.h> @@ -29,6 +28,8 @@ #include <linux/ioctl.h> #include <linux/security.h> #include <linux/hugetlb.h> + +int sysctl_unprivileged_userfaultfd __read_mostly; static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly; @@ -56,9 +57,9 @@ /* waitqueue head for events */ wait_queue_head_t event_wqh; /* a refile sequence protected by fault_pending_wqh lock */ - seqlock_t refile_seq; + seqcount_spinlock_t refile_seq; /* pseudo fd refcounting */ - atomic_t refcount; + refcount_t refcount; /* userfaultfd syscall flags */ unsigned int flags; /* features requested from the userspace */ @@ -151,8 +152,7 @@ */ static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx) { - if (!atomic_inc_not_zero(&ctx->refcount)) - BUG(); + refcount_inc(&ctx->refcount); } /** @@ -165,7 +165,7 @@ */ static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx) { - if (atomic_dec_and_test(&ctx->refcount)) { + if (refcount_dec_and_test(&ctx->refcount)) { VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock)); VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh)); VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock)); @@ -198,24 +198,21 @@ msg_init(&msg); msg.event = UFFD_EVENT_PAGEFAULT; msg.arg.pagefault.address = address; + /* + * These flags indicate why the userfault occurred: + * - UFFD_PAGEFAULT_FLAG_WP indicates a write protect fault. + * - UFFD_PAGEFAULT_FLAG_MINOR indicates a minor fault. + * - Neither of these flags being set indicates a MISSING fault. + * + * Separately, UFFD_PAGEFAULT_FLAG_WRITE indicates it was a write + * fault. Otherwise, it was a read fault. + */ if (flags & FAULT_FLAG_WRITE) - /* - * If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the - * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WRITE - * was not set in a UFFD_EVENT_PAGEFAULT, it means it - * was a read fault, otherwise if set it means it's - * a write fault. - */ msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE; if (reason & VM_UFFD_WP) - /* - * If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the - * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WP was - * not set in a UFFD_EVENT_PAGEFAULT, it means it was - * a missing fault, otherwise if set it means it's a - * write protect fault. - */ msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP; + if (reason & VM_UFFD_MINOR) + msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_MINOR; if (features & UFFD_FEATURE_THREAD_ID) msg.arg.pagefault.feat.ptid = task_pid_vnr(current); return msg; @@ -236,7 +233,7 @@ pte_t *ptep, pte; bool ret = true; - VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); + mmap_assert_locked(mm); ptep = huge_pte_offset(mm, address, vma_mmu_pagesize(vma)); @@ -288,7 +285,7 @@ pte_t *pte; bool ret = true; - VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); + mmap_assert_locked(mm); pgd = pgd_offset(mm, address); if (!pgd_present(*pgd)) @@ -316,8 +313,11 @@ if (!pmd_present(_pmd)) goto out; - if (pmd_trans_huge(_pmd)) + if (pmd_trans_huge(_pmd)) { + if (!pmd_write(_pmd) && (reason & VM_UFFD_WP)) + ret = true; goto out; + } /* * the pmd is stable (as in !pmd_trans_unstable) so we can re-read it @@ -330,10 +330,23 @@ */ if (pte_none(*pte)) ret = true; + if (!pte_write(*pte) && (reason & VM_UFFD_WP)) + ret = true; pte_unmap(pte); out: return ret; +} + +static inline long userfaultfd_get_blocking_state(unsigned int flags) +{ + if (flags & FAULT_FLAG_INTERRUPTIBLE) + return TASK_INTERRUPTIBLE; + + if (flags & FAULT_FLAG_KILLABLE) + return TASK_KILLABLE; + + return TASK_UNINTERRUPTIBLE; } /* @@ -342,13 +355,13 @@ * FAULT_FLAG_KILLABLE are not straightforward. The "Caution" * recommendation in __lock_page_or_retry is not an understatement. * - * If FAULT_FLAG_ALLOW_RETRY is set, the mmap_sem must be released + * If FAULT_FLAG_ALLOW_RETRY is set, the mmap_lock must be released * before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is * not set. * * If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not * set, VM_FAULT_RETRY can still be returned if and only if there are - * fatal_signal_pending()s, and the mmap_sem must be released before + * fatal_signal_pending()s, and the mmap_lock must be released before * returning it. */ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) @@ -357,7 +370,7 @@ struct userfaultfd_ctx *ctx; struct userfaultfd_wait_queue uwq; vm_fault_t ret = VM_FAULT_SIGBUS; - bool must_wait, return_to_userland; + bool must_wait; long blocking_state; /* @@ -369,16 +382,16 @@ * FOLL_DUMP case, anon memory also checks for FOLL_DUMP with * the no_page_table() helper in follow_page_mask(), but the * shmem_vm_ops->fault method is invoked even during - * coredumping without mmap_sem and it ends up here. + * coredumping without mmap_lock and it ends up here. */ if (current->flags & (PF_EXITING|PF_DUMPCORE)) goto out; /* - * Coredumping runs without mmap_sem so we can only check that - * the mmap_sem is held, if PF_DUMPCORE was not set. + * Coredumping runs without mmap_lock so we can only check that + * the mmap_lock is held, if PF_DUMPCORE was not set. */ - WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem)); + mmap_assert_locked(mm); ctx = vmf->vma->vm_userfaultfd_ctx.ctx; if (!ctx) @@ -386,16 +399,25 @@ BUG_ON(ctx->mm != mm); - VM_BUG_ON(reason & ~(VM_UFFD_MISSING|VM_UFFD_WP)); - VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP)); + /* Any unrecognized flag is a bug. */ + VM_BUG_ON(reason & ~__VM_UFFD_FLAGS); + /* 0 or > 1 flags set is a bug; we expect exactly 1. */ + VM_BUG_ON(!reason || (reason & (reason - 1))); if (ctx->features & UFFD_FEATURE_SIGBUS) goto out; + if ((vmf->flags & FAULT_FLAG_USER) == 0 && + ctx->flags & UFFD_USER_MODE_ONLY) { + printk_once(KERN_WARNING "uffd: Set unprivileged_userfaultfd " + "sysctl knob to 1 if kernel faults must be handled " + "without obtaining CAP_SYS_PTRACE capability\n"); + goto out; + } /* * If it's already released don't get it. This avoids to loop * in __get_user_pages if userfaultfd_release waits on the - * caller of handle_userfault to release the mmap_sem. + * caller of handle_userfault to release the mmap_lock. */ if (unlikely(READ_ONCE(ctx->released))) { /* @@ -454,7 +476,7 @@ if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) goto out; - /* take the reference before dropping the mmap_sem */ + /* take the reference before dropping the mmap_lock */ userfaultfd_ctx_get(ctx); init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); @@ -464,11 +486,7 @@ uwq.ctx = ctx; uwq.waken = false; - return_to_userland = - (vmf->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) == - (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE); - blocking_state = return_to_userland ? TASK_INTERRUPTIBLE : - TASK_KILLABLE; + blocking_state = userfaultfd_get_blocking_state(vmf->flags); spin_lock_irq(&ctx->fault_pending_wqh.lock); /* @@ -491,64 +509,14 @@ must_wait = userfaultfd_huge_must_wait(ctx, vmf->vma, vmf->address, vmf->flags, reason); - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); - if (likely(must_wait && !READ_ONCE(ctx->released) && - (return_to_userland ? !signal_pending(current) : - !fatal_signal_pending(current)))) { + if (likely(must_wait && !READ_ONCE(ctx->released))) { wake_up_poll(&ctx->fd_wqh, EPOLLIN); schedule(); - ret |= VM_FAULT_MAJOR; - - /* - * False wakeups can orginate even from rwsem before - * up_read() however userfaults will wait either for a - * targeted wakeup on the specific uwq waitqueue from - * wake_userfault() or for signals or for uffd - * release. - */ - while (!READ_ONCE(uwq.waken)) { - /* - * This needs the full smp_store_mb() - * guarantee as the state write must be - * visible to other CPUs before reading - * uwq.waken from other CPUs. - */ - set_current_state(blocking_state); - if (READ_ONCE(uwq.waken) || - READ_ONCE(ctx->released) || - (return_to_userland ? signal_pending(current) : - fatal_signal_pending(current))) - break; - schedule(); - } } __set_current_state(TASK_RUNNING); - - if (return_to_userland) { - if (signal_pending(current) && - !fatal_signal_pending(current)) { - /* - * If we got a SIGSTOP or SIGCONT and this is - * a normal userland page fault, just let - * userland return so the signal will be - * handled and gdb debugging works. The page - * fault code immediately after we return from - * this function is going to release the - * mmap_sem and it's not depending on it - * (unlike gup would if we were not to return - * VM_FAULT_RETRY). - * - * If a fatal signal is pending we still take - * the streamlined VM_FAULT_RETRY failure path - * and there's no need to retake the mmap_sem - * in such case. - */ - down_read(&mm->mmap_sem); - ret = VM_FAULT_NOPAGE; - } - } /* * Here we race with the list_del; list_add in @@ -640,15 +608,13 @@ struct mm_struct *mm = release_new_ctx->mm; /* the various vma->vm_userfaultfd_ctx still points to it */ - down_write(&mm->mmap_sem); - /* no task can run (and in turn coredump) yet */ - VM_WARN_ON(!mmget_still_valid(mm)); + mmap_write_lock(mm); for (vma = mm->mmap; vma; vma = vma->vm_next) if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) { vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; - vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING); + vma->vm_flags &= ~__VM_UFFD_FLAGS; } - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); userfaultfd_ctx_put(release_new_ctx); } @@ -677,8 +643,11 @@ octx = vma->vm_userfaultfd_ctx.ctx; if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) { + vm_write_begin(vma); vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; - vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING); + WRITE_ONCE(vma->vm_flags, + vma->vm_flags & ~__VM_UFFD_FLAGS); + vm_write_end(vma); return 0; } @@ -699,7 +668,7 @@ return -ENOMEM; } - atomic_set(&ctx->refcount, 1); + refcount_set(&ctx->refcount, 1); ctx->flags = octx->flags; ctx->features = octx->features; ctx->released = false; @@ -759,7 +728,7 @@ } else { /* Drop uffd context if remap feature not enabled */ vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; - vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING); + vma->vm_flags &= ~__VM_UFFD_FLAGS; } } @@ -801,7 +770,7 @@ userfaultfd_ctx_get(ctx); WRITE_ONCE(ctx->mmap_changing, true); - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); msg_init(&ewq.msg); @@ -881,7 +850,6 @@ /* len == 0 means wake all */ struct userfaultfd_wake_range range = { .len = 0, }; unsigned long new_flags; - bool still_valid; WRITE_ONCE(ctx->released, true); @@ -892,38 +860,37 @@ * Flush page faults out of all CPUs. NOTE: all page faults * must be retried without returning VM_FAULT_SIGBUS if * userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx - * changes while handle_userfault released the mmap_sem. So + * changes while handle_userfault released the mmap_lock. So * it's critical that released is set to true (above), before - * taking the mmap_sem for writing. + * taking the mmap_lock for writing. */ - down_write(&mm->mmap_sem); - still_valid = mmget_still_valid(mm); + mmap_write_lock(mm); prev = NULL; for (vma = mm->mmap; vma; vma = vma->vm_next) { cond_resched(); BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^ - !!(vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP))); + !!(vma->vm_flags & __VM_UFFD_FLAGS)); if (vma->vm_userfaultfd_ctx.ctx != ctx) { prev = vma; continue; } - new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP); - if (still_valid) { - prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end, - new_flags, vma->anon_vma, - vma->vm_file, vma->vm_pgoff, - vma_policy(vma), - NULL_VM_UFFD_CTX, - vma_get_anon_name(vma)); - if (prev) - vma = prev; - else - prev = vma; - } - vma->vm_flags = new_flags; + new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; + prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end, + new_flags, vma->anon_vma, + vma->vm_file, vma->vm_pgoff, + vma_policy(vma), + NULL_VM_UFFD_CTX, + vma_get_anon_name(vma)); + if (prev) + vma = prev; + else + prev = vma; + vm_write_begin(vma); + WRITE_ONCE(vma->vm_flags, new_flags); vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; + vm_write_end(vma); } - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); mmput(mm); wakeup: /* @@ -951,7 +918,7 @@ wait_queue_entry_t *wq; struct userfaultfd_wait_queue *uwq; - VM_BUG_ON(!spin_is_locked(&wqh->lock)); + lockdep_assert_held(&wqh->lock); uwq = NULL; if (!waitqueue_active(wqh)) @@ -1013,14 +980,14 @@ static const struct file_operations userfaultfd_fops; -static int resolve_userfault_fork(struct userfaultfd_ctx *ctx, - struct userfaultfd_ctx *new, +static int resolve_userfault_fork(struct userfaultfd_ctx *new, + struct inode *inode, struct uffd_msg *msg) { int fd; - fd = anon_inode_getfd("[userfaultfd]", &userfaultfd_fops, new, - O_RDWR | (new->flags & UFFD_SHARED_FCNTL_FLAGS)); + fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, new, + O_RDONLY | (new->flags & UFFD_SHARED_FCNTL_FLAGS), inode); if (fd < 0) return fd; @@ -1030,7 +997,7 @@ } static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, - struct uffd_msg *msg) + struct uffd_msg *msg, struct inode *inode) { ssize_t ret; DECLARE_WAITQUEUE(wait, current); @@ -1060,7 +1027,7 @@ * waitqueue could become empty if this is the * only userfault. */ - write_seqlock(&ctx->refile_seq); + write_seqcount_begin(&ctx->refile_seq); /* * The fault_pending_wqh.lock prevents the uwq @@ -1086,7 +1053,7 @@ list_del(&uwq->wq.entry); add_wait_queue(&ctx->fault_wqh, &uwq->wq); - write_sequnlock(&ctx->refile_seq); + write_seqcount_end(&ctx->refile_seq); /* careful to always initialize msg if ret == 0 */ *msg = uwq->msg; @@ -1141,7 +1108,7 @@ spin_unlock_irq(&ctx->fd_wqh.lock); if (!ret && msg->event == UFFD_EVENT_FORK) { - ret = resolve_userfault_fork(ctx, fork_nctx, msg); + ret = resolve_userfault_fork(fork_nctx, inode, msg); spin_lock_irq(&ctx->event_wqh.lock); if (!list_empty(&fork_event)) { /* @@ -1201,6 +1168,7 @@ ssize_t _ret, ret = 0; struct uffd_msg msg; int no_wait = file->f_flags & O_NONBLOCK; + struct inode *inode = file_inode(file); if (!userfaultfd_is_initialized(ctx)) return -EINVAL; @@ -1208,7 +1176,7 @@ for (;;) { if (count < sizeof(msg)) return ret ? ret : -EINVAL; - _ret = userfaultfd_ctx_read(ctx, no_wait, &msg); + _ret = userfaultfd_ctx_read(ctx, no_wait, &msg, inode); if (_ret < 0) return ret ? ret : _ret; if (copy_to_user((__u64 __user *) buf, &msg, sizeof(msg))) @@ -1246,7 +1214,7 @@ /* * To be sure waitqueue_active() is not reordered by the CPU * before the pagetable update, use an explicit SMP memory - * barrier here. PT lock release or up_read(mmap_sem) still + * barrier here. PT lock release or mmap_read_unlock(mm) still * have release semantics that can allow the * waitqueue_active() to be reordered before the pte update. */ @@ -1259,41 +1227,51 @@ * sure we've userfaults to wake. */ do { - seq = read_seqbegin(&ctx->refile_seq); + seq = read_seqcount_begin(&ctx->refile_seq); need_wakeup = waitqueue_active(&ctx->fault_pending_wqh) || waitqueue_active(&ctx->fault_wqh); cond_resched(); - } while (read_seqretry(&ctx->refile_seq, seq)); + } while (read_seqcount_retry(&ctx->refile_seq, seq)); if (need_wakeup) __wake_userfault(ctx, range); } static __always_inline int validate_range(struct mm_struct *mm, - __u64 *start, __u64 len) + __u64 start, __u64 len) { __u64 task_size = mm->task_size; - *start = untagged_addr(*start); - - if (*start & ~PAGE_MASK) + if (start & ~PAGE_MASK) return -EINVAL; if (len & ~PAGE_MASK) return -EINVAL; if (!len) return -EINVAL; - if (*start < mmap_min_addr) + if (start < mmap_min_addr) return -EINVAL; - if (*start >= task_size) + if (start >= task_size) return -EINVAL; - if (len > task_size - *start) + if (len > task_size - start) return -EINVAL; return 0; } -static inline bool vma_can_userfault(struct vm_area_struct *vma) +static inline bool vma_can_userfault(struct vm_area_struct *vma, + unsigned long vm_flags) { + /* FIXME: add WP support to hugetlbfs and shmem */ + if (vm_flags & VM_UFFD_WP) { + if (is_vm_hugetlb_page(vma) || vma_is_shmem(vma)) + return false; + } + + if (vm_flags & VM_UFFD_MINOR) { + if (!(is_vm_hugetlb_page(vma) || vma_is_shmem(vma))) + return false; + } + return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) || - vma_is_shmem(vma); + vma_is_shmem(vma); } static int userfaultfd_register(struct userfaultfd_ctx *ctx, @@ -1319,23 +1297,21 @@ ret = -EINVAL; if (!uffdio_register.mode) goto out; - if (uffdio_register.mode & ~(UFFDIO_REGISTER_MODE_MISSING| - UFFDIO_REGISTER_MODE_WP)) + if (uffdio_register.mode & ~UFFD_API_REGISTER_MODES) goto out; vm_flags = 0; if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING) vm_flags |= VM_UFFD_MISSING; - if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) { + if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) vm_flags |= VM_UFFD_WP; - /* - * FIXME: remove the below error constraint by - * implementing the wprotect tracking mode. - */ - ret = -EINVAL; + if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) { +#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR goto out; +#endif + vm_flags |= VM_UFFD_MINOR; } - ret = validate_range(mm, &uffdio_register.range.start, + ret = validate_range(mm, uffdio_register.range.start, uffdio_register.range.len); if (ret) goto out; @@ -1347,9 +1323,7 @@ if (!mmget_not_zero(mm)) goto out; - down_write(&mm->mmap_sem); - if (!mmget_still_valid(mm)) - goto out_unlock; + mmap_write_lock(mm); vma = find_vma_prev(mm, start, &prev); if (!vma) goto out_unlock; @@ -1379,11 +1353,11 @@ cond_resched(); BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^ - !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP))); + !!(cur->vm_flags & __VM_UFFD_FLAGS)); /* check not compatible vmas */ ret = -EINVAL; - if (!vma_can_userfault(cur)) + if (!vma_can_userfault(cur, vm_flags)) goto out_unlock; /* @@ -1411,6 +1385,8 @@ if (end & (vma_hpagesize - 1)) goto out_unlock; } + if ((vm_flags & VM_UFFD_WP) && !(cur->vm_flags & VM_MAYWRITE)) + goto out_unlock; /* * Check that this vma isn't already owned by a @@ -1440,7 +1416,7 @@ do { cond_resched(); - BUG_ON(!vma_can_userfault(vma)); + BUG_ON(!vma_can_userfault(vma, vm_flags)); BUG_ON(vma->vm_userfaultfd_ctx.ctx && vma->vm_userfaultfd_ctx.ctx != ctx); WARN_ON(!(vma->vm_flags & VM_MAYWRITE)); @@ -1457,7 +1433,7 @@ start = vma->vm_start; vma_end = min(end, vma->vm_end); - new_flags = (vma->vm_flags & ~vm_flags) | vm_flags; + new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags; prev = vma_merge(mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), @@ -1483,8 +1459,13 @@ * the next vma was merged into the current one and * the current one has not been updated yet. */ - vma->vm_flags = new_flags; + vm_write_begin(vma); + WRITE_ONCE(vma->vm_flags, new_flags); vma->vm_userfaultfd_ctx.ctx = ctx; + vm_write_end(vma); + + if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma)) + hugetlb_unshare_all_pmds(vma); skip: prev = vma; @@ -1492,17 +1473,31 @@ vma = vma->vm_next; } while (vma && vma->vm_start < end); out_unlock: - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); mmput(mm); if (!ret) { + __u64 ioctls_out; + + ioctls_out = basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC : + UFFD_API_RANGE_IOCTLS; + + /* + * Declare the WP ioctl only if the WP mode is + * specified and all checks passed with the range + */ + if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_WP)) + ioctls_out &= ~((__u64)1 << _UFFDIO_WRITEPROTECT); + + /* CONTINUE ioctl is only supported for MINOR ranges. */ + if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR)) + ioctls_out &= ~((__u64)1 << _UFFDIO_CONTINUE); + /* * Now that we scanned all vmas we can already tell * userland which ioctls methods are guaranteed to * succeed on this range. */ - if (put_user(basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC : - UFFD_API_RANGE_IOCTLS, - &user_uffdio_register->ioctls)) + if (put_user(ioctls_out, &user_uffdio_register->ioctls)) ret = -EFAULT; } out: @@ -1525,7 +1520,7 @@ if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister))) goto out; - ret = validate_range(mm, &uffdio_unregister.start, + ret = validate_range(mm, uffdio_unregister.start, uffdio_unregister.len); if (ret) goto out; @@ -1537,9 +1532,7 @@ if (!mmget_not_zero(mm)) goto out; - down_write(&mm->mmap_sem); - if (!mmget_still_valid(mm)) - goto out_unlock; + mmap_write_lock(mm); vma = find_vma_prev(mm, start, &prev); if (!vma) goto out_unlock; @@ -1569,7 +1562,7 @@ cond_resched(); BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^ - !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP))); + !!(cur->vm_flags & __VM_UFFD_FLAGS)); /* * Check not compatible vmas, not strictly required @@ -1578,7 +1571,7 @@ * provides for more strict behavior to notice * unregistration errors. */ - if (!vma_can_userfault(cur)) + if (!vma_can_userfault(cur, cur->vm_flags)) goto out_unlock; found = true; @@ -1592,7 +1585,7 @@ do { cond_resched(); - BUG_ON(!vma_can_userfault(vma)); + BUG_ON(!vma_can_userfault(vma, vma->vm_flags)); /* * Nothing to do: this vma is already registered into this @@ -1620,7 +1613,7 @@ wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range); } - new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP); + new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; prev = vma_merge(mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), @@ -1646,8 +1639,10 @@ * the next vma was merged into the current one and * the current one has not been updated yet. */ - vma->vm_flags = new_flags; + vm_write_begin(vma); + WRITE_ONCE(vma->vm_flags, new_flags); vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; + vm_write_end(vma); skip: prev = vma; @@ -1655,7 +1650,7 @@ vma = vma->vm_next; } while (vma && vma->vm_start < end); out_unlock: - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); mmput(mm); out: return ret; @@ -1677,7 +1672,7 @@ if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake))) goto out; - ret = validate_range(ctx->mm, &uffdio_wake.start, uffdio_wake.len); + ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len); if (ret) goto out; @@ -1717,7 +1712,7 @@ sizeof(uffdio_copy)-sizeof(__s64))) goto out; - ret = validate_range(ctx->mm, &uffdio_copy.dst, uffdio_copy.len); + ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len); if (ret) goto out; /* @@ -1728,11 +1723,12 @@ ret = -EINVAL; if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src) goto out; - if (uffdio_copy.mode & ~UFFDIO_COPY_MODE_DONTWAKE) + if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP)) goto out; if (mmget_not_zero(ctx->mm)) { ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src, - uffdio_copy.len, &ctx->mmap_changing); + uffdio_copy.len, &ctx->mmap_changing, + uffdio_copy.mode); mmput(ctx->mm); } else { return -ESRCH; @@ -1773,7 +1769,7 @@ sizeof(uffdio_zeropage)-sizeof(__s64))) goto out; - ret = validate_range(ctx->mm, &uffdio_zeropage.range.start, + ret = validate_range(ctx->mm, uffdio_zeropage.range.start, uffdio_zeropage.range.len); if (ret) goto out; @@ -1801,6 +1797,119 @@ wake_userfault(ctx, &range); } ret = range.len == uffdio_zeropage.range.len ? 0 : -EAGAIN; +out: + return ret; +} + +static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx, + unsigned long arg) +{ + int ret; + struct uffdio_writeprotect uffdio_wp; + struct uffdio_writeprotect __user *user_uffdio_wp; + struct userfaultfd_wake_range range; + bool mode_wp, mode_dontwake; + + if (READ_ONCE(ctx->mmap_changing)) + return -EAGAIN; + + user_uffdio_wp = (struct uffdio_writeprotect __user *) arg; + + if (copy_from_user(&uffdio_wp, user_uffdio_wp, + sizeof(struct uffdio_writeprotect))) + return -EFAULT; + + ret = validate_range(ctx->mm, uffdio_wp.range.start, + uffdio_wp.range.len); + if (ret) + return ret; + + if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE | + UFFDIO_WRITEPROTECT_MODE_WP)) + return -EINVAL; + + mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP; + mode_dontwake = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE; + + if (mode_wp && mode_dontwake) + return -EINVAL; + + if (mmget_not_zero(ctx->mm)) { + ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start, + uffdio_wp.range.len, mode_wp, + &ctx->mmap_changing); + mmput(ctx->mm); + } else { + return -ESRCH; + } + + if (ret) + return ret; + + if (!mode_wp && !mode_dontwake) { + range.start = uffdio_wp.range.start; + range.len = uffdio_wp.range.len; + wake_userfault(ctx, &range); + } + return ret; +} + +static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) +{ + __s64 ret; + struct uffdio_continue uffdio_continue; + struct uffdio_continue __user *user_uffdio_continue; + struct userfaultfd_wake_range range; + + user_uffdio_continue = (struct uffdio_continue __user *)arg; + + ret = -EAGAIN; + if (READ_ONCE(ctx->mmap_changing)) + goto out; + + ret = -EFAULT; + if (copy_from_user(&uffdio_continue, user_uffdio_continue, + /* don't copy the output fields */ + sizeof(uffdio_continue) - (sizeof(__s64)))) + goto out; + + ret = validate_range(ctx->mm, uffdio_continue.range.start, + uffdio_continue.range.len); + if (ret) + goto out; + + ret = -EINVAL; + /* double check for wraparound just in case. */ + if (uffdio_continue.range.start + uffdio_continue.range.len <= + uffdio_continue.range.start) { + goto out; + } + if (uffdio_continue.mode & ~UFFDIO_CONTINUE_MODE_DONTWAKE) + goto out; + + if (mmget_not_zero(ctx->mm)) { + ret = mcopy_continue(ctx->mm, uffdio_continue.range.start, + uffdio_continue.range.len, + &ctx->mmap_changing); + mmput(ctx->mm); + } else { + return -ESRCH; + } + + if (unlikely(put_user(ret, &user_uffdio_continue->mapped))) + return -EFAULT; + if (ret < 0) + goto out; + + /* len == 0 would wake all */ + BUG_ON(!ret); + range.len = ret; + if (!(uffdio_continue.mode & UFFDIO_CONTINUE_MODE_DONTWAKE)) { + range.start = uffdio_continue.range.start; + wake_userfault(ctx, &range); + } + ret = range.len == uffdio_continue.range.len ? 0 : -EAGAIN; + out: return ret; } @@ -1840,6 +1949,10 @@ goto err_out; /* report all available features and ioctls to userland */ uffdio_api.features = UFFD_API_FEATURES; +#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR + uffdio_api.features &= + ~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM); +#endif uffdio_api.ioctls = UFFD_API_IOCTLS; ret = -EFAULT; if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) @@ -1889,6 +2002,12 @@ case UFFDIO_ZEROPAGE: ret = userfaultfd_zeropage(ctx, arg); break; + case UFFDIO_WRITEPROTECT: + ret = userfaultfd_writeprotect(ctx, arg); + break; + case UFFDIO_CONTINUE: + ret = userfaultfd_continue(ctx, arg); + break; } return ret; } @@ -1929,7 +2048,7 @@ .poll = userfaultfd_poll, .read = userfaultfd_read, .unlocked_ioctl = userfaultfd_ioctl, - .compat_ioctl = userfaultfd_ioctl, + .compat_ioctl = compat_ptr_ioctl, .llseek = noop_llseek, }; @@ -1941,7 +2060,7 @@ init_waitqueue_head(&ctx->fault_wqh); init_waitqueue_head(&ctx->event_wqh); init_waitqueue_head(&ctx->fd_wqh); - seqlock_init(&ctx->refile_seq); + seqcount_spinlock_init(&ctx->refile_seq, &ctx->fault_pending_wqh.lock); } SYSCALL_DEFINE1(userfaultfd, int, flags) @@ -1949,20 +2068,30 @@ struct userfaultfd_ctx *ctx; int fd; + if (!sysctl_unprivileged_userfaultfd && + (flags & UFFD_USER_MODE_ONLY) == 0 && + !capable(CAP_SYS_PTRACE)) { + printk_once(KERN_WARNING "uffd: Set unprivileged_userfaultfd " + "sysctl knob to 1 if kernel faults must be handled " + "without obtaining CAP_SYS_PTRACE capability\n"); + return -EPERM; + } + BUG_ON(!current->mm); /* Check the UFFD_* constants for consistency. */ + BUILD_BUG_ON(UFFD_USER_MODE_ONLY & UFFD_SHARED_FCNTL_FLAGS); BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC); BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK); - if (flags & ~UFFD_SHARED_FCNTL_FLAGS) + if (flags & ~(UFFD_SHARED_FCNTL_FLAGS | UFFD_USER_MODE_ONLY)) return -EINVAL; ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL); if (!ctx) return -ENOMEM; - atomic_set(&ctx->refcount, 1); + refcount_set(&ctx->refcount, 1); ctx->flags = flags; ctx->features = 0; ctx->released = false; @@ -1971,8 +2100,8 @@ /* prevent the mm struct to be freed */ mmgrab(ctx->mm); - fd = anon_inode_getfd("[userfaultfd]", &userfaultfd_fops, ctx, - O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS)); + fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, ctx, + O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL); if (fd < 0) { mmdrop(ctx->mm); kmem_cache_free(userfaultfd_ctx_cachep, ctx); -- Gitblit v1.6.2