2024-02-20 102a0743326a03cd1a1202ceda21e175b7d3575c
kernel/fs/userfaultfd.c
@@ -1,12 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * fs/userfaultfd.c
  *
  * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org>
  * Copyright (C) 2008-2009 Red Hat, Inc.
  * Copyright (C) 2015 Red Hat, Inc.
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
  *
  * Some part derived from fs/eventfd.c (anon inode setup) and
  * mm/ksm.c (mm hashing).
@@ -17,6 +15,7 @@
 #include <linux/sched/signal.h>
 #include <linux/sched/mm.h>
 #include <linux/mm.h>
+#include <linux/mmu_notifier.h>
 #include <linux/poll.h>
 #include <linux/slab.h>
 #include <linux/seq_file.h>
@@ -29,6 +28,8 @@
 #include <linux/ioctl.h>
 #include <linux/security.h>
 #include <linux/hugetlb.h>
+
+int sysctl_unprivileged_userfaultfd __read_mostly;
 
 static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
 
@@ -56,9 +57,9 @@
 	/* waitqueue head for events */
 	wait_queue_head_t event_wqh;
 	/* a refile sequence protected by fault_pending_wqh lock */
-	seqlock_t refile_seq;
+	seqcount_spinlock_t refile_seq;
 	/* pseudo fd refcounting */
-	atomic_t refcount;
+	refcount_t refcount;
 	/* userfaultfd syscall flags */
 	unsigned int flags;
 	/* features requested from the userspace */
@@ -151,8 +152,7 @@
  */
 static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx)
 {
-	if (!atomic_inc_not_zero(&ctx->refcount))
-		BUG();
+	refcount_inc(&ctx->refcount);
 }
 
 /**
@@ -165,7 +165,7 @@
  */
 static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
 {
-	if (atomic_dec_and_test(&ctx->refcount)) {
+	if (refcount_dec_and_test(&ctx->refcount)) {
 		VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock));
 		VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh));
 		VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock));
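
A note on the two refcount conversions above: refcount_t folds the old open-coded sanity check into the API itself. Conceptually, the generic implementation behaves like this simplified sketch (the real code uses saturating atomics and arch-specific fast paths):

	static inline void refcount_inc_sketch(refcount_t *r)
	{
		/*
		 * Warn (instead of BUG()) on an increment from zero, which
		 * would resurrect an object whose last reference is already
		 * gone, and saturate rather than wrap on overflow.
		 */
		if (!refcount_inc_not_zero(r))
			WARN_ONCE(1, "refcount_t: increment on 0; use-after-free.\n");
	}

So userfaultfd_ctx_get() keeps its use-after-free detection while dropping the hand-rolled atomic_inc_not_zero()/BUG() pair.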
@@ -198,24 +198,21 @@
 	msg_init(&msg);
 	msg.event = UFFD_EVENT_PAGEFAULT;
 	msg.arg.pagefault.address = address;
+	/*
+	 * These flags indicate why the userfault occurred:
+	 * - UFFD_PAGEFAULT_FLAG_WP indicates a write protect fault.
+	 * - UFFD_PAGEFAULT_FLAG_MINOR indicates a minor fault.
+	 * - Neither of these flags being set indicates a MISSING fault.
+	 *
+	 * Separately, UFFD_PAGEFAULT_FLAG_WRITE indicates it was a write
+	 * fault. Otherwise, it was a read fault.
+	 */
 	if (flags & FAULT_FLAG_WRITE)
-		/*
-		 * If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the
-		 * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WRITE
-		 * was not set in a UFFD_EVENT_PAGEFAULT, it means it
-		 * was a read fault, otherwise if set it means it's
-		 * a write fault.
-		 */
 		msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE;
 	if (reason & VM_UFFD_WP)
-		/*
-		 * If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the
-		 * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WP was
-		 * not set in a UFFD_EVENT_PAGEFAULT, it means it was
-		 * a missing fault, otherwise if set it means it's a
-		 * write protect fault.
-		 */
 		msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
+	if (reason & VM_UFFD_MINOR)
+		msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_MINOR;
 	if (features & UFFD_FEATURE_THREAD_ID)
 		msg.arg.pagefault.feat.ptid = task_pid_vnr(current);
 	return msg;
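
With UFFD_PAGEFAULT_FLAG_MINOR added above, a monitor thread reading UFFD_EVENT_PAGEFAULT messages can distinguish three fault reasons plus the read/write direction. A minimal userspace sketch of the decoding (msg is assumed to have been read() from the userfaultfd descriptor):

	#include <linux/userfaultfd.h>

	static void classify_fault(const struct uffd_msg *msg)
	{
		__u64 flags = msg->arg.pagefault.flags;

		if (flags & UFFD_PAGEFAULT_FLAG_WP)
			;	/* write-protect fault: resolve via UFFDIO_WRITEPROTECT */
		else if (flags & UFFD_PAGEFAULT_FLAG_MINOR)
			;	/* minor fault: resolve via UFFDIO_CONTINUE */
		else
			;	/* missing fault: resolve via UFFDIO_COPY/UFFDIO_ZEROPAGE */

		/* orthogonal to the reason: was the access a write? */
		if (flags & UFFD_PAGEFAULT_FLAG_WRITE)
			;	/* write access, else read access */
	}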
@@ -236,7 +233,7 @@
 	pte_t *ptep, pte;
 	bool ret = true;

-	VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
+	mmap_assert_locked(mm);

 	ptep = huge_pte_offset(mm, address, vma_mmu_pagesize(vma));

@@ -288,7 +285,7 @@
 	pte_t *pte;
 	bool ret = true;

-	VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
+	mmap_assert_locked(mm);

 	pgd = pgd_offset(mm, address);
 	if (!pgd_present(*pgd))
@@ -316,8 +313,11 @@
 	if (!pmd_present(_pmd))
 		goto out;

-	if (pmd_trans_huge(_pmd))
+	if (pmd_trans_huge(_pmd)) {
+		if (!pmd_write(_pmd) && (reason & VM_UFFD_WP))
+			ret = true;
 		goto out;
+	}

 	/*
 	 * the pmd is stable (as in !pmd_trans_unstable) so we can re-read it
@@ -330,10 +330,23 @@
 	 */
 	if (pte_none(*pte))
 		ret = true;
+	if (!pte_write(*pte) && (reason & VM_UFFD_WP))
+		ret = true;
 	pte_unmap(pte);

 out:
 	return ret;
+}
+
+static inline long userfaultfd_get_blocking_state(unsigned int flags)
+{
+	if (flags & FAULT_FLAG_INTERRUPTIBLE)
+		return TASK_INTERRUPTIBLE;
+
+	if (flags & FAULT_FLAG_KILLABLE)
+		return TASK_KILLABLE;
+
+	return TASK_UNINTERRUPTIBLE;
 }

 /*
@@ -342,13 +355,13 @@
  * FAULT_FLAG_KILLABLE are not straightforward. The "Caution"
  * recommendation in __lock_page_or_retry is not an understatement.
  *
- * If FAULT_FLAG_ALLOW_RETRY is set, the mmap_sem must be released
+ * If FAULT_FLAG_ALLOW_RETRY is set, the mmap_lock must be released
  * before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is
  * not set.
  *
  * If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not
  * set, VM_FAULT_RETRY can still be returned if and only if there are
- * fatal_signal_pending()s, and the mmap_sem must be released before
+ * fatal_signal_pending()s, and the mmap_lock must be released before
  * returning it.
  */
 vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
@@ -357,7 +370,7 @@
 	struct userfaultfd_ctx *ctx;
 	struct userfaultfd_wait_queue uwq;
 	vm_fault_t ret = VM_FAULT_SIGBUS;
-	bool must_wait, return_to_userland;
+	bool must_wait;
 	long blocking_state;

 	/*
@@ -369,16 +382,16 @@
 	 * FOLL_DUMP case, anon memory also checks for FOLL_DUMP with
 	 * the no_page_table() helper in follow_page_mask(), but the
 	 * shmem_vm_ops->fault method is invoked even during
-	 * coredumping without mmap_sem and it ends up here.
+	 * coredumping without mmap_lock and it ends up here.
 	 */
 	if (current->flags & (PF_EXITING|PF_DUMPCORE))
 		goto out;

 	/*
-	 * Coredumping runs without mmap_sem so we can only check that
-	 * the mmap_sem is held, if PF_DUMPCORE was not set.
+	 * Coredumping runs without mmap_lock so we can only check that
+	 * the mmap_lock is held, if PF_DUMPCORE was not set.
 	 */
-	WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem));
+	mmap_assert_locked(mm);

 	ctx = vmf->vma->vm_userfaultfd_ctx.ctx;
 	if (!ctx)
@@ -386,16 +399,25 @@

 	BUG_ON(ctx->mm != mm);

-	VM_BUG_ON(reason & ~(VM_UFFD_MISSING|VM_UFFD_WP));
-	VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP));
+	/* Any unrecognized flag is a bug. */
+	VM_BUG_ON(reason & ~__VM_UFFD_FLAGS);
+	/* 0 or > 1 flags set is a bug; we expect exactly 1. */
+	VM_BUG_ON(!reason || (reason & (reason - 1)));

 	if (ctx->features & UFFD_FEATURE_SIGBUS)
 		goto out;
+	if ((vmf->flags & FAULT_FLAG_USER) == 0 &&
+	    ctx->flags & UFFD_USER_MODE_ONLY) {
+		printk_once(KERN_WARNING "uffd: Set unprivileged_userfaultfd "
+			"sysctl knob to 1 if kernel faults must be handled "
+			"without obtaining CAP_SYS_PTRACE capability\n");
+		goto out;
+	}

 	/*
 	 * If it's already released don't get it. This avoids to loop
 	 * in __get_user_pages if userfaultfd_release waits on the
-	 * caller of handle_userfault to release the mmap_sem.
+	 * caller of handle_userfault to release the mmap_lock.
 	 */
 	if (unlikely(READ_ONCE(ctx->released))) {
 		/*
@@ -454,7 +476,7 @@
 	if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
 		goto out;

-	/* take the reference before dropping the mmap_sem */
+	/* take the reference before dropping the mmap_lock */
 	userfaultfd_ctx_get(ctx);

 	init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
@@ -464,11 +486,7 @@
 	uwq.ctx = ctx;
 	uwq.waken = false;

-	return_to_userland =
-		(vmf->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) ==
-		(FAULT_FLAG_USER|FAULT_FLAG_KILLABLE);
-	blocking_state = return_to_userland ? TASK_INTERRUPTIBLE :
-			 TASK_KILLABLE;
+	blocking_state = userfaultfd_get_blocking_state(vmf->flags);

 	spin_lock_irq(&ctx->fault_pending_wqh.lock);
 	/*
@@ -491,64 +509,14 @@
 		must_wait = userfaultfd_huge_must_wait(ctx, vmf->vma,
 						       vmf->address,
 						       vmf->flags, reason);
-	up_read(&mm->mmap_sem);
+	mmap_read_unlock(mm);

-	if (likely(must_wait && !READ_ONCE(ctx->released) &&
-		   (return_to_userland ? !signal_pending(current) :
-		    !fatal_signal_pending(current)))) {
+	if (likely(must_wait && !READ_ONCE(ctx->released))) {
 		wake_up_poll(&ctx->fd_wqh, EPOLLIN);
 		schedule();
-		ret |= VM_FAULT_MAJOR;
-
-		/*
-		 * False wakeups can orginate even from rwsem before
-		 * up_read() however userfaults will wait either for a
-		 * targeted wakeup on the specific uwq waitqueue from
-		 * wake_userfault() or for signals or for uffd
-		 * release.
-		 */
-		while (!READ_ONCE(uwq.waken)) {
-			/*
-			 * This needs the full smp_store_mb()
-			 * guarantee as the state write must be
-			 * visible to other CPUs before reading
-			 * uwq.waken from other CPUs.
-			 */
-			set_current_state(blocking_state);
-			if (READ_ONCE(uwq.waken) ||
-			    READ_ONCE(ctx->released) ||
-			    (return_to_userland ? signal_pending(current) :
-			     fatal_signal_pending(current)))
-				break;
-			schedule();
-		}
 	}

 	__set_current_state(TASK_RUNNING);
-
-	if (return_to_userland) {
-		if (signal_pending(current) &&
-		    !fatal_signal_pending(current)) {
-			/*
-			 * If we got a SIGSTOP or SIGCONT and this is
-			 * a normal userland page fault, just let
-			 * userland return so the signal will be
-			 * handled and gdb debugging works. The page
-			 * fault code immediately after we return from
-			 * this function is going to release the
-			 * mmap_sem and it's not depending on it
-			 * (unlike gup would if we were not to return
-			 * VM_FAULT_RETRY).
-			 *
-			 * If a fatal signal is pending we still take
-			 * the streamlined VM_FAULT_RETRY failure path
-			 * and there's no need to retake the mmap_sem
-			 * in such case.
-			 */
-			down_read(&mm->mmap_sem);
-			ret = VM_FAULT_NOPAGE;
-		}
-	}

 	/*
 	 * Here we race with the list_del; list_add in
@@ -640,15 +608,13 @@
 		struct mm_struct *mm = release_new_ctx->mm;

 		/* the various vma->vm_userfaultfd_ctx still points to it */
-		down_write(&mm->mmap_sem);
-		/* no task can run (and in turn coredump) yet */
-		VM_WARN_ON(!mmget_still_valid(mm));
+		mmap_write_lock(mm);
 		for (vma = mm->mmap; vma; vma = vma->vm_next)
 			if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
 				vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
-				vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING);
+				vma->vm_flags &= ~__VM_UFFD_FLAGS;
 			}
-		up_write(&mm->mmap_sem);
+		mmap_write_unlock(mm);

 		userfaultfd_ctx_put(release_new_ctx);
 	}
@@ -677,8 +643,11 @@

 	octx = vma->vm_userfaultfd_ctx.ctx;
 	if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
+		vm_write_begin(vma);
 		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
-		vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING);
+		WRITE_ONCE(vma->vm_flags,
+			   vma->vm_flags & ~__VM_UFFD_FLAGS);
+		vm_write_end(vma);
 		return 0;
 	}

@@ -699,7 +668,7 @@
 		return -ENOMEM;
 	}

-	atomic_set(&ctx->refcount, 1);
+	refcount_set(&ctx->refcount, 1);
 	ctx->flags = octx->flags;
 	ctx->features = octx->features;
 	ctx->released = false;
@@ -759,7 +728,7 @@
 	} else {
 		/* Drop uffd context if remap feature not enabled */
 		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
-		vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING);
+		vma->vm_flags &= ~__VM_UFFD_FLAGS;
 	}
 }

@@ -801,7 +770,7 @@

 	userfaultfd_ctx_get(ctx);
 	WRITE_ONCE(ctx->mmap_changing, true);
-	up_read(&mm->mmap_sem);
+	mmap_read_unlock(mm);

 	msg_init(&ewq.msg);

@@ -881,7 +850,6 @@
 	/* len == 0 means wake all */
 	struct userfaultfd_wake_range range = { .len = 0, };
 	unsigned long new_flags;
-	bool still_valid;

 	WRITE_ONCE(ctx->released, true);

@@ -892,38 +860,37 @@
 	 * Flush page faults out of all CPUs. NOTE: all page faults
 	 * must be retried without returning VM_FAULT_SIGBUS if
 	 * userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx
-	 * changes while handle_userfault released the mmap_sem. So
+	 * changes while handle_userfault released the mmap_lock. So
 	 * it's critical that released is set to true (above), before
-	 * taking the mmap_sem for writing.
+	 * taking the mmap_lock for writing.
 	 */
-	down_write(&mm->mmap_sem);
-	still_valid = mmget_still_valid(mm);
+	mmap_write_lock(mm);
 	prev = NULL;
 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
 		cond_resched();
 		BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
-		       !!(vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
+		       !!(vma->vm_flags & __VM_UFFD_FLAGS));
 		if (vma->vm_userfaultfd_ctx.ctx != ctx) {
 			prev = vma;
 			continue;
 		}
-		new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
-		if (still_valid) {
-			prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end,
-					 new_flags, vma->anon_vma,
-					 vma->vm_file, vma->vm_pgoff,
-					 vma_policy(vma),
-					 NULL_VM_UFFD_CTX,
-					 vma_get_anon_name(vma));
-			if (prev)
-				vma = prev;
-			else
-				prev = vma;
-		}
-		vma->vm_flags = new_flags;
+		new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
+		prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end,
+				 new_flags, vma->anon_vma,
+				 vma->vm_file, vma->vm_pgoff,
+				 vma_policy(vma),
+				 NULL_VM_UFFD_CTX,
+				 vma_get_anon_name(vma));
+		if (prev)
+			vma = prev;
+		else
+			prev = vma;
+		vm_write_begin(vma);
+		WRITE_ONCE(vma->vm_flags, new_flags);
 		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+		vm_write_end(vma);
 	}
-	up_write(&mm->mmap_sem);
+	mmap_write_unlock(mm);
 	mmput(mm);
 wakeup:
 	/*
@@ -951,7 +918,7 @@
 	wait_queue_entry_t *wq;
 	struct userfaultfd_wait_queue *uwq;

-	VM_BUG_ON(!spin_is_locked(&wqh->lock));
+	lockdep_assert_held(&wqh->lock);

 	uwq = NULL;
 	if (!waitqueue_active(wqh))
@@ -1013,14 +980,14 @@

 static const struct file_operations userfaultfd_fops;

-static int resolve_userfault_fork(struct userfaultfd_ctx *ctx,
-				  struct userfaultfd_ctx *new,
+static int resolve_userfault_fork(struct userfaultfd_ctx *new,
+				  struct inode *inode,
 				  struct uffd_msg *msg)
 {
 	int fd;

-	fd = anon_inode_getfd("[userfaultfd]", &userfaultfd_fops, new,
-			      O_RDWR | (new->flags & UFFD_SHARED_FCNTL_FLAGS));
+	fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, new,
+			O_RDONLY | (new->flags & UFFD_SHARED_FCNTL_FLAGS), inode);
 	if (fd < 0)
 		return fd;

@@ -1030,7 +997,7 @@
 }

 static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
-				    struct uffd_msg *msg)
+				    struct uffd_msg *msg, struct inode *inode)
 {
 	ssize_t ret;
 	DECLARE_WAITQUEUE(wait, current);
@@ -1060,7 +1027,7 @@
 			 * waitqueue could become empty if this is the
 			 * only userfault.
 			 */
-			write_seqlock(&ctx->refile_seq);
+			write_seqcount_begin(&ctx->refile_seq);

 			/*
 			 * The fault_pending_wqh.lock prevents the uwq
@@ -1086,7 +1053,7 @@
 			list_del(&uwq->wq.entry);
 			add_wait_queue(&ctx->fault_wqh, &uwq->wq);

-			write_sequnlock(&ctx->refile_seq);
+			write_seqcount_end(&ctx->refile_seq);

 			/* careful to always initialize msg if ret == 0 */
 			*msg = uwq->msg;
@@ -1141,7 +1108,7 @@
 	spin_unlock_irq(&ctx->fd_wqh.lock);

 	if (!ret && msg->event == UFFD_EVENT_FORK) {
-		ret = resolve_userfault_fork(ctx, fork_nctx, msg);
+		ret = resolve_userfault_fork(fork_nctx, inode, msg);
 		spin_lock_irq(&ctx->event_wqh.lock);
 		if (!list_empty(&fork_event)) {
 			/*
@@ -1201,6 +1168,7 @@
 	ssize_t _ret, ret = 0;
 	struct uffd_msg msg;
 	int no_wait = file->f_flags & O_NONBLOCK;
+	struct inode *inode = file_inode(file);

 	if (!userfaultfd_is_initialized(ctx))
 		return -EINVAL;
@@ -1208,7 +1176,7 @@
 	for (;;) {
 		if (count < sizeof(msg))
 			return ret ? ret : -EINVAL;
-		_ret = userfaultfd_ctx_read(ctx, no_wait, &msg);
+		_ret = userfaultfd_ctx_read(ctx, no_wait, &msg, inode);
 		if (_ret < 0)
 			return ret ? ret : _ret;
 		if (copy_to_user((__u64 __user *) buf, &msg, sizeof(msg)))
@@ -1246,7 +1214,7 @@
 	/*
 	 * To be sure waitqueue_active() is not reordered by the CPU
 	 * before the pagetable update, use an explicit SMP memory
-	 * barrier here. PT lock release or up_read(mmap_sem) still
+	 * barrier here. PT lock release or mmap_read_unlock(mm) still
 	 * have release semantics that can allow the
 	 * waitqueue_active() to be reordered before the pte update.
 	 */
@@ -1259,41 +1227,51 @@
 	 * sure we've userfaults to wake.
 	 */
 	do {
-		seq = read_seqbegin(&ctx->refile_seq);
+		seq = read_seqcount_begin(&ctx->refile_seq);
 		need_wakeup = waitqueue_active(&ctx->fault_pending_wqh) ||
 			waitqueue_active(&ctx->fault_wqh);
 		cond_resched();
-	} while (read_seqretry(&ctx->refile_seq, seq));
+	} while (read_seqcount_retry(&ctx->refile_seq, seq));
 	if (need_wakeup)
 		__wake_userfault(ctx, range);
 }

 static __always_inline int validate_range(struct mm_struct *mm,
-					  __u64 *start, __u64 len)
+					  __u64 start, __u64 len)
 {
 	__u64 task_size = mm->task_size;

-	*start = untagged_addr(*start);
-
-	if (*start & ~PAGE_MASK)
+	if (start & ~PAGE_MASK)
 		return -EINVAL;
 	if (len & ~PAGE_MASK)
 		return -EINVAL;
 	if (!len)
 		return -EINVAL;
-	if (*start < mmap_min_addr)
+	if (start < mmap_min_addr)
 		return -EINVAL;
-	if (*start >= task_size)
+	if (start >= task_size)
 		return -EINVAL;
-	if (len > task_size - *start)
+	if (len > task_size - start)
 		return -EINVAL;
 	return 0;
 }

-static inline bool vma_can_userfault(struct vm_area_struct *vma)
+static inline bool vma_can_userfault(struct vm_area_struct *vma,
+				     unsigned long vm_flags)
 {
+	/* FIXME: add WP support to hugetlbfs and shmem */
+	if (vm_flags & VM_UFFD_WP) {
+		if (is_vm_hugetlb_page(vma) || vma_is_shmem(vma))
+			return false;
+	}
+
+	if (vm_flags & VM_UFFD_MINOR) {
+		if (!(is_vm_hugetlb_page(vma) || vma_is_shmem(vma)))
+			return false;
+	}
+
 	return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) ||
-	       vma_is_shmem(vma);
+		vma_is_shmem(vma);
 }

 static int userfaultfd_register(struct userfaultfd_ctx *ctx,
@@ -1319,23 +1297,21 @@
 	ret = -EINVAL;
 	if (!uffdio_register.mode)
 		goto out;
-	if (uffdio_register.mode & ~(UFFDIO_REGISTER_MODE_MISSING|
-				     UFFDIO_REGISTER_MODE_WP))
+	if (uffdio_register.mode & ~UFFD_API_REGISTER_MODES)
 		goto out;
 	vm_flags = 0;
 	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
 		vm_flags |= VM_UFFD_MISSING;
-	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
+	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP)
 		vm_flags |= VM_UFFD_WP;
-		/*
-		 * FIXME: remove the below error constraint by
-		 * implementing the wprotect tracking mode.
-		 */
-		ret = -EINVAL;
+	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) {
+#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
 		goto out;
+#endif
+		vm_flags |= VM_UFFD_MINOR;
 	}

-	ret = validate_range(mm, &uffdio_register.range.start,
+	ret = validate_range(mm, uffdio_register.range.start,
 			     uffdio_register.range.len);
 	if (ret)
 		goto out;
@@ -1347,9 +1323,7 @@
 	if (!mmget_not_zero(mm))
 		goto out;

-	down_write(&mm->mmap_sem);
-	if (!mmget_still_valid(mm))
-		goto out_unlock;
+	mmap_write_lock(mm);
 	vma = find_vma_prev(mm, start, &prev);
 	if (!vma)
 		goto out_unlock;
@@ -1379,11 +1353,11 @@
 		cond_resched();

 		BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
-		       !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
+		       !!(cur->vm_flags & __VM_UFFD_FLAGS));

 		/* check not compatible vmas */
 		ret = -EINVAL;
-		if (!vma_can_userfault(cur))
+		if (!vma_can_userfault(cur, vm_flags))
 			goto out_unlock;

 		/*
@@ -1411,6 +1385,8 @@
 			if (end & (vma_hpagesize - 1))
 				goto out_unlock;
 		}
+		if ((vm_flags & VM_UFFD_WP) && !(cur->vm_flags & VM_MAYWRITE))
+			goto out_unlock;

 		/*
 		 * Check that this vma isn't already owned by a
@@ -1440,7 +1416,7 @@
 	do {
 		cond_resched();

-		BUG_ON(!vma_can_userfault(vma));
+		BUG_ON(!vma_can_userfault(vma, vm_flags));
 		BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
 		       vma->vm_userfaultfd_ctx.ctx != ctx);
 		WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
@@ -1457,7 +1433,7 @@
 		start = vma->vm_start;
 		vma_end = min(end, vma->vm_end);

-		new_flags = (vma->vm_flags & ~vm_flags) | vm_flags;
+		new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
 		prev = vma_merge(mm, prev, start, vma_end, new_flags,
 				 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
 				 vma_policy(vma),
@@ -1483,8 +1459,13 @@
 		 * the next vma was merged into the current one and
 		 * the current one has not been updated yet.
 		 */
-		vma->vm_flags = new_flags;
+		vm_write_begin(vma);
+		WRITE_ONCE(vma->vm_flags, new_flags);
 		vma->vm_userfaultfd_ctx.ctx = ctx;
+		vm_write_end(vma);
+
+		if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
+			hugetlb_unshare_all_pmds(vma);

 	skip:
 		prev = vma;
@@ -1492,17 +1473,31 @@
 		vma = vma->vm_next;
 	} while (vma && vma->vm_start < end);
 out_unlock:
-	up_write(&mm->mmap_sem);
+	mmap_write_unlock(mm);
 	mmput(mm);
 	if (!ret) {
+		__u64 ioctls_out;
+
+		ioctls_out = basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC :
+		    UFFD_API_RANGE_IOCTLS;
+
+		/*
+		 * Declare the WP ioctl only if the WP mode is
+		 * specified and all checks passed with the range
+		 */
+		if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_WP))
+			ioctls_out &= ~((__u64)1 << _UFFDIO_WRITEPROTECT);
+
+		/* CONTINUE ioctl is only supported for MINOR ranges. */
+		if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR))
+			ioctls_out &= ~((__u64)1 << _UFFDIO_CONTINUE);
+
 		/*
 		 * Now that we scanned all vmas we can already tell
 		 * userland which ioctls methods are guaranteed to
 		 * succeed on this range.
 		 */
-		if (put_user(basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC :
-			     UFFD_API_RANGE_IOCTLS,
-			     &user_uffdio_register->ioctls))
+		if (put_user(ioctls_out, &user_uffdio_register->ioctls))
 			ret = -EFAULT;
 	}
 out:
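
The ioctls_out handshake above is visible to userspace through the uffdio_register.ioctls output field: _UFFDIO_WRITEPROTECT and _UFFDIO_CONTINUE are only advertised when the corresponding mode was requested and validated for the whole range. A minimal caller-side sketch (uffd is an already-initialized userfaultfd descriptor; addr and len are assumptions; error handling elided):

	#include <sys/ioctl.h>
	#include <linux/userfaultfd.h>

	struct uffdio_register reg = {
		.range = { .start = (__u64)(unsigned long)addr, .len = len },
		.mode  = UFFDIO_REGISTER_MODE_MISSING | UFFDIO_REGISTER_MODE_WP,
	};

	if (ioctl(uffd, UFFDIO_REGISTER, &reg) == -1)
		/* handle error */;
	if (!(reg.ioctls & ((__u64)1 << _UFFDIO_WRITEPROTECT)))
		/* the kernel will not accept UFFDIO_WRITEPROTECT here */;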
@@ -1525,7 +1520,7 @@
 	if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
 		goto out;

-	ret = validate_range(mm, &uffdio_unregister.start,
+	ret = validate_range(mm, uffdio_unregister.start,
 			     uffdio_unregister.len);
 	if (ret)
 		goto out;
@@ -1537,9 +1532,7 @@
 	if (!mmget_not_zero(mm))
 		goto out;

-	down_write(&mm->mmap_sem);
-	if (!mmget_still_valid(mm))
-		goto out_unlock;
+	mmap_write_lock(mm);
 	vma = find_vma_prev(mm, start, &prev);
 	if (!vma)
 		goto out_unlock;
@@ -1569,7 +1562,7 @@
 		cond_resched();

 		BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
-		       !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
+		       !!(cur->vm_flags & __VM_UFFD_FLAGS));

 		/*
 		 * Check not compatible vmas, not strictly required
@@ -1578,7 +1571,7 @@
 		 * provides for more strict behavior to notice
 		 * unregistration errors.
 		 */
-		if (!vma_can_userfault(cur))
+		if (!vma_can_userfault(cur, cur->vm_flags))
 			goto out_unlock;

 		found = true;
@@ -1592,7 +1585,7 @@
 	do {
 		cond_resched();

-		BUG_ON(!vma_can_userfault(vma));
+		BUG_ON(!vma_can_userfault(vma, vma->vm_flags));

 		/*
 		 * Nothing to do: this vma is already registered into this
@@ -1620,7 +1613,7 @@
 			wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
 		}

-		new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
+		new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
 		prev = vma_merge(mm, prev, start, vma_end, new_flags,
 				 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
 				 vma_policy(vma),
@@ -1646,8 +1639,10 @@
 		 * the next vma was merged into the current one and
 		 * the current one has not been updated yet.
 		 */
-		vma->vm_flags = new_flags;
+		vm_write_begin(vma);
+		WRITE_ONCE(vma->vm_flags, new_flags);
 		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+		vm_write_end(vma);

 	skip:
 		prev = vma;
@@ -1655,7 +1650,7 @@
 		vma = vma->vm_next;
 	} while (vma && vma->vm_start < end);
 out_unlock:
-	up_write(&mm->mmap_sem);
+	mmap_write_unlock(mm);
 	mmput(mm);
 out:
 	return ret;
@@ -1677,7 +1672,7 @@
 	if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake)))
 		goto out;

-	ret = validate_range(ctx->mm, &uffdio_wake.start, uffdio_wake.len);
+	ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len);
 	if (ret)
 		goto out;

@@ -1717,7 +1712,7 @@
 			   sizeof(uffdio_copy)-sizeof(__s64)))
 		goto out;

-	ret = validate_range(ctx->mm, &uffdio_copy.dst, uffdio_copy.len);
+	ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len);
 	if (ret)
 		goto out;
 	/*
@@ -1728,11 +1723,12 @@
 	ret = -EINVAL;
 	if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src)
 		goto out;
-	if (uffdio_copy.mode & ~UFFDIO_COPY_MODE_DONTWAKE)
+	if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP))
 		goto out;
 	if (mmget_not_zero(ctx->mm)) {
 		ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
-				   uffdio_copy.len, &ctx->mmap_changing);
+				   uffdio_copy.len, &ctx->mmap_changing,
+				   uffdio_copy.mode);
 		mmput(ctx->mm);
 	} else {
 		return -ESRCH;
@@ -1773,7 +1769,7 @@
 			   sizeof(uffdio_zeropage)-sizeof(__s64)))
 		goto out;

-	ret = validate_range(ctx->mm, &uffdio_zeropage.range.start,
+	ret = validate_range(ctx->mm, uffdio_zeropage.range.start,
 			     uffdio_zeropage.range.len);
 	if (ret)
 		goto out;
@@ -1801,6 +1797,119 @@
 		wake_userfault(ctx, &range);
 	}
 	ret = range.len == uffdio_zeropage.range.len ? 0 : -EAGAIN;
+out:
+	return ret;
+}
+
+static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
+				    unsigned long arg)
+{
+	int ret;
+	struct uffdio_writeprotect uffdio_wp;
+	struct uffdio_writeprotect __user *user_uffdio_wp;
+	struct userfaultfd_wake_range range;
+	bool mode_wp, mode_dontwake;
+
+	if (READ_ONCE(ctx->mmap_changing))
+		return -EAGAIN;
+
+	user_uffdio_wp = (struct uffdio_writeprotect __user *) arg;
+
+	if (copy_from_user(&uffdio_wp, user_uffdio_wp,
+			   sizeof(struct uffdio_writeprotect)))
+		return -EFAULT;
+
+	ret = validate_range(ctx->mm, uffdio_wp.range.start,
+			     uffdio_wp.range.len);
+	if (ret)
+		return ret;
+
+	if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE |
+			       UFFDIO_WRITEPROTECT_MODE_WP))
+		return -EINVAL;
+
+	mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP;
+	mode_dontwake = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE;
+
+	if (mode_wp && mode_dontwake)
+		return -EINVAL;
+
+	if (mmget_not_zero(ctx->mm)) {
+		ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
+					  uffdio_wp.range.len, mode_wp,
+					  &ctx->mmap_changing);
+		mmput(ctx->mm);
+	} else {
+		return -ESRCH;
+	}
+
+	if (ret)
+		return ret;
+
+	if (!mode_wp && !mode_dontwake) {
+		range.start = uffdio_wp.range.start;
+		range.len = uffdio_wp.range.len;
+		wake_userfault(ctx, &range);
+	}
+	return ret;
+}
+
+static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
+{
+	__s64 ret;
+	struct uffdio_continue uffdio_continue;
+	struct uffdio_continue __user *user_uffdio_continue;
+	struct userfaultfd_wake_range range;
+
+	user_uffdio_continue = (struct uffdio_continue __user *)arg;
+
+	ret = -EAGAIN;
+	if (READ_ONCE(ctx->mmap_changing))
+		goto out;
+
+	ret = -EFAULT;
+	if (copy_from_user(&uffdio_continue, user_uffdio_continue,
+			   /* don't copy the output fields */
+			   sizeof(uffdio_continue) - (sizeof(__s64))))
+		goto out;
+
+	ret = validate_range(ctx->mm, uffdio_continue.range.start,
+			     uffdio_continue.range.len);
+	if (ret)
+		goto out;
+
+	ret = -EINVAL;
+	/* double check for wraparound just in case. */
+	if (uffdio_continue.range.start + uffdio_continue.range.len <=
+	    uffdio_continue.range.start) {
+		goto out;
+	}
+	if (uffdio_continue.mode & ~UFFDIO_CONTINUE_MODE_DONTWAKE)
+		goto out;
+
+	if (mmget_not_zero(ctx->mm)) {
+		ret = mcopy_continue(ctx->mm, uffdio_continue.range.start,
+				     uffdio_continue.range.len,
+				     &ctx->mmap_changing);
+		mmput(ctx->mm);
+	} else {
+		return -ESRCH;
+	}
+
+	if (unlikely(put_user(ret, &user_uffdio_continue->mapped)))
+		return -EFAULT;
+	if (ret < 0)
+		goto out;
+
+	/* len == 0 would wake all */
+	BUG_ON(!ret);
+	range.len = ret;
+	if (!(uffdio_continue.mode & UFFDIO_CONTINUE_MODE_DONTWAKE)) {
+		range.start = uffdio_continue.range.start;
+		wake_userfault(ctx, &range);
+	}
+	ret = range.len == uffdio_continue.range.len ? 0 : -EAGAIN;
+
 out:
 	return ret;
 }
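
The two handlers added above are the resolution paths for the new fault kinds. A hedged userspace sketch of both (uffd, addr and len are assumptions, taken from an earlier UFFDIO_REGISTER with the matching modes; error handling elided):

	/* resolve a write-protect fault: clear the protection and wake */
	struct uffdio_writeprotect wp = {
		.range = { .start = (__u64)(unsigned long)addr, .len = len },
		.mode  = 0,	/* no _MODE_WP: unprotect; no _MODE_DONTWAKE: wake */
	};
	ioctl(uffd, UFFDIO_WRITEPROTECT, &wp);

	/* resolve a minor fault: the page cache is already populated, so
	 * just install the page table entries and wake the faulting thread */
	struct uffdio_continue cont = {
		.range = { .start = (__u64)(unsigned long)addr, .len = len },
		.mode  = 0,
	};
	ioctl(uffd, UFFDIO_CONTINUE, &cont);

Note the mode_wp && mode_dontwake rejection in userfaultfd_writeprotect(): DONTWAKE only makes sense when removing protection to resolve a fault, not when (re)protecting a range.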
@@ -1840,6 +1949,10 @@
 		goto err_out;
 	/* report all available features and ioctls to userland */
 	uffdio_api.features = UFFD_API_FEATURES;
+#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
+	uffdio_api.features &=
+		~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM);
+#endif
 	uffdio_api.ioctls = UFFD_API_IOCTLS;
 	ret = -EFAULT;
 	if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
@@ -1889,6 +2002,12 @@
 	case UFFDIO_ZEROPAGE:
 		ret = userfaultfd_zeropage(ctx, arg);
 		break;
+	case UFFDIO_WRITEPROTECT:
+		ret = userfaultfd_writeprotect(ctx, arg);
+		break;
+	case UFFDIO_CONTINUE:
+		ret = userfaultfd_continue(ctx, arg);
+		break;
 	}
 	return ret;
 }
@@ -1929,7 +2048,7 @@
 	.poll		= userfaultfd_poll,
 	.read		= userfaultfd_read,
 	.unlocked_ioctl = userfaultfd_ioctl,
-	.compat_ioctl	= userfaultfd_ioctl,
+	.compat_ioctl	= compat_ptr_ioctl,
 	.llseek		= noop_llseek,
 };

@@ -1941,7 +2060,7 @@
 	init_waitqueue_head(&ctx->fault_wqh);
 	init_waitqueue_head(&ctx->event_wqh);
 	init_waitqueue_head(&ctx->fd_wqh);
-	seqlock_init(&ctx->refile_seq);
+	seqcount_spinlock_init(&ctx->refile_seq, &ctx->fault_pending_wqh.lock);
 }

 SYSCALL_DEFINE1(userfaultfd, int, flags)
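
The seqlock_t to seqcount_spinlock_t switch works because every writer of refile_seq already runs under fault_pending_wqh.lock; seqcount_spinlock_init() records that association so lockdep can verify it. The resulting pattern, summarizing the write_seqcount_begin/end and read_seqcount_begin/retry sites earlier in this diff:

	unsigned int seq;

	/* writer: the associated spinlock provides the mutual exclusion */
	spin_lock_irq(&ctx->fault_pending_wqh.lock);
	write_seqcount_begin(&ctx->refile_seq);
	/* ... refile the waitqueue entry ... */
	write_seqcount_end(&ctx->refile_seq);
	spin_unlock_irq(&ctx->fault_pending_wqh.lock);

	/* lockless reader: retry if a refile raced with the read */
	do {
		seq = read_seqcount_begin(&ctx->refile_seq);
		/* ... inspect waitqueue state ... */
	} while (read_seqcount_retry(&ctx->refile_seq, seq));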
@@ -1949,20 +2068,30 @@
 	struct userfaultfd_ctx *ctx;
 	int fd;

+	if (!sysctl_unprivileged_userfaultfd &&
+	    (flags & UFFD_USER_MODE_ONLY) == 0 &&
+	    !capable(CAP_SYS_PTRACE)) {
+		printk_once(KERN_WARNING "uffd: Set unprivileged_userfaultfd "
+			"sysctl knob to 1 if kernel faults must be handled "
+			"without obtaining CAP_SYS_PTRACE capability\n");
+		return -EPERM;
+	}
+
 	BUG_ON(!current->mm);

 	/* Check the UFFD_* constants for consistency.  */
+	BUILD_BUG_ON(UFFD_USER_MODE_ONLY & UFFD_SHARED_FCNTL_FLAGS);
 	BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC);
 	BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK);

-	if (flags & ~UFFD_SHARED_FCNTL_FLAGS)
+	if (flags & ~(UFFD_SHARED_FCNTL_FLAGS | UFFD_USER_MODE_ONLY))
 		return -EINVAL;

 	ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
 	if (!ctx)
 		return -ENOMEM;

-	atomic_set(&ctx->refcount, 1);
+	refcount_set(&ctx->refcount, 1);
 	ctx->flags = flags;
 	ctx->features = 0;
 	ctx->released = false;
@@ -1971,8 +2100,8 @@
 	/* prevent the mm struct to be freed */
 	mmgrab(ctx->mm);

-	fd = anon_inode_getfd("[userfaultfd]", &userfaultfd_fops, ctx,
-			      O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS));
+	fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, ctx,
+			O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL);
 	if (fd < 0) {
 		mmdrop(ctx->mm);
 		kmem_cache_free(userfaultfd_ctx_cachep, ctx);
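
With the gating above, an unprivileged process (no CAP_SYS_PTRACE, vm.unprivileged_userfaultfd left at 0) can still create a descriptor that handles user-mode faults only. A minimal sketch (glibc provides no wrapper, so the raw syscall is used):

	#include <fcntl.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <linux/userfaultfd.h>

	int uffd = syscall(__NR_userfaultfd,
			   O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY);
	/* older kernels reject the unknown flag with EINVAL */

On such a descriptor, faults triggered from kernel context (e.g. a syscall writing into the registered range) are not delivered: the FAULT_FLAG_USER check added to handle_userfault() earlier in this patch makes them fall through to VM_FAULT_SIGBUS.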