2024-05-13 9d77db3c730780c8ef5ccd4b66403ff5675cfe4e
kernel/mm/mprotect.c
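
Editorial note (not part of the patch): the diff below replaces the old (dirty_accountable, prot_numa) int pair with a single cp_flags bitmask that change_pte_range() decodes, and it wires userfaultfd write-protect through MM_CP_UFFD_WP / MM_CP_UFFD_WP_RESOLVE (mutually exclusive, see the BUG_ON on MM_CP_UFFD_WP_ALL). A minimal caller-side sketch follows, using only names visible in this patch; the local variables around the flags are illustrative, not taken from the kernel:

	/* illustrative only: build cp_flags and call the new interface */
	unsigned long cp_flags = 0;

	if (dirty_accountable)			/* e.g. the mprotect_fixup() path */
		cp_flags |= MM_CP_DIRTY_ACCT;
	if (prot_numa)				/* e.g. NUMA hinting updates */
		cp_flags |= MM_CP_PROT_NUMA;
	if (enable_uffd_wp)			/* hypothetical caller flag */
		cp_flags |= MM_CP_UFFD_WP;	/* exclusive with WP_RESOLVE */
	else if (resolve_uffd_wp)		/* hypothetical caller flag */
		cp_flags |= MM_CP_UFFD_WP_RESOLVE;

	pages = change_protection(vma, start, end, vma->vm_page_prot, cp_flags);
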
@@ -9,7 +9,7 @@
  * (C) Copyright 2002 Red Hat Inc, All Rights Reserved
  */
 
-#include <linux/mm.h>
+#include <linux/pagewalk.h>
 #include <linux/hugetlb.h>
 #include <linux/shm.h>
 #include <linux/mman.h>
@@ -28,7 +28,7 @@
 #include <linux/ksm.h>
 #include <linux/uaccess.h>
 #include <linux/mm_inline.h>
-#include <asm/pgtable.h>
+#include <linux/pgtable.h>
 #include <asm/cacheflush.h>
 #include <asm/mmu_context.h>
 #include <asm/tlbflush.h>
@@ -37,16 +37,19 @@
 
 static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		unsigned long addr, unsigned long end, pgprot_t newprot,
-		int dirty_accountable, int prot_numa)
+		unsigned long cp_flags)
 {
-	struct mm_struct *mm = vma->vm_mm;
 	pte_t *pte, oldpte;
 	spinlock_t *ptl;
 	unsigned long pages = 0;
 	int target_node = NUMA_NO_NODE;
+	bool dirty_accountable = cp_flags & MM_CP_DIRTY_ACCT;
+	bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
+	bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
+	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
 
 	/*
-	 * Can be called with only the mmap_sem for reading by
+	 * Can be called with only the mmap_lock for reading by
 	 * prot_numa so we must check the pmd isn't constantly
 	 * changing from under us from pmd_none to pmd_trans_huge
 	 * and/or the other way around.
@@ -56,7 +59,7 @@
 
 	/*
 	 * The pmd points to a regular pte so the pmd can't change
-	 * from under us even if the mmap_sem is only hold for
+	 * from under us even if the mmap_lock is only hold for
 	 * reading.
 	 */
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
@@ -81,13 +84,17 @@
 			if (prot_numa) {
 				struct page *page;
 
+				/* Avoid TLB flush if possible */
+				if (pte_protnone(oldpte))
+					continue;
+
 				page = vm_normal_page(vma, addr, oldpte);
 				if (!page || PageKsm(page))
					continue;
 
 				/* Also skip shared copy-on-write pages */
 				if (is_cow_mapping(vma->vm_flags) &&
-				    page_mapcount(page) != 1)
+				    page_count(page) != 1)
 					continue;
 
 				/*
@@ -95,11 +102,7 @@
 				 * it cannot move them all from MIGRATE_ASYNC
 				 * context.
 				 */
-				if (page_is_file_cache(page) && PageDirty(page))
-					continue;
-
-				/* Avoid TLB flush if possible */
-				if (pte_protnone(oldpte))
+				if (page_is_file_lru(page) && PageDirty(page))
 					continue;
 
 				/*
@@ -110,10 +113,23 @@
 					continue;
 			}
 
-			ptent = ptep_modify_prot_start(mm, addr, pte);
-			ptent = pte_modify(ptent, newprot);
+			oldpte = ptep_modify_prot_start(vma, addr, pte);
+			ptent = pte_modify(oldpte, newprot);
 			if (preserve_write)
 				ptent = pte_mk_savedwrite(ptent);
+
+			if (uffd_wp) {
+				ptent = pte_wrprotect(ptent);
+				ptent = pte_mkuffd_wp(ptent);
+			} else if (uffd_wp_resolve) {
+				/*
+				 * Leave the write bit to be handled
+				 * by PF interrupt handler, then
+				 * things like COW could be properly
+				 * handled.
+				 */
+				ptent = pte_clear_uffd_wp(ptent);
+			}
 
 			/* Avoid taking write faults for known dirty pages */
 			if (dirty_accountable && pte_dirty(ptent) &&
@@ -121,13 +137,13 @@
 					 !(vma->vm_flags & VM_SOFTDIRTY))) {
 				ptent = pte_mkwrite(ptent);
 			}
-			ptep_modify_prot_commit(mm, addr, pte, ptent);
+			ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent);
 			pages++;
-		} else if (IS_ENABLED(CONFIG_MIGRATION)) {
+		} else if (is_swap_pte(oldpte)) {
 			swp_entry_t entry = pte_to_swp_entry(oldpte);
+			pte_t newpte;
 
 			if (is_write_migration_entry(entry)) {
-				pte_t newpte;
 				/*
 				 * A protection check is difficult so
 				 * just be safe and disable write
@@ -136,22 +152,28 @@
 				newpte = swp_entry_to_pte(entry);
 				if (pte_swp_soft_dirty(oldpte))
 					newpte = pte_swp_mksoft_dirty(newpte);
-				set_pte_at(mm, addr, pte, newpte);
-
-				pages++;
-			}
-
-			if (is_write_device_private_entry(entry)) {
-				pte_t newpte;
-
+				if (pte_swp_uffd_wp(oldpte))
+					newpte = pte_swp_mkuffd_wp(newpte);
+			} else if (is_write_device_private_entry(entry)) {
 				/*
 				 * We do not preserve soft-dirtiness. See
 				 * copy_one_pte() for explanation.
 				 */
 				make_device_private_entry_read(&entry);
 				newpte = swp_entry_to_pte(entry);
-				set_pte_at(mm, addr, pte, newpte);
+				if (pte_swp_uffd_wp(oldpte))
+					newpte = pte_swp_mkuffd_wp(newpte);
+			} else {
+				newpte = oldpte;
+			}
 
+			if (uffd_wp)
+				newpte = pte_swp_mkuffd_wp(newpte);
+			else if (uffd_wp_resolve)
+				newpte = pte_swp_clear_uffd_wp(newpte);
+
+			if (!pte_same(oldpte, newpte)) {
+				set_pte_at(vma->vm_mm, addr, pte, newpte);
 				pages++;
 			}
 		}
@@ -189,14 +211,15 @@
 
 static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
 		pud_t *pud, unsigned long addr, unsigned long end,
-		pgprot_t newprot, int dirty_accountable, int prot_numa)
+		pgprot_t newprot, unsigned long cp_flags)
 {
 	pmd_t *pmd;
-	struct mm_struct *mm = vma->vm_mm;
 	unsigned long next;
 	unsigned long pages = 0;
 	unsigned long nr_huge_updates = 0;
-	unsigned long mni_start = 0;
+	struct mmu_notifier_range range;
+
+	range.start = 0;
 
 	pmd = pmd_offset(pud, addr);
 	do {
@@ -205,7 +228,7 @@
 		next = pmd_addr_end(addr, end);
 
 		/*
-		 * Automatic NUMA balancing walks the tables with mmap_sem
+		 * Automatic NUMA balancing walks the tables with mmap_lock
 		 * held for read. It's possible a parallel update to occur
 		 * between pmd_trans_huge() and a pmd_none_or_clear_bad()
 		 * check leading to a false positive and clearing.
@@ -217,9 +240,11 @@
 			goto next;
 
 		/* invoke the mmu notifier if the pmd is populated */
-		if (!mni_start) {
-			mni_start = addr;
-			mmu_notifier_invalidate_range_start(mm, mni_start, end);
+		if (!range.start) {
+			mmu_notifier_range_init(&range,
+				MMU_NOTIFY_PROTECTION_VMA, 0,
+				vma, vma->vm_mm, addr, end);
+			mmu_notifier_invalidate_range_start(&range);
 		}
 
 		if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
@@ -227,7 +252,7 @@
 				__split_huge_pmd(vma, pmd, addr, false, NULL);
 			} else {
 				int nr_ptes = change_huge_pmd(vma, pmd, addr,
-						newprot, prot_numa);
+						newprot, cp_flags);
 
 				if (nr_ptes) {
 					if (nr_ptes == HPAGE_PMD_NR) {
@@ -242,14 +267,14 @@
 			/* fall through, the trans huge pmd just split */
 		}
 		this_pages = change_pte_range(vma, pmd, addr, next, newprot,
-				dirty_accountable, prot_numa);
+				cp_flags);
 		pages += this_pages;
next:
 		cond_resched();
 	} while (pmd++, addr = next, addr != end);
 
-	if (mni_start)
-		mmu_notifier_invalidate_range_end(mm, mni_start, end);
+	if (range.start)
+		mmu_notifier_invalidate_range_end(&range);
 
 	if (nr_huge_updates)
 		count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates);
@@ -258,7 +283,7 @@
 
 static inline unsigned long change_pud_range(struct vm_area_struct *vma,
 		p4d_t *p4d, unsigned long addr, unsigned long end,
-		pgprot_t newprot, int dirty_accountable, int prot_numa)
+		pgprot_t newprot, unsigned long cp_flags)
 {
 	pud_t *pud;
 	unsigned long next;
@@ -270,7 +295,7 @@
 		if (pud_none_or_clear_bad(pud))
 			continue;
 		pages += change_pmd_range(vma, pud, addr, next, newprot,
-				dirty_accountable, prot_numa);
+				cp_flags);
 	} while (pud++, addr = next, addr != end);
 
 	return pages;
@@ -278,7 +303,7 @@
 
 static inline unsigned long change_p4d_range(struct vm_area_struct *vma,
 		pgd_t *pgd, unsigned long addr, unsigned long end,
-		pgprot_t newprot, int dirty_accountable, int prot_numa)
+		pgprot_t newprot, unsigned long cp_flags)
 {
 	p4d_t *p4d;
 	unsigned long next;
@@ -290,7 +315,7 @@
 		if (p4d_none_or_clear_bad(p4d))
 			continue;
 		pages += change_pud_range(vma, p4d, addr, next, newprot,
-				dirty_accountable, prot_numa);
+				cp_flags);
 	} while (p4d++, addr = next, addr != end);
 
 	return pages;
@@ -298,7 +323,7 @@
 
 static unsigned long change_protection_range(struct vm_area_struct *vma,
 		unsigned long addr, unsigned long end, pgprot_t newprot,
-		int dirty_accountable, int prot_numa)
+		unsigned long cp_flags)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	pgd_t *pgd;
@@ -315,7 +340,7 @@
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
 		pages += change_p4d_range(vma, pgd, addr, next, newprot,
-				dirty_accountable, prot_numa);
+				cp_flags);
 	} while (pgd++, addr = next, addr != end);
 
 	/* Only flush the TLB if we actually modified any entries: */
@@ -328,14 +353,17 @@
 
 unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
 		unsigned long end, pgprot_t newprot,
-		int dirty_accountable, int prot_numa)
+		unsigned long cp_flags)
 {
 	unsigned long pages;
+
+	BUG_ON((cp_flags & MM_CP_UFFD_WP_ALL) == MM_CP_UFFD_WP_ALL);
 
 	if (is_vm_hugetlb_page(vma))
 		pages = hugetlb_change_protection(vma, start, end, newprot);
 	else
-		pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa);
+		pages = change_protection_range(vma, start, end, newprot,
+						cp_flags);
 
 	return pages;
 }
@@ -361,20 +389,11 @@
 	return 0;
 }
 
-static int prot_none_walk(struct vm_area_struct *vma, unsigned long start,
-		unsigned long end, unsigned long newflags)
-{
-	pgprot_t new_pgprot = vm_get_page_prot(newflags);
-	struct mm_walk prot_none_walk = {
-		.pte_entry = prot_none_pte_entry,
-		.hugetlb_entry = prot_none_hugetlb_entry,
-		.test_walk = prot_none_test,
-		.mm = current->mm,
-		.private = &new_pgprot,
-	};
-
-	return walk_page_range(start, end, &prot_none_walk);
-}
+static const struct mm_walk_ops prot_none_walk_ops = {
+	.pte_entry = prot_none_pte_entry,
+	.hugetlb_entry = prot_none_hugetlb_entry,
+	.test_walk = prot_none_test,
+};
 
 int
 mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
@@ -400,8 +419,11 @@
 	 */
 	if (arch_has_pfn_modify_check() &&
 	    (vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
-	    (newflags & (VM_READ|VM_WRITE|VM_EXEC)) == 0) {
-		error = prot_none_walk(vma, start, end, newflags);
+	    (newflags & VM_ACCESS_FLAGS) == 0) {
+		pgprot_t new_pgprot = vm_get_page_prot(newflags);
+
+		error = walk_page_range(current->mm, start, end,
+				&prot_none_walk_ops, &new_pgprot);
 		if (error)
 			return error;
 	}
@@ -455,15 +477,17 @@
 
 success:
 	/*
-	 * vm_flags and vm_page_prot are protected by the mmap_sem
+	 * vm_flags and vm_page_prot are protected by the mmap_lock
 	 * held in write mode.
 	 */
-	vma->vm_flags = newflags;
+	vm_write_begin(vma);
+	WRITE_ONCE(vma->vm_flags, newflags);
 	dirty_accountable = vma_wants_writenotify(vma, vma->vm_page_prot);
 	vma_set_page_prot(vma);
 
 	change_protection(vma, start, end, vma->vm_page_prot,
-			dirty_accountable, 0);
+			dirty_accountable ? MM_CP_DIRTY_ACCT : 0);
+	vm_write_end(vma);
 
 	/*
 	 * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major
@@ -516,7 +540,7 @@
 
 	reqprot = prot;
 
-	if (down_write_killable(&current->mm->mmap_sem))
+	if (mmap_write_lock_killable(current->mm))
 		return -EINTR;
 
 	/*
@@ -576,8 +600,14 @@
 		newflags |= (vma->vm_flags & ~mask_off_old_flags);
 
 		/* newflags >> 4 shift VM_MAY% in place of VM_% */
-		if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) {
+		if ((newflags & ~(newflags >> 4)) & VM_ACCESS_FLAGS) {
 			error = -EACCES;
+			goto out;
+		}
+
+		/* Allow architectures to sanity-check the new flags */
+		if (!arch_validate_flags(newflags)) {
+			error = -EINVAL;
 			goto out;
 		}
 
@@ -606,7 +636,7 @@
 		prot = reqprot;
 	}
 out:
-	up_write(&current->mm->mmap_sem);
+	mmap_write_unlock(current->mm);
 	return error;
 }
 
@@ -636,7 +666,7 @@
 	if (init_val & ~PKEY_ACCESS_MASK)
 		return -EINVAL;
 
-	down_write(&current->mm->mmap_sem);
+	mmap_write_lock(current->mm);
 	pkey = mm_pkey_alloc(current->mm);
 
 	ret = -ENOSPC;
@@ -650,7 +680,7 @@
 	}
 	ret = pkey;
 out:
-	up_write(&current->mm->mmap_sem);
+	mmap_write_unlock(current->mm);
 	return ret;
 }
 
@@ -658,9 +688,9 @@
 {
 	int ret;
 
-	down_write(&current->mm->mmap_sem);
+	mmap_write_lock(current->mm);
 	ret = mm_pkey_free(current->mm, pkey);
-	up_write(&current->mm->mmap_sem);
+	mmap_write_unlock(current->mm);
 
 	/*
 	 * We could provie warnings or errors if any VMA still