@@ -9,7 +9,7 @@
  * (C) Copyright 2002 Red Hat Inc, All Rights Reserved
  */
 
-#include <linux/mm.h>
+#include <linux/pagewalk.h>
 #include <linux/hugetlb.h>
 #include <linux/shm.h>
 #include <linux/mman.h>
@@ -28,7 +28,7 @@
 #include <linux/ksm.h>
 #include <linux/uaccess.h>
 #include <linux/mm_inline.h>
-#include <asm/pgtable.h>
+#include <linux/pgtable.h>
 #include <asm/cacheflush.h>
 #include <asm/mmu_context.h>
 #include <asm/tlbflush.h>
@@ -37,16 +37,19 @@
 
 static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		unsigned long addr, unsigned long end, pgprot_t newprot,
-		int dirty_accountable, int prot_numa)
+		unsigned long cp_flags)
 {
-	struct mm_struct *mm = vma->vm_mm;
 	pte_t *pte, oldpte;
 	spinlock_t *ptl;
 	unsigned long pages = 0;
 	int target_node = NUMA_NO_NODE;
+	bool dirty_accountable = cp_flags & MM_CP_DIRTY_ACCT;
+	bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
+	bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
+	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
 
 	/*
-	 * Can be called with only the mmap_sem for reading by
+	 * Can be called with only the mmap_lock for reading by
 	 * prot_numa so we must check the pmd isn't constantly
 	 * changing from under us from pmd_none to pmd_trans_huge
 	 * and/or the other way around.
@@ -56,7 +59,7 @@
 
 	/*
 	 * The pmd points to a regular pte so the pmd can't change
-	 * from under us even if the mmap_sem is only hold for
+	 * from under us even if the mmap_lock is only hold for
 	 * reading.
 	 */
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
@@ -81,13 +84,17 @@
 		if (prot_numa) {
 			struct page *page;
 
+			/* Avoid TLB flush if possible */
+			if (pte_protnone(oldpte))
+				continue;
+
 			page = vm_normal_page(vma, addr, oldpte);
 			if (!page || PageKsm(page))
 				continue;
 
 			/* Also skip shared copy-on-write pages */
 			if (is_cow_mapping(vma->vm_flags) &&
-			    page_mapcount(page) != 1)
+			    page_count(page) != 1)
 				continue;
 
 			/*
@@ -95,11 +102,7 @@
 			 * it cannot move them all from MIGRATE_ASYNC
 			 * context.
 			 */
-			if (page_is_file_cache(page) && PageDirty(page))
-				continue;
-
-			/* Avoid TLB flush if possible */
-			if (pte_protnone(oldpte))
+			if (page_is_file_lru(page) && PageDirty(page))
 				continue;
 
 			/*
@@ -110,10 +113,23 @@
 					continue;
 			}
 
-			ptent = ptep_modify_prot_start(mm, addr, pte);
-			ptent = pte_modify(ptent, newprot);
+			oldpte = ptep_modify_prot_start(vma, addr, pte);
+			ptent = pte_modify(oldpte, newprot);
 			if (preserve_write)
 				ptent = pte_mk_savedwrite(ptent);
+
+			if (uffd_wp) {
+				ptent = pte_wrprotect(ptent);
+				ptent = pte_mkuffd_wp(ptent);
+			} else if (uffd_wp_resolve) {
+				/*
+				 * Leave the write bit to be handled
+				 * by PF interrupt handler, then
+				 * things like COW could be properly
+				 * handled.
+				 */
+				ptent = pte_clear_uffd_wp(ptent);
+			}
 
 			/* Avoid taking write faults for known dirty pages */
 			if (dirty_accountable && pte_dirty(ptent) &&
@@ -121,13 +137,13 @@
 					 !(vma->vm_flags & VM_SOFTDIRTY))) {
 				ptent = pte_mkwrite(ptent);
 			}
-			ptep_modify_prot_commit(mm, addr, pte, ptent);
+			ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent);
 			pages++;
-		} else if (IS_ENABLED(CONFIG_MIGRATION)) {
+		} else if (is_swap_pte(oldpte)) {
 			swp_entry_t entry = pte_to_swp_entry(oldpte);
+			pte_t newpte;
 
 			if (is_write_migration_entry(entry)) {
-				pte_t newpte;
 				/*
 				 * A protection check is difficult so
 				 * just be safe and disable write
@@ -136,22 +152,28 @@
 				newpte = swp_entry_to_pte(entry);
 				if (pte_swp_soft_dirty(oldpte))
 					newpte = pte_swp_mksoft_dirty(newpte);
-				set_pte_at(mm, addr, pte, newpte);
-
-				pages++;
-			}
-
-			if (is_write_device_private_entry(entry)) {
-				pte_t newpte;
-
+				if (pte_swp_uffd_wp(oldpte))
+					newpte = pte_swp_mkuffd_wp(newpte);
+			} else if (is_write_device_private_entry(entry)) {
 				/*
 				 * We do not preserve soft-dirtiness. See
 				 * copy_one_pte() for explanation.
 				 */
 				make_device_private_entry_read(&entry);
 				newpte = swp_entry_to_pte(entry);
-				set_pte_at(mm, addr, pte, newpte);
+				if (pte_swp_uffd_wp(oldpte))
+					newpte = pte_swp_mkuffd_wp(newpte);
+			} else {
+				newpte = oldpte;
+			}
 
+			if (uffd_wp)
+				newpte = pte_swp_mkuffd_wp(newpte);
+			else if (uffd_wp_resolve)
+				newpte = pte_swp_clear_uffd_wp(newpte);
+
+			if (!pte_same(oldpte, newpte)) {
+				set_pte_at(vma->vm_mm, addr, pte, newpte);
 				pages++;
 			}
 		}
@@ -189,14 +211,15 @@
 
 static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
 		pud_t *pud, unsigned long addr, unsigned long end,
-		pgprot_t newprot, int dirty_accountable, int prot_numa)
+		pgprot_t newprot, unsigned long cp_flags)
 {
 	pmd_t *pmd;
-	struct mm_struct *mm = vma->vm_mm;
 	unsigned long next;
 	unsigned long pages = 0;
 	unsigned long nr_huge_updates = 0;
-	unsigned long mni_start = 0;
+	struct mmu_notifier_range range;
+
+	range.start = 0;
 
 	pmd = pmd_offset(pud, addr);
 	do {
@@ -205,7 +228,7 @@
 		next = pmd_addr_end(addr, end);
 
 		/*
-		 * Automatic NUMA balancing walks the tables with mmap_sem
+		 * Automatic NUMA balancing walks the tables with mmap_lock
 		 * held for read. It's possible a parallel update to occur
 		 * between pmd_trans_huge() and a pmd_none_or_clear_bad()
 		 * check leading to a false positive and clearing.
@@ -217,9 +240,11 @@
 			goto next;
 
 		/* invoke the mmu notifier if the pmd is populated */
-		if (!mni_start) {
-			mni_start = addr;
-			mmu_notifier_invalidate_range_start(mm, mni_start, end);
+		if (!range.start) {
+			mmu_notifier_range_init(&range,
+				MMU_NOTIFY_PROTECTION_VMA, 0,
+				vma, vma->vm_mm, addr, end);
+			mmu_notifier_invalidate_range_start(&range);
 		}
 
 		if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
@@ -227,7 +252,7 @@
 				__split_huge_pmd(vma, pmd, addr, false, NULL);
 			} else {
 				int nr_ptes = change_huge_pmd(vma, pmd, addr,
-						newprot, prot_numa);
+						newprot, cp_flags);
 
 				if (nr_ptes) {
 					if (nr_ptes == HPAGE_PMD_NR) {
@@ -242,14 +267,14 @@
 			/* fall through, the trans huge pmd just split */
 		}
 		this_pages = change_pte_range(vma, pmd, addr, next, newprot,
-				 dirty_accountable, prot_numa);
+				 cp_flags);
 		pages += this_pages;
 next:
 		cond_resched();
 	} while (pmd++, addr = next, addr != end);
 
-	if (mni_start)
-		mmu_notifier_invalidate_range_end(mm, mni_start, end);
+	if (range.start)
+		mmu_notifier_invalidate_range_end(&range);
 
 	if (nr_huge_updates)
 		count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates);
@@ -258,7 +283,7 @@
 
 static inline unsigned long change_pud_range(struct vm_area_struct *vma,
 		p4d_t *p4d, unsigned long addr, unsigned long end,
-		pgprot_t newprot, int dirty_accountable, int prot_numa)
+		pgprot_t newprot, unsigned long cp_flags)
 {
 	pud_t *pud;
 	unsigned long next;
@@ -270,7 +295,7 @@
 		if (pud_none_or_clear_bad(pud))
 			continue;
 		pages += change_pmd_range(vma, pud, addr, next, newprot,
-				 dirty_accountable, prot_numa);
+				 cp_flags);
 	} while (pud++, addr = next, addr != end);
 
 	return pages;
@@ -278,7 +303,7 @@
 
 static inline unsigned long change_p4d_range(struct vm_area_struct *vma,
 		pgd_t *pgd, unsigned long addr, unsigned long end,
-		pgprot_t newprot, int dirty_accountable, int prot_numa)
+		pgprot_t newprot, unsigned long cp_flags)
 {
 	p4d_t *p4d;
 	unsigned long next;
@@ -290,7 +315,7 @@
 		if (p4d_none_or_clear_bad(p4d))
 			continue;
 		pages += change_pud_range(vma, p4d, addr, next, newprot,
-				 dirty_accountable, prot_numa);
+				 cp_flags);
 	} while (p4d++, addr = next, addr != end);
 
 	return pages;
@@ -298,7 +323,7 @@
 
 static unsigned long change_protection_range(struct vm_area_struct *vma,
 		unsigned long addr, unsigned long end, pgprot_t newprot,
-		int dirty_accountable, int prot_numa)
+		unsigned long cp_flags)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	pgd_t *pgd;
@@ -315,7 +340,7 @@
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
 		pages += change_p4d_range(vma, pgd, addr, next, newprot,
-				 dirty_accountable, prot_numa);
+				 cp_flags);
 	} while (pgd++, addr = next, addr != end);
 
 	/* Only flush the TLB if we actually modified any entries: */
@@ -328,14 +353,17 @@
 
 unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
 		unsigned long end, pgprot_t newprot,
-		int dirty_accountable, int prot_numa)
+		unsigned long cp_flags)
 {
 	unsigned long pages;
+
+	BUG_ON((cp_flags & MM_CP_UFFD_WP_ALL) == MM_CP_UFFD_WP_ALL);
 
 	if (is_vm_hugetlb_page(vma))
 		pages = hugetlb_change_protection(vma, start, end, newprot);
 	else
-		pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa);
+		pages = change_protection_range(vma, start, end, newprot,
+						cp_flags);
 
 	return pages;
 }
@@ -361,20 +389,11 @@
 	return 0;
 }
 
-static int prot_none_walk(struct vm_area_struct *vma, unsigned long start,
-		unsigned long end, unsigned long newflags)
-{
-	pgprot_t new_pgprot = vm_get_page_prot(newflags);
-	struct mm_walk prot_none_walk = {
-		.pte_entry = prot_none_pte_entry,
-		.hugetlb_entry = prot_none_hugetlb_entry,
-		.test_walk = prot_none_test,
-		.mm = current->mm,
-		.private = &new_pgprot,
-	};
-
-	return walk_page_range(start, end, &prot_none_walk);
-}
+static const struct mm_walk_ops prot_none_walk_ops = {
+	.pte_entry = prot_none_pte_entry,
+	.hugetlb_entry = prot_none_hugetlb_entry,
+	.test_walk = prot_none_test,
+};
 
 int
 mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
@@ -400,8 +419,11 @@
 	 */
 	if (arch_has_pfn_modify_check() &&
 	    (vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
-	    (newflags & (VM_READ|VM_WRITE|VM_EXEC)) == 0) {
-		error = prot_none_walk(vma, start, end, newflags);
+	    (newflags & VM_ACCESS_FLAGS) == 0) {
+		pgprot_t new_pgprot = vm_get_page_prot(newflags);
+
+		error = walk_page_range(current->mm, start, end,
+				&prot_none_walk_ops, &new_pgprot);
 		if (error)
 			return error;
 	}
@@ -455,15 +477,17 @@
 
 success:
 	/*
-	 * vm_flags and vm_page_prot are protected by the mmap_sem
+	 * vm_flags and vm_page_prot are protected by the mmap_lock
 	 * held in write mode.
 	 */
-	vma->vm_flags = newflags;
+	vm_write_begin(vma);
+	WRITE_ONCE(vma->vm_flags, newflags);
 	dirty_accountable = vma_wants_writenotify(vma, vma->vm_page_prot);
 	vma_set_page_prot(vma);
 
 	change_protection(vma, start, end, vma->vm_page_prot,
-			  dirty_accountable, 0);
+			  dirty_accountable ? MM_CP_DIRTY_ACCT : 0);
+	vm_write_end(vma);
 
 	/*
 	 * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major
@@ -516,7 +540,7 @@
 
 	reqprot = prot;
 
-	if (down_write_killable(&current->mm->mmap_sem))
+	if (mmap_write_lock_killable(current->mm))
 		return -EINTR;
 
 	/*
@@ -576,8 +600,14 @@
 		newflags |= (vma->vm_flags & ~mask_off_old_flags);
 
 		/* newflags >> 4 shift VM_MAY% in place of VM_% */
-		if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) {
+		if ((newflags & ~(newflags >> 4)) & VM_ACCESS_FLAGS) {
 			error = -EACCES;
+			goto out;
+		}
+
+		/* Allow architectures to sanity-check the new flags */
+		if (!arch_validate_flags(newflags)) {
+			error = -EINVAL;
 			goto out;
 		}
 
@@ -606,7 +636,7 @@
 		prot = reqprot;
 	}
 out:
-	up_write(&current->mm->mmap_sem);
+	mmap_write_unlock(current->mm);
 	return error;
 }
 
@@ -636,7 +666,7 @@
 	if (init_val & ~PKEY_ACCESS_MASK)
 		return -EINVAL;
 
-	down_write(&current->mm->mmap_sem);
+	mmap_write_lock(current->mm);
 	pkey = mm_pkey_alloc(current->mm);
 
 	ret = -ENOSPC;
@@ -650,7 +680,7 @@
 	}
 	ret = pkey;
 out:
-	up_write(&current->mm->mmap_sem);
+	mmap_write_unlock(current->mm);
 	return ret;
 }
 
@@ -658,9 +688,9 @@
 {
 	int ret;
 
-	down_write(&current->mm->mmap_sem);
+	mmap_write_lock(current->mm);
 	ret = mm_pkey_free(current->mm, pkey);
-	up_write(&current->mm->mmap_sem);
+	mmap_write_unlock(current->mm);
 
 	/*
 	 * We could provie warnings or errors if any VMA still
---|