+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * mm/userfaultfd.c
  *
  * Copyright (C) 2015 Red Hat, Inc.
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
  */
 
 #include <linux/mm.h>
...
 #include <asm/tlbflush.h>
 #include "internal.h"
 
+static __always_inline
+struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
+                                    unsigned long dst_start,
+                                    unsigned long len)
+{
+        /*
+         * Make sure that the dst range is both valid and fully within a
+         * single existing vma.
+         */
+        struct vm_area_struct *dst_vma;
+
+        dst_vma = find_vma(dst_mm, dst_start);
+        if (!dst_vma)
+                return NULL;
+
+        if (dst_start < dst_vma->vm_start ||
+            dst_start + len > dst_vma->vm_end)
+                return NULL;
+
+        /*
+         * Check the vma is registered in uffd, this is required to
+         * enforce the VM_MAYWRITE check done at uffd registration
+         * time.
+         */
+        if (!dst_vma->vm_userfaultfd_ctx.ctx)
+                return NULL;
+
+        return dst_vma;
+}
+
+/*
+ * Install PTEs, to map dst_addr (within dst_vma) to page.
+ *
+ * This function handles both MCOPY_ATOMIC_NORMAL and _CONTINUE for both shmem
+ * and anon, and for both shared and private VMAs.
+ */
+int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
+                             struct vm_area_struct *dst_vma,
+                             unsigned long dst_addr, struct page *page,
+                             bool newly_allocated, bool wp_copy)
+{
+        int ret;
+        pte_t _dst_pte, *dst_pte;
+        bool writable = dst_vma->vm_flags & VM_WRITE;
+        bool vm_shared = dst_vma->vm_flags & VM_SHARED;
+        bool page_in_cache = page_mapping(page);
+        spinlock_t *ptl;
+        struct inode *inode;
+        pgoff_t offset, max_off;
+
+        _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
+        if (page_in_cache && !vm_shared)
+                writable = false;
+        if (writable || !page_in_cache)
+                _dst_pte = pte_mkdirty(_dst_pte);
+        if (writable) {
+                if (wp_copy)
+                        _dst_pte = pte_mkuffd_wp(_dst_pte);
+                else
+                        _dst_pte = pte_mkwrite(_dst_pte);
+        }
+
+        dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
+
+        if (vma_is_shmem(dst_vma)) {
+                /* serialize against truncate with the page table lock */
+                inode = dst_vma->vm_file->f_inode;
+                offset = linear_page_index(dst_vma, dst_addr);
+                max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+                ret = -EFAULT;
+                if (unlikely(offset >= max_off))
+                        goto out_unlock;
+        }
+
+        ret = -EEXIST;
+        if (!pte_none(*dst_pte))
+                goto out_unlock;
+
+        if (page_in_cache)
+                page_add_file_rmap(page, false);
+        else
+                page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
+
+        /*
+         * Must happen after rmap, as mm_counter() checks mapping (via
+         * PageAnon()), which is set by __page_set_anon_rmap().
+         */
+        inc_mm_counter(dst_mm, mm_counter(page));
+
+        if (newly_allocated)
+                lru_cache_add_inactive_or_unevictable(page, dst_vma);
+
+        set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
+
+        /* No need to invalidate - it was non-present before */
+        update_mmu_cache(dst_vma, dst_addr, dst_pte);
+        ret = 0;
+out_unlock:
+        pte_unmap_unlock(dst_pte, ptl);
+        return ret;
+}
+
 static int mcopy_atomic_pte(struct mm_struct *dst_mm,
                             pmd_t *dst_pmd,
                             struct vm_area_struct *dst_vma,
                             unsigned long dst_addr,
                             unsigned long src_addr,
-                            struct page **pagep)
+                            struct page **pagep,
+                            bool wp_copy)
 {
-        struct mem_cgroup *memcg;
-        pte_t _dst_pte, *dst_pte;
-        spinlock_t *ptl;
         void *page_kaddr;
         int ret;
         struct page *page;
-        pgoff_t offset, max_off;
-        struct inode *inode;
 
         if (!*pagep) {
                 ret = -ENOMEM;
...
                                              PAGE_SIZE);
                 kunmap_atomic(page_kaddr);
 
-                /* fallback to copy_from_user outside mmap_sem */
+                /* fallback to copy_from_user outside mmap_lock */
                 if (unlikely(ret)) {
                         ret = -ENOENT;
                         *pagep = page;
                         /* don't free the page */
                         goto out;
                 }
+
+                flush_dcache_page(page);
         } else {
                 page = *pagep;
                 *pagep = NULL;
...
 
         /*
          * The memory barrier inside __SetPageUptodate makes sure that
-         * preceeding stores to the page contents become visible before
+         * preceding stores to the page contents become visible before
          * the set_pte_at() write.
          */
         __SetPageUptodate(page);
 
         ret = -ENOMEM;
-        if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg, false))
+        if (mem_cgroup_charge(page, dst_mm, GFP_KERNEL))
                 goto out_release;
 
-        _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
-        if (dst_vma->vm_flags & VM_WRITE)
-                _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));
-
-        dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
-        if (dst_vma->vm_file) {
-                /* the shmem MAP_PRIVATE case requires checking the i_size */
-                inode = dst_vma->vm_file->f_inode;
-                offset = linear_page_index(dst_vma, dst_addr);
-                max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
-                ret = -EFAULT;
-                if (unlikely(offset >= max_off))
-                        goto out_release_uncharge_unlock;
-        }
-        ret = -EEXIST;
-        if (!pte_none(*dst_pte))
-                goto out_release_uncharge_unlock;
-
-        inc_mm_counter(dst_mm, MM_ANONPAGES);
-        page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
-        mem_cgroup_commit_charge(page, memcg, false, false);
-        lru_cache_add_active_or_unevictable(page, dst_vma);
-
-        set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
-
-        /* No need to invalidate - it was non-present before */
-        update_mmu_cache(dst_vma, dst_addr, dst_pte);
-
-        pte_unmap_unlock(dst_pte, ptl);
-        ret = 0;
+        ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
+                                       page, true, wp_copy);
+        if (ret)
+                goto out_release;
 out:
         return ret;
-out_release_uncharge_unlock:
-        pte_unmap_unlock(dst_pte, ptl);
-        mem_cgroup_cancel_charge(page, memcg, false);
 out_release:
         put_page(page);
         goto out;
...
         return ret;
 }
 
+/* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */
+static int mcontinue_atomic_pte(struct mm_struct *dst_mm,
+                                pmd_t *dst_pmd,
+                                struct vm_area_struct *dst_vma,
+                                unsigned long dst_addr,
+                                bool wp_copy)
+{
+        struct inode *inode = file_inode(dst_vma->vm_file);
+        pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
+        struct page *page;
+        int ret;
+
+        ret = shmem_getpage(inode, pgoff, &page, SGP_READ);
+        if (ret)
+                goto out;
+        if (!page) {
+                ret = -EFAULT;
+                goto out;
+        }
+
+        ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
+                                       page, false, wp_copy);
+        if (ret)
+                goto out_release;
+
+        unlock_page(page);
+        ret = 0;
+out:
+        return ret;
+out_release:
+        unlock_page(page);
+        put_page(page);
+        goto out;
+}
+
 static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
 {
         pgd_t *pgd;
...
 #ifdef CONFIG_HUGETLB_PAGE
 /*
  * __mcopy_atomic processing for HUGETLB vmas. Note that this routine is
- * called with mmap_sem held, it will release mmap_sem before returning.
+ * called with mmap_lock held, it will release mmap_lock before returning.
  */
 static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
                                                       struct vm_area_struct *dst_vma,
                                                       unsigned long dst_start,
                                                       unsigned long src_start,
                                                       unsigned long len,
-                                                      bool zeropage)
+                                                      enum mcopy_atomic_mode mode)
 {
         int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED;
         int vm_shared = dst_vma->vm_flags & VM_SHARED;
...
         unsigned long src_addr, dst_addr;
         long copied;
         struct page *page;
-        struct hstate *h;
         unsigned long vma_hpagesize;
         pgoff_t idx;
         u32 hash;
...
          * by THP. Since we can not reliably insert a zero page, this
          * feature is not supported.
          */
-        if (zeropage) {
-                up_read(&dst_mm->mmap_sem);
+        if (mode == MCOPY_ATOMIC_ZEROPAGE) {
+                mmap_read_unlock(dst_mm);
                 return -EINVAL;
         }
 
...
 
 retry:
         /*
-         * On routine entry dst_vma is set. If we had to drop mmap_sem and
+         * On routine entry dst_vma is set. If we had to drop mmap_lock and
          * retry, dst_vma will be set to NULL and we must lookup again.
          */
         if (!dst_vma) {
                 err = -ENOENT;
-                dst_vma = find_vma(dst_mm, dst_start);
+                dst_vma = find_dst_vma(dst_mm, dst_start, len);
                 if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
-                        goto out_unlock;
-                /*
-                 * Check the vma is registered in uffd, this is
-                 * required to enforce the VM_MAYWRITE check done at
-                 * uffd registration time.
-                 */
-                if (!dst_vma->vm_userfaultfd_ctx.ctx)
-                        goto out_unlock;
-
-                if (dst_start < dst_vma->vm_start ||
-                    dst_start + len > dst_vma->vm_end)
                         goto out_unlock;
 
                 err = -EINVAL;
...
 
                 vm_shared = dst_vma->vm_flags & VM_SHARED;
         }
-
-        if (WARN_ON(dst_addr & (vma_hpagesize - 1) ||
-                    (len - copied) & (vma_hpagesize - 1)))
-                goto out_unlock;
 
         /*
          * If not shared, ensure the dst_vma has a anon_vma.
...
                 goto out_unlock;
         }
 
-        h = hstate_vma(dst_vma);
-
         while (src_addr < src_start + len) {
-                pte_t dst_pteval;
-
                 BUG_ON(dst_addr >= dst_start + len);
-                VM_BUG_ON(dst_addr & ~huge_page_mask(h));
 
                 /*
-                 * Serialize via hugetlb_fault_mutex
+                 * Serialize via i_mmap_rwsem and hugetlb_fault_mutex.
+                 * i_mmap_rwsem ensures the dst_pte remains valid even
+                 * in the case of shared pmds. fault mutex prevents
+                 * races with other faulting threads.
                  */
-                idx = linear_page_index(dst_vma, dst_addr);
                 mapping = dst_vma->vm_file->f_mapping;
-                hash = hugetlb_fault_mutex_hash(h, mapping, idx);
+                i_mmap_lock_read(mapping);
+                idx = linear_page_index(dst_vma, dst_addr);
+                hash = hugetlb_fault_mutex_hash(mapping, idx);
                 mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
                 err = -ENOMEM;
-                dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h));
+                dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize);
                 if (!dst_pte) {
                         mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+                        i_mmap_unlock_read(mapping);
                         goto out_unlock;
                 }
 
-                err = -EEXIST;
-                dst_pteval = huge_ptep_get(dst_pte);
-                if (!huge_pte_none(dst_pteval)) {
+                if (mode != MCOPY_ATOMIC_CONTINUE &&
+                    !huge_pte_none(huge_ptep_get(dst_pte))) {
+                        err = -EEXIST;
                         mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+                        i_mmap_unlock_read(mapping);
                         goto out_unlock;
                 }
 
                 err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma,
-                                               dst_addr, src_addr, &page);
+                                               dst_addr, src_addr, mode, &page);
 
                 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+                i_mmap_unlock_read(mapping);
                 vm_alloc_shared = vm_shared;
 
                 cond_resched();
 
                 if (unlikely(err == -ENOENT)) {
-                        up_read(&dst_mm->mmap_sem);
+                        mmap_read_unlock(dst_mm);
                         BUG_ON(!page);
 
                         err = copy_huge_page_from_user(page,
                                                 (const void __user *)src_addr,
-                                                pages_per_huge_page(h), true);
+                                                vma_hpagesize / PAGE_SIZE,
+                                                true);
                         if (unlikely(err)) {
                                 err = -EFAULT;
                                 goto out;
                         }
-                        down_read(&dst_mm->mmap_sem);
+                        mmap_read_lock(dst_mm);
 
                         dst_vma = NULL;
                         goto retry;
...
         }
 
 out_unlock:
-        up_read(&dst_mm->mmap_sem);
+        mmap_read_unlock(dst_mm);
 out:
         if (page) {
                 /*
...
                  * private and shared mappings. See the routine
                  * restore_reserve_on_error for details. Unfortunately, we
                  * can not call restore_reserve_on_error now as it would
-                 * require holding mmap_sem.
+                 * require holding mmap_lock.
                  *
                  * If a reservation for the page existed in the reservation
                  * map of a private mapping, the map was modified to indicate
...
                                       unsigned long dst_start,
                                       unsigned long src_start,
                                       unsigned long len,
-                                      bool zeropage);
+                                      enum mcopy_atomic_mode mode);
 #endif /* CONFIG_HUGETLB_PAGE */
 
 static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
...
                                                 unsigned long dst_addr,
                                                 unsigned long src_addr,
                                                 struct page **page,
-                                                bool zeropage)
+                                                enum mcopy_atomic_mode mode,
+                                                bool wp_copy)
 {
         ssize_t err;
+
+        if (mode == MCOPY_ATOMIC_CONTINUE) {
+                return mcontinue_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
+                                            wp_copy);
+        }
 
         /*
          * The normal page fault path for a shmem will invoke the
...
          * and not in the radix tree.
          */
         if (!(dst_vma->vm_flags & VM_SHARED)) {
-                if (!zeropage)
+                if (mode == MCOPY_ATOMIC_NORMAL)
                         err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
-                                               dst_addr, src_addr, page);
+                                               dst_addr, src_addr, page,
+                                               wp_copy);
                 else
                         err = mfill_zeropage_pte(dst_mm, dst_pmd,
                                                  dst_vma, dst_addr);
         } else {
-                if (!zeropage)
-                        err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd,
-                                                     dst_vma, dst_addr,
-                                                     src_addr, page);
-                else
-                        err = shmem_mfill_zeropage_pte(dst_mm, dst_pmd,
-                                                       dst_vma, dst_addr);
+                VM_WARN_ON_ONCE(wp_copy);
+                err = shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
+                                             dst_addr, src_addr,
+                                             mode != MCOPY_ATOMIC_NORMAL,
+                                             page);
         }
 
         return err;
...
                               unsigned long dst_start,
                               unsigned long src_start,
                               unsigned long len,
-                              bool zeropage,
-                              bool *mmap_changing)
+                              enum mcopy_atomic_mode mcopy_mode,
+                              bool *mmap_changing,
+                              __u64 mode)
 {
         struct vm_area_struct *dst_vma;
         ssize_t err;
...
         unsigned long src_addr, dst_addr;
         long copied;
         struct page *page;
+        bool wp_copy;
 
         /*
          * Sanitize the command parameters:
...
         copied = 0;
         page = NULL;
 retry:
-        down_read(&dst_mm->mmap_sem);
+        mmap_read_lock(dst_mm);
 
         /*
          * If memory mappings are changing because of non-cooperative
...
          * both valid and fully within a single existing vma.
          */
         err = -ENOENT;
-        dst_vma = find_vma(dst_mm, dst_start);
+        dst_vma = find_dst_vma(dst_mm, dst_start, len);
         if (!dst_vma)
-                goto out_unlock;
-        /*
-         * Check the vma is registered in uffd, this is required to
-         * enforce the VM_MAYWRITE check done at uffd registration
-         * time.
-         */
-        if (!dst_vma->vm_userfaultfd_ctx.ctx)
-                goto out_unlock;
-
-        if (dst_start < dst_vma->vm_start ||
-            dst_start + len > dst_vma->vm_end)
                 goto out_unlock;
 
         err = -EINVAL;
...
                 goto out_unlock;
 
         /*
+         * validate 'mode' now that we know the dst_vma: don't allow
+         * a wrprotect copy if the userfaultfd didn't register as WP.
+         */
+        wp_copy = mode & UFFDIO_COPY_MODE_WP;
+        if (wp_copy && !(dst_vma->vm_flags & VM_UFFD_WP))
+                goto out_unlock;
+
+        /*
          * If this is a HUGETLB vma, pass off to appropriate routine
          */
         if (is_vm_hugetlb_page(dst_vma))
                 return __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start,
-                                              src_start, len, zeropage);
+                                              src_start, len, mcopy_mode);
 
         if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
+                goto out_unlock;
+        if (!vma_is_shmem(dst_vma) && mcopy_mode == MCOPY_ATOMIC_CONTINUE)
                 goto out_unlock;
 
         /*
...
                         break;
                 }
                 if (unlikely(pmd_none(dst_pmdval)) &&
-                    unlikely(__pte_alloc(dst_mm, dst_pmd, dst_addr))) {
+                    unlikely(__pte_alloc(dst_mm, dst_pmd))) {
                         err = -ENOMEM;
                         break;
                 }
...
                 BUG_ON(pmd_trans_huge(*dst_pmd));
 
                 err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
-                                       src_addr, &page, zeropage);
+                                       src_addr, &page, mcopy_mode, wp_copy);
                 cond_resched();
 
                 if (unlikely(err == -ENOENT)) {
                         void *page_kaddr;
 
-                        up_read(&dst_mm->mmap_sem);
+                        mmap_read_unlock(dst_mm);
                         BUG_ON(!page);
 
                         page_kaddr = kmap(page);
...
                                 err = -EFAULT;
                                 goto out;
                         }
+                        flush_dcache_page(page);
                         goto retry;
                 } else
                         BUG_ON(page);
...
         }
 
 out_unlock:
-        up_read(&dst_mm->mmap_sem);
+        mmap_read_unlock(dst_mm);
 out:
         if (page)
                 put_page(page);
...
 
 ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
                      unsigned long src_start, unsigned long len,
-                     bool *mmap_changing)
+                     bool *mmap_changing, __u64 mode)
 {
-        return __mcopy_atomic(dst_mm, dst_start, src_start, len, false,
-                              mmap_changing);
+        return __mcopy_atomic(dst_mm, dst_start, src_start, len,
+                              MCOPY_ATOMIC_NORMAL, mmap_changing, mode);
 }
 
 ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
                        unsigned long len, bool *mmap_changing)
 {
-        return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing);
+        return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_ZEROPAGE,
+                              mmap_changing, 0);
+}
+
+ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start,
+                       unsigned long len, bool *mmap_changing)
+{
+        return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_CONTINUE,
+                              mmap_changing, 0);
+}
+
+int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
+                        unsigned long len, bool enable_wp, bool *mmap_changing)
+{
+        struct vm_area_struct *dst_vma;
+        pgprot_t newprot;
+        int err;
+
+        /*
+         * Sanitize the command parameters:
+         */
+        BUG_ON(start & ~PAGE_MASK);
+        BUG_ON(len & ~PAGE_MASK);
+
+        /* Does the address range wrap, or is the span zero-sized? */
+        BUG_ON(start + len <= start);
+
+        mmap_read_lock(dst_mm);
+
+        /*
+         * If memory mappings are changing because of non-cooperative
+         * operation (e.g. mremap) running in parallel, bail out and
+         * request the user to retry later
+         */
+        err = -EAGAIN;
+        if (mmap_changing && READ_ONCE(*mmap_changing))
+                goto out_unlock;
+
+        err = -ENOENT;
+        dst_vma = find_dst_vma(dst_mm, start, len);
+        /*
+         * Make sure the vma is not shared, that the dst range is
+         * both valid and fully within a single existing vma.
+         */
+        if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
+                goto out_unlock;
+        if (!userfaultfd_wp(dst_vma))
+                goto out_unlock;
+        if (!vma_is_anonymous(dst_vma))
+                goto out_unlock;
+
+        if (enable_wp)
+                newprot = vm_get_page_prot(dst_vma->vm_flags & ~(VM_WRITE));
+        else
+                newprot = vm_get_page_prot(dst_vma->vm_flags);
+
+        change_protection(dst_vma, start, start + len, newprot,
+                          enable_wp ? MM_CP_UFFD_WP : MM_CP_UFFD_WP_RESOLVE);
+
+        err = 0;
+out_unlock:
+        mmap_read_unlock(dst_mm);
+        return err;
 }
---|