[...]

 #include "internal.h"

-static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
+static pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pgd;
 	p4d_t *p4d;
 	pud_t *pud;
-	pmd_t *pmd;

 	pgd = pgd_offset(mm, addr);
 	if (pgd_none_or_clear_bad(pgd))
[...]
 	if (pud_none_or_clear_bad(pud))
 		return NULL;

+	return pud;
+}
+
+static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
+{
+	pud_t *pud;
+	pmd_t *pmd;
+
+	pud = get_old_pud(mm, addr);
+	if (!pud)
+		return NULL;
+
 	pmd = pmd_offset(pud, addr);
 	if (pmd_none(*pmd))
 		return NULL;
[...]
 	return pmd;
 }

-static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
+static pud_t *alloc_new_pud(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long addr)
 {
 	pgd_t *pgd;
 	p4d_t *p4d;
-	pud_t *pud;
-	pmd_t *pmd;

 	pgd = pgd_offset(mm, addr);
 	p4d = p4d_alloc(mm, pgd, addr);
 	if (!p4d)
 		return NULL;
-	pud = pud_alloc(mm, p4d, addr);
+
+	return pud_alloc(mm, p4d, addr);
+}
+
+static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
+			unsigned long addr)
+{
+	pud_t *pud;
+	pmd_t *pmd;
+
+	pud = alloc_new_pud(mm, vma, addr);
 	if (!pud)
 		return NULL;

[...]
 	 * such races:
 	 *
 	 * - During exec() shift_arg_pages(), we use a specially tagged vma
-	 *   which rmap call sites look for using is_vma_temporary_stack().
+	 *   which rmap call sites look for using vma_is_temporary_stack().
 	 *
 	 * - During mremap(), new_vma is often known to be placed after vma
 	 *   in rmap traversal order. This ensures rmap will always observe
[...]

 	/*
 	 * We don't have to worry about the ordering of src and dst
-	 * pte locks because exclusive mmap_sem prevents deadlock.
+	 * pte locks because exclusive mmap_lock prevents deadlock.
 	 */
 	old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
 	new_pte = pte_offset_map(new_pmd, new_addr);
[...]
 		drop_rmap_locks(vma);
 }

+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+static inline bool trylock_vma_ref_count(struct vm_area_struct *vma)
+{
+	/*
+	 * If we have the only reference, swap the refcount to -1. This
+	 * will prevent other concurrent references by get_vma() for SPFs.
+	 */
+	return atomic_cmpxchg(&vma->vm_ref_count, 1, -1) == 1;
+}
+
+/*
+ * Restore the VMA reference count to 1 after a fast mremap.
+ */
+static inline void unlock_vma_ref_count(struct vm_area_struct *vma)
+{
+	/*
+	 * This should only be called after a corresponding,
+	 * successful trylock_vma_ref_count().
+	 */
+	VM_BUG_ON_VMA(atomic_cmpxchg(&vma->vm_ref_count, -1, 1) != -1,
+		      vma);
+}
+#else /* !CONFIG_SPECULATIVE_PAGE_FAULT */
+static inline bool trylock_vma_ref_count(struct vm_area_struct *vma)
+{
+	return true;
+}
+static inline void unlock_vma_ref_count(struct vm_area_struct *vma)
+{
+}
+#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
+
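The compare-and-swap trick used by these helpers is easier to see outside the kernel. Below is a minimal userspace sketch of the same idea using C11 atomics; the names and the plain atomic counter are invented for illustration and are not the kernel's implementation. The swap from 1 to -1 can only succeed for the sole reference holder, and the negative value keeps any other path from taking a new reference until the count is restored.

```c
/* Illustrative userspace sketch, not kernel code. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int ref_count = 1;	/* 1 == only the owner holds a reference */

/* Succeeds only if we are the sole reference holder; -1 blocks new references. */
static bool trylock_ref_count(void)
{
	int expected = 1;
	return atomic_compare_exchange_strong(&ref_count, &expected, -1);
}

/* Must pair with a successful trylock_ref_count(). */
static void unlock_ref_count(void)
{
	int expected = -1;
	atomic_compare_exchange_strong(&ref_count, &expected, 1);
}

int main(void)
{
	printf("first trylock:  %d\n", trylock_ref_count());	/* 1: we own it */
	printf("second trylock: %d\n", trylock_ref_count());	/* 0: already -1 */
	unlock_ref_count();
	printf("after unlock:   %d\n", trylock_ref_count());	/* 1 again */
	return 0;
}
```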
+#ifdef CONFIG_HAVE_MOVE_PMD
+static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
+		unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
+{
+	spinlock_t *old_ptl, *new_ptl;
+	struct mm_struct *mm = vma->vm_mm;
+	pmd_t pmd;
+
+	/*
+	 * The destination pmd shouldn't be established, free_pgtables()
+	 * should have released it.
+	 *
+	 * However, there's a case during execve() where we use mremap
+	 * to move the initial stack, and in that case the target area
+	 * may overlap the source area (always moving down).
+	 *
+	 * If everything is PMD-aligned, that works fine, as moving
+	 * each pmd down will clear the source pmd. But if we first
+	 * have a few 4kB-only pages that get moved down, and then
+	 * hit the "now the rest is PMD-aligned, let's do everything
+	 * one pmd at a time", we will still have the old (now empty
+	 * of any 4kB pages, but still there) PMD in the page table
+	 * tree.
+	 *
+	 * Warn on it once - because we really should try to figure
+	 * out how to do this better - but then say "I won't move
+	 * this pmd".
+	 *
+	 * One alternative might be to just unmap the target pmd at
+	 * this point, and verify that it really is empty. We'll see.
+	 */
+	if (WARN_ON_ONCE(!pmd_none(*new_pmd)))
+		return false;
+
+	/*
+	 * We hold both exclusive mmap_lock and rmap_lock at this point and
+	 * cannot block. If we cannot immediately take exclusive ownership
+	 * of the VMA, fall back to move_ptes().
+	 */
+	if (!trylock_vma_ref_count(vma))
+		return false;
+
+	/*
+	 * We don't have to worry about the ordering of src and dst
+	 * ptlocks because exclusive mmap_lock prevents deadlock.
+	 */
+	old_ptl = pmd_lock(vma->vm_mm, old_pmd);
+	new_ptl = pmd_lockptr(mm, new_pmd);
+	if (new_ptl != old_ptl)
+		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
+
+	/* Clear the pmd */
+	pmd = *old_pmd;
+	pmd_clear(old_pmd);
+
+	VM_BUG_ON(!pmd_none(*new_pmd));
+
+	/* Set the new pmd */
+	set_pmd_at(mm, new_addr, new_pmd, pmd);
+	flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
+	if (new_ptl != old_ptl)
+		spin_unlock(new_ptl);
+	spin_unlock(old_ptl);
+
+	unlock_vma_ref_count(vma);
+	return true;
+}
+#else
+static inline bool move_normal_pmd(struct vm_area_struct *vma,
+		unsigned long old_addr, unsigned long new_addr, pmd_t *old_pmd,
+		pmd_t *new_pmd)
+{
+	return false;
+}
+#endif
+
+#ifdef CONFIG_HAVE_MOVE_PUD
+static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
+		unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
+{
+	spinlock_t *old_ptl, *new_ptl;
+	struct mm_struct *mm = vma->vm_mm;
+	pud_t pud;
+
+	/*
+	 * The destination pud shouldn't be established, free_pgtables()
+	 * should have released it.
+	 */
+	if (WARN_ON_ONCE(!pud_none(*new_pud)))
+		return false;
+
+	/*
+	 * We hold both exclusive mmap_lock and rmap_lock at this point and
+	 * cannot block. If we cannot immediately take exclusive ownership
+	 * of the VMA, fall back to move_ptes().
+	 */
+	if (!trylock_vma_ref_count(vma))
+		return false;
+
+	/*
+	 * We don't have to worry about the ordering of src and dst
+	 * ptlocks because exclusive mmap_lock prevents deadlock.
+	 */
+	old_ptl = pud_lock(vma->vm_mm, old_pud);
+	new_ptl = pud_lockptr(mm, new_pud);
+	if (new_ptl != old_ptl)
+		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
+
+	/* Clear the pud */
+	pud = *old_pud;
+	pud_clear(old_pud);
+
+	VM_BUG_ON(!pud_none(*new_pud));
+
+	/* Set the new pud */
+	set_pud_at(mm, new_addr, new_pud, pud);
+	flush_tlb_range(vma, old_addr, old_addr + PUD_SIZE);
+	if (new_ptl != old_ptl)
+		spin_unlock(new_ptl);
+	spin_unlock(old_ptl);
+
+	unlock_vma_ref_count(vma);
+	return true;
+}
+#else
+static inline bool move_normal_pud(struct vm_area_struct *vma,
+		unsigned long old_addr, unsigned long new_addr, pud_t *old_pud,
+		pud_t *new_pud)
+{
+	return false;
+}
+#endif
+
+enum pgt_entry {
+	NORMAL_PMD,
+	HPAGE_PMD,
+	NORMAL_PUD,
+};
+
+/*
+ * Returns an extent of the corresponding size for the pgt_entry specified if
+ * valid. Else returns a smaller extent bounded by the end of the source and
+ * destination pgt_entry.
+ */
+static __always_inline unsigned long get_extent(enum pgt_entry entry,
+			unsigned long old_addr, unsigned long old_end,
+			unsigned long new_addr)
+{
+	unsigned long next, extent, mask, size;
+
+	switch (entry) {
+	case HPAGE_PMD:
+	case NORMAL_PMD:
+		mask = PMD_MASK;
+		size = PMD_SIZE;
+		break;
+	case NORMAL_PUD:
+		mask = PUD_MASK;
+		size = PUD_SIZE;
+		break;
+	default:
+		BUILD_BUG();
+		break;
+	}
+
+	next = (old_addr + size) & mask;
+	/* even if next overflowed, extent below will be ok */
+	extent = next - old_addr;
+	if (extent > old_end - old_addr)
+		extent = old_end - old_addr;
+	next = (new_addr + size) & mask;
+	if (extent > next - new_addr)
+		extent = next - new_addr;
+	return extent;
+}
+
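To make the masking arithmetic concrete, here is a small standalone userspace sketch of the PMD case of get_extent() with a hard-coded 2 MiB PMD size (the x86-64 value); the addresses and the helper name are made up for the example.

```c
/* Worked example of the PMD-extent arithmetic; illustrative only. */
#include <stdio.h>

#define PMD_SIZE 0x200000UL
#define PMD_MASK (~(PMD_SIZE - 1))

static unsigned long pmd_extent(unsigned long old_addr, unsigned long old_end,
				unsigned long new_addr)
{
	unsigned long next, extent;

	next = (old_addr + PMD_SIZE) & PMD_MASK;	/* next boundary on the source side */
	extent = next - old_addr;
	if (extent > old_end - old_addr)
		extent = old_end - old_addr;
	next = (new_addr + PMD_SIZE) & PMD_MASK;	/* the destination may allow less */
	if (extent > next - new_addr)
		extent = next - new_addr;
	return extent;
}

int main(void)
{
	/* Unaligned destination: only 0x1000 bytes can be stepped this iteration. */
	printf("%#lx\n", pmd_extent(0x2ff000, 0x600000, 0x5ff000));
	/* Both sides PMD-aligned: a full PMD_SIZE step, eligible for move_normal_pmd(). */
	printf("%#lx\n", pmd_extent(0x400000, 0x800000, 0x600000));
	return 0;
}
```

Only when both the source and destination addresses line up on a boundary (and enough of the range remains) does the extent reach PMD_SIZE or PUD_SIZE, which is what makes the fast move_normal_pmd()/move_normal_pud() paths eligible in move_page_tables() below.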
+/*
+ * Attempts to speedup the move by moving entry at the level corresponding to
+ * pgt_entry. Returns true if the move was successful, else false.
+ */
+static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma,
+			unsigned long old_addr, unsigned long new_addr,
+			void *old_entry, void *new_entry, bool need_rmap_locks)
+{
+	bool moved = false;
+
+	/* See comment in move_ptes() */
+	if (need_rmap_locks)
+		take_rmap_locks(vma);
+
+	switch (entry) {
+	case NORMAL_PMD:
+		moved = move_normal_pmd(vma, old_addr, new_addr, old_entry,
+					new_entry);
+		break;
+	case NORMAL_PUD:
+		moved = move_normal_pud(vma, old_addr, new_addr, old_entry,
+					new_entry);
+		break;
+	case HPAGE_PMD:
+		moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
+			move_huge_pmd(vma, old_addr, new_addr, old_entry,
+				      new_entry);
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		break;
+	}
+
+	if (need_rmap_locks)
+		drop_rmap_locks(vma);
+
+	return moved;
+}
+
 unsigned long move_page_tables(struct vm_area_struct *vma,
 		unsigned long old_addr, struct vm_area_struct *new_vma,
 		unsigned long new_addr, unsigned long len,
 		bool need_rmap_locks)
 {
-	unsigned long extent, next, old_end;
+	unsigned long extent, old_end;
+	struct mmu_notifier_range range;
 	pmd_t *old_pmd, *new_pmd;
-	unsigned long mmun_start;	/* For mmu_notifiers */
-	unsigned long mmun_end;	/* For mmu_notifiers */
+
+	if (!len)
+		return 0;

 	old_end = old_addr + len;
 	flush_cache_range(vma, old_addr, old_end);

-	mmun_start = old_addr;
-	mmun_end = old_end;
-	mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
+	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
+				old_addr, old_end);
+	mmu_notifier_invalidate_range_start(&range);

 	for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
 		cond_resched();
-		next = (old_addr + PMD_SIZE) & PMD_MASK;
-		/* even if next overflowed, extent below will be ok */
-		extent = next - old_addr;
-		if (extent > old_end - old_addr)
-			extent = old_end - old_addr;
+		/*
+		 * If the extent is PUD-sized, try to speed up the move by
+		 * moving at the PUD level if possible.
+		 */
+		extent = get_extent(NORMAL_PUD, old_addr, old_end, new_addr);
+		if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent == PUD_SIZE) {
+			pud_t *old_pud, *new_pud;
+
+			old_pud = get_old_pud(vma->vm_mm, old_addr);
+			if (!old_pud)
+				continue;
+			new_pud = alloc_new_pud(vma->vm_mm, vma, new_addr);
+			if (!new_pud)
+				break;
+			if (move_pgt_entry(NORMAL_PUD, vma, old_addr, new_addr,
+					   old_pud, new_pud, true))
+				continue;
+		}
+
+		extent = get_extent(NORMAL_PMD, old_addr, old_end, new_addr);
 		old_pmd = get_old_pmd(vma->vm_mm, old_addr);
 		if (!old_pmd)
 			continue;
 		new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
 		if (!new_pmd)
 			break;
-		if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) || pmd_devmap(*old_pmd)) {
-			if (extent == HPAGE_PMD_SIZE) {
-				bool moved;
-				/* See comment in move_ptes() */
-				if (need_rmap_locks)
-					take_rmap_locks(vma);
-				moved = move_huge_pmd(vma, old_addr, new_addr,
-						      old_end, old_pmd, new_pmd);
-				if (need_rmap_locks)
-					drop_rmap_locks(vma);
-				if (moved)
-					continue;
-			}
+		if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) ||
+		    pmd_devmap(*old_pmd)) {
+			if (extent == HPAGE_PMD_SIZE &&
+			    move_pgt_entry(HPAGE_PMD, vma, old_addr, new_addr,
+					   old_pmd, new_pmd, need_rmap_locks))
+				continue;
 			split_huge_pmd(vma, old_pmd, old_addr);
 			if (pmd_trans_unstable(old_pmd))
 				continue;
+		} else if (IS_ENABLED(CONFIG_HAVE_MOVE_PMD) &&
+			   extent == PMD_SIZE) {
+			/*
+			 * If the extent is PMD-sized, try to speed the move by
+			 * moving at the PMD level if possible.
+			 */
+			if (move_pgt_entry(NORMAL_PMD, vma, old_addr, new_addr,
+					   old_pmd, new_pmd, true))
+				continue;
 		}
-		if (pte_alloc(new_vma->vm_mm, new_pmd, new_addr))
+
+		if (pte_alloc(new_vma->vm_mm, new_pmd))
 			break;
-		next = (new_addr + PMD_SIZE) & PMD_MASK;
-		if (extent > next - new_addr)
-			extent = next - new_addr;
 		move_ptes(vma, old_pmd, old_addr, old_addr + extent, new_vma,
 			  new_pmd, new_addr, need_rmap_locks);
 	}

-	mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
+	mmu_notifier_invalidate_range_end(&range);

 	return len + old_addr - old_end; /* how much done */
 }
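To see how this ladder plays out, take x86-64 sizes (PUD_SIZE = 1 GiB, PMD_SIZE = 2 MiB) as an illustration: moving a 1 GiB range whose old and new addresses are both 1 GiB-aligned can be a single move_normal_pud() call; if the two addresses are only 2 MiB-aligned relative to each other, the same range is moved in 512 PMD-sized steps via move_normal_pmd() (or move_huge_pmd() where THP entries are present); and if old_addr and new_addr differ by something that is not a multiple of 2 MiB, no full-size extent ever lines up and every iteration falls through to move_ptes(), one page table entry at a time.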
[...]
 static unsigned long move_vma(struct vm_area_struct *vma,
 		unsigned long old_addr, unsigned long old_len,
 		unsigned long new_len, unsigned long new_addr,
-		bool *locked, struct vm_userfaultfd_ctx *uf,
-		struct list_head *uf_unmap)
+		bool *locked, unsigned long flags,
+		struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	struct vm_area_struct *new_vma;
[...]
 	if (!new_vma)
 		return -ENOMEM;

+	/*
+	 * new_vma is returned protected by copy_vma, to prevent a speculative
+	 * page fault from being handled in the destination area before we move
+	 * the PTEs. Now we must also protect the source VMA, since we don't
+	 * want pages to be mapped behind our back while we are copying the
+	 * PTEs.
+	 */
+	if (vma != new_vma)
+		vm_write_begin(vma);
+
 	moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
 				     need_rmap_locks);
 	if (moved_len < old_len) {
[...]
 		 */
 		move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
 				 true);
+		if (vma != new_vma)
+			vm_write_end(vma);
 		vma = new_vma;
 		old_len = new_len;
 		old_addr = new_addr;
[...]
 		mremap_userfaultfd_prep(new_vma, uf);
 		arch_remap(mm, old_addr, old_addr + old_len,
 			   new_addr, new_addr + new_len);
+		if (vma != new_vma)
+			vm_write_end(vma);
 	}
+	vm_write_end(new_vma);

 	/* Conceal VM_ACCOUNT so old reservation is not undone */
 	if (vm_flags & VM_ACCOUNT) {
[...]
 	if (unlikely(vma->vm_flags & VM_PFNMAP))
 		untrack_pfn_moved(vma);

+	if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) {
+		if (vm_flags & VM_ACCOUNT) {
+			/* Always put back VM_ACCOUNT since we won't unmap */
+			vma->vm_flags |= VM_ACCOUNT;
+
+			vm_acct_memory(new_len >> PAGE_SHIFT);
+		}
+
+		/*
+		 * VMAs can actually be merged back together in copy_vma
+		 * calling merge_vma. This can happen with anonymous vmas
+		 * which have not yet been faulted, so if we were to consider
+		 * this VMA split we'll end up adding VM_ACCOUNT on the
+		 * next VMA, which is completely unrelated if this VMA
+		 * was re-merged.
+		 */
+		if (split && new_vma == vma)
+			split = 0;
+
+		/* We always clear VM_LOCKED[ONFAULT] on the old vma */
+		vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
+
+		/* Because we won't unmap we don't need to touch locked_vm */
+		goto out;
+	}
+
 	if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) {
 		/* OOM: unable to split vma, just get accounts right */
 		vm_unacct_memory(excess >> PAGE_SHIFT);
 		excess = 0;
 	}
+
+	if (vm_flags & VM_LOCKED) {
+		mm->locked_vm += new_len >> PAGE_SHIFT;
+		*locked = true;
+	}
+out:
 	mm->hiwater_vm = hiwater_vm;

 	/* Restore VM_ACCOUNT if one or two pieces of vma left */
[...]
 		vma->vm_next->vm_flags |= VM_ACCOUNT;
 	}

-	if (vm_flags & VM_LOCKED) {
-		mm->locked_vm += new_len >> PAGE_SHIFT;
-		*locked = true;
-	}
-
 	return new_addr;
 }

 static struct vm_area_struct *vma_to_resize(unsigned long addr,
-	unsigned long old_len, unsigned long new_len, unsigned long *p)
+	unsigned long old_len, unsigned long new_len, unsigned long flags,
+	unsigned long *p)
 {
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma = find_vma(mm, addr);
[...]
 		pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap. This is not supported.\n", current->comm, current->pid);
 		return ERR_PTR(-EINVAL);
 	}
+
+	if ((flags & MREMAP_DONTUNMAP) &&
+			(vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)))
+		return ERR_PTR(-EINVAL);

 	if (is_vm_hugetlb_page(vma))
 		return ERR_PTR(-EINVAL);
[...]

 static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
 		unsigned long new_addr, unsigned long new_len, bool *locked,
-		struct vm_userfaultfd_ctx *uf,
+		unsigned long flags, struct vm_userfaultfd_ctx *uf,
 		struct list_head *uf_unmap_early,
 		struct list_head *uf_unmap)
 {
[...]
 	struct vm_area_struct *vma;
 	unsigned long ret = -EINVAL;
 	unsigned long charged = 0;
-	unsigned long map_flags;
+	unsigned long map_flags = 0;

 	if (offset_in_page(new_addr))
 		goto out;
[...]
 	if (addr + old_len > new_addr && new_addr + new_len > addr)
 		goto out;

-	ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
-	if (ret)
-		goto out;
+	/*
+	 * move_vma() needs us to stay 4 maps below the threshold, otherwise
+	 * it will bail out at the very beginning.
+	 * That is a problem if we have already unmapped the regions here
+	 * (new_addr, and old_addr), because userspace will not know the
+	 * state of the VMAs after it gets -ENOMEM.
+	 * So, to avoid such a scenario we can pre-compute if the whole
+	 * operation has high chances to succeed map-wise.
+	 * Worst-case scenario is when both VMAs (new_addr and old_addr) get
+	 * split in 3 before unmapping them.
+	 * That means 2 more maps (1 for each) to the ones we already hold.
+	 * Check whether the current map count plus 2 still leads us to 4 maps
+	 * below the threshold, otherwise return -ENOMEM here to be more safe.
+	 */
+	if ((mm->map_count + 2) >= sysctl_max_map_count - 3)
+		return -ENOMEM;
+
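For example, with the default sysctl_max_map_count of 65530 this pre-check starts returning -ENOMEM once mm->map_count reaches 65525, since 65525 + 2 >= 65530 - 3; the two extra entries stand in for the splits the operation may still create, so move_vma()'s own later headroom check is not the first place the caller learns that the address space ran out of map entries.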
+	if (flags & MREMAP_FIXED) {
+		ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
+		if (ret)
+			goto out;
+	}

 	if (old_len >= new_len) {
 		ret = do_munmap(mm, addr+new_len, old_len - new_len, uf_unmap);
[...]
 		old_len = new_len;
 	}

-	vma = vma_to_resize(addr, old_len, new_len, &charged);
+	vma = vma_to_resize(addr, old_len, new_len, flags, &charged);
 	if (IS_ERR(vma)) {
 		ret = PTR_ERR(vma);
 		goto out;
 	}

-	map_flags = MAP_FIXED;
+	/* MREMAP_DONTUNMAP expands by old_len since old_len == new_len */
+	if (flags & MREMAP_DONTUNMAP &&
+	    !may_expand_vm(mm, vma->vm_flags, old_len >> PAGE_SHIFT)) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	if (flags & MREMAP_FIXED)
+		map_flags |= MAP_FIXED;
+
 	if (vma->vm_flags & VM_MAYSHARE)
 		map_flags |= MAP_SHARED;

 	ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff +
 				((addr - vma->vm_start) >> PAGE_SHIFT),
 				map_flags);
-	if (offset_in_page(ret))
+	if (IS_ERR_VALUE(ret))
 		goto out1;

-	ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, uf,
+	/* We got a new mapping */
+	if (!(flags & MREMAP_FIXED))
+		new_addr = ret;
+
+	ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, flags, uf,
 		       uf_unmap);
+
 	if (!(offset_in_page(ret)))
 		goto out;
+
 out1:
 	vm_unacct_memory(charged);

[...]
 	unsigned long ret = -EINVAL;
 	unsigned long charged = 0;
 	bool locked = false;
+	bool downgraded = false;
 	struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
 	LIST_HEAD(uf_unmap_early);
 	LIST_HEAD(uf_unmap);

+	/*
+	 * There is a deliberate asymmetry here: we strip the pointer tag
+	 * from the old address but leave the new address alone. This is
+	 * for consistency with mmap(), where we prevent the creation of
+	 * aliasing mappings in userspace by leaving the tag bits of the
+	 * mapping address intact. A non-zero tag will cause the subsequent
+	 * range checks to reject the address as invalid.
+	 *
+	 * See Documentation/arm64/tagged-address-abi.rst for more information.
+	 */
 	addr = untagged_addr(addr);

-	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
+	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
 		return ret;

 	if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
 		return ret;
+
+	/*
+	 * MREMAP_DONTUNMAP is always a move and it does not allow resizing
+	 * in the process.
+	 */
+	if (flags & MREMAP_DONTUNMAP &&
+	    (!(flags & MREMAP_MAYMOVE) || old_len != new_len))
+		return ret;
+

 	if (offset_in_page(addr))
 		return ret;
[...]
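As a userspace-level illustration of the rules enforced above (a minimal sketch, not part of the patch): MREMAP_DONTUNMAP must be combined with MREMAP_MAYMOVE, old_len must equal new_len, and on success the pages land at the new address while the old range stays mapped. The fallback #define is only for older uapi headers, and the mapping size is arbitrary.

```c
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MREMAP_DONTUNMAP
#define MREMAP_DONTUNMAP 4	/* older libc headers may not define it */
#endif

int main(void)
{
	size_t len = 2 * 1024 * 1024;
	void *old = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (old == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(old, 0xaa, len);

	/* Move the pages elsewhere but keep the old range mapped (now empty). */
	void *new = mremap(old, len, len, MREMAP_MAYMOVE | MREMAP_DONTUNMAP);
	if (new == MAP_FAILED)
		perror("mremap");
	else
		printf("pages moved from %p to %p; %p stays mapped\n", old, new, old);
	return 0;
}
```

On kernels that predate the flag the call fails with EINVAL, since unknown mremap flag bits are rejected by the check above.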
 	if (!new_len)
 		return ret;

-	if (down_write_killable(&current->mm->mmap_sem))
+	if (mmap_write_lock_killable(current->mm))
 		return -EINTR;

-	if (flags & MREMAP_FIXED) {
+	if (flags & (MREMAP_FIXED | MREMAP_DONTUNMAP)) {
 		ret = mremap_to(addr, old_len, new_addr, new_len,
-				&locked, &uf, &uf_unmap_early, &uf_unmap);
+				&locked, flags, &uf, &uf_unmap_early,
+				&uf_unmap);
 		goto out;
 	}

 	/*
 	 * Always allow a shrinking remap: that just unmaps
 	 * the unnecessary pages..
-	 * do_munmap does all the needed commit accounting
+	 * __do_munmap does all the needed commit accounting, and
+	 * downgrades mmap_lock to read if so directed.
 	 */
 	if (old_len >= new_len) {
-		ret = do_munmap(mm, addr+new_len, old_len - new_len, &uf_unmap);
-		if (ret && old_len != new_len)
+		int retval;
+
+		retval = __do_munmap(mm, addr+new_len, old_len - new_len,
+				     &uf_unmap, true);
+		if (retval < 0 && old_len != new_len) {
+			ret = retval;
 			goto out;
+		/* Returning 1 indicates mmap_lock is downgraded to read. */
+		} else if (retval == 1)
+			downgraded = true;
 		ret = addr;
 		goto out;
 	}
[...]
 	/*
 	 * Ok, we need to grow..
 	 */
-	vma = vma_to_resize(addr, old_len, new_len, &charged);
+	vma = vma_to_resize(addr, old_len, new_len, flags, &charged);
 	if (IS_ERR(vma)) {
 		ret = PTR_ERR(vma);
 		goto out;
[...]
 					vma->vm_pgoff +
 					((addr - vma->vm_start) >> PAGE_SHIFT),
 					map_flags);
-		if (offset_in_page(new_addr)) {
+		if (IS_ERR_VALUE(new_addr)) {
 			ret = new_addr;
 			goto out;
 		}

 		ret = move_vma(vma, addr, old_len, new_len, new_addr,
-				&locked, &uf, &uf_unmap);
+				&locked, flags, &uf, &uf_unmap);
 	}
 out:
 	if (offset_in_page(ret)) {
 		vm_unacct_memory(charged);
-		locked = 0;
+		locked = false;
 	}
-	up_write(&current->mm->mmap_sem);
+	if (downgraded)
+		mmap_read_unlock(current->mm);
+	else
+		mmap_write_unlock(current->mm);
 	if (locked && new_len > old_len)
 		mm_populate(new_addr + old_len, new_len - old_len);
 	userfaultfd_unmap_complete(mm, &uf_unmap_early);
-	mremap_userfaultfd_complete(&uf, addr, new_addr, old_len);
+	mremap_userfaultfd_complete(&uf, addr, ret, old_len);
 	userfaultfd_unmap_complete(mm, &uf_unmap);
 	return ret;
 }
---|