| .. | .. |
|---|
| 21 | 21 | * Lock ordering in mm: |
|---|
| 22 | 22 | * |
|---|
| 23 | 23 | * inode->i_mutex (while writing or truncating, not reading or faulting) |
|---|
| 24 | | - * mm->mmap_sem |
|---|
| 25 | | - * page->flags PG_locked (lock_page) |
|---|
| 24 | + * mm->mmap_lock |
|---|
| 25 | + * page->flags PG_locked (lock_page) * (see hugetlbfs below)
|---|
| 26 | 26 | * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share) |
|---|
| 27 | 27 | * mapping->i_mmap_rwsem |
|---|
| 28 | + * hugetlb_fault_mutex (hugetlbfs specific page fault mutex) |
|---|
| 28 | 29 | * anon_vma->rwsem |
|---|
| 29 | 30 | * mm->page_table_lock or pte_lock |
|---|
| 30 | | - * zone_lru_lock (in mark_page_accessed, isolate_lru_page) |
|---|
| 31 | + * pgdat->lru_lock (in mark_page_accessed, isolate_lru_page) |
|---|
| 31 | 32 | * swap_lock (in swap_duplicate, swap_info_get) |
|---|
| 32 | 33 | * mmlist_lock (in mmput, drain_mmlist and others) |
|---|
| 33 | 34 | * mapping->private_lock (in __set_page_dirty_buffers) |
|---|
| .. | .. |
|---|
| 43 | 44 | * anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon) |
|---|
| 44 | 45 | * ->tasklist_lock |
|---|
| 45 | 46 | * pte map lock |
|---|
| 47 | + * |
|---|
| 48 | + * * hugetlbfs PageHuge() pages take locks in this order: |
|---|
| 49 | + * mapping->i_mmap_rwsem |
|---|
| 50 | + * hugetlb_fault_mutex (hugetlbfs specific page fault mutex) |
|---|
| 51 | + * page->flags PG_locked (lock_page) |
|---|
| 46 | 52 | */ |
|---|
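The ordering documented above only helps if every path that needs more than one of these locks takes them in the same direction. A minimal user-space sketch of why a single agreed order rules out ABBA deadlock (plain pthreads, not kernel code; the mutex names merely echo mapping->i_mmap_rwsem and page->flags PG_locked):

```c
/*
 * Stand-alone illustration, not kernel code: both workers honour one
 * documented order (map_lock, then page_lock), which is what rules out
 * ABBA deadlock. Compile with: cc -pthread order.c
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t map_lock  = PTHREAD_MUTEX_INITIALIZER; /* think mapping->i_mmap_rwsem */
static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER; /* think page->flags PG_locked */

static void *worker(void *arg)
{
	pthread_mutex_lock(&map_lock);	/* outer lock always first */
	pthread_mutex_lock(&page_lock);	/* then the inner one */
	printf("thread %ld took both locks in the documented order\n", (long)arg);
	pthread_mutex_unlock(&page_lock);
	pthread_mutex_unlock(&map_lock);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, worker, (void *)1L);
	pthread_create(&b, NULL, worker, (void *)2L);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}
```

If one thread took page_lock first while the other took map_lock first, each could block on the lock the other holds; the comment above pins down the direction for the mm locks, including the hugetlbfs-specific order added by this change.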
| 47 | 53 | |
|---|
| 48 | 54 | #include <linux/mm.h> |
|---|
| .. | .. |
|---|
| 61 | 67 | #include <linux/mmu_notifier.h> |
|---|
| 62 | 68 | #include <linux/migrate.h> |
|---|
| 63 | 69 | #include <linux/hugetlb.h> |
|---|
| 70 | +#include <linux/huge_mm.h> |
|---|
| 64 | 71 | #include <linux/backing-dev.h> |
|---|
| 65 | 72 | #include <linux/page_idle.h> |
|---|
| 66 | 73 | #include <linux/memremap.h> |
|---|
| .. | .. |
|---|
| 69 | 76 | #include <asm/tlbflush.h> |
|---|
| 70 | 77 | |
|---|
| 71 | 78 | #include <trace/events/tlb.h> |
|---|
| 79 | + |
|---|
| 80 | +#include <trace/hooks/mm.h> |
|---|
| 72 | 81 | |
|---|
| 73 | 82 | #include "internal.h" |
|---|
| 74 | 83 | |
|---|
| .. | .. |
|---|
| 82 | 91 | anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); |
|---|
| 83 | 92 | if (anon_vma) { |
|---|
| 84 | 93 | atomic_set(&anon_vma->refcount, 1); |
|---|
| 85 | | - anon_vma->degree = 1; /* Reference for first vma */ |
|---|
| 94 | + anon_vma->num_children = 0; |
|---|
| 95 | + anon_vma->num_active_vmas = 0; |
|---|
| 86 | 96 | anon_vma->parent = anon_vma; |
|---|
| 87 | 97 | /* |
|---|
| 88 | 98 | * Initialise the anon_vma root to point to itself. If called |
|---|
| .. | .. |
|---|
| 170 | 180 | * to do any locking for the common case of already having |
|---|
| 171 | 181 | * an anon_vma. |
|---|
| 172 | 182 | * |
|---|
| 173 | | - * This must be called with the mmap_sem held for reading. |
|---|
| 183 | + * This must be called with the mmap_lock held for reading. |
|---|
| 174 | 184 | */ |
|---|
| 175 | 185 | int __anon_vma_prepare(struct vm_area_struct *vma) |
|---|
| 176 | 186 | { |
|---|
| .. | .. |
|---|
| 190 | 200 | anon_vma = anon_vma_alloc(); |
|---|
| 191 | 201 | if (unlikely(!anon_vma)) |
|---|
| 192 | 202 | goto out_enomem_free_avc; |
|---|
| 203 | + anon_vma->num_children++; /* self-parent link for new root */ |
|---|
| 193 | 204 | allocated = anon_vma; |
|---|
| 194 | 205 | } |
|---|
| 195 | 206 | |
|---|
| .. | .. |
|---|
| 199 | 210 | if (likely(!vma->anon_vma)) { |
|---|
| 200 | 211 | vma->anon_vma = anon_vma; |
|---|
| 201 | 212 | anon_vma_chain_link(vma, avc, anon_vma); |
|---|
| 202 | | - /* vma reference or self-parent link for new root */ |
|---|
| 203 | | - anon_vma->degree++; |
|---|
| 213 | + anon_vma->num_active_vmas++; |
|---|
| 204 | 214 | allocated = NULL; |
|---|
| 205 | 215 | avc = NULL; |
|---|
| 206 | 216 | } |
|---|
| .. | .. |
|---|
| 250 | 260 | * Attach the anon_vmas from src to dst. |
|---|
| 251 | 261 | * Returns 0 on success, -ENOMEM on failure. |
|---|
| 252 | 262 | * |
|---|
| 253 | | - * If dst->anon_vma is NULL this function tries to find and reuse existing |
|---|
| 254 | | - * anon_vma which has no vmas and only one child anon_vma. This prevents |
|---|
| 255 | | - * degradation of anon_vma hierarchy to endless linear chain in case of |
|---|
| 256 | | - * constantly forking task. On the other hand, an anon_vma with more than one |
|---|
| 257 | | - * child isn't reused even if there was no alive vma, thus rmap walker has a |
|---|
| 258 | | - * good chance of avoiding scanning the whole hierarchy when it searches where |
|---|
| 259 | | - * page is mapped. |
|---|
| 263 | + * anon_vma_clone() is called by __vma_adjust(), __split_vma(), copy_vma() and
|---|
| 264 | + * anon_vma_fork(). The first three want an exact copy of src, while the last |
|---|
| 265 | + * one, anon_vma_fork(), may try to reuse an existing anon_vma to prevent |
|---|
| 266 | + * endless growth of anon_vma. Since dst->anon_vma is set to NULL before call, |
|---|
| 267 | + * we can identify this case by checking (!dst->anon_vma && src->anon_vma). |
|---|
| 268 | + * |
|---|
| 269 | + * If (!dst->anon_vma && src->anon_vma) is true, this function tries to find |
|---|
| 270 | + * and reuse existing anon_vma which has no vmas and only one child anon_vma. |
|---|
| 271 | + * This prevents degradation of anon_vma hierarchy to endless linear chain in |
|---|
| 272 | + * case of constantly forking task. On the other hand, an anon_vma with more |
|---|
| 273 | + * than one child isn't reused even if there was no alive vma, thus rmap |
|---|
| 274 | + * walker has a good chance of avoiding scanning the whole hierarchy when it |
|---|
| 275 | + * searches where page is mapped. |
|---|
| 260 | 276 | */ |
|---|
| 261 | 277 | int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) |
|---|
| 262 | 278 | { |
|---|
| .. | .. |
|---|
| 279 | 295 | anon_vma_chain_link(dst, avc, anon_vma); |
|---|
| 280 | 296 | |
|---|
| 281 | 297 | /* |
|---|
| 282 | | - * Reuse existing anon_vma if its degree lower than two, |
|---|
| 283 | | - * that means it has no vma and only one anon_vma child. |
|---|
| 298 | + * Reuse existing anon_vma if it has no vma and only one |
|---|
| 299 | + * anon_vma child. |
|---|
| 284 | 300 | * |
|---|
| 285 | | - * Do not chose parent anon_vma, otherwise first child |
|---|
| 286 | | - * will always reuse it. Root anon_vma is never reused: |
|---|
| 301 | + * Root anon_vma is never reused: |
|---|
| 287 | 302 | * it has self-parent reference and at least one child. |
|---|
| 288 | 303 | */ |
|---|
| 289 | | - if (!dst->anon_vma && anon_vma != src->anon_vma && |
|---|
| 290 | | - anon_vma->degree < 2) |
|---|
| 304 | + if (!dst->anon_vma && src->anon_vma && |
|---|
| 305 | + anon_vma->num_children < 2 && |
|---|
| 306 | + anon_vma->num_active_vmas == 0) |
|---|
| 291 | 307 | dst->anon_vma = anon_vma; |
|---|
| 292 | 308 | } |
|---|
| 293 | 309 | if (dst->anon_vma) |
|---|
| 294 | | - dst->anon_vma->degree++; |
|---|
| 310 | + dst->anon_vma->num_active_vmas++; |
|---|
| 295 | 311 | unlock_anon_vma_root(root); |
|---|
| 296 | 312 | return 0; |
|---|
| 297 | 313 | |
|---|
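The reuse rule described above reduces to one predicate on the two counters this change introduces. A small stand-alone model of that decision (struct and function names are invented for illustration; only the condition mirrors the test in anon_vma_clone()):

```c
#include <stdbool.h>

/* Toy stand-ins for the two counters that replace anon_vma->degree. */
struct model_anon_vma {
	int num_children;	/* child anon_vmas hanging off this one */
	int num_active_vmas;	/* VMAs currently backed by it */
};

/*
 * Reuse is only attempted on the fork path, which is recognised by dst
 * having no anon_vma yet while src has one. The candidate must cover no
 * live VMA and have at most one child, so the hierarchy cannot degrade
 * into a long linear chain under repeated fork().
 */
static bool can_reuse_for_fork(bool dst_has_anon_vma, bool src_has_anon_vma,
			       const struct model_anon_vma *av)
{
	return !dst_has_anon_vma && src_has_anon_vma &&
	       av->num_children < 2 && av->num_active_vmas == 0;
}
```

Splitting the old degree counter into num_children and num_active_vmas lets this test say "no live VMA" explicitly instead of inferring it from degree < 2, which is the ambiguity the change removes.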
| .. | .. |
|---|
| 341 | 357 | anon_vma = anon_vma_alloc(); |
|---|
| 342 | 358 | if (!anon_vma) |
|---|
| 343 | 359 | goto out_error; |
|---|
| 360 | + anon_vma->num_active_vmas++; |
|---|
| 344 | 361 | avc = anon_vma_chain_alloc(GFP_KERNEL); |
|---|
| 345 | 362 | if (!avc) |
|---|
| 346 | 363 | goto out_error_free_anon_vma; |
|---|
| .. | .. |
|---|
| 361 | 378 | vma->anon_vma = anon_vma; |
|---|
| 362 | 379 | anon_vma_lock_write(anon_vma); |
|---|
| 363 | 380 | anon_vma_chain_link(vma, avc, anon_vma); |
|---|
| 364 | | - anon_vma->parent->degree++; |
|---|
| 381 | + anon_vma->parent->num_children++; |
|---|
| 365 | 382 | anon_vma_unlock_write(anon_vma); |
|---|
| 366 | 383 | |
|---|
| 367 | 384 | return 0; |
|---|
| .. | .. |
|---|
| 393 | 410 | * to free them outside the lock. |
|---|
| 394 | 411 | */ |
|---|
| 395 | 412 | if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) { |
|---|
| 396 | | - anon_vma->parent->degree--; |
|---|
| 413 | + anon_vma->parent->num_children--; |
|---|
| 397 | 414 | continue; |
|---|
| 398 | 415 | } |
|---|
| 399 | 416 | |
|---|
| .. | .. |
|---|
| 401 | 418 | anon_vma_chain_free(avc); |
|---|
| 402 | 419 | } |
|---|
| 403 | 420 | if (vma->anon_vma) |
|---|
| 404 | | - vma->anon_vma->degree--; |
|---|
| 421 | + vma->anon_vma->num_active_vmas--; |
|---|
| 422 | + |
|---|
| 405 | 423 | unlock_anon_vma_root(root); |
|---|
| 406 | 424 | |
|---|
| 407 | 425 | /* |
|---|
| .. | .. |
|---|
| 412 | 430 | list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { |
|---|
| 413 | 431 | struct anon_vma *anon_vma = avc->anon_vma; |
|---|
| 414 | 432 | |
|---|
| 415 | | - VM_WARN_ON(anon_vma->degree); |
|---|
| 433 | + VM_WARN_ON(anon_vma->num_children); |
|---|
| 434 | + VM_WARN_ON(anon_vma->num_active_vmas); |
|---|
| 416 | 435 | put_anon_vma(anon_vma); |
|---|
| 417 | 436 | |
|---|
| 418 | 437 | list_del(&avc->same_vma); |
|---|
| .. | .. |
|---|
| 457 | 476 | * chain and verify that the page in question is indeed mapped in it |
|---|
| 458 | 477 | * [ something equivalent to page_mapped_in_vma() ]. |
|---|
| 459 | 478 | * |
|---|
| 460 | | - * Since anon_vma's slab is DESTROY_BY_RCU and we know from page_remove_rmap() |
|---|
| 461 | | - * that the anon_vma pointer from page->mapping is valid if there is a |
|---|
| 462 | | - * mapcount, we can dereference the anon_vma after observing those. |
|---|
| 479 | + * Since anon_vma's slab is SLAB_TYPESAFE_BY_RCU and we know from |
|---|
| 480 | + * page_remove_rmap() that the anon_vma pointer from page->mapping is valid |
|---|
| 481 | + * if there is a mapcount, we can dereference the anon_vma after observing |
|---|
| 482 | + * those. |
|---|
| 463 | 483 | */ |
|---|
| 464 | 484 | struct anon_vma *page_get_anon_vma(struct page *page) |
|---|
| 465 | 485 | { |
|---|
| .. | .. |
|---|
| 502 | 522 | * |
|---|
| 503 | 523 | * It's a little more complex as it tries to keep the fast path to a single
|---|
| 504 | 524 | * atomic op -- the trylock. If we fail the trylock, we fall back to getting a |
|---|
| 505 | | - * reference like with page_get_anon_vma() and then block on the mutex. |
|---|
| 525 | + * reference like with page_get_anon_vma() and then block on the mutex |
|---|
| 526 | + * on !rwc->try_lock case. |
|---|
| 506 | 527 | */ |
|---|
| 507 | | -struct anon_vma *page_lock_anon_vma_read(struct page *page) |
|---|
| 528 | +struct anon_vma *page_lock_anon_vma_read(struct page *page, |
|---|
| 529 | + struct rmap_walk_control *rwc) |
|---|
| 508 | 530 | { |
|---|
| 509 | 531 | struct anon_vma *anon_vma = NULL; |
|---|
| 510 | 532 | struct anon_vma *root_anon_vma; |
|---|
| 511 | 533 | unsigned long anon_mapping; |
|---|
| 534 | + bool success = false; |
|---|
| 512 | 535 | |
|---|
| 513 | 536 | rcu_read_lock(); |
|---|
| 514 | 537 | anon_mapping = (unsigned long)READ_ONCE(page->mapping); |
|---|
| .. | .. |
|---|
| 529 | 552 | up_read(&root_anon_vma->rwsem); |
|---|
| 530 | 553 | anon_vma = NULL; |
|---|
| 531 | 554 | } |
|---|
| 555 | + goto out; |
|---|
| 556 | + } |
|---|
| 557 | + trace_android_vh_do_page_trylock(page, NULL, NULL, &success); |
|---|
| 558 | + if (success) { |
|---|
| 559 | + anon_vma = NULL; |
|---|
| 560 | + goto out; |
|---|
| 561 | + } |
|---|
| 562 | + |
|---|
| 563 | + if (rwc && rwc->try_lock) { |
|---|
| 564 | + anon_vma = NULL; |
|---|
| 565 | + rwc->contended = true; |
|---|
| 532 | 566 | goto out; |
|---|
| 533 | 567 | } |
|---|
| 534 | 568 | |
|---|
| .. | .. |
|---|
| 658 | 692 | */ |
|---|
| 659 | 693 | void flush_tlb_batched_pending(struct mm_struct *mm) |
|---|
| 660 | 694 | { |
|---|
| 661 | | - if (mm->tlb_flush_batched) { |
|---|
| 695 | + if (data_race(mm->tlb_flush_batched)) { |
|---|
| 662 | 696 | flush_tlb_mm(mm); |
|---|
| 663 | 697 | |
|---|
| 664 | 698 | /* |
|---|
| .. | .. |
|---|
| 768 | 802 | } |
|---|
| 769 | 803 | |
|---|
| 770 | 804 | if (pvmw.pte) { |
|---|
| 805 | + trace_android_vh_look_around(&pvmw, page, vma, &referenced); |
|---|
| 771 | 806 | if (ptep_clear_flush_young_notify(vma, address, |
|---|
| 772 | 807 | pvmw.pte)) { |
|---|
| 773 | 808 | /* |
|---|
| .. | .. |
|---|
| 803 | 838 | pra->vm_flags |= vma->vm_flags; |
|---|
| 804 | 839 | } |
|---|
| 805 | 840 | |
|---|
| 841 | + trace_android_vh_page_referenced_one_end(vma, page, referenced); |
|---|
| 806 | 842 | if (!pra->mapcount) |
|---|
| 807 | 843 | return false; /* To break the loop */ |
|---|
| 808 | 844 | |
|---|
| .. | .. |
|---|
| 827 | 863 | * @memcg: target memory cgroup |
|---|
| 828 | 864 | * @vm_flags: collect encountered vma->vm_flags who actually referenced the page |
|---|
| 829 | 865 | * |
|---|
| 830 | | - * Quick test_and_clear_referenced for all mappings to a page, |
|---|
| 831 | | - * returns the number of ptes which referenced the page. |
|---|
| 866 | + * Quick test_and_clear_referenced for all mappings of a page, |
|---|
| 867 | + * |
|---|
| 868 | + * Return: The number of mappings which referenced the page. Return -1 if |
|---|
| 869 | + * the function bailed out due to rmap lock contention. |
|---|
| 832 | 870 | */ |
|---|
| 833 | 871 | int page_referenced(struct page *page, |
|---|
| 834 | 872 | int is_locked, |
|---|
| .. | .. |
|---|
| 844 | 882 | .rmap_one = page_referenced_one, |
|---|
| 845 | 883 | .arg = (void *)&pra, |
|---|
| 846 | 884 | .anon_lock = page_lock_anon_vma_read, |
|---|
| 885 | + .try_lock = true, |
|---|
| 847 | 886 | }; |
|---|
| 848 | 887 | |
|---|
| 849 | 888 | *vm_flags = 0; |
|---|
| 850 | | - if (!page_mapped(page)) |
|---|
| 889 | + if (!pra.mapcount) |
|---|
| 851 | 890 | return 0; |
|---|
| 852 | 891 | |
|---|
| 853 | 892 | if (!page_rmapping(page)) |
|---|
| .. | .. |
|---|
| 874 | 913 | if (we_locked) |
|---|
| 875 | 914 | unlock_page(page); |
|---|
| 876 | 915 | |
|---|
| 877 | | - return pra.referenced; |
|---|
| 916 | + return rwc.contended ? -1 : pra.referenced; |
|---|
| 878 | 917 | } |
|---|
| 918 | +EXPORT_SYMBOL_GPL(page_referenced); |
|---|
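With rwc.try_lock set, page_referenced() can now return -1 instead of sleeping on a contended rmap lock. A hypothetical caller (the helper below is invented for illustration and is not taken from this kernel) would treat that value as "unknown, retry later" rather than "not referenced":

```c
/* Illustrative only: distinguish "no references" from "could not check". */
static int page_reference_hint(struct page *page, struct mem_cgroup *memcg)
{
	unsigned long vm_flags;
	int referenced = page_referenced(page, 0, memcg, &vm_flags);

	if (referenced == -1)
		return -EAGAIN;		/* rmap lock contended, try again later */

	return referenced;		/* number of mappings that referenced it */
}
```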
| 879 | 919 | |
|---|
| 880 | 920 | static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, |
|---|
| 881 | 921 | unsigned long address, void *arg) |
|---|
| .. | .. |
|---|
| 886 | 926 | .address = address, |
|---|
| 887 | 927 | .flags = PVMW_SYNC, |
|---|
| 888 | 928 | }; |
|---|
| 889 | | - unsigned long start = address, end; |
|---|
| 929 | + struct mmu_notifier_range range; |
|---|
| 890 | 930 | int *cleaned = arg; |
|---|
| 891 | 931 | |
|---|
| 892 | 932 | /* |
|---|
| 893 | 933 | * We have to assume the worse case ie pmd for invalidation. Note that |
|---|
| 894 | 934 | * the page can not be free from this function. |
|---|
| 895 | 935 | */ |
|---|
| 896 | | - end = vma_address_end(page, vma); |
|---|
| 897 | | - mmu_notifier_invalidate_range_start(vma->vm_mm, start, end); |
|---|
| 936 | + mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, |
|---|
| 937 | + 0, vma, vma->vm_mm, address, |
|---|
| 938 | + vma_address_end(page, vma)); |
|---|
| 939 | + mmu_notifier_invalidate_range_start(&range); |
|---|
| 898 | 940 | |
|---|
| 899 | 941 | while (page_vma_mapped_walk(&pvmw)) { |
|---|
| 900 | | - unsigned long cstart; |
|---|
| 901 | 942 | int ret = 0; |
|---|
| 902 | 943 | |
|---|
| 903 | | - cstart = address = pvmw.address; |
|---|
| 944 | + address = pvmw.address; |
|---|
| 904 | 945 | if (pvmw.pte) { |
|---|
| 905 | 946 | pte_t entry; |
|---|
| 906 | 947 | pte_t *pte = pvmw.pte; |
|---|
| .. | .. |
|---|
| 915 | 956 | set_pte_at(vma->vm_mm, address, pte, entry); |
|---|
| 916 | 957 | ret = 1; |
|---|
| 917 | 958 | } else { |
|---|
| 918 | | -#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE |
|---|
| 959 | +#ifdef CONFIG_TRANSPARENT_HUGEPAGE |
|---|
| 919 | 960 | pmd_t *pmd = pvmw.pmd; |
|---|
| 920 | 961 | pmd_t entry; |
|---|
| 921 | 962 | |
|---|
| .. | .. |
|---|
| 927 | 968 | entry = pmd_wrprotect(entry); |
|---|
| 928 | 969 | entry = pmd_mkclean(entry); |
|---|
| 929 | 970 | set_pmd_at(vma->vm_mm, address, pmd, entry); |
|---|
| 930 | | - cstart &= PMD_MASK; |
|---|
| 931 | 971 | ret = 1; |
|---|
| 932 | 972 | #else |
|---|
| 933 | 973 | /* unexpected pmd-mapped page? */ |
|---|
| .. | .. |
|---|
| 946 | 986 | (*cleaned)++; |
|---|
| 947 | 987 | } |
|---|
| 948 | 988 | |
|---|
| 949 | | - mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); |
|---|
| 989 | + mmu_notifier_invalidate_range_end(&range); |
|---|
| 950 | 990 | |
|---|
| 951 | 991 | return true; |
|---|
| 952 | 992 | } |
|---|
| .. | .. |
|---|
| 1014 | 1054 | |
|---|
| 1015 | 1055 | /** |
|---|
| 1016 | 1056 | * __page_set_anon_rmap - set up new anonymous rmap |
|---|
| 1017 | | - * @page: Page to add to rmap |
|---|
| 1057 | + * @page: Page or Hugepage to add to rmap |
|---|
| 1018 | 1058 | * @vma: VM area to add page to. |
|---|
| 1019 | 1059 | * @address: User virtual address of the mapping |
|---|
| 1020 | 1060 | * @exclusive: the page is exclusively owned by the current process |
|---|
| .. | .. |
|---|
| 1051 | 1091 | static void __page_check_anon_rmap(struct page *page, |
|---|
| 1052 | 1092 | struct vm_area_struct *vma, unsigned long address) |
|---|
| 1053 | 1093 | { |
|---|
| 1054 | | -#ifdef CONFIG_DEBUG_VM |
|---|
| 1055 | 1094 | /* |
|---|
| 1056 | 1095 | * The page's anon-rmap details (mapping and index) are guaranteed to |
|---|
| 1057 | 1096 | * be set up correctly at this point. |
|---|
| .. | .. |
|---|
| 1064 | 1103 | * are initially only visible via the pagetables, and the pte is locked |
|---|
| 1065 | 1104 | * over the call to page_add_new_anon_rmap. |
|---|
| 1066 | 1105 | */ |
|---|
| 1067 | | - BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root); |
|---|
| 1068 | | - BUG_ON(page_to_pgoff(page) != linear_page_index(vma, address)); |
|---|
| 1069 | | -#endif |
|---|
| 1106 | + VM_BUG_ON_PAGE(page_anon_vma(page)->root != vma->anon_vma->root, page); |
|---|
| 1107 | + VM_BUG_ON_PAGE(page_to_pgoff(page) != linear_page_index(vma, address), |
|---|
| 1108 | + page); |
|---|
| 1070 | 1109 | } |
|---|
| 1071 | 1110 | |
|---|
| 1072 | 1111 | /** |
|---|
| .. | .. |
|---|
| 1097 | 1136 | { |
|---|
| 1098 | 1137 | bool compound = flags & RMAP_COMPOUND; |
|---|
| 1099 | 1138 | bool first; |
|---|
| 1139 | + bool success = false; |
|---|
| 1140 | + |
|---|
| 1141 | + if (unlikely(PageKsm(page))) |
|---|
| 1142 | + lock_page_memcg(page); |
|---|
| 1143 | + else |
|---|
| 1144 | + VM_BUG_ON_PAGE(!PageLocked(page), page); |
|---|
| 1100 | 1145 | |
|---|
| 1101 | 1146 | if (compound) { |
|---|
| 1102 | 1147 | atomic_t *mapcount; |
|---|
| .. | .. |
|---|
| 1105 | 1150 | mapcount = compound_mapcount_ptr(page); |
|---|
| 1106 | 1151 | first = atomic_inc_and_test(mapcount); |
|---|
| 1107 | 1152 | } else { |
|---|
| 1108 | | - first = atomic_inc_and_test(&page->_mapcount); |
|---|
| 1153 | + trace_android_vh_update_page_mapcount(page, true, compound, |
|---|
| 1154 | + &first, &success); |
|---|
| 1155 | + if (!success) |
|---|
| 1156 | + first = atomic_inc_and_test(&page->_mapcount); |
|---|
| 1109 | 1157 | } |
|---|
| 1110 | 1158 | |
|---|
| 1111 | 1159 | if (first) { |
|---|
| 1112 | | - int nr = compound ? hpage_nr_pages(page) : 1; |
|---|
| 1160 | + int nr = compound ? thp_nr_pages(page) : 1; |
|---|
| 1113 | 1161 | /* |
|---|
| 1114 | 1162 | * We use the irq-unsafe __{inc|mod}_zone_page_stat because |
|---|
| 1115 | 1163 | * these counters are not modified in interrupt context, and |
|---|
| .. | .. |
|---|
| 1117 | 1165 | * disabled. |
|---|
| 1118 | 1166 | */ |
|---|
| 1119 | 1167 | if (compound) |
|---|
| 1120 | | - __inc_node_page_state(page, NR_ANON_THPS); |
|---|
| 1121 | | - __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr); |
|---|
| 1168 | + __inc_lruvec_page_state(page, NR_ANON_THPS); |
|---|
| 1169 | + __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr); |
|---|
| 1122 | 1170 | } |
|---|
| 1123 | | - if (unlikely(PageKsm(page))) |
|---|
| 1124 | | - return; |
|---|
| 1125 | 1171 | |
|---|
| 1126 | | - VM_BUG_ON_PAGE(!PageLocked(page), page); |
|---|
| 1172 | + if (unlikely(PageKsm(page))) { |
|---|
| 1173 | + unlock_page_memcg(page); |
|---|
| 1174 | + return; |
|---|
| 1175 | + } |
|---|
| 1127 | 1176 | |
|---|
| 1128 | 1177 | /* address might be in next vma when migration races vma_adjust */ |
|---|
| 1129 | 1178 | if (first) |
|---|
| .. | .. |
|---|
| 1134 | 1183 | } |
|---|
| 1135 | 1184 | |
|---|
| 1136 | 1185 | /** |
|---|
| 1137 | | - * page_add_new_anon_rmap - add pte mapping to a new anonymous page |
|---|
| 1186 | + * __page_add_new_anon_rmap - add pte mapping to a new anonymous page |
|---|
| 1138 | 1187 | * @page: the page to add the mapping to |
|---|
| 1139 | 1188 | * @vma: the vm area in which the mapping is added |
|---|
| 1140 | 1189 | * @address: the user virtual address mapped |
|---|
| .. | .. |
|---|
| 1144 | 1193 | * This means the inc-and-test can be bypassed. |
|---|
| 1145 | 1194 | * Page does not have to be locked. |
|---|
| 1146 | 1195 | */ |
|---|
| 1147 | | -void page_add_new_anon_rmap(struct page *page, |
|---|
| 1196 | +void __page_add_new_anon_rmap(struct page *page, |
|---|
| 1148 | 1197 | struct vm_area_struct *vma, unsigned long address, bool compound) |
|---|
| 1149 | 1198 | { |
|---|
| 1150 | | - int nr = compound ? hpage_nr_pages(page) : 1; |
|---|
| 1199 | + int nr = compound ? thp_nr_pages(page) : 1; |
|---|
| 1151 | 1200 | |
|---|
| 1152 | | - VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); |
|---|
| 1153 | 1201 | __SetPageSwapBacked(page); |
|---|
| 1154 | 1202 | if (compound) { |
|---|
| 1155 | 1203 | VM_BUG_ON_PAGE(!PageTransHuge(page), page); |
|---|
| 1156 | 1204 | /* increment count (starts at -1) */ |
|---|
| 1157 | 1205 | atomic_set(compound_mapcount_ptr(page), 0); |
|---|
| 1158 | | - __inc_node_page_state(page, NR_ANON_THPS); |
|---|
| 1206 | + if (hpage_pincount_available(page)) |
|---|
| 1207 | + atomic_set(compound_pincount_ptr(page), 0); |
|---|
| 1208 | + |
|---|
| 1209 | + __inc_lruvec_page_state(page, NR_ANON_THPS); |
|---|
| 1159 | 1210 | } else { |
|---|
| 1160 | 1211 | /* Anon THP always mapped first with PMD */ |
|---|
| 1161 | 1212 | VM_BUG_ON_PAGE(PageTransCompound(page), page); |
|---|
| 1162 | 1213 | /* increment count (starts at -1) */ |
|---|
| 1163 | 1214 | atomic_set(&page->_mapcount, 0); |
|---|
| 1164 | 1215 | } |
|---|
| 1165 | | - __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr); |
|---|
| 1216 | + __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr); |
|---|
| 1166 | 1217 | __page_set_anon_rmap(page, vma, address, 1); |
|---|
| 1167 | 1218 | } |
|---|
| 1168 | 1219 | |
|---|
| .. | .. |
|---|
| 1176 | 1227 | void page_add_file_rmap(struct page *page, bool compound) |
|---|
| 1177 | 1228 | { |
|---|
| 1178 | 1229 | int i, nr = 1; |
|---|
| 1230 | + bool first_mapping; |
|---|
| 1231 | + bool success = false; |
|---|
| 1179 | 1232 | |
|---|
| 1180 | 1233 | VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page); |
|---|
| 1181 | 1234 | lock_page_memcg(page); |
|---|
| 1182 | 1235 | if (compound && PageTransHuge(page)) { |
|---|
| 1183 | | - for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) { |
|---|
| 1184 | | - if (atomic_inc_and_test(&page[i]._mapcount)) |
|---|
| 1185 | | - nr++; |
|---|
| 1236 | + for (i = 0, nr = 0; i < thp_nr_pages(page); i++) { |
|---|
| 1237 | + trace_android_vh_update_page_mapcount(&page[i], true, |
|---|
| 1238 | + compound, &first_mapping, &success); |
|---|
| 1239 | + if (success) {
|---|
| 1240 | + if (first_mapping) |
|---|
| 1241 | + nr++; |
|---|
| 1242 | + } else { |
|---|
| 1243 | + if (atomic_inc_and_test(&page[i]._mapcount)) |
|---|
| 1244 | + nr++; |
|---|
| 1245 | + } |
|---|
| 1186 | 1246 | } |
|---|
| 1187 | 1247 | if (!atomic_inc_and_test(compound_mapcount_ptr(page))) |
|---|
| 1188 | 1248 | goto out; |
|---|
| 1189 | | - VM_BUG_ON_PAGE(!PageSwapBacked(page), page); |
|---|
| 1190 | | - __inc_node_page_state(page, NR_SHMEM_PMDMAPPED); |
|---|
| 1249 | + if (PageSwapBacked(page)) |
|---|
| 1250 | + __inc_node_page_state(page, NR_SHMEM_PMDMAPPED); |
|---|
| 1251 | + else |
|---|
| 1252 | + __inc_node_page_state(page, NR_FILE_PMDMAPPED); |
|---|
| 1191 | 1253 | } else { |
|---|
| 1192 | 1254 | if (PageTransCompound(page) && page_mapping(page)) { |
|---|
| 1193 | 1255 | VM_WARN_ON_ONCE(!PageLocked(page)); |
|---|
| .. | .. |
|---|
| 1196 | 1258 | if (PageMlocked(page)) |
|---|
| 1197 | 1259 | clear_page_mlock(compound_head(page)); |
|---|
| 1198 | 1260 | } |
|---|
| 1199 | | - if (!atomic_inc_and_test(&page->_mapcount)) |
|---|
| 1200 | | - goto out; |
|---|
| 1261 | + trace_android_vh_update_page_mapcount(page, true, |
|---|
| 1262 | + compound, &first_mapping, &success); |
|---|
| 1263 | + if (success) { |
|---|
| 1264 | + if (!first_mapping) |
|---|
| 1265 | + goto out; |
|---|
| 1266 | + } else { |
|---|
| 1267 | + if (!atomic_inc_and_test(&page->_mapcount)) |
|---|
| 1268 | + goto out; |
|---|
| 1269 | + } |
|---|
| 1201 | 1270 | } |
|---|
| 1202 | 1271 | __mod_lruvec_page_state(page, NR_FILE_MAPPED, nr); |
|---|
| 1203 | 1272 | out: |
|---|
| .. | .. |
|---|
| 1207 | 1276 | static void page_remove_file_rmap(struct page *page, bool compound) |
|---|
| 1208 | 1277 | { |
|---|
| 1209 | 1278 | int i, nr = 1; |
|---|
| 1279 | + bool first_mapping; |
|---|
| 1280 | + bool success = false; |
|---|
| 1210 | 1281 | |
|---|
| 1211 | 1282 | VM_BUG_ON_PAGE(compound && !PageHead(page), page); |
|---|
| 1212 | | - lock_page_memcg(page); |
|---|
| 1213 | 1283 | |
|---|
| 1214 | 1284 | /* Hugepages are not counted in NR_FILE_MAPPED for now. */ |
|---|
| 1215 | 1285 | if (unlikely(PageHuge(page))) { |
|---|
| 1216 | 1286 | /* hugetlb pages are always mapped with pmds */ |
|---|
| 1217 | 1287 | atomic_dec(compound_mapcount_ptr(page)); |
|---|
| 1218 | | - goto out; |
|---|
| 1288 | + return; |
|---|
| 1219 | 1289 | } |
|---|
| 1220 | 1290 | |
|---|
| 1221 | 1291 | /* page still mapped by someone else? */ |
|---|
| 1222 | 1292 | if (compound && PageTransHuge(page)) { |
|---|
| 1223 | | - for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) { |
|---|
| 1224 | | - if (atomic_add_negative(-1, &page[i]._mapcount)) |
|---|
| 1225 | | - nr++; |
|---|
| 1293 | + for (i = 0, nr = 0; i < thp_nr_pages(page); i++) { |
|---|
| 1294 | + trace_android_vh_update_page_mapcount(&page[i], false, |
|---|
| 1295 | + compound, &first_mapping, &success); |
|---|
| 1296 | + if (success) { |
|---|
| 1297 | + if (first_mapping) |
|---|
| 1298 | + nr++; |
|---|
| 1299 | + } else { |
|---|
| 1300 | + if (atomic_add_negative(-1, &page[i]._mapcount)) |
|---|
| 1301 | + nr++; |
|---|
| 1302 | + } |
|---|
| 1226 | 1303 | } |
|---|
| 1227 | 1304 | if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) |
|---|
| 1228 | | - goto out; |
|---|
| 1229 | | - VM_BUG_ON_PAGE(!PageSwapBacked(page), page); |
|---|
| 1230 | | - __dec_node_page_state(page, NR_SHMEM_PMDMAPPED); |
|---|
| 1305 | + return; |
|---|
| 1306 | + if (PageSwapBacked(page)) |
|---|
| 1307 | + __dec_node_page_state(page, NR_SHMEM_PMDMAPPED); |
|---|
| 1308 | + else |
|---|
| 1309 | + __dec_node_page_state(page, NR_FILE_PMDMAPPED); |
|---|
| 1231 | 1310 | } else { |
|---|
| 1232 | | - if (!atomic_add_negative(-1, &page->_mapcount)) |
|---|
| 1233 | | - goto out; |
|---|
| 1311 | + trace_android_vh_update_page_mapcount(page, false, |
|---|
| 1312 | + compound, &first_mapping, &success); |
|---|
| 1313 | + if (success) { |
|---|
| 1314 | + if (!first_mapping) |
|---|
| 1315 | + return; |
|---|
| 1316 | + } else { |
|---|
| 1317 | + if (!atomic_add_negative(-1, &page->_mapcount)) |
|---|
| 1318 | + return; |
|---|
| 1319 | + } |
|---|
| 1234 | 1320 | } |
|---|
| 1235 | 1321 | |
|---|
| 1236 | 1322 | /* |
|---|
| .. | .. |
|---|
| 1242 | 1328 | |
|---|
| 1243 | 1329 | if (unlikely(PageMlocked(page))) |
|---|
| 1244 | 1330 | clear_page_mlock(page); |
|---|
| 1245 | | -out: |
|---|
| 1246 | | - unlock_page_memcg(page); |
|---|
| 1247 | 1331 | } |
|---|
| 1248 | 1332 | |
|---|
| 1249 | 1333 | static void page_remove_anon_compound_rmap(struct page *page) |
|---|
| 1250 | 1334 | { |
|---|
| 1251 | 1335 | int i, nr; |
|---|
| 1336 | + bool first_mapping; |
|---|
| 1337 | + bool success = false; |
|---|
| 1252 | 1338 | |
|---|
| 1253 | 1339 | if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) |
|---|
| 1254 | 1340 | return; |
|---|
| .. | .. |
|---|
| 1260 | 1346 | if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) |
|---|
| 1261 | 1347 | return; |
|---|
| 1262 | 1348 | |
|---|
| 1263 | | - __dec_node_page_state(page, NR_ANON_THPS); |
|---|
| 1349 | + __dec_lruvec_page_state(page, NR_ANON_THPS); |
|---|
| 1264 | 1350 | |
|---|
| 1265 | 1351 | if (TestClearPageDoubleMap(page)) { |
|---|
| 1266 | 1352 | /* |
|---|
| 1267 | 1353 | * Subpages can be mapped with PTEs too. Check how many of |
|---|
| 1268 | | - * themi are still mapped. |
|---|
| 1354 | + * them are still mapped. |
|---|
| 1269 | 1355 | */ |
|---|
| 1270 | | - for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) { |
|---|
| 1271 | | - if (atomic_add_negative(-1, &page[i]._mapcount)) |
|---|
| 1272 | | - nr++; |
|---|
| 1356 | + for (i = 0, nr = 0; i < thp_nr_pages(page); i++) { |
|---|
| 1357 | + trace_android_vh_update_page_mapcount(&page[i], false, |
|---|
| 1358 | + false, &first_mapping, &success); |
|---|
| 1359 | + if (success) { |
|---|
| 1360 | + if (first_mapping) |
|---|
| 1361 | + nr++; |
|---|
| 1362 | + } else { |
|---|
| 1363 | + if (atomic_add_negative(-1, &page[i]._mapcount)) |
|---|
| 1364 | + nr++; |
|---|
| 1365 | + } |
|---|
| 1273 | 1366 | } |
|---|
| 1367 | + |
|---|
| 1368 | + /* |
|---|
| 1369 | + * Queue the page for deferred split if at least one small |
|---|
| 1370 | + * page of the compound page is unmapped, but at least one |
|---|
| 1371 | + * small page is still mapped. |
|---|
| 1372 | + */ |
|---|
| 1373 | + if (nr && nr < thp_nr_pages(page)) |
|---|
| 1374 | + deferred_split_huge_page(page); |
|---|
| 1274 | 1375 | } else { |
|---|
| 1275 | | - nr = HPAGE_PMD_NR; |
|---|
| 1376 | + nr = thp_nr_pages(page); |
|---|
| 1276 | 1377 | } |
|---|
| 1277 | 1378 | |
|---|
| 1278 | 1379 | if (unlikely(PageMlocked(page))) |
|---|
| 1279 | 1380 | clear_page_mlock(page); |
|---|
| 1280 | 1381 | |
|---|
| 1281 | | - if (nr) { |
|---|
| 1282 | | - __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, -nr); |
|---|
| 1283 | | - deferred_split_huge_page(page); |
|---|
| 1284 | | - } |
|---|
| 1382 | + if (nr) |
|---|
| 1383 | + __mod_lruvec_page_state(page, NR_ANON_MAPPED, -nr); |
|---|
| 1285 | 1384 | } |
|---|
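The deferred-split condition introduced above is easy to check by hand. A stand-alone model (function name invented; 512 plays the role of thp_nr_pages() for a 2MB THP on x86-64):

```c
#include <stdbool.h>

/* Queue a deferred split only when the PMD mapping is gone but some of the
 * subpages are still PTE-mapped somewhere else. */
static bool should_defer_split(int newly_unmapped, int thp_pages)
{
	return newly_unmapped && newly_unmapped < thp_pages;
}

/*
 *   should_defer_split(0, 512)   -> false: every subpage still mapped, keep THP
 *   should_defer_split(512, 512) -> false: fully unmapped, normal freeing copes
 *   should_defer_split(37, 512)  -> true : partially mapped, split it later
 */
```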
| 1286 | 1385 | |
|---|
| 1287 | 1386 | /** |
|---|
| .. | .. |
|---|
| 1293 | 1392 | */ |
|---|
| 1294 | 1393 | void page_remove_rmap(struct page *page, bool compound) |
|---|
| 1295 | 1394 | { |
|---|
| 1296 | | - if (!PageAnon(page)) |
|---|
| 1297 | | - return page_remove_file_rmap(page, compound); |
|---|
| 1395 | + bool first_mapping; |
|---|
| 1396 | + bool success = false; |
|---|
| 1397 | + lock_page_memcg(page); |
|---|
| 1298 | 1398 | |
|---|
| 1299 | | - if (compound) |
|---|
| 1300 | | - return page_remove_anon_compound_rmap(page); |
|---|
| 1399 | + if (!PageAnon(page)) { |
|---|
| 1400 | + page_remove_file_rmap(page, compound); |
|---|
| 1401 | + goto out; |
|---|
| 1402 | + } |
|---|
| 1301 | 1403 | |
|---|
| 1302 | | - /* page still mapped by someone else? */ |
|---|
| 1303 | | - if (!atomic_add_negative(-1, &page->_mapcount)) |
|---|
| 1304 | | - return; |
|---|
| 1404 | + if (compound) { |
|---|
| 1405 | + page_remove_anon_compound_rmap(page); |
|---|
| 1406 | + goto out; |
|---|
| 1407 | + } |
|---|
| 1305 | 1408 | |
|---|
| 1409 | + trace_android_vh_update_page_mapcount(page, false, |
|---|
| 1410 | + compound, &first_mapping, &success); |
|---|
| 1411 | + if (success) { |
|---|
| 1412 | + if (!first_mapping) |
|---|
| 1413 | + goto out; |
|---|
| 1414 | + } else { |
|---|
| 1415 | + /* page still mapped by someone else? */ |
|---|
| 1416 | + if (!atomic_add_negative(-1, &page->_mapcount)) |
|---|
| 1417 | + goto out; |
|---|
| 1418 | + } |
|---|
| 1306 | 1419 | /* |
|---|
| 1307 | 1420 | * We use the irq-unsafe __{inc|mod}_zone_page_stat because |
|---|
| 1308 | 1421 | * these counters are not modified in interrupt context, and |
|---|
| 1309 | 1422 | * pte lock(a spinlock) is held, which implies preemption disabled. |
|---|
| 1310 | 1423 | */ |
|---|
| 1311 | | - __dec_node_page_state(page, NR_ANON_MAPPED); |
|---|
| 1424 | + __dec_lruvec_page_state(page, NR_ANON_MAPPED); |
|---|
| 1312 | 1425 | |
|---|
| 1313 | 1426 | if (unlikely(PageMlocked(page))) |
|---|
| 1314 | 1427 | clear_page_mlock(page); |
|---|
| .. | .. |
|---|
| 1325 | 1438 | * Leaving it set also helps swapoff to reinstate ptes |
|---|
| 1326 | 1439 | * faster for those pages still in swapcache. |
|---|
| 1327 | 1440 | */ |
|---|
| 1441 | +out: |
|---|
| 1442 | + unlock_page_memcg(page); |
|---|
| 1328 | 1443 | } |
|---|
| 1329 | 1444 | |
|---|
| 1330 | 1445 | /* |
|---|
| .. | .. |
|---|
| 1342 | 1457 | pte_t pteval; |
|---|
| 1343 | 1458 | struct page *subpage; |
|---|
| 1344 | 1459 | bool ret = true; |
|---|
| 1345 | | - unsigned long start = address, end; |
|---|
| 1346 | | - enum ttu_flags flags = (enum ttu_flags)arg; |
|---|
| 1460 | + struct mmu_notifier_range range; |
|---|
| 1461 | + enum ttu_flags flags = (enum ttu_flags)(long)arg; |
|---|
| 1347 | 1462 | |
|---|
| 1348 | 1463 | /* |
|---|
| 1349 | 1464 | * When racing against e.g. zap_pte_range() on another cpu, |
|---|
| .. | .. |
|---|
| 1375 | 1490 | * Note that the page can not be free in this function as call of |
|---|
| 1376 | 1491 | * try_to_unmap() must hold a reference on the page. |
|---|
| 1377 | 1492 | */ |
|---|
| 1378 | | - end = PageKsm(page) ? |
|---|
| 1493 | + range.end = PageKsm(page) ? |
|---|
| 1379 | 1494 | address + PAGE_SIZE : vma_address_end(page, vma); |
|---|
| 1495 | + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, |
|---|
| 1496 | + address, range.end); |
|---|
| 1380 | 1497 | if (PageHuge(page)) { |
|---|
| 1381 | 1498 | /* |
|---|
| 1382 | 1499 | * If sharing is possible, start and end will be adjusted |
|---|
| 1383 | 1500 | * accordingly. |
|---|
| 1384 | 1501 | */ |
|---|
| 1385 | | - adjust_range_if_pmd_sharing_possible(vma, &start, &end); |
|---|
| 1502 | + adjust_range_if_pmd_sharing_possible(vma, &range.start, |
|---|
| 1503 | + &range.end); |
|---|
| 1386 | 1504 | } |
|---|
| 1387 | | - mmu_notifier_invalidate_range_start(vma->vm_mm, start, end); |
|---|
| 1505 | + mmu_notifier_invalidate_range_start(&range); |
|---|
| 1388 | 1506 | |
|---|
| 1389 | 1507 | while (page_vma_mapped_walk(&pvmw)) { |
|---|
| 1390 | 1508 | #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION |
|---|
| .. | .. |
|---|
| 1408 | 1526 | if (!PageTransCompound(page)) { |
|---|
| 1409 | 1527 | /* |
|---|
| 1410 | 1528 | * Holding pte lock, we do *not* need |
|---|
| 1411 | | - * mmap_sem here |
|---|
| 1529 | + * mmap_lock here |
|---|
| 1412 | 1530 | */ |
|---|
| 1413 | 1531 | mlock_vma_page(page); |
|---|
| 1414 | 1532 | } |
|---|
| .. | .. |
|---|
| 1426 | 1544 | subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte); |
|---|
| 1427 | 1545 | address = pvmw.address; |
|---|
| 1428 | 1546 | |
|---|
| 1429 | | - if (PageHuge(page)) { |
|---|
| 1430 | | - if (huge_pmd_unshare(mm, &address, pvmw.pte)) { |
|---|
| 1547 | + if (PageHuge(page) && !PageAnon(page)) { |
|---|
| 1548 | + /* |
|---|
| 1549 | + * To call huge_pmd_unshare, i_mmap_rwsem must be |
|---|
| 1550 | + * held in write mode. Caller needs to explicitly |
|---|
| 1551 | + * do this outside rmap routines. |
|---|
| 1552 | + */ |
|---|
| 1553 | + VM_BUG_ON(!(flags & TTU_RMAP_LOCKED)); |
|---|
| 1554 | + if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) { |
|---|
| 1431 | 1555 | /* |
|---|
| 1432 | 1556 | * huge_pmd_unshare unmapped an entire PMD |
|---|
| 1433 | 1557 | * page. There is no way of knowing exactly |
|---|
| .. | .. |
|---|
| 1435 | 1559 | * we must flush them all. start/end were |
|---|
| 1436 | 1560 | * already adjusted above to cover this range. |
|---|
| 1437 | 1561 | */ |
|---|
| 1438 | | - flush_cache_range(vma, start, end); |
|---|
| 1439 | | - flush_tlb_range(vma, start, end); |
|---|
| 1440 | | - mmu_notifier_invalidate_range(mm, start, end); |
|---|
| 1562 | + flush_cache_range(vma, range.start, range.end); |
|---|
| 1563 | + flush_tlb_range(vma, range.start, range.end); |
|---|
| 1564 | + mmu_notifier_invalidate_range(mm, range.start, |
|---|
| 1565 | + range.end); |
|---|
| 1441 | 1566 | |
|---|
| 1442 | 1567 | /* |
|---|
| 1443 | 1568 | * The ref count of the PMD page was dropped |
|---|
| .. | .. |
|---|
| 1468 | 1593 | */ |
|---|
| 1469 | 1594 | entry = make_migration_entry(page, 0); |
|---|
| 1470 | 1595 | swp_pte = swp_entry_to_pte(entry); |
|---|
| 1471 | | - if (pte_soft_dirty(pteval)) |
|---|
| 1596 | + |
|---|
| 1597 | + /* |
|---|
| 1598 | + * pteval maps a zone device page and is therefore |
|---|
| 1599 | + * a swap pte. |
|---|
| 1600 | + */ |
|---|
| 1601 | + if (pte_swp_soft_dirty(pteval)) |
|---|
| 1472 | 1602 | swp_pte = pte_swp_mksoft_dirty(swp_pte); |
|---|
| 1603 | + if (pte_swp_uffd_wp(pteval)) |
|---|
| 1604 | + swp_pte = pte_swp_mkuffd_wp(swp_pte); |
|---|
| 1473 | 1605 | set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte); |
|---|
| 1474 | 1606 | /* |
|---|
| 1475 | 1607 | * No need to invalidate here it will synchronize on |
|---|
| .. | .. |
|---|
| 1484 | 1616 | */ |
|---|
| 1485 | 1617 | subpage = page; |
|---|
| 1486 | 1618 | goto discard; |
|---|
| 1487 | | - } |
|---|
| 1488 | | - |
|---|
| 1489 | | - if (!(flags & TTU_IGNORE_ACCESS)) { |
|---|
| 1490 | | - if (ptep_clear_flush_young_notify(vma, address, |
|---|
| 1491 | | - pvmw.pte)) { |
|---|
| 1492 | | - ret = false; |
|---|
| 1493 | | - page_vma_mapped_walk_done(&pvmw); |
|---|
| 1494 | | - break; |
|---|
| 1495 | | - } |
|---|
| 1496 | 1619 | } |
|---|
| 1497 | 1620 | |
|---|
| 1498 | 1621 | /* Nuke the page table entry. */ |
|---|
| .. | .. |
|---|
| 1523 | 1646 | if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { |
|---|
| 1524 | 1647 | pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); |
|---|
| 1525 | 1648 | if (PageHuge(page)) { |
|---|
| 1526 | | - int nr = 1 << compound_order(page); |
|---|
| 1527 | | - hugetlb_count_sub(nr, mm); |
|---|
| 1649 | + hugetlb_count_sub(compound_nr(page), mm); |
|---|
| 1528 | 1650 | set_huge_swap_pte_at(mm, address, |
|---|
| 1529 | 1651 | pvmw.pte, pteval, |
|---|
| 1530 | 1652 | vma_mmu_pagesize(vma)); |
|---|
| .. | .. |
|---|
| 1570 | 1692 | swp_pte = swp_entry_to_pte(entry); |
|---|
| 1571 | 1693 | if (pte_soft_dirty(pteval)) |
|---|
| 1572 | 1694 | swp_pte = pte_swp_mksoft_dirty(swp_pte); |
|---|
| 1695 | + if (pte_uffd_wp(pteval)) |
|---|
| 1696 | + swp_pte = pte_swp_mkuffd_wp(swp_pte); |
|---|
| 1573 | 1697 | set_pte_at(mm, address, pvmw.pte, swp_pte); |
|---|
| 1574 | 1698 | /* |
|---|
| 1575 | 1699 | * No need to invalidate here it will synchronize on |
|---|
| .. | .. |
|---|
| 1594 | 1718 | |
|---|
| 1595 | 1719 | /* MADV_FREE page check */ |
|---|
| 1596 | 1720 | if (!PageSwapBacked(page)) { |
|---|
| 1597 | | - if (!PageDirty(page)) { |
|---|
| 1721 | + int ref_count, map_count; |
|---|
| 1722 | + |
|---|
| 1723 | + /* |
|---|
| 1724 | + * Synchronize with gup_pte_range(): |
|---|
| 1725 | + * - clear PTE; barrier; read refcount |
|---|
| 1726 | + * - inc refcount; barrier; read PTE |
|---|
| 1727 | + */ |
|---|
| 1728 | + smp_mb(); |
|---|
| 1729 | + |
|---|
| 1730 | + ref_count = page_ref_count(page); |
|---|
| 1731 | + map_count = page_mapcount(page); |
|---|
| 1732 | + |
|---|
| 1733 | + /* |
|---|
| 1734 | + * Order reads for page refcount and dirty flag |
|---|
| 1735 | + * (see comments in __remove_mapping()). |
|---|
| 1736 | + */ |
|---|
| 1737 | + smp_rmb(); |
|---|
| 1738 | + |
|---|
| 1739 | + /* |
|---|
| 1740 | + * The only page refs must be one from isolation |
|---|
| 1741 | + * plus the rmap(s) (dropped by discard:). |
|---|
| 1742 | + */ |
|---|
| 1743 | + if (ref_count == 1 + map_count && |
|---|
| 1744 | + !PageDirty(page)) { |
|---|
| 1598 | 1745 | /* Invalidate as we cleared the pte */ |
|---|
| 1599 | 1746 | mmu_notifier_invalidate_range(mm, |
|---|
| 1600 | 1747 | address, address + PAGE_SIZE); |
|---|
| .. | .. |
|---|
| 1636 | 1783 | swp_pte = swp_entry_to_pte(entry); |
|---|
| 1637 | 1784 | if (pte_soft_dirty(pteval)) |
|---|
| 1638 | 1785 | swp_pte = pte_swp_mksoft_dirty(swp_pte); |
|---|
| 1786 | + if (pte_uffd_wp(pteval)) |
|---|
| 1787 | + swp_pte = pte_swp_mkuffd_wp(swp_pte); |
|---|
| 1639 | 1788 | set_pte_at(mm, address, pvmw.pte, swp_pte); |
|---|
| 1640 | 1789 | /* Invalidate as we cleared the pte */ |
|---|
| 1641 | 1790 | mmu_notifier_invalidate_range(mm, address, |
|---|
| .. | .. |
|---|
| 1665 | 1814 | put_page(page); |
|---|
| 1666 | 1815 | } |
|---|
| 1667 | 1816 | |
|---|
| 1668 | | - mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); |
|---|
| 1817 | + mmu_notifier_invalidate_range_end(&range); |
|---|
| 1818 | + trace_android_vh_try_to_unmap_one(vma, page, address, ret); |
|---|
| 1669 | 1819 | |
|---|
| 1670 | 1820 | return ret; |
|---|
| 1671 | 1821 | } |
|---|
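The MADV_FREE branch added above only discards a lazily freed page when its reference count is fully explained by the isolation reference plus its remaining mappings; anything extra means a concurrent GUP user might still write to it after the PageDirty() check. A worked, stand-alone model of that test (name and numbers are illustrative only):

```c
#include <stdbool.h>

/* One reference comes from isolating the page for reclaim, and each mapping
 * holds one more; any additional reference (e.g. a racing gup_pte_range())
 * means the page may still be dirtied behind our back. */
static bool safe_to_discard(int ref_count, int map_count, bool dirty)
{
	return ref_count == 1 + map_count && !dirty;
}

/*
 *   mapped twice, isolated, clean:        safe_to_discard(3, 2, false) -> true
 *   extra reference taken by a GUP user:  safe_to_discard(4, 2, false) -> false
 *   redirtied after MADV_FREE:            safe_to_discard(3, 2, true)  -> false
 */
```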
| 1672 | 1822 | |
|---|
| 1673 | | -bool is_vma_temporary_stack(struct vm_area_struct *vma) |
|---|
| 1674 | | -{ |
|---|
| 1675 | | - int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); |
|---|
| 1676 | | - |
|---|
| 1677 | | - if (!maybe_stack) |
|---|
| 1678 | | - return false; |
|---|
| 1679 | | - |
|---|
| 1680 | | - if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) == |
|---|
| 1681 | | - VM_STACK_INCOMPLETE_SETUP) |
|---|
| 1682 | | - return true; |
|---|
| 1683 | | - |
|---|
| 1684 | | - return false; |
|---|
| 1685 | | -} |
|---|
| 1686 | | - |
|---|
| 1687 | 1823 | static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg) |
|---|
| 1688 | 1824 | { |
|---|
| 1689 | | - return is_vma_temporary_stack(vma); |
|---|
| 1825 | + return vma_is_temporary_stack(vma); |
|---|
| 1690 | 1826 | } |
|---|
| 1691 | 1827 | |
|---|
| 1692 | 1828 | static int page_not_mapped(struct page *page) |
|---|
| .. | .. |
|---|
| 1779 | 1915 | struct anon_vma *anon_vma; |
|---|
| 1780 | 1916 | |
|---|
| 1781 | 1917 | if (rwc->anon_lock) |
|---|
| 1782 | | - return rwc->anon_lock(page); |
|---|
| 1918 | + return rwc->anon_lock(page, rwc); |
|---|
| 1783 | 1919 | |
|---|
| 1784 | 1920 | /* |
|---|
| 1785 | 1921 | * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read() |
|---|
| 1786 | 1922 | * because that depends on page_mapped(); but not all its usages |
|---|
| 1787 | | - * are holding mmap_sem. Users without mmap_sem are required to |
|---|
| 1923 | + * are holding mmap_lock. Users without mmap_lock are required to |
|---|
| 1788 | 1924 | * take a reference count to prevent the anon_vma disappearing |
|---|
| 1789 | 1925 | */ |
|---|
| 1790 | 1926 | anon_vma = page_anon_vma(page); |
|---|
| 1791 | 1927 | if (!anon_vma) |
|---|
| 1792 | 1928 | return NULL; |
|---|
| 1793 | 1929 | |
|---|
| 1930 | + if (anon_vma_trylock_read(anon_vma)) |
|---|
| 1931 | + goto out; |
|---|
| 1932 | + |
|---|
| 1933 | + if (rwc->try_lock) { |
|---|
| 1934 | + anon_vma = NULL; |
|---|
| 1935 | + rwc->contended = true; |
|---|
| 1936 | + goto out; |
|---|
| 1937 | + } |
|---|
| 1938 | + |
|---|
| 1794 | 1939 | anon_vma_lock_read(anon_vma); |
|---|
| 1940 | +out: |
|---|
| 1795 | 1941 | return anon_vma; |
|---|
| 1796 | 1942 | } |
|---|
| 1797 | 1943 | |
|---|
| .. | .. |
|---|
| 1804 | 1950 | * Find all the mappings of a page using the mapping pointer and the vma chains |
|---|
| 1805 | 1951 | * contained in the anon_vma struct it points to. |
|---|
| 1806 | 1952 | * |
|---|
| 1807 | | - * When called from try_to_munlock(), the mmap_sem of the mm containing the vma |
|---|
| 1953 | + * When called from try_to_munlock(), the mmap_lock of the mm containing the vma |
|---|
| 1808 | 1954 | * where the page was found will be held for write. So, we won't recheck |
|---|
| 1809 | 1955 | * vm_flags for that VMA. That should be OK, because that vma shouldn't be |
|---|
| 1810 | 1956 | * LOCKED. |
|---|
| .. | .. |
|---|
| 1827 | 1973 | return; |
|---|
| 1828 | 1974 | |
|---|
| 1829 | 1975 | pgoff_start = page_to_pgoff(page); |
|---|
| 1830 | | - pgoff_end = pgoff_start + hpage_nr_pages(page) - 1; |
|---|
| 1976 | + pgoff_end = pgoff_start + thp_nr_pages(page) - 1; |
|---|
| 1831 | 1977 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, |
|---|
| 1832 | 1978 | pgoff_start, pgoff_end) { |
|---|
| 1833 | 1979 | struct vm_area_struct *vma = avc->vma; |
|---|
| .. | .. |
|---|
| 1857 | 2003 | * Find all the mappings of a page using the mapping pointer and the vma chains |
|---|
| 1858 | 2004 | * contained in the address_space struct it points to. |
|---|
| 1859 | 2005 | * |
|---|
| 1860 | | - * When called from try_to_munlock(), the mmap_sem of the mm containing the vma |
|---|
| 2006 | + * When called from try_to_munlock(), the mmap_lock of the mm containing the vma |
|---|
| 1861 | 2007 | * where the page was found will be held for write. So, we won't recheck |
|---|
| 1862 | 2008 | * vm_flags for that VMA. That should be OK, because that vma shouldn't be |
|---|
| 1863 | 2009 | * LOCKED. |
|---|
| .. | .. |
|---|
| 1868 | 2014 | struct address_space *mapping = page_mapping(page); |
|---|
| 1869 | 2015 | pgoff_t pgoff_start, pgoff_end; |
|---|
| 1870 | 2016 | struct vm_area_struct *vma; |
|---|
| 2017 | + bool got_lock = false, success = false; |
|---|
| 1871 | 2018 | |
|---|
| 1872 | 2019 | /* |
|---|
| 1873 | 2020 | * The page lock not only makes sure that page->mapping cannot |
|---|
| .. | .. |
|---|
| 1881 | 2028 | return; |
|---|
| 1882 | 2029 | |
|---|
| 1883 | 2030 | pgoff_start = page_to_pgoff(page); |
|---|
| 1884 | | - pgoff_end = pgoff_start + hpage_nr_pages(page) - 1; |
|---|
| 1885 | | - if (!locked) |
|---|
| 1886 | | - i_mmap_lock_read(mapping); |
|---|
| 2031 | + pgoff_end = pgoff_start + thp_nr_pages(page) - 1; |
|---|
| 2032 | + if (!locked) { |
|---|
| 2033 | + trace_android_vh_do_page_trylock(page, |
|---|
| 2034 | + &mapping->i_mmap_rwsem, &got_lock, &success); |
|---|
| 2035 | + if (success) { |
|---|
| 2036 | + if (!got_lock) |
|---|
| 2037 | + return; |
|---|
| 2038 | + } else { |
|---|
| 2039 | + if (i_mmap_trylock_read(mapping)) |
|---|
| 2040 | + goto lookup; |
|---|
| 2041 | + |
|---|
| 2042 | + if (rwc->try_lock) { |
|---|
| 2043 | + rwc->contended = true; |
|---|
| 2044 | + return; |
|---|
| 2045 | + } |
|---|
| 2046 | + |
|---|
| 2047 | + i_mmap_lock_read(mapping); |
|---|
| 2048 | + } |
|---|
| 2049 | + } |
|---|
| 2050 | +lookup: |
|---|
| 1887 | 2051 | vma_interval_tree_foreach(vma, &mapping->i_mmap, |
|---|
| 1888 | 2052 | pgoff_start, pgoff_end) { |
|---|
| 1889 | 2053 | unsigned long address = vma_address(page, vma); |
|---|
| .. | .. |
|---|
| 1928 | 2092 | |
|---|
| 1929 | 2093 | #ifdef CONFIG_HUGETLB_PAGE |
|---|
| 1930 | 2094 | /* |
|---|
| 1931 | | - * The following three functions are for anonymous (private mapped) hugepages. |
|---|
| 2095 | + * The following two functions are for anonymous (private mapped) hugepages. |
|---|
| 1932 | 2096 | * Unlike common anonymous pages, anonymous hugepages have no accounting code |
|---|
| 1933 | 2097 | * and no lru code, because we handle hugepages differently from common pages. |
|---|
| 1934 | 2098 | */ |
|---|
| 1935 | | -static void __hugepage_set_anon_rmap(struct page *page, |
|---|
| 1936 | | - struct vm_area_struct *vma, unsigned long address, int exclusive) |
|---|
| 1937 | | -{ |
|---|
| 1938 | | - struct anon_vma *anon_vma = vma->anon_vma; |
|---|
| 1939 | | - |
|---|
| 1940 | | - BUG_ON(!anon_vma); |
|---|
| 1941 | | - |
|---|
| 1942 | | - if (PageAnon(page)) |
|---|
| 1943 | | - return; |
|---|
| 1944 | | - if (!exclusive) |
|---|
| 1945 | | - anon_vma = anon_vma->root; |
|---|
| 1946 | | - |
|---|
| 1947 | | - anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; |
|---|
| 1948 | | - page->mapping = (struct address_space *) anon_vma; |
|---|
| 1949 | | - page->index = linear_page_index(vma, address); |
|---|
| 1950 | | -} |
|---|
| 1951 | | - |
|---|
| 1952 | 2099 | void hugepage_add_anon_rmap(struct page *page, |
|---|
| 1953 | 2100 | struct vm_area_struct *vma, unsigned long address) |
|---|
| 1954 | 2101 | { |
|---|
| .. | .. |
|---|
| 1960 | 2107 | /* address might be in next vma when migration races vma_adjust */ |
|---|
| 1961 | 2108 | first = atomic_inc_and_test(compound_mapcount_ptr(page)); |
|---|
| 1962 | 2109 | if (first) |
|---|
| 1963 | | - __hugepage_set_anon_rmap(page, vma, address, 0); |
|---|
| 2110 | + __page_set_anon_rmap(page, vma, address, 0); |
|---|
| 1964 | 2111 | } |
|---|
| 1965 | 2112 | |
|---|
| 1966 | 2113 | void hugepage_add_new_anon_rmap(struct page *page, |
|---|
| .. | .. |
|---|
| 1968 | 2115 | { |
|---|
| 1969 | 2116 | BUG_ON(address < vma->vm_start || address >= vma->vm_end); |
|---|
| 1970 | 2117 | atomic_set(compound_mapcount_ptr(page), 0); |
|---|
| 1971 | | - __hugepage_set_anon_rmap(page, vma, address, 1); |
|---|
| 2118 | + if (hpage_pincount_available(page)) |
|---|
| 2119 | + atomic_set(compound_pincount_ptr(page), 0); |
|---|
| 2120 | + |
|---|
| 2121 | + __page_set_anon_rmap(page, vma, address, 1); |
|---|
| 1972 | 2122 | } |
|---|
| 1973 | 2123 | #endif /* CONFIG_HUGETLB_PAGE */ |
|---|