From 8ac6c7a54ed1b98d142dce24b11c6de6a1e239a5 Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Tue, 22 Oct 2024 10:36:11 +0000
Subject: [PATCH] Change 4G dialing to QMI; quectel-CM needs to run in the
 background on the system

---
 kernel/mm/rmap.c | 482 +++++++++++++++++++++++++++++++++++------------------
 1 file changed, 316 insertions(+), 166 deletions(-)

diff --git a/kernel/mm/rmap.c b/kernel/mm/rmap.c
index 699f445..5338a8b 100644
--- a/kernel/mm/rmap.c
+++ b/kernel/mm/rmap.c
@@ -21,13 +21,14 @@
  * Lock ordering in mm:
  *
  * inode->i_mutex	(while writing or truncating, not reading or faulting)
- *   mm->mmap_sem
- *     page->flags PG_locked (lock_page)
+ *   mm->mmap_lock
+ *     page->flags PG_locked (lock_page)   * (see huegtlbfs below)
  *       hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
  *         mapping->i_mmap_rwsem
+ *           hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
  *           anon_vma->rwsem
  *             mm->page_table_lock or pte_lock
- *               zone_lru_lock (in mark_page_accessed, isolate_lru_page)
+ *               pgdat->lru_lock (in mark_page_accessed, isolate_lru_page)
  *                 swap_lock (in swap_duplicate, swap_info_get)
  *                   mmlist_lock (in mmput, drain_mmlist and others)
  *                   mapping->private_lock (in __set_page_dirty_buffers)
@@ -43,6 +44,11 @@
  *   anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon)
  *     ->tasklist_lock
  *       pte map lock
+ *
+ * * hugetlbfs PageHuge() pages take locks in this order:
+ *         mapping->i_mmap_rwsem
+ *           hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
+ *             page->flags PG_locked (lock_page)
  */
 
 #include <linux/mm.h>
@@ -61,6 +67,7 @@
 #include <linux/mmu_notifier.h>
 #include <linux/migrate.h>
 #include <linux/hugetlb.h>
+#include <linux/huge_mm.h>
 #include <linux/backing-dev.h>
 #include <linux/page_idle.h>
 #include <linux/memremap.h>
@@ -69,6 +76,8 @@
 
 #include <asm/tlbflush.h>
 
 #include <trace/events/tlb.h>
+
+#include <trace/hooks/mm.h>
 
 #include "internal.h"
@@ -82,7 +91,8 @@
 	anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
 	if (anon_vma) {
 		atomic_set(&anon_vma->refcount, 1);
-		anon_vma->degree = 1;	/* Reference for first vma */
+		anon_vma->num_children = 0;
+		anon_vma->num_active_vmas = 0;
 		anon_vma->parent = anon_vma;
 		/*
 		 * Initialise the anon_vma root to point to itself. If called
@@ -170,7 +180,7 @@
  * to do any locking for the common case of already having
  * an anon_vma.
  *
- * This must be called with the mmap_sem held for reading.
+ * This must be called with the mmap_lock held for reading.
  */
 int __anon_vma_prepare(struct vm_area_struct *vma)
 {
@@ -190,6 +200,7 @@
 		anon_vma = anon_vma_alloc();
 		if (unlikely(!anon_vma))
 			goto out_enomem_free_avc;
+		anon_vma->num_children++; /* self-parent link for new root */
 		allocated = anon_vma;
 	}
@@ -199,8 +210,7 @@
 	if (likely(!vma->anon_vma)) {
 		vma->anon_vma = anon_vma;
 		anon_vma_chain_link(vma, avc, anon_vma);
-		/* vma reference or self-parent link for new root */
-		anon_vma->degree++;
+		anon_vma->num_active_vmas++;
 		allocated = NULL;
 		avc = NULL;
 	}
@@ -250,13 +260,19 @@
  * Attach the anon_vmas from src to dst.
  * Returns 0 on success, -ENOMEM on failure.
  *
- * If dst->anon_vma is NULL this function tries to find and reuse existing
- * anon_vma which has no vmas and only one child anon_vma. This prevents
- * degradation of anon_vma hierarchy to endless linear chain in case of
- * constantly forking task. On the other hand, an anon_vma with more than one
- * child isn't reused even if there was no alive vma, thus rmap walker has a
- * good chance of avoiding scanning the whole hierarchy when it searches where
- * page is mapped.
+ * anon_vma_clone() is called by __vma_adjust(), __split_vma(), copy_vma() and
+ * anon_vma_fork(). The first three want an exact copy of src, while the last
+ * one, anon_vma_fork(), may try to reuse an existing anon_vma to prevent
+ * endless growth of anon_vma. Since dst->anon_vma is set to NULL before call,
+ * we can identify this case by checking (!dst->anon_vma && src->anon_vma).
+ *
+ * If (!dst->anon_vma && src->anon_vma) is true, this function tries to find
+ * and reuse existing anon_vma which has no vmas and only one child anon_vma.
+ * This prevents degradation of anon_vma hierarchy to endless linear chain in
+ * case of constantly forking task. On the other hand, an anon_vma with more
+ * than one child isn't reused even if there was no alive vma, thus rmap
+ * walker has a good chance of avoiding scanning the whole hierarchy when it
+ * searches where page is mapped.
  */
 int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
 {
@@ -279,19 +295,19 @@
 		anon_vma_chain_link(dst, avc, anon_vma);
 
 		/*
-		 * Reuse existing anon_vma if its degree lower than two,
-		 * that means it has no vma and only one anon_vma child.
+		 * Reuse existing anon_vma if it has no vma and only one
+		 * anon_vma child.
 		 *
-		 * Do not chose parent anon_vma, otherwise first child
-		 * will always reuse it. Root anon_vma is never reused:
+		 * Root anon_vma is never reused:
 		 * it has self-parent reference and at least one child.
 		 */
-		if (!dst->anon_vma && anon_vma != src->anon_vma &&
-		    anon_vma->degree < 2)
+		if (!dst->anon_vma && src->anon_vma &&
+		    anon_vma->num_children < 2 &&
+		    anon_vma->num_active_vmas == 0)
 			dst->anon_vma = anon_vma;
 	}
 	if (dst->anon_vma)
-		dst->anon_vma->degree++;
+		dst->anon_vma->num_active_vmas++;
 	unlock_anon_vma_root(root);
 	return 0;
@@ -341,6 +357,7 @@
 	anon_vma = anon_vma_alloc();
 	if (!anon_vma)
 		goto out_error;
+	anon_vma->num_active_vmas++;
 	avc = anon_vma_chain_alloc(GFP_KERNEL);
 	if (!avc)
 		goto out_error_free_anon_vma;
@@ -361,7 +378,7 @@
 	vma->anon_vma = anon_vma;
 	anon_vma_lock_write(anon_vma);
 	anon_vma_chain_link(vma, avc, anon_vma);
-	anon_vma->parent->degree++;
+	anon_vma->parent->num_children++;
 	anon_vma_unlock_write(anon_vma);
 
 	return 0;
@@ -393,7 +410,7 @@
 		 * to free them outside the lock.
 		 */
 		if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) {
-			anon_vma->parent->degree--;
+			anon_vma->parent->num_children--;
 			continue;
 		}
@@ -401,7 +418,8 @@
 		anon_vma_chain_free(avc);
 	}
 	if (vma->anon_vma)
-		vma->anon_vma->degree--;
+		vma->anon_vma->num_active_vmas--;
+
 	unlock_anon_vma_root(root);
 
 	/*
@@ -412,7 +430,8 @@
 	list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
 		struct anon_vma *anon_vma = avc->anon_vma;
 
-		VM_WARN_ON(anon_vma->degree);
+		VM_WARN_ON(anon_vma->num_children);
+		VM_WARN_ON(anon_vma->num_active_vmas);
 		put_anon_vma(anon_vma);
 
 		list_del(&avc->same_vma);
@@ -457,9 +476,10 @@
  * chain and verify that the page in question is indeed mapped in it
 * [ something equivalent to page_mapped_in_vma() ].
 *
- * Since anon_vma's slab is DESTROY_BY_RCU and we know from page_remove_rmap()
- * that the anon_vma pointer from page->mapping is valid if there is a
- * mapcount, we can dereference the anon_vma after observing those.
+ * Since anon_vma's slab is SLAB_TYPESAFE_BY_RCU and we know from
+ * page_remove_rmap() that the anon_vma pointer from page->mapping is valid
+ * if there is a mapcount, we can dereference the anon_vma after observing
+ * those.
  */
 struct anon_vma *page_get_anon_vma(struct page *page)
 {
@@ -502,13 +522,16 @@
  *
  * Its a little more complex as it tries to keep the fast path to a single
  * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
- * reference like with page_get_anon_vma() and then block on the mutex.
+ * reference like with page_get_anon_vma() and then block on the mutex
+ * on !rwc->try_lock case.
  */
-struct anon_vma *page_lock_anon_vma_read(struct page *page)
+struct anon_vma *page_lock_anon_vma_read(struct page *page,
+					 struct rmap_walk_control *rwc)
 {
 	struct anon_vma *anon_vma = NULL;
 	struct anon_vma *root_anon_vma;
 	unsigned long anon_mapping;
+	bool success = false;
 
 	rcu_read_lock();
 	anon_mapping = (unsigned long)READ_ONCE(page->mapping);
@@ -529,6 +552,17 @@
 			up_read(&root_anon_vma->rwsem);
 			anon_vma = NULL;
 		}
+		goto out;
+	}
+	trace_android_vh_do_page_trylock(page, NULL, NULL, &success);
+	if (success) {
+		anon_vma = NULL;
+		goto out;
+	}
+
+	if (rwc && rwc->try_lock) {
+		anon_vma = NULL;
+		rwc->contended = true;
 		goto out;
 	}
@@ -658,7 +692,7 @@
  */
 void flush_tlb_batched_pending(struct mm_struct *mm)
 {
-	if (mm->tlb_flush_batched) {
+	if (data_race(mm->tlb_flush_batched)) {
 		flush_tlb_mm(mm);
 
 		/*
@@ -768,6 +802,7 @@
 		}
 
 		if (pvmw.pte) {
+			trace_android_vh_look_around(&pvmw, page, vma, &referenced);
 			if (ptep_clear_flush_young_notify(vma, address,
 						pvmw.pte)) {
 				/*
@@ -803,6 +838,7 @@
 		pra->vm_flags |= vma->vm_flags;
 	}
 
+	trace_android_vh_page_referenced_one_end(vma, page, referenced);
 	if (!pra->mapcount)
 		return false; /* To break the loop */
@@ -827,8 +863,10 @@
  * @memcg: target memory cgroup
  * @vm_flags: collect encountered vma->vm_flags who actually referenced the page
 *
- * Quick test_and_clear_referenced for all mappings to a page,
- * returns the number of ptes which referenced the page.
+ * Quick test_and_clear_referenced for all mappings of a page,
+ *
+ * Return: The number of mappings which referenced the page. Return -1 if
+ * the function bailed out due to rmap lock contention.
  */
 int page_referenced(struct page *page,
 		    int is_locked,
@@ -844,10 +882,11 @@
 		.rmap_one = page_referenced_one,
 		.arg = (void *)&pra,
 		.anon_lock = page_lock_anon_vma_read,
+		.try_lock = true,
 	};
 
 	*vm_flags = 0;
-	if (!page_mapped(page))
+	if (!pra.mapcount)
 		return 0;
 
 	if (!page_rmapping(page))
@@ -874,8 +913,9 @@
 	if (we_locked)
 		unlock_page(page);
 
-	return pra.referenced;
+	return rwc.contended ? -1 : pra.referenced;
 }
+EXPORT_SYMBOL_GPL(page_referenced);
 
 static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
 			     unsigned long address, void *arg)
@@ -886,21 +926,22 @@
 		.address = address,
 		.flags = PVMW_SYNC,
 	};
-	unsigned long start = address, end;
+	struct mmu_notifier_range range;
 	int *cleaned = arg;
 
 	/*
 	 * We have to assume the worse case ie pmd for invalidation. Note that
 	 * the page can not be free from this function.
 	 */
-	end = vma_address_end(page, vma);
-	mmu_notifier_invalidate_range_start(vma->vm_mm, start, end);
+	mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
+				0, vma, vma->vm_mm, address,
+				vma_address_end(page, vma));
+	mmu_notifier_invalidate_range_start(&range);
 
 	while (page_vma_mapped_walk(&pvmw)) {
-		unsigned long cstart;
 		int ret = 0;
 
-		cstart = address = pvmw.address;
+		address = pvmw.address;
 		if (pvmw.pte) {
 			pte_t entry;
 			pte_t *pte = pvmw.pte;
@@ -915,7 +956,7 @@
 			set_pte_at(vma->vm_mm, address, pte, entry);
 			ret = 1;
 		} else {
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 			pmd_t *pmd = pvmw.pmd;
 			pmd_t entry;
@@ -927,7 +968,6 @@
 			entry = pmd_wrprotect(entry);
 			entry = pmd_mkclean(entry);
 			set_pmd_at(vma->vm_mm, address, pmd, entry);
-			cstart &= PMD_MASK;
 			ret = 1;
 #else
 			/* unexpected pmd-mapped page? */
@@ -946,7 +986,7 @@
 			(*cleaned)++;
 	}
 
-	mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);
+	mmu_notifier_invalidate_range_end(&range);
 
 	return true;
 }
@@ -1014,7 +1054,7 @@
 
 /**
  * __page_set_anon_rmap - set up new anonymous rmap
- * @page:	Page to add to rmap
+ * @page:	Page or Hugepage to add to rmap
  * @vma:	VM area to add page to.
  * @address:	User virtual address of the mapping
 * @exclusive:	the page is exclusively owned by the current process
@@ -1051,7 +1091,6 @@
 static void __page_check_anon_rmap(struct page *page,
 	struct vm_area_struct *vma, unsigned long address)
 {
-#ifdef CONFIG_DEBUG_VM
 	/*
 	 * The page's anon-rmap details (mapping and index) are guaranteed to
 	 * be set up correctly at this point.
@@ -1064,9 +1103,9 @@
 	 * are initially only visible via the pagetables, and the pte is locked
 	 * over the call to page_add_new_anon_rmap.
 	 */
-	BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root);
-	BUG_ON(page_to_pgoff(page) != linear_page_index(vma, address));
-#endif
+	VM_BUG_ON_PAGE(page_anon_vma(page)->root != vma->anon_vma->root, page);
+	VM_BUG_ON_PAGE(page_to_pgoff(page) != linear_page_index(vma, address),
+		       page);
 }
 
 /**
@@ -1097,6 +1136,12 @@
 {
 	bool compound = flags & RMAP_COMPOUND;
 	bool first;
+	bool success = false;
+
+	if (unlikely(PageKsm(page)))
+		lock_page_memcg(page);
+	else
+		VM_BUG_ON_PAGE(!PageLocked(page), page);
 
 	if (compound) {
 		atomic_t *mapcount;
@@ -1105,11 +1150,14 @@
 		mapcount = compound_mapcount_ptr(page);
 		first = atomic_inc_and_test(mapcount);
 	} else {
-		first = atomic_inc_and_test(&page->_mapcount);
+		trace_android_vh_update_page_mapcount(page, true, compound,
+					&first, &success);
+		if (!success)
+			first = atomic_inc_and_test(&page->_mapcount);
 	}
 
 	if (first) {
-		int nr = compound ? hpage_nr_pages(page) : 1;
+		int nr = compound ? thp_nr_pages(page) : 1;
 		/*
 		 * We use the irq-unsafe __{inc|mod}_zone_page_stat because
 		 * these counters are not modified in interrupt context, and
@@ -1117,13 +1165,14 @@
 		 * disabled.
 		 */
 		if (compound)
-			__inc_node_page_state(page, NR_ANON_THPS);
-		__mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr);
+			__inc_lruvec_page_state(page, NR_ANON_THPS);
+		__mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
 	}
-	if (unlikely(PageKsm(page)))
-		return;
-
-	VM_BUG_ON_PAGE(!PageLocked(page), page);
+
+	if (unlikely(PageKsm(page))) {
+		unlock_page_memcg(page);
+		return;
+	}
 
 	/* address might be in next vma when migration races vma_adjust */
 	if (first)
@@ -1134,7 +1183,7 @@
 }
 
 /**
- * page_add_new_anon_rmap - add pte mapping to a new anonymous page
+ * __page_add_new_anon_rmap - add pte mapping to a new anonymous page
  * @page:	the page to add the mapping to
  * @vma:	the vm area in which the mapping is added
  * @address:	the user virtual address mapped
@@ -1144,25 +1193,27 @@
  * This means the inc-and-test can be bypassed.
  * Page does not have to be locked.
  */
-void page_add_new_anon_rmap(struct page *page,
+void __page_add_new_anon_rmap(struct page *page,
 	struct vm_area_struct *vma, unsigned long address, bool compound)
 {
-	int nr = compound ? hpage_nr_pages(page) : 1;
+	int nr = compound ? thp_nr_pages(page) : 1;
 
-	VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
 	__SetPageSwapBacked(page);
 	if (compound) {
 		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
 		/* increment count (starts at -1) */
 		atomic_set(compound_mapcount_ptr(page), 0);
-		__inc_node_page_state(page, NR_ANON_THPS);
+		if (hpage_pincount_available(page))
+			atomic_set(compound_pincount_ptr(page), 0);
+
+		__inc_lruvec_page_state(page, NR_ANON_THPS);
 	} else {
 		/* Anon THP always mapped first with PMD */
 		VM_BUG_ON_PAGE(PageTransCompound(page), page);
 		/* increment count (starts at -1) */
 		atomic_set(&page->_mapcount, 0);
 	}
-	__mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr);
+	__mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
 	__page_set_anon_rmap(page, vma, address, 1);
 }
@@ -1176,18 +1227,29 @@
 void page_add_file_rmap(struct page *page, bool compound)
 {
 	int i, nr = 1;
+	bool first_mapping;
+	bool success = false;
 
 	VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
 	lock_page_memcg(page);
 	if (compound && PageTransHuge(page)) {
-		for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
-			if (atomic_inc_and_test(&page[i]._mapcount))
-				nr++;
+		for (i = 0, nr = 0; i < thp_nr_pages(page); i++) {
+			trace_android_vh_update_page_mapcount(&page[i], true,
+					compound, &first_mapping, &success);
+			if ((success)) {
+				if (first_mapping)
+					nr++;
+			} else {
+				if (atomic_inc_and_test(&page[i]._mapcount))
+					nr++;
+			}
 		}
 		if (!atomic_inc_and_test(compound_mapcount_ptr(page)))
 			goto out;
-		VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
-		__inc_node_page_state(page, NR_SHMEM_PMDMAPPED);
+		if (PageSwapBacked(page))
+			__inc_node_page_state(page, NR_SHMEM_PMDMAPPED);
+		else
+			__inc_node_page_state(page, NR_FILE_PMDMAPPED);
 	} else {
 		if (PageTransCompound(page) && page_mapping(page)) {
 			VM_WARN_ON_ONCE(!PageLocked(page));
@@ -1196,8 +1258,15 @@
 			if (PageMlocked(page))
 				clear_page_mlock(compound_head(page));
 		}
-		if (!atomic_inc_and_test(&page->_mapcount))
-			goto out;
+		trace_android_vh_update_page_mapcount(page, true,
+				compound, &first_mapping, &success);
+		if (success) {
+			if (!first_mapping)
+				goto out;
+		} else {
+			if (!atomic_inc_and_test(&page->_mapcount))
+				goto out;
+		}
 	}
 	__mod_lruvec_page_state(page, NR_FILE_MAPPED, nr);
 out:
@@ -1207,30 +1276,47 @@
 static void page_remove_file_rmap(struct page *page, bool compound)
 {
 	int i, nr = 1;
+	bool first_mapping;
+	bool success = false;
 
 	VM_BUG_ON_PAGE(compound && !PageHead(page), page);
-	lock_page_memcg(page);
 
 	/* Hugepages are not counted in NR_FILE_MAPPED for now. */
 	if (unlikely(PageHuge(page))) {
 		/* hugetlb pages are always mapped with pmds */
 		atomic_dec(compound_mapcount_ptr(page));
-		goto out;
+		return;
 	}
 
 	/* page still mapped by someone else? */
 	if (compound && PageTransHuge(page)) {
-		for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
-			if (atomic_add_negative(-1, &page[i]._mapcount))
-				nr++;
+		for (i = 0, nr = 0; i < thp_nr_pages(page); i++) {
+			trace_android_vh_update_page_mapcount(&page[i], false,
+					compound, &first_mapping, &success);
+			if (success) {
+				if (first_mapping)
+					nr++;
+			} else {
+				if (atomic_add_negative(-1, &page[i]._mapcount))
+					nr++;
+			}
 		}
 		if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
-			goto out;
-		VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
-		__dec_node_page_state(page, NR_SHMEM_PMDMAPPED);
+			return;
+		if (PageSwapBacked(page))
+			__dec_node_page_state(page, NR_SHMEM_PMDMAPPED);
+		else
+			__dec_node_page_state(page, NR_FILE_PMDMAPPED);
 	} else {
-		if (!atomic_add_negative(-1, &page->_mapcount))
-			goto out;
+		trace_android_vh_update_page_mapcount(page, false,
+				compound, &first_mapping, &success);
+		if (success) {
+			if (!first_mapping)
+				return;
+		} else {
+			if (!atomic_add_negative(-1, &page->_mapcount))
+				return;
+		}
 	}
 
 	/*
@@ -1242,13 +1328,13 @@
 	if (unlikely(PageMlocked(page)))
 		clear_page_mlock(page);
-out:
-	unlock_page_memcg(page);
 }
 
 static void page_remove_anon_compound_rmap(struct page *page)
 {
 	int i, nr;
+	bool first_mapping;
+	bool success = false;
 
 	if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
 		return;
@@ -1260,28 +1346,41 @@
 	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
 		return;
 
-	__dec_node_page_state(page, NR_ANON_THPS);
+	__dec_lruvec_page_state(page, NR_ANON_THPS);
 
 	if (TestClearPageDoubleMap(page)) {
 		/*
 		 * Subpages can be mapped with PTEs too. Check how many of
-		 * themi are still mapped.
+		 * them are still mapped.
 		 */
-		for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
-			if (atomic_add_negative(-1, &page[i]._mapcount))
-				nr++;
+		for (i = 0, nr = 0; i < thp_nr_pages(page); i++) {
+			trace_android_vh_update_page_mapcount(&page[i], false,
+					false, &first_mapping, &success);
+			if (success) {
+				if (first_mapping)
+					nr++;
+			} else {
+				if (atomic_add_negative(-1, &page[i]._mapcount))
+					nr++;
+			}
 		}
+
+		/*
+		 * Queue the page for deferred split if at least one small
+		 * page of the compound page is unmapped, but at least one
+		 * small page is still mapped.
+		 */
+		if (nr && nr < thp_nr_pages(page))
+			deferred_split_huge_page(page);
 	} else {
-		nr = HPAGE_PMD_NR;
+		nr = thp_nr_pages(page);
 	}
 
 	if (unlikely(PageMlocked(page)))
 		clear_page_mlock(page);
 
-	if (nr) {
-		__mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, -nr);
-		deferred_split_huge_page(page);
-	}
+	if (nr)
+		__mod_lruvec_page_state(page, NR_ANON_MAPPED, -nr);
 }
 
 /**
@@ -1293,22 +1392,36 @@
  */
 void page_remove_rmap(struct page *page, bool compound)
 {
-	if (!PageAnon(page))
-		return page_remove_file_rmap(page, compound);
+	bool first_mapping;
+	bool success = false;
+	lock_page_memcg(page);
 
-	if (compound)
-		return page_remove_anon_compound_rmap(page);
+	if (!PageAnon(page)) {
+		page_remove_file_rmap(page, compound);
+		goto out;
+	}
 
-	/* page still mapped by someone else? */
-	if (!atomic_add_negative(-1, &page->_mapcount))
-		return;
+	if (compound) {
+		page_remove_anon_compound_rmap(page);
+		goto out;
+	}
+	trace_android_vh_update_page_mapcount(page, false,
+			compound, &first_mapping, &success);
+	if (success) {
+		if (!first_mapping)
+			goto out;
+	} else {
+		/* page still mapped by someone else? */
+		if (!atomic_add_negative(-1, &page->_mapcount))
+			goto out;
+	}
 
 	/*
 	 * We use the irq-unsafe __{inc|mod}_zone_page_stat because
 	 * these counters are not modified in interrupt context, and
 	 * pte lock(a spinlock) is held, which implies preemption disabled.
 	 */
-	__dec_node_page_state(page, NR_ANON_MAPPED);
+	__dec_lruvec_page_state(page, NR_ANON_MAPPED);
 
 	if (unlikely(PageMlocked(page)))
 		clear_page_mlock(page);
@@ -1325,6 +1438,8 @@
 	 * Leaving it set also helps swapoff to reinstate ptes
 	 * faster for those pages still in swapcache.
 	 */
+out:
+	unlock_page_memcg(page);
 }
 
 /*
@@ -1342,8 +1457,8 @@
 	pte_t pteval;
 	struct page *subpage;
 	bool ret = true;
-	unsigned long start = address, end;
-	enum ttu_flags flags = (enum ttu_flags)arg;
+	struct mmu_notifier_range range;
+	enum ttu_flags flags = (enum ttu_flags)(long)arg;
 
 	/*
 	 * When racing against e.g. zap_pte_range() on another cpu,
@@ -1375,16 +1490,19 @@
 	 * Note that the page can not be free in this function as call of
 	 * try_to_unmap() must hold a reference on the page.
 	 */
-	end = PageKsm(page) ?
+	range.end = PageKsm(page) ?
 			address + PAGE_SIZE : vma_address_end(page, vma);
+	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+				address, range.end);
 	if (PageHuge(page)) {
 		/*
 		 * If sharing is possible, start and end will be adjusted
 		 * accordingly.
 		 */
-		adjust_range_if_pmd_sharing_possible(vma, &start, &end);
+		adjust_range_if_pmd_sharing_possible(vma, &range.start,
+						     &range.end);
 	}
-	mmu_notifier_invalidate_range_start(vma->vm_mm, start, end);
+	mmu_notifier_invalidate_range_start(&range);
 
 	while (page_vma_mapped_walk(&pvmw)) {
 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
@@ -1408,7 +1526,7 @@
 			if (!PageTransCompound(page)) {
 				/*
 				 * Holding pte lock, we do *not* need
-				 * mmap_sem here
+				 * mmap_lock here
 				 */
 				mlock_vma_page(page);
 			}
@@ -1426,8 +1544,14 @@
 		subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
 		address = pvmw.address;
 
-		if (PageHuge(page)) {
-			if (huge_pmd_unshare(mm, &address, pvmw.pte)) {
+		if (PageHuge(page) && !PageAnon(page)) {
+			/*
+			 * To call huge_pmd_unshare, i_mmap_rwsem must be
+			 * held in write mode. Caller needs to explicitly
+			 * do this outside rmap routines.
+			 */
+			VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
+			if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) {
 				/*
 				 * huge_pmd_unshare unmapped an entire PMD
 				 * page. There is no way of knowing exactly
@@ -1435,9 +1559,10 @@
 				 * we must flush them all. start/end were
 				 * already adjusted above to cover this range.
 				 */
-				flush_cache_range(vma, start, end);
-				flush_tlb_range(vma, start, end);
-				mmu_notifier_invalidate_range(mm, start, end);
+				flush_cache_range(vma, range.start, range.end);
+				flush_tlb_range(vma, range.start, range.end);
+				mmu_notifier_invalidate_range(mm, range.start,
+							      range.end);
 
 				/*
 				 * The ref count of the PMD page was dropped
@@ -1468,8 +1593,15 @@
 			 */
 			entry = make_migration_entry(page, 0);
 			swp_pte = swp_entry_to_pte(entry);
-			if (pte_soft_dirty(pteval))
+
+			/*
+			 * pteval maps a zone device page and is therefore
+			 * a swap pte.
+			 */
+			if (pte_swp_soft_dirty(pteval))
 				swp_pte = pte_swp_mksoft_dirty(swp_pte);
+			if (pte_swp_uffd_wp(pteval))
+				swp_pte = pte_swp_mkuffd_wp(swp_pte);
 			set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
 			/*
 			 * No need to invalidate here it will synchronize on
@@ -1484,15 +1616,6 @@
 			 */
 			subpage = page;
 			goto discard;
-		}
-
-		if (!(flags & TTU_IGNORE_ACCESS)) {
-			if (ptep_clear_flush_young_notify(vma, address,
-						pvmw.pte)) {
-				ret = false;
-				page_vma_mapped_walk_done(&pvmw);
-				break;
-			}
 		}
 
 		/* Nuke the page table entry. */
@@ -1523,8 +1646,7 @@
 		if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
 			pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
 			if (PageHuge(page)) {
-				int nr = 1 << compound_order(page);
-				hugetlb_count_sub(nr, mm);
+				hugetlb_count_sub(compound_nr(page), mm);
 				set_huge_swap_pte_at(mm, address,
 						     pvmw.pte, pteval,
 						     vma_mmu_pagesize(vma));
@@ -1570,6 +1692,8 @@
 			swp_pte = swp_entry_to_pte(entry);
 			if (pte_soft_dirty(pteval))
 				swp_pte = pte_swp_mksoft_dirty(swp_pte);
+			if (pte_uffd_wp(pteval))
+				swp_pte = pte_swp_mkuffd_wp(swp_pte);
 			set_pte_at(mm, address, pvmw.pte, swp_pte);
 			/*
 			 * No need to invalidate here it will synchronize on
@@ -1594,7 +1718,30 @@
 
 			/* MADV_FREE page check */
 			if (!PageSwapBacked(page)) {
-				if (!PageDirty(page)) {
+				int ref_count, map_count;
+
+				/*
+				 * Synchronize with gup_pte_range():
+				 * - clear PTE; barrier; read refcount
+				 * - inc refcount; barrier; read PTE
+				 */
+				smp_mb();
+
+				ref_count = page_ref_count(page);
+				map_count = page_mapcount(page);
+
+				/*
+				 * Order reads for page refcount and dirty flag
+				 * (see comments in __remove_mapping()).
+				 */
+				smp_rmb();
+
+				/*
+				 * The only page refs must be one from isolation
+				 * plus the rmap(s) (dropped by discard:).
+				 */
+				if (ref_count == 1 + map_count &&
+				    !PageDirty(page)) {
 					/* Invalidate as we cleared the pte */
 					mmu_notifier_invalidate_range(mm,
 						address, address + PAGE_SIZE);
@@ -1636,6 +1783,8 @@
 			swp_pte = swp_entry_to_pte(entry);
 			if (pte_soft_dirty(pteval))
 				swp_pte = pte_swp_mksoft_dirty(swp_pte);
+			if (pte_uffd_wp(pteval))
+				swp_pte = pte_swp_mkuffd_wp(swp_pte);
 			set_pte_at(mm, address, pvmw.pte, swp_pte);
 			/* Invalidate as we cleared the pte */
 			mmu_notifier_invalidate_range(mm, address,
@@ -1665,28 +1814,15 @@
 		put_page(page);
 	}
 
-	mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);
+	mmu_notifier_invalidate_range_end(&range);
+	trace_android_vh_try_to_unmap_one(vma, page, address, ret);
 
 	return ret;
 }
 
-bool is_vma_temporary_stack(struct vm_area_struct *vma)
-{
-	int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
-
-	if (!maybe_stack)
-		return false;
-
-	if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
-						VM_STACK_INCOMPLETE_SETUP)
-		return true;
-
-	return false;
-}
-
 static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
 {
-	return is_vma_temporary_stack(vma);
+	return vma_is_temporary_stack(vma);
 }
 
 static int page_not_mapped(struct page *page)
@@ -1779,19 +1915,29 @@
 	struct anon_vma *anon_vma;
 
 	if (rwc->anon_lock)
-		return rwc->anon_lock(page);
+		return rwc->anon_lock(page, rwc);
 
 	/*
 	 * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
 	 * because that depends on page_mapped(); but not all its usages
-	 * are holding mmap_sem. Users without mmap_sem are required to
+	 * are holding mmap_lock. Users without mmap_lock are required to
 	 * take a reference count to prevent the anon_vma disappearing
 	 */
 	anon_vma = page_anon_vma(page);
 	if (!anon_vma)
 		return NULL;
 
+	if (anon_vma_trylock_read(anon_vma))
+		goto out;
+
+	if (rwc->try_lock) {
+		anon_vma = NULL;
+		rwc->contended = true;
+		goto out;
+	}
+
 	anon_vma_lock_read(anon_vma);
+out:
 	return anon_vma;
 }
@@ -1804,7 +1950,7 @@
  * Find all the mappings of a page using the mapping pointer and the vma chains
  * contained in the anon_vma struct it points to.
 *
- * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
+ * When called from try_to_munlock(), the mmap_lock of the mm containing the vma
  * where the page was found will be held for write. So, we won't recheck
  * vm_flags for that VMA. That should be OK, because that vma shouldn't be
  * LOCKED.
@@ -1827,7 +1973,7 @@
 		return;
 
 	pgoff_start = page_to_pgoff(page);
-	pgoff_end = pgoff_start + hpage_nr_pages(page) - 1;
+	pgoff_end = pgoff_start + thp_nr_pages(page) - 1;
 	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
 			pgoff_start, pgoff_end) {
 		struct vm_area_struct *vma = avc->vma;
@@ -1857,7 +2003,7 @@
  * Find all the mappings of a page using the mapping pointer and the vma chains
  * contained in the address_space struct it points to.
 *
- * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
+ * When called from try_to_munlock(), the mmap_lock of the mm containing the vma
  * where the page was found will be held for write. So, we won't recheck
  * vm_flags for that VMA. That should be OK, because that vma shouldn't be
  * LOCKED.
@@ -1868,6 +2014,7 @@
 	struct address_space *mapping = page_mapping(page);
 	pgoff_t pgoff_start, pgoff_end;
 	struct vm_area_struct *vma;
+	bool got_lock = false, success = false;
 
 	/*
 	 * The page lock not only makes sure that page->mapping cannot
@@ -1881,9 +2028,26 @@
 		return;
 
 	pgoff_start = page_to_pgoff(page);
-	pgoff_end = pgoff_start + hpage_nr_pages(page) - 1;
-	if (!locked)
-		i_mmap_lock_read(mapping);
+	pgoff_end = pgoff_start + thp_nr_pages(page) - 1;
+	if (!locked) {
+		trace_android_vh_do_page_trylock(page,
+			&mapping->i_mmap_rwsem, &got_lock, &success);
+		if (success) {
+			if (!got_lock)
+				return;
+		} else {
+			if (i_mmap_trylock_read(mapping))
+				goto lookup;
+
+			if (rwc->try_lock) {
+				rwc->contended = true;
+				return;
+			}
+
+			i_mmap_lock_read(mapping);
+		}
+	}
+lookup:
 	vma_interval_tree_foreach(vma, &mapping->i_mmap,
 			pgoff_start, pgoff_end) {
 		unsigned long address = vma_address(page, vma);
@@ -1928,27 +2092,10 @@
 
 #ifdef CONFIG_HUGETLB_PAGE
 /*
- * The following three functions are for anonymous (private mapped) hugepages.
+ * The following two functions are for anonymous (private mapped) hugepages.
 * Unlike common anonymous pages, anonymous hugepages have no accounting code
 * and no lru code, because we handle hugepages differently from common pages.
  */
-static void __hugepage_set_anon_rmap(struct page *page,
-	struct vm_area_struct *vma, unsigned long address, int exclusive)
-{
-	struct anon_vma *anon_vma = vma->anon_vma;
-
-	BUG_ON(!anon_vma);
-
-	if (PageAnon(page))
-		return;
-	if (!exclusive)
-		anon_vma = anon_vma->root;
-
-	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
-	page->mapping = (struct address_space *) anon_vma;
-	page->index = linear_page_index(vma, address);
-}
-
 void hugepage_add_anon_rmap(struct page *page,
 			    struct vm_area_struct *vma, unsigned long address)
 {
@@ -1960,7 +2107,7 @@
 	/* address might be in next vma when migration races vma_adjust */
 	first = atomic_inc_and_test(compound_mapcount_ptr(page));
 	if (first)
-		__hugepage_set_anon_rmap(page, vma, address, 0);
+		__page_set_anon_rmap(page, vma, address, 0);
 }
 
 void hugepage_add_new_anon_rmap(struct page *page,
@@ -1968,6 +2115,9 @@
 {
 	BUG_ON(address < vma->vm_start || address >= vma->vm_end);
 	atomic_set(compound_mapcount_ptr(page), 0);
-	__hugepage_set_anon_rmap(page, vma, address, 1);
+	if (hpage_pincount_available(page))
+		atomic_set(compound_pincount_ptr(page), 0);
+
+	__page_set_anon_rmap(page, vma, address, 1);
 }
 #endif /* CONFIG_HUGETLB_PAGE */
--
Gitblit v1.6.2
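
Usage note on the page_referenced() change in this patch: with rwc.try_lock set,
the rmap walk no longer sleeps on a contended anon_vma->rwsem or i_mmap_rwsem;
it sets rwc.contended and page_referenced() returns -1 instead of a reference
count. Below is a minimal, hypothetical caller sketch, not part of the patch:
only the page_referenced() signature and the -1 semantics come from the diff
above, and page_was_referenced() is an invented helper. In the upstream form of
this change, the reclaim path treats -1 the same way, as "assume referenced",
rather than blocking on a hot rmap lock.

	/*
	 * Hypothetical caller sketch -- NOT part of this patch. It relies
	 * only on the page_referenced() signature and the -1 "rmap lock
	 * contended" return value introduced above.
	 */
	#include <linux/rmap.h>
	#include <linux/memcontrol.h>

	static bool page_was_referenced(struct page *page,
					struct mem_cgroup *memcg)
	{
		unsigned long vm_flags;
		int referenced;

		/* is_locked=0: page_referenced() takes the page lock if needed */
		referenced = page_referenced(page, 0, memcg, &vm_flags);
		if (referenced == -1) {
			/*
			 * The walk bailed out because anon_vma->rwsem or
			 * i_mmap_rwsem was contended (rwc.contended was set).
			 * Err on the side of treating the page as referenced
			 * instead of stalling on the lock.
			 */
			return true;
		}
		return referenced > 0;
	}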