2024-10-22 8ac6c7a54ed1b98d142dce24b11c6de6a1e239a5
kernel/mm/rmap.c
....@@ -21,13 +21,14 @@
2121 * Lock ordering in mm:
2222 *
2323 * inode->i_mutex (while writing or truncating, not reading or faulting)
24
- * mm->mmap_sem
25
- * page->flags PG_locked (lock_page)
24
+ * mm->mmap_lock
25
+ * page->flags PG_locked (lock_page) * (see hugetlbfs below)
2626 * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
2727 * mapping->i_mmap_rwsem
28
+ * hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
2829 * anon_vma->rwsem
2930 * mm->page_table_lock or pte_lock
30
- * zone_lru_lock (in mark_page_accessed, isolate_lru_page)
31
+ * pgdat->lru_lock (in mark_page_accessed, isolate_lru_page)
3132 * swap_lock (in swap_duplicate, swap_info_get)
3233 * mmlist_lock (in mmput, drain_mmlist and others)
3334 * mapping->private_lock (in __set_page_dirty_buffers)
....@@ -43,6 +44,11 @@
4344 * anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon)
4445 * ->tasklist_lock
4546 * pte map lock
47
+ *
48
+ * * hugetlbfs PageHuge() pages take locks in this order:
49
+ * mapping->i_mmap_rwsem
50
+ * hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
51
+ * page->flags PG_locked (lock_page)
4652 */
4753
4854 #include <linux/mm.h>
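
Side note on the hugetlbfs ordering documented above (i_mmap_rwsem, then hugetlb_fault_mutex, then the page lock): the rule only prevents deadlock if every hugetlb path acquires the locks in that order and releases them in reverse. A minimal userspace sketch of the same discipline, with pthread primitives standing in for the kernel locks (all names below are illustrative, not kernel APIs):

    /* Build with: cc -std=c11 -pthread lock_order.c */
    #include <pthread.h>
    #include <stdio.h>

    /* Stand-ins for mapping->i_mmap_rwsem, hugetlb_fault_mutex and PG_locked. */
    static pthread_rwlock_t i_mmap_rwsem = PTHREAD_RWLOCK_INITIALIZER;
    static pthread_mutex_t  fault_mutex  = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t  page_lock    = PTHREAD_MUTEX_INITIALIZER;

    /* Every path takes the locks in the documented order, releases in reverse. */
    static void hugetlb_fault_path(void)
    {
        pthread_rwlock_rdlock(&i_mmap_rwsem);
        pthread_mutex_lock(&fault_mutex);
        pthread_mutex_lock(&page_lock);

        puts("all three locks held in the documented order");

        pthread_mutex_unlock(&page_lock);
        pthread_mutex_unlock(&fault_mutex);
        pthread_rwlock_unlock(&i_mmap_rwsem);
    }

    int main(void)
    {
        hugetlb_fault_path();
        return 0;
    }
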
....@@ -61,6 +67,7 @@
6167 #include <linux/mmu_notifier.h>
6268 #include <linux/migrate.h>
6369 #include <linux/hugetlb.h>
70
+#include <linux/huge_mm.h>
6471 #include <linux/backing-dev.h>
6572 #include <linux/page_idle.h>
6673 #include <linux/memremap.h>
....@@ -69,6 +76,8 @@
6976 #include <asm/tlbflush.h>
7077
7178 #include <trace/events/tlb.h>
79
+
80
+#include <trace/hooks/mm.h>
7281
7382 #include "internal.h"
7483
....@@ -82,7 +91,8 @@
8291 anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
8392 if (anon_vma) {
8493 atomic_set(&anon_vma->refcount, 1);
85
- anon_vma->degree = 1; /* Reference for first vma */
94
+ anon_vma->num_children = 0;
95
+ anon_vma->num_active_vmas = 0;
8696 anon_vma->parent = anon_vma;
8797 /*
8898 * Initialise the anon_vma root to point to itself. If called
....@@ -170,7 +180,7 @@
170180 * to do any locking for the common case of already having
171181 * an anon_vma.
172182 *
173
- * This must be called with the mmap_sem held for reading.
183
+ * This must be called with the mmap_lock held for reading.
174184 */
175185 int __anon_vma_prepare(struct vm_area_struct *vma)
176186 {
....@@ -190,6 +200,7 @@
190200 anon_vma = anon_vma_alloc();
191201 if (unlikely(!anon_vma))
192202 goto out_enomem_free_avc;
203
+ anon_vma->num_children++; /* self-parent link for new root */
193204 allocated = anon_vma;
194205 }
195206
....@@ -199,8 +210,7 @@
199210 if (likely(!vma->anon_vma)) {
200211 vma->anon_vma = anon_vma;
201212 anon_vma_chain_link(vma, avc, anon_vma);
202
- /* vma reference or self-parent link for new root */
203
- anon_vma->degree++;
213
+ anon_vma->num_active_vmas++;
204214 allocated = NULL;
205215 avc = NULL;
206216 }
....@@ -250,13 +260,19 @@
250260 * Attach the anon_vmas from src to dst.
251261 * Returns 0 on success, -ENOMEM on failure.
252262 *
253
- * If dst->anon_vma is NULL this function tries to find and reuse existing
254
- * anon_vma which has no vmas and only one child anon_vma. This prevents
255
- * degradation of anon_vma hierarchy to endless linear chain in case of
256
- * constantly forking task. On the other hand, an anon_vma with more than one
257
- * child isn't reused even if there was no alive vma, thus rmap walker has a
258
- * good chance of avoiding scanning the whole hierarchy when it searches where
259
- * page is mapped.
263
+ * anon_vma_clone() is called by __vma_adjust(), __split_vma(), copy_vma() and
264
+ * anon_vma_fork(). The first three want an exact copy of src, while the last
265
+ * one, anon_vma_fork(), may try to reuse an existing anon_vma to prevent
266
+ * endless growth of anon_vma. Since dst->anon_vma is set to NULL before call,
267
+ * we can identify this case by checking (!dst->anon_vma && src->anon_vma).
268
+ *
269
+ * If (!dst->anon_vma && src->anon_vma) is true, this function tries to find
270
+ * and reuse existing anon_vma which has no vmas and only one child anon_vma.
271
+ * This prevents degradation of anon_vma hierarchy to endless linear chain in
272
+ * case of constantly forking task. On the other hand, an anon_vma with more
273
+ * than one child isn't reused even if there was no alive vma, thus rmap
274
+ * walker has a good chance of avoiding scanning the whole hierarchy when it
275
+ * searches where page is mapped.
260276 */
261277 int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
262278 {
....@@ -279,19 +295,19 @@
279295 anon_vma_chain_link(dst, avc, anon_vma);
280296
281297 /*
282
- * Reuse existing anon_vma if its degree lower than two,
283
- * that means it has no vma and only one anon_vma child.
298
+ * Reuse existing anon_vma if it has no vma and only one
299
+ * anon_vma child.
284300 *
285
- * Do not chose parent anon_vma, otherwise first child
286
- * will always reuse it. Root anon_vma is never reused:
301
+ * Root anon_vma is never reused:
287302 * it has self-parent reference and at least one child.
288303 */
289
- if (!dst->anon_vma && anon_vma != src->anon_vma &&
290
- anon_vma->degree < 2)
304
+ if (!dst->anon_vma && src->anon_vma &&
305
+ anon_vma->num_children < 2 &&
306
+ anon_vma->num_active_vmas == 0)
291307 dst->anon_vma = anon_vma;
292308 }
293309 if (dst->anon_vma)
294
- dst->anon_vma->degree++;
310
+ dst->anon_vma->num_active_vmas++;
295311 unlock_anon_vma_root(root);
296312 return 0;
297313
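
For readers following the degree to num_children/num_active_vmas split in this hunk: the reuse check now reads directly as "we are on the fork path (dst->anon_vma was NULL while src has one), no live VMA uses the candidate anon_vma, and it has at most one child". A minimal sketch of that predicate, using a hypothetical struct that mirrors only the two counters:

    #include <assert.h>
    #include <stdbool.h>

    /* Hypothetical mirror of the two counters that replace anon_vma->degree. */
    struct anon_vma_model {
        int num_children;     /* child anon_vmas created by forks */
        int num_active_vmas;  /* VMAs currently backed by this anon_vma */
    };

    /*
     * Reuse is only attempted on the anon_vma_fork() path, identified by
     * dst->anon_vma == NULL while src->anon_vma is set, and only for an
     * anon_vma that no live VMA uses and that has at most one child.
     */
    static bool can_reuse(const struct anon_vma_model *av,
                          bool dst_has_anon_vma, bool src_has_anon_vma)
    {
        return !dst_has_anon_vma && src_has_anon_vma &&
               av->num_children < 2 && av->num_active_vmas == 0;
    }

    int main(void)
    {
        struct anon_vma_model idle = { .num_children = 1, .num_active_vmas = 0 };
        struct anon_vma_model busy = { .num_children = 1, .num_active_vmas = 1 };
        struct anon_vma_model wide = { .num_children = 3, .num_active_vmas = 0 };

        assert(can_reuse(&idle, false, true));   /* fork path, idle, one child */
        assert(!can_reuse(&busy, false, true));  /* still used by a live VMA */
        assert(!can_reuse(&wide, false, true));  /* too many children */
        assert(!can_reuse(&idle, true, true));   /* exact-copy path: never reuse */
        return 0;
    }
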
....@@ -341,6 +357,7 @@
341357 anon_vma = anon_vma_alloc();
342358 if (!anon_vma)
343359 goto out_error;
360
+ anon_vma->num_active_vmas++;
344361 avc = anon_vma_chain_alloc(GFP_KERNEL);
345362 if (!avc)
346363 goto out_error_free_anon_vma;
....@@ -361,7 +378,7 @@
361378 vma->anon_vma = anon_vma;
362379 anon_vma_lock_write(anon_vma);
363380 anon_vma_chain_link(vma, avc, anon_vma);
364
- anon_vma->parent->degree++;
381
+ anon_vma->parent->num_children++;
365382 anon_vma_unlock_write(anon_vma);
366383
367384 return 0;
....@@ -393,7 +410,7 @@
393410 * to free them outside the lock.
394411 */
395412 if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) {
396
- anon_vma->parent->degree--;
413
+ anon_vma->parent->num_children--;
397414 continue;
398415 }
399416
....@@ -401,7 +418,8 @@
401418 anon_vma_chain_free(avc);
402419 }
403420 if (vma->anon_vma)
404
- vma->anon_vma->degree--;
421
+ vma->anon_vma->num_active_vmas--;
422
+
405423 unlock_anon_vma_root(root);
406424
407425 /*
....@@ -412,7 +430,8 @@
412430 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
413431 struct anon_vma *anon_vma = avc->anon_vma;
414432
415
- VM_WARN_ON(anon_vma->degree);
433
+ VM_WARN_ON(anon_vma->num_children);
434
+ VM_WARN_ON(anon_vma->num_active_vmas);
416435 put_anon_vma(anon_vma);
417436
418437 list_del(&avc->same_vma);
....@@ -457,9 +476,10 @@
457476 * chain and verify that the page in question is indeed mapped in it
458477 * [ something equivalent to page_mapped_in_vma() ].
459478 *
460
- * Since anon_vma's slab is DESTROY_BY_RCU and we know from page_remove_rmap()
461
- * that the anon_vma pointer from page->mapping is valid if there is a
462
- * mapcount, we can dereference the anon_vma after observing those.
479
+ * Since anon_vma's slab is SLAB_TYPESAFE_BY_RCU and we know from
480
+ * page_remove_rmap() that the anon_vma pointer from page->mapping is valid
481
+ * if there is a mapcount, we can dereference the anon_vma after observing
482
+ * those.
463483 */
464484 struct anon_vma *page_get_anon_vma(struct page *page)
465485 {
....@@ -502,13 +522,16 @@
502522 *
503523 * Its a little more complex as it tries to keep the fast path to a single
504524 * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
505
- * reference like with page_get_anon_vma() and then block on the mutex.
525
+ * reference like with page_get_anon_vma() and then block on the mutex
526
+ * on !rwc->try_lock case.
506527 */
507
-struct anon_vma *page_lock_anon_vma_read(struct page *page)
528
+struct anon_vma *page_lock_anon_vma_read(struct page *page,
529
+ struct rmap_walk_control *rwc)
508530 {
509531 struct anon_vma *anon_vma = NULL;
510532 struct anon_vma *root_anon_vma;
511533 unsigned long anon_mapping;
534
+ bool success = false;
512535
513536 rcu_read_lock();
514537 anon_mapping = (unsigned long)READ_ONCE(page->mapping);
....@@ -529,6 +552,17 @@
529552 up_read(&root_anon_vma->rwsem);
530553 anon_vma = NULL;
531554 }
555
+ goto out;
556
+ }
557
+ trace_android_vh_do_page_trylock(page, NULL, NULL, &success);
558
+ if (success) {
559
+ anon_vma = NULL;
560
+ goto out;
561
+ }
562
+
563
+ if (rwc && rwc->try_lock) {
564
+ anon_vma = NULL;
565
+ rwc->contended = true;
532566 goto out;
533567 }
534568
....@@ -658,7 +692,7 @@
658692 */
659693 void flush_tlb_batched_pending(struct mm_struct *mm)
660694 {
661
- if (mm->tlb_flush_batched) {
695
+ if (data_race(mm->tlb_flush_batched)) {
662696 flush_tlb_mm(mm);
663697
664698 /*
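
On the data_race() wrapper added above: it only marks the lockless read of mm->tlb_flush_batched as intentional so KCSAN does not report it; it adds no ordering or atomicity. The closest userspace analogue is a relaxed atomic load, sketched below (the names are stand-ins, not the kernel's types):

    /* Build with: cc -std=c11 relaxed_flag.c */
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* Stand-in for mm->tlb_flush_batched: set by one path, peeked by another. */
    static atomic_bool tlb_flush_batched;

    static void note_batched_flush(void)
    {
        atomic_store_explicit(&tlb_flush_batched, true, memory_order_relaxed);
    }

    static void flush_pending(void)
    {
        /*
         * Deliberately unsynchronized peek, the analogue of
         * data_race(mm->tlb_flush_batched): an intentional lockless read
         * that the race checker should not report.
         */
        if (atomic_load_explicit(&tlb_flush_batched, memory_order_relaxed))
            puts("would call flush_tlb_mm() here");
    }

    int main(void)
    {
        note_batched_flush();
        flush_pending();
        return 0;
    }
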
....@@ -768,6 +802,7 @@
768802 }
769803
770804 if (pvmw.pte) {
805
+ trace_android_vh_look_around(&pvmw, page, vma, &referenced);
771806 if (ptep_clear_flush_young_notify(vma, address,
772807 pvmw.pte)) {
773808 /*
....@@ -803,6 +838,7 @@
803838 pra->vm_flags |= vma->vm_flags;
804839 }
805840
841
+ trace_android_vh_page_referenced_one_end(vma, page, referenced);
806842 if (!pra->mapcount)
807843 return false; /* To break the loop */
808844
....@@ -827,8 +863,10 @@
827863 * @memcg: target memory cgroup
828864 * @vm_flags: collect encountered vma->vm_flags who actually referenced the page
829865 *
830
- * Quick test_and_clear_referenced for all mappings to a page,
831
- * returns the number of ptes which referenced the page.
866
+ * Quick test_and_clear_referenced for all mappings of a page,
867
+ *
868
+ * Return: The number of mappings which referenced the page. Return -1 if
869
+ * the function bailed out due to rmap lock contention.
832870 */
833871 int page_referenced(struct page *page,
834872 int is_locked,
....@@ -844,10 +882,11 @@
844882 .rmap_one = page_referenced_one,
845883 .arg = (void *)&pra,
846884 .anon_lock = page_lock_anon_vma_read,
885
+ .try_lock = true,
847886 };
848887
849888 *vm_flags = 0;
850
- if (!page_mapped(page))
889
+ if (!pra.mapcount)
851890 return 0;
852891
853892 if (!page_rmapping(page))
....@@ -874,8 +913,9 @@
874913 if (we_locked)
875914 unlock_page(page);
876915
877
- return pra.referenced;
916
+ return rwc.contended ? -1 : pra.referenced;
878917 }
918
+EXPORT_SYMBOL_GPL(page_referenced);
879919
880920 static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
881921 unsigned long address, void *arg)
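
With rwc.try_lock set above, page_referenced() can now return -1 when the rmap lock was contended and the walk bailed out, as the updated kernel-doc says. A hypothetical caller-side sketch (not the actual vmscan code) of how that sentinel might be treated:

    #include <stdio.h>

    /*
     * Hypothetical stand-in for page_referenced(): >= 0 is the number of
     * referencing mappings, -1 means the walk bailed out on lock contention.
     */
    static int fake_page_referenced(int contended, int refs)
    {
        return contended ? -1 : refs;
    }

    static const char *reclaim_decision(int ret)
    {
        if (ret < 0)
            return "rmap lock contended: keep the page and retry later";
        if (ret > 0)
            return "recently referenced: keep the page";
        return "not referenced: candidate for reclaim";
    }

    int main(void)
    {
        printf("%s\n", reclaim_decision(fake_page_referenced(0, 0)));
        printf("%s\n", reclaim_decision(fake_page_referenced(0, 3)));
        printf("%s\n", reclaim_decision(fake_page_referenced(1, 0)));
        return 0;
    }
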
....@@ -886,21 +926,22 @@
886926 .address = address,
887927 .flags = PVMW_SYNC,
888928 };
889
- unsigned long start = address, end;
929
+ struct mmu_notifier_range range;
890930 int *cleaned = arg;
891931
892932 /*
893933 * We have to assume the worse case ie pmd for invalidation. Note that
894934 * the page can not be free from this function.
895935 */
896
- end = vma_address_end(page, vma);
897
- mmu_notifier_invalidate_range_start(vma->vm_mm, start, end);
936
+ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
937
+ 0, vma, vma->vm_mm, address,
938
+ vma_address_end(page, vma));
939
+ mmu_notifier_invalidate_range_start(&range);
898940
899941 while (page_vma_mapped_walk(&pvmw)) {
900
- unsigned long cstart;
901942 int ret = 0;
902943
903
- cstart = address = pvmw.address;
944
+ address = pvmw.address;
904945 if (pvmw.pte) {
905946 pte_t entry;
906947 pte_t *pte = pvmw.pte;
....@@ -915,7 +956,7 @@
915956 set_pte_at(vma->vm_mm, address, pte, entry);
916957 ret = 1;
917958 } else {
918
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
959
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
919960 pmd_t *pmd = pvmw.pmd;
920961 pmd_t entry;
921962
....@@ -927,7 +968,6 @@
927968 entry = pmd_wrprotect(entry);
928969 entry = pmd_mkclean(entry);
929970 set_pmd_at(vma->vm_mm, address, pmd, entry);
930
- cstart &= PMD_MASK;
931971 ret = 1;
932972 #else
933973 /* unexpected pmd-mapped page? */
....@@ -946,7 +986,7 @@
946986 (*cleaned)++;
947987 }
948988
949
- mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);
989
+ mmu_notifier_invalidate_range_end(&range);
950990
951991 return true;
952992 }
....@@ -1014,7 +1054,7 @@
10141054
10151055 /**
10161056 * __page_set_anon_rmap - set up new anonymous rmap
1017
- * @page: Page to add to rmap
1057
+ * @page: Page or Hugepage to add to rmap
10181058 * @vma: VM area to add page to.
10191059 * @address: User virtual address of the mapping
10201060 * @exclusive: the page is exclusively owned by the current process
....@@ -1051,7 +1091,6 @@
10511091 static void __page_check_anon_rmap(struct page *page,
10521092 struct vm_area_struct *vma, unsigned long address)
10531093 {
1054
-#ifdef CONFIG_DEBUG_VM
10551094 /*
10561095 * The page's anon-rmap details (mapping and index) are guaranteed to
10571096 * be set up correctly at this point.
....@@ -1064,9 +1103,9 @@
10641103 * are initially only visible via the pagetables, and the pte is locked
10651104 * over the call to page_add_new_anon_rmap.
10661105 */
1067
- BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root);
1068
- BUG_ON(page_to_pgoff(page) != linear_page_index(vma, address));
1069
-#endif
1106
+ VM_BUG_ON_PAGE(page_anon_vma(page)->root != vma->anon_vma->root, page);
1107
+ VM_BUG_ON_PAGE(page_to_pgoff(page) != linear_page_index(vma, address),
1108
+ page);
10701109 }
10711110
10721111 /**
....@@ -1097,6 +1136,12 @@
10971136 {
10981137 bool compound = flags & RMAP_COMPOUND;
10991138 bool first;
1139
+ bool success = false;
1140
+
1141
+ if (unlikely(PageKsm(page)))
1142
+ lock_page_memcg(page);
1143
+ else
1144
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
11001145
11011146 if (compound) {
11021147 atomic_t *mapcount;
....@@ -1105,11 +1150,14 @@
11051150 mapcount = compound_mapcount_ptr(page);
11061151 first = atomic_inc_and_test(mapcount);
11071152 } else {
1108
- first = atomic_inc_and_test(&page->_mapcount);
1153
+ trace_android_vh_update_page_mapcount(page, true, compound,
1154
+ &first, &success);
1155
+ if (!success)
1156
+ first = atomic_inc_and_test(&page->_mapcount);
11091157 }
11101158
11111159 if (first) {
1112
- int nr = compound ? hpage_nr_pages(page) : 1;
1160
+ int nr = compound ? thp_nr_pages(page) : 1;
11131161 /*
11141162 * We use the irq-unsafe __{inc|mod}_zone_page_stat because
11151163 * these counters are not modified in interrupt context, and
....@@ -1117,13 +1165,14 @@
11171165 * disabled.
11181166 */
11191167 if (compound)
1120
- __inc_node_page_state(page, NR_ANON_THPS);
1121
- __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr);
1168
+ __inc_lruvec_page_state(page, NR_ANON_THPS);
1169
+ __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
11221170 }
1123
- if (unlikely(PageKsm(page)))
1124
- return;
11251171
1126
- VM_BUG_ON_PAGE(!PageLocked(page), page);
1172
+ if (unlikely(PageKsm(page))) {
1173
+ unlock_page_memcg(page);
1174
+ return;
1175
+ }
11271176
11281177 /* address might be in next vma when migration races vma_adjust */
11291178 if (first)
....@@ -1134,7 +1183,7 @@
11341183 }
11351184
11361185 /**
1137
- * page_add_new_anon_rmap - add pte mapping to a new anonymous page
1186
+ * __page_add_new_anon_rmap - add pte mapping to a new anonymous page
11381187 * @page: the page to add the mapping to
11391188 * @vma: the vm area in which the mapping is added
11401189 * @address: the user virtual address mapped
....@@ -1144,25 +1193,27 @@
11441193 * This means the inc-and-test can be bypassed.
11451194 * Page does not have to be locked.
11461195 */
1147
-void page_add_new_anon_rmap(struct page *page,
1196
+void __page_add_new_anon_rmap(struct page *page,
11481197 struct vm_area_struct *vma, unsigned long address, bool compound)
11491198 {
1150
- int nr = compound ? hpage_nr_pages(page) : 1;
1199
+ int nr = compound ? thp_nr_pages(page) : 1;
11511200
1152
- VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
11531201 __SetPageSwapBacked(page);
11541202 if (compound) {
11551203 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
11561204 /* increment count (starts at -1) */
11571205 atomic_set(compound_mapcount_ptr(page), 0);
1158
- __inc_node_page_state(page, NR_ANON_THPS);
1206
+ if (hpage_pincount_available(page))
1207
+ atomic_set(compound_pincount_ptr(page), 0);
1208
+
1209
+ __inc_lruvec_page_state(page, NR_ANON_THPS);
11591210 } else {
11601211 /* Anon THP always mapped first with PMD */
11611212 VM_BUG_ON_PAGE(PageTransCompound(page), page);
11621213 /* increment count (starts at -1) */
11631214 atomic_set(&page->_mapcount, 0);
11641215 }
1165
- __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr);
1216
+ __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
11661217 __page_set_anon_rmap(page, vma, address, 1);
11671218 }
11681219
....@@ -1176,18 +1227,29 @@
11761227 void page_add_file_rmap(struct page *page, bool compound)
11771228 {
11781229 int i, nr = 1;
1230
+ bool first_mapping;
1231
+ bool success = false;
11791232
11801233 VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
11811234 lock_page_memcg(page);
11821235 if (compound && PageTransHuge(page)) {
1183
- for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
1184
- if (atomic_inc_and_test(&page[i]._mapcount))
1185
- nr++;
1236
+ for (i = 0, nr = 0; i < thp_nr_pages(page); i++) {
1237
+ trace_android_vh_update_page_mapcount(&page[i], true,
1238
+ compound, &first_mapping, &success);
1239
+ if (success) {
1240
+ if (first_mapping)
1241
+ nr++;
1242
+ } else {
1243
+ if (atomic_inc_and_test(&page[i]._mapcount))
1244
+ nr++;
1245
+ }
11861246 }
11871247 if (!atomic_inc_and_test(compound_mapcount_ptr(page)))
11881248 goto out;
1189
- VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
1190
- __inc_node_page_state(page, NR_SHMEM_PMDMAPPED);
1249
+ if (PageSwapBacked(page))
1250
+ __inc_node_page_state(page, NR_SHMEM_PMDMAPPED);
1251
+ else
1252
+ __inc_node_page_state(page, NR_FILE_PMDMAPPED);
11911253 } else {
11921254 if (PageTransCompound(page) && page_mapping(page)) {
11931255 VM_WARN_ON_ONCE(!PageLocked(page));
....@@ -1196,8 +1258,15 @@
11961258 if (PageMlocked(page))
11971259 clear_page_mlock(compound_head(page));
11981260 }
1199
- if (!atomic_inc_and_test(&page->_mapcount))
1200
- goto out;
1261
+ trace_android_vh_update_page_mapcount(page, true,
1262
+ compound, &first_mapping, &success);
1263
+ if (success) {
1264
+ if (!first_mapping)
1265
+ goto out;
1266
+ } else {
1267
+ if (!atomic_inc_and_test(&page->_mapcount))
1268
+ goto out;
1269
+ }
12011270 }
12021271 __mod_lruvec_page_state(page, NR_FILE_MAPPED, nr);
12031272 out:
....@@ -1207,30 +1276,47 @@
12071276 static void page_remove_file_rmap(struct page *page, bool compound)
12081277 {
12091278 int i, nr = 1;
1279
+ bool first_mapping;
1280
+ bool success = false;
12101281
12111282 VM_BUG_ON_PAGE(compound && !PageHead(page), page);
1212
- lock_page_memcg(page);
12131283
12141284 /* Hugepages are not counted in NR_FILE_MAPPED for now. */
12151285 if (unlikely(PageHuge(page))) {
12161286 /* hugetlb pages are always mapped with pmds */
12171287 atomic_dec(compound_mapcount_ptr(page));
1218
- goto out;
1288
+ return;
12191289 }
12201290
12211291 /* page still mapped by someone else? */
12221292 if (compound && PageTransHuge(page)) {
1223
- for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
1224
- if (atomic_add_negative(-1, &page[i]._mapcount))
1225
- nr++;
1293
+ for (i = 0, nr = 0; i < thp_nr_pages(page); i++) {
1294
+ trace_android_vh_update_page_mapcount(&page[i], false,
1295
+ compound, &first_mapping, &success);
1296
+ if (success) {
1297
+ if (first_mapping)
1298
+ nr++;
1299
+ } else {
1300
+ if (atomic_add_negative(-1, &page[i]._mapcount))
1301
+ nr++;
1302
+ }
12261303 }
12271304 if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
1228
- goto out;
1229
- VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
1230
- __dec_node_page_state(page, NR_SHMEM_PMDMAPPED);
1305
+ return;
1306
+ if (PageSwapBacked(page))
1307
+ __dec_node_page_state(page, NR_SHMEM_PMDMAPPED);
1308
+ else
1309
+ __dec_node_page_state(page, NR_FILE_PMDMAPPED);
12311310 } else {
1232
- if (!atomic_add_negative(-1, &page->_mapcount))
1233
- goto out;
1311
+ trace_android_vh_update_page_mapcount(page, false,
1312
+ compound, &first_mapping, &success);
1313
+ if (success) {
1314
+ if (!first_mapping)
1315
+ return;
1316
+ } else {
1317
+ if (!atomic_add_negative(-1, &page->_mapcount))
1318
+ return;
1319
+ }
12341320 }
12351321
12361322 /*
....@@ -1242,13 +1328,13 @@
12421328
12431329 if (unlikely(PageMlocked(page)))
12441330 clear_page_mlock(page);
1245
-out:
1246
- unlock_page_memcg(page);
12471331 }
12481332
12491333 static void page_remove_anon_compound_rmap(struct page *page)
12501334 {
12511335 int i, nr;
1336
+ bool first_mapping;
1337
+ bool success = false;
12521338
12531339 if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
12541340 return;
....@@ -1260,28 +1346,41 @@
12601346 if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
12611347 return;
12621348
1263
- __dec_node_page_state(page, NR_ANON_THPS);
1349
+ __dec_lruvec_page_state(page, NR_ANON_THPS);
12641350
12651351 if (TestClearPageDoubleMap(page)) {
12661352 /*
12671353 * Subpages can be mapped with PTEs too. Check how many of
1268
- * themi are still mapped.
1354
+ * them are still mapped.
12691355 */
1270
- for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
1271
- if (atomic_add_negative(-1, &page[i]._mapcount))
1272
- nr++;
1356
+ for (i = 0, nr = 0; i < thp_nr_pages(page); i++) {
1357
+ trace_android_vh_update_page_mapcount(&page[i], false,
1358
+ false, &first_mapping, &success);
1359
+ if (success) {
1360
+ if (first_mapping)
1361
+ nr++;
1362
+ } else {
1363
+ if (atomic_add_negative(-1, &page[i]._mapcount))
1364
+ nr++;
1365
+ }
12731366 }
1367
+
1368
+ /*
1369
+ * Queue the page for deferred split if at least one small
1370
+ * page of the compound page is unmapped, but at least one
1371
+ * small page is still mapped.
1372
+ */
1373
+ if (nr && nr < thp_nr_pages(page))
1374
+ deferred_split_huge_page(page);
12741375 } else {
1275
- nr = HPAGE_PMD_NR;
1376
+ nr = thp_nr_pages(page);
12761377 }
12771378
12781379 if (unlikely(PageMlocked(page)))
12791380 clear_page_mlock(page);
12801381
1281
- if (nr) {
1282
- __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, -nr);
1283
- deferred_split_huge_page(page);
1284
- }
1382
+ if (nr)
1383
+ __mod_lruvec_page_state(page, NR_ANON_MAPPED, -nr);
12851384 }
12861385
12871386 /**
....@@ -1293,22 +1392,36 @@
12931392 */
12941393 void page_remove_rmap(struct page *page, bool compound)
12951394 {
1296
- if (!PageAnon(page))
1297
- return page_remove_file_rmap(page, compound);
1395
+ bool first_mapping;
1396
+ bool success = false;
1397
+ lock_page_memcg(page);
12981398
1299
- if (compound)
1300
- return page_remove_anon_compound_rmap(page);
1399
+ if (!PageAnon(page)) {
1400
+ page_remove_file_rmap(page, compound);
1401
+ goto out;
1402
+ }
13011403
1302
- /* page still mapped by someone else? */
1303
- if (!atomic_add_negative(-1, &page->_mapcount))
1304
- return;
1404
+ if (compound) {
1405
+ page_remove_anon_compound_rmap(page);
1406
+ goto out;
1407
+ }
13051408
1409
+ trace_android_vh_update_page_mapcount(page, false,
1410
+ compound, &first_mapping, &success);
1411
+ if (success) {
1412
+ if (!first_mapping)
1413
+ goto out;
1414
+ } else {
1415
+ /* page still mapped by someone else? */
1416
+ if (!atomic_add_negative(-1, &page->_mapcount))
1417
+ goto out;
1418
+ }
13061419 /*
13071420 * We use the irq-unsafe __{inc|mod}_zone_page_stat because
13081421 * these counters are not modified in interrupt context, and
13091422 * pte lock(a spinlock) is held, which implies preemption disabled.
13101423 */
1311
- __dec_node_page_state(page, NR_ANON_MAPPED);
1424
+ __dec_lruvec_page_state(page, NR_ANON_MAPPED);
13121425
13131426 if (unlikely(PageMlocked(page)))
13141427 clear_page_mlock(page);
....@@ -1325,6 +1438,8 @@
13251438 * Leaving it set also helps swapoff to reinstate ptes
13261439 * faster for those pages still in swapcache.
13271440 */
1441
+out:
1442
+ unlock_page_memcg(page);
13281443 }
13291444
13301445 /*
....@@ -1342,8 +1457,8 @@
13421457 pte_t pteval;
13431458 struct page *subpage;
13441459 bool ret = true;
1345
- unsigned long start = address, end;
1346
- enum ttu_flags flags = (enum ttu_flags)arg;
1460
+ struct mmu_notifier_range range;
1461
+ enum ttu_flags flags = (enum ttu_flags)(long)arg;
13471462
13481463 /*
13491464 * When racing against e.g. zap_pte_range() on another cpu,
....@@ -1375,16 +1490,19 @@
13751490 * Note that the page can not be free in this function as call of
13761491 * try_to_unmap() must hold a reference on the page.
13771492 */
1378
- end = PageKsm(page) ?
1493
+ range.end = PageKsm(page) ?
13791494 address + PAGE_SIZE : vma_address_end(page, vma);
1495
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
1496
+ address, range.end);
13801497 if (PageHuge(page)) {
13811498 /*
13821499 * If sharing is possible, start and end will be adjusted
13831500 * accordingly.
13841501 */
1385
- adjust_range_if_pmd_sharing_possible(vma, &start, &end);
1502
+ adjust_range_if_pmd_sharing_possible(vma, &range.start,
1503
+ &range.end);
13861504 }
1387
- mmu_notifier_invalidate_range_start(vma->vm_mm, start, end);
1505
+ mmu_notifier_invalidate_range_start(&range);
13881506
13891507 while (page_vma_mapped_walk(&pvmw)) {
13901508 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
....@@ -1408,7 +1526,7 @@
14081526 if (!PageTransCompound(page)) {
14091527 /*
14101528 * Holding pte lock, we do *not* need
1411
- * mmap_sem here
1529
+ * mmap_lock here
14121530 */
14131531 mlock_vma_page(page);
14141532 }
....@@ -1426,8 +1544,14 @@
14261544 subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
14271545 address = pvmw.address;
14281546
1429
- if (PageHuge(page)) {
1430
- if (huge_pmd_unshare(mm, &address, pvmw.pte)) {
1547
+ if (PageHuge(page) && !PageAnon(page)) {
1548
+ /*
1549
+ * To call huge_pmd_unshare, i_mmap_rwsem must be
1550
+ * held in write mode. Caller needs to explicitly
1551
+ * do this outside rmap routines.
1552
+ */
1553
+ VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
1554
+ if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) {
14311555 /*
14321556 * huge_pmd_unshare unmapped an entire PMD
14331557 * page. There is no way of knowing exactly
....@@ -1435,9 +1559,10 @@
14351559 * we must flush them all. start/end were
14361560 * already adjusted above to cover this range.
14371561 */
1438
- flush_cache_range(vma, start, end);
1439
- flush_tlb_range(vma, start, end);
1440
- mmu_notifier_invalidate_range(mm, start, end);
1562
+ flush_cache_range(vma, range.start, range.end);
1563
+ flush_tlb_range(vma, range.start, range.end);
1564
+ mmu_notifier_invalidate_range(mm, range.start,
1565
+ range.end);
14411566
14421567 /*
14431568 * The ref count of the PMD page was dropped
....@@ -1468,8 +1593,15 @@
14681593 */
14691594 entry = make_migration_entry(page, 0);
14701595 swp_pte = swp_entry_to_pte(entry);
1471
- if (pte_soft_dirty(pteval))
1596
+
1597
+ /*
1598
+ * pteval maps a zone device page and is therefore
1599
+ * a swap pte.
1600
+ */
1601
+ if (pte_swp_soft_dirty(pteval))
14721602 swp_pte = pte_swp_mksoft_dirty(swp_pte);
1603
+ if (pte_swp_uffd_wp(pteval))
1604
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
14731605 set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
14741606 /*
14751607 * No need to invalidate here it will synchronize on
....@@ -1484,15 +1616,6 @@
14841616 */
14851617 subpage = page;
14861618 goto discard;
1487
- }
1488
-
1489
- if (!(flags & TTU_IGNORE_ACCESS)) {
1490
- if (ptep_clear_flush_young_notify(vma, address,
1491
- pvmw.pte)) {
1492
- ret = false;
1493
- page_vma_mapped_walk_done(&pvmw);
1494
- break;
1495
- }
14961619 }
14971620
14981621 /* Nuke the page table entry. */
....@@ -1523,8 +1646,7 @@
15231646 if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
15241647 pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
15251648 if (PageHuge(page)) {
1526
- int nr = 1 << compound_order(page);
1527
- hugetlb_count_sub(nr, mm);
1649
+ hugetlb_count_sub(compound_nr(page), mm);
15281650 set_huge_swap_pte_at(mm, address,
15291651 pvmw.pte, pteval,
15301652 vma_mmu_pagesize(vma));
....@@ -1570,6 +1692,8 @@
15701692 swp_pte = swp_entry_to_pte(entry);
15711693 if (pte_soft_dirty(pteval))
15721694 swp_pte = pte_swp_mksoft_dirty(swp_pte);
1695
+ if (pte_uffd_wp(pteval))
1696
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
15731697 set_pte_at(mm, address, pvmw.pte, swp_pte);
15741698 /*
15751699 * No need to invalidate here it will synchronize on
....@@ -1594,7 +1718,30 @@
15941718
15951719 /* MADV_FREE page check */
15961720 if (!PageSwapBacked(page)) {
1597
- if (!PageDirty(page)) {
1721
+ int ref_count, map_count;
1722
+
1723
+ /*
1724
+ * Synchronize with gup_pte_range():
1725
+ * - clear PTE; barrier; read refcount
1726
+ * - inc refcount; barrier; read PTE
1727
+ */
1728
+ smp_mb();
1729
+
1730
+ ref_count = page_ref_count(page);
1731
+ map_count = page_mapcount(page);
1732
+
1733
+ /*
1734
+ * Order reads for page refcount and dirty flag
1735
+ * (see comments in __remove_mapping()).
1736
+ */
1737
+ smp_rmb();
1738
+
1739
+ /*
1740
+ * The only page refs must be one from isolation
1741
+ * plus the rmap(s) (dropped by discard:).
1742
+ */
1743
+ if (ref_count == 1 + map_count &&
1744
+ !PageDirty(page)) {
15981745 /* Invalidate as we cleared the pte */
15991746 mmu_notifier_invalidate_range(mm,
16001747 address, address + PAGE_SIZE);
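
The barrier comments added in this hunk describe a store-buffering handshake with gup_pte_range(): each side stores first (clear the PTE, or raise the refcount), issues a full barrier, then reads what the other side stores, so at least one of the two is guaranteed to notice the other and an MADV_FREE page cannot be freed behind a concurrent GUP. A userspace sketch of the same pattern with C11 atomics (all names are stand-ins, not kernel symbols):

    /* Build with: cc -std=c11 madv_free_handshake.c */
    #include <stdatomic.h>
    #include <stdbool.h>

    static atomic_int pte_present = 1;  /* stand-in for the PTE */
    static atomic_int refcount    = 1;  /* stand-in for page_ref_count() */

    /* try_to_unmap_one() side: clear the PTE, full barrier, read the refcount. */
    static bool unmap_side_may_free(int map_count)
    {
        atomic_store_explicit(&pte_present, 0, memory_order_relaxed);
        atomic_thread_fence(memory_order_seq_cst);          /* smp_mb() */
        int refs = atomic_load_explicit(&refcount, memory_order_relaxed);

        /* Free only if nothing but the isolation ref and the rmaps remain. */
        return refs == 1 + map_count;
    }

    /* gup_pte_range() side: take a reference, full barrier, re-check the PTE. */
    static bool gup_side_keeps_page(void)
    {
        atomic_fetch_add_explicit(&refcount, 1, memory_order_relaxed);
        atomic_thread_fence(memory_order_seq_cst);          /* smp_mb() */
        return atomic_load_explicit(&pte_present, memory_order_relaxed) != 0;
    }

    int main(void)
    {
        bool freed  = unmap_side_may_free(0);
        bool pinned = gup_side_keeps_page();
        return (freed && pinned) ? 1 : 0;
    }

With the seq_cst fences in place, the outcome where unmap_side_may_free() returns true while gup_side_keeps_page() also returns true is impossible under any interleaving, which is the property the ref_count == 1 + map_count check above relies on.
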
....@@ -1636,6 +1783,8 @@
16361783 swp_pte = swp_entry_to_pte(entry);
16371784 if (pte_soft_dirty(pteval))
16381785 swp_pte = pte_swp_mksoft_dirty(swp_pte);
1786
+ if (pte_uffd_wp(pteval))
1787
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
16391788 set_pte_at(mm, address, pvmw.pte, swp_pte);
16401789 /* Invalidate as we cleared the pte */
16411790 mmu_notifier_invalidate_range(mm, address,
....@@ -1665,28 +1814,15 @@
16651814 put_page(page);
16661815 }
16671816
1668
- mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);
1817
+ mmu_notifier_invalidate_range_end(&range);
1818
+ trace_android_vh_try_to_unmap_one(vma, page, address, ret);
16691819
16701820 return ret;
16711821 }
16721822
1673
-bool is_vma_temporary_stack(struct vm_area_struct *vma)
1674
-{
1675
- int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
1676
-
1677
- if (!maybe_stack)
1678
- return false;
1679
-
1680
- if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
1681
- VM_STACK_INCOMPLETE_SETUP)
1682
- return true;
1683
-
1684
- return false;
1685
-}
1686
-
16871823 static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
16881824 {
1689
- return is_vma_temporary_stack(vma);
1825
+ return vma_is_temporary_stack(vma);
16901826 }
16911827
16921828 static int page_not_mapped(struct page *page)
....@@ -1779,19 +1915,29 @@
17791915 struct anon_vma *anon_vma;
17801916
17811917 if (rwc->anon_lock)
1782
- return rwc->anon_lock(page);
1918
+ return rwc->anon_lock(page, rwc);
17831919
17841920 /*
17851921 * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
17861922 * because that depends on page_mapped(); but not all its usages
1787
- * are holding mmap_sem. Users without mmap_sem are required to
1923
+ * are holding mmap_lock. Users without mmap_lock are required to
17881924 * take a reference count to prevent the anon_vma disappearing
17891925 */
17901926 anon_vma = page_anon_vma(page);
17911927 if (!anon_vma)
17921928 return NULL;
17931929
1930
+ if (anon_vma_trylock_read(anon_vma))
1931
+ goto out;
1932
+
1933
+ if (rwc->try_lock) {
1934
+ anon_vma = NULL;
1935
+ rwc->contended = true;
1936
+ goto out;
1937
+ }
1938
+
17941939 anon_vma_lock_read(anon_vma);
1940
+out:
17951941 return anon_vma;
17961942 }
17971943
....@@ -1804,7 +1950,7 @@
18041950 * Find all the mappings of a page using the mapping pointer and the vma chains
18051951 * contained in the anon_vma struct it points to.
18061952 *
1807
- * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
1953
+ * When called from try_to_munlock(), the mmap_lock of the mm containing the vma
18081954 * where the page was found will be held for write. So, we won't recheck
18091955 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
18101956 * LOCKED.
....@@ -1827,7 +1973,7 @@
18271973 return;
18281974
18291975 pgoff_start = page_to_pgoff(page);
1830
- pgoff_end = pgoff_start + hpage_nr_pages(page) - 1;
1976
+ pgoff_end = pgoff_start + thp_nr_pages(page) - 1;
18311977 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
18321978 pgoff_start, pgoff_end) {
18331979 struct vm_area_struct *vma = avc->vma;
....@@ -1857,7 +2003,7 @@
18572003 * Find all the mappings of a page using the mapping pointer and the vma chains
18582004 * contained in the address_space struct it points to.
18592005 *
1860
- * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
2006
+ * When called from try_to_munlock(), the mmap_lock of the mm containing the vma
18612007 * where the page was found will be held for write. So, we won't recheck
18622008 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
18632009 * LOCKED.
....@@ -1868,6 +2014,7 @@
18682014 struct address_space *mapping = page_mapping(page);
18692015 pgoff_t pgoff_start, pgoff_end;
18702016 struct vm_area_struct *vma;
2017
+ bool got_lock = false, success = false;
18712018
18722019 /*
18732020 * The page lock not only makes sure that page->mapping cannot
....@@ -1881,9 +2028,26 @@
18812028 return;
18822029
18832030 pgoff_start = page_to_pgoff(page);
1884
- pgoff_end = pgoff_start + hpage_nr_pages(page) - 1;
1885
- if (!locked)
1886
- i_mmap_lock_read(mapping);
2031
+ pgoff_end = pgoff_start + thp_nr_pages(page) - 1;
2032
+ if (!locked) {
2033
+ trace_android_vh_do_page_trylock(page,
2034
+ &mapping->i_mmap_rwsem, &got_lock, &success);
2035
+ if (success) {
2036
+ if (!got_lock)
2037
+ return;
2038
+ } else {
2039
+ if (i_mmap_trylock_read(mapping))
2040
+ goto lookup;
2041
+
2042
+ if (rwc->try_lock) {
2043
+ rwc->contended = true;
2044
+ return;
2045
+ }
2046
+
2047
+ i_mmap_lock_read(mapping);
2048
+ }
2049
+ }
2050
+lookup:
18872051 vma_interval_tree_foreach(vma, &mapping->i_mmap,
18882052 pgoff_start, pgoff_end) {
18892053 unsigned long address = vma_address(page, vma);
....@@ -1928,27 +2092,10 @@
19282092
19292093 #ifdef CONFIG_HUGETLB_PAGE
19302094 /*
1931
- * The following three functions are for anonymous (private mapped) hugepages.
2095
+ * The following two functions are for anonymous (private mapped) hugepages.
19322096 * Unlike common anonymous pages, anonymous hugepages have no accounting code
19332097 * and no lru code, because we handle hugepages differently from common pages.
19342098 */
1935
-static void __hugepage_set_anon_rmap(struct page *page,
1936
- struct vm_area_struct *vma, unsigned long address, int exclusive)
1937
-{
1938
- struct anon_vma *anon_vma = vma->anon_vma;
1939
-
1940
- BUG_ON(!anon_vma);
1941
-
1942
- if (PageAnon(page))
1943
- return;
1944
- if (!exclusive)
1945
- anon_vma = anon_vma->root;
1946
-
1947
- anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
1948
- page->mapping = (struct address_space *) anon_vma;
1949
- page->index = linear_page_index(vma, address);
1950
-}
1951
-
19522099 void hugepage_add_anon_rmap(struct page *page,
19532100 struct vm_area_struct *vma, unsigned long address)
19542101 {
....@@ -1960,7 +2107,7 @@
19602107 /* address might be in next vma when migration races vma_adjust */
19612108 first = atomic_inc_and_test(compound_mapcount_ptr(page));
19622109 if (first)
1963
- __hugepage_set_anon_rmap(page, vma, address, 0);
2110
+ __page_set_anon_rmap(page, vma, address, 0);
19642111 }
19652112
19662113 void hugepage_add_new_anon_rmap(struct page *page,
....@@ -1968,6 +2115,9 @@
19682115 {
19692116 BUG_ON(address < vma->vm_start || address >= vma->vm_end);
19702117 atomic_set(compound_mapcount_ptr(page), 0);
1971
- __hugepage_set_anon_rmap(page, vma, address, 1);
2118
+ if (hpage_pincount_available(page))
2119
+ atomic_set(compound_pincount_ptr(page), 0);
2120
+
2121
+ __page_set_anon_rmap(page, vma, address, 1);
19722122 }
19732123 #endif /* CONFIG_HUGETLB_PAGE */