2023-12-08 01573e231f18eb2d99162747186f59511f56b64d
kernel/mm/rmap.c
@@ -21,13 +21,14 @@
  * Lock ordering in mm:
  *
  * inode->i_mutex (while writing or truncating, not reading or faulting)
- * mm->mmap_sem
- * page->flags PG_locked (lock_page)
+ * mm->mmap_lock
+ * page->flags PG_locked (lock_page) * (see hugetlbfs below)
  * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
  * mapping->i_mmap_rwsem
+ * hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
  * anon_vma->rwsem
  * mm->page_table_lock or pte_lock
- * zone_lru_lock (in mark_page_accessed, isolate_lru_page)
+ * pgdat->lru_lock (in mark_page_accessed, isolate_lru_page)
  * swap_lock (in swap_duplicate, swap_info_get)
  * mmlist_lock (in mmput, drain_mmlist and others)
  * mapping->private_lock (in __set_page_dirty_buffers)
@@ -43,6 +44,11 @@
  * anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon)
  * ->tasklist_lock
  * pte map lock
+ *
+ * * hugetlbfs PageHuge() pages take locks in this order:
+ * mapping->i_mmap_rwsem
+ * hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
+ * page->flags PG_locked (lock_page)
  */

 #include <linux/mm.h>
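The added comment spells out the hugetlbfs ordering: mapping->i_mmap_rwsem, then hugetlb_fault_mutex, then the page lock. Below is a minimal sketch of a caller honoring that order; it assumes the hugetlb_fault_mutex_table / hugetlb_fault_mutex_hash() helpers of this kernel series and is illustrative only, not part of the patch.

/*
 * Illustrative only (not from this patch): take the three hugetlbfs
 * locks in the order documented in the comment above.
 */
static void hugetlb_lock_order_sketch(struct address_space *mapping,
				      pgoff_t idx, struct page *page)
{
	u32 hash;

	i_mmap_lock_read(mapping);			/* 1. mapping->i_mmap_rwsem */
	hash = hugetlb_fault_mutex_hash(mapping, idx);
	mutex_lock(&hugetlb_fault_mutex_table[hash]);	/* 2. hugetlb_fault_mutex */
	lock_page(page);				/* 3. page PG_locked last */

	/* ... hugetlbfs fault or unmap work ... */

	unlock_page(page);
	mutex_unlock(&hugetlb_fault_mutex_table[hash]);
	i_mmap_unlock_read(mapping);
}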
@@ -61,6 +67,7 @@
 #include <linux/mmu_notifier.h>
 #include <linux/migrate.h>
 #include <linux/hugetlb.h>
+#include <linux/huge_mm.h>
 #include <linux/backing-dev.h>
 #include <linux/page_idle.h>
 #include <linux/memremap.h>
@@ -69,6 +76,8 @@
 #include <asm/tlbflush.h>

 #include <trace/events/tlb.h>
+
+#include <trace/hooks/mm.h>

 #include "internal.h"

@@ -170,7 +179,7 @@
  * to do any locking for the common case of already having
  * an anon_vma.
  *
- * This must be called with the mmap_sem held for reading.
+ * This must be called with the mmap_lock held for reading.
  */
 int __anon_vma_prepare(struct vm_area_struct *vma)
 {
@@ -250,13 +259,19 @@
  * Attach the anon_vmas from src to dst.
  * Returns 0 on success, -ENOMEM on failure.
  *
- * If dst->anon_vma is NULL this function tries to find and reuse existing
- * anon_vma which has no vmas and only one child anon_vma. This prevents
- * degradation of anon_vma hierarchy to endless linear chain in case of
- * constantly forking task. On the other hand, an anon_vma with more than one
- * child isn't reused even if there was no alive vma, thus rmap walker has a
- * good chance of avoiding scanning the whole hierarchy when it searches where
- * page is mapped.
+ * anon_vma_clone() is called by __vma_split(), __split_vma(), copy_vma() and
+ * anon_vma_fork(). The first three want an exact copy of src, while the last
+ * one, anon_vma_fork(), may try to reuse an existing anon_vma to prevent
+ * endless growth of anon_vma. Since dst->anon_vma is set to NULL before call,
+ * we can identify this case by checking (!dst->anon_vma && src->anon_vma).
+ *
+ * If (!dst->anon_vma && src->anon_vma) is true, this function tries to find
+ * and reuse existing anon_vma which has no vmas and only one child anon_vma.
+ * This prevents degradation of anon_vma hierarchy to endless linear chain in
+ * case of constantly forking task. On the other hand, an anon_vma with more
+ * than one child isn't reused even if there was no alive vma, thus rmap
+ * walker has a good chance of avoiding scanning the whole hierarchy when it
+ * searches where page is mapped.
  */
 int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
 {
@@ -286,8 +301,8 @@
  * will always reuse it. Root anon_vma is never reused:
  * it has self-parent reference and at least one child.
  */
- if (!dst->anon_vma && anon_vma != src->anon_vma &&
- anon_vma->degree < 2)
+ if (!dst->anon_vma && src->anon_vma &&
+ anon_vma != src->anon_vma && anon_vma->degree < 2)
 dst->anon_vma = anon_vma;
 }
 if (dst->anon_vma)
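The extra src->anon_vma test works because only the fork path clears dst->anon_vma before calling anon_vma_clone(); the other callers pass a dst that either already has an anon_vma or never wants reuse. A simplified sketch of that calling convention, loosely modeled on anon_vma_fork() with allocation and error unwinding omitted:

/* Simplified sketch of the fork-side caller; the real anon_vma_fork()
 * in mm/rmap.c also allocates a fresh anon_vma when nothing was reused. */
static int fork_side_clone_sketch(struct vm_area_struct *vma,
				  struct vm_area_struct *pvma)
{
	/* Nothing to do if the parent has no anon_vma here. */
	if (!pvma->anon_vma)
		return 0;

	/* Clearing dst->anon_vma is what enables the reuse path above. */
	vma->anon_vma = NULL;

	if (anon_vma_clone(vma, pvma))
		return -ENOMEM;

	/* An existing anon_vma was reused; otherwise allocate a new one. */
	if (vma->anon_vma)
		return 0;

	/* ... allocate and attach a fresh anon_vma ... */
	return 0;
}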
@@ -457,9 +472,10 @@
  * chain and verify that the page in question is indeed mapped in it
  * [ something equivalent to page_mapped_in_vma() ].
  *
- * Since anon_vma's slab is DESTROY_BY_RCU and we know from page_remove_rmap()
- * that the anon_vma pointer from page->mapping is valid if there is a
- * mapcount, we can dereference the anon_vma after observing those.
+ * Since anon_vma's slab is SLAB_TYPESAFE_BY_RCU and we know from
+ * page_remove_rmap() that the anon_vma pointer from page->mapping is valid
+ * if there is a mapcount, we can dereference the anon_vma after observing
+ * those.
  */
 struct anon_vma *page_get_anon_vma(struct page *page)
 {
@@ -502,13 +518,16 @@
  *
  * Its a little more complex as it tries to keep the fast path to a single
  * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
- * reference like with page_get_anon_vma() and then block on the mutex.
+ * reference like with page_get_anon_vma() and then block on the mutex
+ * on !rwc->try_lock case.
  */
-struct anon_vma *page_lock_anon_vma_read(struct page *page)
+struct anon_vma *page_lock_anon_vma_read(struct page *page,
+ struct rmap_walk_control *rwc)
 {
 struct anon_vma *anon_vma = NULL;
 struct anon_vma *root_anon_vma;
 unsigned long anon_mapping;
+ bool success = false;

 rcu_read_lock();
 anon_mapping = (unsigned long)READ_ONCE(page->mapping);
@@ -529,6 +548,17 @@
 up_read(&root_anon_vma->rwsem);
 anon_vma = NULL;
 }
+ goto out;
+ }
+ trace_android_vh_do_page_trylock(page, NULL, NULL, &success);
+ if (success) {
+ anon_vma = NULL;
+ goto out;
+ }
+
+ if (rwc && rwc->try_lock) {
+ anon_vma = NULL;
+ rwc->contended = true;
 goto out;
 }

@@ -658,7 +688,7 @@
  */
 void flush_tlb_batched_pending(struct mm_struct *mm)
 {
- if (mm->tlb_flush_batched) {
+ if (data_race(mm->tlb_flush_batched)) {
 flush_tlb_mm(mm);

 /*
@@ -768,6 +798,7 @@
 }

 if (pvmw.pte) {
+ trace_android_vh_look_around(&pvmw, page, vma, &referenced);
 if (ptep_clear_flush_young_notify(vma, address,
 pvmw.pte)) {
 /*
@@ -803,6 +834,7 @@
 pra->vm_flags |= vma->vm_flags;
 }

+ trace_android_vh_page_referenced_one_end(vma, page, referenced);
 if (!pra->mapcount)
 return false; /* To break the loop */

@@ -827,8 +859,10 @@
  * @memcg: target memory cgroup
  * @vm_flags: collect encountered vma->vm_flags who actually referenced the page
  *
- * Quick test_and_clear_referenced for all mappings to a page,
- * returns the number of ptes which referenced the page.
+ * Quick test_and_clear_referenced for all mappings of a page,
+ *
+ * Return: The number of mappings which referenced the page. Return -1 if
+ * the function bailed out due to rmap lock contention.
  */
 int page_referenced(struct page *page,
 int is_locked,
@@ -844,10 +878,11 @@
 .rmap_one = page_referenced_one,
 .arg = (void *)&pra,
 .anon_lock = page_lock_anon_vma_read,
+ .try_lock = true,
 };

 *vm_flags = 0;
- if (!page_mapped(page))
+ if (!pra.mapcount)
 return 0;

 if (!page_rmapping(page))
@@ -874,7 +909,7 @@
 if (we_locked)
 unlock_page(page);

- return pra.referenced;
+ return rwc.contended ? -1 : pra.referenced;
 }

 static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
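page_referenced() now sets rwc.try_lock and reports rmap lock contention as -1 instead of a reference count. A hedged sketch of how a caller might consume that tri-state result follows; the helper name is hypothetical and not part of this patch, and reclaim-style callers typically err on the side of keeping the page when the lock could not be taken.

/* Hypothetical caller: fold the new -1 ("rmap lock contended") result
 * into a simple referenced/not-referenced decision. */
static bool page_was_referenced(struct page *page, struct mem_cgroup *memcg)
{
	unsigned long vm_flags;
	int referenced = page_referenced(page, 0, memcg, &vm_flags);

	if (referenced == -1) {
		/* Lock was contended: assume referenced rather than stall. */
		return true;
	}
	return referenced > 0;
}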
@@ -886,21 +921,22 @@
 .address = address,
 .flags = PVMW_SYNC,
 };
- unsigned long start = address, end;
+ struct mmu_notifier_range range;
 int *cleaned = arg;

 /*
 * We have to assume the worse case ie pmd for invalidation. Note that
 * the page can not be free from this function.
 */
- end = vma_address_end(page, vma);
- mmu_notifier_invalidate_range_start(vma->vm_mm, start, end);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
+ 0, vma, vma->vm_mm, address,
+ vma_address_end(page, vma));
+ mmu_notifier_invalidate_range_start(&range);

 while (page_vma_mapped_walk(&pvmw)) {
- unsigned long cstart;
 int ret = 0;

- cstart = address = pvmw.address;
+ address = pvmw.address;
 if (pvmw.pte) {
 pte_t entry;
 pte_t *pte = pvmw.pte;
@@ -915,7 +951,7 @@
 set_pte_at(vma->vm_mm, address, pte, entry);
 ret = 1;
 } else {
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 pmd_t *pmd = pvmw.pmd;
 pmd_t entry;

@@ -927,7 +963,6 @@
 entry = pmd_wrprotect(entry);
 entry = pmd_mkclean(entry);
 set_pmd_at(vma->vm_mm, address, pmd, entry);
- cstart &= PMD_MASK;
 ret = 1;
 #else
 /* unexpected pmd-mapped page? */
@@ -946,7 +981,7 @@
 (*cleaned)++;
 }

- mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);
+ mmu_notifier_invalidate_range_end(&range);

 return true;
 }
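page_mkclean_one() now drives the invalidation through an mmu_notifier_range instead of raw start/end arguments. A minimal sketch of that pattern, under the same MMU_NOTIFY_PROTECTION_PAGE event used above, with the actual PTE walk elided (illustrative only):

/* Minimal sketch of the mmu_notifier_range pattern (illustrative only). */
static void mkclean_range_sketch(struct vm_area_struct *vma,
				 unsigned long start, unsigned long end)
{
	struct mmu_notifier_range range;

	/* Tell secondary MMUs (KVM, etc.) what is about to change. */
	mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0,
				vma, vma->vm_mm, start, end);
	mmu_notifier_invalidate_range_start(&range);

	/* ... walk and write-protect/clean PTEs in [start, end) ... */

	mmu_notifier_invalidate_range_end(&range);
}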
@@ -1014,7 +1049,7 @@

 /**
 * __page_set_anon_rmap - set up new anonymous rmap
- * @page: Page to add to rmap
+ * @page: Page or Hugepage to add to rmap
 * @vma: VM area to add page to.
 * @address: User virtual address of the mapping
 * @exclusive: the page is exclusively owned by the current process
@@ -1051,7 +1086,6 @@
 static void __page_check_anon_rmap(struct page *page,
 struct vm_area_struct *vma, unsigned long address)
 {
-#ifdef CONFIG_DEBUG_VM
 /*
 * The page's anon-rmap details (mapping and index) are guaranteed to
 * be set up correctly at this point.
@@ -1064,9 +1098,9 @@
 * are initially only visible via the pagetables, and the pte is locked
 * over the call to page_add_new_anon_rmap.
 */
- BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root);
- BUG_ON(page_to_pgoff(page) != linear_page_index(vma, address));
-#endif
+ VM_BUG_ON_PAGE(page_anon_vma(page)->root != vma->anon_vma->root, page);
+ VM_BUG_ON_PAGE(page_to_pgoff(page) != linear_page_index(vma, address),
+ page);
 }

 /**
@@ -1097,6 +1131,12 @@
 {
 bool compound = flags & RMAP_COMPOUND;
 bool first;
+ bool success = false;
+
+ if (unlikely(PageKsm(page)))
+ lock_page_memcg(page);
+ else
+ VM_BUG_ON_PAGE(!PageLocked(page), page);

 if (compound) {
 atomic_t *mapcount;
@@ -1105,11 +1145,14 @@
 mapcount = compound_mapcount_ptr(page);
 first = atomic_inc_and_test(mapcount);
 } else {
- first = atomic_inc_and_test(&page->_mapcount);
+ trace_android_vh_update_page_mapcount(page, true, compound,
+ &first, &success);
+ if (!success)
+ first = atomic_inc_and_test(&page->_mapcount);
 }

 if (first) {
- int nr = compound ? hpage_nr_pages(page) : 1;
+ int nr = compound ? thp_nr_pages(page) : 1;
 /*
 * We use the irq-unsafe __{inc|mod}_zone_page_stat because
 * these counters are not modified in interrupt context, and
@@ -1117,13 +1160,14 @@
 * disabled.
 */
 if (compound)
- __inc_node_page_state(page, NR_ANON_THPS);
- __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr);
+ __inc_lruvec_page_state(page, NR_ANON_THPS);
+ __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
 }
- if (unlikely(PageKsm(page)))
- return;

- VM_BUG_ON_PAGE(!PageLocked(page), page);
+ if (unlikely(PageKsm(page))) {
+ unlock_page_memcg(page);
+ return;
+ }

 /* address might be in next vma when migration races vma_adjust */
 if (first)
@@ -1134,7 +1178,7 @@
 }

 /**
- * page_add_new_anon_rmap - add pte mapping to a new anonymous page
+ * __page_add_new_anon_rmap - add pte mapping to a new anonymous page
 * @page: the page to add the mapping to
 * @vma: the vm area in which the mapping is added
 * @address: the user virtual address mapped
@@ -1144,25 +1188,27 @@
 * This means the inc-and-test can be bypassed.
 * Page does not have to be locked.
 */
-void page_add_new_anon_rmap(struct page *page,
+void __page_add_new_anon_rmap(struct page *page,
 struct vm_area_struct *vma, unsigned long address, bool compound)
 {
- int nr = compound ? hpage_nr_pages(page) : 1;
+ int nr = compound ? thp_nr_pages(page) : 1;

- VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
 __SetPageSwapBacked(page);
 if (compound) {
 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
 /* increment count (starts at -1) */
 atomic_set(compound_mapcount_ptr(page), 0);
- __inc_node_page_state(page, NR_ANON_THPS);
+ if (hpage_pincount_available(page))
+ atomic_set(compound_pincount_ptr(page), 0);
+
+ __inc_lruvec_page_state(page, NR_ANON_THPS);
 } else {
 /* Anon THP always mapped first with PMD */
 VM_BUG_ON_PAGE(PageTransCompound(page), page);
 /* increment count (starts at -1) */
 atomic_set(&page->_mapcount, 0);
 }
- __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr);
+ __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
 __page_set_anon_rmap(page, vma, address, 1);
 }

@@ -1176,18 +1222,29 @@
 void page_add_file_rmap(struct page *page, bool compound)
 {
 int i, nr = 1;
+ bool first_mapping;
+ bool success = false;

 VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
 lock_page_memcg(page);
 if (compound && PageTransHuge(page)) {
- for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
- if (atomic_inc_and_test(&page[i]._mapcount))
- nr++;
+ for (i = 0, nr = 0; i < thp_nr_pages(page); i++) {
+ trace_android_vh_update_page_mapcount(&page[i], true,
+ compound, &first_mapping, &success);
+ if ((success)) {
+ if (first_mapping)
+ nr++;
+ } else {
+ if (atomic_inc_and_test(&page[i]._mapcount))
+ nr++;
+ }
 }
 if (!atomic_inc_and_test(compound_mapcount_ptr(page)))
 goto out;
- VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
- __inc_node_page_state(page, NR_SHMEM_PMDMAPPED);
+ if (PageSwapBacked(page))
+ __inc_node_page_state(page, NR_SHMEM_PMDMAPPED);
+ else
+ __inc_node_page_state(page, NR_FILE_PMDMAPPED);
 } else {
 if (PageTransCompound(page) && page_mapping(page)) {
 VM_WARN_ON_ONCE(!PageLocked(page));
@@ -1196,8 +1253,15 @@
 if (PageMlocked(page))
 clear_page_mlock(compound_head(page));
 }
- if (!atomic_inc_and_test(&page->_mapcount))
- goto out;
+ trace_android_vh_update_page_mapcount(page, true,
+ compound, &first_mapping, &success);
+ if (success) {
+ if (!first_mapping)
+ goto out;
+ } else {
+ if (!atomic_inc_and_test(&page->_mapcount))
+ goto out;
+ }
 }
 __mod_lruvec_page_state(page, NR_FILE_MAPPED, nr);
 out:
@@ -1207,30 +1271,47 @@
 static void page_remove_file_rmap(struct page *page, bool compound)
 {
 int i, nr = 1;
+ bool first_mapping;
+ bool success = false;

 VM_BUG_ON_PAGE(compound && !PageHead(page), page);
- lock_page_memcg(page);

 /* Hugepages are not counted in NR_FILE_MAPPED for now. */
 if (unlikely(PageHuge(page))) {
 /* hugetlb pages are always mapped with pmds */
 atomic_dec(compound_mapcount_ptr(page));
- goto out;
+ return;
 }

 /* page still mapped by someone else? */
 if (compound && PageTransHuge(page)) {
- for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
- if (atomic_add_negative(-1, &page[i]._mapcount))
- nr++;
+ for (i = 0, nr = 0; i < thp_nr_pages(page); i++) {
+ trace_android_vh_update_page_mapcount(&page[i], false,
+ compound, &first_mapping, &success);
+ if (success) {
+ if (first_mapping)
+ nr++;
+ } else {
+ if (atomic_add_negative(-1, &page[i]._mapcount))
+ nr++;
+ }
 }
 if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
- goto out;
- VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
- __dec_node_page_state(page, NR_SHMEM_PMDMAPPED);
+ return;
+ if (PageSwapBacked(page))
+ __dec_node_page_state(page, NR_SHMEM_PMDMAPPED);
+ else
+ __dec_node_page_state(page, NR_FILE_PMDMAPPED);
 } else {
- if (!atomic_add_negative(-1, &page->_mapcount))
- goto out;
+ trace_android_vh_update_page_mapcount(page, false,
+ compound, &first_mapping, &success);
+ if (success) {
+ if (!first_mapping)
+ return;
+ } else {
+ if (!atomic_add_negative(-1, &page->_mapcount))
+ return;
+ }
 }

 /*
@@ -1242,13 +1323,13 @@

 if (unlikely(PageMlocked(page)))
 clear_page_mlock(page);
-out:
- unlock_page_memcg(page);
 }

 static void page_remove_anon_compound_rmap(struct page *page)
 {
 int i, nr;
+ bool first_mapping;
+ bool success = false;

 if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
 return;
@@ -1260,28 +1341,41 @@
 if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
 return;

- __dec_node_page_state(page, NR_ANON_THPS);
+ __dec_lruvec_page_state(page, NR_ANON_THPS);

 if (TestClearPageDoubleMap(page)) {
 /*
 * Subpages can be mapped with PTEs too. Check how many of
- * themi are still mapped.
+ * them are still mapped.
 */
- for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
- if (atomic_add_negative(-1, &page[i]._mapcount))
- nr++;
+ for (i = 0, nr = 0; i < thp_nr_pages(page); i++) {
+ trace_android_vh_update_page_mapcount(&page[i], false,
+ false, &first_mapping, &success);
+ if (success) {
+ if (first_mapping)
+ nr++;
+ } else {
+ if (atomic_add_negative(-1, &page[i]._mapcount))
+ nr++;
+ }
 }
+
+ /*
+ * Queue the page for deferred split if at least one small
+ * page of the compound page is unmapped, but at least one
+ * small page is still mapped.
+ */
+ if (nr && nr < thp_nr_pages(page))
+ deferred_split_huge_page(page);
 } else {
- nr = HPAGE_PMD_NR;
+ nr = thp_nr_pages(page);
 }

 if (unlikely(PageMlocked(page)))
 clear_page_mlock(page);

- if (nr) {
- __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, -nr);
- deferred_split_huge_page(page);
- }
+ if (nr)
+ __mod_lruvec_page_state(page, NR_ANON_MAPPED, -nr);
 }

 /**
@@ -1293,22 +1387,36 @@
 */
 void page_remove_rmap(struct page *page, bool compound)
 {
- if (!PageAnon(page))
- return page_remove_file_rmap(page, compound);
+ bool first_mapping;
+ bool success = false;
+ lock_page_memcg(page);

- if (compound)
- return page_remove_anon_compound_rmap(page);
+ if (!PageAnon(page)) {
+ page_remove_file_rmap(page, compound);
+ goto out;
+ }

- /* page still mapped by someone else? */
- if (!atomic_add_negative(-1, &page->_mapcount))
- return;
+ if (compound) {
+ page_remove_anon_compound_rmap(page);
+ goto out;
+ }

+ trace_android_vh_update_page_mapcount(page, false,
+ compound, &first_mapping, &success);
+ if (success) {
+ if (!first_mapping)
+ goto out;
+ } else {
+ /* page still mapped by someone else? */
+ if (!atomic_add_negative(-1, &page->_mapcount))
+ goto out;
+ }
 /*
 * We use the irq-unsafe __{inc|mod}_zone_page_stat because
 * these counters are not modified in interrupt context, and
 * pte lock(a spinlock) is held, which implies preemption disabled.
 */
- __dec_node_page_state(page, NR_ANON_MAPPED);
+ __dec_lruvec_page_state(page, NR_ANON_MAPPED);

 if (unlikely(PageMlocked(page)))
 clear_page_mlock(page);
@@ -1325,6 +1433,8 @@
 * Leaving it set also helps swapoff to reinstate ptes
 * faster for those pages still in swapcache.
 */
+out:
+ unlock_page_memcg(page);
 }

 /*
@@ -1342,8 +1452,8 @@
 pte_t pteval;
 struct page *subpage;
 bool ret = true;
- unsigned long start = address, end;
- enum ttu_flags flags = (enum ttu_flags)arg;
+ struct mmu_notifier_range range;
+ enum ttu_flags flags = (enum ttu_flags)(long)arg;

 /*
 * When racing against e.g. zap_pte_range() on another cpu,
@@ -1375,16 +1485,19 @@
 * Note that the page can not be free in this function as call of
 * try_to_unmap() must hold a reference on the page.
 */
- end = PageKsm(page) ?
+ range.end = PageKsm(page) ?
 address + PAGE_SIZE : vma_address_end(page, vma);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ address, range.end);
 if (PageHuge(page)) {
 /*
 * If sharing is possible, start and end will be adjusted
 * accordingly.
 */
- adjust_range_if_pmd_sharing_possible(vma, &start, &end);
+ adjust_range_if_pmd_sharing_possible(vma, &range.start,
+ &range.end);
 }
- mmu_notifier_invalidate_range_start(vma->vm_mm, start, end);
+ mmu_notifier_invalidate_range_start(&range);

 while (page_vma_mapped_walk(&pvmw)) {
 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
@@ -1408,7 +1521,7 @@
 if (!PageTransCompound(page)) {
 /*
 * Holding pte lock, we do *not* need
- * mmap_sem here
+ * mmap_lock here
 */
 mlock_vma_page(page);
 }
@@ -1426,8 +1539,14 @@
 subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
 address = pvmw.address;

- if (PageHuge(page)) {
- if (huge_pmd_unshare(mm, &address, pvmw.pte)) {
+ if (PageHuge(page) && !PageAnon(page)) {
+ /*
+ * To call huge_pmd_unshare, i_mmap_rwsem must be
+ * held in write mode. Caller needs to explicitly
+ * do this outside rmap routines.
+ */
+ VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
+ if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) {
 /*
 * huge_pmd_unshare unmapped an entire PMD
 * page. There is no way of knowing exactly
@@ -1435,9 +1554,10 @@
 * we must flush them all. start/end were
 * already adjusted above to cover this range.
 */
- flush_cache_range(vma, start, end);
- flush_tlb_range(vma, start, end);
- mmu_notifier_invalidate_range(mm, start, end);
+ flush_cache_range(vma, range.start, range.end);
+ flush_tlb_range(vma, range.start, range.end);
+ mmu_notifier_invalidate_range(mm, range.start,
+ range.end);

 /*
 * The ref count of the PMD page was dropped
@@ -1468,8 +1588,15 @@
 */
 entry = make_migration_entry(page, 0);
 swp_pte = swp_entry_to_pte(entry);
- if (pte_soft_dirty(pteval))
+
+ /*
+ * pteval maps a zone device page and is therefore
+ * a swap pte.
+ */
+ if (pte_swp_soft_dirty(pteval))
 swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_swp_uffd_wp(pteval))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
 set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
 /*
 * No need to invalidate here it will synchronize on
@@ -1484,15 +1611,6 @@
 */
 subpage = page;
 goto discard;
- }
-
- if (!(flags & TTU_IGNORE_ACCESS)) {
- if (ptep_clear_flush_young_notify(vma, address,
- pvmw.pte)) {
- ret = false;
- page_vma_mapped_walk_done(&pvmw);
- break;
- }
 }

 /* Nuke the page table entry. */
@@ -1523,8 +1641,7 @@
 if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
 pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
 if (PageHuge(page)) {
- int nr = 1 << compound_order(page);
- hugetlb_count_sub(nr, mm);
+ hugetlb_count_sub(compound_nr(page), mm);
 set_huge_swap_pte_at(mm, address,
 pvmw.pte, pteval,
 vma_mmu_pagesize(vma));
@@ -1570,6 +1687,8 @@
 swp_pte = swp_entry_to_pte(entry);
 if (pte_soft_dirty(pteval))
 swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_uffd_wp(pteval))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
 set_pte_at(mm, address, pvmw.pte, swp_pte);
 /*
 * No need to invalidate here it will synchronize on
@@ -1594,7 +1713,30 @@

 /* MADV_FREE page check */
 if (!PageSwapBacked(page)) {
- if (!PageDirty(page)) {
+ int ref_count, map_count;
+
+ /*
+ * Synchronize with gup_pte_range():
+ * - clear PTE; barrier; read refcount
+ * - inc refcount; barrier; read PTE
+ */
+ smp_mb();
+
+ ref_count = page_ref_count(page);
+ map_count = page_mapcount(page);
+
+ /*
+ * Order reads for page refcount and dirty flag
+ * (see comments in __remove_mapping()).
+ */
+ smp_rmb();
+
+ /*
+ * The only page refs must be one from isolation
+ * plus the rmap(s) (dropped by discard:).
+ */
+ if (ref_count == 1 + map_count &&
+ !PageDirty(page)) {
 /* Invalidate as we cleared the pte */
 mmu_notifier_invalidate_range(mm,
 address, address + PAGE_SIZE);
@@ -1636,6 +1778,8 @@
 swp_pte = swp_entry_to_pte(entry);
 if (pte_soft_dirty(pteval))
 swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_uffd_wp(pteval))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
 set_pte_at(mm, address, pvmw.pte, swp_pte);
 /* Invalidate as we cleared the pte */
 mmu_notifier_invalidate_range(mm, address,
@@ -1665,28 +1809,15 @@
 put_page(page);
 }

- mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);
+ mmu_notifier_invalidate_range_end(&range);
+ trace_android_vh_try_to_unmap_one(vma, page, address, ret);

 return ret;
 }

-bool is_vma_temporary_stack(struct vm_area_struct *vma)
-{
- int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
-
- if (!maybe_stack)
- return false;
-
- if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
- VM_STACK_INCOMPLETE_SETUP)
- return true;
-
- return false;
-}
-
 static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
 {
- return is_vma_temporary_stack(vma);
+ return vma_is_temporary_stack(vma);
 }

 static int page_not_mapped(struct page *page)
@@ -1779,19 +1910,29 @@
 struct anon_vma *anon_vma;

 if (rwc->anon_lock)
- return rwc->anon_lock(page);
+ return rwc->anon_lock(page, rwc);

 /*
 * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
 * because that depends on page_mapped(); but not all its usages
- * are holding mmap_sem. Users without mmap_sem are required to
+ * are holding mmap_lock. Users without mmap_lock are required to
 * take a reference count to prevent the anon_vma disappearing
 */
 anon_vma = page_anon_vma(page);
 if (!anon_vma)
 return NULL;

+ if (anon_vma_trylock_read(anon_vma))
+ goto out;
+
+ if (rwc->try_lock) {
+ anon_vma = NULL;
+ rwc->contended = true;
+ goto out;
+ }
+
 anon_vma_lock_read(anon_vma);
+out:
 return anon_vma;
 }

@@ -1804,7 +1945,7 @@
 * Find all the mappings of a page using the mapping pointer and the vma chains
 * contained in the anon_vma struct it points to.
 *
- * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
+ * When called from try_to_munlock(), the mmap_lock of the mm containing the vma
 * where the page was found will be held for write. So, we won't recheck
 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
 * LOCKED.
@@ -1827,7 +1968,7 @@
 return;

 pgoff_start = page_to_pgoff(page);
- pgoff_end = pgoff_start + hpage_nr_pages(page) - 1;
+ pgoff_end = pgoff_start + thp_nr_pages(page) - 1;
 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
 pgoff_start, pgoff_end) {
 struct vm_area_struct *vma = avc->vma;
@@ -1857,7 +1998,7 @@
 * Find all the mappings of a page using the mapping pointer and the vma chains
 * contained in the address_space struct it points to.
 *
- * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
+ * When called from try_to_munlock(), the mmap_lock of the mm containing the vma
 * where the page was found will be held for write. So, we won't recheck
 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
 * LOCKED.
@@ -1868,6 +2009,7 @@
 struct address_space *mapping = page_mapping(page);
 pgoff_t pgoff_start, pgoff_end;
 struct vm_area_struct *vma;
+ bool got_lock = false, success = false;

 /*
 * The page lock not only makes sure that page->mapping cannot
@@ -1881,9 +2023,26 @@
 return;

 pgoff_start = page_to_pgoff(page);
- pgoff_end = pgoff_start + hpage_nr_pages(page) - 1;
- if (!locked)
- i_mmap_lock_read(mapping);
+ pgoff_end = pgoff_start + thp_nr_pages(page) - 1;
+ if (!locked) {
+ trace_android_vh_do_page_trylock(page,
+ &mapping->i_mmap_rwsem, &got_lock, &success);
+ if (success) {
+ if (!got_lock)
+ return;
+ } else {
+ if (i_mmap_trylock_read(mapping))
+ goto lookup;
+
+ if (rwc->try_lock) {
+ rwc->contended = true;
+ return;
+ }
+
+ i_mmap_lock_read(mapping);
+ }
+ }
+lookup:
 vma_interval_tree_foreach(vma, &mapping->i_mmap,
 pgoff_start, pgoff_end) {
 unsigned long address = vma_address(page, vma);
@@ -1928,27 +2087,10 @@

 #ifdef CONFIG_HUGETLB_PAGE
 /*
- * The following three functions are for anonymous (private mapped) hugepages.
+ * The following two functions are for anonymous (private mapped) hugepages.
 * Unlike common anonymous pages, anonymous hugepages have no accounting code
 * and no lru code, because we handle hugepages differently from common pages.
 */
-static void __hugepage_set_anon_rmap(struct page *page,
- struct vm_area_struct *vma, unsigned long address, int exclusive)
-{
- struct anon_vma *anon_vma = vma->anon_vma;
-
- BUG_ON(!anon_vma);
-
- if (PageAnon(page))
- return;
- if (!exclusive)
- anon_vma = anon_vma->root;
-
- anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
- page->mapping = (struct address_space *) anon_vma;
- page->index = linear_page_index(vma, address);
-}
-
 void hugepage_add_anon_rmap(struct page *page,
 struct vm_area_struct *vma, unsigned long address)
 {
@@ -1960,7 +2102,7 @@
 /* address might be in next vma when migration races vma_adjust */
 first = atomic_inc_and_test(compound_mapcount_ptr(page));
 if (first)
- __hugepage_set_anon_rmap(page, vma, address, 0);
+ __page_set_anon_rmap(page, vma, address, 0);
 }

 void hugepage_add_new_anon_rmap(struct page *page,
@@ -1968,6 +2110,9 @@
 {
 BUG_ON(address < vma->vm_start || address >= vma->vm_end);
 atomic_set(compound_mapcount_ptr(page), 0);
- __hugepage_set_anon_rmap(page, vma, address, 1);
+ if (hpage_pincount_available(page))
+ atomic_set(compound_pincount_ptr(page), 0);
+
+ __page_set_anon_rmap(page, vma, address, 1);
 }
 #endif /* CONFIG_HUGETLB_PAGE */