Commit 071106ecf68c401173c58808b1cf5f68cc50d390 (2024-01-05)
File: kernel/mm/migrate.c
....@@ -38,6 +38,7 @@
3838 #include <linux/hugetlb.h>
3939 #include <linux/hugetlb_cgroup.h>
4040 #include <linux/gfp.h>
41
+#include <linux/pagewalk.h>
4142 #include <linux/pfn_t.h>
4243 #include <linux/memremap.h>
4344 #include <linux/userfaultfd_k.h>
....@@ -47,39 +48,17 @@
4748 #include <linux/page_owner.h>
4849 #include <linux/sched/mm.h>
4950 #include <linux/ptrace.h>
51
+#include <linux/oom.h>
5052
5153 #include <asm/tlbflush.h>
5254
5355 #define CREATE_TRACE_POINTS
5456 #include <trace/events/migrate.h>
57
+#undef CREATE_TRACE_POINTS
58
+#include <trace/hooks/mm.h>
59
+#include <trace/hooks/vmscan.h>
5560
5661 #include "internal.h"
57
-
58
-/*
59
- * migrate_prep() needs to be called before we start compiling a list of pages
60
- * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is
61
- * undesirable, use migrate_prep_local()
62
- */
63
-int migrate_prep(void)
64
-{
65
- /*
66
- * Clear the LRU lists so pages can be isolated.
67
- * Note that pages may be moved off the LRU after we have
68
- * drained them. Those pages will fail to migrate like other
69
- * pages that may be busy.
70
- */
71
- lru_add_drain_all();
72
-
73
- return 0;
74
-}
75
-
76
-/* Do the necessary work of migrate_prep but not if it involves other CPUs */
77
-int migrate_prep_local(void)
78
-{
79
- lru_add_drain();
80
-
81
- return 0;
82
-}
8362
8463 int isolate_movable_page(struct page *page, isolate_mode_t mode)
8564 {
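The two helpers removed above, migrate_prep() and migrate_prep_local(), were thin wrappers around lru_add_drain_all() and lru_add_drain(); the do_pages_move() hunk later in this diff switches its caller over to lru_cache_disable()/lru_cache_enable() instead. A minimal sketch of the replacement pattern, assuming a caller that isolates LRU pages for migration:

    lru_cache_disable();    /* drain per-CPU LRU pagevecs and keep batching off */

    /* ... isolate_lru_page() on each candidate, then migrate_pages() ... */

    lru_cache_enable();     /* restore normal LRU batching once migration is done */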
....@@ -100,7 +79,7 @@
10079 /*
10180 * Check PageMovable before holding a PG_lock because page's owner
10281 * assumes anybody doesn't touch PG_lock of newly allocated page
103
- * so unconditionally grapping the lock ruins page's owner side.
82
+ * so unconditionally grabbing the lock ruins page's owner side.
10483 */
10584 if (unlikely(!__PageMovable(page)))
10685 goto out_putpage;
....@@ -129,7 +108,7 @@
129108
130109 /* Driver shouldn't use PG_isolated bit of page->flags */
131110 WARN_ON_ONCE(PageIsolated(page));
132
- __SetPageIsolated(page);
111
+ SetPageIsolated(page);
133112 unlock_page(page);
134113
135114 return 0;
....@@ -153,7 +132,7 @@
153132
154133 mapping = page_mapping(page);
155134 mapping->a_ops->putback_page(page);
156
- __ClearPageIsolated(page);
135
+ ClearPageIsolated(page);
157136 }
158137
159138 /*
....@@ -162,7 +141,7 @@
162141 *
163142 * This function shall be used whenever the isolated pageset has been
164143 * built from lru, balloon, hugetlbfs page. See isolate_migratepages_range()
165
- * and isolate_huge_page().
144
+ * and isolate_hugetlb().
166145 */
167146 void putback_movable_pages(struct list_head *l)
168147 {
....@@ -186,16 +165,17 @@
186165 if (PageMovable(page))
187166 putback_movable_page(page);
188167 else
189
- __ClearPageIsolated(page);
168
+ ClearPageIsolated(page);
190169 unlock_page(page);
191170 put_page(page);
192171 } else {
193172 mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
194
- page_is_file_cache(page), -hpage_nr_pages(page));
173
+ page_is_file_lru(page), -thp_nr_pages(page));
195174 putback_lru_page(page);
196175 }
197176 }
198177 }
178
+EXPORT_SYMBOL_GPL(putback_movable_pages);
199179
200180 /*
201181 * Restore a potential migration pte to a working pte entry
....@@ -240,15 +220,17 @@
240220 */
241221 entry = pte_to_swp_entry(*pvmw.pte);
242222 if (is_write_migration_entry(entry))
243
- pte = maybe_mkwrite(pte, vma);
223
+ pte = maybe_mkwrite(pte, vma->vm_flags);
224
+ else if (pte_swp_uffd_wp(*pvmw.pte))
225
+ pte = pte_mkuffd_wp(pte);
244226
245
- if (unlikely(is_zone_device_page(new))) {
246
- if (is_device_private_page(new)) {
247
- entry = make_device_private_entry(new, pte_write(pte));
248
- pte = swp_entry_to_pte(entry);
249
- } else if (is_device_public_page(new)) {
250
- pte = pte_mkdevmap(pte);
251
- }
227
+ if (unlikely(is_device_private_page(new))) {
228
+ entry = make_device_private_entry(new, pte_write(pte));
229
+ pte = swp_entry_to_pte(entry);
230
+ if (pte_swp_soft_dirty(*pvmw.pte))
231
+ pte = pte_swp_mksoft_dirty(pte);
232
+ if (pte_swp_uffd_wp(*pvmw.pte))
233
+ pte = pte_swp_mkuffd_wp(pte);
252234 }
253235
254236 #ifdef CONFIG_HUGETLB_PAGE
....@@ -322,19 +304,18 @@
322304 goto out;
323305
324306 page = migration_entry_to_page(entry);
307
+ page = compound_head(page);
325308
326309 /*
327
- * Once radix-tree replacement of page migration started, page_count
328
- * *must* be zero. And, we don't want to call wait_on_page_locked()
329
- * against a page without get_page().
330
- * So, we use get_page_unless_zero(), here. Even failed, page fault
331
- * will occur again.
310
+ * Once page cache replacement of page migration started, page_count
311
+ * is zero; but we must not call put_and_wait_on_page_locked() without
312
+ * a ref. Use get_page_unless_zero(), and just fault again if it fails.
332313 */
333314 if (!get_page_unless_zero(page))
334315 goto out;
335316 pte_unmap_unlock(ptep, ptl);
336
- wait_on_page_locked(page);
337
- put_page(page);
317
+ trace_android_vh_waiting_for_page_migration(page);
318
+ put_and_wait_on_page_locked(page);
338319 return;
339320 out:
340321 pte_unmap_unlock(ptep, ptl);
....@@ -368,63 +349,27 @@
368349 if (!get_page_unless_zero(page))
369350 goto unlock;
370351 spin_unlock(ptl);
371
- wait_on_page_locked(page);
372
- put_page(page);
352
+ put_and_wait_on_page_locked(page);
373353 return;
374354 unlock:
375355 spin_unlock(ptl);
376356 }
377357 #endif
378358
379
-#ifdef CONFIG_BLOCK
380
-/* Returns true if all buffers are successfully locked */
381
-static bool buffer_migrate_lock_buffers(struct buffer_head *head,
382
- enum migrate_mode mode)
359
+static int expected_page_refs(struct address_space *mapping, struct page *page)
383360 {
384
- struct buffer_head *bh = head;
361
+ int expected_count = 1;
385362
386
- /* Simple case, sync compaction */
387
- if (mode != MIGRATE_ASYNC) {
388
- do {
389
- get_bh(bh);
390
- lock_buffer(bh);
391
- bh = bh->b_this_page;
363
+ /*
364
+ * Device private pages have an extra refcount as they are
365
+ * ZONE_DEVICE pages.
366
+ */
367
+ expected_count += is_device_private_page(page);
368
+ if (mapping)
369
+ expected_count += thp_nr_pages(page) + page_has_private(page);
392370
393
- } while (bh != head);
394
-
395
- return true;
396
- }
397
-
398
- /* async case, we cannot block on lock_buffer so use trylock_buffer */
399
- do {
400
- get_bh(bh);
401
- if (!trylock_buffer(bh)) {
402
- /*
403
- * We failed to lock the buffer and cannot stall in
404
- * async migration. Release the taken locks
405
- */
406
- struct buffer_head *failed_bh = bh;
407
- put_bh(failed_bh);
408
- bh = head;
409
- while (bh != failed_bh) {
410
- unlock_buffer(bh);
411
- put_bh(bh);
412
- bh = bh->b_this_page;
413
- }
414
- return false;
415
- }
416
-
417
- bh = bh->b_this_page;
418
- } while (bh != head);
419
- return true;
371
+ return expected_count;
420372 }
421
-#else
422
-static inline bool buffer_migrate_lock_buffers(struct buffer_head *head,
423
- enum migrate_mode mode)
424
-{
425
- return true;
426
-}
427
-#endif /* CONFIG_BLOCK */
428373
429374 /*
430375 * Replace the page in the mapping.
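The new expected_page_refs() helper centralizes the reference-count bookkeeping that migrate_page_move_mapping() and, further down, __buffer_migrate_page() rely on. A worked example, assuming an order-9 THP (HPAGE_PMD_NR == 512 with 4K pages) sitting in a file mapping with private data attached:

    int expected = 1        /* reference held by the migration path itself */
                 + 0        /* not a ZONE_DEVICE private page              */
                 + 512      /* thp_nr_pages(page): one ref per cache entry */
                 + 1;       /* page_has_private(page)                      */

    /* migrate_page_move_mapping() requires page_count(page) to equal
     * expected + extra_count before it will freeze the refcount and
     * swap the page in the mapping; anything else returns -EAGAIN.   */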
....@@ -435,21 +380,13 @@
435380 * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
436381 */
437382 int migrate_page_move_mapping(struct address_space *mapping,
438
- struct page *newpage, struct page *page,
439
- struct buffer_head *head, enum migrate_mode mode,
440
- int extra_count)
383
+ struct page *newpage, struct page *page, int extra_count)
441384 {
385
+ XA_STATE(xas, &mapping->i_pages, page_index(page));
442386 struct zone *oldzone, *newzone;
443387 int dirty;
444
- int expected_count = 1 + extra_count;
445
- void **pslot;
446
-
447
- /*
448
- * Device public or private pages have an extra refcount as they are
449
- * ZONE_DEVICE pages.
450
- */
451
- expected_count += is_device_private_page(page);
452
- expected_count += is_device_public_page(page);
388
+ int expected_count = expected_page_refs(mapping, page) + extra_count;
389
+ int nr = thp_nr_pages(page);
453390
454391 if (!mapping) {
455392 /* Anonymous page without mapping */
....@@ -468,35 +405,14 @@
468405 oldzone = page_zone(page);
469406 newzone = page_zone(newpage);
470407
471
- xa_lock_irq(&mapping->i_pages);
472
-
473
- pslot = radix_tree_lookup_slot(&mapping->i_pages,
474
- page_index(page));
475
-
476
- expected_count += hpage_nr_pages(page) + page_has_private(page);
477
- if (page_count(page) != expected_count ||
478
- radix_tree_deref_slot_protected(pslot,
479
- &mapping->i_pages.xa_lock) != page) {
480
- xa_unlock_irq(&mapping->i_pages);
408
+ xas_lock_irq(&xas);
409
+ if (page_count(page) != expected_count || xas_load(&xas) != page) {
410
+ xas_unlock_irq(&xas);
481411 return -EAGAIN;
482412 }
483413
484414 if (!page_ref_freeze(page, expected_count)) {
485
- xa_unlock_irq(&mapping->i_pages);
486
- return -EAGAIN;
487
- }
488
-
489
- /*
490
- * In the async migration case of moving a page with buffers, lock the
491
- * buffers using trylock before the mapping is moved. If the mapping
492
- * was moved, we later failed to lock the buffers and could not move
493
- * the mapping back due to an elevated page count, we would have to
494
- * block waiting on other references to be dropped.
495
- */
496
- if (mode == MIGRATE_ASYNC && head &&
497
- !buffer_migrate_lock_buffers(head, mode)) {
498
- page_ref_unfreeze(page, expected_count);
499
- xa_unlock_irq(&mapping->i_pages);
415
+ xas_unlock_irq(&xas);
500416 return -EAGAIN;
501417 }
502418
....@@ -506,7 +422,7 @@
506422 */
507423 newpage->index = page->index;
508424 newpage->mapping = page->mapping;
509
- page_ref_add(newpage, hpage_nr_pages(page)); /* add cache reference */
425
+ page_ref_add(newpage, nr); /* add cache reference */
510426 if (PageSwapBacked(page)) {
511427 __SetPageSwapBacked(newpage);
512428 if (PageSwapCache(page)) {
....@@ -524,16 +440,13 @@
524440 SetPageDirty(newpage);
525441 }
526442
527
- radix_tree_replace_slot(&mapping->i_pages, pslot, newpage);
443
+ xas_store(&xas, newpage);
528444 if (PageTransHuge(page)) {
529445 int i;
530
- int index = page_index(page);
531446
532
- for (i = 1; i < HPAGE_PMD_NR; i++) {
533
- pslot = radix_tree_lookup_slot(&mapping->i_pages,
534
- index + i);
535
- radix_tree_replace_slot(&mapping->i_pages, pslot,
536
- newpage + i);
447
+ for (i = 1; i < nr; i++) {
448
+ xas_next(&xas);
449
+ xas_store(&xas, newpage);
537450 }
538451 }
539452
....@@ -542,9 +455,9 @@
542455 * to one less reference.
543456 * We know this isn't the last reference.
544457 */
545
- page_ref_unfreeze(page, expected_count - hpage_nr_pages(page));
458
+ page_ref_unfreeze(page, expected_count - nr);
546459
547
- xa_unlock(&mapping->i_pages);
460
+ xas_unlock(&xas);
548461 /* Leave irq disabled to prevent preemption while updating stats */
549462
550463 /*
....@@ -558,17 +471,24 @@
558471 * are mapped to swap space.
559472 */
560473 if (newzone != oldzone) {
561
- __dec_node_state(oldzone->zone_pgdat, NR_FILE_PAGES);
562
- __inc_node_state(newzone->zone_pgdat, NR_FILE_PAGES);
474
+ struct lruvec *old_lruvec, *new_lruvec;
475
+ struct mem_cgroup *memcg;
476
+
477
+ memcg = page_memcg(page);
478
+ old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat);
479
+ new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat);
480
+
481
+ __mod_lruvec_state(old_lruvec, NR_FILE_PAGES, -nr);
482
+ __mod_lruvec_state(new_lruvec, NR_FILE_PAGES, nr);
563483 if (PageSwapBacked(page) && !PageSwapCache(page)) {
564
- __dec_node_state(oldzone->zone_pgdat, NR_SHMEM);
565
- __inc_node_state(newzone->zone_pgdat, NR_SHMEM);
484
+ __mod_lruvec_state(old_lruvec, NR_SHMEM, -nr);
485
+ __mod_lruvec_state(new_lruvec, NR_SHMEM, nr);
566486 }
567
- if (dirty && mapping_cap_account_dirty(mapping)) {
568
- __dec_node_state(oldzone->zone_pgdat, NR_FILE_DIRTY);
569
- __dec_zone_state(oldzone, NR_ZONE_WRITE_PENDING);
570
- __inc_node_state(newzone->zone_pgdat, NR_FILE_DIRTY);
571
- __inc_zone_state(newzone, NR_ZONE_WRITE_PENDING);
487
+ if (dirty && mapping_can_writeback(mapping)) {
488
+ __mod_lruvec_state(old_lruvec, NR_FILE_DIRTY, -nr);
489
+ __mod_zone_page_state(oldzone, NR_ZONE_WRITE_PENDING, -nr);
490
+ __mod_lruvec_state(new_lruvec, NR_FILE_DIRTY, nr);
491
+ __mod_zone_page_state(newzone, NR_ZONE_WRITE_PENDING, nr);
572492 }
573493 }
574494 local_irq_enable();
....@@ -584,22 +504,18 @@
584504 int migrate_huge_page_move_mapping(struct address_space *mapping,
585505 struct page *newpage, struct page *page)
586506 {
507
+ XA_STATE(xas, &mapping->i_pages, page_index(page));
587508 int expected_count;
588
- void **pslot;
589509
590
- xa_lock_irq(&mapping->i_pages);
591
-
592
- pslot = radix_tree_lookup_slot(&mapping->i_pages, page_index(page));
593
-
510
+ xas_lock_irq(&xas);
594511 expected_count = 2 + page_has_private(page);
595
- if (page_count(page) != expected_count ||
596
- radix_tree_deref_slot_protected(pslot, &mapping->i_pages.xa_lock) != page) {
597
- xa_unlock_irq(&mapping->i_pages);
512
+ if (page_count(page) != expected_count || xas_load(&xas) != page) {
513
+ xas_unlock_irq(&xas);
598514 return -EAGAIN;
599515 }
600516
601517 if (!page_ref_freeze(page, expected_count)) {
602
- xa_unlock_irq(&mapping->i_pages);
518
+ xas_unlock_irq(&xas);
603519 return -EAGAIN;
604520 }
605521
....@@ -608,11 +524,11 @@
608524
609525 get_page(newpage);
610526
611
- radix_tree_replace_slot(&mapping->i_pages, pslot, newpage);
527
+ xas_store(&xas, newpage);
612528
613529 page_ref_unfreeze(page, expected_count - 1);
614530
615
- xa_unlock_irq(&mapping->i_pages);
531
+ xas_unlock_irq(&xas);
616532
617533 return MIGRATEPAGE_SUCCESS;
618534 }
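Both mapping-replacement paths above now drive the page cache through the XArray API instead of raw radix-tree slot lookups, and the buffer-head trylocking has been pulled out of migrate_page_move_mapping() entirely (it reappears below inside __buffer_migrate_page()). A condensed sketch of the xas_* sequence, with the statistics update and the hugetlb variant trimmed:

    XA_STATE(xas, &mapping->i_pages, page_index(page));

    xas_lock_irq(&xas);
    if (page_count(page) != expected_count || xas_load(&xas) != page) {
            xas_unlock_irq(&xas);
            return -EAGAIN;                 /* raced with truncation or another migration */
    }
    if (!page_ref_freeze(page, expected_count)) {
            xas_unlock_irq(&xas);
            return -EAGAIN;
    }
    xas_store(&xas, newpage);               /* head index */
    for (i = 1; i < nr; i++) {              /* nr = thp_nr_pages(page) */
            xas_next(&xas);
            xas_store(&xas, newpage);       /* tail indices also point at the new head */
    }
    page_ref_unfreeze(page, expected_count - nr);
    xas_unlock(&xas);                       /* irqs stay disabled for the stats update */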
....@@ -656,7 +572,7 @@
656572 } else {
657573 /* thp page */
658574 BUG_ON(!PageTransHuge(src));
659
- nr_pages = hpage_nr_pages(src);
575
+ nr_pages = thp_nr_pages(src);
660576 }
661577
662578 for (i = 0; i < nr_pages; i++) {
....@@ -671,6 +587,8 @@
671587 void migrate_page_states(struct page *newpage, struct page *page)
672588 {
673589 int cpupid;
590
+
591
+ trace_android_vh_migrate_page_states(page, newpage);
674592
675593 if (PageError(page))
676594 SetPageError(newpage);
....@@ -689,6 +607,7 @@
689607 SetPageChecked(newpage);
690608 if (PageMappedToDisk(page))
691609 SetPageMappedToDisk(newpage);
610
+ trace_android_vh_look_around_migrate_page(page, newpage);
692611
693612 /* Move dirty on pages not done by migrate_page_move_mapping() */
694613 if (PageDirty(page))
....@@ -723,9 +642,18 @@
723642 if (PageWriteback(newpage))
724643 end_page_writeback(newpage);
725644
645
+ /*
646
+ * PG_readahead shares the same bit with PG_reclaim. The above
647
+ * end_page_writeback() may clear PG_readahead mistakenly, so set the
648
+ * bit after that.
649
+ */
650
+ if (PageReadahead(page))
651
+ SetPageReadahead(newpage);
652
+
726653 copy_page_owner(page, newpage);
727654
728
- mem_cgroup_migrate(page, newpage);
655
+ if (!PageHuge(page))
656
+ mem_cgroup_migrate(page, newpage);
729657 }
730658 EXPORT_SYMBOL(migrate_page_states);
731659
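Two details in the hunks above are easy to miss: PG_readahead and PG_reclaim share a single page-flag bit, so the readahead hint can only be copied after end_page_writeback() has run on the new page, and mem_cgroup_migrate() is now skipped for hugetlb pages (hugetlb memory is charged to the hugetlb cgroup controller rather than the memcg state being moved here). The ordering constraint, reduced to the two statements that interact:

    if (PageWriteback(newpage))
            end_page_writeback(newpage);    /* may clear PG_reclaim, i.e. PG_readahead */

    if (PageReadahead(page))                /* so copy the readahead hint only afterwards */
            SetPageReadahead(newpage);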
....@@ -758,7 +686,7 @@
758686
759687 BUG_ON(PageWriteback(page)); /* Writeback must be complete */
760688
761
- rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
689
+ rc = migrate_page_move_mapping(mapping, newpage, page, 0);
762690
763691 if (rc != MIGRATEPAGE_SUCCESS)
764692 return rc;
....@@ -772,40 +700,96 @@
772700 EXPORT_SYMBOL(migrate_page);
773701
774702 #ifdef CONFIG_BLOCK
775
-/*
776
- * Migration function for pages with buffers. This function can only be used
777
- * if the underlying filesystem guarantees that no other references to "page"
778
- * exist.
779
- */
780
-int buffer_migrate_page(struct address_space *mapping,
781
- struct page *newpage, struct page *page, enum migrate_mode mode)
703
+/* Returns true if all buffers are successfully locked */
704
+static bool buffer_migrate_lock_buffers(struct buffer_head *head,
705
+ enum migrate_mode mode)
706
+{
707
+ struct buffer_head *bh = head;
708
+
709
+ /* Simple case, sync compaction */
710
+ if (mode != MIGRATE_ASYNC) {
711
+ do {
712
+ lock_buffer(bh);
713
+ bh = bh->b_this_page;
714
+
715
+ } while (bh != head);
716
+
717
+ return true;
718
+ }
719
+
720
+ /* async case, we cannot block on lock_buffer so use trylock_buffer */
721
+ do {
722
+ if (!trylock_buffer(bh)) {
723
+ /*
724
+ * We failed to lock the buffer and cannot stall in
725
+ * async migration. Release the taken locks
726
+ */
727
+ struct buffer_head *failed_bh = bh;
728
+ bh = head;
729
+ while (bh != failed_bh) {
730
+ unlock_buffer(bh);
731
+ bh = bh->b_this_page;
732
+ }
733
+ return false;
734
+ }
735
+
736
+ bh = bh->b_this_page;
737
+ } while (bh != head);
738
+ return true;
739
+}
740
+
741
+static int __buffer_migrate_page(struct address_space *mapping,
742
+ struct page *newpage, struct page *page, enum migrate_mode mode,
743
+ bool check_refs)
782744 {
783745 struct buffer_head *bh, *head;
784746 int rc;
747
+ int expected_count;
785748
786749 if (!page_has_buffers(page))
787750 return migrate_page(mapping, newpage, page, mode);
788751
752
+ /* Check whether page does not have extra refs before we do more work */
753
+ expected_count = expected_page_refs(mapping, page);
754
+ if (page_count(page) != expected_count)
755
+ return -EAGAIN;
756
+
789757 head = page_buffers(page);
758
+ if (!buffer_migrate_lock_buffers(head, mode))
759
+ return -EAGAIN;
790760
791
- rc = migrate_page_move_mapping(mapping, newpage, page, head, mode, 0);
761
+ if (check_refs) {
762
+ bool busy;
763
+ bool invalidated = false;
792764
765
+recheck_buffers:
766
+ busy = false;
767
+ spin_lock(&mapping->private_lock);
768
+ bh = head;
769
+ do {
770
+ if (atomic_read(&bh->b_count)) {
771
+ busy = true;
772
+ break;
773
+ }
774
+ bh = bh->b_this_page;
775
+ } while (bh != head);
776
+ if (busy) {
777
+ if (invalidated) {
778
+ rc = -EAGAIN;
779
+ goto unlock_buffers;
780
+ }
781
+ spin_unlock(&mapping->private_lock);
782
+ invalidate_bh_lrus();
783
+ invalidated = true;
784
+ goto recheck_buffers;
785
+ }
786
+ }
787
+
788
+ rc = migrate_page_move_mapping(mapping, newpage, page, 0);
793789 if (rc != MIGRATEPAGE_SUCCESS)
794
- return rc;
790
+ goto unlock_buffers;
795791
796
- /*
797
- * In the async case, migrate_page_move_mapping locked the buffers
798
- * with an IRQ-safe spinlock held. In the sync case, the buffers
799
- * need to be locked now
800
- */
801
- if (mode != MIGRATE_ASYNC)
802
- BUG_ON(!buffer_migrate_lock_buffers(head, mode));
803
-
804
- ClearPagePrivate(page);
805
- set_page_private(newpage, page_private(page));
806
- set_page_private(page, 0);
807
- put_page(page);
808
- get_page(newpage);
792
+ attach_page_private(newpage, detach_page_private(page));
809793
810794 bh = head;
811795 do {
....@@ -814,24 +798,48 @@
814798
815799 } while (bh != head);
816800
817
- SetPagePrivate(newpage);
818
-
819801 if (mode != MIGRATE_SYNC_NO_COPY)
820802 migrate_page_copy(newpage, page);
821803 else
822804 migrate_page_states(newpage, page);
823805
806
+ rc = MIGRATEPAGE_SUCCESS;
807
+unlock_buffers:
808
+ if (check_refs)
809
+ spin_unlock(&mapping->private_lock);
824810 bh = head;
825811 do {
826812 unlock_buffer(bh);
827
- put_bh(bh);
828813 bh = bh->b_this_page;
829814
830815 } while (bh != head);
831816
832
- return MIGRATEPAGE_SUCCESS;
817
+ return rc;
818
+}
819
+
820
+/*
821
+ * Migration function for pages with buffers. This function can only be used
822
+ * if the underlying filesystem guarantees that no other references to "page"
823
+ * exist. For example attached buffer heads are accessed only under page lock.
824
+ */
825
+int buffer_migrate_page(struct address_space *mapping,
826
+ struct page *newpage, struct page *page, enum migrate_mode mode)
827
+{
828
+ return __buffer_migrate_page(mapping, newpage, page, mode, false);
833829 }
834830 EXPORT_SYMBOL(buffer_migrate_page);
831
+
832
+/*
833
+ * Same as above except that this variant is more careful and checks that there
834
+ * are also no buffer head references. This function is the right one for
835
+ * mappings where buffer heads are directly looked up and referenced (such as
836
+ * block device mappings).
837
+ */
838
+int buffer_migrate_page_norefs(struct address_space *mapping,
839
+ struct page *newpage, struct page *page, enum migrate_mode mode)
840
+{
841
+ return __buffer_migrate_page(mapping, newpage, page, mode, true);
842
+}
835843 #endif
836844
837845 /*
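In __buffer_migrate_page() above, the open-coded handover of page_private() (ClearPagePrivate plus set_page_private plus a put_page/get_page pair) collapses into one attach_page_private(newpage, detach_page_private(page)) call, and the stricter buffer_migrate_page_norefs() variant refuses to migrate while any buffer head still has an elevated b_count, retrying once after invalidate_bh_lrus(). A sketch of what the two private-data helpers do, paraphrased from include/linux/pagemap.h:

    void *data = detach_page_private(page);
    /* detach: ClearPagePrivate(page), zero page_private(page),
     * put_page(page), and hand back the old private pointer     */

    attach_page_private(newpage, data);
    /* attach: get_page(newpage), store the pointer in
     * page_private(newpage), SetPagePrivate(newpage)            */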
....@@ -899,7 +907,7 @@
899907 */
900908 if (page_has_private(page) &&
901909 !try_to_release_page(page, GFP_KERNEL))
902
- return -EAGAIN;
910
+ return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY;
903911
904912 return migrate_page(mapping, newpage, page, mode);
905913 }
....@@ -951,7 +959,7 @@
951959 VM_BUG_ON_PAGE(!PageIsolated(page), page);
952960 if (!PageMovable(page)) {
953961 rc = MIGRATEPAGE_SUCCESS;
954
- __ClearPageIsolated(page);
962
+ ClearPageIsolated(page);
955963 goto out;
956964 }
957965
....@@ -973,23 +981,23 @@
973981 * We clear PG_movable under page_lock so any compactor
974982 * cannot try to migrate this page.
975983 */
976
- __ClearPageIsolated(page);
984
+ ClearPageIsolated(page);
977985 }
978986
979987 /*
980
- * Anonymous and movable page->mapping will be cleard by
988
+ * Anonymous and movable page->mapping will be cleared by
981989 * free_pages_prepare so don't reset it here for keeping
982990 * the type to work PageAnon, for example.
983991 */
984992 if (!PageMappingFlags(page))
985993 page->mapping = NULL;
986994
987
- if (unlikely(is_zone_device_page(newpage))) {
988
- if (is_device_public_page(newpage))
989
- flush_dcache_page(newpage);
990
- } else
991
- flush_dcache_page(newpage);
995
+ if (likely(!is_zone_device_page(newpage))) {
996
+ int i, nr = compound_nr(newpage);
992997
998
+ for (i = 0; i < nr; i++)
999
+ flush_dcache_page(newpage + i);
1000
+ }
9931001 }
9941002 out:
9951003 return rc;
....@@ -1013,7 +1021,7 @@
10131021 * to the LRU. Later, when the IO completes the pages are
10141022 * marked uptodate and unlocked. However, the queueing
10151023 * could be merging multiple pages for one bio (e.g.
1016
- * mpage_readpages). If an allocation happens for the
1024
+ * mpage_readahead). If an allocation happens for the
10171025 * second or third page, the process can end up locking
10181026 * the same page twice and deadlocking. Rather than
10191027 * trying to be clever about what pages can be locked,
....@@ -1101,8 +1109,7 @@
11011109 /* Establish migration ptes */
11021110 VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma,
11031111 page);
1104
- try_to_unmap(page,
1105
- TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
1112
+ try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK);
11061113 page_was_mapped = 1;
11071114 }
11081115
....@@ -1141,34 +1148,19 @@
11411148 }
11421149
11431150 /*
1144
- * gcc 4.7 and 4.8 on arm get an ICEs when inlining unmap_and_move(). Work
1145
- * around it.
1146
- */
1147
-#if defined(CONFIG_ARM) && \
1148
- defined(GCC_VERSION) && GCC_VERSION < 40900 && GCC_VERSION >= 40700
1149
-#define ICE_noinline noinline
1150
-#else
1151
-#define ICE_noinline
1152
-#endif
1153
-
1154
-/*
11551151 * Obtain the lock on page, remove all ptes and migrate the page
11561152 * to the newly allocated page in newpage.
11571153 */
1158
-static ICE_noinline int unmap_and_move(new_page_t get_new_page,
1154
+static int unmap_and_move(new_page_t get_new_page,
11591155 free_page_t put_new_page,
11601156 unsigned long private, struct page *page,
11611157 int force, enum migrate_mode mode,
11621158 enum migrate_reason reason)
11631159 {
11641160 int rc = MIGRATEPAGE_SUCCESS;
1165
- struct page *newpage;
1161
+ struct page *newpage = NULL;
11661162
11671163 if (!thp_migration_supported() && PageTransHuge(page))
1168
- return -ENOMEM;
1169
-
1170
- newpage = get_new_page(page, private);
1171
- if (!newpage)
11721164 return -ENOMEM;
11731165
11741166 if (page_count(page) == 1) {
....@@ -1178,15 +1170,15 @@
11781170 if (unlikely(__PageMovable(page))) {
11791171 lock_page(page);
11801172 if (!PageMovable(page))
1181
- __ClearPageIsolated(page);
1173
+ ClearPageIsolated(page);
11821174 unlock_page(page);
11831175 }
1184
- if (put_new_page)
1185
- put_new_page(newpage, private);
1186
- else
1187
- put_page(newpage);
11881176 goto out;
11891177 }
1178
+
1179
+ newpage = get_new_page(page, private);
1180
+ if (!newpage)
1181
+ return -ENOMEM;
11901182
11911183 rc = __unmap_and_move(page, newpage, force, mode);
11921184 if (rc == MIGRATEPAGE_SUCCESS)
....@@ -1197,8 +1189,7 @@
11971189 /*
11981190 * A page that has been migrated has all references
11991191 * removed and will be freed. A page that has not been
1200
- * migrated will have kepts its references and be
1201
- * restored.
1192
+ * migrated will have kept its references and be restored.
12021193 */
12031194 list_del(&page->lru);
12041195
....@@ -1209,7 +1200,7 @@
12091200 */
12101201 if (likely(!__PageMovable(page)))
12111202 mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
1212
- page_is_file_cache(page), -hpage_nr_pages(page));
1203
+ page_is_file_lru(page), -thp_nr_pages(page));
12131204 }
12141205
12151206 /*
....@@ -1218,16 +1209,11 @@
12181209 * we want to retry.
12191210 */
12201211 if (rc == MIGRATEPAGE_SUCCESS) {
1221
- put_page(page);
1222
- if (reason == MR_MEMORY_FAILURE) {
1212
+ if (reason != MR_MEMORY_FAILURE)
12231213 /*
1224
- * Set PG_HWPoison on just freed page
1225
- * intentionally. Although it's rather weird,
1226
- * it's how HWPoison flag works at the moment.
1214
+ * We release the page in page_handle_poison.
12271215 */
1228
- if (set_hwpoison_free_buddy_page(page))
1229
- num_poisoned_pages_inc();
1230
- }
1216
+ put_page(page);
12311217 } else {
12321218 if (rc != -EAGAIN) {
12331219 if (likely(!__PageMovable(page))) {
....@@ -1239,7 +1225,7 @@
12391225 if (PageMovable(page))
12401226 putback_movable_page(page);
12411227 else
1242
- __ClearPageIsolated(page);
1228
+ ClearPageIsolated(page);
12431229 unlock_page(page);
12441230 put_page(page);
12451231 }
....@@ -1280,9 +1266,10 @@
12801266 int page_was_mapped = 0;
12811267 struct page *new_hpage;
12821268 struct anon_vma *anon_vma = NULL;
1269
+ struct address_space *mapping = NULL;
12831270
12841271 /*
1285
- * Movability of hugepages depends on architectures and hugepage size.
1272
+ * Migratability of hugepages depends on architectures and their size.
12861273 * This check is necessary because some callers of hugepage migration
12871274 * like soft offline and memory hotremove don't walk through page
12881275 * tables or check whether the hugepage is pmd-based or not before
....@@ -1327,9 +1314,29 @@
13271314 goto put_anon;
13281315
13291316 if (page_mapped(hpage)) {
1330
- try_to_unmap(hpage,
1331
- TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
1317
+ bool mapping_locked = false;
1318
+ enum ttu_flags ttu = TTU_MIGRATION|TTU_IGNORE_MLOCK;
1319
+
1320
+ if (!PageAnon(hpage)) {
1321
+ /*
1322
+ * In shared mappings, try_to_unmap could potentially
1323
+ * call huge_pmd_unshare. Because of this, take
1324
+ * semaphore in write mode here and set TTU_RMAP_LOCKED
1325
+ * to let lower levels know we have taken the lock.
1326
+ */
1327
+ mapping = hugetlb_page_mapping_lock_write(hpage);
1328
+ if (unlikely(!mapping))
1329
+ goto unlock_put_anon;
1330
+
1331
+ mapping_locked = true;
1332
+ ttu |= TTU_RMAP_LOCKED;
1333
+ }
1334
+
1335
+ try_to_unmap(hpage, ttu);
13321336 page_was_mapped = 1;
1337
+
1338
+ if (mapping_locked)
1339
+ i_mmap_unlock_write(mapping);
13331340 }
13341341
13351342 if (!page_mapped(hpage))
....@@ -1339,6 +1346,7 @@
13391346 remove_migration_ptes(hpage,
13401347 rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage, false);
13411348
1349
+unlock_put_anon:
13421350 unlock_page(new_hpage);
13431351
13441352 put_anon:
....@@ -1395,22 +1403,37 @@
13951403 enum migrate_mode mode, int reason)
13961404 {
13971405 int retry = 1;
1406
+ int thp_retry = 1;
13981407 int nr_failed = 0;
13991408 int nr_succeeded = 0;
1409
+ int nr_thp_succeeded = 0;
1410
+ int nr_thp_failed = 0;
1411
+ int nr_thp_split = 0;
14001412 int pass = 0;
1413
+ bool is_thp = false;
14011414 struct page *page;
14021415 struct page *page2;
14031416 int swapwrite = current->flags & PF_SWAPWRITE;
1404
- int rc;
1417
+ int rc, nr_subpages;
1418
+
1419
+ trace_mm_migrate_pages_start(mode, reason);
14051420
14061421 if (!swapwrite)
14071422 current->flags |= PF_SWAPWRITE;
14081423
1409
- for(pass = 0; pass < 10 && retry; pass++) {
1424
+ for (pass = 0; pass < 10 && (retry || thp_retry); pass++) {
14101425 retry = 0;
1426
+ thp_retry = 0;
14111427
14121428 list_for_each_entry_safe(page, page2, from, lru) {
14131429 retry:
1430
+ /*
1431
+ * THP statistics is based on the source huge page.
1432
+ * Capture required information that might get lost
1433
+ * during migration.
1434
+ */
1435
+ is_thp = PageTransHuge(page) && !PageHuge(page);
1436
+ nr_subpages = thp_nr_pages(page);
14141437 cond_resched();
14151438
14161439 if (PageHuge(page))
....@@ -1435,21 +1458,35 @@
14351458 * we encounter them after the rest of the list
14361459 * is processed.
14371460 */
1438
- if (PageTransHuge(page) && !PageHuge(page)) {
1461
+ if (is_thp) {
14391462 lock_page(page);
14401463 rc = split_huge_page_to_list(page, from);
14411464 unlock_page(page);
14421465 if (!rc) {
14431466 list_safe_reset_next(page, page2, lru);
1467
+ nr_thp_split++;
14441468 goto retry;
14451469 }
1470
+
1471
+ nr_thp_failed++;
1472
+ nr_failed += nr_subpages;
1473
+ goto out;
14461474 }
14471475 nr_failed++;
14481476 goto out;
14491477 case -EAGAIN:
1478
+ if (is_thp) {
1479
+ thp_retry++;
1480
+ break;
1481
+ }
14501482 retry++;
14511483 break;
14521484 case MIGRATEPAGE_SUCCESS:
1485
+ if (is_thp) {
1486
+ nr_thp_succeeded++;
1487
+ nr_succeeded += nr_subpages;
1488
+ break;
1489
+ }
14531490 nr_succeeded++;
14541491 break;
14551492 default:
....@@ -1459,24 +1496,76 @@
14591496 * removed from migration page list and not
14601497 * retried in the next outer loop.
14611498 */
1499
+ if (is_thp) {
1500
+ nr_thp_failed++;
1501
+ nr_failed += nr_subpages;
1502
+ break;
1503
+ }
14621504 nr_failed++;
14631505 break;
14641506 }
14651507 }
14661508 }
1467
- nr_failed += retry;
1509
+ nr_failed += retry + thp_retry;
1510
+ nr_thp_failed += thp_retry;
14681511 rc = nr_failed;
14691512 out:
1470
- if (nr_succeeded)
1471
- count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
1472
- if (nr_failed)
1473
- count_vm_events(PGMIGRATE_FAIL, nr_failed);
1474
- trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason);
1513
+ count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
1514
+ count_vm_events(PGMIGRATE_FAIL, nr_failed);
1515
+ count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded);
1516
+ count_vm_events(THP_MIGRATION_FAIL, nr_thp_failed);
1517
+ count_vm_events(THP_MIGRATION_SPLIT, nr_thp_split);
1518
+ trace_mm_migrate_pages(nr_succeeded, nr_failed, nr_thp_succeeded,
1519
+ nr_thp_failed, nr_thp_split, mode, reason);
14751520
14761521 if (!swapwrite)
14771522 current->flags &= ~PF_SWAPWRITE;
14781523
14791524 return rc;
1525
+}
1526
+EXPORT_SYMBOL_GPL(migrate_pages);
1527
+
1528
+struct page *alloc_migration_target(struct page *page, unsigned long private)
1529
+{
1530
+ struct migration_target_control *mtc;
1531
+ gfp_t gfp_mask;
1532
+ unsigned int order = 0;
1533
+ struct page *new_page = NULL;
1534
+ int nid;
1535
+ int zidx;
1536
+
1537
+ mtc = (struct migration_target_control *)private;
1538
+ gfp_mask = mtc->gfp_mask;
1539
+ nid = mtc->nid;
1540
+ if (nid == NUMA_NO_NODE)
1541
+ nid = page_to_nid(page);
1542
+
1543
+ if (PageHuge(page)) {
1544
+ struct hstate *h = page_hstate(compound_head(page));
1545
+
1546
+ gfp_mask = htlb_modify_alloc_mask(h, gfp_mask);
1547
+ return alloc_huge_page_nodemask(h, nid, mtc->nmask, gfp_mask);
1548
+ }
1549
+
1550
+ if (PageTransHuge(page)) {
1551
+ /*
1552
+ * clear __GFP_RECLAIM to make the migration callback
1553
+ * consistent with regular THP allocations.
1554
+ */
1555
+ gfp_mask &= ~__GFP_RECLAIM;
1556
+ gfp_mask |= GFP_TRANSHUGE;
1557
+ order = HPAGE_PMD_ORDER;
1558
+ }
1559
+ zidx = zone_idx(page_zone(page));
1560
+ if (is_highmem_idx(zidx) || zidx == ZONE_MOVABLE)
1561
+ gfp_mask |= __GFP_HIGHMEM;
1562
+
1563
+ new_page = __alloc_pages_nodemask(gfp_mask, order, nid, mtc->nmask);
1564
+
1565
+ if (new_page && PageTransHuge(new_page))
1566
+ prep_transhuge_page(new_page);
1567
+
1568
+ return new_page;
14801569 }
14811570
14821571 #ifdef CONFIG_NUMA
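alloc_migration_target() becomes the common allocation callback for migrate_pages(): the opaque private argument now carries a pointer to a struct migration_target_control naming the target node, nodemask and GFP mask, and the helper internally picks the hugetlb, THP or order-0 allocator as appropriate. A sketch of a caller, modeled on the do_move_pages_to_node() change just below (the GFP flags shown are the ones that hunk uses):

    struct migration_target_control mtc = {
            .nid      = node,       /* NUMA_NO_NODE: allocate on the source page's node */
            .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
    };

    err = migrate_pages(&pagelist, alloc_migration_target, NULL,
                        (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
    if (err)
            putback_movable_pages(&pagelist);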
....@@ -1496,12 +1585,13 @@
14961585 struct list_head *pagelist, int node)
14971586 {
14981587 int err;
1588
+ struct migration_target_control mtc = {
1589
+ .nid = node,
1590
+ .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
1591
+ };
14991592
1500
- if (list_empty(pagelist))
1501
- return 0;
1502
-
1503
- err = migrate_pages(pagelist, alloc_new_node_page, NULL, node,
1504
- MIGRATE_SYNC, MR_SYSCALL);
1593
+ err = migrate_pages(pagelist, alloc_migration_target, NULL,
1594
+ (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
15051595 if (err)
15061596 putback_movable_pages(pagelist);
15071597 return err;
....@@ -1524,7 +1614,7 @@
15241614 unsigned int follflags;
15251615 int err;
15261616
1527
- down_read(&mm->mmap_sem);
1617
+ mmap_read_lock(mm);
15281618 err = -EFAULT;
15291619 vma = find_vma(mm, addr);
15301620 if (!vma || addr < vma->vm_start || !vma_migratable(vma))
....@@ -1552,8 +1642,9 @@
15521642
15531643 if (PageHuge(page)) {
15541644 if (PageHead(page)) {
1555
- isolate_huge_page(page, pagelist);
1556
- err = 1;
1645
+ err = isolate_hugetlb(page, pagelist);
1646
+ if (!err)
1647
+ err = 1;
15571648 }
15581649 } else {
15591650 struct page *head;
....@@ -1566,8 +1657,8 @@
15661657 err = 1;
15671658 list_add_tail(&head->lru, pagelist);
15681659 mod_node_page_state(page_pgdat(head),
1569
- NR_ISOLATED_ANON + page_is_file_cache(head),
1570
- hpage_nr_pages(head));
1660
+ NR_ISOLATED_ANON + page_is_file_lru(head),
1661
+ thp_nr_pages(head));
15711662 }
15721663 out_putpage:
15731664 /*
....@@ -1575,10 +1666,36 @@
15751666 * isolate_lru_page() or drop the page ref if it was
15761667 * not isolated.
15771668 */
1578
- put_page(page);
1669
+ put_user_page(page);
15791670 out:
1580
- up_read(&mm->mmap_sem);
1671
+ mmap_read_unlock(mm);
15811672 return err;
1673
+}
1674
+
1675
+static int move_pages_and_store_status(struct mm_struct *mm, int node,
1676
+ struct list_head *pagelist, int __user *status,
1677
+ int start, int i, unsigned long nr_pages)
1678
+{
1679
+ int err;
1680
+
1681
+ if (list_empty(pagelist))
1682
+ return 0;
1683
+
1684
+ err = do_move_pages_to_node(mm, pagelist, node);
1685
+ if (err) {
1686
+ /*
1687
+ * Positive err means the number of failed
1688
+ * pages to migrate. Since we are going to
1689
+ * abort and return the number of non-migrated
1690
+ * pages, so need to incude the rest of the
1691
+ * nr_pages that have not been attempted as
1692
+ * well.
1693
+ */
1694
+ if (err > 0)
1695
+ err += nr_pages - i - 1;
1696
+ return err;
1697
+ }
1698
+ return store_status(status, start, node, i - start);
15821699 }
15831700
15841701 /*
....@@ -1596,7 +1713,7 @@
15961713 int start, i;
15971714 int err = 0, err1;
15981715
1599
- migrate_prep();
1716
+ lru_cache_disable();
16001717
16011718 for (i = start = 0; i < nr_pages; i++) {
16021719 const void __user *p;
....@@ -1624,21 +1741,8 @@
16241741 current_node = node;
16251742 start = i;
16261743 } else if (node != current_node) {
1627
- err = do_move_pages_to_node(mm, &pagelist, current_node);
1628
- if (err) {
1629
- /*
1630
- * Positive err means the number of failed
1631
- * pages to migrate. Since we are going to
1632
- * abort and return the number of non-migrated
1633
- * pages, so need to incude the rest of the
1634
- * nr_pages that have not been attempted as
1635
- * well.
1636
- */
1637
- if (err > 0)
1638
- err += nr_pages - i - 1;
1639
- goto out;
1640
- }
1641
- err = store_status(status, start, current_node, i - start);
1744
+ err = move_pages_and_store_status(mm, current_node,
1745
+ &pagelist, status, start, i, nr_pages);
16421746 if (err)
16431747 goto out;
16441748 start = i;
....@@ -1652,52 +1756,33 @@
16521756 err = add_page_for_migration(mm, addr, current_node,
16531757 &pagelist, flags & MPOL_MF_MOVE_ALL);
16541758
1655
- if (!err) {
1656
- /* The page is already on the target node */
1657
- err = store_status(status, i, current_node, 1);
1658
- if (err)
1659
- goto out_flush;
1660
- continue;
1661
- } else if (err > 0) {
1759
+ if (err > 0) {
16621760 /* The page is successfully queued for migration */
16631761 continue;
16641762 }
16651763
1666
- err = store_status(status, i, err, 1);
1764
+ /*
1765
+ * If the page is already on the target node (!err), store the
1766
+ * node, otherwise, store the err.
1767
+ */
1768
+ err = store_status(status, i, err ? : current_node, 1);
16671769 if (err)
16681770 goto out_flush;
16691771
1670
- err = do_move_pages_to_node(mm, &pagelist, current_node);
1671
- if (err) {
1672
- if (err > 0)
1673
- err += nr_pages - i - 1;
1772
+ err = move_pages_and_store_status(mm, current_node, &pagelist,
1773
+ status, start, i, nr_pages);
1774
+ if (err)
16741775 goto out;
1675
- }
1676
- if (i > start) {
1677
- err = store_status(status, start, current_node, i - start);
1678
- if (err)
1679
- goto out;
1680
- }
16811776 current_node = NUMA_NO_NODE;
16821777 }
16831778 out_flush:
1684
- if (list_empty(&pagelist))
1685
- return err;
1686
-
16871779 /* Make sure we do not overwrite the existing error */
1688
- err1 = do_move_pages_to_node(mm, &pagelist, current_node);
1689
- /*
1690
- * Don't have to report non-attempted pages here since:
1691
- * - If the above loop is done gracefully all pages have been
1692
- * attempted.
1693
- * - If the above loop is aborted it means a fatal error
1694
- * happened, should return ret.
1695
- */
1696
- if (!err1)
1697
- err1 = store_status(status, start, current_node, i - start);
1780
+ err1 = move_pages_and_store_status(mm, current_node, &pagelist,
1781
+ status, start, i, nr_pages);
16981782 if (err >= 0)
16991783 err = err1;
17001784 out:
1785
+ lru_cache_enable();
17011786 return err;
17021787 }
17031788
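The status-reporting rework above (move_pages_and_store_status(), storing either the target node or the per-page errno through one path) is easiest to see from the userspace side of move_pages(2). A small self-contained example, assuming a NUMA machine where node 1 exists and libnuma's numaif.h is installed (build with -lnuma):

    #include <numaif.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
            long psz = sysconf(_SC_PAGESIZE);
            void *buf;
            void *pages[1];
            int nodes[1]  = { 1 };          /* requested destination node */
            int status[1] = { -1 };

            if (posix_memalign(&buf, psz, psz))
                    return 1;
            memset(buf, 0, psz);            /* fault the page in first */

            pages[0] = buf;
            if (move_pages(0 /* self */, 1, pages, nodes, status, MPOL_MF_MOVE) < 0) {
                    perror("move_pages");
                    return 1;
            }

            /* status[0] is the node the page ended up on, or a negative
             * errno (-ENOENT, -EBUSY, ...) for that particular page.    */
            printf("page is on node %d\n", status[0]);
            free(buf);
            return 0;
    }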
....@@ -1709,7 +1794,7 @@
17091794 {
17101795 unsigned long i;
17111796
1712
- down_read(&mm->mmap_sem);
1797
+ mmap_read_lock(mm);
17131798
17141799 for (i = 0; i < nr_pages; i++) {
17151800 unsigned long addr = (unsigned long)(*pages);
....@@ -1736,7 +1821,7 @@
17361821 status++;
17371822 }
17381823
1739
- up_read(&mm->mmap_sem);
1824
+ mmap_read_unlock(mm);
17401825 }
17411826
17421827 /*
....@@ -1773,6 +1858,53 @@
17731858 return nr_pages ? -EFAULT : 0;
17741859 }
17751860
1861
+static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes)
1862
+{
1863
+ struct task_struct *task;
1864
+ struct mm_struct *mm;
1865
+
1866
+ /*
1867
+ * There is no need to check if current process has the right to modify
1868
+ * the specified process when they are same.
1869
+ */
1870
+ if (!pid) {
1871
+ mmget(current->mm);
1872
+ *mem_nodes = cpuset_mems_allowed(current);
1873
+ return current->mm;
1874
+ }
1875
+
1876
+ /* Find the mm_struct */
1877
+ rcu_read_lock();
1878
+ task = find_task_by_vpid(pid);
1879
+ if (!task) {
1880
+ rcu_read_unlock();
1881
+ return ERR_PTR(-ESRCH);
1882
+ }
1883
+ get_task_struct(task);
1884
+
1885
+ /*
1886
+ * Check if this process has the right to modify the specified
1887
+ * process. Use the regular "ptrace_may_access()" checks.
1888
+ */
1889
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1890
+ rcu_read_unlock();
1891
+ mm = ERR_PTR(-EPERM);
1892
+ goto out;
1893
+ }
1894
+ rcu_read_unlock();
1895
+
1896
+ mm = ERR_PTR(security_task_movememory(task));
1897
+ if (IS_ERR(mm))
1898
+ goto out;
1899
+ *mem_nodes = cpuset_mems_allowed(task);
1900
+ mm = get_task_mm(task);
1901
+out:
1902
+ put_task_struct(task);
1903
+ if (!mm)
1904
+ mm = ERR_PTR(-EINVAL);
1905
+ return mm;
1906
+}
1907
+
17761908 /*
17771909 * Move a list of pages in the address space of the currently executing
17781910 * process.
....@@ -1782,7 +1914,6 @@
17821914 const int __user *nodes,
17831915 int __user *status, int flags)
17841916 {
1785
- struct task_struct *task;
17861917 struct mm_struct *mm;
17871918 int err;
17881919 nodemask_t task_nodes;
....@@ -1794,36 +1925,9 @@
17941925 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
17951926 return -EPERM;
17961927
1797
- /* Find the mm_struct */
1798
- rcu_read_lock();
1799
- task = pid ? find_task_by_vpid(pid) : current;
1800
- if (!task) {
1801
- rcu_read_unlock();
1802
- return -ESRCH;
1803
- }
1804
- get_task_struct(task);
1805
-
1806
- /*
1807
- * Check if this process has the right to modify the specified
1808
- * process. Use the regular "ptrace_may_access()" checks.
1809
- */
1810
- if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1811
- rcu_read_unlock();
1812
- err = -EPERM;
1813
- goto out;
1814
- }
1815
- rcu_read_unlock();
1816
-
1817
- err = security_task_movememory(task);
1818
- if (err)
1819
- goto out;
1820
-
1821
- task_nodes = cpuset_mems_allowed(task);
1822
- mm = get_task_mm(task);
1823
- put_task_struct(task);
1824
-
1825
- if (!mm)
1826
- return -EINVAL;
1928
+ mm = find_mm_struct(pid, &task_nodes);
1929
+ if (IS_ERR(mm))
1930
+ return PTR_ERR(mm);
18271931
18281932 if (nodes)
18291933 err = do_pages_move(mm, task_nodes, nr_pages, pages,
....@@ -1832,10 +1936,6 @@
18321936 err = do_pages_stat(mm, nr_pages, pages, status);
18331937
18341938 mmput(mm);
1835
- return err;
1836
-
1837
-out:
1838
- put_task_struct(task);
18391939 return err;
18401940 }
18411941
....@@ -1889,7 +1989,7 @@
18891989 if (!zone_watermark_ok(zone, 0,
18901990 high_wmark_pages(zone) +
18911991 nr_migrate_pages,
1892
- 0, 0))
1992
+ ZONE_MOVABLE, 0))
18931993 continue;
18941994 return true;
18951995 }
....@@ -1918,7 +2018,7 @@
19182018 VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);
19192019
19202020 /* Avoid migrating to a node that is nearly full */
1921
- if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page)))
2021
+ if (!migrate_balanced_pgdat(pgdat, compound_nr(page)))
19222022 return 0;
19232023
19242024 if (isolate_lru_page(page))
....@@ -1936,9 +2036,9 @@
19362036 return 0;
19372037 }
19382038
1939
- page_lru = page_is_file_cache(page);
2039
+ page_lru = page_is_file_lru(page);
19402040 mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru,
1941
- hpage_nr_pages(page));
2041
+ thp_nr_pages(page));
19422042
19432043 /*
19442044 * Isolating the page has taken another reference, so the
....@@ -1960,7 +2060,7 @@
19602060 * node. Caller is expected to have an elevated reference count on
19612061 * the page that will be dropped by this function before returning.
19622062 */
1963
-int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
2063
+int migrate_misplaced_page(struct page *page, struct vm_fault *vmf,
19642064 int node)
19652065 {
19662066 pg_data_t *pgdat = NODE_DATA(node);
....@@ -1972,15 +2072,15 @@
19722072 * Don't migrate file pages that are mapped in multiple processes
19732073 * with execute permissions as they are probably shared libraries.
19742074 */
1975
- if (page_mapcount(page) != 1 && page_is_file_cache(page) &&
1976
- (vma->vm_flags & VM_EXEC))
2075
+ if (page_mapcount(page) != 1 && page_is_file_lru(page) &&
2076
+ (vmf->vma_flags & VM_EXEC))
19772077 goto out;
19782078
19792079 /*
19802080 * Also do not migrate dirty pages as not all filesystems can move
19812081 * dirty pages in MIGRATE_ASYNC mode which is a waste of cycles.
19822082 */
1983
- if (page_is_file_cache(page) && PageDirty(page))
2083
+ if (page_is_file_lru(page) && PageDirty(page))
19842084 goto out;
19852085
19862086 isolated = numamigrate_isolate_page(pgdat, page);
....@@ -1995,7 +2095,7 @@
19952095 if (!list_empty(&migratepages)) {
19962096 list_del(&page->lru);
19972097 dec_node_page_state(page, NR_ISOLATED_ANON +
1998
- page_is_file_cache(page));
2098
+ page_is_file_lru(page));
19992099 putback_lru_page(page);
20002100 }
20012101 isolated = 0;
....@@ -2025,9 +2125,8 @@
20252125 pg_data_t *pgdat = NODE_DATA(node);
20262126 int isolated = 0;
20272127 struct page *new_page = NULL;
2028
- int page_lru = page_is_file_cache(page);
2029
- unsigned long mmun_start = address & HPAGE_PMD_MASK;
2030
- unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE;
2128
+ int page_lru = page_is_file_lru(page);
2129
+ unsigned long start = address & HPAGE_PMD_MASK;
20312130
20322131 new_page = alloc_pages_node(node,
20332132 (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE),
....@@ -2050,15 +2149,15 @@
20502149 /* anon mapping, we can simply copy page->mapping to the new page: */
20512150 new_page->mapping = page->mapping;
20522151 new_page->index = page->index;
2152
+ /* flush the cache before copying using the kernel virtual address */
2153
+ flush_cache_range(vma, start, start + HPAGE_PMD_SIZE);
20532154 migrate_page_copy(new_page, page);
20542155 WARN_ON(PageLRU(new_page));
20552156
20562157 /* Recheck the target PMD */
2057
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
20582158 ptl = pmd_lock(mm, pmd);
20592159 if (unlikely(!pmd_same(*pmd, entry) || !page_ref_freeze(page, 2))) {
20602160 spin_unlock(ptl);
2061
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
20622161
20632162 /* Reverse changes made by migrate_page_copy() */
20642163 if (TestClearPageActive(new_page))
....@@ -2089,8 +2188,7 @@
20892188 * new page and page_add_new_anon_rmap guarantee the copy is
20902189 * visible before the pagetable update.
20912190 */
2092
- flush_cache_range(vma, mmun_start, mmun_end);
2093
- page_add_anon_rmap(new_page, vma, mmun_start, true);
2191
+ page_add_anon_rmap(new_page, vma, start, true);
20942192 /*
20952193 * At this point the pmd is numa/protnone (i.e. non present) and the TLB
20962194 * has already been flushed globally. So no TLB can be currently
....@@ -2098,11 +2196,11 @@
20982196 * pmd before doing set_pmd_at(), nor to flush the TLB after
20992197 * set_pmd_at(). Clearing the pmd here would introduce a race
21002198 * condition against MADV_DONTNEED, because MADV_DONTNEED only holds the
2101
- * mmap_sem for reading. If the pmd is set to NULL at any given time,
2199
+ * mmap_lock for reading. If the pmd is set to NULL at any given time,
21022200 * MADV_DONTNEED won't wait on the pmd lock and it'll skip clearing this
21032201 * pmd.
21042202 */
2105
- set_pmd_at(mm, mmun_start, pmd, entry);
2203
+ set_pmd_at(mm, start, pmd, entry);
21062204 update_mmu_cache_pmd(vma, address, &entry);
21072205
21082206 page_ref_unfreeze(page, 2);
....@@ -2111,11 +2209,6 @@
21112209 set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED);
21122210
21132211 spin_unlock(ptl);
2114
- /*
2115
- * No need to double call mmu_notifier->invalidate_range() callback as
2116
- * the above pmdp_huge_clear_flush_notify() did already call it.
2117
- */
2118
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
21192212
21202213 /* Take an "isolate" reference and put new page on the LRU. */
21212214 get_page(new_page);
....@@ -2139,7 +2232,7 @@
21392232 ptl = pmd_lock(mm, pmd);
21402233 if (pmd_same(*pmd, entry)) {
21412234 entry = pmd_modify(entry, vma->vm_page_prot);
2142
- set_pmd_at(mm, mmun_start, pmd, entry);
2235
+ set_pmd_at(mm, start, pmd, entry);
21432236 update_mmu_cache_pmd(vma, address, &entry);
21442237 }
21452238 spin_unlock(ptl);
....@@ -2153,25 +2246,26 @@
21532246
21542247 #endif /* CONFIG_NUMA */
21552248
2156
-#if defined(CONFIG_MIGRATE_VMA_HELPER)
2157
-struct migrate_vma {
2158
- struct vm_area_struct *vma;
2159
- unsigned long *dst;
2160
- unsigned long *src;
2161
- unsigned long cpages;
2162
- unsigned long npages;
2163
- unsigned long start;
2164
- unsigned long end;
2165
-};
2166
-
2249
+#ifdef CONFIG_DEVICE_PRIVATE
21672250 static int migrate_vma_collect_hole(unsigned long start,
21682251 unsigned long end,
2252
+ __always_unused int depth,
21692253 struct mm_walk *walk)
21702254 {
21712255 struct migrate_vma *migrate = walk->private;
21722256 unsigned long addr;
21732257
2174
- for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
2258
+ /* Only allow populating anonymous memory. */
2259
+ if (!vma_is_anonymous(walk->vma)) {
2260
+ for (addr = start; addr < end; addr += PAGE_SIZE) {
2261
+ migrate->src[migrate->npages] = 0;
2262
+ migrate->dst[migrate->npages] = 0;
2263
+ migrate->npages++;
2264
+ }
2265
+ return 0;
2266
+ }
2267
+
2268
+ for (addr = start; addr < end; addr += PAGE_SIZE) {
21752269 migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
21762270 migrate->dst[migrate->npages] = 0;
21772271 migrate->npages++;
....@@ -2188,7 +2282,7 @@
21882282 struct migrate_vma *migrate = walk->private;
21892283 unsigned long addr;
21902284
2191
- for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
2285
+ for (addr = start; addr < end; addr += PAGE_SIZE) {
21922286 migrate->dst[migrate->npages] = 0;
21932287 migrate->src[migrate->npages++] = 0;
21942288 }
....@@ -2210,7 +2304,7 @@
22102304
22112305 again:
22122306 if (pmd_none(*pmdp))
2213
- return migrate_vma_collect_hole(start, end, walk);
2307
+ return migrate_vma_collect_hole(start, end, -1, walk);
22142308
22152309 if (pmd_trans_huge(*pmdp)) {
22162310 struct page *page;
....@@ -2243,7 +2337,7 @@
22432337 return migrate_vma_collect_skip(start, end,
22442338 walk);
22452339 if (pmd_none(*pmdp))
2246
- return migrate_vma_collect_hole(start, end,
2340
+ return migrate_vma_collect_hole(start, end, -1,
22472341 walk);
22482342 }
22492343 }
....@@ -2255,24 +2349,22 @@
22552349 arch_enter_lazy_mmu_mode();
22562350
22572351 for (; addr < end; addr += PAGE_SIZE, ptep++) {
2258
- unsigned long mpfn, pfn;
2352
+ unsigned long mpfn = 0, pfn;
22592353 struct page *page;
22602354 swp_entry_t entry;
22612355 pte_t pte;
22622356
22632357 pte = *ptep;
2264
- pfn = pte_pfn(pte);
22652358
22662359 if (pte_none(pte)) {
2267
- mpfn = MIGRATE_PFN_MIGRATE;
2268
- migrate->cpages++;
2269
- pfn = 0;
2360
+ if (vma_is_anonymous(vma)) {
2361
+ mpfn = MIGRATE_PFN_MIGRATE;
2362
+ migrate->cpages++;
2363
+ }
22702364 goto next;
22712365 }
22722366
22732367 if (!pte_present(pte)) {
2274
- mpfn = pfn = 0;
2275
-
22762368 /*
22772369 * Only care about unaddressable device page special
22782370 * page table entry. Other special swap entries are not
....@@ -2283,28 +2375,34 @@
22832375 goto next;
22842376
22852377 page = device_private_entry_to_page(entry);
2286
- mpfn = migrate_pfn(page_to_pfn(page))|
2287
- MIGRATE_PFN_DEVICE | MIGRATE_PFN_MIGRATE;
2378
+ if (!(migrate->flags &
2379
+ MIGRATE_VMA_SELECT_DEVICE_PRIVATE) ||
2380
+ page->pgmap->owner != migrate->pgmap_owner)
2381
+ goto next;
2382
+
2383
+ mpfn = migrate_pfn(page_to_pfn(page)) |
2384
+ MIGRATE_PFN_MIGRATE;
22882385 if (is_write_device_private_entry(entry))
22892386 mpfn |= MIGRATE_PFN_WRITE;
22902387 } else {
2388
+ if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
2389
+ goto next;
2390
+ pfn = pte_pfn(pte);
22912391 if (is_zero_pfn(pfn)) {
22922392 mpfn = MIGRATE_PFN_MIGRATE;
22932393 migrate->cpages++;
2294
- pfn = 0;
22952394 goto next;
22962395 }
2297
- page = _vm_normal_page(migrate->vma, addr, pte, true);
2396
+ page = vm_normal_page(migrate->vma, addr, pte);
22982397 mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
22992398 mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
23002399 }
23012400
23022401 /* FIXME support THP */
23032402 if (!page || !page->mapping || PageTransCompound(page)) {
2304
- mpfn = pfn = 0;
2403
+ mpfn = 0;
23052404 goto next;
23062405 }
2307
- pfn = page_to_pfn(page);
23082406
23092407 /*
23102408 * By getting a reference on the page we pin it and that blocks
....@@ -2333,8 +2431,17 @@
23332431 entry = make_migration_entry(page, mpfn &
23342432 MIGRATE_PFN_WRITE);
23352433 swp_pte = swp_entry_to_pte(entry);
2336
- if (pte_soft_dirty(pte))
2337
- swp_pte = pte_swp_mksoft_dirty(swp_pte);
2434
+ if (pte_present(pte)) {
2435
+ if (pte_soft_dirty(pte))
2436
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
2437
+ if (pte_uffd_wp(pte))
2438
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
2439
+ } else {
2440
+ if (pte_swp_soft_dirty(pte))
2441
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
2442
+ if (pte_swp_uffd_wp(pte))
2443
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
2444
+ }
23382445 set_pte_at(mm, addr, ptep, swp_pte);
23392446
23402447 /*
....@@ -2353,15 +2460,21 @@
23532460 migrate->dst[migrate->npages] = 0;
23542461 migrate->src[migrate->npages++] = mpfn;
23552462 }
2356
- arch_leave_lazy_mmu_mode();
2357
- pte_unmap_unlock(ptep - 1, ptl);
23582463
23592464 /* Only flush the TLB if we actually modified any entries */
23602465 if (unmapped)
23612466 flush_tlb_range(walk->vma, start, end);
23622467
2468
+ arch_leave_lazy_mmu_mode();
2469
+ pte_unmap_unlock(ptep - 1, ptl);
2470
+
23632471 return 0;
23642472 }
2473
+
2474
+static const struct mm_walk_ops migrate_vma_walk_ops = {
2475
+ .pmd_entry = migrate_vma_collect_pmd,
2476
+ .pte_hole = migrate_vma_collect_hole,
2477
+};
23652478
23662479 /*
23672480 * migrate_vma_collect() - collect pages over a range of virtual addresses
....@@ -2373,22 +2486,22 @@
23732486 */
23742487 static void migrate_vma_collect(struct migrate_vma *migrate)
23752488 {
2376
- struct mm_walk mm_walk = {
2377
- .pmd_entry = migrate_vma_collect_pmd,
2378
- .pte_hole = migrate_vma_collect_hole,
2379
- .vma = migrate->vma,
2380
- .mm = migrate->vma->vm_mm,
2381
- .private = migrate,
2382
- };
2489
+ struct mmu_notifier_range range;
23832490
2384
- mmu_notifier_invalidate_range_start(mm_walk.mm,
2385
- migrate->start,
2386
- migrate->end);
2387
- walk_page_range(migrate->start, migrate->end, &mm_walk);
2388
- mmu_notifier_invalidate_range_end(mm_walk.mm,
2389
- migrate->start,
2390
- migrate->end);
2491
+ /*
2492
+ * Note that the pgmap_owner is passed to the mmu notifier callback so
2493
+ * that the registered device driver can skip invalidating device
2494
+ * private page mappings that won't be migrated.
2495
+ */
2496
+ mmu_notifier_range_init_migrate(&range, 0, migrate->vma,
2497
+ migrate->vma->vm_mm, migrate->start, migrate->end,
2498
+ migrate->pgmap_owner);
2499
+ mmu_notifier_invalidate_range_start(&range);
23912500
2501
+ walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end,
2502
+ &migrate_vma_walk_ops, migrate);
2503
+
2504
+ mmu_notifier_invalidate_range_end(&range);
23922505 migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
23932506 }
23942507
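This hunk is what the new #include <linux/pagewalk.h> at the top of the file is for: the page-table walk moves from a per-call struct mm_walk (which embedded .mm, .vma and .private) to a const struct mm_walk_ops table plus explicit arguments, the .pte_hole callback grows a depth parameter, and the mmu_notifier start/end calls now use an mmu_notifier_range carrying the pgmap owner so device drivers can skip invalidations for pages that will not migrate. The new calling convention, reduced to a sketch:

    static const struct mm_walk_ops migrate_vma_walk_ops = {
            .pmd_entry = migrate_vma_collect_pmd,
            .pte_hole  = migrate_vma_collect_hole,
    };

    /* mm, address range, ops table, then a private cookie for the callbacks */
    walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end,
                    &migrate_vma_walk_ops, migrate);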
....@@ -2432,16 +2545,7 @@
24322545 * FIXME proper solution is to rework migration_entry_wait() so
24332546 * it does not need to take a reference on page.
24342547 */
2435
- if (is_device_private_page(page))
2436
- return true;
2437
-
2438
- /*
2439
- * Only allow device public page to be migrated and account for
2440
- * the extra reference count imply by ZONE_DEVICE pages.
2441
- */
2442
- if (!is_device_public_page(page))
2443
- return false;
2444
- extra++;
2548
+ return is_device_private_page(page);
24452549 }
24462550
24472551 /* For file back page */
....@@ -2575,7 +2679,7 @@
25752679 */
25762680 static void migrate_vma_unmap(struct migrate_vma *migrate)
25772681 {
2578
- int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
2682
+ int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK;
25792683 const unsigned long npages = migrate->npages;
25802684 const unsigned long start = migrate->start;
25812685 unsigned long addr, i, restore = 0;
....@@ -2620,6 +2724,118 @@
26202724 }
26212725 }
26222726
2727
+/**
2728
+ * migrate_vma_setup() - prepare to migrate a range of memory
2729
+ * @args: contains the vma, start, and pfns arrays for the migration
2730
+ *
2731
+ * Returns: negative errno on failures, 0 when 0 or more pages were migrated
2732
+ * without an error.
2733
+ *
2734
+ * Prepare to migrate a range of memory virtual address range by collecting all
2735
+ * the pages backing each virtual address in the range, saving them inside the
2736
+ * src array. Then lock those pages and unmap them. Once the pages are locked
2737
+ * and unmapped, check whether each page is pinned or not. Pages that aren't
2738
+ * pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) in the
2739
+ * corresponding src array entry. Then restores any pages that are pinned, by
2740
+ * remapping and unlocking those pages.
2741
+ *
2742
+ * The caller should then allocate destination memory and copy source memory to
2743
+ * it for all those entries (ie with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE
2744
+ * flag set). Once these are allocated and copied, the caller must update each
2745
+ * corresponding entry in the dst array with the pfn value of the destination
2746
+ * page and with the MIGRATE_PFN_VALID and MIGRATE_PFN_LOCKED flags set
2747
+ * (destination pages must have their struct pages locked, via lock_page()).
2748
+ *
2749
+ * Note that the caller does not have to migrate all the pages that are marked
2750
+ * with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration from
2751
+ * device memory to system memory. If the caller cannot migrate a device page
2752
+ * back to system memory, then it must return VM_FAULT_SIGBUS, which has severe
2753
+ * consequences for the userspace process, so it must be avoided if at all
2754
+ * possible.
2755
+ *
2756
+ * For empty entries inside the CPU page table (pte_none() or pmd_none() is
2757
+ * true) we do set the MIGRATE_PFN_MIGRATE flag inside the corresponding source
2758
+ * array, thus allowing the caller to allocate device memory for those unbacked
2759
+ * virtual addresses. For this the caller simply has to allocate device memory
2760
+ * and properly set the destination entry like for regular migration. Note that
2761
+ * this can still fail, and thus the device driver must check whether the
2762
+ * migration was successful for those entries after calling migrate_vma_pages(),
2763
+ * just like for regular migration.
2764
+ *
2765
+ * After that, the caller must call migrate_vma_pages() to go over each entry
2766
+ * in the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE
2767
+ * flags set. If the corresponding entry in the dst array has the
2768
+ * MIGRATE_PFN_VALID flag set, then migrate_vma_pages() migrates the struct
2769
+ * page information from the source struct page to the destination struct page.
2770
+ * If it fails to migrate the struct page information, then it clears the
2771
+ * MIGRATE_PFN_MIGRATE flag in the src array.
2772
+ *
2773
+ * At this point all successfully migrated pages have an entry in the src
2774
+ * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst
2775
+ * array entry with MIGRATE_PFN_VALID flag set.
2776
+ *
2777
+ * Once migrate_vma_pages() returns the caller may inspect which pages were
2778
+ * successfully migrated, and which were not. Successfully migrated pages will
2779
+ * have the MIGRATE_PFN_MIGRATE flag set for their src array entry.
2780
+ *
2781
+ * It is safe to update the device page table after migrate_vma_pages() because
2782
+ * both the destination and source pages are still locked, and the mmap_lock is
2783
+ * held in read mode (hence no one can unmap the range being migrated).
2784
+ *
2785
+ * Once the caller is done cleaning up and updating its page table (if it
2786
+ * chose to do so; this is not an obligation), it finally calls
2787
+ * migrate_vma_finalize() to update the CPU page table to point to new pages
2788
+ * for successfully migrated pages or otherwise restore the CPU page table to
2789
+ * point to the original source pages.
2790
+ */
2791
+int migrate_vma_setup(struct migrate_vma *args)
2792
+{
2793
+ long nr_pages = (args->end - args->start) >> PAGE_SHIFT;
2794
+
2795
+ args->start &= PAGE_MASK;
2796
+ args->end &= PAGE_MASK;
2797
+ if (!args->vma || is_vm_hugetlb_page(args->vma) ||
2798
+ (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma))
2799
+ return -EINVAL;
2800
+ if (nr_pages <= 0)
2801
+ return -EINVAL;
2802
+ if (args->start < args->vma->vm_start ||
2803
+ args->start >= args->vma->vm_end)
2804
+ return -EINVAL;
2805
+ if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end)
2806
+ return -EINVAL;
2807
+ if (!args->src || !args->dst)
2808
+ return -EINVAL;
2809
+
2810
+ memset(args->src, 0, sizeof(*args->src) * nr_pages);
2811
+ args->cpages = 0;
2812
+ args->npages = 0;
2813
+
2814
+ migrate_vma_collect(args);
2815
+
2816
+ if (args->cpages)
2817
+ migrate_vma_prepare(args);
2818
+ if (args->cpages)
2819
+ migrate_vma_unmap(args);
2820
+
2821
+ /*
2822
+ * At this point pages are locked and unmapped, and thus they have
2823
+ * stable content and can safely be copied to destination memory that
2824
+ * is allocated by the drivers.
2825
+ */
2826
+ return 0;
2827
+
2828
+}
2829
+EXPORT_SYMBOL(migrate_vma_setup);
2830
+
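The documentation above describes the three-step contract (setup, pages, finalize) from the driver's side. The fragment below is a rough sketch of that sequence under several assumptions: the caller already holds mmap_read_lock(), dst entries are encoded with the usual migrate_pfn()/migrate_pfn_to_page() helpers from <linux/migrate.h>, and all dummy_* names are hypothetical driver code. Depending on the kernel version, struct migrate_vma may carry additional fields (for example a source-selection flags field) that a real driver must also initialize.

/*
 * Sketch only: a driver migrating [start, end) of a VMA into its device
 * memory with migrate_vma_setup()/migrate_vma_pages()/migrate_vma_finalize().
 * The dummy_* helpers stand in for driver-specific allocation and DMA copy.
 */
#include <linux/migrate.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/slab.h>

struct dummy_device;	/* hypothetical driver state */
static struct page *dummy_alloc_device_page(struct dummy_device *ddev);
static void dummy_copy_to_device(struct dummy_device *ddev,
				 struct page *dpage, struct page *spage);
static void dummy_clear_device_page(struct dummy_device *ddev,
				    struct page *dpage);

static int dummy_migrate_range_to_device(struct dummy_device *ddev,
					 struct vm_area_struct *vma,
					 unsigned long start,
					 unsigned long end)
{
	unsigned long npages = (end - start) >> PAGE_SHIFT;
	struct migrate_vma args = {};
	unsigned long *src, *dst;
	unsigned long i;
	int ret = -ENOMEM;

	/* One src and one dst entry per page in the range. */
	src = kcalloc(npages, sizeof(*src), GFP_KERNEL);
	dst = kcalloc(npages, sizeof(*dst), GFP_KERNEL);
	if (!src || !dst)
		goto out_free;

	args.vma = vma;
	args.start = start;
	args.end = end;
	args.src = src;
	args.dst = dst;
	args.pgmap_owner = ddev;	/* lets the driver's MMU notifier skip this invalidation */

	/* Collect, lock and unmap the source pages (mmap_read_lock() held by caller). */
	ret = migrate_vma_setup(&args);
	if (ret)
		goto out_free;

	for (i = 0; i < args.npages; i++) {
		struct page *spage = migrate_pfn_to_page(src[i]);
		struct page *dpage;

		if (!(src[i] & MIGRATE_PFN_MIGRATE))
			continue;

		dpage = dummy_alloc_device_page(ddev);
		if (!dpage)
			continue;	/* this entry simply is not migrated */
		lock_page(dpage);	/* dst pages must be locked, per the doc above */

		/* spage is NULL for pte_none()/zero-page entries; clear the destination instead. */
		if (spage)
			dummy_copy_to_device(ddev, dpage, spage);
		else
			dummy_clear_device_page(ddev, dpage);

		dst[i] = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
	}

	migrate_vma_pages(&args);
	/* A real driver would update its device page table here, checking src[i]. */
	migrate_vma_finalize(&args);
	ret = 0;

out_free:
	kfree(dst);
	kfree(src);
	return ret;
}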
2831
+/*
2832
+ * This code closely matches the code in:
2833
+ * __handle_mm_fault()
2834
+ * handle_pte_fault()
2835
+ * do_anonymous_page()
2836
+ * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE
2837
+ * private page.
2838
+ */
26232839 static void migrate_vma_insert_page(struct migrate_vma *migrate,
26242840 unsigned long addr,
26252841 struct page *page,
....@@ -2628,7 +2844,6 @@
26282844 {
26292845 struct vm_area_struct *vma = migrate->vma;
26302846 struct mm_struct *mm = vma->vm_mm;
2631
- struct mem_cgroup *memcg;
26322847 bool flush = false;
26332848 spinlock_t *ptl;
26342849 pte_t entry;
....@@ -2661,12 +2876,12 @@
26612876 * pte_offset_map() on pmds where a huge pmd might be created
26622877 * from a different thread.
26632878 *
2664
- * pte_alloc_map() is safe to use under down_write(mmap_sem) or when
2879
+ * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
26652880 * parallel threads are excluded by other means.
26662881 *
2667
- * Here we only have down_read(mmap_sem).
2882
+ * Here we only have mmap_read_lock(mm).
26682883 */
2669
- if (pte_alloc(mm, pmdp, addr))
2884
+ if (pte_alloc(mm, pmdp))
26702885 goto abort;
26712886
26722887 /* See the comment in pte_alloc_one_map() */
....@@ -2675,7 +2890,7 @@
26752890
26762891 if (unlikely(anon_vma_prepare(vma)))
26772892 goto abort;
2678
- if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false))
2893
+ if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
26792894 goto abort;
26802895
26812896 /*
....@@ -2691,11 +2906,13 @@
26912906
26922907 swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE);
26932908 entry = swp_entry_to_pte(swp_entry);
2694
- } else if (is_device_public_page(page)) {
2695
- entry = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
2696
- if (vma->vm_flags & VM_WRITE)
2697
- entry = pte_mkwrite(pte_mkdirty(entry));
2698
- entry = pte_mkdevmap(entry);
2909
+ } else {
2910
+ /*
2911
+ * For now we only support migrating to un-addressable
2912
+ * device memory.
2913
+ */
2914
+ pr_warn_once("Unsupported ZONE_DEVICE page type.\n");
2915
+ goto abort;
26992916 }
27002917 } else {
27012918 entry = mk_pte(page, vma->vm_page_prot);
....@@ -2705,36 +2922,29 @@
27052922
27062923 ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
27072924
2925
+ if (check_stable_address_space(mm))
2926
+ goto unlock_abort;
2927
+
27082928 if (pte_present(*ptep)) {
27092929 unsigned long pfn = pte_pfn(*ptep);
27102930
2711
- if (!is_zero_pfn(pfn)) {
2712
- pte_unmap_unlock(ptep, ptl);
2713
- mem_cgroup_cancel_charge(page, memcg, false);
2714
- goto abort;
2715
- }
2931
+ if (!is_zero_pfn(pfn))
2932
+ goto unlock_abort;
27162933 flush = true;
2717
- } else if (!pte_none(*ptep)) {
2718
- pte_unmap_unlock(ptep, ptl);
2719
- mem_cgroup_cancel_charge(page, memcg, false);
2720
- goto abort;
2721
- }
2934
+ } else if (!pte_none(*ptep))
2935
+ goto unlock_abort;
27222936
27232937 /*
2724
- * Check for usefaultfd but do not deliver the fault. Instead,
2938
+ * Check for userfaultfd but do not deliver the fault. Instead,
27252939 * just back off.
27262940 */
2727
- if (userfaultfd_missing(vma)) {
2728
- pte_unmap_unlock(ptep, ptl);
2729
- mem_cgroup_cancel_charge(page, memcg, false);
2730
- goto abort;
2731
- }
2941
+ if (userfaultfd_missing(vma))
2942
+ goto unlock_abort;
27322943
27332944 inc_mm_counter(mm, MM_ANONPAGES);
27342945 page_add_new_anon_rmap(page, vma, addr, false);
2735
- mem_cgroup_commit_charge(page, memcg, false, false);
27362946 if (!is_zone_device_page(page))
2737
- lru_cache_add_active_or_unevictable(page, vma);
2947
+ lru_cache_add_inactive_or_unevictable(page, vma);
27382948 get_page(page);
27392949
27402950 if (flush) {
....@@ -2752,11 +2962,13 @@
27522962 *src = MIGRATE_PFN_MIGRATE;
27532963 return;
27542964
2965
+unlock_abort:
2966
+ pte_unmap_unlock(ptep, ptl);
27552967 abort:
27562968 *src &= ~MIGRATE_PFN_MIGRATE;
27572969 }
27582970
2759
-/*
2971
+/**
27602972 * migrate_vma_pages() - migrate meta-data from src page to dst page
27612973 * @migrate: migrate struct containing all migration information
27622974 *
....@@ -2764,13 +2976,12 @@
27642976 * struct page. This effectively finishes the migration from source page to the
27652977 * destination page.
27662978 */
2767
-static void migrate_vma_pages(struct migrate_vma *migrate)
2979
+void migrate_vma_pages(struct migrate_vma *migrate)
27682980 {
27692981 const unsigned long npages = migrate->npages;
27702982 const unsigned long start = migrate->start;
2771
- struct vm_area_struct *vma = migrate->vma;
2772
- struct mm_struct *mm = vma->vm_mm;
2773
- unsigned long addr, i, mmu_start;
2983
+ struct mmu_notifier_range range;
2984
+ unsigned long addr, i;
27742985 bool notified = false;
27752986
27762987 for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) {
....@@ -2785,15 +2996,17 @@
27852996 }
27862997
27872998 if (!page) {
2788
- if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) {
2999
+ if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE))
27893000 continue;
2790
- }
27913001 if (!notified) {
2792
- mmu_start = addr;
27933002 notified = true;
2794
- mmu_notifier_invalidate_range_start(mm,
2795
- mmu_start,
2796
- migrate->end);
3003
+
3004
+ mmu_notifier_range_init(&range,
3005
+ MMU_NOTIFY_CLEAR, 0,
3006
+ NULL,
3007
+ migrate->vma->vm_mm,
3008
+ addr, migrate->end);
3009
+ mmu_notifier_invalidate_range_start(&range);
27973010 }
27983011 migrate_vma_insert_page(migrate, addr, newpage,
27993012 &migrate->src[i],
....@@ -2813,7 +3026,7 @@
28133026 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
28143027 continue;
28153028 }
2816
- } else if (!is_device_public_page(newpage)) {
3029
+ } else {
28173030 /*
28183031 * Other types of ZONE_DEVICE page are not
28193032 * supported.
....@@ -2834,11 +3047,11 @@
28343047 * did already call it.
28353048 */
28363049 if (notified)
2837
- mmu_notifier_invalidate_range_only_end(mm, mmu_start,
2838
- migrate->end);
3050
+ mmu_notifier_invalidate_range_only_end(&range);
28393051 }
3052
+EXPORT_SYMBOL(migrate_vma_pages);
28403053
2841
-/*
3054
+/**
28423055 * migrate_vma_finalize() - restore CPU page table entry
28433056 * @migrate: migrate struct containing all migration information
28443057 *
....@@ -2849,7 +3062,7 @@
28493062 * This also unlocks the pages and puts them back on the lru, or drops the extra
28503063 * refcount, for device pages.
28513064 */
2852
-static void migrate_vma_finalize(struct migrate_vma *migrate)
3065
+void migrate_vma_finalize(struct migrate_vma *migrate)
28533066 {
28543067 const unsigned long npages = migrate->npages;
28553068 unsigned long i;
....@@ -2876,7 +3089,6 @@
28763089
28773090 remove_migration_ptes(page, newpage, false);
28783091 unlock_page(page);
2879
- migrate->cpages--;
28803092
28813093 if (is_zone_device_page(page))
28823094 put_page(page);
....@@ -2892,124 +3104,5 @@
28923104 }
28933105 }
28943106 }
2895
-
2896
-/*
2897
- * migrate_vma() - migrate a range of memory inside vma
2898
- *
2899
- * @ops: migration callback for allocating destination memory and copying
2900
- * @vma: virtual memory area containing the range to be migrated
2901
- * @start: start address of the range to migrate (inclusive)
2902
- * @end: end address of the range to migrate (exclusive)
2903
- * @src: array of hmm_pfn_t containing source pfns
2904
- * @dst: array of hmm_pfn_t containing destination pfns
2905
- * @private: pointer passed back to each of the callback
2906
- * Returns: 0 on success, error code otherwise
2907
- *
2908
- * This function tries to migrate a range of memory virtual address range, using
2909
- * callbacks to allocate and copy memory from source to destination. First it
2910
- * collects all the pages backing each virtual address in the range, saving this
2911
- * inside the src array. Then it locks those pages and unmaps them. Once the pages
2912
- * are locked and unmapped, it checks whether each page is pinned or not. Pages
2913
- * that aren't pinned have the MIGRATE_PFN_MIGRATE flag set (by this function)
2914
- * in the corresponding src array entry. It then restores any pages that are
2915
- * pinned, by remapping and unlocking those pages.
2916
- *
2917
- * At this point it calls the alloc_and_copy() callback. For documentation on
2918
- * what is expected from that callback, see struct migrate_vma_ops comments in
2919
- * include/linux/migrate.h
2920
- *
2921
- * After the alloc_and_copy() callback, this function goes over each entry in
2922
- * the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag
2923
- * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set,
2924
- * then the function tries to migrate struct page information from the source
2925
- * struct page to the destination struct page. If it fails to migrate the struct
2926
- * page information, then it clears the MIGRATE_PFN_MIGRATE flag in the src
2927
- * array.
2928
- *
2929
- * At this point all successfully migrated pages have an entry in the src
2930
- * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst
2931
- * array entry with MIGRATE_PFN_VALID flag set.
2932
- *
2933
- * It then calls the finalize_and_map() callback. See comments for "struct
2934
- * migrate_vma_ops", in include/linux/migrate.h for details about
2935
- * finalize_and_map() behavior.
2936
- *
2937
- * After the finalize_and_map() callback, for successfully migrated pages, this
2938
- * function updates the CPU page table to point to new pages, otherwise it
2939
- * restores the CPU page table to point to the original source pages.
2940
- *
2941
- * Function returns 0 after the above steps, even if no pages were migrated
2942
- * (The function only returns an error if any of the arguments are invalid.)
2943
- *
2944
- * Both src and dst array must be big enough for (end - start) >> PAGE_SHIFT
2945
- * unsigned long entries.
2946
- */
2947
-int migrate_vma(const struct migrate_vma_ops *ops,
2948
- struct vm_area_struct *vma,
2949
- unsigned long start,
2950
- unsigned long end,
2951
- unsigned long *src,
2952
- unsigned long *dst,
2953
- void *private)
2954
-{
2955
- struct migrate_vma migrate;
2956
-
2957
- /* Sanity check the arguments */
2958
- start &= PAGE_MASK;
2959
- end &= PAGE_MASK;
2960
- if (!vma || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) ||
2961
- vma_is_dax(vma))
2962
- return -EINVAL;
2963
- if (start < vma->vm_start || start >= vma->vm_end)
2964
- return -EINVAL;
2965
- if (end <= vma->vm_start || end > vma->vm_end)
2966
- return -EINVAL;
2967
- if (!ops || !src || !dst || start >= end)
2968
- return -EINVAL;
2969
-
2970
- memset(src, 0, sizeof(*src) * ((end - start) >> PAGE_SHIFT));
2971
- migrate.src = src;
2972
- migrate.dst = dst;
2973
- migrate.start = start;
2974
- migrate.npages = 0;
2975
- migrate.cpages = 0;
2976
- migrate.end = end;
2977
- migrate.vma = vma;
2978
-
2979
- /* Collect, and try to unmap source pages */
2980
- migrate_vma_collect(&migrate);
2981
- if (!migrate.cpages)
2982
- return 0;
2983
-
2984
- /* Lock and isolate page */
2985
- migrate_vma_prepare(&migrate);
2986
- if (!migrate.cpages)
2987
- return 0;
2988
-
2989
- /* Unmap pages */
2990
- migrate_vma_unmap(&migrate);
2991
- if (!migrate.cpages)
2992
- return 0;
2993
-
2994
- /*
2995
- * At this point pages are locked and unmapped, and thus they have
2996
- * stable content and can safely be copied to destination memory that
2997
- * is allocated by the callback.
2998
- *
2999
- * Note that migration can fail in migrate_vma_struct_page() for each
3000
- * individual page.
3001
- */
3002
- ops->alloc_and_copy(vma, src, dst, start, end, private);
3003
-
3004
- /* This does the real migration of struct page */
3005
- migrate_vma_pages(&migrate);
3006
-
3007
- ops->finalize_and_map(vma, src, dst, start, end, private);
3008
-
3009
- /* Unlock and remap pages */
3010
- migrate_vma_finalize(&migrate);
3011
-
3012
- return 0;
3013
-}
3014
-EXPORT_SYMBOL(migrate_vma);
3015
-#endif /* defined(MIGRATE_VMA_HELPER) */
3107
+EXPORT_SYMBOL(migrate_vma_finalize);
3108
+#endif /* CONFIG_DEVICE_PRIVATE */