2023-12-09 95099d4622f8cb224d94e314c7a8e0df60b13f87
kernel/mm/migrate.c
@@ -38,6 +38,7 @@
 #include <linux/hugetlb.h>
 #include <linux/hugetlb_cgroup.h>
 #include <linux/gfp.h>
+#include <linux/pagewalk.h>
 #include <linux/pfn_t.h>
 #include <linux/memremap.h>
 #include <linux/userfaultfd_k.h>
@@ -47,39 +48,17 @@
 #include <linux/page_owner.h>
 #include <linux/sched/mm.h>
 #include <linux/ptrace.h>
+#include <linux/oom.h>
 
 #include <asm/tlbflush.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/migrate.h>
+#undef CREATE_TRACE_POINTS
+#include <trace/hooks/mm.h>
+#include <trace/hooks/vmscan.h>
 
 #include "internal.h"
-
-/*
- * migrate_prep() needs to be called before we start compiling a list of pages
- * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is
- * undesirable, use migrate_prep_local()
- */
-int migrate_prep(void)
-{
- /*
- * Clear the LRU lists so pages can be isolated.
- * Note that pages may be moved off the LRU after we have
- * drained them. Those pages will fail to migrate like other
- * pages that may be busy.
- */
- lru_add_drain_all();
-
- return 0;
-}
-
-/* Do the necessary work of migrate_prep but not if it involves other CPUs */
-int migrate_prep_local(void)
-{
- lru_add_drain();
-
- return 0;
-}
 
 int isolate_movable_page(struct page *page, isolate_mode_t mode)
 {
@@ -100,7 +79,7 @@
 /*
 * Check PageMovable before holding a PG_lock because page's owner
 * assumes anybody doesn't touch PG_lock of newly allocated page
- * so unconditionally grapping the lock ruins page's owner side.
+ * so unconditionally grabbing the lock ruins page's owner side.
 */
 if (unlikely(!__PageMovable(page)))
 goto out_putpage;
@@ -129,7 +108,7 @@
 
 /* Driver shouldn't use PG_isolated bit of page->flags */
 WARN_ON_ONCE(PageIsolated(page));
- __SetPageIsolated(page);
+ SetPageIsolated(page);
 unlock_page(page);
 
 return 0;
@@ -153,7 +132,7 @@
 
 mapping = page_mapping(page);
 mapping->a_ops->putback_page(page);
- __ClearPageIsolated(page);
+ ClearPageIsolated(page);
 }
 
 /*
@@ -186,16 +165,17 @@
 if (PageMovable(page))
 putback_movable_page(page);
 else
- __ClearPageIsolated(page);
+ ClearPageIsolated(page);
 unlock_page(page);
 put_page(page);
 } else {
 mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
- page_is_file_cache(page), -hpage_nr_pages(page));
+ page_is_file_lru(page), -thp_nr_pages(page));
 putback_lru_page(page);
 }
 }
 }
+EXPORT_SYMBOL_GPL(putback_movable_pages);
 
 /*
 * Restore a potential migration pte to a working pte entry
@@ -240,15 +220,17 @@
 */
 entry = pte_to_swp_entry(*pvmw.pte);
 if (is_write_migration_entry(entry))
- pte = maybe_mkwrite(pte, vma);
+ pte = maybe_mkwrite(pte, vma->vm_flags);
+ else if (pte_swp_uffd_wp(*pvmw.pte))
+ pte = pte_mkuffd_wp(pte);
 
- if (unlikely(is_zone_device_page(new))) {
- if (is_device_private_page(new)) {
- entry = make_device_private_entry(new, pte_write(pte));
- pte = swp_entry_to_pte(entry);
- } else if (is_device_public_page(new)) {
- pte = pte_mkdevmap(pte);
- }
+ if (unlikely(is_device_private_page(new))) {
+ entry = make_device_private_entry(new, pte_write(pte));
+ pte = swp_entry_to_pte(entry);
+ if (pte_swp_soft_dirty(*pvmw.pte))
+ pte = pte_swp_mksoft_dirty(pte);
+ if (pte_swp_uffd_wp(*pvmw.pte))
+ pte = pte_swp_mkuffd_wp(pte);
 }
 
 #ifdef CONFIG_HUGETLB_PAGE
@@ -322,19 +304,18 @@
 goto out;
 
 page = migration_entry_to_page(entry);
+ page = compound_head(page);
 
 /*
- * Once radix-tree replacement of page migration started, page_count
- * *must* be zero. And, we don't want to call wait_on_page_locked()
- * against a page without get_page().
- * So, we use get_page_unless_zero(), here. Even failed, page fault
- * will occur again.
+ * Once page cache replacement of page migration started, page_count
+ * is zero; but we must not call put_and_wait_on_page_locked() without
+ * a ref. Use get_page_unless_zero(), and just fault again if it fails.
 */
 if (!get_page_unless_zero(page))
 goto out;
 pte_unmap_unlock(ptep, ptl);
- wait_on_page_locked(page);
- put_page(page);
+ trace_android_vh_waiting_for_page_migration(page);
+ put_and_wait_on_page_locked(page);
 return;
 out:
 pte_unmap_unlock(ptep, ptl);
@@ -368,63 +349,27 @@
 if (!get_page_unless_zero(page))
 goto unlock;
 spin_unlock(ptl);
- wait_on_page_locked(page);
- put_page(page);
+ put_and_wait_on_page_locked(page);
 return;
 unlock:
 spin_unlock(ptl);
 }
 #endif
 
-#ifdef CONFIG_BLOCK
-/* Returns true if all buffers are successfully locked */
-static bool buffer_migrate_lock_buffers(struct buffer_head *head,
- enum migrate_mode mode)
+static int expected_page_refs(struct address_space *mapping, struct page *page)
 {
- struct buffer_head *bh = head;
+ int expected_count = 1;
 
- /* Simple case, sync compaction */
- if (mode != MIGRATE_ASYNC) {
- do {
- get_bh(bh);
- lock_buffer(bh);
- bh = bh->b_this_page;
+ /*
+ * Device private pages have an extra refcount as they are
+ * ZONE_DEVICE pages.
+ */
+ expected_count += is_device_private_page(page);
+ if (mapping)
+ expected_count += thp_nr_pages(page) + page_has_private(page);
 
- } while (bh != head);
-
- return true;
- }
-
- /* async case, we cannot block on lock_buffer so use trylock_buffer */
- do {
- get_bh(bh);
- if (!trylock_buffer(bh)) {
- /*
- * We failed to lock the buffer and cannot stall in
- * async migration. Release the taken locks
- */
- struct buffer_head *failed_bh = bh;
- put_bh(failed_bh);
- bh = head;
- while (bh != failed_bh) {
- unlock_buffer(bh);
- put_bh(bh);
- bh = bh->b_this_page;
- }
- return false;
- }
-
- bh = bh->b_this_page;
- } while (bh != head);
- return true;
+ return expected_count;
 }
-#else
-static inline bool buffer_migrate_lock_buffers(struct buffer_head *head,
- enum migrate_mode mode)
-{
- return true;
-}
-#endif /* CONFIG_BLOCK */
 
 
 /*
@@ -435,21 +380,13 @@
 * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
 */
 int migrate_page_move_mapping(struct address_space *mapping,
- struct page *newpage, struct page *page,
- struct buffer_head *head, enum migrate_mode mode,
- int extra_count)
+ struct page *newpage, struct page *page, int extra_count)
 {
+ XA_STATE(xas, &mapping->i_pages, page_index(page));
 struct zone *oldzone, *newzone;
 int dirty;
- int expected_count = 1 + extra_count;
- void **pslot;
-
- /*
- * Device public or private pages have an extra refcount as they are
- * ZONE_DEVICE pages.
- */
- expected_count += is_device_private_page(page);
- expected_count += is_device_public_page(page);
+ int expected_count = expected_page_refs(mapping, page) + extra_count;
+ int nr = thp_nr_pages(page);
 
 if (!mapping) {
 /* Anonymous page without mapping */
....@@ -468,35 +405,14 @@
468405 oldzone = page_zone(page);
469406 newzone = page_zone(newpage);
470407
471
- xa_lock_irq(&mapping->i_pages);
472
-
473
- pslot = radix_tree_lookup_slot(&mapping->i_pages,
474
- page_index(page));
475
-
476
- expected_count += hpage_nr_pages(page) + page_has_private(page);
477
- if (page_count(page) != expected_count ||
478
- radix_tree_deref_slot_protected(pslot,
479
- &mapping->i_pages.xa_lock) != page) {
480
- xa_unlock_irq(&mapping->i_pages);
408
+ xas_lock_irq(&xas);
409
+ if (page_count(page) != expected_count || xas_load(&xas) != page) {
410
+ xas_unlock_irq(&xas);
481411 return -EAGAIN;
482412 }
483413
484414 if (!page_ref_freeze(page, expected_count)) {
485
- xa_unlock_irq(&mapping->i_pages);
486
- return -EAGAIN;
487
- }
488
-
489
- /*
490
- * In the async migration case of moving a page with buffers, lock the
491
- * buffers using trylock before the mapping is moved. If the mapping
492
- * was moved, we later failed to lock the buffers and could not move
493
- * the mapping back due to an elevated page count, we would have to
494
- * block waiting on other references to be dropped.
495
- */
496
- if (mode == MIGRATE_ASYNC && head &&
497
- !buffer_migrate_lock_buffers(head, mode)) {
498
- page_ref_unfreeze(page, expected_count);
499
- xa_unlock_irq(&mapping->i_pages);
415
+ xas_unlock_irq(&xas);
500416 return -EAGAIN;
501417 }
502418
....@@ -506,7 +422,7 @@
506422 */
507423 newpage->index = page->index;
508424 newpage->mapping = page->mapping;
509
- page_ref_add(newpage, hpage_nr_pages(page)); /* add cache reference */
425
+ page_ref_add(newpage, nr); /* add cache reference */
510426 if (PageSwapBacked(page)) {
511427 __SetPageSwapBacked(newpage);
512428 if (PageSwapCache(page)) {
....@@ -524,16 +440,13 @@
524440 SetPageDirty(newpage);
525441 }
526442
527
- radix_tree_replace_slot(&mapping->i_pages, pslot, newpage);
443
+ xas_store(&xas, newpage);
528444 if (PageTransHuge(page)) {
529445 int i;
530
- int index = page_index(page);
531446
532
- for (i = 1; i < HPAGE_PMD_NR; i++) {
533
- pslot = radix_tree_lookup_slot(&mapping->i_pages,
534
- index + i);
535
- radix_tree_replace_slot(&mapping->i_pages, pslot,
536
- newpage + i);
447
+ for (i = 1; i < nr; i++) {
448
+ xas_next(&xas);
449
+ xas_store(&xas, newpage);
537450 }
538451 }
539452
....@@ -542,9 +455,9 @@
542455 * to one less reference.
543456 * We know this isn't the last reference.
544457 */
545
- page_ref_unfreeze(page, expected_count - hpage_nr_pages(page));
458
+ page_ref_unfreeze(page, expected_count - nr);
546459
547
- xa_unlock(&mapping->i_pages);
460
+ xas_unlock(&xas);
548461 /* Leave irq disabled to prevent preemption while updating stats */
549462
550463 /*
....@@ -558,17 +471,24 @@
558471 * are mapped to swap space.
559472 */
560473 if (newzone != oldzone) {
561
- __dec_node_state(oldzone->zone_pgdat, NR_FILE_PAGES);
562
- __inc_node_state(newzone->zone_pgdat, NR_FILE_PAGES);
474
+ struct lruvec *old_lruvec, *new_lruvec;
475
+ struct mem_cgroup *memcg;
476
+
477
+ memcg = page_memcg(page);
478
+ old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat);
479
+ new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat);
480
+
481
+ __mod_lruvec_state(old_lruvec, NR_FILE_PAGES, -nr);
482
+ __mod_lruvec_state(new_lruvec, NR_FILE_PAGES, nr);
563483 if (PageSwapBacked(page) && !PageSwapCache(page)) {
564
- __dec_node_state(oldzone->zone_pgdat, NR_SHMEM);
565
- __inc_node_state(newzone->zone_pgdat, NR_SHMEM);
484
+ __mod_lruvec_state(old_lruvec, NR_SHMEM, -nr);
485
+ __mod_lruvec_state(new_lruvec, NR_SHMEM, nr);
566486 }
567
- if (dirty && mapping_cap_account_dirty(mapping)) {
568
- __dec_node_state(oldzone->zone_pgdat, NR_FILE_DIRTY);
569
- __dec_zone_state(oldzone, NR_ZONE_WRITE_PENDING);
570
- __inc_node_state(newzone->zone_pgdat, NR_FILE_DIRTY);
571
- __inc_zone_state(newzone, NR_ZONE_WRITE_PENDING);
487
+ if (dirty && mapping_can_writeback(mapping)) {
488
+ __mod_lruvec_state(old_lruvec, NR_FILE_DIRTY, -nr);
489
+ __mod_zone_page_state(oldzone, NR_ZONE_WRITE_PENDING, -nr);
490
+ __mod_lruvec_state(new_lruvec, NR_FILE_DIRTY, nr);
491
+ __mod_zone_page_state(newzone, NR_ZONE_WRITE_PENDING, nr);
572492 }
573493 }
574494 local_irq_enable();
....@@ -584,22 +504,18 @@
584504 int migrate_huge_page_move_mapping(struct address_space *mapping,
585505 struct page *newpage, struct page *page)
586506 {
507
+ XA_STATE(xas, &mapping->i_pages, page_index(page));
587508 int expected_count;
588
- void **pslot;
589509
590
- xa_lock_irq(&mapping->i_pages);
591
-
592
- pslot = radix_tree_lookup_slot(&mapping->i_pages, page_index(page));
593
-
510
+ xas_lock_irq(&xas);
594511 expected_count = 2 + page_has_private(page);
595
- if (page_count(page) != expected_count ||
596
- radix_tree_deref_slot_protected(pslot, &mapping->i_pages.xa_lock) != page) {
597
- xa_unlock_irq(&mapping->i_pages);
512
+ if (page_count(page) != expected_count || xas_load(&xas) != page) {
513
+ xas_unlock_irq(&xas);
598514 return -EAGAIN;
599515 }
600516
601517 if (!page_ref_freeze(page, expected_count)) {
602
- xa_unlock_irq(&mapping->i_pages);
518
+ xas_unlock_irq(&xas);
603519 return -EAGAIN;
604520 }
605521
....@@ -608,11 +524,11 @@
608524
609525 get_page(newpage);
610526
611
- radix_tree_replace_slot(&mapping->i_pages, pslot, newpage);
527
+ xas_store(&xas, newpage);
612528
613529 page_ref_unfreeze(page, expected_count - 1);
614530
615
- xa_unlock_irq(&mapping->i_pages);
531
+ xas_unlock_irq(&xas);
616532
617533 return MIGRATEPAGE_SUCCESS;
618534 }
....@@ -656,7 +572,7 @@
656572 } else {
657573 /* thp page */
658574 BUG_ON(!PageTransHuge(src));
659
- nr_pages = hpage_nr_pages(src);
575
+ nr_pages = thp_nr_pages(src);
660576 }
661577
662578 for (i = 0; i < nr_pages; i++) {
....@@ -671,6 +587,8 @@
671587 void migrate_page_states(struct page *newpage, struct page *page)
672588 {
673589 int cpupid;
590
+
591
+ trace_android_vh_migrate_page_states(page, newpage);
674592
675593 if (PageError(page))
676594 SetPageError(newpage);
....@@ -689,6 +607,7 @@
689607 SetPageChecked(newpage);
690608 if (PageMappedToDisk(page))
691609 SetPageMappedToDisk(newpage);
610
+ trace_android_vh_look_around_migrate_page(page, newpage);
692611
693612 /* Move dirty on pages not done by migrate_page_move_mapping() */
694613 if (PageDirty(page))
....@@ -723,9 +642,18 @@
723642 if (PageWriteback(newpage))
724643 end_page_writeback(newpage);
725644
645
+ /*
646
+ * PG_readahead shares the same bit with PG_reclaim. The above
647
+ * end_page_writeback() may clear PG_readahead mistakenly, so set the
648
+ * bit after that.
649
+ */
650
+ if (PageReadahead(page))
651
+ SetPageReadahead(newpage);
652
+
726653 copy_page_owner(page, newpage);
727654
728
- mem_cgroup_migrate(page, newpage);
655
+ if (!PageHuge(page))
656
+ mem_cgroup_migrate(page, newpage);
729657 }
730658 EXPORT_SYMBOL(migrate_page_states);
731659
....@@ -758,7 +686,7 @@
758686
759687 BUG_ON(PageWriteback(page)); /* Writeback must be complete */
760688
761
- rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
689
+ rc = migrate_page_move_mapping(mapping, newpage, page, 0);
762690
763691 if (rc != MIGRATEPAGE_SUCCESS)
764692 return rc;
....@@ -772,40 +700,96 @@
772700 EXPORT_SYMBOL(migrate_page);
773701
774702 #ifdef CONFIG_BLOCK
775
-/*
776
- * Migration function for pages with buffers. This function can only be used
777
- * if the underlying filesystem guarantees that no other references to "page"
778
- * exist.
779
- */
780
-int buffer_migrate_page(struct address_space *mapping,
781
- struct page *newpage, struct page *page, enum migrate_mode mode)
703
+/* Returns true if all buffers are successfully locked */
704
+static bool buffer_migrate_lock_buffers(struct buffer_head *head,
705
+ enum migrate_mode mode)
706
+{
707
+ struct buffer_head *bh = head;
708
+
709
+ /* Simple case, sync compaction */
710
+ if (mode != MIGRATE_ASYNC) {
711
+ do {
712
+ lock_buffer(bh);
713
+ bh = bh->b_this_page;
714
+
715
+ } while (bh != head);
716
+
717
+ return true;
718
+ }
719
+
720
+ /* async case, we cannot block on lock_buffer so use trylock_buffer */
721
+ do {
722
+ if (!trylock_buffer(bh)) {
723
+ /*
724
+ * We failed to lock the buffer and cannot stall in
725
+ * async migration. Release the taken locks
726
+ */
727
+ struct buffer_head *failed_bh = bh;
728
+ bh = head;
729
+ while (bh != failed_bh) {
730
+ unlock_buffer(bh);
731
+ bh = bh->b_this_page;
732
+ }
733
+ return false;
734
+ }
735
+
736
+ bh = bh->b_this_page;
737
+ } while (bh != head);
738
+ return true;
739
+}
740
+
741
+static int __buffer_migrate_page(struct address_space *mapping,
742
+ struct page *newpage, struct page *page, enum migrate_mode mode,
743
+ bool check_refs)
782744 {
783745 struct buffer_head *bh, *head;
784746 int rc;
747
+ int expected_count;
785748
786749 if (!page_has_buffers(page))
787750 return migrate_page(mapping, newpage, page, mode);
788751
752
+ /* Check whether page does not have extra refs before we do more work */
753
+ expected_count = expected_page_refs(mapping, page);
754
+ if (page_count(page) != expected_count)
755
+ return -EAGAIN;
756
+
789757 head = page_buffers(page);
758
+ if (!buffer_migrate_lock_buffers(head, mode))
759
+ return -EAGAIN;
790760
791
- rc = migrate_page_move_mapping(mapping, newpage, page, head, mode, 0);
761
+ if (check_refs) {
762
+ bool busy;
763
+ bool invalidated = false;
792764
765
+recheck_buffers:
766
+ busy = false;
767
+ spin_lock(&mapping->private_lock);
768
+ bh = head;
769
+ do {
770
+ if (atomic_read(&bh->b_count)) {
771
+ busy = true;
772
+ break;
773
+ }
774
+ bh = bh->b_this_page;
775
+ } while (bh != head);
776
+ if (busy) {
777
+ if (invalidated) {
778
+ rc = -EAGAIN;
779
+ goto unlock_buffers;
780
+ }
781
+ spin_unlock(&mapping->private_lock);
782
+ invalidate_bh_lrus();
783
+ invalidated = true;
784
+ goto recheck_buffers;
785
+ }
786
+ }
787
+
788
+ rc = migrate_page_move_mapping(mapping, newpage, page, 0);
793789 if (rc != MIGRATEPAGE_SUCCESS)
794
- return rc;
790
+ goto unlock_buffers;
795791
796
- /*
797
- * In the async case, migrate_page_move_mapping locked the buffers
798
- * with an IRQ-safe spinlock held. In the sync case, the buffers
799
- * need to be locked now
800
- */
801
- if (mode != MIGRATE_ASYNC)
802
- BUG_ON(!buffer_migrate_lock_buffers(head, mode));
803
-
804
- ClearPagePrivate(page);
805
- set_page_private(newpage, page_private(page));
806
- set_page_private(page, 0);
807
- put_page(page);
808
- get_page(newpage);
792
+ attach_page_private(newpage, detach_page_private(page));
809793
810794 bh = head;
811795 do {
....@@ -814,24 +798,48 @@
814798
815799 } while (bh != head);
816800
817
- SetPagePrivate(newpage);
818
-
819801 if (mode != MIGRATE_SYNC_NO_COPY)
820802 migrate_page_copy(newpage, page);
821803 else
822804 migrate_page_states(newpage, page);
823805
806
+ rc = MIGRATEPAGE_SUCCESS;
807
+unlock_buffers:
808
+ if (check_refs)
809
+ spin_unlock(&mapping->private_lock);
824810 bh = head;
825811 do {
826812 unlock_buffer(bh);
827
- put_bh(bh);
828813 bh = bh->b_this_page;
829814
830815 } while (bh != head);
831816
832
- return MIGRATEPAGE_SUCCESS;
817
+ return rc;
818
+}
819
+
820
+/*
821
+ * Migration function for pages with buffers. This function can only be used
822
+ * if the underlying filesystem guarantees that no other references to "page"
823
+ * exist. For example attached buffer heads are accessed only under page lock.
824
+ */
825
+int buffer_migrate_page(struct address_space *mapping,
826
+ struct page *newpage, struct page *page, enum migrate_mode mode)
827
+{
828
+ return __buffer_migrate_page(mapping, newpage, page, mode, false);
833829 }
834830 EXPORT_SYMBOL(buffer_migrate_page);
831
+
832
+/*
833
+ * Same as above except that this variant is more careful and checks that there
834
+ * are also no buffer head references. This function is the right one for
835
+ * mappings where buffer heads are directly looked up and referenced (such as
836
+ * block device mappings).
837
+ */
838
+int buffer_migrate_page_norefs(struct address_space *mapping,
839
+ struct page *newpage, struct page *page, enum migrate_mode mode)
840
+{
841
+ return __buffer_migrate_page(mapping, newpage, page, mode, true);
842
+}
835843 #endif
836844
837845 /*
....@@ -899,7 +907,7 @@
899907 */
900908 if (page_has_private(page) &&
901909 !try_to_release_page(page, GFP_KERNEL))
902
- return -EAGAIN;
910
+ return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY;
903911
904912 return migrate_page(mapping, newpage, page, mode);
905913 }
....@@ -951,7 +959,7 @@
951959 VM_BUG_ON_PAGE(!PageIsolated(page), page);
952960 if (!PageMovable(page)) {
953961 rc = MIGRATEPAGE_SUCCESS;
954
- __ClearPageIsolated(page);
962
+ ClearPageIsolated(page);
955963 goto out;
956964 }
957965
....@@ -973,23 +981,23 @@
973981 * We clear PG_movable under page_lock so any compactor
974982 * cannot try to migrate this page.
975983 */
976
- __ClearPageIsolated(page);
984
+ ClearPageIsolated(page);
977985 }
978986
979987 /*
980
- * Anonymous and movable page->mapping will be cleard by
988
+ * Anonymous and movable page->mapping will be cleared by
981989 * free_pages_prepare so don't reset it here for keeping
982990 * the type to work PageAnon, for example.
983991 */
984992 if (!PageMappingFlags(page))
985993 page->mapping = NULL;
986994
987
- if (unlikely(is_zone_device_page(newpage))) {
988
- if (is_device_public_page(newpage))
989
- flush_dcache_page(newpage);
990
- } else
991
- flush_dcache_page(newpage);
995
+ if (likely(!is_zone_device_page(newpage))) {
996
+ int i, nr = compound_nr(newpage);
992997
998
+ for (i = 0; i < nr; i++)
999
+ flush_dcache_page(newpage + i);
1000
+ }
9931001 }
9941002 out:
9951003 return rc;
....@@ -1013,7 +1021,7 @@
10131021 * to the LRU. Later, when the IO completes the pages are
10141022 * marked uptodate and unlocked. However, the queueing
10151023 * could be merging multiple pages for one bio (e.g.
1016
- * mpage_readpages). If an allocation happens for the
1024
+ * mpage_readahead). If an allocation happens for the
10171025 * second or third page, the process can end up locking
10181026 * the same page twice and deadlocking. Rather than
10191027 * trying to be clever about what pages can be locked,
....@@ -1101,8 +1109,7 @@
11011109 /* Establish migration ptes */
11021110 VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma,
11031111 page);
1104
- try_to_unmap(page,
1105
- TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
1112
+ try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK);
11061113 page_was_mapped = 1;
11071114 }
11081115
....@@ -1141,34 +1148,19 @@
11411148 }
11421149
11431150 /*
1144
- * gcc 4.7 and 4.8 on arm get an ICEs when inlining unmap_and_move(). Work
1145
- * around it.
1146
- */
1147
-#if defined(CONFIG_ARM) && \
1148
- defined(GCC_VERSION) && GCC_VERSION < 40900 && GCC_VERSION >= 40700
1149
-#define ICE_noinline noinline
1150
-#else
1151
-#define ICE_noinline
1152
-#endif
1153
-
1154
-/*
11551151 * Obtain the lock on page, remove all ptes and migrate the page
11561152 * to the newly allocated page in newpage.
11571153 */
1158
-static ICE_noinline int unmap_and_move(new_page_t get_new_page,
1154
+static int unmap_and_move(new_page_t get_new_page,
11591155 free_page_t put_new_page,
11601156 unsigned long private, struct page *page,
11611157 int force, enum migrate_mode mode,
11621158 enum migrate_reason reason)
11631159 {
11641160 int rc = MIGRATEPAGE_SUCCESS;
1165
- struct page *newpage;
1161
+ struct page *newpage = NULL;
11661162
11671163 if (!thp_migration_supported() && PageTransHuge(page))
1168
- return -ENOMEM;
1169
-
1170
- newpage = get_new_page(page, private);
1171
- if (!newpage)
11721164 return -ENOMEM;
11731165
11741166 if (page_count(page) == 1) {
....@@ -1178,15 +1170,15 @@
11781170 if (unlikely(__PageMovable(page))) {
11791171 lock_page(page);
11801172 if (!PageMovable(page))
1181
- __ClearPageIsolated(page);
1173
+ ClearPageIsolated(page);
11821174 unlock_page(page);
11831175 }
1184
- if (put_new_page)
1185
- put_new_page(newpage, private);
1186
- else
1187
- put_page(newpage);
11881176 goto out;
11891177 }
1178
+
1179
+ newpage = get_new_page(page, private);
1180
+ if (!newpage)
1181
+ return -ENOMEM;
11901182
11911183 rc = __unmap_and_move(page, newpage, force, mode);
11921184 if (rc == MIGRATEPAGE_SUCCESS)
....@@ -1197,8 +1189,7 @@
11971189 /*
11981190 * A page that has been migrated has all references
11991191 * removed and will be freed. A page that has not been
1200
- * migrated will have kepts its references and be
1201
- * restored.
1192
+ * migrated will have kept its references and be restored.
12021193 */
12031194 list_del(&page->lru);
12041195
....@@ -1209,7 +1200,7 @@
12091200 */
12101201 if (likely(!__PageMovable(page)))
12111202 mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
1212
- page_is_file_cache(page), -hpage_nr_pages(page));
1203
+ page_is_file_lru(page), -thp_nr_pages(page));
12131204 }
12141205
12151206 /*
....@@ -1218,16 +1209,11 @@
12181209 * we want to retry.
12191210 */
12201211 if (rc == MIGRATEPAGE_SUCCESS) {
1221
- put_page(page);
1222
- if (reason == MR_MEMORY_FAILURE) {
1212
+ if (reason != MR_MEMORY_FAILURE)
12231213 /*
1224
- * Set PG_HWPoison on just freed page
1225
- * intentionally. Although it's rather weird,
1226
- * it's how HWPoison flag works at the moment.
1214
+ * We release the page in page_handle_poison.
12271215 */
1228
- if (set_hwpoison_free_buddy_page(page))
1229
- num_poisoned_pages_inc();
1230
- }
1216
+ put_page(page);
12311217 } else {
12321218 if (rc != -EAGAIN) {
12331219 if (likely(!__PageMovable(page))) {
....@@ -1239,7 +1225,7 @@
12391225 if (PageMovable(page))
12401226 putback_movable_page(page);
12411227 else
1242
- __ClearPageIsolated(page);
1228
+ ClearPageIsolated(page);
12431229 unlock_page(page);
12441230 put_page(page);
12451231 }
....@@ -1280,9 +1266,10 @@
12801266 int page_was_mapped = 0;
12811267 struct page *new_hpage;
12821268 struct anon_vma *anon_vma = NULL;
1269
+ struct address_space *mapping = NULL;
12831270
12841271 /*
1285
- * Movability of hugepages depends on architectures and hugepage size.
1272
+ * Migratability of hugepages depends on architectures and their size.
12861273 * This check is necessary because some callers of hugepage migration
12871274 * like soft offline and memory hotremove don't walk through page
12881275 * tables or check whether the hugepage is pmd-based or not before
....@@ -1327,9 +1314,29 @@
13271314 goto put_anon;
13281315
13291316 if (page_mapped(hpage)) {
1330
- try_to_unmap(hpage,
1331
- TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
1317
+ bool mapping_locked = false;
1318
+ enum ttu_flags ttu = TTU_MIGRATION|TTU_IGNORE_MLOCK;
1319
+
1320
+ if (!PageAnon(hpage)) {
1321
+ /*
1322
+ * In shared mappings, try_to_unmap could potentially
1323
+ * call huge_pmd_unshare. Because of this, take
1324
+ * semaphore in write mode here and set TTU_RMAP_LOCKED
1325
+ * to let lower levels know we have taken the lock.
1326
+ */
1327
+ mapping = hugetlb_page_mapping_lock_write(hpage);
1328
+ if (unlikely(!mapping))
1329
+ goto unlock_put_anon;
1330
+
1331
+ mapping_locked = true;
1332
+ ttu |= TTU_RMAP_LOCKED;
1333
+ }
1334
+
1335
+ try_to_unmap(hpage, ttu);
13321336 page_was_mapped = 1;
1337
+
1338
+ if (mapping_locked)
1339
+ i_mmap_unlock_write(mapping);
13331340 }
13341341
13351342 if (!page_mapped(hpage))
....@@ -1339,6 +1346,7 @@
13391346 remove_migration_ptes(hpage,
13401347 rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage, false);
13411348
1349
+unlock_put_anon:
13421350 unlock_page(new_hpage);
13431351
13441352 put_anon:
....@@ -1395,22 +1403,37 @@
13951403 enum migrate_mode mode, int reason)
13961404 {
13971405 int retry = 1;
1406
+ int thp_retry = 1;
13981407 int nr_failed = 0;
13991408 int nr_succeeded = 0;
1409
+ int nr_thp_succeeded = 0;
1410
+ int nr_thp_failed = 0;
1411
+ int nr_thp_split = 0;
14001412 int pass = 0;
1413
+ bool is_thp = false;
14011414 struct page *page;
14021415 struct page *page2;
14031416 int swapwrite = current->flags & PF_SWAPWRITE;
1404
- int rc;
1417
+ int rc, nr_subpages;
1418
+
1419
+ trace_mm_migrate_pages_start(mode, reason);
14051420
14061421 if (!swapwrite)
14071422 current->flags |= PF_SWAPWRITE;
14081423
1409
- for(pass = 0; pass < 10 && retry; pass++) {
1424
+ for (pass = 0; pass < 10 && (retry || thp_retry); pass++) {
14101425 retry = 0;
1426
+ thp_retry = 0;
14111427
14121428 list_for_each_entry_safe(page, page2, from, lru) {
14131429 retry:
1430
+ /*
1431
+ * THP statistics is based on the source huge page.
1432
+ * Capture required information that might get lost
1433
+ * during migration.
1434
+ */
1435
+ is_thp = PageTransHuge(page) && !PageHuge(page);
1436
+ nr_subpages = thp_nr_pages(page);
14141437 cond_resched();
14151438
14161439 if (PageHuge(page))
....@@ -1435,21 +1458,35 @@
14351458 * we encounter them after the rest of the list
14361459 * is processed.
14371460 */
1438
- if (PageTransHuge(page) && !PageHuge(page)) {
1461
+ if (is_thp) {
14391462 lock_page(page);
14401463 rc = split_huge_page_to_list(page, from);
14411464 unlock_page(page);
14421465 if (!rc) {
14431466 list_safe_reset_next(page, page2, lru);
1467
+ nr_thp_split++;
14441468 goto retry;
14451469 }
1470
+
1471
+ nr_thp_failed++;
1472
+ nr_failed += nr_subpages;
1473
+ goto out;
14461474 }
14471475 nr_failed++;
14481476 goto out;
14491477 case -EAGAIN:
1478
+ if (is_thp) {
1479
+ thp_retry++;
1480
+ break;
1481
+ }
14501482 retry++;
14511483 break;
14521484 case MIGRATEPAGE_SUCCESS:
1485
+ if (is_thp) {
1486
+ nr_thp_succeeded++;
1487
+ nr_succeeded += nr_subpages;
1488
+ break;
1489
+ }
14531490 nr_succeeded++;
14541491 break;
14551492 default:
....@@ -1459,24 +1496,76 @@
14591496 * removed from migration page list and not
14601497 * retried in the next outer loop.
14611498 */
1499
+ if (is_thp) {
1500
+ nr_thp_failed++;
1501
+ nr_failed += nr_subpages;
1502
+ break;
1503
+ }
14621504 nr_failed++;
14631505 break;
14641506 }
14651507 }
14661508 }
1467
- nr_failed += retry;
1509
+ nr_failed += retry + thp_retry;
1510
+ nr_thp_failed += thp_retry;
14681511 rc = nr_failed;
14691512 out:
1470
- if (nr_succeeded)
1471
- count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
1472
- if (nr_failed)
1473
- count_vm_events(PGMIGRATE_FAIL, nr_failed);
1474
- trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason);
1513
+ count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
1514
+ count_vm_events(PGMIGRATE_FAIL, nr_failed);
1515
+ count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded);
1516
+ count_vm_events(THP_MIGRATION_FAIL, nr_thp_failed);
1517
+ count_vm_events(THP_MIGRATION_SPLIT, nr_thp_split);
1518
+ trace_mm_migrate_pages(nr_succeeded, nr_failed, nr_thp_succeeded,
1519
+ nr_thp_failed, nr_thp_split, mode, reason);
14751520
14761521 if (!swapwrite)
14771522 current->flags &= ~PF_SWAPWRITE;
14781523
14791524 return rc;
1525
+}
1526
+EXPORT_SYMBOL_GPL(migrate_pages);
1527
+
1528
+struct page *alloc_migration_target(struct page *page, unsigned long private)
1529
+{
1530
+ struct migration_target_control *mtc;
1531
+ gfp_t gfp_mask;
1532
+ unsigned int order = 0;
1533
+ struct page *new_page = NULL;
1534
+ int nid;
1535
+ int zidx;
1536
+
1537
+ mtc = (struct migration_target_control *)private;
1538
+ gfp_mask = mtc->gfp_mask;
1539
+ nid = mtc->nid;
1540
+ if (nid == NUMA_NO_NODE)
1541
+ nid = page_to_nid(page);
1542
+
1543
+ if (PageHuge(page)) {
1544
+ struct hstate *h = page_hstate(compound_head(page));
1545
+
1546
+ gfp_mask = htlb_modify_alloc_mask(h, gfp_mask);
1547
+ return alloc_huge_page_nodemask(h, nid, mtc->nmask, gfp_mask);
1548
+ }
1549
+
1550
+ if (PageTransHuge(page)) {
1551
+ /*
1552
+ * clear __GFP_RECLAIM to make the migration callback
1553
+ * consistent with regular THP allocations.
1554
+ */
1555
+ gfp_mask &= ~__GFP_RECLAIM;
1556
+ gfp_mask |= GFP_TRANSHUGE;
1557
+ order = HPAGE_PMD_ORDER;
1558
+ }
1559
+ zidx = zone_idx(page_zone(page));
1560
+ if (is_highmem_idx(zidx) || zidx == ZONE_MOVABLE)
1561
+ gfp_mask |= __GFP_HIGHMEM;
1562
+
1563
+ new_page = __alloc_pages_nodemask(gfp_mask, order, nid, mtc->nmask);
1564
+
1565
+ if (new_page && PageTransHuge(new_page))
1566
+ prep_transhuge_page(new_page);
1567
+
1568
+ return new_page;
14801569 }
14811570
14821571 #ifdef CONFIG_NUMA
....@@ -1496,12 +1585,13 @@
14961585 struct list_head *pagelist, int node)
14971586 {
14981587 int err;
1588
+ struct migration_target_control mtc = {
1589
+ .nid = node,
1590
+ .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
1591
+ };
14991592
1500
- if (list_empty(pagelist))
1501
- return 0;
1502
-
1503
- err = migrate_pages(pagelist, alloc_new_node_page, NULL, node,
1504
- MIGRATE_SYNC, MR_SYSCALL);
1593
+ err = migrate_pages(pagelist, alloc_migration_target, NULL,
1594
+ (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
15051595 if (err)
15061596 putback_movable_pages(pagelist);
15071597 return err;
....@@ -1524,7 +1614,7 @@
15241614 unsigned int follflags;
15251615 int err;
15261616
1527
- down_read(&mm->mmap_sem);
1617
+ mmap_read_lock(mm);
15281618 err = -EFAULT;
15291619 vma = find_vma(mm, addr);
15301620 if (!vma || addr < vma->vm_start || !vma_migratable(vma))
....@@ -1566,8 +1656,8 @@
15661656 err = 1;
15671657 list_add_tail(&head->lru, pagelist);
15681658 mod_node_page_state(page_pgdat(head),
1569
- NR_ISOLATED_ANON + page_is_file_cache(head),
1570
- hpage_nr_pages(head));
1659
+ NR_ISOLATED_ANON + page_is_file_lru(head),
1660
+ thp_nr_pages(head));
15711661 }
15721662 out_putpage:
15731663 /*
....@@ -1575,10 +1665,36 @@
15751665 * isolate_lru_page() or drop the page ref if it was
15761666 * not isolated.
15771667 */
1578
- put_page(page);
1668
+ put_user_page(page);
15791669 out:
1580
- up_read(&mm->mmap_sem);
1670
+ mmap_read_unlock(mm);
15811671 return err;
1672
+}
1673
+
1674
+static int move_pages_and_store_status(struct mm_struct *mm, int node,
1675
+ struct list_head *pagelist, int __user *status,
1676
+ int start, int i, unsigned long nr_pages)
1677
+{
1678
+ int err;
1679
+
1680
+ if (list_empty(pagelist))
1681
+ return 0;
1682
+
1683
+ err = do_move_pages_to_node(mm, pagelist, node);
1684
+ if (err) {
1685
+ /*
1686
+ * Positive err means the number of failed
1687
+ * pages to migrate. Since we are going to
1688
+ * abort and return the number of non-migrated
1689
+ * pages, so need to incude the rest of the
1690
+ * nr_pages that have not been attempted as
1691
+ * well.
1692
+ */
1693
+ if (err > 0)
1694
+ err += nr_pages - i - 1;
1695
+ return err;
1696
+ }
1697
+ return store_status(status, start, node, i - start);
15821698 }
15831699
15841700 /*
....@@ -1596,7 +1712,7 @@
15961712 int start, i;
15971713 int err = 0, err1;
15981714
1599
- migrate_prep();
1715
+ lru_cache_disable();
16001716
16011717 for (i = start = 0; i < nr_pages; i++) {
16021718 const void __user *p;
....@@ -1624,21 +1740,8 @@
16241740 current_node = node;
16251741 start = i;
16261742 } else if (node != current_node) {
1627
- err = do_move_pages_to_node(mm, &pagelist, current_node);
1628
- if (err) {
1629
- /*
1630
- * Positive err means the number of failed
1631
- * pages to migrate. Since we are going to
1632
- * abort and return the number of non-migrated
1633
- * pages, so need to incude the rest of the
1634
- * nr_pages that have not been attempted as
1635
- * well.
1636
- */
1637
- if (err > 0)
1638
- err += nr_pages - i - 1;
1639
- goto out;
1640
- }
1641
- err = store_status(status, start, current_node, i - start);
1743
+ err = move_pages_and_store_status(mm, current_node,
1744
+ &pagelist, status, start, i, nr_pages);
16421745 if (err)
16431746 goto out;
16441747 start = i;
....@@ -1652,52 +1755,33 @@
16521755 err = add_page_for_migration(mm, addr, current_node,
16531756 &pagelist, flags & MPOL_MF_MOVE_ALL);
16541757
1655
- if (!err) {
1656
- /* The page is already on the target node */
1657
- err = store_status(status, i, current_node, 1);
1658
- if (err)
1659
- goto out_flush;
1660
- continue;
1661
- } else if (err > 0) {
1758
+ if (err > 0) {
16621759 /* The page is successfully queued for migration */
16631760 continue;
16641761 }
16651762
1666
- err = store_status(status, i, err, 1);
1763
+ /*
1764
+ * If the page is already on the target node (!err), store the
1765
+ * node, otherwise, store the err.
1766
+ */
1767
+ err = store_status(status, i, err ? : current_node, 1);
16671768 if (err)
16681769 goto out_flush;
16691770
1670
- err = do_move_pages_to_node(mm, &pagelist, current_node);
1671
- if (err) {
1672
- if (err > 0)
1673
- err += nr_pages - i - 1;
1771
+ err = move_pages_and_store_status(mm, current_node, &pagelist,
1772
+ status, start, i, nr_pages);
1773
+ if (err)
16741774 goto out;
1675
- }
1676
- if (i > start) {
1677
- err = store_status(status, start, current_node, i - start);
1678
- if (err)
1679
- goto out;
1680
- }
16811775 current_node = NUMA_NO_NODE;
16821776 }
16831777 out_flush:
1684
- if (list_empty(&pagelist))
1685
- return err;
1686
-
16871778 /* Make sure we do not overwrite the existing error */
1688
- err1 = do_move_pages_to_node(mm, &pagelist, current_node);
1689
- /*
1690
- * Don't have to report non-attempted pages here since:
1691
- * - If the above loop is done gracefully all pages have been
1692
- * attempted.
1693
- * - If the above loop is aborted it means a fatal error
1694
- * happened, should return ret.
1695
- */
1696
- if (!err1)
1697
- err1 = store_status(status, start, current_node, i - start);
1779
+ err1 = move_pages_and_store_status(mm, current_node, &pagelist,
1780
+ status, start, i, nr_pages);
16981781 if (err >= 0)
16991782 err = err1;
17001783 out:
1784
+ lru_cache_enable();
17011785 return err;
17021786 }
17031787
....@@ -1709,7 +1793,7 @@
17091793 {
17101794 unsigned long i;
17111795
1712
- down_read(&mm->mmap_sem);
1796
+ mmap_read_lock(mm);
17131797
17141798 for (i = 0; i < nr_pages; i++) {
17151799 unsigned long addr = (unsigned long)(*pages);
....@@ -1736,7 +1820,7 @@
17361820 status++;
17371821 }
17381822
1739
- up_read(&mm->mmap_sem);
1823
+ mmap_read_unlock(mm);
17401824 }
17411825
17421826 /*
....@@ -1773,6 +1857,53 @@
17731857 return nr_pages ? -EFAULT : 0;
17741858 }
17751859
1860
+static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes)
1861
+{
1862
+ struct task_struct *task;
1863
+ struct mm_struct *mm;
1864
+
1865
+ /*
1866
+ * There is no need to check if current process has the right to modify
1867
+ * the specified process when they are same.
1868
+ */
1869
+ if (!pid) {
1870
+ mmget(current->mm);
1871
+ *mem_nodes = cpuset_mems_allowed(current);
1872
+ return current->mm;
1873
+ }
1874
+
1875
+ /* Find the mm_struct */
1876
+ rcu_read_lock();
1877
+ task = find_task_by_vpid(pid);
1878
+ if (!task) {
1879
+ rcu_read_unlock();
1880
+ return ERR_PTR(-ESRCH);
1881
+ }
1882
+ get_task_struct(task);
1883
+
1884
+ /*
1885
+ * Check if this process has the right to modify the specified
1886
+ * process. Use the regular "ptrace_may_access()" checks.
1887
+ */
1888
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1889
+ rcu_read_unlock();
1890
+ mm = ERR_PTR(-EPERM);
1891
+ goto out;
1892
+ }
1893
+ rcu_read_unlock();
1894
+
1895
+ mm = ERR_PTR(security_task_movememory(task));
1896
+ if (IS_ERR(mm))
1897
+ goto out;
1898
+ *mem_nodes = cpuset_mems_allowed(task);
1899
+ mm = get_task_mm(task);
1900
+out:
1901
+ put_task_struct(task);
1902
+ if (!mm)
1903
+ mm = ERR_PTR(-EINVAL);
1904
+ return mm;
1905
+}
1906
+
17761907 /*
17771908 * Move a list of pages in the address space of the currently executing
17781909 * process.
....@@ -1782,7 +1913,6 @@
17821913 const int __user *nodes,
17831914 int __user *status, int flags)
17841915 {
1785
- struct task_struct *task;
17861916 struct mm_struct *mm;
17871917 int err;
17881918 nodemask_t task_nodes;
....@@ -1794,36 +1924,9 @@
17941924 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
17951925 return -EPERM;
17961926
1797
- /* Find the mm_struct */
1798
- rcu_read_lock();
1799
- task = pid ? find_task_by_vpid(pid) : current;
1800
- if (!task) {
1801
- rcu_read_unlock();
1802
- return -ESRCH;
1803
- }
1804
- get_task_struct(task);
1805
-
1806
- /*
1807
- * Check if this process has the right to modify the specified
1808
- * process. Use the regular "ptrace_may_access()" checks.
1809
- */
1810
- if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1811
- rcu_read_unlock();
1812
- err = -EPERM;
1813
- goto out;
1814
- }
1815
- rcu_read_unlock();
1816
-
1817
- err = security_task_movememory(task);
1818
- if (err)
1819
- goto out;
1820
-
1821
- task_nodes = cpuset_mems_allowed(task);
1822
- mm = get_task_mm(task);
1823
- put_task_struct(task);
1824
-
1825
- if (!mm)
1826
- return -EINVAL;
1927
+ mm = find_mm_struct(pid, &task_nodes);
1928
+ if (IS_ERR(mm))
1929
+ return PTR_ERR(mm);
18271930
18281931 if (nodes)
18291932 err = do_pages_move(mm, task_nodes, nr_pages, pages,
....@@ -1832,10 +1935,6 @@
18321935 err = do_pages_stat(mm, nr_pages, pages, status);
18331936
18341937 mmput(mm);
1835
- return err;
1836
-
1837
-out:
1838
- put_task_struct(task);
18391938 return err;
18401939 }
18411940
....@@ -1889,7 +1988,7 @@
18891988 if (!zone_watermark_ok(zone, 0,
18901989 high_wmark_pages(zone) +
18911990 nr_migrate_pages,
1892
- 0, 0))
1991
+ ZONE_MOVABLE, 0))
18931992 continue;
18941993 return true;
18951994 }
....@@ -1918,7 +2017,7 @@
19182017 VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);
19192018
19202019 /* Avoid migrating to a node that is nearly full */
1921
- if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page)))
2020
+ if (!migrate_balanced_pgdat(pgdat, compound_nr(page)))
19222021 return 0;
19232022
19242023 if (isolate_lru_page(page))
....@@ -1936,9 +2035,9 @@
19362035 return 0;
19372036 }
19382037
1939
- page_lru = page_is_file_cache(page);
2038
+ page_lru = page_is_file_lru(page);
19402039 mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru,
1941
- hpage_nr_pages(page));
2040
+ thp_nr_pages(page));
19422041
19432042 /*
19442043 * Isolating the page has taken another reference, so the
....@@ -1960,7 +2059,7 @@
19602059 * node. Caller is expected to have an elevated reference count on
19612060 * the page that will be dropped by this function before returning.
19622061 */
1963
-int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
2062
+int migrate_misplaced_page(struct page *page, struct vm_fault *vmf,
19642063 int node)
19652064 {
19662065 pg_data_t *pgdat = NODE_DATA(node);
....@@ -1972,15 +2071,15 @@
19722071 * Don't migrate file pages that are mapped in multiple processes
19732072 * with execute permissions as they are probably shared libraries.
19742073 */
1975
- if (page_mapcount(page) != 1 && page_is_file_cache(page) &&
1976
- (vma->vm_flags & VM_EXEC))
2074
+ if (page_mapcount(page) != 1 && page_is_file_lru(page) &&
2075
+ (vmf->vma_flags & VM_EXEC))
19772076 goto out;
19782077
19792078 /*
19802079 * Also do not migrate dirty pages as not all filesystems can move
19812080 * dirty pages in MIGRATE_ASYNC mode which is a waste of cycles.
19822081 */
1983
- if (page_is_file_cache(page) && PageDirty(page))
2082
+ if (page_is_file_lru(page) && PageDirty(page))
19842083 goto out;
19852084
19862085 isolated = numamigrate_isolate_page(pgdat, page);
....@@ -1995,7 +2094,7 @@
19952094 if (!list_empty(&migratepages)) {
19962095 list_del(&page->lru);
19972096 dec_node_page_state(page, NR_ISOLATED_ANON +
1998
- page_is_file_cache(page));
2097
+ page_is_file_lru(page));
19992098 putback_lru_page(page);
20002099 }
20012100 isolated = 0;
....@@ -2025,9 +2124,8 @@
20252124 pg_data_t *pgdat = NODE_DATA(node);
20262125 int isolated = 0;
20272126 struct page *new_page = NULL;
2028
- int page_lru = page_is_file_cache(page);
2029
- unsigned long mmun_start = address & HPAGE_PMD_MASK;
2030
- unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE;
2127
+ int page_lru = page_is_file_lru(page);
2128
+ unsigned long start = address & HPAGE_PMD_MASK;
20312129
20322130 new_page = alloc_pages_node(node,
20332131 (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE),
....@@ -2050,15 +2148,15 @@
20502148 /* anon mapping, we can simply copy page->mapping to the new page: */
20512149 new_page->mapping = page->mapping;
20522150 new_page->index = page->index;
2151
+ /* flush the cache before copying using the kernel virtual address */
2152
+ flush_cache_range(vma, start, start + HPAGE_PMD_SIZE);
20532153 migrate_page_copy(new_page, page);
20542154 WARN_ON(PageLRU(new_page));
20552155
20562156 /* Recheck the target PMD */
2057
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
20582157 ptl = pmd_lock(mm, pmd);
20592158 if (unlikely(!pmd_same(*pmd, entry) || !page_ref_freeze(page, 2))) {
20602159 spin_unlock(ptl);
2061
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
20622160
20632161 /* Reverse changes made by migrate_page_copy() */
20642162 if (TestClearPageActive(new_page))
....@@ -2089,8 +2187,7 @@
20892187 * new page and page_add_new_anon_rmap guarantee the copy is
20902188 * visible before the pagetable update.
20912189 */
2092
- flush_cache_range(vma, mmun_start, mmun_end);
2093
- page_add_anon_rmap(new_page, vma, mmun_start, true);
2190
+ page_add_anon_rmap(new_page, vma, start, true);
20942191 /*
20952192 * At this point the pmd is numa/protnone (i.e. non present) and the TLB
20962193 * has already been flushed globally. So no TLB can be currently
....@@ -2098,11 +2195,11 @@
20982195 * pmd before doing set_pmd_at(), nor to flush the TLB after
20992196 * set_pmd_at(). Clearing the pmd here would introduce a race
21002197 * condition against MADV_DONTNEED, because MADV_DONTNEED only holds the
2101
- * mmap_sem for reading. If the pmd is set to NULL at any given time,
2198
+ * mmap_lock for reading. If the pmd is set to NULL at any given time,
21022199 * MADV_DONTNEED won't wait on the pmd lock and it'll skip clearing this
21032200 * pmd.
21042201 */
2105
- set_pmd_at(mm, mmun_start, pmd, entry);
2202
+ set_pmd_at(mm, start, pmd, entry);
21062203 update_mmu_cache_pmd(vma, address, &entry);
21072204
21082205 page_ref_unfreeze(page, 2);
....@@ -2111,11 +2208,6 @@
21112208 set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED);
21122209
21132210 spin_unlock(ptl);
2114
- /*
2115
- * No need to double call mmu_notifier->invalidate_range() callback as
2116
- * the above pmdp_huge_clear_flush_notify() did already call it.
2117
- */
2118
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
21192211
21202212 /* Take an "isolate" reference and put new page on the LRU. */
21212213 get_page(new_page);
....@@ -2139,7 +2231,7 @@
21392231 ptl = pmd_lock(mm, pmd);
21402232 if (pmd_same(*pmd, entry)) {
21412233 entry = pmd_modify(entry, vma->vm_page_prot);
2142
- set_pmd_at(mm, mmun_start, pmd, entry);
2234
+ set_pmd_at(mm, start, pmd, entry);
21432235 update_mmu_cache_pmd(vma, address, &entry);
21442236 }
21452237 spin_unlock(ptl);
....@@ -2153,25 +2245,26 @@
21532245
21542246 #endif /* CONFIG_NUMA */
21552247
2156
-#if defined(CONFIG_MIGRATE_VMA_HELPER)
2157
-struct migrate_vma {
2158
- struct vm_area_struct *vma;
2159
- unsigned long *dst;
2160
- unsigned long *src;
2161
- unsigned long cpages;
2162
- unsigned long npages;
2163
- unsigned long start;
2164
- unsigned long end;
2165
-};
2166
-
2248
+#ifdef CONFIG_DEVICE_PRIVATE
21672249 static int migrate_vma_collect_hole(unsigned long start,
21682250 unsigned long end,
2251
+ __always_unused int depth,
21692252 struct mm_walk *walk)
21702253 {
21712254 struct migrate_vma *migrate = walk->private;
21722255 unsigned long addr;
21732256
2174
- for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
2257
+ /* Only allow populating anonymous memory. */
2258
+ if (!vma_is_anonymous(walk->vma)) {
2259
+ for (addr = start; addr < end; addr += PAGE_SIZE) {
2260
+ migrate->src[migrate->npages] = 0;
2261
+ migrate->dst[migrate->npages] = 0;
2262
+ migrate->npages++;
2263
+ }
2264
+ return 0;
2265
+ }
2266
+
2267
+ for (addr = start; addr < end; addr += PAGE_SIZE) {
21752268 migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
21762269 migrate->dst[migrate->npages] = 0;
21772270 migrate->npages++;
....@@ -2188,7 +2281,7 @@
21882281 struct migrate_vma *migrate = walk->private;
21892282 unsigned long addr;
21902283
2191
- for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
2284
+ for (addr = start; addr < end; addr += PAGE_SIZE) {
21922285 migrate->dst[migrate->npages] = 0;
21932286 migrate->src[migrate->npages++] = 0;
21942287 }
....@@ -2210,7 +2303,7 @@
22102303
22112304 again:
22122305 if (pmd_none(*pmdp))
2213
- return migrate_vma_collect_hole(start, end, walk);
2306
+ return migrate_vma_collect_hole(start, end, -1, walk);
22142307
22152308 if (pmd_trans_huge(*pmdp)) {
22162309 struct page *page;
....@@ -2243,7 +2336,7 @@
22432336 return migrate_vma_collect_skip(start, end,
22442337 walk);
22452338 if (pmd_none(*pmdp))
2246
- return migrate_vma_collect_hole(start, end,
2339
+ return migrate_vma_collect_hole(start, end, -1,
22472340 walk);
22482341 }
22492342 }
....@@ -2255,24 +2348,22 @@
22552348 arch_enter_lazy_mmu_mode();
22562349
22572350 for (; addr < end; addr += PAGE_SIZE, ptep++) {
2258
- unsigned long mpfn, pfn;
2351
+ unsigned long mpfn = 0, pfn;
22592352 struct page *page;
22602353 swp_entry_t entry;
22612354 pte_t pte;
22622355
22632356 pte = *ptep;
2264
- pfn = pte_pfn(pte);
22652357
22662358 if (pte_none(pte)) {
2267
- mpfn = MIGRATE_PFN_MIGRATE;
2268
- migrate->cpages++;
2269
- pfn = 0;
2359
+ if (vma_is_anonymous(vma)) {
2360
+ mpfn = MIGRATE_PFN_MIGRATE;
2361
+ migrate->cpages++;
2362
+ }
22702363 goto next;
22712364 }
22722365
22732366 if (!pte_present(pte)) {
2274
- mpfn = pfn = 0;
2275
-
22762367 /*
22772368 * Only care about unaddressable device page special
22782369 * page table entry. Other special swap entries are not
....@@ -2283,28 +2374,34 @@
22832374 goto next;
22842375
22852376 page = device_private_entry_to_page(entry);
2286
- mpfn = migrate_pfn(page_to_pfn(page))|
2287
- MIGRATE_PFN_DEVICE | MIGRATE_PFN_MIGRATE;
2377
+ if (!(migrate->flags &
2378
+ MIGRATE_VMA_SELECT_DEVICE_PRIVATE) ||
2379
+ page->pgmap->owner != migrate->pgmap_owner)
2380
+ goto next;
2381
+
2382
+ mpfn = migrate_pfn(page_to_pfn(page)) |
2383
+ MIGRATE_PFN_MIGRATE;
22882384 if (is_write_device_private_entry(entry))
22892385 mpfn |= MIGRATE_PFN_WRITE;
22902386 } else {
2387
+ if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
2388
+ goto next;
2389
+ pfn = pte_pfn(pte);
22912390 if (is_zero_pfn(pfn)) {
22922391 mpfn = MIGRATE_PFN_MIGRATE;
22932392 migrate->cpages++;
2294
- pfn = 0;
22952393 goto next;
22962394 }
2297
- page = _vm_normal_page(migrate->vma, addr, pte, true);
2395
+ page = vm_normal_page(migrate->vma, addr, pte);
22982396 mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
22992397 mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
23002398 }
23012399
23022400 /* FIXME support THP */
23032401 if (!page || !page->mapping || PageTransCompound(page)) {
2304
- mpfn = pfn = 0;
2402
+ mpfn = 0;
23052403 goto next;
23062404 }
2307
- pfn = page_to_pfn(page);
23082405
23092406 /*
23102407 * By getting a reference on the page we pin it and that blocks
....@@ -2333,8 +2430,17 @@
23332430 entry = make_migration_entry(page, mpfn &
23342431 MIGRATE_PFN_WRITE);
23352432 swp_pte = swp_entry_to_pte(entry);
2336
- if (pte_soft_dirty(pte))
2337
- swp_pte = pte_swp_mksoft_dirty(swp_pte);
2433
+ if (pte_present(pte)) {
2434
+ if (pte_soft_dirty(pte))
2435
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
2436
+ if (pte_uffd_wp(pte))
2437
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
2438
+ } else {
2439
+ if (pte_swp_soft_dirty(pte))
2440
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
2441
+ if (pte_swp_uffd_wp(pte))
2442
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
2443
+ }
23382444 set_pte_at(mm, addr, ptep, swp_pte);
23392445
23402446 /*
....@@ -2353,15 +2459,21 @@
23532459 migrate->dst[migrate->npages] = 0;
23542460 migrate->src[migrate->npages++] = mpfn;
23552461 }
2356
- arch_leave_lazy_mmu_mode();
2357
- pte_unmap_unlock(ptep - 1, ptl);
23582462
23592463 /* Only flush the TLB if we actually modified any entries */
23602464 if (unmapped)
23612465 flush_tlb_range(walk->vma, start, end);
23622466
2467
+ arch_leave_lazy_mmu_mode();
2468
+ pte_unmap_unlock(ptep - 1, ptl);
2469
+
23632470 return 0;
23642471 }
2472
+
2473
+static const struct mm_walk_ops migrate_vma_walk_ops = {
2474
+ .pmd_entry = migrate_vma_collect_pmd,
2475
+ .pte_hole = migrate_vma_collect_hole,
2476
+};
23652477
23662478 /*
23672479 * migrate_vma_collect() - collect pages over a range of virtual addresses
....@@ -2373,22 +2485,22 @@
23732485 */
23742486 static void migrate_vma_collect(struct migrate_vma *migrate)
23752487 {
2376
- struct mm_walk mm_walk = {
2377
- .pmd_entry = migrate_vma_collect_pmd,
2378
- .pte_hole = migrate_vma_collect_hole,
2379
- .vma = migrate->vma,
2380
- .mm = migrate->vma->vm_mm,
2381
- .private = migrate,
2382
- };
2488
+ struct mmu_notifier_range range;
23832489
2384
- mmu_notifier_invalidate_range_start(mm_walk.mm,
2385
- migrate->start,
2386
- migrate->end);
2387
- walk_page_range(migrate->start, migrate->end, &mm_walk);
2388
- mmu_notifier_invalidate_range_end(mm_walk.mm,
2389
- migrate->start,
2390
- migrate->end);
2490
+ /*
2491
+ * Note that the pgmap_owner is passed to the mmu notifier callback so
2492
+ * that the registered device driver can skip invalidating device
2493
+ * private page mappings that won't be migrated.
2494
+ */
2495
+ mmu_notifier_range_init_migrate(&range, 0, migrate->vma,
2496
+ migrate->vma->vm_mm, migrate->start, migrate->end,
2497
+ migrate->pgmap_owner);
2498
+ mmu_notifier_invalidate_range_start(&range);
23912499
2500
+ walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end,
2501
+ &migrate_vma_walk_ops, migrate);
2502
+
2503
+ mmu_notifier_invalidate_range_end(&range);
23922504 migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
23932505 }
23942506
....@@ -2432,16 +2544,7 @@
24322544 * FIXME proper solution is to rework migration_entry_wait() so
24332545 * it does not need to take a reference on page.
24342546 */
2435
- if (is_device_private_page(page))
2436
- return true;
2437
-
2438
- /*
2439
- * Only allow device public page to be migrated and account for
2440
- * the extra reference count imply by ZONE_DEVICE pages.
2441
- */
2442
- if (!is_device_public_page(page))
2443
- return false;
2444
- extra++;
2547
+ return is_device_private_page(page);
24452548 }
24462549
24472550 /* For file back page */
....@@ -2575,7 +2678,7 @@
25752678 */
25762679 static void migrate_vma_unmap(struct migrate_vma *migrate)
25772680 {
2578
- int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
2681
+ int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK;
25792682 const unsigned long npages = migrate->npages;
25802683 const unsigned long start = migrate->start;
25812684 unsigned long addr, i, restore = 0;
....@@ -2620,6 +2723,118 @@
26202723 }
26212724 }
26222725
2726
+/**
2727
+ * migrate_vma_setup() - prepare to migrate a range of memory
2728
+ * @args: contains the vma, start, and pfns arrays for the migration
2729
+ *
2730
+ * Returns: negative errno on failures, 0 when 0 or more pages were migrated
2731
+ * without an error.
2732
+ *
2733
+ * Prepare to migrate a range of memory virtual address range by collecting all
2734
+ * the pages backing each virtual address in the range, saving them inside the
2735
+ * src array. Then lock those pages and unmap them. Once the pages are locked
2736
+ * and unmapped, check whether each page is pinned or not. Pages that aren't
2737
+ * pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) in the
2738
+ * corresponding src array entry. Then restores any pages that are pinned, by
2739
+ * remapping and unlocking those pages.
2740
+ *
2741
+ * The caller should then allocate destination memory and copy source memory to
2742
+ * it for all those entries (ie with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE
2743
+ * flag set). Once these are allocated and copied, the caller must update each
2744
+ * corresponding entry in the dst array with the pfn value of the destination
2745
+ * page and with the MIGRATE_PFN_VALID and MIGRATE_PFN_LOCKED flags set
2746
+ * (destination pages must have their struct pages locked, via lock_page()).
2747
+ *
2748
+ * Note that the caller does not have to migrate all the pages that are marked
2749
+ * with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration from
2750
+ * device memory to system memory. If the caller cannot migrate a device page
2751
+ * back to system memory, then it must return VM_FAULT_SIGBUS, which has severe
2752
+ * consequences for the userspace process, so it must be avoided if at all
2753
+ * possible.
2754
+ *
2755
+ * For empty entries inside CPU page table (pte_none() or pmd_none() is true) we
2756
+ * do set MIGRATE_PFN_MIGRATE flag inside the corresponding source array thus
2757
+ * allowing the caller to allocate device memory for those unback virtual
2758
+ * address. For this the caller simply has to allocate device memory and
2759
+ * properly set the destination entry like for regular migration. Note that
2760
+ * this can still fail and thus the device driver must check if the
2761
+ * migration was successful for those entries after calling migrate_vma_pages()
2762
+ * just like for regular migration.
2763
+ *
2764
+ * After that, the caller must call migrate_vma_pages() to go over each entry
2765
+ * in the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag
2766
+ * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set,
2767
+ * then migrate_vma_pages() migrates the struct page information from the source
2768
+ * struct page to the destination struct page. If it fails to migrate the
2769
+ * struct page information, then it clears the MIGRATE_PFN_MIGRATE flag in the
2770
+ * src array.
2771
+ *
2772
+ * At this point all successfully migrated pages have an entry in the src
2773
+ * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst
2774
+ * array entry with MIGRATE_PFN_VALID flag set.
2775
+ *
2776
+ * Once migrate_vma_pages() returns the caller may inspect which pages were
2777
+ * successfully migrated, and which were not. Successfully migrated pages will
2778
+ * have the MIGRATE_PFN_MIGRATE flag set for their src array entry.
2779
+ *
2780
+ * It is safe to update device page table after migrate_vma_pages() because
2781
+ * both destination and source page are still locked, and the mmap_lock is held
2782
+ * in read mode (hence no one can unmap the range being migrated).
2783
+ *
2784
+ * Once the caller is done cleaning things up and updating its page table (if it
2785
+ * chose to do so; this is not an obligation), it finally calls
2786
+ * migrate_vma_finalize() to update the CPU page table to point to new pages
2787
+ * for successfully migrated pages or otherwise restore the CPU page table to
2788
+ * point to the original source pages.
2789
+ */
2790
+int migrate_vma_setup(struct migrate_vma *args)
2791
+{
2792
+ long nr_pages = (args->end - args->start) >> PAGE_SHIFT;
2793
+
2794
+ args->start &= PAGE_MASK;
2795
+ args->end &= PAGE_MASK;
2796
+ if (!args->vma || is_vm_hugetlb_page(args->vma) ||
2797
+ (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma))
2798
+ return -EINVAL;
2799
+ if (nr_pages <= 0)
2800
+ return -EINVAL;
2801
+ if (args->start < args->vma->vm_start ||
2802
+ args->start >= args->vma->vm_end)
2803
+ return -EINVAL;
2804
+ if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end)
2805
+ return -EINVAL;
2806
+ if (!args->src || !args->dst)
2807
+ return -EINVAL;
2808
+
2809
+ memset(args->src, 0, sizeof(*args->src) * nr_pages);
2810
+ args->cpages = 0;
2811
+ args->npages = 0;
2812
+
2813
+ migrate_vma_collect(args);
2814
+
2815
+ if (args->cpages)
2816
+ migrate_vma_prepare(args);
2817
+ if (args->cpages)
2818
+ migrate_vma_unmap(args);
2819
+
2820
+ /*
2821
+ * At this point pages are locked and unmapped, and thus they have
2822
+ * stable content and can safely be copied to destination memory that
2823
+ * is allocated by the drivers.
2824
+ */
2825
+ return 0;
2826
+
2827
+}
2828
+EXPORT_SYMBOL(migrate_vma_setup);
2829
+
2830
+/*
2831
+ * This code closely matches the code in:
2832
+ * __handle_mm_fault()
2833
+ * handle_pte_fault()
2834
+ * do_anonymous_page()
2835
+ * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE
2836
+ * private page.
2837
+ */
26232838 static void migrate_vma_insert_page(struct migrate_vma *migrate,
26242839 unsigned long addr,
26252840 struct page *page,
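To make the workflow described in the migrate_vma_setup() kernel-doc above concrete, here is a minimal caller-side sketch. It is illustrative only: alloc_dst_page() and dev_commit_entry() stand in for driver-specific hooks, migrate_pfn() is assumed to be the pfn-encoding helper from <linux/migrate.h>, and depending on the kernel version further struct migrate_vma fields (for example pgmap_owner) may have to be filled in.

/* Sketch of a driver using the exported three-step migration API. */
#include <linux/migrate.h>
#include <linux/mm.h>

/* Hypothetical driver hooks -- not part of the kernel API. */
static struct page *alloc_dst_page(void);
static void dev_commit_entry(unsigned long i, unsigned long dst_entry);

static int example_migrate_range(struct vm_area_struct *vma,
				 unsigned long start, unsigned long end,
				 unsigned long *src, unsigned long *dst)
{
	struct migrate_vma args = {
		.vma	= vma,
		.start	= start,
		.end	= end,
		.src	= src,
		.dst	= dst,
	};
	unsigned long i;
	int ret;

	ret = migrate_vma_setup(&args);
	if (ret)
		return ret;

	/* Allocate and fill a destination page for every migratable entry. */
	for (i = 0; i < args.npages; i++) {
		struct page *dpage;

		if (!(args.src[i] & MIGRATE_PFN_MIGRATE))
			continue;
		dpage = alloc_dst_page();		/* driver-specific */
		if (!dpage)
			continue;			/* entry is simply restored */
		lock_page(dpage);
		/* ... copy the source data into dpage here ... */
		args.dst[i] = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
	}

	migrate_vma_pages(&args);

	/* Only entries still flagged MIGRATE_PFN_MIGRATE actually moved. */
	for (i = 0; i < args.npages; i++)
		if (args.src[i] & MIGRATE_PFN_MIGRATE)
			dev_commit_entry(i, args.dst[i]);	/* driver-specific */

	migrate_vma_finalize(&args);
	return 0;
}

Entries that are skipped (no destination page supplied) keep no MIGRATE_PFN_VALID bit in dst, so migrate_vma_pages() clears their MIGRATE_PFN_MIGRATE flag and migrate_vma_finalize() restores the original mapping, as the kernel-doc above describes.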
....@@ -2628,7 +2843,6 @@
26282843 {
26292844 struct vm_area_struct *vma = migrate->vma;
26302845 struct mm_struct *mm = vma->vm_mm;
2631
- struct mem_cgroup *memcg;
26322846 bool flush = false;
26332847 spinlock_t *ptl;
26342848 pte_t entry;
....@@ -2661,12 +2875,12 @@
26612875 * pte_offset_map() on pmds where a huge pmd might be created
26622876 * from a different thread.
26632877 *
2664
- * pte_alloc_map() is safe to use under down_write(mmap_sem) or when
2878
+ * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
26652879 * parallel threads are excluded by other means.
26662880 *
2667
- * Here we only have down_read(mmap_sem).
2881
+ * Here we only have mmap_read_lock(mm).
26682882 */
2669
- if (pte_alloc(mm, pmdp, addr))
2883
+ if (pte_alloc(mm, pmdp))
26702884 goto abort;
26712885
26722886 /* See the comment in pte_alloc_one_map() */
....@@ -2675,7 +2889,7 @@
26752889
26762890 if (unlikely(anon_vma_prepare(vma)))
26772891 goto abort;
2678
- if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false))
2892
+ if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
26792893 goto abort;
26802894
26812895 /*
....@@ -2691,11 +2905,13 @@
26912905
26922906 swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE);
26932907 entry = swp_entry_to_pte(swp_entry);
2694
- } else if (is_device_public_page(page)) {
2695
- entry = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
2696
- if (vma->vm_flags & VM_WRITE)
2697
- entry = pte_mkwrite(pte_mkdirty(entry));
2698
- entry = pte_mkdevmap(entry);
2908
+ } else {
2909
+ /*
2910
+ * For now we only support migrating to un-addressable
2911
+ * device memory.
2912
+ */
2913
+ pr_warn_once("Unsupported ZONE_DEVICE page type.\n");
2914
+ goto abort;
26992915 }
27002916 } else {
27012917 entry = mk_pte(page, vma->vm_page_prot);
....@@ -2705,36 +2921,29 @@
27052921
27062922 ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
27072923
2924
+ if (check_stable_address_space(mm))
2925
+ goto unlock_abort;
2926
+
27082927 if (pte_present(*ptep)) {
27092928 unsigned long pfn = pte_pfn(*ptep);
27102929
2711
- if (!is_zero_pfn(pfn)) {
2712
- pte_unmap_unlock(ptep, ptl);
2713
- mem_cgroup_cancel_charge(page, memcg, false);
2714
- goto abort;
2715
- }
2930
+ if (!is_zero_pfn(pfn))
2931
+ goto unlock_abort;
27162932 flush = true;
2717
- } else if (!pte_none(*ptep)) {
2718
- pte_unmap_unlock(ptep, ptl);
2719
- mem_cgroup_cancel_charge(page, memcg, false);
2720
- goto abort;
2721
- }
2933
+ } else if (!pte_none(*ptep))
2934
+ goto unlock_abort;
27222935
27232936 /*
2724
- * Check for usefaultfd but do not deliver the fault. Instead,
2937
+ * Check for userfaultfd but do not deliver the fault. Instead,
27252938 * just back off.
27262939 */
2727
- if (userfaultfd_missing(vma)) {
2728
- pte_unmap_unlock(ptep, ptl);
2729
- mem_cgroup_cancel_charge(page, memcg, false);
2730
- goto abort;
2731
- }
2940
+ if (userfaultfd_missing(vma))
2941
+ goto unlock_abort;
27322942
27332943 inc_mm_counter(mm, MM_ANONPAGES);
27342944 page_add_new_anon_rmap(page, vma, addr, false);
2735
- mem_cgroup_commit_charge(page, memcg, false, false);
27362945 if (!is_zone_device_page(page))
2737
- lru_cache_add_active_or_unevictable(page, vma);
2946
+ lru_cache_add_inactive_or_unevictable(page, vma);
27382947 get_page(page);
27392948
27402949 if (flush) {
....@@ -2752,11 +2961,13 @@
27522961 *src = MIGRATE_PFN_MIGRATE;
27532962 return;
27542963
2964
+unlock_abort:
2965
+ pte_unmap_unlock(ptep, ptl);
27552966 abort:
27562967 *src &= ~MIGRATE_PFN_MIGRATE;
27572968 }
27582969
2759
-/*
2970
+/**
27602971 * migrate_vma_pages() - migrate meta-data from src page to dst page
27612972 * @migrate: migrate struct containing all migration information
27622973 *
....@@ -2764,13 +2975,12 @@
27642975 * struct page. This effectively finishes the migration from source page to the
27652976 * destination page.
27662977 */
2767
-static void migrate_vma_pages(struct migrate_vma *migrate)
2978
+void migrate_vma_pages(struct migrate_vma *migrate)
27682979 {
27692980 const unsigned long npages = migrate->npages;
27702981 const unsigned long start = migrate->start;
2771
- struct vm_area_struct *vma = migrate->vma;
2772
- struct mm_struct *mm = vma->vm_mm;
2773
- unsigned long addr, i, mmu_start;
2982
+ struct mmu_notifier_range range;
2983
+ unsigned long addr, i;
27742984 bool notified = false;
27752985
27762986 for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) {
....@@ -2785,15 +2995,17 @@
27852995 }
27862996
27872997 if (!page) {
2788
- if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) {
2998
+ if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE))
27892999 continue;
2790
- }
27913000 if (!notified) {
2792
- mmu_start = addr;
27933001 notified = true;
2794
- mmu_notifier_invalidate_range_start(mm,
2795
- mmu_start,
2796
- migrate->end);
3002
+
3003
+ mmu_notifier_range_init(&range,
3004
+ MMU_NOTIFY_CLEAR, 0,
3005
+ NULL,
3006
+ migrate->vma->vm_mm,
3007
+ addr, migrate->end);
3008
+ mmu_notifier_invalidate_range_start(&range);
27973009 }
27983010 migrate_vma_insert_page(migrate, addr, newpage,
27993011 &migrate->src[i],
....@@ -2813,7 +3025,7 @@
28133025 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
28143026 continue;
28153027 }
2816
- } else if (!is_device_public_page(newpage)) {
3028
+ } else {
28173029 /*
28183030 * Other types of ZONE_DEVICE page are not
28193031 * supported.
....@@ -2834,11 +3046,11 @@
28343046 * did already call it.
28353047 */
28363048 if (notified)
2837
- mmu_notifier_invalidate_range_only_end(mm, mmu_start,
2838
- migrate->end);
3049
+ mmu_notifier_invalidate_range_only_end(&range);
28393050 }
3051
+EXPORT_SYMBOL(migrate_vma_pages);
28403052
2841
-/*
3053
+/**
28423054 * migrate_vma_finalize() - restore CPU page table entry
28433055 * @migrate: migrate struct containing all migration information
28443056 *
....@@ -2849,7 +3061,7 @@
28493061 * This also unlocks the pages and puts them back on the lru, or drops the extra
28503062 * refcount, for device pages.
28513063 */
2852
-static void migrate_vma_finalize(struct migrate_vma *migrate)
3064
+void migrate_vma_finalize(struct migrate_vma *migrate)
28533065 {
28543066 const unsigned long npages = migrate->npages;
28553067 unsigned long i;
....@@ -2876,7 +3088,6 @@
28763088
28773089 remove_migration_ptes(page, newpage, false);
28783090 unlock_page(page);
2879
- migrate->cpages--;
28803091
28813092 if (is_zone_device_page(page))
28823093 put_page(page);
....@@ -2892,124 +3103,5 @@
28923103 }
28933104 }
28943105 }
2895
-
2896
-/*
2897
- * migrate_vma() - migrate a range of memory inside vma
2898
- *
2899
- * @ops: migration callback for allocating destination memory and copying
2900
- * @vma: virtual memory area containing the range to be migrated
2901
- * @start: start address of the range to migrate (inclusive)
2902
- * @end: end address of the range to migrate (exclusive)
2903
- * @src: array of hmm_pfn_t containing source pfns
2904
- * @dst: array of hmm_pfn_t containing destination pfns
2905
- * @private: pointer passed back to each of the callback
2906
- * Returns: 0 on success, error code otherwise
2907
- *
2908
- * This function tries to migrate a range of memory virtual address range, using
2909
- * callbacks to allocate and copy memory from source to destination. First it
2910
- * collects all the pages backing each virtual address in the range, saving this
2911
- * inside the src array. Then it locks those pages and unmaps them. Once the pages
2912
- * are locked and unmapped, it checks whether each page is pinned or not. Pages
2913
- * that aren't pinned have the MIGRATE_PFN_MIGRATE flag set (by this function)
2914
- * in the corresponding src array entry. It then restores any pages that are
2915
- * pinned, by remapping and unlocking those pages.
2916
- *
2917
- * At this point it calls the alloc_and_copy() callback. For documentation on
2918
- * what is expected from that callback, see struct migrate_vma_ops comments in
2919
- * include/linux/migrate.h
2920
- *
2921
- * After the alloc_and_copy() callback, this function goes over each entry in
2922
- * the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag
2923
- * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set,
2924
- * then the function tries to migrate struct page information from the source
2925
- * struct page to the destination struct page. If it fails to migrate the struct
2926
- * page information, then it clears the MIGRATE_PFN_MIGRATE flag in the src
2927
- * array.
2928
- *
2929
- * At this point all successfully migrated pages have an entry in the src
2930
- * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst
2931
- * array entry with MIGRATE_PFN_VALID flag set.
2932
- *
2933
- * It then calls the finalize_and_map() callback. See comments for "struct
2934
- * migrate_vma_ops", in include/linux/migrate.h for details about
2935
- * finalize_and_map() behavior.
2936
- *
2937
- * After the finalize_and_map() callback, for successfully migrated pages, this
2938
- * function updates the CPU page table to point to new pages, otherwise it
2939
- * restores the CPU page table to point to the original source pages.
2940
- *
2941
- * Function returns 0 after the above steps, even if no pages were migrated
2942
- * (The function only returns an error if any of the arguments are invalid.)
2943
- *
2944
- * Both src and dst array must be big enough for (end - start) >> PAGE_SHIFT
2945
- * unsigned long entries.
2946
- */
2947
-int migrate_vma(const struct migrate_vma_ops *ops,
2948
- struct vm_area_struct *vma,
2949
- unsigned long start,
2950
- unsigned long end,
2951
- unsigned long *src,
2952
- unsigned long *dst,
2953
- void *private)
2954
-{
2955
- struct migrate_vma migrate;
2956
-
2957
- /* Sanity check the arguments */
2958
- start &= PAGE_MASK;
2959
- end &= PAGE_MASK;
2960
- if (!vma || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) ||
2961
- vma_is_dax(vma))
2962
- return -EINVAL;
2963
- if (start < vma->vm_start || start >= vma->vm_end)
2964
- return -EINVAL;
2965
- if (end <= vma->vm_start || end > vma->vm_end)
2966
- return -EINVAL;
2967
- if (!ops || !src || !dst || start >= end)
2968
- return -EINVAL;
2969
-
2970
- memset(src, 0, sizeof(*src) * ((end - start) >> PAGE_SHIFT));
2971
- migrate.src = src;
2972
- migrate.dst = dst;
2973
- migrate.start = start;
2974
- migrate.npages = 0;
2975
- migrate.cpages = 0;
2976
- migrate.end = end;
2977
- migrate.vma = vma;
2978
-
2979
- /* Collect, and try to unmap source pages */
2980
- migrate_vma_collect(&migrate);
2981
- if (!migrate.cpages)
2982
- return 0;
2983
-
2984
- /* Lock and isolate page */
2985
- migrate_vma_prepare(&migrate);
2986
- if (!migrate.cpages)
2987
- return 0;
2988
-
2989
- /* Unmap pages */
2990
- migrate_vma_unmap(&migrate);
2991
- if (!migrate.cpages)
2992
- return 0;
2993
-
2994
- /*
2995
- * At this point pages are locked and unmapped, and thus they have
2996
- * stable content and can safely be copied to destination memory that
2997
- * is allocated by the callback.
2998
- *
2999
- * Note that migration can fail in migrate_vma_struct_page() for each
3000
- * individual page.
3001
- */
3002
- ops->alloc_and_copy(vma, src, dst, start, end, private);
3003
-
3004
- /* This does the real migration of struct page */
3005
- migrate_vma_pages(&migrate);
3006
-
3007
- ops->finalize_and_map(vma, src, dst, start, end, private);
3008
-
3009
- /* Unlock and remap pages */
3010
- migrate_vma_finalize(&migrate);
3011
-
3012
- return 0;
3013
-}
3014
-EXPORT_SYMBOL(migrate_vma);
3015
-#endif /* defined(MIGRATE_VMA_HELPER) */
3106
+EXPORT_SYMBOL(migrate_vma_finalize);
3107
+#endif /* CONFIG_DEVICE_PRIVATE */
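The setup kernel-doc earlier in this patch notes that a driver which cannot migrate a device-private page back to system memory must return VM_FAULT_SIGBUS. A heavily simplified, hypothetical single-page sketch of that direction (the kind of handler a dev_pagemap_ops->migrate_to_ram() callback would run) might look as follows; the copy step is driver-specific, and struct migrate_vma fields beyond vma/start/end/src/dst vary by kernel version.

/* Sketch: migrate one device-private page back to system RAM on CPU fault. */
static vm_fault_t example_migrate_to_ram(struct vm_fault *vmf)
{
	unsigned long src = 0, dst = 0;
	struct migrate_vma args = {
		.vma	= vmf->vma,
		.start	= vmf->address & PAGE_MASK,
		.end	= (vmf->address & PAGE_MASK) + PAGE_SIZE,
		.src	= &src,
		.dst	= &dst,
	};
	struct page *dpage;
	vm_fault_t ret = 0;

	if (migrate_vma_setup(&args))
		return VM_FAULT_SIGBUS;

	/* Nothing collected: return and let the fault be retried. */
	if (!(src & MIGRATE_PFN_MIGRATE))
		goto out;

	dpage = alloc_page(GFP_HIGHUSER);
	if (!dpage) {
		/* Cannot copy the data back to system memory. */
		ret = VM_FAULT_SIGBUS;
		goto out;
	}
	lock_page(dpage);
	/* ... driver-specific copy from device memory into dpage ... */
	dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;

	migrate_vma_pages(&args);
out:
	migrate_vma_finalize(&args);
	return ret;
}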