From 244b2c5ca8b14627e4a17755e5922221e121c771 Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Wed, 09 Oct 2024 06:15:07 +0000
Subject: [PATCH] mm/migrate: update page migration implementation
---
kernel/mm/migrate.c | 1275 +++++++++++++++++++++++++++++++--------------------------
 1 file changed, 684 insertions(+), 591 deletions(-)
diff --git a/kernel/mm/migrate.c b/kernel/mm/migrate.c
index e748bef..ac32427 100644
--- a/kernel/mm/migrate.c
+++ b/kernel/mm/migrate.c
@@ -38,6 +38,7 @@
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
#include <linux/gfp.h>
+#include <linux/pagewalk.h>
#include <linux/pfn_t.h>
#include <linux/memremap.h>
#include <linux/userfaultfd_k.h>
@@ -47,39 +48,17 @@
#include <linux/page_owner.h>
#include <linux/sched/mm.h>
#include <linux/ptrace.h>
+#include <linux/oom.h>
#include <asm/tlbflush.h>
#define CREATE_TRACE_POINTS
#include <trace/events/migrate.h>
+#undef CREATE_TRACE_POINTS
+#include <trace/hooks/mm.h>
+#include <trace/hooks/vmscan.h>
#include "internal.h"
-
-/*
- * migrate_prep() needs to be called before we start compiling a list of pages
- * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is
- * undesirable, use migrate_prep_local()
- */
-int migrate_prep(void)
-{
- /*
- * Clear the LRU lists so pages can be isolated.
- * Note that pages may be moved off the LRU after we have
- * drained them. Those pages will fail to migrate like other
- * pages that may be busy.
- */
- lru_add_drain_all();
-
- return 0;
-}
-
-/* Do the necessary work of migrate_prep but not if it involves other CPUs */
-int migrate_prep_local(void)
-{
- lru_add_drain();
-
- return 0;
-}
int isolate_movable_page(struct page *page, isolate_mode_t mode)
{
@@ -100,7 +79,7 @@
/*
* Check PageMovable before holding a PG_lock because page's owner
* assumes anybody doesn't touch PG_lock of newly allocated page
- * so unconditionally grapping the lock ruins page's owner side.
+ * so unconditionally grabbing the lock ruins page's owner side.
*/
if (unlikely(!__PageMovable(page)))
goto out_putpage;
@@ -129,7 +108,7 @@
/* Driver shouldn't use PG_isolated bit of page->flags */
WARN_ON_ONCE(PageIsolated(page));
- __SetPageIsolated(page);
+ SetPageIsolated(page);
unlock_page(page);
return 0;
@@ -153,7 +132,7 @@
mapping = page_mapping(page);
mapping->a_ops->putback_page(page);
- __ClearPageIsolated(page);
+ ClearPageIsolated(page);
}
/*
@@ -162,7 +141,7 @@
*
* This function shall be used whenever the isolated pageset has been
* built from lru, balloon, hugetlbfs page. See isolate_migratepages_range()
- * and isolate_huge_page().
+ * and isolate_hugetlb().
*/
void putback_movable_pages(struct list_head *l)
{
@@ -186,16 +165,17 @@
if (PageMovable(page))
putback_movable_page(page);
else
- __ClearPageIsolated(page);
+ ClearPageIsolated(page);
unlock_page(page);
put_page(page);
} else {
mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
- page_is_file_cache(page), -hpage_nr_pages(page));
+ page_is_file_lru(page), -thp_nr_pages(page));
putback_lru_page(page);
}
}
}
+EXPORT_SYMBOL_GPL(putback_movable_pages);
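
/*
 * Illustrative sketch, not part of this patch: the usage pattern described in
 * the comment above.  A caller that has isolated pages onto a private list and
 * then fails to migrate some of them is expected to hand the leftovers back
 * via putback_movable_pages().  This mirrors do_move_pages_to_node() further
 * down in this file; alloc_migration_target() is the allocation callback added
 * later in this patch.  "example_migrate_isolated_list" is a hypothetical name.
 */
static void example_migrate_isolated_list(struct list_head *pagelist, int nid)
{
	struct migration_target_control mtc = {
		.nid = nid,
		.gfp_mask = GFP_HIGHUSER_MOVABLE,
	};

	if (list_empty(pagelist))
		return;

	/* Any pages still on the list after this call were not migrated. */
	if (migrate_pages(pagelist, alloc_migration_target, NULL,
			  (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL))
		putback_movable_pages(pagelist);
}
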
/*
* Restore a potential migration pte to a working pte entry
@@ -240,15 +220,17 @@
*/
entry = pte_to_swp_entry(*pvmw.pte);
if (is_write_migration_entry(entry))
- pte = maybe_mkwrite(pte, vma);
+ pte = maybe_mkwrite(pte, vma->vm_flags);
+ else if (pte_swp_uffd_wp(*pvmw.pte))
+ pte = pte_mkuffd_wp(pte);
- if (unlikely(is_zone_device_page(new))) {
- if (is_device_private_page(new)) {
- entry = make_device_private_entry(new, pte_write(pte));
- pte = swp_entry_to_pte(entry);
- } else if (is_device_public_page(new)) {
- pte = pte_mkdevmap(pte);
- }
+ if (unlikely(is_device_private_page(new))) {
+ entry = make_device_private_entry(new, pte_write(pte));
+ pte = swp_entry_to_pte(entry);
+ if (pte_swp_soft_dirty(*pvmw.pte))
+ pte = pte_swp_mksoft_dirty(pte);
+ if (pte_swp_uffd_wp(*pvmw.pte))
+ pte = pte_swp_mkuffd_wp(pte);
}
#ifdef CONFIG_HUGETLB_PAGE
@@ -322,19 +304,18 @@
goto out;
page = migration_entry_to_page(entry);
+ page = compound_head(page);
/*
- * Once radix-tree replacement of page migration started, page_count
- * *must* be zero. And, we don't want to call wait_on_page_locked()
- * against a page without get_page().
- * So, we use get_page_unless_zero(), here. Even failed, page fault
- * will occur again.
+ * Once page cache replacement of page migration started, page_count
+ * is zero; but we must not call put_and_wait_on_page_locked() without
+ * a ref. Use get_page_unless_zero(), and just fault again if it fails.
*/
if (!get_page_unless_zero(page))
goto out;
pte_unmap_unlock(ptep, ptl);
- wait_on_page_locked(page);
- put_page(page);
+ trace_android_vh_waiting_for_page_migration(page);
+ put_and_wait_on_page_locked(page);
return;
out:
pte_unmap_unlock(ptep, ptl);
@@ -368,63 +349,27 @@
if (!get_page_unless_zero(page))
goto unlock;
spin_unlock(ptl);
- wait_on_page_locked(page);
- put_page(page);
+ put_and_wait_on_page_locked(page);
return;
unlock:
spin_unlock(ptl);
}
#endif
-#ifdef CONFIG_BLOCK
-/* Returns true if all buffers are successfully locked */
-static bool buffer_migrate_lock_buffers(struct buffer_head *head,
- enum migrate_mode mode)
+static int expected_page_refs(struct address_space *mapping, struct page *page)
{
- struct buffer_head *bh = head;
+ int expected_count = 1;
- /* Simple case, sync compaction */
- if (mode != MIGRATE_ASYNC) {
- do {
- get_bh(bh);
- lock_buffer(bh);
- bh = bh->b_this_page;
+ /*
+ * Device private pages have an extra refcount as they are
+ * ZONE_DEVICE pages.
+ */
+ expected_count += is_device_private_page(page);
+ if (mapping)
+ expected_count += thp_nr_pages(page) + page_has_private(page);
- } while (bh != head);
-
- return true;
- }
-
- /* async case, we cannot block on lock_buffer so use trylock_buffer */
- do {
- get_bh(bh);
- if (!trylock_buffer(bh)) {
- /*
- * We failed to lock the buffer and cannot stall in
- * async migration. Release the taken locks
- */
- struct buffer_head *failed_bh = bh;
- put_bh(failed_bh);
- bh = head;
- while (bh != failed_bh) {
- unlock_buffer(bh);
- put_bh(bh);
- bh = bh->b_this_page;
- }
- return false;
- }
-
- bh = bh->b_this_page;
- } while (bh != head);
- return true;
+ return expected_count;
}
-#else
-static inline bool buffer_migrate_lock_buffers(struct buffer_head *head,
- enum migrate_mode mode)
-{
- return true;
-}
-#endif /* CONFIG_BLOCK */
/*
* Replace the page in the mapping.
@@ -435,21 +380,13 @@
* 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
*/
int migrate_page_move_mapping(struct address_space *mapping,
- struct page *newpage, struct page *page,
- struct buffer_head *head, enum migrate_mode mode,
- int extra_count)
+ struct page *newpage, struct page *page, int extra_count)
{
+ XA_STATE(xas, &mapping->i_pages, page_index(page));
struct zone *oldzone, *newzone;
int dirty;
- int expected_count = 1 + extra_count;
- void **pslot;
-
- /*
- * Device public or private pages have an extra refcount as they are
- * ZONE_DEVICE pages.
- */
- expected_count += is_device_private_page(page);
- expected_count += is_device_public_page(page);
+ int expected_count = expected_page_refs(mapping, page) + extra_count;
+ int nr = thp_nr_pages(page);
if (!mapping) {
/* Anonymous page without mapping */
@@ -468,35 +405,14 @@
oldzone = page_zone(page);
newzone = page_zone(newpage);
- xa_lock_irq(&mapping->i_pages);
-
- pslot = radix_tree_lookup_slot(&mapping->i_pages,
- page_index(page));
-
- expected_count += hpage_nr_pages(page) + page_has_private(page);
- if (page_count(page) != expected_count ||
- radix_tree_deref_slot_protected(pslot,
- &mapping->i_pages.xa_lock) != page) {
- xa_unlock_irq(&mapping->i_pages);
+ xas_lock_irq(&xas);
+ if (page_count(page) != expected_count || xas_load(&xas) != page) {
+ xas_unlock_irq(&xas);
return -EAGAIN;
}
if (!page_ref_freeze(page, expected_count)) {
- xa_unlock_irq(&mapping->i_pages);
- return -EAGAIN;
- }
-
- /*
- * In the async migration case of moving a page with buffers, lock the
- * buffers using trylock before the mapping is moved. If the mapping
- * was moved, we later failed to lock the buffers and could not move
- * the mapping back due to an elevated page count, we would have to
- * block waiting on other references to be dropped.
- */
- if (mode == MIGRATE_ASYNC && head &&
- !buffer_migrate_lock_buffers(head, mode)) {
- page_ref_unfreeze(page, expected_count);
- xa_unlock_irq(&mapping->i_pages);
+ xas_unlock_irq(&xas);
return -EAGAIN;
}
@@ -506,7 +422,7 @@
*/
newpage->index = page->index;
newpage->mapping = page->mapping;
- page_ref_add(newpage, hpage_nr_pages(page)); /* add cache reference */
+ page_ref_add(newpage, nr); /* add cache reference */
if (PageSwapBacked(page)) {
__SetPageSwapBacked(newpage);
if (PageSwapCache(page)) {
@@ -524,16 +440,13 @@
SetPageDirty(newpage);
}
- radix_tree_replace_slot(&mapping->i_pages, pslot, newpage);
+ xas_store(&xas, newpage);
if (PageTransHuge(page)) {
int i;
- int index = page_index(page);
- for (i = 1; i < HPAGE_PMD_NR; i++) {
- pslot = radix_tree_lookup_slot(&mapping->i_pages,
- index + i);
- radix_tree_replace_slot(&mapping->i_pages, pslot,
- newpage + i);
+ for (i = 1; i < nr; i++) {
+ xas_next(&xas);
+ xas_store(&xas, newpage);
}
}
@@ -542,9 +455,9 @@
* to one less reference.
* We know this isn't the last reference.
*/
- page_ref_unfreeze(page, expected_count - hpage_nr_pages(page));
+ page_ref_unfreeze(page, expected_count - nr);
- xa_unlock(&mapping->i_pages);
+ xas_unlock(&xas);
/* Leave irq disabled to prevent preemption while updating stats */
/*
@@ -558,17 +471,24 @@
* are mapped to swap space.
*/
if (newzone != oldzone) {
- __dec_node_state(oldzone->zone_pgdat, NR_FILE_PAGES);
- __inc_node_state(newzone->zone_pgdat, NR_FILE_PAGES);
+ struct lruvec *old_lruvec, *new_lruvec;
+ struct mem_cgroup *memcg;
+
+ memcg = page_memcg(page);
+ old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat);
+ new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat);
+
+ __mod_lruvec_state(old_lruvec, NR_FILE_PAGES, -nr);
+ __mod_lruvec_state(new_lruvec, NR_FILE_PAGES, nr);
if (PageSwapBacked(page) && !PageSwapCache(page)) {
- __dec_node_state(oldzone->zone_pgdat, NR_SHMEM);
- __inc_node_state(newzone->zone_pgdat, NR_SHMEM);
+ __mod_lruvec_state(old_lruvec, NR_SHMEM, -nr);
+ __mod_lruvec_state(new_lruvec, NR_SHMEM, nr);
}
- if (dirty && mapping_cap_account_dirty(mapping)) {
- __dec_node_state(oldzone->zone_pgdat, NR_FILE_DIRTY);
- __dec_zone_state(oldzone, NR_ZONE_WRITE_PENDING);
- __inc_node_state(newzone->zone_pgdat, NR_FILE_DIRTY);
- __inc_zone_state(newzone, NR_ZONE_WRITE_PENDING);
+ if (dirty && mapping_can_writeback(mapping)) {
+ __mod_lruvec_state(old_lruvec, NR_FILE_DIRTY, -nr);
+ __mod_zone_page_state(oldzone, NR_ZONE_WRITE_PENDING, -nr);
+ __mod_lruvec_state(new_lruvec, NR_FILE_DIRTY, nr);
+ __mod_zone_page_state(newzone, NR_ZONE_WRITE_PENDING, nr);
}
}
local_irq_enable();
@@ -584,22 +504,18 @@
int migrate_huge_page_move_mapping(struct address_space *mapping,
struct page *newpage, struct page *page)
{
+ XA_STATE(xas, &mapping->i_pages, page_index(page));
int expected_count;
- void **pslot;
- xa_lock_irq(&mapping->i_pages);
-
- pslot = radix_tree_lookup_slot(&mapping->i_pages, page_index(page));
-
+ xas_lock_irq(&xas);
expected_count = 2 + page_has_private(page);
- if (page_count(page) != expected_count ||
- radix_tree_deref_slot_protected(pslot, &mapping->i_pages.xa_lock) != page) {
- xa_unlock_irq(&mapping->i_pages);
+ if (page_count(page) != expected_count || xas_load(&xas) != page) {
+ xas_unlock_irq(&xas);
return -EAGAIN;
}
if (!page_ref_freeze(page, expected_count)) {
- xa_unlock_irq(&mapping->i_pages);
+ xas_unlock_irq(&xas);
return -EAGAIN;
}
@@ -608,11 +524,11 @@
get_page(newpage);
- radix_tree_replace_slot(&mapping->i_pages, pslot, newpage);
+ xas_store(&xas, newpage);
page_ref_unfreeze(page, expected_count - 1);
- xa_unlock_irq(&mapping->i_pages);
+ xas_unlock_irq(&xas);
return MIGRATEPAGE_SUCCESS;
}
@@ -656,7 +572,7 @@
} else {
/* thp page */
BUG_ON(!PageTransHuge(src));
- nr_pages = hpage_nr_pages(src);
+ nr_pages = thp_nr_pages(src);
}
for (i = 0; i < nr_pages; i++) {
@@ -671,6 +587,8 @@
void migrate_page_states(struct page *newpage, struct page *page)
{
int cpupid;
+
+ trace_android_vh_migrate_page_states(page, newpage);
if (PageError(page))
SetPageError(newpage);
@@ -689,6 +607,7 @@
SetPageChecked(newpage);
if (PageMappedToDisk(page))
SetPageMappedToDisk(newpage);
+ trace_android_vh_look_around_migrate_page(page, newpage);
/* Move dirty on pages not done by migrate_page_move_mapping() */
if (PageDirty(page))
@@ -723,9 +642,18 @@
if (PageWriteback(newpage))
end_page_writeback(newpage);
+ /*
+ * PG_readahead shares the same bit with PG_reclaim. The above
+ * end_page_writeback() may clear PG_readahead mistakenly, so set the
+ * bit after that.
+ */
+ if (PageReadahead(page))
+ SetPageReadahead(newpage);
+
copy_page_owner(page, newpage);
- mem_cgroup_migrate(page, newpage);
+ if (!PageHuge(page))
+ mem_cgroup_migrate(page, newpage);
}
EXPORT_SYMBOL(migrate_page_states);
@@ -758,7 +686,7 @@
BUG_ON(PageWriteback(page)); /* Writeback must be complete */
- rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
+ rc = migrate_page_move_mapping(mapping, newpage, page, 0);
if (rc != MIGRATEPAGE_SUCCESS)
return rc;
@@ -772,40 +700,96 @@
EXPORT_SYMBOL(migrate_page);
#ifdef CONFIG_BLOCK
-/*
- * Migration function for pages with buffers. This function can only be used
- * if the underlying filesystem guarantees that no other references to "page"
- * exist.
- */
-int buffer_migrate_page(struct address_space *mapping,
- struct page *newpage, struct page *page, enum migrate_mode mode)
+/* Returns true if all buffers are successfully locked */
+static bool buffer_migrate_lock_buffers(struct buffer_head *head,
+ enum migrate_mode mode)
+{
+ struct buffer_head *bh = head;
+
+ /* Simple case, sync compaction */
+ if (mode != MIGRATE_ASYNC) {
+ do {
+ lock_buffer(bh);
+ bh = bh->b_this_page;
+
+ } while (bh != head);
+
+ return true;
+ }
+
+ /* async case, we cannot block on lock_buffer so use trylock_buffer */
+ do {
+ if (!trylock_buffer(bh)) {
+ /*
+ * We failed to lock the buffer and cannot stall in
+ * async migration. Release the taken locks
+ */
+ struct buffer_head *failed_bh = bh;
+ bh = head;
+ while (bh != failed_bh) {
+ unlock_buffer(bh);
+ bh = bh->b_this_page;
+ }
+ return false;
+ }
+
+ bh = bh->b_this_page;
+ } while (bh != head);
+ return true;
+}
+
+static int __buffer_migrate_page(struct address_space *mapping,
+ struct page *newpage, struct page *page, enum migrate_mode mode,
+ bool check_refs)
{
struct buffer_head *bh, *head;
int rc;
+ int expected_count;
if (!page_has_buffers(page))
return migrate_page(mapping, newpage, page, mode);
+ /* Check whether page does not have extra refs before we do more work */
+ expected_count = expected_page_refs(mapping, page);
+ if (page_count(page) != expected_count)
+ return -EAGAIN;
+
head = page_buffers(page);
+ if (!buffer_migrate_lock_buffers(head, mode))
+ return -EAGAIN;
- rc = migrate_page_move_mapping(mapping, newpage, page, head, mode, 0);
+ if (check_refs) {
+ bool busy;
+ bool invalidated = false;
+recheck_buffers:
+ busy = false;
+ spin_lock(&mapping->private_lock);
+ bh = head;
+ do {
+ if (atomic_read(&bh->b_count)) {
+ busy = true;
+ break;
+ }
+ bh = bh->b_this_page;
+ } while (bh != head);
+ if (busy) {
+ if (invalidated) {
+ rc = -EAGAIN;
+ goto unlock_buffers;
+ }
+ spin_unlock(&mapping->private_lock);
+ invalidate_bh_lrus();
+ invalidated = true;
+ goto recheck_buffers;
+ }
+ }
+
+ rc = migrate_page_move_mapping(mapping, newpage, page, 0);
if (rc != MIGRATEPAGE_SUCCESS)
- return rc;
+ goto unlock_buffers;
- /*
- * In the async case, migrate_page_move_mapping locked the buffers
- * with an IRQ-safe spinlock held. In the sync case, the buffers
- * need to be locked now
- */
- if (mode != MIGRATE_ASYNC)
- BUG_ON(!buffer_migrate_lock_buffers(head, mode));
-
- ClearPagePrivate(page);
- set_page_private(newpage, page_private(page));
- set_page_private(page, 0);
- put_page(page);
- get_page(newpage);
+ attach_page_private(newpage, detach_page_private(page));
bh = head;
do {
@@ -814,24 +798,48 @@
} while (bh != head);
- SetPagePrivate(newpage);
-
if (mode != MIGRATE_SYNC_NO_COPY)
migrate_page_copy(newpage, page);
else
migrate_page_states(newpage, page);
+ rc = MIGRATEPAGE_SUCCESS;
+unlock_buffers:
+ if (check_refs)
+ spin_unlock(&mapping->private_lock);
bh = head;
do {
unlock_buffer(bh);
- put_bh(bh);
bh = bh->b_this_page;
} while (bh != head);
- return MIGRATEPAGE_SUCCESS;
+ return rc;
+}
+
+/*
+ * Migration function for pages with buffers. This function can only be used
+ * if the underlying filesystem guarantees that no other references to "page"
+ * exist. For example, attached buffer heads are accessed only under the page lock.
+ */
+int buffer_migrate_page(struct address_space *mapping,
+ struct page *newpage, struct page *page, enum migrate_mode mode)
+{
+ return __buffer_migrate_page(mapping, newpage, page, mode, false);
}
EXPORT_SYMBOL(buffer_migrate_page);
+
+/*
+ * Same as above except that this variant is more careful and checks that there
+ * are also no buffer head references. This function is the right one for
+ * mappings where buffer heads are directly looked up and referenced (such as
+ * block device mappings).
+ */
+int buffer_migrate_page_norefs(struct address_space *mapping,
+ struct page *newpage, struct page *page, enum migrate_mode mode)
+{
+ return __buffer_migrate_page(mapping, newpage, page, mode, true);
+}
#endif
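
/*
 * Illustrative sketch, not part of this patch: how a mapping wires these
 * helpers into its address_space_operations.  A filesystem whose buffer heads
 * are only touched under the page lock can use buffer_migrate_page(); a
 * mapping whose buffer heads are looked up and referenced directly (for
 * example a block device mapping) should use buffer_migrate_page_norefs().
 * "example_aops" is a hypothetical name used only for illustration.
 */
static const struct address_space_operations example_aops = {
	/* ... the mapping's usual readpage/writepage methods ... */
#ifdef CONFIG_BLOCK
	.migratepage	= buffer_migrate_page,	/* or buffer_migrate_page_norefs */
#endif
};
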
/*
@@ -899,7 +907,7 @@
*/
if (page_has_private(page) &&
!try_to_release_page(page, GFP_KERNEL))
- return -EAGAIN;
+ return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY;
return migrate_page(mapping, newpage, page, mode);
}
@@ -951,7 +959,7 @@
VM_BUG_ON_PAGE(!PageIsolated(page), page);
if (!PageMovable(page)) {
rc = MIGRATEPAGE_SUCCESS;
- __ClearPageIsolated(page);
+ ClearPageIsolated(page);
goto out;
}
@@ -973,23 +981,23 @@
* We clear PG_movable under page_lock so any compactor
* cannot try to migrate this page.
*/
- __ClearPageIsolated(page);
+ ClearPageIsolated(page);
}
/*
- * Anonymous and movable page->mapping will be cleard by
+ * Anonymous and movable page->mapping will be cleared by
* free_pages_prepare so don't reset it here for keeping
* the type to work PageAnon, for example.
*/
if (!PageMappingFlags(page))
page->mapping = NULL;
- if (unlikely(is_zone_device_page(newpage))) {
- if (is_device_public_page(newpage))
- flush_dcache_page(newpage);
- } else
- flush_dcache_page(newpage);
+ if (likely(!is_zone_device_page(newpage))) {
+ int i, nr = compound_nr(newpage);
+ for (i = 0; i < nr; i++)
+ flush_dcache_page(newpage + i);
+ }
}
out:
return rc;
@@ -1013,7 +1021,7 @@
* to the LRU. Later, when the IO completes the pages are
* marked uptodate and unlocked. However, the queueing
* could be merging multiple pages for one bio (e.g.
- * mpage_readpages). If an allocation happens for the
+ * mpage_readahead). If an allocation happens for the
* second or third page, the process can end up locking
* the same page twice and deadlocking. Rather than
* trying to be clever about what pages can be locked,
@@ -1101,8 +1109,7 @@
/* Establish migration ptes */
VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma,
page);
- try_to_unmap(page,
- TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
+ try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK);
page_was_mapped = 1;
}
@@ -1141,34 +1148,19 @@
}
/*
- * gcc 4.7 and 4.8 on arm get an ICEs when inlining unmap_and_move(). Work
- * around it.
- */
-#if defined(CONFIG_ARM) && \
- defined(GCC_VERSION) && GCC_VERSION < 40900 && GCC_VERSION >= 40700
-#define ICE_noinline noinline
-#else
-#define ICE_noinline
-#endif
-
-/*
* Obtain the lock on page, remove all ptes and migrate the page
* to the newly allocated page in newpage.
*/
-static ICE_noinline int unmap_and_move(new_page_t get_new_page,
+static int unmap_and_move(new_page_t get_new_page,
free_page_t put_new_page,
unsigned long private, struct page *page,
int force, enum migrate_mode mode,
enum migrate_reason reason)
{
int rc = MIGRATEPAGE_SUCCESS;
- struct page *newpage;
+ struct page *newpage = NULL;
if (!thp_migration_supported() && PageTransHuge(page))
- return -ENOMEM;
-
- newpage = get_new_page(page, private);
- if (!newpage)
return -ENOMEM;
if (page_count(page) == 1) {
@@ -1178,15 +1170,15 @@
if (unlikely(__PageMovable(page))) {
lock_page(page);
if (!PageMovable(page))
- __ClearPageIsolated(page);
+ ClearPageIsolated(page);
unlock_page(page);
}
- if (put_new_page)
- put_new_page(newpage, private);
- else
- put_page(newpage);
goto out;
}
+
+ newpage = get_new_page(page, private);
+ if (!newpage)
+ return -ENOMEM;
rc = __unmap_and_move(page, newpage, force, mode);
if (rc == MIGRATEPAGE_SUCCESS)
@@ -1197,8 +1189,7 @@
/*
* A page that has been migrated has all references
* removed and will be freed. A page that has not been
- * migrated will have kepts its references and be
- * restored.
+ * migrated will have kept its references and be restored.
*/
list_del(&page->lru);
@@ -1209,7 +1200,7 @@
*/
if (likely(!__PageMovable(page)))
mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
- page_is_file_cache(page), -hpage_nr_pages(page));
+ page_is_file_lru(page), -thp_nr_pages(page));
}
/*
@@ -1218,16 +1209,11 @@
* we want to retry.
*/
if (rc == MIGRATEPAGE_SUCCESS) {
- put_page(page);
- if (reason == MR_MEMORY_FAILURE) {
+ if (reason != MR_MEMORY_FAILURE)
/*
- * Set PG_HWPoison on just freed page
- * intentionally. Although it's rather weird,
- * it's how HWPoison flag works at the moment.
+ * We release the page in page_handle_poison.
*/
- if (set_hwpoison_free_buddy_page(page))
- num_poisoned_pages_inc();
- }
+ put_page(page);
} else {
if (rc != -EAGAIN) {
if (likely(!__PageMovable(page))) {
@@ -1239,7 +1225,7 @@
if (PageMovable(page))
putback_movable_page(page);
else
- __ClearPageIsolated(page);
+ ClearPageIsolated(page);
unlock_page(page);
put_page(page);
}
@@ -1280,9 +1266,10 @@
int page_was_mapped = 0;
struct page *new_hpage;
struct anon_vma *anon_vma = NULL;
+ struct address_space *mapping = NULL;
/*
- * Movability of hugepages depends on architectures and hugepage size.
+ * Migratability of hugepages depends on architectures and their size.
* This check is necessary because some callers of hugepage migration
* like soft offline and memory hotremove don't walk through page
* tables or check whether the hugepage is pmd-based or not before
@@ -1327,9 +1314,29 @@
goto put_anon;
if (page_mapped(hpage)) {
- try_to_unmap(hpage,
- TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
+ bool mapping_locked = false;
+ enum ttu_flags ttu = TTU_MIGRATION|TTU_IGNORE_MLOCK;
+
+ if (!PageAnon(hpage)) {
+ /*
+ * In shared mappings, try_to_unmap could potentially
+ * call huge_pmd_unshare. Because of this, take
+ * semaphore in write mode here and set TTU_RMAP_LOCKED
+ * to let lower levels know we have taken the lock.
+ */
+ mapping = hugetlb_page_mapping_lock_write(hpage);
+ if (unlikely(!mapping))
+ goto unlock_put_anon;
+
+ mapping_locked = true;
+ ttu |= TTU_RMAP_LOCKED;
+ }
+
+ try_to_unmap(hpage, ttu);
page_was_mapped = 1;
+
+ if (mapping_locked)
+ i_mmap_unlock_write(mapping);
}
if (!page_mapped(hpage))
@@ -1339,6 +1346,7 @@
remove_migration_ptes(hpage,
rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage, false);
+unlock_put_anon:
unlock_page(new_hpage);
put_anon:
@@ -1395,22 +1403,37 @@
enum migrate_mode mode, int reason)
{
int retry = 1;
+ int thp_retry = 1;
int nr_failed = 0;
int nr_succeeded = 0;
+ int nr_thp_succeeded = 0;
+ int nr_thp_failed = 0;
+ int nr_thp_split = 0;
int pass = 0;
+ bool is_thp = false;
struct page *page;
struct page *page2;
int swapwrite = current->flags & PF_SWAPWRITE;
- int rc;
+ int rc, nr_subpages;
+
+ trace_mm_migrate_pages_start(mode, reason);
if (!swapwrite)
current->flags |= PF_SWAPWRITE;
- for(pass = 0; pass < 10 && retry; pass++) {
+ for (pass = 0; pass < 10 && (retry || thp_retry); pass++) {
retry = 0;
+ thp_retry = 0;
list_for_each_entry_safe(page, page2, from, lru) {
retry:
+ /*
+ * THP statistics are based on the source huge page.
+ * Capture required information that might get lost
+ * during migration.
+ */
+ is_thp = PageTransHuge(page) && !PageHuge(page);
+ nr_subpages = thp_nr_pages(page);
cond_resched();
if (PageHuge(page))
@@ -1435,21 +1458,35 @@
* we encounter them after the rest of the list
* is processed.
*/
- if (PageTransHuge(page) && !PageHuge(page)) {
+ if (is_thp) {
lock_page(page);
rc = split_huge_page_to_list(page, from);
unlock_page(page);
if (!rc) {
list_safe_reset_next(page, page2, lru);
+ nr_thp_split++;
goto retry;
}
+
+ nr_thp_failed++;
+ nr_failed += nr_subpages;
+ goto out;
}
nr_failed++;
goto out;
case -EAGAIN:
+ if (is_thp) {
+ thp_retry++;
+ break;
+ }
retry++;
break;
case MIGRATEPAGE_SUCCESS:
+ if (is_thp) {
+ nr_thp_succeeded++;
+ nr_succeeded += nr_subpages;
+ break;
+ }
nr_succeeded++;
break;
default:
@@ -1459,24 +1496,76 @@
* removed from migration page list and not
* retried in the next outer loop.
*/
+ if (is_thp) {
+ nr_thp_failed++;
+ nr_failed += nr_subpages;
+ break;
+ }
nr_failed++;
break;
}
}
}
- nr_failed += retry;
+ nr_failed += retry + thp_retry;
+ nr_thp_failed += thp_retry;
rc = nr_failed;
out:
- if (nr_succeeded)
- count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
- if (nr_failed)
- count_vm_events(PGMIGRATE_FAIL, nr_failed);
- trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason);
+ count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
+ count_vm_events(PGMIGRATE_FAIL, nr_failed);
+ count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded);
+ count_vm_events(THP_MIGRATION_FAIL, nr_thp_failed);
+ count_vm_events(THP_MIGRATION_SPLIT, nr_thp_split);
+ trace_mm_migrate_pages(nr_succeeded, nr_failed, nr_thp_succeeded,
+ nr_thp_failed, nr_thp_split, mode, reason);
if (!swapwrite)
current->flags &= ~PF_SWAPWRITE;
return rc;
+}
+EXPORT_SYMBOL_GPL(migrate_pages);
+
+struct page *alloc_migration_target(struct page *page, unsigned long private)
+{
+ struct migration_target_control *mtc;
+ gfp_t gfp_mask;
+ unsigned int order = 0;
+ struct page *new_page = NULL;
+ int nid;
+ int zidx;
+
+ mtc = (struct migration_target_control *)private;
+ gfp_mask = mtc->gfp_mask;
+ nid = mtc->nid;
+ if (nid == NUMA_NO_NODE)
+ nid = page_to_nid(page);
+
+ if (PageHuge(page)) {
+ struct hstate *h = page_hstate(compound_head(page));
+
+ gfp_mask = htlb_modify_alloc_mask(h, gfp_mask);
+ return alloc_huge_page_nodemask(h, nid, mtc->nmask, gfp_mask);
+ }
+
+ if (PageTransHuge(page)) {
+ /*
+ * clear __GFP_RECLAIM to make the migration callback
+ * consistent with regular THP allocations.
+ */
+ gfp_mask &= ~__GFP_RECLAIM;
+ gfp_mask |= GFP_TRANSHUGE;
+ order = HPAGE_PMD_ORDER;
+ }
+ zidx = zone_idx(page_zone(page));
+ if (is_highmem_idx(zidx) || zidx == ZONE_MOVABLE)
+ gfp_mask |= __GFP_HIGHMEM;
+
+ new_page = __alloc_pages_nodemask(gfp_mask, order, nid, mtc->nmask);
+
+ if (new_page && PageTransHuge(new_page))
+ prep_transhuge_page(new_page);
+
+ return new_page;
}
#ifdef CONFIG_NUMA
@@ -1496,12 +1585,13 @@
struct list_head *pagelist, int node)
{
int err;
+ struct migration_target_control mtc = {
+ .nid = node,
+ .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
+ };
- if (list_empty(pagelist))
- return 0;
-
- err = migrate_pages(pagelist, alloc_new_node_page, NULL, node,
- MIGRATE_SYNC, MR_SYSCALL);
+ err = migrate_pages(pagelist, alloc_migration_target, NULL,
+ (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
if (err)
putback_movable_pages(pagelist);
return err;
@@ -1524,7 +1614,7 @@
unsigned int follflags;
int err;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
err = -EFAULT;
vma = find_vma(mm, addr);
if (!vma || addr < vma->vm_start || !vma_migratable(vma))
@@ -1552,8 +1642,9 @@
if (PageHuge(page)) {
if (PageHead(page)) {
- isolate_huge_page(page, pagelist);
- err = 1;
+ err = isolate_hugetlb(page, pagelist);
+ if (!err)
+ err = 1;
}
} else {
struct page *head;
@@ -1566,8 +1657,8 @@
err = 1;
list_add_tail(&head->lru, pagelist);
mod_node_page_state(page_pgdat(head),
- NR_ISOLATED_ANON + page_is_file_cache(head),
- hpage_nr_pages(head));
+ NR_ISOLATED_ANON + page_is_file_lru(head),
+ thp_nr_pages(head));
}
out_putpage:
/*
@@ -1575,10 +1666,36 @@
* isolate_lru_page() or drop the page ref if it was
* not isolated.
*/
- put_page(page);
+ put_user_page(page);
out:
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
return err;
+}
+
+static int move_pages_and_store_status(struct mm_struct *mm, int node,
+ struct list_head *pagelist, int __user *status,
+ int start, int i, unsigned long nr_pages)
+{
+ int err;
+
+ if (list_empty(pagelist))
+ return 0;
+
+ err = do_move_pages_to_node(mm, pagelist, node);
+ if (err) {
+ /*
+ * Positive err means the number of failed
+ * pages to migrate. Since we are going to
+ * abort and return the number of non-migrated
+ * pages, we need to include the rest of the
+ * nr_pages that have not been attempted as
+ * well.
+ */
+ if (err > 0)
+ err += nr_pages - i - 1;
+ return err;
+ }
+ return store_status(status, start, node, i - start);
}
/*
@@ -1596,7 +1713,7 @@
int start, i;
int err = 0, err1;
- migrate_prep();
+ lru_cache_disable();
for (i = start = 0; i < nr_pages; i++) {
const void __user *p;
@@ -1624,21 +1741,8 @@
current_node = node;
start = i;
} else if (node != current_node) {
- err = do_move_pages_to_node(mm, &pagelist, current_node);
- if (err) {
- /*
- * Positive err means the number of failed
- * pages to migrate. Since we are going to
- * abort and return the number of non-migrated
- * pages, so need to incude the rest of the
- * nr_pages that have not been attempted as
- * well.
- */
- if (err > 0)
- err += nr_pages - i - 1;
- goto out;
- }
- err = store_status(status, start, current_node, i - start);
+ err = move_pages_and_store_status(mm, current_node,
+ &pagelist, status, start, i, nr_pages);
if (err)
goto out;
start = i;
@@ -1652,52 +1756,33 @@
err = add_page_for_migration(mm, addr, current_node,
&pagelist, flags & MPOL_MF_MOVE_ALL);
- if (!err) {
- /* The page is already on the target node */
- err = store_status(status, i, current_node, 1);
- if (err)
- goto out_flush;
- continue;
- } else if (err > 0) {
+ if (err > 0) {
/* The page is successfully queued for migration */
continue;
}
- err = store_status(status, i, err, 1);
+ /*
+ * If the page is already on the target node (!err), store the
+ * node, otherwise, store the err.
+ */
+ err = store_status(status, i, err ? : current_node, 1);
if (err)
goto out_flush;
- err = do_move_pages_to_node(mm, &pagelist, current_node);
- if (err) {
- if (err > 0)
- err += nr_pages - i - 1;
+ err = move_pages_and_store_status(mm, current_node, &pagelist,
+ status, start, i, nr_pages);
+ if (err)
goto out;
- }
- if (i > start) {
- err = store_status(status, start, current_node, i - start);
- if (err)
- goto out;
- }
current_node = NUMA_NO_NODE;
}
out_flush:
- if (list_empty(&pagelist))
- return err;
-
/* Make sure we do not overwrite the existing error */
- err1 = do_move_pages_to_node(mm, &pagelist, current_node);
- /*
- * Don't have to report non-attempted pages here since:
- * - If the above loop is done gracefully all pages have been
- * attempted.
- * - If the above loop is aborted it means a fatal error
- * happened, should return ret.
- */
- if (!err1)
- err1 = store_status(status, start, current_node, i - start);
+ err1 = move_pages_and_store_status(mm, current_node, &pagelist,
+ status, start, i, nr_pages);
if (err >= 0)
err = err1;
out:
+ lru_cache_enable();
return err;
}
@@ -1709,7 +1794,7 @@
{
unsigned long i;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
for (i = 0; i < nr_pages; i++) {
unsigned long addr = (unsigned long)(*pages);
@@ -1736,7 +1821,7 @@
status++;
}
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
}
/*
@@ -1773,6 +1858,53 @@
return nr_pages ? -EFAULT : 0;
}
+static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes)
+{
+ struct task_struct *task;
+ struct mm_struct *mm;
+
+ /*
+ * There is no need to check if the current process has the right to modify
+ * the specified process when they are the same.
+ */
+ if (!pid) {
+ mmget(current->mm);
+ *mem_nodes = cpuset_mems_allowed(current);
+ return current->mm;
+ }
+
+ /* Find the mm_struct */
+ rcu_read_lock();
+ task = find_task_by_vpid(pid);
+ if (!task) {
+ rcu_read_unlock();
+ return ERR_PTR(-ESRCH);
+ }
+ get_task_struct(task);
+
+ /*
+ * Check if this process has the right to modify the specified
+ * process. Use the regular "ptrace_may_access()" checks.
+ */
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
+ rcu_read_unlock();
+ mm = ERR_PTR(-EPERM);
+ goto out;
+ }
+ rcu_read_unlock();
+
+ mm = ERR_PTR(security_task_movememory(task));
+ if (IS_ERR(mm))
+ goto out;
+ *mem_nodes = cpuset_mems_allowed(task);
+ mm = get_task_mm(task);
+out:
+ put_task_struct(task);
+ if (!mm)
+ mm = ERR_PTR(-EINVAL);
+ return mm;
+}
+
/*
* Move a list of pages in the address space of the currently executing
* process.
@@ -1782,7 +1914,6 @@
const int __user *nodes,
int __user *status, int flags)
{
- struct task_struct *task;
struct mm_struct *mm;
int err;
nodemask_t task_nodes;
@@ -1794,36 +1925,9 @@
if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
return -EPERM;
- /* Find the mm_struct */
- rcu_read_lock();
- task = pid ? find_task_by_vpid(pid) : current;
- if (!task) {
- rcu_read_unlock();
- return -ESRCH;
- }
- get_task_struct(task);
-
- /*
- * Check if this process has the right to modify the specified
- * process. Use the regular "ptrace_may_access()" checks.
- */
- if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
- rcu_read_unlock();
- err = -EPERM;
- goto out;
- }
- rcu_read_unlock();
-
- err = security_task_movememory(task);
- if (err)
- goto out;
-
- task_nodes = cpuset_mems_allowed(task);
- mm = get_task_mm(task);
- put_task_struct(task);
-
- if (!mm)
- return -EINVAL;
+ mm = find_mm_struct(pid, &task_nodes);
+ if (IS_ERR(mm))
+ return PTR_ERR(mm);
if (nodes)
err = do_pages_move(mm, task_nodes, nr_pages, pages,
@@ -1832,10 +1936,6 @@
err = do_pages_stat(mm, nr_pages, pages, status);
mmput(mm);
- return err;
-
-out:
- put_task_struct(task);
return err;
}
@@ -1889,7 +1989,7 @@
if (!zone_watermark_ok(zone, 0,
high_wmark_pages(zone) +
nr_migrate_pages,
- 0, 0))
+ ZONE_MOVABLE, 0))
continue;
return true;
}
@@ -1918,7 +2018,7 @@
VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);
/* Avoid migrating to a node that is nearly full */
- if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page)))
+ if (!migrate_balanced_pgdat(pgdat, compound_nr(page)))
return 0;
if (isolate_lru_page(page))
@@ -1936,9 +2036,9 @@
return 0;
}
- page_lru = page_is_file_cache(page);
+ page_lru = page_is_file_lru(page);
mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru,
- hpage_nr_pages(page));
+ thp_nr_pages(page));
/*
* Isolating the page has taken another reference, so the
@@ -1960,7 +2060,7 @@
* node. Caller is expected to have an elevated reference count on
* the page that will be dropped by this function before returning.
*/
-int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
+int migrate_misplaced_page(struct page *page, struct vm_fault *vmf,
int node)
{
pg_data_t *pgdat = NODE_DATA(node);
@@ -1972,15 +2072,15 @@
* Don't migrate file pages that are mapped in multiple processes
* with execute permissions as they are probably shared libraries.
*/
- if (page_mapcount(page) != 1 && page_is_file_cache(page) &&
- (vma->vm_flags & VM_EXEC))
+ if (page_mapcount(page) != 1 && page_is_file_lru(page) &&
+ (vmf->vma_flags & VM_EXEC))
goto out;
/*
* Also do not migrate dirty pages as not all filesystems can move
* dirty pages in MIGRATE_ASYNC mode which is a waste of cycles.
*/
- if (page_is_file_cache(page) && PageDirty(page))
+ if (page_is_file_lru(page) && PageDirty(page))
goto out;
isolated = numamigrate_isolate_page(pgdat, page);
@@ -1995,7 +2095,7 @@
if (!list_empty(&migratepages)) {
list_del(&page->lru);
dec_node_page_state(page, NR_ISOLATED_ANON +
- page_is_file_cache(page));
+ page_is_file_lru(page));
putback_lru_page(page);
}
isolated = 0;
@@ -2025,9 +2125,8 @@
pg_data_t *pgdat = NODE_DATA(node);
int isolated = 0;
struct page *new_page = NULL;
- int page_lru = page_is_file_cache(page);
- unsigned long mmun_start = address & HPAGE_PMD_MASK;
- unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE;
+ int page_lru = page_is_file_lru(page);
+ unsigned long start = address & HPAGE_PMD_MASK;
new_page = alloc_pages_node(node,
(GFP_TRANSHUGE_LIGHT | __GFP_THISNODE),
@@ -2050,15 +2149,15 @@
/* anon mapping, we can simply copy page->mapping to the new page: */
new_page->mapping = page->mapping;
new_page->index = page->index;
+ /* flush the cache before copying using the kernel virtual address */
+ flush_cache_range(vma, start, start + HPAGE_PMD_SIZE);
migrate_page_copy(new_page, page);
WARN_ON(PageLRU(new_page));
/* Recheck the target PMD */
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
ptl = pmd_lock(mm, pmd);
if (unlikely(!pmd_same(*pmd, entry) || !page_ref_freeze(page, 2))) {
spin_unlock(ptl);
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
/* Reverse changes made by migrate_page_copy() */
if (TestClearPageActive(new_page))
@@ -2089,8 +2188,7 @@
* new page and page_add_new_anon_rmap guarantee the copy is
* visible before the pagetable update.
*/
- flush_cache_range(vma, mmun_start, mmun_end);
- page_add_anon_rmap(new_page, vma, mmun_start, true);
+ page_add_anon_rmap(new_page, vma, start, true);
/*
* At this point the pmd is numa/protnone (i.e. non present) and the TLB
* has already been flushed globally. So no TLB can be currently
@@ -2098,11 +2196,11 @@
* pmd before doing set_pmd_at(), nor to flush the TLB after
* set_pmd_at(). Clearing the pmd here would introduce a race
* condition against MADV_DONTNEED, because MADV_DONTNEED only holds the
- * mmap_sem for reading. If the pmd is set to NULL at any given time,
+ * mmap_lock for reading. If the pmd is set to NULL at any given time,
* MADV_DONTNEED won't wait on the pmd lock and it'll skip clearing this
* pmd.
*/
- set_pmd_at(mm, mmun_start, pmd, entry);
+ set_pmd_at(mm, start, pmd, entry);
update_mmu_cache_pmd(vma, address, &entry);
page_ref_unfreeze(page, 2);
@@ -2111,11 +2209,6 @@
set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED);
spin_unlock(ptl);
- /*
- * No need to double call mmu_notifier->invalidate_range() callback as
- * the above pmdp_huge_clear_flush_notify() did already call it.
- */
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
/* Take an "isolate" reference and put new page on the LRU. */
get_page(new_page);
@@ -2139,7 +2232,7 @@
ptl = pmd_lock(mm, pmd);
if (pmd_same(*pmd, entry)) {
entry = pmd_modify(entry, vma->vm_page_prot);
- set_pmd_at(mm, mmun_start, pmd, entry);
+ set_pmd_at(mm, start, pmd, entry);
update_mmu_cache_pmd(vma, address, &entry);
}
spin_unlock(ptl);
@@ -2153,25 +2246,26 @@
#endif /* CONFIG_NUMA */
-#if defined(CONFIG_MIGRATE_VMA_HELPER)
-struct migrate_vma {
- struct vm_area_struct *vma;
- unsigned long *dst;
- unsigned long *src;
- unsigned long cpages;
- unsigned long npages;
- unsigned long start;
- unsigned long end;
-};
-
+#ifdef CONFIG_DEVICE_PRIVATE
static int migrate_vma_collect_hole(unsigned long start,
unsigned long end,
+ __always_unused int depth,
struct mm_walk *walk)
{
struct migrate_vma *migrate = walk->private;
unsigned long addr;
- for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
+ /* Only allow populating anonymous memory. */
+ if (!vma_is_anonymous(walk->vma)) {
+ for (addr = start; addr < end; addr += PAGE_SIZE) {
+ migrate->src[migrate->npages] = 0;
+ migrate->dst[migrate->npages] = 0;
+ migrate->npages++;
+ }
+ return 0;
+ }
+
+ for (addr = start; addr < end; addr += PAGE_SIZE) {
migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
migrate->dst[migrate->npages] = 0;
migrate->npages++;
@@ -2188,7 +2282,7 @@
struct migrate_vma *migrate = walk->private;
unsigned long addr;
- for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
+ for (addr = start; addr < end; addr += PAGE_SIZE) {
migrate->dst[migrate->npages] = 0;
migrate->src[migrate->npages++] = 0;
}
@@ -2210,7 +2304,7 @@
again:
if (pmd_none(*pmdp))
- return migrate_vma_collect_hole(start, end, walk);
+ return migrate_vma_collect_hole(start, end, -1, walk);
if (pmd_trans_huge(*pmdp)) {
struct page *page;
@@ -2243,7 +2337,7 @@
return migrate_vma_collect_skip(start, end,
walk);
if (pmd_none(*pmdp))
- return migrate_vma_collect_hole(start, end,
+ return migrate_vma_collect_hole(start, end, -1,
walk);
}
}
@@ -2255,24 +2349,22 @@
arch_enter_lazy_mmu_mode();
for (; addr < end; addr += PAGE_SIZE, ptep++) {
- unsigned long mpfn, pfn;
+ unsigned long mpfn = 0, pfn;
struct page *page;
swp_entry_t entry;
pte_t pte;
pte = *ptep;
- pfn = pte_pfn(pte);
if (pte_none(pte)) {
- mpfn = MIGRATE_PFN_MIGRATE;
- migrate->cpages++;
- pfn = 0;
+ if (vma_is_anonymous(vma)) {
+ mpfn = MIGRATE_PFN_MIGRATE;
+ migrate->cpages++;
+ }
goto next;
}
if (!pte_present(pte)) {
- mpfn = pfn = 0;
-
/*
* Only care about unaddressable device page special
* page table entry. Other special swap entries are not
@@ -2283,28 +2375,34 @@
goto next;
page = device_private_entry_to_page(entry);
- mpfn = migrate_pfn(page_to_pfn(page))|
- MIGRATE_PFN_DEVICE | MIGRATE_PFN_MIGRATE;
+ if (!(migrate->flags &
+ MIGRATE_VMA_SELECT_DEVICE_PRIVATE) ||
+ page->pgmap->owner != migrate->pgmap_owner)
+ goto next;
+
+ mpfn = migrate_pfn(page_to_pfn(page)) |
+ MIGRATE_PFN_MIGRATE;
if (is_write_device_private_entry(entry))
mpfn |= MIGRATE_PFN_WRITE;
} else {
+ if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
+ goto next;
+ pfn = pte_pfn(pte);
if (is_zero_pfn(pfn)) {
mpfn = MIGRATE_PFN_MIGRATE;
migrate->cpages++;
- pfn = 0;
goto next;
}
- page = _vm_normal_page(migrate->vma, addr, pte, true);
+ page = vm_normal_page(migrate->vma, addr, pte);
mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
}
/* FIXME support THP */
if (!page || !page->mapping || PageTransCompound(page)) {
- mpfn = pfn = 0;
+ mpfn = 0;
goto next;
}
- pfn = page_to_pfn(page);
/*
* By getting a reference on the page we pin it and that blocks
@@ -2333,8 +2431,17 @@
entry = make_migration_entry(page, mpfn &
MIGRATE_PFN_WRITE);
swp_pte = swp_entry_to_pte(entry);
- if (pte_soft_dirty(pte))
- swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_present(pte)) {
+ if (pte_soft_dirty(pte))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_uffd_wp(pte))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ } else {
+ if (pte_swp_soft_dirty(pte))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_swp_uffd_wp(pte))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ }
set_pte_at(mm, addr, ptep, swp_pte);
/*
@@ -2353,15 +2460,21 @@
migrate->dst[migrate->npages] = 0;
migrate->src[migrate->npages++] = mpfn;
}
- arch_leave_lazy_mmu_mode();
- pte_unmap_unlock(ptep - 1, ptl);
/* Only flush the TLB if we actually modified any entries */
if (unmapped)
flush_tlb_range(walk->vma, start, end);
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(ptep - 1, ptl);
+
return 0;
}
+
+static const struct mm_walk_ops migrate_vma_walk_ops = {
+ .pmd_entry = migrate_vma_collect_pmd,
+ .pte_hole = migrate_vma_collect_hole,
+};
/*
* migrate_vma_collect() - collect pages over a range of virtual addresses
@@ -2373,22 +2486,22 @@
*/
static void migrate_vma_collect(struct migrate_vma *migrate)
{
- struct mm_walk mm_walk = {
- .pmd_entry = migrate_vma_collect_pmd,
- .pte_hole = migrate_vma_collect_hole,
- .vma = migrate->vma,
- .mm = migrate->vma->vm_mm,
- .private = migrate,
- };
+ struct mmu_notifier_range range;
- mmu_notifier_invalidate_range_start(mm_walk.mm,
- migrate->start,
- migrate->end);
- walk_page_range(migrate->start, migrate->end, &mm_walk);
- mmu_notifier_invalidate_range_end(mm_walk.mm,
- migrate->start,
- migrate->end);
+ /*
+ * Note that the pgmap_owner is passed to the mmu notifier callback so
+ * that the registered device driver can skip invalidating device
+ * private page mappings that won't be migrated.
+ */
+ mmu_notifier_range_init_migrate(&range, 0, migrate->vma,
+ migrate->vma->vm_mm, migrate->start, migrate->end,
+ migrate->pgmap_owner);
+ mmu_notifier_invalidate_range_start(&range);
+ walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end,
+ &migrate_vma_walk_ops, migrate);
+
+ mmu_notifier_invalidate_range_end(&range);
migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
}
@@ -2432,16 +2545,7 @@
* FIXME proper solution is to rework migration_entry_wait() so
* it does not need to take a reference on page.
*/
- if (is_device_private_page(page))
- return true;
-
- /*
- * Only allow device public page to be migrated and account for
- * the extra reference count imply by ZONE_DEVICE pages.
- */
- if (!is_device_public_page(page))
- return false;
- extra++;
+ return is_device_private_page(page);
}
/* For file back page */
@@ -2575,7 +2679,7 @@
*/
static void migrate_vma_unmap(struct migrate_vma *migrate)
{
- int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
+ int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK;
const unsigned long npages = migrate->npages;
const unsigned long start = migrate->start;
unsigned long addr, i, restore = 0;
@@ -2620,6 +2724,118 @@
}
}
+/**
+ * migrate_vma_setup() - prepare to migrate a range of memory
+ * @args: contains the vma, start, and pfns arrays for the migration
+ *
+ * Returns: negative errno on failures, 0 when 0 or more pages were migrated
+ * without an error.
+ *
+ * Prepare to migrate a range of virtual addresses by collecting all
+ * the pages backing each virtual address in the range, saving them inside the
+ * src array. Then lock those pages and unmap them. Once the pages are locked
+ * and unmapped, check whether each page is pinned or not. Pages that aren't
+ * pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) in the
+ * corresponding src array entry. Then restores any pages that are pinned, by
+ * remapping and unlocking those pages.
+ *
+ * The caller should then allocate destination memory and copy source memory to
+ * it for all those entries (ie with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE
+ * flag set). Once these are allocated and copied, the caller must update each
+ * corresponding entry in the dst array with the pfn value of the destination
+ * page and with the MIGRATE_PFN_VALID and MIGRATE_PFN_LOCKED flags set
+ * (destination pages must have their struct pages locked, via lock_page()).
+ *
+ * Note that the caller does not have to migrate all the pages that are marked
+ * with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration from
+ * device memory to system memory. If the caller cannot migrate a device page
+ * back to system memory, then it must return VM_FAULT_SIGBUS, which has severe
+ * consequences for the userspace process, so it must be avoided if at all
+ * possible.
+ *
+ * For empty entries inside the CPU page table (pte_none() or pmd_none() is true)
+ * we set the MIGRATE_PFN_MIGRATE flag inside the corresponding source array,
+ * allowing the caller to allocate device memory for those unbacked virtual
+ * addresses. For this the caller simply has to allocate device memory and
+ * properly set the destination entry like for regular migration. Note that
+ * this can still fail, so the device driver must check whether the
+ * migration was successful for those entries after calling migrate_vma_pages()
+ * just like for regular migration.
+ *
+ * After that, the callers must call migrate_vma_pages() to go over each entry
+ * in the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag
+ * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set,
+ * then migrate_vma_pages() migrates struct page information from the source
+ * struct page to the destination struct page. If it fails to migrate the
+ * struct page information, then it clears the MIGRATE_PFN_MIGRATE flag in the
+ * src array.
+ *
+ * At this point all successfully migrated pages have an entry in the src
+ * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst
+ * array entry with MIGRATE_PFN_VALID flag set.
+ *
+ * Once migrate_vma_pages() returns the caller may inspect which pages were
+ * successfully migrated, and which were not. Successfully migrated pages will
+ * have the MIGRATE_PFN_MIGRATE flag set for their src array entry.
+ *
+ * It is safe to update device page table after migrate_vma_pages() because
+ * both destination and source page are still locked, and the mmap_lock is held
+ * in read mode (hence no one can unmap the range being migrated).
+ *
+ * Once the caller is done cleaning up things and updating its page table (if it
+ * chose to do so, this is not an obligation) it finally calls
+ * migrate_vma_finalize() to update the CPU page table to point to new pages
+ * for successfully migrated pages or otherwise restore the CPU page table to
+ * point to the original source pages.
+ */
+int migrate_vma_setup(struct migrate_vma *args)
+{
+ long nr_pages = (args->end - args->start) >> PAGE_SHIFT;
+
+ args->start &= PAGE_MASK;
+ args->end &= PAGE_MASK;
+ if (!args->vma || is_vm_hugetlb_page(args->vma) ||
+ (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma))
+ return -EINVAL;
+ if (nr_pages <= 0)
+ return -EINVAL;
+ if (args->start < args->vma->vm_start ||
+ args->start >= args->vma->vm_end)
+ return -EINVAL;
+ if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end)
+ return -EINVAL;
+ if (!args->src || !args->dst)
+ return -EINVAL;
+
+ memset(args->src, 0, sizeof(*args->src) * nr_pages);
+ args->cpages = 0;
+ args->npages = 0;
+
+ migrate_vma_collect(args);
+
+ if (args->cpages)
+ migrate_vma_prepare(args);
+ if (args->cpages)
+ migrate_vma_unmap(args);
+
+ /*
+ * At this point pages are locked and unmapped, and thus they have
+ * stable content and can safely be copied to destination memory that
+ * is allocated by the drivers.
+ */
+ return 0;
+
+}
+EXPORT_SYMBOL(migrate_vma_setup);
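
/*
 * Illustrative sketch, not part of this patch: the three-step flow described
 * in the kernel-doc above, as a device driver might use it to move anonymous
 * pages into device private memory.  It assumes the headers already included
 * by this file.  example_alloc_device_page() and example_copy_to_device() are
 * hypothetical driver helpers, not kernel APIs.
 */
static struct page *example_alloc_device_page(void);		/* hypothetical */
static void example_copy_to_device(struct page *dpage, struct page *spage);

static int example_migrate_to_device(struct vm_area_struct *vma,
				     unsigned long start, unsigned long end,
				     void *pgmap_owner)
{
	unsigned long npages = (end - start) >> PAGE_SHIFT;
	struct migrate_vma args = {
		.vma		= vma,
		.start		= start,
		.end		= end,
		.pgmap_owner	= pgmap_owner,
		.flags		= MIGRATE_VMA_SELECT_SYSTEM,
	};
	unsigned long i;
	int ret;

	args.src = kcalloc(npages, sizeof(*args.src), GFP_KERNEL);
	args.dst = kcalloc(npages, sizeof(*args.dst), GFP_KERNEL);
	if (!args.src || !args.dst) {
		ret = -ENOMEM;
		goto out_free;
	}

	ret = migrate_vma_setup(&args);		/* collect, lock and unmap */
	if (ret)
		goto out_free;

	for (i = 0; i < args.npages; i++) {
		struct page *spage = migrate_pfn_to_page(args.src[i]);
		struct page *dpage;

		if (!(args.src[i] & MIGRATE_PFN_MIGRATE))
			continue;
		dpage = example_alloc_device_page();	/* hypothetical */
		if (!dpage)
			continue;
		lock_page(dpage);
		/* spage may be NULL for an unbacked anonymous address; the
		 * copy helper would then just clear the destination page. */
		example_copy_to_device(dpage, spage);	/* hypothetical */
		args.dst[i] = migrate_pfn(page_to_pfn(dpage)) |
			      MIGRATE_PFN_LOCKED;
	}

	migrate_vma_pages(&args);	/* move struct page metadata */
	/* ... the driver would update its own page tables here ... */
	migrate_vma_finalize(&args);	/* restore or switch the CPU ptes */

out_free:
	kfree(args.src);
	kfree(args.dst);
	return ret;
}
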
+
+/*
+ * This code closely matches the code in:
+ * __handle_mm_fault()
+ * handle_pte_fault()
+ * do_anonymous_page()
+ * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE
+ * private page.
+ */
static void migrate_vma_insert_page(struct migrate_vma *migrate,
unsigned long addr,
struct page *page,
@@ -2628,7 +2844,6 @@
{
struct vm_area_struct *vma = migrate->vma;
struct mm_struct *mm = vma->vm_mm;
- struct mem_cgroup *memcg;
bool flush = false;
spinlock_t *ptl;
pte_t entry;
@@ -2661,12 +2876,12 @@
* pte_offset_map() on pmds where a huge pmd might be created
* from a different thread.
*
- * pte_alloc_map() is safe to use under down_write(mmap_sem) or when
+ * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
* parallel threads are excluded by other means.
*
- * Here we only have down_read(mmap_sem).
+ * Here we only have mmap_read_lock(mm).
*/
- if (pte_alloc(mm, pmdp, addr))
+ if (pte_alloc(mm, pmdp))
goto abort;
/* See the comment in pte_alloc_one_map() */
@@ -2675,7 +2890,7 @@
if (unlikely(anon_vma_prepare(vma)))
goto abort;
- if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false))
+ if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
goto abort;
/*
@@ -2691,11 +2906,13 @@
swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE);
entry = swp_entry_to_pte(swp_entry);
- } else if (is_device_public_page(page)) {
- entry = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
- if (vma->vm_flags & VM_WRITE)
- entry = pte_mkwrite(pte_mkdirty(entry));
- entry = pte_mkdevmap(entry);
+ } else {
+ /*
+ * For now we only support migrating to un-addressable
+ * device memory.
+ */
+ pr_warn_once("Unsupported ZONE_DEVICE page type.\n");
+ goto abort;
}
} else {
entry = mk_pte(page, vma->vm_page_prot);
@@ -2705,36 +2922,29 @@
ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+ if (check_stable_address_space(mm))
+ goto unlock_abort;
+
if (pte_present(*ptep)) {
unsigned long pfn = pte_pfn(*ptep);
- if (!is_zero_pfn(pfn)) {
- pte_unmap_unlock(ptep, ptl);
- mem_cgroup_cancel_charge(page, memcg, false);
- goto abort;
- }
+ if (!is_zero_pfn(pfn))
+ goto unlock_abort;
flush = true;
- } else if (!pte_none(*ptep)) {
- pte_unmap_unlock(ptep, ptl);
- mem_cgroup_cancel_charge(page, memcg, false);
- goto abort;
- }
+ } else if (!pte_none(*ptep))
+ goto unlock_abort;
/*
- * Check for usefaultfd but do not deliver the fault. Instead,
+ * Check for userfaultfd but do not deliver the fault. Instead,
* just back off.
*/
- if (userfaultfd_missing(vma)) {
- pte_unmap_unlock(ptep, ptl);
- mem_cgroup_cancel_charge(page, memcg, false);
- goto abort;
- }
+ if (userfaultfd_missing(vma))
+ goto unlock_abort;
inc_mm_counter(mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, addr, false);
- mem_cgroup_commit_charge(page, memcg, false, false);
if (!is_zone_device_page(page))
- lru_cache_add_active_or_unevictable(page, vma);
+ lru_cache_add_inactive_or_unevictable(page, vma);
get_page(page);
if (flush) {
@@ -2752,11 +2962,13 @@
*src = MIGRATE_PFN_MIGRATE;
return;
+unlock_abort:
+ pte_unmap_unlock(ptep, ptl);
abort:
*src &= ~MIGRATE_PFN_MIGRATE;
}
-/*
+/**
* migrate_vma_pages() - migrate meta-data from src page to dst page
* @migrate: migrate struct containing all migration information
*
@@ -2764,13 +2976,12 @@
* struct page. This effectively finishes the migration from source page to the
* destination page.
*/
-static void migrate_vma_pages(struct migrate_vma *migrate)
+void migrate_vma_pages(struct migrate_vma *migrate)
{
const unsigned long npages = migrate->npages;
const unsigned long start = migrate->start;
- struct vm_area_struct *vma = migrate->vma;
- struct mm_struct *mm = vma->vm_mm;
- unsigned long addr, i, mmu_start;
+ struct mmu_notifier_range range;
+ unsigned long addr, i;
bool notified = false;
for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) {
@@ -2785,15 +2996,17 @@
}
if (!page) {
- if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) {
+ if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE))
continue;
- }
if (!notified) {
- mmu_start = addr;
notified = true;
- mmu_notifier_invalidate_range_start(mm,
- mmu_start,
- migrate->end);
+
+ mmu_notifier_range_init(&range,
+ MMU_NOTIFY_CLEAR, 0,
+ NULL,
+ migrate->vma->vm_mm,
+ addr, migrate->end);
+ mmu_notifier_invalidate_range_start(&range);
}
migrate_vma_insert_page(migrate, addr, newpage,
&migrate->src[i],
@@ -2813,7 +3026,7 @@
migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
continue;
}
- } else if (!is_device_public_page(newpage)) {
+ } else {
/*
* Other types of ZONE_DEVICE page are not
* supported.
@@ -2834,11 +3047,11 @@
* did already call it.
*/
if (notified)
- mmu_notifier_invalidate_range_only_end(mm, mmu_start,
- migrate->end);
+ mmu_notifier_invalidate_range_only_end(&range);
}
+EXPORT_SYMBOL(migrate_vma_pages);
-/*
+/**
* migrate_vma_finalize() - restore CPU page table entry
* @migrate: migrate struct containing all migration information
*
@@ -2849,7 +3062,7 @@
* This also unlocks the pages and puts them back on the lru, or drops the extra
* refcount, for device pages.
*/
-static void migrate_vma_finalize(struct migrate_vma *migrate)
+void migrate_vma_finalize(struct migrate_vma *migrate)
{
const unsigned long npages = migrate->npages;
unsigned long i;
@@ -2876,7 +3089,6 @@
remove_migration_ptes(page, newpage, false);
unlock_page(page);
- migrate->cpages--;
if (is_zone_device_page(page))
put_page(page);
@@ -2892,124 +3104,5 @@
}
}
}
-
-/*
- * migrate_vma() - migrate a range of memory inside vma
- *
- * @ops: migration callback for allocating destination memory and copying
- * @vma: virtual memory area containing the range to be migrated
- * @start: start address of the range to migrate (inclusive)
- * @end: end address of the range to migrate (exclusive)
- * @src: array of hmm_pfn_t containing source pfns
- * @dst: array of hmm_pfn_t containing destination pfns
- * @private: pointer passed back to each of the callback
- * Returns: 0 on success, error code otherwise
- *
- * This function tries to migrate a range of memory virtual address range, using
- * callbacks to allocate and copy memory from source to destination. First it
- * collects all the pages backing each virtual address in the range, saving this
- * inside the src array. Then it locks those pages and unmaps them. Once the pages
- * are locked and unmapped, it checks whether each page is pinned or not. Pages
- * that aren't pinned have the MIGRATE_PFN_MIGRATE flag set (by this function)
- * in the corresponding src array entry. It then restores any pages that are
- * pinned, by remapping and unlocking those pages.
- *
- * At this point it calls the alloc_and_copy() callback. For documentation on
- * what is expected from that callback, see struct migrate_vma_ops comments in
- * include/linux/migrate.h
- *
- * After the alloc_and_copy() callback, this function goes over each entry in
- * the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag
- * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set,
- * then the function tries to migrate struct page information from the source
- * struct page to the destination struct page. If it fails to migrate the struct
- * page information, then it clears the MIGRATE_PFN_MIGRATE flag in the src
- * array.
- *
- * At this point all successfully migrated pages have an entry in the src
- * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst
- * array entry with MIGRATE_PFN_VALID flag set.
- *
- * It then calls the finalize_and_map() callback. See comments for "struct
- * migrate_vma_ops", in include/linux/migrate.h for details about
- * finalize_and_map() behavior.
- *
- * After the finalize_and_map() callback, for successfully migrated pages, this
- * function updates the CPU page table to point to new pages, otherwise it
- * restores the CPU page table to point to the original source pages.
- *
- * Function returns 0 after the above steps, even if no pages were migrated
- * (The function only returns an error if any of the arguments are invalid.)
- *
- * Both src and dst array must be big enough for (end - start) >> PAGE_SHIFT
- * unsigned long entries.
- */
-int migrate_vma(const struct migrate_vma_ops *ops,
- struct vm_area_struct *vma,
- unsigned long start,
- unsigned long end,
- unsigned long *src,
- unsigned long *dst,
- void *private)
-{
- struct migrate_vma migrate;
-
- /* Sanity check the arguments */
- start &= PAGE_MASK;
- end &= PAGE_MASK;
- if (!vma || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) ||
- vma_is_dax(vma))
- return -EINVAL;
- if (start < vma->vm_start || start >= vma->vm_end)
- return -EINVAL;
- if (end <= vma->vm_start || end > vma->vm_end)
- return -EINVAL;
- if (!ops || !src || !dst || start >= end)
- return -EINVAL;
-
- memset(src, 0, sizeof(*src) * ((end - start) >> PAGE_SHIFT));
- migrate.src = src;
- migrate.dst = dst;
- migrate.start = start;
- migrate.npages = 0;
- migrate.cpages = 0;
- migrate.end = end;
- migrate.vma = vma;
-
- /* Collect, and try to unmap source pages */
- migrate_vma_collect(&migrate);
- if (!migrate.cpages)
- return 0;
-
- /* Lock and isolate page */
- migrate_vma_prepare(&migrate);
- if (!migrate.cpages)
- return 0;
-
- /* Unmap pages */
- migrate_vma_unmap(&migrate);
- if (!migrate.cpages)
- return 0;
-
- /*
- * At this point pages are locked and unmapped, and thus they have
- * stable content and can safely be copied to destination memory that
- * is allocated by the callback.
- *
- * Note that migration can fail in migrate_vma_struct_page() for each
- * individual page.
- */
- ops->alloc_and_copy(vma, src, dst, start, end, private);
-
- /* This does the real migration of struct page */
- migrate_vma_pages(&migrate);
-
- ops->finalize_and_map(vma, src, dst, start, end, private);
-
- /* Unlock and remap pages */
- migrate_vma_finalize(&migrate);
-
- return 0;
-}
-EXPORT_SYMBOL(migrate_vma);
-#endif /* defined(MIGRATE_VMA_HELPER) */
+EXPORT_SYMBOL(migrate_vma_finalize);
+#endif /* CONFIG_DEVICE_PRIVATE */
--
Gitblit v1.6.2