From 102a0743326a03cd1a1202ceda21e175b7d3575c Mon Sep 17 00:00:00 2001 From: hc <hc@nodka.com> Date: Tue, 20 Feb 2024 01:20:52 +0000 Subject: [PATCH] add new system file --- kernel/mm/migrate.c | 1275 +++++++++++++++++++++++++++++++-------------------------- 1 files changed, 684 insertions(+), 591 deletions(-) diff --git a/kernel/mm/migrate.c b/kernel/mm/migrate.c index e748bef..ac32427 100644 --- a/kernel/mm/migrate.c +++ b/kernel/mm/migrate.c @@ -38,6 +38,7 @@ #include <linux/hugetlb.h> #include <linux/hugetlb_cgroup.h> #include <linux/gfp.h> +#include <linux/pagewalk.h> #include <linux/pfn_t.h> #include <linux/memremap.h> #include <linux/userfaultfd_k.h> @@ -47,39 +48,17 @@ #include <linux/page_owner.h> #include <linux/sched/mm.h> #include <linux/ptrace.h> +#include <linux/oom.h> #include <asm/tlbflush.h> #define CREATE_TRACE_POINTS #include <trace/events/migrate.h> +#undef CREATE_TRACE_POINTS +#include <trace/hooks/mm.h> +#include <trace/hooks/vmscan.h> #include "internal.h" - -/* - * migrate_prep() needs to be called before we start compiling a list of pages - * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is - * undesirable, use migrate_prep_local() - */ -int migrate_prep(void) -{ - /* - * Clear the LRU lists so pages can be isolated. - * Note that pages may be moved off the LRU after we have - * drained them. Those pages will fail to migrate like other - * pages that may be busy. - */ - lru_add_drain_all(); - - return 0; -} - -/* Do the necessary work of migrate_prep but not if it involves other CPUs */ -int migrate_prep_local(void) -{ - lru_add_drain(); - - return 0; -} int isolate_movable_page(struct page *page, isolate_mode_t mode) { @@ -100,7 +79,7 @@ /* * Check PageMovable before holding a PG_lock because page's owner * assumes anybody doesn't touch PG_lock of newly allocated page - * so unconditionally grapping the lock ruins page's owner side. + * so unconditionally grabbing the lock ruins page's owner side. */ if (unlikely(!__PageMovable(page))) goto out_putpage; @@ -129,7 +108,7 @@ /* Driver shouldn't use PG_isolated bit of page->flags */ WARN_ON_ONCE(PageIsolated(page)); - __SetPageIsolated(page); + SetPageIsolated(page); unlock_page(page); return 0; @@ -153,7 +132,7 @@ mapping = page_mapping(page); mapping->a_ops->putback_page(page); - __ClearPageIsolated(page); + ClearPageIsolated(page); } /* @@ -162,7 +141,7 @@ * * This function shall be used whenever the isolated pageset has been * built from lru, balloon, hugetlbfs page. See isolate_migratepages_range() - * and isolate_huge_page(). + * and isolate_hugetlb(). */ void putback_movable_pages(struct list_head *l) { @@ -186,16 +165,17 @@ if (PageMovable(page)) putback_movable_page(page); else - __ClearPageIsolated(page); + ClearPageIsolated(page); unlock_page(page); put_page(page); } else { mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + - page_is_file_cache(page), -hpage_nr_pages(page)); + page_is_file_lru(page), -thp_nr_pages(page)); putback_lru_page(page); } } } +EXPORT_SYMBOL_GPL(putback_movable_pages); /* * Restore a potential migration pte to a working pte entry @@ -240,15 +220,17 @@ */ entry = pte_to_swp_entry(*pvmw.pte); if (is_write_migration_entry(entry)) - pte = maybe_mkwrite(pte, vma); + pte = maybe_mkwrite(pte, vma->vm_flags); + else if (pte_swp_uffd_wp(*pvmw.pte)) + pte = pte_mkuffd_wp(pte); - if (unlikely(is_zone_device_page(new))) { - if (is_device_private_page(new)) { - entry = make_device_private_entry(new, pte_write(pte)); - pte = swp_entry_to_pte(entry); - } else if (is_device_public_page(new)) { - pte = pte_mkdevmap(pte); - } + if (unlikely(is_device_private_page(new))) { + entry = make_device_private_entry(new, pte_write(pte)); + pte = swp_entry_to_pte(entry); + if (pte_swp_soft_dirty(*pvmw.pte)) + pte = pte_swp_mksoft_dirty(pte); + if (pte_swp_uffd_wp(*pvmw.pte)) + pte = pte_swp_mkuffd_wp(pte); } #ifdef CONFIG_HUGETLB_PAGE @@ -322,19 +304,18 @@ goto out; page = migration_entry_to_page(entry); + page = compound_head(page); /* - * Once radix-tree replacement of page migration started, page_count - * *must* be zero. And, we don't want to call wait_on_page_locked() - * against a page without get_page(). - * So, we use get_page_unless_zero(), here. Even failed, page fault - * will occur again. + * Once page cache replacement of page migration started, page_count + * is zero; but we must not call put_and_wait_on_page_locked() without + * a ref. Use get_page_unless_zero(), and just fault again if it fails. */ if (!get_page_unless_zero(page)) goto out; pte_unmap_unlock(ptep, ptl); - wait_on_page_locked(page); - put_page(page); + trace_android_vh_waiting_for_page_migration(page); + put_and_wait_on_page_locked(page); return; out: pte_unmap_unlock(ptep, ptl); @@ -368,63 +349,27 @@ if (!get_page_unless_zero(page)) goto unlock; spin_unlock(ptl); - wait_on_page_locked(page); - put_page(page); + put_and_wait_on_page_locked(page); return; unlock: spin_unlock(ptl); } #endif -#ifdef CONFIG_BLOCK -/* Returns true if all buffers are successfully locked */ -static bool buffer_migrate_lock_buffers(struct buffer_head *head, - enum migrate_mode mode) +static int expected_page_refs(struct address_space *mapping, struct page *page) { - struct buffer_head *bh = head; + int expected_count = 1; - /* Simple case, sync compaction */ - if (mode != MIGRATE_ASYNC) { - do { - get_bh(bh); - lock_buffer(bh); - bh = bh->b_this_page; + /* + * Device private pages have an extra refcount as they are + * ZONE_DEVICE pages. + */ + expected_count += is_device_private_page(page); + if (mapping) + expected_count += thp_nr_pages(page) + page_has_private(page); - } while (bh != head); - - return true; - } - - /* async case, we cannot block on lock_buffer so use trylock_buffer */ - do { - get_bh(bh); - if (!trylock_buffer(bh)) { - /* - * We failed to lock the buffer and cannot stall in - * async migration. Release the taken locks - */ - struct buffer_head *failed_bh = bh; - put_bh(failed_bh); - bh = head; - while (bh != failed_bh) { - unlock_buffer(bh); - put_bh(bh); - bh = bh->b_this_page; - } - return false; - } - - bh = bh->b_this_page; - } while (bh != head); - return true; + return expected_count; } -#else -static inline bool buffer_migrate_lock_buffers(struct buffer_head *head, - enum migrate_mode mode) -{ - return true; -} -#endif /* CONFIG_BLOCK */ /* * Replace the page in the mapping. @@ -435,21 +380,13 @@ * 3 for pages with a mapping and PagePrivate/PagePrivate2 set. */ int migrate_page_move_mapping(struct address_space *mapping, - struct page *newpage, struct page *page, - struct buffer_head *head, enum migrate_mode mode, - int extra_count) + struct page *newpage, struct page *page, int extra_count) { + XA_STATE(xas, &mapping->i_pages, page_index(page)); struct zone *oldzone, *newzone; int dirty; - int expected_count = 1 + extra_count; - void **pslot; - - /* - * Device public or private pages have an extra refcount as they are - * ZONE_DEVICE pages. - */ - expected_count += is_device_private_page(page); - expected_count += is_device_public_page(page); + int expected_count = expected_page_refs(mapping, page) + extra_count; + int nr = thp_nr_pages(page); if (!mapping) { /* Anonymous page without mapping */ @@ -468,35 +405,14 @@ oldzone = page_zone(page); newzone = page_zone(newpage); - xa_lock_irq(&mapping->i_pages); - - pslot = radix_tree_lookup_slot(&mapping->i_pages, - page_index(page)); - - expected_count += hpage_nr_pages(page) + page_has_private(page); - if (page_count(page) != expected_count || - radix_tree_deref_slot_protected(pslot, - &mapping->i_pages.xa_lock) != page) { - xa_unlock_irq(&mapping->i_pages); + xas_lock_irq(&xas); + if (page_count(page) != expected_count || xas_load(&xas) != page) { + xas_unlock_irq(&xas); return -EAGAIN; } if (!page_ref_freeze(page, expected_count)) { - xa_unlock_irq(&mapping->i_pages); - return -EAGAIN; - } - - /* - * In the async migration case of moving a page with buffers, lock the - * buffers using trylock before the mapping is moved. If the mapping - * was moved, we later failed to lock the buffers and could not move - * the mapping back due to an elevated page count, we would have to - * block waiting on other references to be dropped. - */ - if (mode == MIGRATE_ASYNC && head && - !buffer_migrate_lock_buffers(head, mode)) { - page_ref_unfreeze(page, expected_count); - xa_unlock_irq(&mapping->i_pages); + xas_unlock_irq(&xas); return -EAGAIN; } @@ -506,7 +422,7 @@ */ newpage->index = page->index; newpage->mapping = page->mapping; - page_ref_add(newpage, hpage_nr_pages(page)); /* add cache reference */ + page_ref_add(newpage, nr); /* add cache reference */ if (PageSwapBacked(page)) { __SetPageSwapBacked(newpage); if (PageSwapCache(page)) { @@ -524,16 +440,13 @@ SetPageDirty(newpage); } - radix_tree_replace_slot(&mapping->i_pages, pslot, newpage); + xas_store(&xas, newpage); if (PageTransHuge(page)) { int i; - int index = page_index(page); - for (i = 1; i < HPAGE_PMD_NR; i++) { - pslot = radix_tree_lookup_slot(&mapping->i_pages, - index + i); - radix_tree_replace_slot(&mapping->i_pages, pslot, - newpage + i); + for (i = 1; i < nr; i++) { + xas_next(&xas); + xas_store(&xas, newpage); } } @@ -542,9 +455,9 @@ * to one less reference. * We know this isn't the last reference. */ - page_ref_unfreeze(page, expected_count - hpage_nr_pages(page)); + page_ref_unfreeze(page, expected_count - nr); - xa_unlock(&mapping->i_pages); + xas_unlock(&xas); /* Leave irq disabled to prevent preemption while updating stats */ /* @@ -558,17 +471,24 @@ * are mapped to swap space. */ if (newzone != oldzone) { - __dec_node_state(oldzone->zone_pgdat, NR_FILE_PAGES); - __inc_node_state(newzone->zone_pgdat, NR_FILE_PAGES); + struct lruvec *old_lruvec, *new_lruvec; + struct mem_cgroup *memcg; + + memcg = page_memcg(page); + old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat); + new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat); + + __mod_lruvec_state(old_lruvec, NR_FILE_PAGES, -nr); + __mod_lruvec_state(new_lruvec, NR_FILE_PAGES, nr); if (PageSwapBacked(page) && !PageSwapCache(page)) { - __dec_node_state(oldzone->zone_pgdat, NR_SHMEM); - __inc_node_state(newzone->zone_pgdat, NR_SHMEM); + __mod_lruvec_state(old_lruvec, NR_SHMEM, -nr); + __mod_lruvec_state(new_lruvec, NR_SHMEM, nr); } - if (dirty && mapping_cap_account_dirty(mapping)) { - __dec_node_state(oldzone->zone_pgdat, NR_FILE_DIRTY); - __dec_zone_state(oldzone, NR_ZONE_WRITE_PENDING); - __inc_node_state(newzone->zone_pgdat, NR_FILE_DIRTY); - __inc_zone_state(newzone, NR_ZONE_WRITE_PENDING); + if (dirty && mapping_can_writeback(mapping)) { + __mod_lruvec_state(old_lruvec, NR_FILE_DIRTY, -nr); + __mod_zone_page_state(oldzone, NR_ZONE_WRITE_PENDING, -nr); + __mod_lruvec_state(new_lruvec, NR_FILE_DIRTY, nr); + __mod_zone_page_state(newzone, NR_ZONE_WRITE_PENDING, nr); } } local_irq_enable(); @@ -584,22 +504,18 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page) { + XA_STATE(xas, &mapping->i_pages, page_index(page)); int expected_count; - void **pslot; - xa_lock_irq(&mapping->i_pages); - - pslot = radix_tree_lookup_slot(&mapping->i_pages, page_index(page)); - + xas_lock_irq(&xas); expected_count = 2 + page_has_private(page); - if (page_count(page) != expected_count || - radix_tree_deref_slot_protected(pslot, &mapping->i_pages.xa_lock) != page) { - xa_unlock_irq(&mapping->i_pages); + if (page_count(page) != expected_count || xas_load(&xas) != page) { + xas_unlock_irq(&xas); return -EAGAIN; } if (!page_ref_freeze(page, expected_count)) { - xa_unlock_irq(&mapping->i_pages); + xas_unlock_irq(&xas); return -EAGAIN; } @@ -608,11 +524,11 @@ get_page(newpage); - radix_tree_replace_slot(&mapping->i_pages, pslot, newpage); + xas_store(&xas, newpage); page_ref_unfreeze(page, expected_count - 1); - xa_unlock_irq(&mapping->i_pages); + xas_unlock_irq(&xas); return MIGRATEPAGE_SUCCESS; } @@ -656,7 +572,7 @@ } else { /* thp page */ BUG_ON(!PageTransHuge(src)); - nr_pages = hpage_nr_pages(src); + nr_pages = thp_nr_pages(src); } for (i = 0; i < nr_pages; i++) { @@ -671,6 +587,8 @@ void migrate_page_states(struct page *newpage, struct page *page) { int cpupid; + + trace_android_vh_migrate_page_states(page, newpage); if (PageError(page)) SetPageError(newpage); @@ -689,6 +607,7 @@ SetPageChecked(newpage); if (PageMappedToDisk(page)) SetPageMappedToDisk(newpage); + trace_android_vh_look_around_migrate_page(page, newpage); /* Move dirty on pages not done by migrate_page_move_mapping() */ if (PageDirty(page)) @@ -723,9 +642,18 @@ if (PageWriteback(newpage)) end_page_writeback(newpage); + /* + * PG_readahead shares the same bit with PG_reclaim. The above + * end_page_writeback() may clear PG_readahead mistakenly, so set the + * bit after that. + */ + if (PageReadahead(page)) + SetPageReadahead(newpage); + copy_page_owner(page, newpage); - mem_cgroup_migrate(page, newpage); + if (!PageHuge(page)) + mem_cgroup_migrate(page, newpage); } EXPORT_SYMBOL(migrate_page_states); @@ -758,7 +686,7 @@ BUG_ON(PageWriteback(page)); /* Writeback must be complete */ - rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0); + rc = migrate_page_move_mapping(mapping, newpage, page, 0); if (rc != MIGRATEPAGE_SUCCESS) return rc; @@ -772,40 +700,96 @@ EXPORT_SYMBOL(migrate_page); #ifdef CONFIG_BLOCK -/* - * Migration function for pages with buffers. This function can only be used - * if the underlying filesystem guarantees that no other references to "page" - * exist. - */ -int buffer_migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page, enum migrate_mode mode) +/* Returns true if all buffers are successfully locked */ +static bool buffer_migrate_lock_buffers(struct buffer_head *head, + enum migrate_mode mode) +{ + struct buffer_head *bh = head; + + /* Simple case, sync compaction */ + if (mode != MIGRATE_ASYNC) { + do { + lock_buffer(bh); + bh = bh->b_this_page; + + } while (bh != head); + + return true; + } + + /* async case, we cannot block on lock_buffer so use trylock_buffer */ + do { + if (!trylock_buffer(bh)) { + /* + * We failed to lock the buffer and cannot stall in + * async migration. Release the taken locks + */ + struct buffer_head *failed_bh = bh; + bh = head; + while (bh != failed_bh) { + unlock_buffer(bh); + bh = bh->b_this_page; + } + return false; + } + + bh = bh->b_this_page; + } while (bh != head); + return true; +} + +static int __buffer_migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page, enum migrate_mode mode, + bool check_refs) { struct buffer_head *bh, *head; int rc; + int expected_count; if (!page_has_buffers(page)) return migrate_page(mapping, newpage, page, mode); + /* Check whether page does not have extra refs before we do more work */ + expected_count = expected_page_refs(mapping, page); + if (page_count(page) != expected_count) + return -EAGAIN; + head = page_buffers(page); + if (!buffer_migrate_lock_buffers(head, mode)) + return -EAGAIN; - rc = migrate_page_move_mapping(mapping, newpage, page, head, mode, 0); + if (check_refs) { + bool busy; + bool invalidated = false; +recheck_buffers: + busy = false; + spin_lock(&mapping->private_lock); + bh = head; + do { + if (atomic_read(&bh->b_count)) { + busy = true; + break; + } + bh = bh->b_this_page; + } while (bh != head); + if (busy) { + if (invalidated) { + rc = -EAGAIN; + goto unlock_buffers; + } + spin_unlock(&mapping->private_lock); + invalidate_bh_lrus(); + invalidated = true; + goto recheck_buffers; + } + } + + rc = migrate_page_move_mapping(mapping, newpage, page, 0); if (rc != MIGRATEPAGE_SUCCESS) - return rc; + goto unlock_buffers; - /* - * In the async case, migrate_page_move_mapping locked the buffers - * with an IRQ-safe spinlock held. In the sync case, the buffers - * need to be locked now - */ - if (mode != MIGRATE_ASYNC) - BUG_ON(!buffer_migrate_lock_buffers(head, mode)); - - ClearPagePrivate(page); - set_page_private(newpage, page_private(page)); - set_page_private(page, 0); - put_page(page); - get_page(newpage); + attach_page_private(newpage, detach_page_private(page)); bh = head; do { @@ -814,24 +798,48 @@ } while (bh != head); - SetPagePrivate(newpage); - if (mode != MIGRATE_SYNC_NO_COPY) migrate_page_copy(newpage, page); else migrate_page_states(newpage, page); + rc = MIGRATEPAGE_SUCCESS; +unlock_buffers: + if (check_refs) + spin_unlock(&mapping->private_lock); bh = head; do { unlock_buffer(bh); - put_bh(bh); bh = bh->b_this_page; } while (bh != head); - return MIGRATEPAGE_SUCCESS; + return rc; +} + +/* + * Migration function for pages with buffers. This function can only be used + * if the underlying filesystem guarantees that no other references to "page" + * exist. For example attached buffer heads are accessed only under page lock. + */ +int buffer_migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page, enum migrate_mode mode) +{ + return __buffer_migrate_page(mapping, newpage, page, mode, false); } EXPORT_SYMBOL(buffer_migrate_page); + +/* + * Same as above except that this variant is more careful and checks that there + * are also no buffer head references. This function is the right one for + * mappings where buffer heads are directly looked up and referenced (such as + * block device mappings). + */ +int buffer_migrate_page_norefs(struct address_space *mapping, + struct page *newpage, struct page *page, enum migrate_mode mode) +{ + return __buffer_migrate_page(mapping, newpage, page, mode, true); +} #endif /* @@ -899,7 +907,7 @@ */ if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) - return -EAGAIN; + return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY; return migrate_page(mapping, newpage, page, mode); } @@ -951,7 +959,7 @@ VM_BUG_ON_PAGE(!PageIsolated(page), page); if (!PageMovable(page)) { rc = MIGRATEPAGE_SUCCESS; - __ClearPageIsolated(page); + ClearPageIsolated(page); goto out; } @@ -973,23 +981,23 @@ * We clear PG_movable under page_lock so any compactor * cannot try to migrate this page. */ - __ClearPageIsolated(page); + ClearPageIsolated(page); } /* - * Anonymous and movable page->mapping will be cleard by + * Anonymous and movable page->mapping will be cleared by * free_pages_prepare so don't reset it here for keeping * the type to work PageAnon, for example. */ if (!PageMappingFlags(page)) page->mapping = NULL; - if (unlikely(is_zone_device_page(newpage))) { - if (is_device_public_page(newpage)) - flush_dcache_page(newpage); - } else - flush_dcache_page(newpage); + if (likely(!is_zone_device_page(newpage))) { + int i, nr = compound_nr(newpage); + for (i = 0; i < nr; i++) + flush_dcache_page(newpage + i); + } } out: return rc; @@ -1013,7 +1021,7 @@ * to the LRU. Later, when the IO completes the pages are * marked uptodate and unlocked. However, the queueing * could be merging multiple pages for one bio (e.g. - * mpage_readpages). If an allocation happens for the + * mpage_readahead). If an allocation happens for the * second or third page, the process can end up locking * the same page twice and deadlocking. Rather than * trying to be clever about what pages can be locked, @@ -1101,8 +1109,7 @@ /* Establish migration ptes */ VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma, page); - try_to_unmap(page, - TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); + try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK); page_was_mapped = 1; } @@ -1141,34 +1148,19 @@ } /* - * gcc 4.7 and 4.8 on arm get an ICEs when inlining unmap_and_move(). Work - * around it. - */ -#if defined(CONFIG_ARM) && \ - defined(GCC_VERSION) && GCC_VERSION < 40900 && GCC_VERSION >= 40700 -#define ICE_noinline noinline -#else -#define ICE_noinline -#endif - -/* * Obtain the lock on page, remove all ptes and migrate the page * to the newly allocated page in newpage. */ -static ICE_noinline int unmap_and_move(new_page_t get_new_page, +static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page, unsigned long private, struct page *page, int force, enum migrate_mode mode, enum migrate_reason reason) { int rc = MIGRATEPAGE_SUCCESS; - struct page *newpage; + struct page *newpage = NULL; if (!thp_migration_supported() && PageTransHuge(page)) - return -ENOMEM; - - newpage = get_new_page(page, private); - if (!newpage) return -ENOMEM; if (page_count(page) == 1) { @@ -1178,15 +1170,15 @@ if (unlikely(__PageMovable(page))) { lock_page(page); if (!PageMovable(page)) - __ClearPageIsolated(page); + ClearPageIsolated(page); unlock_page(page); } - if (put_new_page) - put_new_page(newpage, private); - else - put_page(newpage); goto out; } + + newpage = get_new_page(page, private); + if (!newpage) + return -ENOMEM; rc = __unmap_and_move(page, newpage, force, mode); if (rc == MIGRATEPAGE_SUCCESS) @@ -1197,8 +1189,7 @@ /* * A page that has been migrated has all references * removed and will be freed. A page that has not been - * migrated will have kepts its references and be - * restored. + * migrated will have kept its references and be restored. */ list_del(&page->lru); @@ -1209,7 +1200,7 @@ */ if (likely(!__PageMovable(page))) mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + - page_is_file_cache(page), -hpage_nr_pages(page)); + page_is_file_lru(page), -thp_nr_pages(page)); } /* @@ -1218,16 +1209,11 @@ * we want to retry. */ if (rc == MIGRATEPAGE_SUCCESS) { - put_page(page); - if (reason == MR_MEMORY_FAILURE) { + if (reason != MR_MEMORY_FAILURE) /* - * Set PG_HWPoison on just freed page - * intentionally. Although it's rather weird, - * it's how HWPoison flag works at the moment. + * We release the page in page_handle_poison. */ - if (set_hwpoison_free_buddy_page(page)) - num_poisoned_pages_inc(); - } + put_page(page); } else { if (rc != -EAGAIN) { if (likely(!__PageMovable(page))) { @@ -1239,7 +1225,7 @@ if (PageMovable(page)) putback_movable_page(page); else - __ClearPageIsolated(page); + ClearPageIsolated(page); unlock_page(page); put_page(page); } @@ -1280,9 +1266,10 @@ int page_was_mapped = 0; struct page *new_hpage; struct anon_vma *anon_vma = NULL; + struct address_space *mapping = NULL; /* - * Movability of hugepages depends on architectures and hugepage size. + * Migratability of hugepages depends on architectures and their size. * This check is necessary because some callers of hugepage migration * like soft offline and memory hotremove don't walk through page * tables or check whether the hugepage is pmd-based or not before @@ -1327,9 +1314,29 @@ goto put_anon; if (page_mapped(hpage)) { - try_to_unmap(hpage, - TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); + bool mapping_locked = false; + enum ttu_flags ttu = TTU_MIGRATION|TTU_IGNORE_MLOCK; + + if (!PageAnon(hpage)) { + /* + * In shared mappings, try_to_unmap could potentially + * call huge_pmd_unshare. Because of this, take + * semaphore in write mode here and set TTU_RMAP_LOCKED + * to let lower levels know we have taken the lock. + */ + mapping = hugetlb_page_mapping_lock_write(hpage); + if (unlikely(!mapping)) + goto unlock_put_anon; + + mapping_locked = true; + ttu |= TTU_RMAP_LOCKED; + } + + try_to_unmap(hpage, ttu); page_was_mapped = 1; + + if (mapping_locked) + i_mmap_unlock_write(mapping); } if (!page_mapped(hpage)) @@ -1339,6 +1346,7 @@ remove_migration_ptes(hpage, rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage, false); +unlock_put_anon: unlock_page(new_hpage); put_anon: @@ -1395,22 +1403,37 @@ enum migrate_mode mode, int reason) { int retry = 1; + int thp_retry = 1; int nr_failed = 0; int nr_succeeded = 0; + int nr_thp_succeeded = 0; + int nr_thp_failed = 0; + int nr_thp_split = 0; int pass = 0; + bool is_thp = false; struct page *page; struct page *page2; int swapwrite = current->flags & PF_SWAPWRITE; - int rc; + int rc, nr_subpages; + + trace_mm_migrate_pages_start(mode, reason); if (!swapwrite) current->flags |= PF_SWAPWRITE; - for(pass = 0; pass < 10 && retry; pass++) { + for (pass = 0; pass < 10 && (retry || thp_retry); pass++) { retry = 0; + thp_retry = 0; list_for_each_entry_safe(page, page2, from, lru) { retry: + /* + * THP statistics is based on the source huge page. + * Capture required information that might get lost + * during migration. + */ + is_thp = PageTransHuge(page) && !PageHuge(page); + nr_subpages = thp_nr_pages(page); cond_resched(); if (PageHuge(page)) @@ -1435,21 +1458,35 @@ * we encounter them after the rest of the list * is processed. */ - if (PageTransHuge(page) && !PageHuge(page)) { + if (is_thp) { lock_page(page); rc = split_huge_page_to_list(page, from); unlock_page(page); if (!rc) { list_safe_reset_next(page, page2, lru); + nr_thp_split++; goto retry; } + + nr_thp_failed++; + nr_failed += nr_subpages; + goto out; } nr_failed++; goto out; case -EAGAIN: + if (is_thp) { + thp_retry++; + break; + } retry++; break; case MIGRATEPAGE_SUCCESS: + if (is_thp) { + nr_thp_succeeded++; + nr_succeeded += nr_subpages; + break; + } nr_succeeded++; break; default: @@ -1459,24 +1496,76 @@ * removed from migration page list and not * retried in the next outer loop. */ + if (is_thp) { + nr_thp_failed++; + nr_failed += nr_subpages; + break; + } nr_failed++; break; } } } - nr_failed += retry; + nr_failed += retry + thp_retry; + nr_thp_failed += thp_retry; rc = nr_failed; out: - if (nr_succeeded) - count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded); - if (nr_failed) - count_vm_events(PGMIGRATE_FAIL, nr_failed); - trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason); + count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded); + count_vm_events(PGMIGRATE_FAIL, nr_failed); + count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded); + count_vm_events(THP_MIGRATION_FAIL, nr_thp_failed); + count_vm_events(THP_MIGRATION_SPLIT, nr_thp_split); + trace_mm_migrate_pages(nr_succeeded, nr_failed, nr_thp_succeeded, + nr_thp_failed, nr_thp_split, mode, reason); if (!swapwrite) current->flags &= ~PF_SWAPWRITE; return rc; +} +EXPORT_SYMBOL_GPL(migrate_pages); + +struct page *alloc_migration_target(struct page *page, unsigned long private) +{ + struct migration_target_control *mtc; + gfp_t gfp_mask; + unsigned int order = 0; + struct page *new_page = NULL; + int nid; + int zidx; + + mtc = (struct migration_target_control *)private; + gfp_mask = mtc->gfp_mask; + nid = mtc->nid; + if (nid == NUMA_NO_NODE) + nid = page_to_nid(page); + + if (PageHuge(page)) { + struct hstate *h = page_hstate(compound_head(page)); + + gfp_mask = htlb_modify_alloc_mask(h, gfp_mask); + return alloc_huge_page_nodemask(h, nid, mtc->nmask, gfp_mask); + } + + if (PageTransHuge(page)) { + /* + * clear __GFP_RECLAIM to make the migration callback + * consistent with regular THP allocations. + */ + gfp_mask &= ~__GFP_RECLAIM; + gfp_mask |= GFP_TRANSHUGE; + order = HPAGE_PMD_ORDER; + } + zidx = zone_idx(page_zone(page)); + if (is_highmem_idx(zidx) || zidx == ZONE_MOVABLE) + gfp_mask |= __GFP_HIGHMEM; + + new_page = __alloc_pages_nodemask(gfp_mask, order, nid, mtc->nmask); + + if (new_page && PageTransHuge(new_page)) + prep_transhuge_page(new_page); + + return new_page; } #ifdef CONFIG_NUMA @@ -1496,12 +1585,13 @@ struct list_head *pagelist, int node) { int err; + struct migration_target_control mtc = { + .nid = node, + .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, + }; - if (list_empty(pagelist)) - return 0; - - err = migrate_pages(pagelist, alloc_new_node_page, NULL, node, - MIGRATE_SYNC, MR_SYSCALL); + err = migrate_pages(pagelist, alloc_migration_target, NULL, + (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL); if (err) putback_movable_pages(pagelist); return err; @@ -1524,7 +1614,7 @@ unsigned int follflags; int err; - down_read(&mm->mmap_sem); + mmap_read_lock(mm); err = -EFAULT; vma = find_vma(mm, addr); if (!vma || addr < vma->vm_start || !vma_migratable(vma)) @@ -1552,8 +1642,9 @@ if (PageHuge(page)) { if (PageHead(page)) { - isolate_huge_page(page, pagelist); - err = 1; + err = isolate_hugetlb(page, pagelist); + if (!err) + err = 1; } } else { struct page *head; @@ -1566,8 +1657,8 @@ err = 1; list_add_tail(&head->lru, pagelist); mod_node_page_state(page_pgdat(head), - NR_ISOLATED_ANON + page_is_file_cache(head), - hpage_nr_pages(head)); + NR_ISOLATED_ANON + page_is_file_lru(head), + thp_nr_pages(head)); } out_putpage: /* @@ -1575,10 +1666,36 @@ * isolate_lru_page() or drop the page ref if it was * not isolated. */ - put_page(page); + put_user_page(page); out: - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); return err; +} + +static int move_pages_and_store_status(struct mm_struct *mm, int node, + struct list_head *pagelist, int __user *status, + int start, int i, unsigned long nr_pages) +{ + int err; + + if (list_empty(pagelist)) + return 0; + + err = do_move_pages_to_node(mm, pagelist, node); + if (err) { + /* + * Positive err means the number of failed + * pages to migrate. Since we are going to + * abort and return the number of non-migrated + * pages, so need to incude the rest of the + * nr_pages that have not been attempted as + * well. + */ + if (err > 0) + err += nr_pages - i - 1; + return err; + } + return store_status(status, start, node, i - start); } /* @@ -1596,7 +1713,7 @@ int start, i; int err = 0, err1; - migrate_prep(); + lru_cache_disable(); for (i = start = 0; i < nr_pages; i++) { const void __user *p; @@ -1624,21 +1741,8 @@ current_node = node; start = i; } else if (node != current_node) { - err = do_move_pages_to_node(mm, &pagelist, current_node); - if (err) { - /* - * Positive err means the number of failed - * pages to migrate. Since we are going to - * abort and return the number of non-migrated - * pages, so need to incude the rest of the - * nr_pages that have not been attempted as - * well. - */ - if (err > 0) - err += nr_pages - i - 1; - goto out; - } - err = store_status(status, start, current_node, i - start); + err = move_pages_and_store_status(mm, current_node, + &pagelist, status, start, i, nr_pages); if (err) goto out; start = i; @@ -1652,52 +1756,33 @@ err = add_page_for_migration(mm, addr, current_node, &pagelist, flags & MPOL_MF_MOVE_ALL); - if (!err) { - /* The page is already on the target node */ - err = store_status(status, i, current_node, 1); - if (err) - goto out_flush; - continue; - } else if (err > 0) { + if (err > 0) { /* The page is successfully queued for migration */ continue; } - err = store_status(status, i, err, 1); + /* + * If the page is already on the target node (!err), store the + * node, otherwise, store the err. + */ + err = store_status(status, i, err ? : current_node, 1); if (err) goto out_flush; - err = do_move_pages_to_node(mm, &pagelist, current_node); - if (err) { - if (err > 0) - err += nr_pages - i - 1; + err = move_pages_and_store_status(mm, current_node, &pagelist, + status, start, i, nr_pages); + if (err) goto out; - } - if (i > start) { - err = store_status(status, start, current_node, i - start); - if (err) - goto out; - } current_node = NUMA_NO_NODE; } out_flush: - if (list_empty(&pagelist)) - return err; - /* Make sure we do not overwrite the existing error */ - err1 = do_move_pages_to_node(mm, &pagelist, current_node); - /* - * Don't have to report non-attempted pages here since: - * - If the above loop is done gracefully all pages have been - * attempted. - * - If the above loop is aborted it means a fatal error - * happened, should return ret. - */ - if (!err1) - err1 = store_status(status, start, current_node, i - start); + err1 = move_pages_and_store_status(mm, current_node, &pagelist, + status, start, i, nr_pages); if (err >= 0) err = err1; out: + lru_cache_enable(); return err; } @@ -1709,7 +1794,7 @@ { unsigned long i; - down_read(&mm->mmap_sem); + mmap_read_lock(mm); for (i = 0; i < nr_pages; i++) { unsigned long addr = (unsigned long)(*pages); @@ -1736,7 +1821,7 @@ status++; } - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); } /* @@ -1773,6 +1858,53 @@ return nr_pages ? -EFAULT : 0; } +static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes) +{ + struct task_struct *task; + struct mm_struct *mm; + + /* + * There is no need to check if current process has the right to modify + * the specified process when they are same. + */ + if (!pid) { + mmget(current->mm); + *mem_nodes = cpuset_mems_allowed(current); + return current->mm; + } + + /* Find the mm_struct */ + rcu_read_lock(); + task = find_task_by_vpid(pid); + if (!task) { + rcu_read_unlock(); + return ERR_PTR(-ESRCH); + } + get_task_struct(task); + + /* + * Check if this process has the right to modify the specified + * process. Use the regular "ptrace_may_access()" checks. + */ + if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) { + rcu_read_unlock(); + mm = ERR_PTR(-EPERM); + goto out; + } + rcu_read_unlock(); + + mm = ERR_PTR(security_task_movememory(task)); + if (IS_ERR(mm)) + goto out; + *mem_nodes = cpuset_mems_allowed(task); + mm = get_task_mm(task); +out: + put_task_struct(task); + if (!mm) + mm = ERR_PTR(-EINVAL); + return mm; +} + /* * Move a list of pages in the address space of the currently executing * process. @@ -1782,7 +1914,6 @@ const int __user *nodes, int __user *status, int flags) { - struct task_struct *task; struct mm_struct *mm; int err; nodemask_t task_nodes; @@ -1794,36 +1925,9 @@ if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) return -EPERM; - /* Find the mm_struct */ - rcu_read_lock(); - task = pid ? find_task_by_vpid(pid) : current; - if (!task) { - rcu_read_unlock(); - return -ESRCH; - } - get_task_struct(task); - - /* - * Check if this process has the right to modify the specified - * process. Use the regular "ptrace_may_access()" checks. - */ - if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) { - rcu_read_unlock(); - err = -EPERM; - goto out; - } - rcu_read_unlock(); - - err = security_task_movememory(task); - if (err) - goto out; - - task_nodes = cpuset_mems_allowed(task); - mm = get_task_mm(task); - put_task_struct(task); - - if (!mm) - return -EINVAL; + mm = find_mm_struct(pid, &task_nodes); + if (IS_ERR(mm)) + return PTR_ERR(mm); if (nodes) err = do_pages_move(mm, task_nodes, nr_pages, pages, @@ -1832,10 +1936,6 @@ err = do_pages_stat(mm, nr_pages, pages, status); mmput(mm); - return err; - -out: - put_task_struct(task); return err; } @@ -1889,7 +1989,7 @@ if (!zone_watermark_ok(zone, 0, high_wmark_pages(zone) + nr_migrate_pages, - 0, 0)) + ZONE_MOVABLE, 0)) continue; return true; } @@ -1918,7 +2018,7 @@ VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page); /* Avoid migrating to a node that is nearly full */ - if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page))) + if (!migrate_balanced_pgdat(pgdat, compound_nr(page))) return 0; if (isolate_lru_page(page)) @@ -1936,9 +2036,9 @@ return 0; } - page_lru = page_is_file_cache(page); + page_lru = page_is_file_lru(page); mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru, - hpage_nr_pages(page)); + thp_nr_pages(page)); /* * Isolating the page has taken another reference, so the @@ -1960,7 +2060,7 @@ * node. Caller is expected to have an elevated reference count on * the page that will be dropped by this function before returning. */ -int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, +int migrate_misplaced_page(struct page *page, struct vm_fault *vmf, int node) { pg_data_t *pgdat = NODE_DATA(node); @@ -1972,15 +2072,15 @@ * Don't migrate file pages that are mapped in multiple processes * with execute permissions as they are probably shared libraries. */ - if (page_mapcount(page) != 1 && page_is_file_cache(page) && - (vma->vm_flags & VM_EXEC)) + if (page_mapcount(page) != 1 && page_is_file_lru(page) && + (vmf->vma_flags & VM_EXEC)) goto out; /* * Also do not migrate dirty pages as not all filesystems can move * dirty pages in MIGRATE_ASYNC mode which is a waste of cycles. */ - if (page_is_file_cache(page) && PageDirty(page)) + if (page_is_file_lru(page) && PageDirty(page)) goto out; isolated = numamigrate_isolate_page(pgdat, page); @@ -1995,7 +2095,7 @@ if (!list_empty(&migratepages)) { list_del(&page->lru); dec_node_page_state(page, NR_ISOLATED_ANON + - page_is_file_cache(page)); + page_is_file_lru(page)); putback_lru_page(page); } isolated = 0; @@ -2025,9 +2125,8 @@ pg_data_t *pgdat = NODE_DATA(node); int isolated = 0; struct page *new_page = NULL; - int page_lru = page_is_file_cache(page); - unsigned long mmun_start = address & HPAGE_PMD_MASK; - unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE; + int page_lru = page_is_file_lru(page); + unsigned long start = address & HPAGE_PMD_MASK; new_page = alloc_pages_node(node, (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE), @@ -2050,15 +2149,15 @@ /* anon mapping, we can simply copy page->mapping to the new page: */ new_page->mapping = page->mapping; new_page->index = page->index; + /* flush the cache before copying using the kernel virtual address */ + flush_cache_range(vma, start, start + HPAGE_PMD_SIZE); migrate_page_copy(new_page, page); WARN_ON(PageLRU(new_page)); /* Recheck the target PMD */ - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); ptl = pmd_lock(mm, pmd); if (unlikely(!pmd_same(*pmd, entry) || !page_ref_freeze(page, 2))) { spin_unlock(ptl); - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); /* Reverse changes made by migrate_page_copy() */ if (TestClearPageActive(new_page)) @@ -2089,8 +2188,7 @@ * new page and page_add_new_anon_rmap guarantee the copy is * visible before the pagetable update. */ - flush_cache_range(vma, mmun_start, mmun_end); - page_add_anon_rmap(new_page, vma, mmun_start, true); + page_add_anon_rmap(new_page, vma, start, true); /* * At this point the pmd is numa/protnone (i.e. non present) and the TLB * has already been flushed globally. So no TLB can be currently @@ -2098,11 +2196,11 @@ * pmd before doing set_pmd_at(), nor to flush the TLB after * set_pmd_at(). Clearing the pmd here would introduce a race * condition against MADV_DONTNEED, because MADV_DONTNEED only holds the - * mmap_sem for reading. If the pmd is set to NULL at any given time, + * mmap_lock for reading. If the pmd is set to NULL at any given time, * MADV_DONTNEED won't wait on the pmd lock and it'll skip clearing this * pmd. */ - set_pmd_at(mm, mmun_start, pmd, entry); + set_pmd_at(mm, start, pmd, entry); update_mmu_cache_pmd(vma, address, &entry); page_ref_unfreeze(page, 2); @@ -2111,11 +2209,6 @@ set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED); spin_unlock(ptl); - /* - * No need to double call mmu_notifier->invalidate_range() callback as - * the above pmdp_huge_clear_flush_notify() did already call it. - */ - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); /* Take an "isolate" reference and put new page on the LRU. */ get_page(new_page); @@ -2139,7 +2232,7 @@ ptl = pmd_lock(mm, pmd); if (pmd_same(*pmd, entry)) { entry = pmd_modify(entry, vma->vm_page_prot); - set_pmd_at(mm, mmun_start, pmd, entry); + set_pmd_at(mm, start, pmd, entry); update_mmu_cache_pmd(vma, address, &entry); } spin_unlock(ptl); @@ -2153,25 +2246,26 @@ #endif /* CONFIG_NUMA */ -#if defined(CONFIG_MIGRATE_VMA_HELPER) -struct migrate_vma { - struct vm_area_struct *vma; - unsigned long *dst; - unsigned long *src; - unsigned long cpages; - unsigned long npages; - unsigned long start; - unsigned long end; -}; - +#ifdef CONFIG_DEVICE_PRIVATE static int migrate_vma_collect_hole(unsigned long start, unsigned long end, + __always_unused int depth, struct mm_walk *walk) { struct migrate_vma *migrate = walk->private; unsigned long addr; - for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) { + /* Only allow populating anonymous memory. */ + if (!vma_is_anonymous(walk->vma)) { + for (addr = start; addr < end; addr += PAGE_SIZE) { + migrate->src[migrate->npages] = 0; + migrate->dst[migrate->npages] = 0; + migrate->npages++; + } + return 0; + } + + for (addr = start; addr < end; addr += PAGE_SIZE) { migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE; migrate->dst[migrate->npages] = 0; migrate->npages++; @@ -2188,7 +2282,7 @@ struct migrate_vma *migrate = walk->private; unsigned long addr; - for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) { + for (addr = start; addr < end; addr += PAGE_SIZE) { migrate->dst[migrate->npages] = 0; migrate->src[migrate->npages++] = 0; } @@ -2210,7 +2304,7 @@ again: if (pmd_none(*pmdp)) - return migrate_vma_collect_hole(start, end, walk); + return migrate_vma_collect_hole(start, end, -1, walk); if (pmd_trans_huge(*pmdp)) { struct page *page; @@ -2243,7 +2337,7 @@ return migrate_vma_collect_skip(start, end, walk); if (pmd_none(*pmdp)) - return migrate_vma_collect_hole(start, end, + return migrate_vma_collect_hole(start, end, -1, walk); } } @@ -2255,24 +2349,22 @@ arch_enter_lazy_mmu_mode(); for (; addr < end; addr += PAGE_SIZE, ptep++) { - unsigned long mpfn, pfn; + unsigned long mpfn = 0, pfn; struct page *page; swp_entry_t entry; pte_t pte; pte = *ptep; - pfn = pte_pfn(pte); if (pte_none(pte)) { - mpfn = MIGRATE_PFN_MIGRATE; - migrate->cpages++; - pfn = 0; + if (vma_is_anonymous(vma)) { + mpfn = MIGRATE_PFN_MIGRATE; + migrate->cpages++; + } goto next; } if (!pte_present(pte)) { - mpfn = pfn = 0; - /* * Only care about unaddressable device page special * page table entry. Other special swap entries are not @@ -2283,28 +2375,34 @@ goto next; page = device_private_entry_to_page(entry); - mpfn = migrate_pfn(page_to_pfn(page))| - MIGRATE_PFN_DEVICE | MIGRATE_PFN_MIGRATE; + if (!(migrate->flags & + MIGRATE_VMA_SELECT_DEVICE_PRIVATE) || + page->pgmap->owner != migrate->pgmap_owner) + goto next; + + mpfn = migrate_pfn(page_to_pfn(page)) | + MIGRATE_PFN_MIGRATE; if (is_write_device_private_entry(entry)) mpfn |= MIGRATE_PFN_WRITE; } else { + if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) + goto next; + pfn = pte_pfn(pte); if (is_zero_pfn(pfn)) { mpfn = MIGRATE_PFN_MIGRATE; migrate->cpages++; - pfn = 0; goto next; } - page = _vm_normal_page(migrate->vma, addr, pte, true); + page = vm_normal_page(migrate->vma, addr, pte); mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0; } /* FIXME support THP */ if (!page || !page->mapping || PageTransCompound(page)) { - mpfn = pfn = 0; + mpfn = 0; goto next; } - pfn = page_to_pfn(page); /* * By getting a reference on the page we pin it and that blocks @@ -2333,8 +2431,17 @@ entry = make_migration_entry(page, mpfn & MIGRATE_PFN_WRITE); swp_pte = swp_entry_to_pte(entry); - if (pte_soft_dirty(pte)) - swp_pte = pte_swp_mksoft_dirty(swp_pte); + if (pte_present(pte)) { + if (pte_soft_dirty(pte)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + if (pte_uffd_wp(pte)) + swp_pte = pte_swp_mkuffd_wp(swp_pte); + } else { + if (pte_swp_soft_dirty(pte)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + if (pte_swp_uffd_wp(pte)) + swp_pte = pte_swp_mkuffd_wp(swp_pte); + } set_pte_at(mm, addr, ptep, swp_pte); /* @@ -2353,15 +2460,21 @@ migrate->dst[migrate->npages] = 0; migrate->src[migrate->npages++] = mpfn; } - arch_leave_lazy_mmu_mode(); - pte_unmap_unlock(ptep - 1, ptl); /* Only flush the TLB if we actually modified any entries */ if (unmapped) flush_tlb_range(walk->vma, start, end); + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(ptep - 1, ptl); + return 0; } + +static const struct mm_walk_ops migrate_vma_walk_ops = { + .pmd_entry = migrate_vma_collect_pmd, + .pte_hole = migrate_vma_collect_hole, +}; /* * migrate_vma_collect() - collect pages over a range of virtual addresses @@ -2373,22 +2486,22 @@ */ static void migrate_vma_collect(struct migrate_vma *migrate) { - struct mm_walk mm_walk = { - .pmd_entry = migrate_vma_collect_pmd, - .pte_hole = migrate_vma_collect_hole, - .vma = migrate->vma, - .mm = migrate->vma->vm_mm, - .private = migrate, - }; + struct mmu_notifier_range range; - mmu_notifier_invalidate_range_start(mm_walk.mm, - migrate->start, - migrate->end); - walk_page_range(migrate->start, migrate->end, &mm_walk); - mmu_notifier_invalidate_range_end(mm_walk.mm, - migrate->start, - migrate->end); + /* + * Note that the pgmap_owner is passed to the mmu notifier callback so + * that the registered device driver can skip invalidating device + * private page mappings that won't be migrated. + */ + mmu_notifier_range_init_migrate(&range, 0, migrate->vma, + migrate->vma->vm_mm, migrate->start, migrate->end, + migrate->pgmap_owner); + mmu_notifier_invalidate_range_start(&range); + walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end, + &migrate_vma_walk_ops, migrate); + + mmu_notifier_invalidate_range_end(&range); migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT); } @@ -2432,16 +2545,7 @@ * FIXME proper solution is to rework migration_entry_wait() so * it does not need to take a reference on page. */ - if (is_device_private_page(page)) - return true; - - /* - * Only allow device public page to be migrated and account for - * the extra reference count imply by ZONE_DEVICE pages. - */ - if (!is_device_public_page(page)) - return false; - extra++; + return is_device_private_page(page); } /* For file back page */ @@ -2575,7 +2679,7 @@ */ static void migrate_vma_unmap(struct migrate_vma *migrate) { - int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; + int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK; const unsigned long npages = migrate->npages; const unsigned long start = migrate->start; unsigned long addr, i, restore = 0; @@ -2620,6 +2724,118 @@ } } +/** + * migrate_vma_setup() - prepare to migrate a range of memory + * @args: contains the vma, start, and pfns arrays for the migration + * + * Returns: negative errno on failures, 0 when 0 or more pages were migrated + * without an error. + * + * Prepare to migrate a range of memory virtual address range by collecting all + * the pages backing each virtual address in the range, saving them inside the + * src array. Then lock those pages and unmap them. Once the pages are locked + * and unmapped, check whether each page is pinned or not. Pages that aren't + * pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) in the + * corresponding src array entry. Then restores any pages that are pinned, by + * remapping and unlocking those pages. + * + * The caller should then allocate destination memory and copy source memory to + * it for all those entries (ie with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE + * flag set). Once these are allocated and copied, the caller must update each + * corresponding entry in the dst array with the pfn value of the destination + * page and with the MIGRATE_PFN_VALID and MIGRATE_PFN_LOCKED flags set + * (destination pages must have their struct pages locked, via lock_page()). + * + * Note that the caller does not have to migrate all the pages that are marked + * with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration from + * device memory to system memory. If the caller cannot migrate a device page + * back to system memory, then it must return VM_FAULT_SIGBUS, which has severe + * consequences for the userspace process, so it must be avoided if at all + * possible. + * + * For empty entries inside CPU page table (pte_none() or pmd_none() is true) we + * do set MIGRATE_PFN_MIGRATE flag inside the corresponding source array thus + * allowing the caller to allocate device memory for those unback virtual + * address. For this the caller simply has to allocate device memory and + * properly set the destination entry like for regular migration. Note that + * this can still fails and thus inside the device driver must check if the + * migration was successful for those entries after calling migrate_vma_pages() + * just like for regular migration. + * + * After that, the callers must call migrate_vma_pages() to go over each entry + * in the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag + * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set, + * then migrate_vma_pages() to migrate struct page information from the source + * struct page to the destination struct page. If it fails to migrate the + * struct page information, then it clears the MIGRATE_PFN_MIGRATE flag in the + * src array. + * + * At this point all successfully migrated pages have an entry in the src + * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst + * array entry with MIGRATE_PFN_VALID flag set. + * + * Once migrate_vma_pages() returns the caller may inspect which pages were + * successfully migrated, and which were not. Successfully migrated pages will + * have the MIGRATE_PFN_MIGRATE flag set for their src array entry. + * + * It is safe to update device page table after migrate_vma_pages() because + * both destination and source page are still locked, and the mmap_lock is held + * in read mode (hence no one can unmap the range being migrated). + * + * Once the caller is done cleaning up things and updating its page table (if it + * chose to do so, this is not an obligation) it finally calls + * migrate_vma_finalize() to update the CPU page table to point to new pages + * for successfully migrated pages or otherwise restore the CPU page table to + * point to the original source pages. + */ +int migrate_vma_setup(struct migrate_vma *args) +{ + long nr_pages = (args->end - args->start) >> PAGE_SHIFT; + + args->start &= PAGE_MASK; + args->end &= PAGE_MASK; + if (!args->vma || is_vm_hugetlb_page(args->vma) || + (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma)) + return -EINVAL; + if (nr_pages <= 0) + return -EINVAL; + if (args->start < args->vma->vm_start || + args->start >= args->vma->vm_end) + return -EINVAL; + if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end) + return -EINVAL; + if (!args->src || !args->dst) + return -EINVAL; + + memset(args->src, 0, sizeof(*args->src) * nr_pages); + args->cpages = 0; + args->npages = 0; + + migrate_vma_collect(args); + + if (args->cpages) + migrate_vma_prepare(args); + if (args->cpages) + migrate_vma_unmap(args); + + /* + * At this point pages are locked and unmapped, and thus they have + * stable content and can safely be copied to destination memory that + * is allocated by the drivers. + */ + return 0; + +} +EXPORT_SYMBOL(migrate_vma_setup); + +/* + * This code closely matches the code in: + * __handle_mm_fault() + * handle_pte_fault() + * do_anonymous_page() + * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE + * private page. + */ static void migrate_vma_insert_page(struct migrate_vma *migrate, unsigned long addr, struct page *page, @@ -2628,7 +2844,6 @@ { struct vm_area_struct *vma = migrate->vma; struct mm_struct *mm = vma->vm_mm; - struct mem_cgroup *memcg; bool flush = false; spinlock_t *ptl; pte_t entry; @@ -2661,12 +2876,12 @@ * pte_offset_map() on pmds where a huge pmd might be created * from a different thread. * - * pte_alloc_map() is safe to use under down_write(mmap_sem) or when + * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when * parallel threads are excluded by other means. * - * Here we only have down_read(mmap_sem). + * Here we only have mmap_read_lock(mm). */ - if (pte_alloc(mm, pmdp, addr)) + if (pte_alloc(mm, pmdp)) goto abort; /* See the comment in pte_alloc_one_map() */ @@ -2675,7 +2890,7 @@ if (unlikely(anon_vma_prepare(vma))) goto abort; - if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false)) + if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL)) goto abort; /* @@ -2691,11 +2906,13 @@ swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE); entry = swp_entry_to_pte(swp_entry); - } else if (is_device_public_page(page)) { - entry = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot))); - if (vma->vm_flags & VM_WRITE) - entry = pte_mkwrite(pte_mkdirty(entry)); - entry = pte_mkdevmap(entry); + } else { + /* + * For now we only support migrating to un-addressable + * device memory. + */ + pr_warn_once("Unsupported ZONE_DEVICE page type.\n"); + goto abort; } } else { entry = mk_pte(page, vma->vm_page_prot); @@ -2705,36 +2922,29 @@ ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + if (check_stable_address_space(mm)) + goto unlock_abort; + if (pte_present(*ptep)) { unsigned long pfn = pte_pfn(*ptep); - if (!is_zero_pfn(pfn)) { - pte_unmap_unlock(ptep, ptl); - mem_cgroup_cancel_charge(page, memcg, false); - goto abort; - } + if (!is_zero_pfn(pfn)) + goto unlock_abort; flush = true; - } else if (!pte_none(*ptep)) { - pte_unmap_unlock(ptep, ptl); - mem_cgroup_cancel_charge(page, memcg, false); - goto abort; - } + } else if (!pte_none(*ptep)) + goto unlock_abort; /* - * Check for usefaultfd but do not deliver the fault. Instead, + * Check for userfaultfd but do not deliver the fault. Instead, * just back off. */ - if (userfaultfd_missing(vma)) { - pte_unmap_unlock(ptep, ptl); - mem_cgroup_cancel_charge(page, memcg, false); - goto abort; - } + if (userfaultfd_missing(vma)) + goto unlock_abort; inc_mm_counter(mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, addr, false); - mem_cgroup_commit_charge(page, memcg, false, false); if (!is_zone_device_page(page)) - lru_cache_add_active_or_unevictable(page, vma); + lru_cache_add_inactive_or_unevictable(page, vma); get_page(page); if (flush) { @@ -2752,11 +2962,13 @@ *src = MIGRATE_PFN_MIGRATE; return; +unlock_abort: + pte_unmap_unlock(ptep, ptl); abort: *src &= ~MIGRATE_PFN_MIGRATE; } -/* +/** * migrate_vma_pages() - migrate meta-data from src page to dst page * @migrate: migrate struct containing all migration information * @@ -2764,13 +2976,12 @@ * struct page. This effectively finishes the migration from source page to the * destination page. */ -static void migrate_vma_pages(struct migrate_vma *migrate) +void migrate_vma_pages(struct migrate_vma *migrate) { const unsigned long npages = migrate->npages; const unsigned long start = migrate->start; - struct vm_area_struct *vma = migrate->vma; - struct mm_struct *mm = vma->vm_mm; - unsigned long addr, i, mmu_start; + struct mmu_notifier_range range; + unsigned long addr, i; bool notified = false; for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) { @@ -2785,15 +2996,17 @@ } if (!page) { - if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) { + if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) continue; - } if (!notified) { - mmu_start = addr; notified = true; - mmu_notifier_invalidate_range_start(mm, - mmu_start, - migrate->end); + + mmu_notifier_range_init(&range, + MMU_NOTIFY_CLEAR, 0, + NULL, + migrate->vma->vm_mm, + addr, migrate->end); + mmu_notifier_invalidate_range_start(&range); } migrate_vma_insert_page(migrate, addr, newpage, &migrate->src[i], @@ -2813,7 +3026,7 @@ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; continue; } - } else if (!is_device_public_page(newpage)) { + } else { /* * Other types of ZONE_DEVICE page are not * supported. @@ -2834,11 +3047,11 @@ * did already call it. */ if (notified) - mmu_notifier_invalidate_range_only_end(mm, mmu_start, - migrate->end); + mmu_notifier_invalidate_range_only_end(&range); } +EXPORT_SYMBOL(migrate_vma_pages); -/* +/** * migrate_vma_finalize() - restore CPU page table entry * @migrate: migrate struct containing all migration information * @@ -2849,7 +3062,7 @@ * This also unlocks the pages and puts them back on the lru, or drops the extra * refcount, for device pages. */ -static void migrate_vma_finalize(struct migrate_vma *migrate) +void migrate_vma_finalize(struct migrate_vma *migrate) { const unsigned long npages = migrate->npages; unsigned long i; @@ -2876,7 +3089,6 @@ remove_migration_ptes(page, newpage, false); unlock_page(page); - migrate->cpages--; if (is_zone_device_page(page)) put_page(page); @@ -2892,124 +3104,5 @@ } } } - -/* - * migrate_vma() - migrate a range of memory inside vma - * - * @ops: migration callback for allocating destination memory and copying - * @vma: virtual memory area containing the range to be migrated - * @start: start address of the range to migrate (inclusive) - * @end: end address of the range to migrate (exclusive) - * @src: array of hmm_pfn_t containing source pfns - * @dst: array of hmm_pfn_t containing destination pfns - * @private: pointer passed back to each of the callback - * Returns: 0 on success, error code otherwise - * - * This function tries to migrate a range of memory virtual address range, using - * callbacks to allocate and copy memory from source to destination. First it - * collects all the pages backing each virtual address in the range, saving this - * inside the src array. Then it locks those pages and unmaps them. Once the pages - * are locked and unmapped, it checks whether each page is pinned or not. Pages - * that aren't pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) - * in the corresponding src array entry. It then restores any pages that are - * pinned, by remapping and unlocking those pages. - * - * At this point it calls the alloc_and_copy() callback. For documentation on - * what is expected from that callback, see struct migrate_vma_ops comments in - * include/linux/migrate.h - * - * After the alloc_and_copy() callback, this function goes over each entry in - * the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag - * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set, - * then the function tries to migrate struct page information from the source - * struct page to the destination struct page. If it fails to migrate the struct - * page information, then it clears the MIGRATE_PFN_MIGRATE flag in the src - * array. - * - * At this point all successfully migrated pages have an entry in the src - * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst - * array entry with MIGRATE_PFN_VALID flag set. - * - * It then calls the finalize_and_map() callback. See comments for "struct - * migrate_vma_ops", in include/linux/migrate.h for details about - * finalize_and_map() behavior. - * - * After the finalize_and_map() callback, for successfully migrated pages, this - * function updates the CPU page table to point to new pages, otherwise it - * restores the CPU page table to point to the original source pages. - * - * Function returns 0 after the above steps, even if no pages were migrated - * (The function only returns an error if any of the arguments are invalid.) - * - * Both src and dst array must be big enough for (end - start) >> PAGE_SHIFT - * unsigned long entries. - */ -int migrate_vma(const struct migrate_vma_ops *ops, - struct vm_area_struct *vma, - unsigned long start, - unsigned long end, - unsigned long *src, - unsigned long *dst, - void *private) -{ - struct migrate_vma migrate; - - /* Sanity check the arguments */ - start &= PAGE_MASK; - end &= PAGE_MASK; - if (!vma || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) || - vma_is_dax(vma)) - return -EINVAL; - if (start < vma->vm_start || start >= vma->vm_end) - return -EINVAL; - if (end <= vma->vm_start || end > vma->vm_end) - return -EINVAL; - if (!ops || !src || !dst || start >= end) - return -EINVAL; - - memset(src, 0, sizeof(*src) * ((end - start) >> PAGE_SHIFT)); - migrate.src = src; - migrate.dst = dst; - migrate.start = start; - migrate.npages = 0; - migrate.cpages = 0; - migrate.end = end; - migrate.vma = vma; - - /* Collect, and try to unmap source pages */ - migrate_vma_collect(&migrate); - if (!migrate.cpages) - return 0; - - /* Lock and isolate page */ - migrate_vma_prepare(&migrate); - if (!migrate.cpages) - return 0; - - /* Unmap pages */ - migrate_vma_unmap(&migrate); - if (!migrate.cpages) - return 0; - - /* - * At this point pages are locked and unmapped, and thus they have - * stable content and can safely be copied to destination memory that - * is allocated by the callback. - * - * Note that migration can fail in migrate_vma_struct_page() for each - * individual page. - */ - ops->alloc_and_copy(vma, src, dst, start, end, private); - - /* This does the real migration of struct page */ - migrate_vma_pages(&migrate); - - ops->finalize_and_map(vma, src, dst, start, end, private); - - /* Unlock and remap pages */ - migrate_vma_finalize(&migrate); - - return 0; -} -EXPORT_SYMBOL(migrate_vma); -#endif /* defined(MIGRATE_VMA_HELPER) */ +EXPORT_SYMBOL(migrate_vma_finalize); +#endif /* CONFIG_DEVICE_PRIVATE */ -- Gitblit v1.6.2