.. | .. |
---|
38 | 38 | #include <linux/hugetlb.h> |
---|
39 | 39 | #include <linux/hugetlb_cgroup.h> |
---|
40 | 40 | #include <linux/gfp.h> |
---|
| 41 | +#include <linux/pagewalk.h> |
---|
41 | 42 | #include <linux/pfn_t.h> |
---|
42 | 43 | #include <linux/memremap.h> |
---|
43 | 44 | #include <linux/userfaultfd_k.h> |
---|
.. | .. |
---|
47 | 48 | #include <linux/page_owner.h> |
---|
48 | 49 | #include <linux/sched/mm.h> |
---|
49 | 50 | #include <linux/ptrace.h> |
---|
| 51 | +#include <linux/oom.h> |
---|
50 | 52 | |
---|
51 | 53 | #include <asm/tlbflush.h> |
---|
52 | 54 | |
---|
53 | 55 | #define CREATE_TRACE_POINTS |
---|
54 | 56 | #include <trace/events/migrate.h> |
---|
| 57 | +#undef CREATE_TRACE_POINTS |
---|
| 58 | +#include <trace/hooks/mm.h> |
---|
| 59 | +#include <trace/hooks/vmscan.h> |
---|
55 | 60 | |
---|
56 | 61 | #include "internal.h" |
---|
57 | | - |
---|
58 | | -/* |
---|
59 | | - * migrate_prep() needs to be called before we start compiling a list of pages |
---|
60 | | - * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is |
---|
61 | | - * undesirable, use migrate_prep_local() |
---|
62 | | - */ |
---|
63 | | -int migrate_prep(void) |
---|
64 | | -{ |
---|
65 | | - /* |
---|
66 | | - * Clear the LRU lists so pages can be isolated. |
---|
67 | | - * Note that pages may be moved off the LRU after we have |
---|
68 | | - * drained them. Those pages will fail to migrate like other |
---|
69 | | - * pages that may be busy. |
---|
70 | | - */ |
---|
71 | | - lru_add_drain_all(); |
---|
72 | | - |
---|
73 | | - return 0; |
---|
74 | | -} |
---|
75 | | - |
---|
76 | | -/* Do the necessary work of migrate_prep but not if it involves other CPUs */ |
---|
77 | | -int migrate_prep_local(void) |
---|
78 | | -{ |
---|
79 | | - lru_add_drain(); |
---|
80 | | - |
---|
81 | | - return 0; |
---|
82 | | -} |
---|
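migrate_prep() and migrate_prep_local() disappear here; as the do_pages_move() hunk further down shows, callers now bracket the whole operation with lru_cache_disable()/lru_cache_enable() instead of draining the per-CPU LRU caches once up front. A minimal sketch of the new calling pattern, assuming the static do_move_pages_to_node() helper defined later in this file — the wrapper itself is illustrative, not part of the patch:

```c
/*
 * Minimal sketch of the replacement pattern; only the lru_cache_*() bracketing
 * and do_move_pages_to_node() come from the hunks below, the wrapper is made up.
 */
static int move_pagelist_to_node(struct mm_struct *mm,
				 struct list_head *pagelist, int node)
{
	int err;

	lru_cache_disable();	/* keep pages off the per-CPU LRU caches */
	err = do_move_pages_to_node(mm, pagelist, node);
	lru_cache_enable();	/* restore normal LRU batching */
	return err;
}
```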
83 | 62 | |
---|
84 | 63 | int isolate_movable_page(struct page *page, isolate_mode_t mode) |
---|
85 | 64 | { |
---|
.. | .. |
---|
100 | 79 | /* |
---|
101 | 80 | * Check PageMovable before holding a PG_lock because page's owner |
---|
102 | 81 | * assumes anybody doesn't touch PG_lock of newly allocated page |
---|
103 | | - * so unconditionally grapping the lock ruins page's owner side. |
---|
| 82 | + * so unconditionally grabbing the lock ruins page's owner side. |
---|
104 | 83 | */ |
---|
105 | 84 | if (unlikely(!__PageMovable(page))) |
---|
106 | 85 | goto out_putpage; |
---|
.. | .. |
---|
129 | 108 | |
---|
130 | 109 | /* Driver shouldn't use PG_isolated bit of page->flags */ |
---|
131 | 110 | WARN_ON_ONCE(PageIsolated(page)); |
---|
132 | | - __SetPageIsolated(page); |
---|
| 111 | + SetPageIsolated(page); |
---|
133 | 112 | unlock_page(page); |
---|
134 | 113 | |
---|
135 | 114 | return 0; |
---|
.. | .. |
---|
153 | 132 | |
---|
154 | 133 | mapping = page_mapping(page); |
---|
155 | 134 | mapping->a_ops->putback_page(page); |
---|
156 | | - __ClearPageIsolated(page); |
---|
| 135 | + ClearPageIsolated(page); |
---|
157 | 136 | } |
---|
158 | 137 | |
---|
159 | 138 | /* |
---|
.. | .. |
---|
162 | 141 | * |
---|
163 | 142 | * This function shall be used whenever the isolated pageset has been |
---|
164 | 143 | * built from lru, balloon, hugetlbfs page. See isolate_migratepages_range() |
---|
165 | | - * and isolate_huge_page(). |
---|
| 144 | + * and isolate_hugetlb(). |
---|
166 | 145 | */ |
---|
167 | 146 | void putback_movable_pages(struct list_head *l) |
---|
168 | 147 | { |
---|
.. | .. |
---|
186 | 165 | if (PageMovable(page)) |
---|
187 | 166 | putback_movable_page(page); |
---|
188 | 167 | else |
---|
189 | | - __ClearPageIsolated(page); |
---|
| 168 | + ClearPageIsolated(page); |
---|
190 | 169 | unlock_page(page); |
---|
191 | 170 | put_page(page); |
---|
192 | 171 | } else { |
---|
193 | 172 | mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + |
---|
194 | | - page_is_file_cache(page), -hpage_nr_pages(page)); |
---|
| 173 | + page_is_file_lru(page), -thp_nr_pages(page)); |
---|
195 | 174 | putback_lru_page(page); |
---|
196 | 175 | } |
---|
197 | 176 | } |
---|
198 | 177 | } |
---|
| 178 | +EXPORT_SYMBOL_GPL(putback_movable_pages); |
---|
199 | 179 | |
---|
200 | 180 | /* |
---|
201 | 181 | * Restore a potential migration pte to a working pte entry |
---|
.. | .. |
---|
240 | 220 | */ |
---|
241 | 221 | entry = pte_to_swp_entry(*pvmw.pte); |
---|
242 | 222 | if (is_write_migration_entry(entry)) |
---|
243 | | - pte = maybe_mkwrite(pte, vma); |
---|
| 223 | + pte = maybe_mkwrite(pte, vma->vm_flags); |
---|
| 224 | + else if (pte_swp_uffd_wp(*pvmw.pte)) |
---|
| 225 | + pte = pte_mkuffd_wp(pte); |
---|
244 | 226 | |
---|
245 | | - if (unlikely(is_zone_device_page(new))) { |
---|
246 | | - if (is_device_private_page(new)) { |
---|
247 | | - entry = make_device_private_entry(new, pte_write(pte)); |
---|
248 | | - pte = swp_entry_to_pte(entry); |
---|
249 | | - } else if (is_device_public_page(new)) { |
---|
250 | | - pte = pte_mkdevmap(pte); |
---|
251 | | - } |
---|
| 227 | + if (unlikely(is_device_private_page(new))) { |
---|
| 228 | + entry = make_device_private_entry(new, pte_write(pte)); |
---|
| 229 | + pte = swp_entry_to_pte(entry); |
---|
| 230 | + if (pte_swp_soft_dirty(*pvmw.pte)) |
---|
| 231 | + pte = pte_swp_mksoft_dirty(pte); |
---|
| 232 | + if (pte_swp_uffd_wp(*pvmw.pte)) |
---|
| 233 | + pte = pte_swp_mkuffd_wp(pte); |
---|
252 | 234 | } |
---|
253 | 235 | |
---|
254 | 236 | #ifdef CONFIG_HUGETLB_PAGE |
---|
.. | .. |
---|
322 | 304 | goto out; |
---|
323 | 305 | |
---|
324 | 306 | page = migration_entry_to_page(entry); |
---|
| 307 | + page = compound_head(page); |
---|
325 | 308 | |
---|
326 | 309 | /* |
---|
327 | | - * Once radix-tree replacement of page migration started, page_count |
---|
328 | | - * *must* be zero. And, we don't want to call wait_on_page_locked() |
---|
329 | | - * against a page without get_page(). |
---|
330 | | - * So, we use get_page_unless_zero(), here. Even failed, page fault |
---|
331 | | - * will occur again. |
---|
| 310 | + * Once page cache replacement of page migration started, page_count |
---|
| 311 | + * is zero; but we must not call put_and_wait_on_page_locked() without |
---|
| 312 | + * a ref. Use get_page_unless_zero(), and just fault again if it fails. |
---|
332 | 313 | */ |
---|
333 | 314 | if (!get_page_unless_zero(page)) |
---|
334 | 315 | goto out; |
---|
335 | 316 | pte_unmap_unlock(ptep, ptl); |
---|
336 | | - wait_on_page_locked(page); |
---|
337 | | - put_page(page); |
---|
| 317 | + trace_android_vh_waiting_for_page_migration(page); |
---|
| 318 | + put_and_wait_on_page_locked(page); |
---|
338 | 319 | return; |
---|
339 | 320 | out: |
---|
340 | 321 | pte_unmap_unlock(ptep, ptl); |
---|
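Both migration-entry wait paths (the pte variant above and the pmd variant below) replace the open-coded get/wait/put sequence with put_and_wait_on_page_locked(). One way to read the change: the helper drops the caller's temporary reference as part of the wait rather than holding it for the whole sleep, so a waiter no longer keeps the refcount elevated against the very freeze that migrate_page_move_mapping() is trying to perform. Consolidated, with the context lines taken from the hunk above:

```c
/* Old pattern: the waiter pinned the page for the whole sleep. */
if (!get_page_unless_zero(page))
	goto out;
pte_unmap_unlock(ptep, ptl);
wait_on_page_locked(page);
put_page(page);

/*
 * New pattern: the reference is handed to the helper, which releases it
 * instead of holding it across the wait for the page lock.
 */
if (!get_page_unless_zero(page))
	goto out;
pte_unmap_unlock(ptep, ptl);
put_and_wait_on_page_locked(page);
```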
.. | .. |
---|
368 | 349 | if (!get_page_unless_zero(page)) |
---|
369 | 350 | goto unlock; |
---|
370 | 351 | spin_unlock(ptl); |
---|
371 | | - wait_on_page_locked(page); |
---|
372 | | - put_page(page); |
---|
| 352 | + put_and_wait_on_page_locked(page); |
---|
373 | 353 | return; |
---|
374 | 354 | unlock: |
---|
375 | 355 | spin_unlock(ptl); |
---|
376 | 356 | } |
---|
377 | 357 | #endif |
---|
378 | 358 | |
---|
379 | | -#ifdef CONFIG_BLOCK |
---|
380 | | -/* Returns true if all buffers are successfully locked */ |
---|
381 | | -static bool buffer_migrate_lock_buffers(struct buffer_head *head, |
---|
382 | | - enum migrate_mode mode) |
---|
| 359 | +static int expected_page_refs(struct address_space *mapping, struct page *page) |
---|
383 | 360 | { |
---|
384 | | - struct buffer_head *bh = head; |
---|
| 361 | + int expected_count = 1; |
---|
385 | 362 | |
---|
386 | | - /* Simple case, sync compaction */ |
---|
387 | | - if (mode != MIGRATE_ASYNC) { |
---|
388 | | - do { |
---|
389 | | - get_bh(bh); |
---|
390 | | - lock_buffer(bh); |
---|
391 | | - bh = bh->b_this_page; |
---|
| 363 | + /* |
---|
| 364 | + * Device private pages have an extra refcount as they are |
---|
| 365 | + * ZONE_DEVICE pages. |
---|
| 366 | + */ |
---|
| 367 | + expected_count += is_device_private_page(page); |
---|
| 368 | + if (mapping) |
---|
| 369 | + expected_count += thp_nr_pages(page) + page_has_private(page); |
---|
392 | 370 | |
---|
393 | | - } while (bh != head); |
---|
394 | | - |
---|
395 | | - return true; |
---|
396 | | - } |
---|
397 | | - |
---|
398 | | - /* async case, we cannot block on lock_buffer so use trylock_buffer */ |
---|
399 | | - do { |
---|
400 | | - get_bh(bh); |
---|
401 | | - if (!trylock_buffer(bh)) { |
---|
402 | | - /* |
---|
403 | | - * We failed to lock the buffer and cannot stall in |
---|
404 | | - * async migration. Release the taken locks |
---|
405 | | - */ |
---|
406 | | - struct buffer_head *failed_bh = bh; |
---|
407 | | - put_bh(failed_bh); |
---|
408 | | - bh = head; |
---|
409 | | - while (bh != failed_bh) { |
---|
410 | | - unlock_buffer(bh); |
---|
411 | | - put_bh(bh); |
---|
412 | | - bh = bh->b_this_page; |
---|
413 | | - } |
---|
414 | | - return false; |
---|
415 | | - } |
---|
416 | | - |
---|
417 | | - bh = bh->b_this_page; |
---|
418 | | - } while (bh != head); |
---|
419 | | - return true; |
---|
| 371 | + return expected_count; |
---|
420 | 372 | } |
---|
421 | | -#else |
---|
422 | | -static inline bool buffer_migrate_lock_buffers(struct buffer_head *head, |
---|
423 | | - enum migrate_mode mode) |
---|
424 | | -{ |
---|
425 | | - return true; |
---|
426 | | -} |
---|
427 | | -#endif /* CONFIG_BLOCK */ |
---|
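The old buffer-locking helpers give way here (they reappear lower down, inside CONFIG_BLOCK), and the new expected_page_refs() centralizes the reference count that migrate_page_move_mapping() and __buffer_migrate_page() must observe before they may freeze the page. A worked set of values, not spelled out in the patch itself — the THP figure assumes x86-64, where thp_nr_pages() of a PMD-sized page is 512:

```c
/*
 * expected_page_refs() = 1 (caller)
 *                      + 1 if the page is ZONE_DEVICE private
 *                      + thp_nr_pages() + page_has_private() if it has a mapping
 *
 * Examples:
 *   anonymous page passed with mapping == NULL    -> 1
 *   order-0 page-cache page                       -> 1 + 1      = 2
 *   order-0 page-cache page with buffer heads     -> 1 + 1 + 1  = 3
 *   PMD-sized page-cache THP (x86-64)             -> 1 + 512    = 513
 *
 * consistent with the "3 for pages with a mapping and PagePrivate" note in
 * the migrate_page_move_mapping() comment just below.
 */
```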
428 | 373 | |
---|
429 | 374 | /* |
---|
430 | 375 | * Replace the page in the mapping. |
---|
.. | .. |
---|
435 | 380 | * 3 for pages with a mapping and PagePrivate/PagePrivate2 set. |
---|
436 | 381 | */ |
---|
437 | 382 | int migrate_page_move_mapping(struct address_space *mapping, |
---|
438 | | - struct page *newpage, struct page *page, |
---|
439 | | - struct buffer_head *head, enum migrate_mode mode, |
---|
440 | | - int extra_count) |
---|
| 383 | + struct page *newpage, struct page *page, int extra_count) |
---|
441 | 384 | { |
---|
| 385 | + XA_STATE(xas, &mapping->i_pages, page_index(page)); |
---|
442 | 386 | struct zone *oldzone, *newzone; |
---|
443 | 387 | int dirty; |
---|
444 | | - int expected_count = 1 + extra_count; |
---|
445 | | - void **pslot; |
---|
446 | | - |
---|
447 | | - /* |
---|
448 | | - * Device public or private pages have an extra refcount as they are |
---|
449 | | - * ZONE_DEVICE pages. |
---|
450 | | - */ |
---|
451 | | - expected_count += is_device_private_page(page); |
---|
452 | | - expected_count += is_device_public_page(page); |
---|
| 388 | + int expected_count = expected_page_refs(mapping, page) + extra_count; |
---|
| 389 | + int nr = thp_nr_pages(page); |
---|
453 | 390 | |
---|
454 | 391 | if (!mapping) { |
---|
455 | 392 | /* Anonymous page without mapping */ |
---|
.. | .. |
---|
468 | 405 | oldzone = page_zone(page); |
---|
469 | 406 | newzone = page_zone(newpage); |
---|
470 | 407 | |
---|
471 | | - xa_lock_irq(&mapping->i_pages); |
---|
472 | | - |
---|
473 | | - pslot = radix_tree_lookup_slot(&mapping->i_pages, |
---|
474 | | - page_index(page)); |
---|
475 | | - |
---|
476 | | - expected_count += hpage_nr_pages(page) + page_has_private(page); |
---|
477 | | - if (page_count(page) != expected_count || |
---|
478 | | - radix_tree_deref_slot_protected(pslot, |
---|
479 | | - &mapping->i_pages.xa_lock) != page) { |
---|
480 | | - xa_unlock_irq(&mapping->i_pages); |
---|
| 408 | + xas_lock_irq(&xas); |
---|
| 409 | + if (page_count(page) != expected_count || xas_load(&xas) != page) { |
---|
| 410 | + xas_unlock_irq(&xas); |
---|
481 | 411 | return -EAGAIN; |
---|
482 | 412 | } |
---|
483 | 413 | |
---|
484 | 414 | if (!page_ref_freeze(page, expected_count)) { |
---|
485 | | - xa_unlock_irq(&mapping->i_pages); |
---|
486 | | - return -EAGAIN; |
---|
487 | | - } |
---|
488 | | - |
---|
489 | | - /* |
---|
490 | | - * In the async migration case of moving a page with buffers, lock the |
---|
491 | | - * buffers using trylock before the mapping is moved. If the mapping |
---|
492 | | - * was moved, we later failed to lock the buffers and could not move |
---|
493 | | - * the mapping back due to an elevated page count, we would have to |
---|
494 | | - * block waiting on other references to be dropped. |
---|
495 | | - */ |
---|
496 | | - if (mode == MIGRATE_ASYNC && head && |
---|
497 | | - !buffer_migrate_lock_buffers(head, mode)) { |
---|
498 | | - page_ref_unfreeze(page, expected_count); |
---|
499 | | - xa_unlock_irq(&mapping->i_pages); |
---|
| 415 | + xas_unlock_irq(&xas); |
---|
500 | 416 | return -EAGAIN; |
---|
501 | 417 | } |
---|
502 | 418 | |
---|
.. | .. |
---|
506 | 422 | */ |
---|
507 | 423 | newpage->index = page->index; |
---|
508 | 424 | newpage->mapping = page->mapping; |
---|
509 | | - page_ref_add(newpage, hpage_nr_pages(page)); /* add cache reference */ |
---|
| 425 | + page_ref_add(newpage, nr); /* add cache reference */ |
---|
510 | 426 | if (PageSwapBacked(page)) { |
---|
511 | 427 | __SetPageSwapBacked(newpage); |
---|
512 | 428 | if (PageSwapCache(page)) { |
---|
.. | .. |
---|
524 | 440 | SetPageDirty(newpage); |
---|
525 | 441 | } |
---|
526 | 442 | |
---|
527 | | - radix_tree_replace_slot(&mapping->i_pages, pslot, newpage); |
---|
| 443 | + xas_store(&xas, newpage); |
---|
528 | 444 | if (PageTransHuge(page)) { |
---|
529 | 445 | int i; |
---|
530 | | - int index = page_index(page); |
---|
531 | 446 | |
---|
532 | | - for (i = 1; i < HPAGE_PMD_NR; i++) { |
---|
533 | | - pslot = radix_tree_lookup_slot(&mapping->i_pages, |
---|
534 | | - index + i); |
---|
535 | | - radix_tree_replace_slot(&mapping->i_pages, pslot, |
---|
536 | | - newpage + i); |
---|
| 447 | + for (i = 1; i < nr; i++) { |
---|
| 448 | + xas_next(&xas); |
---|
| 449 | + xas_store(&xas, newpage); |
---|
537 | 450 | } |
---|
538 | 451 | } |
---|
539 | 452 | |
---|
.. | .. |
---|
542 | 455 | * to one less reference. |
---|
543 | 456 | * We know this isn't the last reference. |
---|
544 | 457 | */ |
---|
545 | | - page_ref_unfreeze(page, expected_count - hpage_nr_pages(page)); |
---|
| 458 | + page_ref_unfreeze(page, expected_count - nr); |
---|
546 | 459 | |
---|
547 | | - xa_unlock(&mapping->i_pages); |
---|
| 460 | + xas_unlock(&xas); |
---|
548 | 461 | /* Leave irq disabled to prevent preemption while updating stats */ |
---|
549 | 462 | |
---|
550 | 463 | /* |
---|
.. | .. |
---|
558 | 471 | * are mapped to swap space. |
---|
559 | 472 | */ |
---|
560 | 473 | if (newzone != oldzone) { |
---|
561 | | - __dec_node_state(oldzone->zone_pgdat, NR_FILE_PAGES); |
---|
562 | | - __inc_node_state(newzone->zone_pgdat, NR_FILE_PAGES); |
---|
| 474 | + struct lruvec *old_lruvec, *new_lruvec; |
---|
| 475 | + struct mem_cgroup *memcg; |
---|
| 476 | + |
---|
| 477 | + memcg = page_memcg(page); |
---|
| 478 | + old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat); |
---|
| 479 | + new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat); |
---|
| 480 | + |
---|
| 481 | + __mod_lruvec_state(old_lruvec, NR_FILE_PAGES, -nr); |
---|
| 482 | + __mod_lruvec_state(new_lruvec, NR_FILE_PAGES, nr); |
---|
563 | 483 | if (PageSwapBacked(page) && !PageSwapCache(page)) { |
---|
564 | | - __dec_node_state(oldzone->zone_pgdat, NR_SHMEM); |
---|
565 | | - __inc_node_state(newzone->zone_pgdat, NR_SHMEM); |
---|
| 484 | + __mod_lruvec_state(old_lruvec, NR_SHMEM, -nr); |
---|
| 485 | + __mod_lruvec_state(new_lruvec, NR_SHMEM, nr); |
---|
566 | 486 | } |
---|
567 | | - if (dirty && mapping_cap_account_dirty(mapping)) { |
---|
568 | | - __dec_node_state(oldzone->zone_pgdat, NR_FILE_DIRTY); |
---|
569 | | - __dec_zone_state(oldzone, NR_ZONE_WRITE_PENDING); |
---|
570 | | - __inc_node_state(newzone->zone_pgdat, NR_FILE_DIRTY); |
---|
571 | | - __inc_zone_state(newzone, NR_ZONE_WRITE_PENDING); |
---|
| 487 | + if (dirty && mapping_can_writeback(mapping)) { |
---|
| 488 | + __mod_lruvec_state(old_lruvec, NR_FILE_DIRTY, -nr); |
---|
| 489 | + __mod_zone_page_state(oldzone, NR_ZONE_WRITE_PENDING, -nr); |
---|
| 490 | + __mod_lruvec_state(new_lruvec, NR_FILE_DIRTY, nr); |
---|
| 491 | + __mod_zone_page_state(newzone, NR_ZONE_WRITE_PENDING, nr); |
---|
572 | 492 | } |
---|
573 | 493 | } |
---|
574 | 494 | local_irq_enable(); |
---|
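Two independent conversions land in migrate_page_move_mapping(): the radix-tree slot API is replaced by an on-stack XA_STATE with xas_load()/xas_store()/xas_next() under xas_lock_irq(), and the zone-crossing statistics move from bare node counters to memcg-aware __mod_lruvec_state() calls scaled by nr rather than assuming a single page. A stripped-down sketch of just the XArray replacement step, assuming the page's refcount has already been frozen as in the function above (the wrapper name is illustrative):

```c
/*
 * Sketch: replace the nr consecutive page-cache entries for @page with
 * @newpage under the XArray lock, the way migrate_page_move_mapping() now does.
 */
static void replace_cache_entries(struct address_space *mapping,
				  struct page *page, struct page *newpage,
				  int nr)
{
	XA_STATE(xas, &mapping->i_pages, page_index(page));
	int i;

	xas_lock_irq(&xas);
	xas_store(&xas, newpage);		/* head entry */
	for (i = 1; i < nr; i++) {
		xas_next(&xas);			/* advance to the next index */
		xas_store(&xas, newpage);	/* tail slots also point at the new head */
	}
	xas_unlock_irq(&xas);
}
```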
.. | .. |
---|
584 | 504 | int migrate_huge_page_move_mapping(struct address_space *mapping, |
---|
585 | 505 | struct page *newpage, struct page *page) |
---|
586 | 506 | { |
---|
| 507 | + XA_STATE(xas, &mapping->i_pages, page_index(page)); |
---|
587 | 508 | int expected_count; |
---|
588 | | - void **pslot; |
---|
589 | 509 | |
---|
590 | | - xa_lock_irq(&mapping->i_pages); |
---|
591 | | - |
---|
592 | | - pslot = radix_tree_lookup_slot(&mapping->i_pages, page_index(page)); |
---|
593 | | - |
---|
| 510 | + xas_lock_irq(&xas); |
---|
594 | 511 | expected_count = 2 + page_has_private(page); |
---|
595 | | - if (page_count(page) != expected_count || |
---|
596 | | - radix_tree_deref_slot_protected(pslot, &mapping->i_pages.xa_lock) != page) { |
---|
597 | | - xa_unlock_irq(&mapping->i_pages); |
---|
| 512 | + if (page_count(page) != expected_count || xas_load(&xas) != page) { |
---|
| 513 | + xas_unlock_irq(&xas); |
---|
598 | 514 | return -EAGAIN; |
---|
599 | 515 | } |
---|
600 | 516 | |
---|
601 | 517 | if (!page_ref_freeze(page, expected_count)) { |
---|
602 | | - xa_unlock_irq(&mapping->i_pages); |
---|
| 518 | + xas_unlock_irq(&xas); |
---|
603 | 519 | return -EAGAIN; |
---|
604 | 520 | } |
---|
605 | 521 | |
---|
.. | .. |
---|
608 | 524 | |
---|
609 | 525 | get_page(newpage); |
---|
610 | 526 | |
---|
611 | | - radix_tree_replace_slot(&mapping->i_pages, pslot, newpage); |
---|
| 527 | + xas_store(&xas, newpage); |
---|
612 | 528 | |
---|
613 | 529 | page_ref_unfreeze(page, expected_count - 1); |
---|
614 | 530 | |
---|
615 | | - xa_unlock_irq(&mapping->i_pages); |
---|
| 531 | + xas_unlock_irq(&xas); |
---|
616 | 532 | |
---|
617 | 533 | return MIGRATEPAGE_SUCCESS; |
---|
618 | 534 | } |
---|
.. | .. |
---|
656 | 572 | } else { |
---|
657 | 573 | /* thp page */ |
---|
658 | 574 | BUG_ON(!PageTransHuge(src)); |
---|
659 | | - nr_pages = hpage_nr_pages(src); |
---|
| 575 | + nr_pages = thp_nr_pages(src); |
---|
660 | 576 | } |
---|
661 | 577 | |
---|
662 | 578 | for (i = 0; i < nr_pages; i++) { |
---|
.. | .. |
---|
671 | 587 | void migrate_page_states(struct page *newpage, struct page *page) |
---|
672 | 588 | { |
---|
673 | 589 | int cpupid; |
---|
| 590 | + |
---|
| 591 | + trace_android_vh_migrate_page_states(page, newpage); |
---|
674 | 592 | |
---|
675 | 593 | if (PageError(page)) |
---|
676 | 594 | SetPageError(newpage); |
---|
.. | .. |
---|
689 | 607 | SetPageChecked(newpage); |
---|
690 | 608 | if (PageMappedToDisk(page)) |
---|
691 | 609 | SetPageMappedToDisk(newpage); |
---|
| 610 | + trace_android_vh_look_around_migrate_page(page, newpage); |
---|
692 | 611 | |
---|
693 | 612 | /* Move dirty on pages not done by migrate_page_move_mapping() */ |
---|
694 | 613 | if (PageDirty(page)) |
---|
.. | .. |
---|
723 | 642 | if (PageWriteback(newpage)) |
---|
724 | 643 | end_page_writeback(newpage); |
---|
725 | 644 | |
---|
| 645 | + /* |
---|
| 646 | + * PG_readahead shares the same bit with PG_reclaim. The above |
---|
| 647 | + * end_page_writeback() may clear PG_readahead mistakenly, so set the |
---|
| 648 | + * bit after that. |
---|
| 649 | + */ |
---|
| 650 | + if (PageReadahead(page)) |
---|
| 651 | + SetPageReadahead(newpage); |
---|
| 652 | + |
---|
726 | 653 | copy_page_owner(page, newpage); |
---|
727 | 654 | |
---|
728 | | - mem_cgroup_migrate(page, newpage); |
---|
| 655 | + if (!PageHuge(page)) |
---|
| 656 | + mem_cgroup_migrate(page, newpage); |
---|
729 | 657 | } |
---|
730 | 658 | EXPORT_SYMBOL(migrate_page_states); |
---|
731 | 659 | |
---|
.. | .. |
---|
758 | 686 | |
---|
759 | 687 | BUG_ON(PageWriteback(page)); /* Writeback must be complete */ |
---|
760 | 688 | |
---|
761 | | - rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0); |
---|
| 689 | + rc = migrate_page_move_mapping(mapping, newpage, page, 0); |
---|
762 | 690 | |
---|
763 | 691 | if (rc != MIGRATEPAGE_SUCCESS) |
---|
764 | 692 | return rc; |
---|
.. | .. |
---|
772 | 700 | EXPORT_SYMBOL(migrate_page); |
---|
773 | 701 | |
---|
774 | 702 | #ifdef CONFIG_BLOCK |
---|
775 | | -/* |
---|
776 | | - * Migration function for pages with buffers. This function can only be used |
---|
777 | | - * if the underlying filesystem guarantees that no other references to "page" |
---|
778 | | - * exist. |
---|
779 | | - */ |
---|
780 | | -int buffer_migrate_page(struct address_space *mapping, |
---|
781 | | - struct page *newpage, struct page *page, enum migrate_mode mode) |
---|
| 703 | +/* Returns true if all buffers are successfully locked */ |
---|
| 704 | +static bool buffer_migrate_lock_buffers(struct buffer_head *head, |
---|
| 705 | + enum migrate_mode mode) |
---|
| 706 | +{ |
---|
| 707 | + struct buffer_head *bh = head; |
---|
| 708 | + |
---|
| 709 | + /* Simple case, sync compaction */ |
---|
| 710 | + if (mode != MIGRATE_ASYNC) { |
---|
| 711 | + do { |
---|
| 712 | + lock_buffer(bh); |
---|
| 713 | + bh = bh->b_this_page; |
---|
| 714 | + |
---|
| 715 | + } while (bh != head); |
---|
| 716 | + |
---|
| 717 | + return true; |
---|
| 718 | + } |
---|
| 719 | + |
---|
| 720 | + /* async case, we cannot block on lock_buffer so use trylock_buffer */ |
---|
| 721 | + do { |
---|
| 722 | + if (!trylock_buffer(bh)) { |
---|
| 723 | + /* |
---|
| 724 | + * We failed to lock the buffer and cannot stall in |
---|
| 725 | + * async migration. Release the taken locks |
---|
| 726 | + */ |
---|
| 727 | + struct buffer_head *failed_bh = bh; |
---|
| 728 | + bh = head; |
---|
| 729 | + while (bh != failed_bh) { |
---|
| 730 | + unlock_buffer(bh); |
---|
| 731 | + bh = bh->b_this_page; |
---|
| 732 | + } |
---|
| 733 | + return false; |
---|
| 734 | + } |
---|
| 735 | + |
---|
| 736 | + bh = bh->b_this_page; |
---|
| 737 | + } while (bh != head); |
---|
| 738 | + return true; |
---|
| 739 | +} |
---|
| 740 | + |
---|
| 741 | +static int __buffer_migrate_page(struct address_space *mapping, |
---|
| 742 | + struct page *newpage, struct page *page, enum migrate_mode mode, |
---|
| 743 | + bool check_refs) |
---|
782 | 744 | { |
---|
783 | 745 | struct buffer_head *bh, *head; |
---|
784 | 746 | int rc; |
---|
| 747 | + int expected_count; |
---|
785 | 748 | |
---|
786 | 749 | if (!page_has_buffers(page)) |
---|
787 | 750 | return migrate_page(mapping, newpage, page, mode); |
---|
788 | 751 | |
---|
| 752 | + /* Check whether page does not have extra refs before we do more work */ |
---|
| 753 | + expected_count = expected_page_refs(mapping, page); |
---|
| 754 | + if (page_count(page) != expected_count) |
---|
| 755 | + return -EAGAIN; |
---|
| 756 | + |
---|
789 | 757 | head = page_buffers(page); |
---|
| 758 | + if (!buffer_migrate_lock_buffers(head, mode)) |
---|
| 759 | + return -EAGAIN; |
---|
790 | 760 | |
---|
791 | | - rc = migrate_page_move_mapping(mapping, newpage, page, head, mode, 0); |
---|
| 761 | + if (check_refs) { |
---|
| 762 | + bool busy; |
---|
| 763 | + bool invalidated = false; |
---|
792 | 764 | |
---|
| 765 | +recheck_buffers: |
---|
| 766 | + busy = false; |
---|
| 767 | + spin_lock(&mapping->private_lock); |
---|
| 768 | + bh = head; |
---|
| 769 | + do { |
---|
| 770 | + if (atomic_read(&bh->b_count)) { |
---|
| 771 | + busy = true; |
---|
| 772 | + break; |
---|
| 773 | + } |
---|
| 774 | + bh = bh->b_this_page; |
---|
| 775 | + } while (bh != head); |
---|
| 776 | + if (busy) { |
---|
| 777 | + if (invalidated) { |
---|
| 778 | + rc = -EAGAIN; |
---|
| 779 | + goto unlock_buffers; |
---|
| 780 | + } |
---|
| 781 | + spin_unlock(&mapping->private_lock); |
---|
| 782 | + invalidate_bh_lrus(); |
---|
| 783 | + invalidated = true; |
---|
| 784 | + goto recheck_buffers; |
---|
| 785 | + } |
---|
| 786 | + } |
---|
| 787 | + |
---|
| 788 | + rc = migrate_page_move_mapping(mapping, newpage, page, 0); |
---|
793 | 789 | if (rc != MIGRATEPAGE_SUCCESS) |
---|
794 | | - return rc; |
---|
| 790 | + goto unlock_buffers; |
---|
795 | 791 | |
---|
796 | | - /* |
---|
797 | | - * In the async case, migrate_page_move_mapping locked the buffers |
---|
798 | | - * with an IRQ-safe spinlock held. In the sync case, the buffers |
---|
799 | | - * need to be locked now |
---|
800 | | - */ |
---|
801 | | - if (mode != MIGRATE_ASYNC) |
---|
802 | | - BUG_ON(!buffer_migrate_lock_buffers(head, mode)); |
---|
803 | | - |
---|
804 | | - ClearPagePrivate(page); |
---|
805 | | - set_page_private(newpage, page_private(page)); |
---|
806 | | - set_page_private(page, 0); |
---|
807 | | - put_page(page); |
---|
808 | | - get_page(newpage); |
---|
| 792 | + attach_page_private(newpage, detach_page_private(page)); |
---|
809 | 793 | |
---|
810 | 794 | bh = head; |
---|
811 | 795 | do { |
---|
.. | .. |
---|
814 | 798 | |
---|
815 | 799 | } while (bh != head); |
---|
816 | 800 | |
---|
817 | | - SetPagePrivate(newpage); |
---|
818 | | - |
---|
819 | 801 | if (mode != MIGRATE_SYNC_NO_COPY) |
---|
820 | 802 | migrate_page_copy(newpage, page); |
---|
821 | 803 | else |
---|
822 | 804 | migrate_page_states(newpage, page); |
---|
823 | 805 | |
---|
| 806 | + rc = MIGRATEPAGE_SUCCESS; |
---|
| 807 | +unlock_buffers: |
---|
| 808 | + if (check_refs) |
---|
| 809 | + spin_unlock(&mapping->private_lock); |
---|
824 | 810 | bh = head; |
---|
825 | 811 | do { |
---|
826 | 812 | unlock_buffer(bh); |
---|
827 | | - put_bh(bh); |
---|
828 | 813 | bh = bh->b_this_page; |
---|
829 | 814 | |
---|
830 | 815 | } while (bh != head); |
---|
831 | 816 | |
---|
832 | | - return MIGRATEPAGE_SUCCESS; |
---|
| 817 | + return rc; |
---|
| 818 | +} |
---|
| 819 | + |
---|
| 820 | +/* |
---|
| 821 | + * Migration function for pages with buffers. This function can only be used |
---|
| 822 | + * if the underlying filesystem guarantees that no other references to "page" |
---|
| 823 | + * exist. For example attached buffer heads are accessed only under page lock. |
---|
| 824 | + */ |
---|
| 825 | +int buffer_migrate_page(struct address_space *mapping, |
---|
| 826 | + struct page *newpage, struct page *page, enum migrate_mode mode) |
---|
| 827 | +{ |
---|
| 828 | + return __buffer_migrate_page(mapping, newpage, page, mode, false); |
---|
833 | 829 | } |
---|
834 | 830 | EXPORT_SYMBOL(buffer_migrate_page); |
---|
| 831 | + |
---|
| 832 | +/* |
---|
| 833 | + * Same as above except that this variant is more careful and checks that there |
---|
| 834 | + * are also no buffer head references. This function is the right one for |
---|
| 835 | + * mappings where buffer heads are directly looked up and referenced (such as |
---|
| 836 | + * block device mappings). |
---|
| 837 | + */ |
---|
| 838 | +int buffer_migrate_page_norefs(struct address_space *mapping, |
---|
| 839 | + struct page *newpage, struct page *page, enum migrate_mode mode) |
---|
| 840 | +{ |
---|
| 841 | + return __buffer_migrate_page(mapping, newpage, page, mode, true); |
---|
| 842 | +} |
---|
835 | 843 | #endif |
---|
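The buffer-head path is reorganized rather than just moved: the relocated buffer_migrate_lock_buffers() no longer takes per-bh references (the page lock held here already keeps the buffers from being torn down), the old buffer_migrate_page() body becomes __buffer_migrate_page() with an optional recheck that no buffer is busy (invalidating the bh LRUs once before giving up), and buffer_migrate_page_norefs() is added for mappings, such as block devices, whose buffer heads are looked up and referenced directly. The hand-over of page->private also collapses into attach_page_private(newpage, detach_page_private(page)); read as an expansion — the helpers live in include/linux/pagemap.h, and this spelling of them is a paraphrase, not part of the patch — that single line does roughly:

```c
/*
 * Rough expansion of attach_page_private(newpage, detach_page_private(page)),
 * mirroring the open-coded sequence the old buffer_migrate_page() used.
 * Illustration only; the real helpers also check PagePrivate and return the
 * detached data.
 */
static void move_buffers_private(struct page *page, struct page *newpage)
{
	void *data = (void *)page_private(page);

	ClearPagePrivate(page);
	set_page_private(page, 0);
	put_page(page);			/* PagePrivate no longer holds a reference */

	get_page(newpage);		/* the new page takes that reference over */
	set_page_private(newpage, (unsigned long)data);
	SetPagePrivate(newpage);
}
```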
836 | 844 | |
---|
837 | 845 | /* |
---|
.. | .. |
---|
899 | 907 | */ |
---|
900 | 908 | if (page_has_private(page) && |
---|
901 | 909 | !try_to_release_page(page, GFP_KERNEL)) |
---|
902 | | - return -EAGAIN; |
---|
| 910 | + return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY; |
---|
903 | 911 | |
---|
904 | 912 | return migrate_page(mapping, newpage, page, mode); |
---|
905 | 913 | } |
---|
.. | .. |
---|
951 | 959 | VM_BUG_ON_PAGE(!PageIsolated(page), page); |
---|
952 | 960 | if (!PageMovable(page)) { |
---|
953 | 961 | rc = MIGRATEPAGE_SUCCESS; |
---|
954 | | - __ClearPageIsolated(page); |
---|
| 962 | + ClearPageIsolated(page); |
---|
955 | 963 | goto out; |
---|
956 | 964 | } |
---|
957 | 965 | |
---|
.. | .. |
---|
973 | 981 | * We clear PG_movable under page_lock so any compactor |
---|
974 | 982 | * cannot try to migrate this page. |
---|
975 | 983 | */ |
---|
976 | | - __ClearPageIsolated(page); |
---|
| 984 | + ClearPageIsolated(page); |
---|
977 | 985 | } |
---|
978 | 986 | |
---|
979 | 987 | /* |
---|
980 | | - * Anonymous and movable page->mapping will be cleard by |
---|
| 988 | + * Anonymous and movable page->mapping will be cleared by |
---|
981 | 989 | * free_pages_prepare so don't reset it here for keeping |
---|
982 | 990 | * the type to work PageAnon, for example. |
---|
983 | 991 | */ |
---|
984 | 992 | if (!PageMappingFlags(page)) |
---|
985 | 993 | page->mapping = NULL; |
---|
986 | 994 | |
---|
987 | | - if (unlikely(is_zone_device_page(newpage))) { |
---|
988 | | - if (is_device_public_page(newpage)) |
---|
989 | | - flush_dcache_page(newpage); |
---|
990 | | - } else |
---|
991 | | - flush_dcache_page(newpage); |
---|
| 995 | + if (likely(!is_zone_device_page(newpage))) { |
---|
| 996 | + int i, nr = compound_nr(newpage); |
---|
992 | 997 | |
---|
| 998 | + for (i = 0; i < nr; i++) |
---|
| 999 | + flush_dcache_page(newpage + i); |
---|
| 1000 | + } |
---|
993 | 1001 | } |
---|
994 | 1002 | out: |
---|
995 | 1003 | return rc; |
---|
.. | .. |
---|
1013 | 1021 | * to the LRU. Later, when the IO completes the pages are |
---|
1014 | 1022 | * marked uptodate and unlocked. However, the queueing |
---|
1015 | 1023 | * could be merging multiple pages for one bio (e.g. |
---|
1016 | | - * mpage_readpages). If an allocation happens for the |
---|
| 1024 | + * mpage_readahead). If an allocation happens for the |
---|
1017 | 1025 | * second or third page, the process can end up locking |
---|
1018 | 1026 | * the same page twice and deadlocking. Rather than |
---|
1019 | 1027 | * trying to be clever about what pages can be locked, |
---|
.. | .. |
---|
1101 | 1109 | /* Establish migration ptes */ |
---|
1102 | 1110 | VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma, |
---|
1103 | 1111 | page); |
---|
1104 | | - try_to_unmap(page, |
---|
1105 | | - TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); |
---|
| 1112 | + try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK); |
---|
1106 | 1113 | page_was_mapped = 1; |
---|
1107 | 1114 | } |
---|
1108 | 1115 | |
---|
.. | .. |
---|
1141 | 1148 | } |
---|
1142 | 1149 | |
---|
1143 | 1150 | /* |
---|
1144 | | - * gcc 4.7 and 4.8 on arm get an ICEs when inlining unmap_and_move(). Work |
---|
1145 | | - * around it. |
---|
1146 | | - */ |
---|
1147 | | -#if defined(CONFIG_ARM) && \ |
---|
1148 | | - defined(GCC_VERSION) && GCC_VERSION < 40900 && GCC_VERSION >= 40700 |
---|
1149 | | -#define ICE_noinline noinline |
---|
1150 | | -#else |
---|
1151 | | -#define ICE_noinline |
---|
1152 | | -#endif |
---|
1153 | | - |
---|
1154 | | -/* |
---|
1155 | 1151 | * Obtain the lock on page, remove all ptes and migrate the page |
---|
1156 | 1152 | * to the newly allocated page in newpage. |
---|
1157 | 1153 | */ |
---|
1158 | | -static ICE_noinline int unmap_and_move(new_page_t get_new_page, |
---|
| 1154 | +static int unmap_and_move(new_page_t get_new_page, |
---|
1159 | 1155 | free_page_t put_new_page, |
---|
1160 | 1156 | unsigned long private, struct page *page, |
---|
1161 | 1157 | int force, enum migrate_mode mode, |
---|
1162 | 1158 | enum migrate_reason reason) |
---|
1163 | 1159 | { |
---|
1164 | 1160 | int rc = MIGRATEPAGE_SUCCESS; |
---|
1165 | | - struct page *newpage; |
---|
| 1161 | + struct page *newpage = NULL; |
---|
1166 | 1162 | |
---|
1167 | 1163 | if (!thp_migration_supported() && PageTransHuge(page)) |
---|
1168 | | - return -ENOMEM; |
---|
1169 | | - |
---|
1170 | | - newpage = get_new_page(page, private); |
---|
1171 | | - if (!newpage) |
---|
1172 | 1164 | return -ENOMEM; |
---|
1173 | 1165 | |
---|
1174 | 1166 | if (page_count(page) == 1) { |
---|
.. | .. |
---|
1178 | 1170 | if (unlikely(__PageMovable(page))) { |
---|
1179 | 1171 | lock_page(page); |
---|
1180 | 1172 | if (!PageMovable(page)) |
---|
1181 | | - __ClearPageIsolated(page); |
---|
| 1173 | + ClearPageIsolated(page); |
---|
1182 | 1174 | unlock_page(page); |
---|
1183 | 1175 | } |
---|
1184 | | - if (put_new_page) |
---|
1185 | | - put_new_page(newpage, private); |
---|
1186 | | - else |
---|
1187 | | - put_page(newpage); |
---|
1188 | 1176 | goto out; |
---|
1189 | 1177 | } |
---|
| 1178 | + |
---|
| 1179 | + newpage = get_new_page(page, private); |
---|
| 1180 | + if (!newpage) |
---|
| 1181 | + return -ENOMEM; |
---|
1190 | 1182 | |
---|
1191 | 1183 | rc = __unmap_and_move(page, newpage, force, mode); |
---|
1192 | 1184 | if (rc == MIGRATEPAGE_SUCCESS) |
---|
.. | .. |
---|
1197 | 1189 | /* |
---|
1198 | 1190 | * A page that has been migrated has all references |
---|
1199 | 1191 | * removed and will be freed. A page that has not been |
---|
1200 | | - * migrated will have kepts its references and be |
---|
1201 | | - * restored. |
---|
| 1192 | + * migrated will have kept its references and be restored. |
---|
1202 | 1193 | */ |
---|
1203 | 1194 | list_del(&page->lru); |
---|
1204 | 1195 | |
---|
.. | .. |
---|
1209 | 1200 | */ |
---|
1210 | 1201 | if (likely(!__PageMovable(page))) |
---|
1211 | 1202 | mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + |
---|
1212 | | - page_is_file_cache(page), -hpage_nr_pages(page)); |
---|
| 1203 | + page_is_file_lru(page), -thp_nr_pages(page)); |
---|
1213 | 1204 | } |
---|
1214 | 1205 | |
---|
1215 | 1206 | /* |
---|
.. | .. |
---|
1218 | 1209 | * we want to retry. |
---|
1219 | 1210 | */ |
---|
1220 | 1211 | if (rc == MIGRATEPAGE_SUCCESS) { |
---|
1221 | | - put_page(page); |
---|
1222 | | - if (reason == MR_MEMORY_FAILURE) { |
---|
| 1212 | + if (reason != MR_MEMORY_FAILURE) |
---|
1223 | 1213 | /* |
---|
1224 | | - * Set PG_HWPoison on just freed page |
---|
1225 | | - * intentionally. Although it's rather weird, |
---|
1226 | | - * it's how HWPoison flag works at the moment. |
---|
| 1214 | + * We release the page in page_handle_poison. |
---|
1227 | 1215 | */ |
---|
1228 | | - if (set_hwpoison_free_buddy_page(page)) |
---|
1229 | | - num_poisoned_pages_inc(); |
---|
1230 | | - } |
---|
| 1216 | + put_page(page); |
---|
1231 | 1217 | } else { |
---|
1232 | 1218 | if (rc != -EAGAIN) { |
---|
1233 | 1219 | if (likely(!__PageMovable(page))) { |
---|
.. | .. |
---|
1239 | 1225 | if (PageMovable(page)) |
---|
1240 | 1226 | putback_movable_page(page); |
---|
1241 | 1227 | else |
---|
1242 | | - __ClearPageIsolated(page); |
---|
| 1228 | + ClearPageIsolated(page); |
---|
1243 | 1229 | unlock_page(page); |
---|
1244 | 1230 | put_page(page); |
---|
1245 | 1231 | } |
---|
.. | .. |
---|
1280 | 1266 | int page_was_mapped = 0; |
---|
1281 | 1267 | struct page *new_hpage; |
---|
1282 | 1268 | struct anon_vma *anon_vma = NULL; |
---|
| 1269 | + struct address_space *mapping = NULL; |
---|
1283 | 1270 | |
---|
1284 | 1271 | /* |
---|
1285 | | - * Movability of hugepages depends on architectures and hugepage size. |
---|
| 1272 | + * Migratability of hugepages depends on architectures and their size. |
---|
1286 | 1273 | * This check is necessary because some callers of hugepage migration |
---|
1287 | 1274 | * like soft offline and memory hotremove don't walk through page |
---|
1288 | 1275 | * tables or check whether the hugepage is pmd-based or not before |
---|
.. | .. |
---|
1327 | 1314 | goto put_anon; |
---|
1328 | 1315 | |
---|
1329 | 1316 | if (page_mapped(hpage)) { |
---|
1330 | | - try_to_unmap(hpage, |
---|
1331 | | - TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); |
---|
| 1317 | + bool mapping_locked = false; |
---|
| 1318 | + enum ttu_flags ttu = TTU_MIGRATION|TTU_IGNORE_MLOCK; |
---|
| 1319 | + |
---|
| 1320 | + if (!PageAnon(hpage)) { |
---|
| 1321 | + /* |
---|
| 1322 | + * In shared mappings, try_to_unmap could potentially |
---|
| 1323 | + * call huge_pmd_unshare. Because of this, take |
---|
| 1324 | + * semaphore in write mode here and set TTU_RMAP_LOCKED |
---|
| 1325 | + * to let lower levels know we have taken the lock. |
---|
| 1326 | + */ |
---|
| 1327 | + mapping = hugetlb_page_mapping_lock_write(hpage); |
---|
| 1328 | + if (unlikely(!mapping)) |
---|
| 1329 | + goto unlock_put_anon; |
---|
| 1330 | + |
---|
| 1331 | + mapping_locked = true; |
---|
| 1332 | + ttu |= TTU_RMAP_LOCKED; |
---|
| 1333 | + } |
---|
| 1334 | + |
---|
| 1335 | + try_to_unmap(hpage, ttu); |
---|
1332 | 1336 | page_was_mapped = 1; |
---|
| 1337 | + |
---|
| 1338 | + if (mapping_locked) |
---|
| 1339 | + i_mmap_unlock_write(mapping); |
---|
1333 | 1340 | } |
---|
1334 | 1341 | |
---|
1335 | 1342 | if (!page_mapped(hpage)) |
---|
.. | .. |
---|
1339 | 1346 | remove_migration_ptes(hpage, |
---|
1340 | 1347 | rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage, false); |
---|
1341 | 1348 | |
---|
| 1349 | +unlock_put_anon: |
---|
1342 | 1350 | unlock_page(new_hpage); |
---|
1343 | 1351 | |
---|
1344 | 1352 | put_anon: |
---|
.. | .. |
---|
1395 | 1403 | enum migrate_mode mode, int reason) |
---|
1396 | 1404 | { |
---|
1397 | 1405 | int retry = 1; |
---|
| 1406 | + int thp_retry = 1; |
---|
1398 | 1407 | int nr_failed = 0; |
---|
1399 | 1408 | int nr_succeeded = 0; |
---|
| 1409 | + int nr_thp_succeeded = 0; |
---|
| 1410 | + int nr_thp_failed = 0; |
---|
| 1411 | + int nr_thp_split = 0; |
---|
1400 | 1412 | int pass = 0; |
---|
| 1413 | + bool is_thp = false; |
---|
1401 | 1414 | struct page *page; |
---|
1402 | 1415 | struct page *page2; |
---|
1403 | 1416 | int swapwrite = current->flags & PF_SWAPWRITE; |
---|
1404 | | - int rc; |
---|
| 1417 | + int rc, nr_subpages; |
---|
| 1418 | + |
---|
| 1419 | + trace_mm_migrate_pages_start(mode, reason); |
---|
1405 | 1420 | |
---|
1406 | 1421 | if (!swapwrite) |
---|
1407 | 1422 | current->flags |= PF_SWAPWRITE; |
---|
1408 | 1423 | |
---|
1409 | | - for(pass = 0; pass < 10 && retry; pass++) { |
---|
| 1424 | + for (pass = 0; pass < 10 && (retry || thp_retry); pass++) { |
---|
1410 | 1425 | retry = 0; |
---|
| 1426 | + thp_retry = 0; |
---|
1411 | 1427 | |
---|
1412 | 1428 | list_for_each_entry_safe(page, page2, from, lru) { |
---|
1413 | 1429 | retry: |
---|
| 1430 | + /* |
---|
| 1431 | + * THP statistics is based on the source huge page. |
---|
| 1432 | + * Capture required information that might get lost |
---|
| 1433 | + * during migration. |
---|
| 1434 | + */ |
---|
| 1435 | + is_thp = PageTransHuge(page) && !PageHuge(page); |
---|
| 1436 | + nr_subpages = thp_nr_pages(page); |
---|
1414 | 1437 | cond_resched(); |
---|
1415 | 1438 | |
---|
1416 | 1439 | if (PageHuge(page)) |
---|
.. | .. |
---|
1435 | 1458 | * we encounter them after the rest of the list |
---|
1436 | 1459 | * is processed. |
---|
1437 | 1460 | */ |
---|
1438 | | - if (PageTransHuge(page) && !PageHuge(page)) { |
---|
| 1461 | + if (is_thp) { |
---|
1439 | 1462 | lock_page(page); |
---|
1440 | 1463 | rc = split_huge_page_to_list(page, from); |
---|
1441 | 1464 | unlock_page(page); |
---|
1442 | 1465 | if (!rc) { |
---|
1443 | 1466 | list_safe_reset_next(page, page2, lru); |
---|
| 1467 | + nr_thp_split++; |
---|
1444 | 1468 | goto retry; |
---|
1445 | 1469 | } |
---|
| 1470 | + |
---|
| 1471 | + nr_thp_failed++; |
---|
| 1472 | + nr_failed += nr_subpages; |
---|
| 1473 | + goto out; |
---|
1446 | 1474 | } |
---|
1447 | 1475 | nr_failed++; |
---|
1448 | 1476 | goto out; |
---|
1449 | 1477 | case -EAGAIN: |
---|
| 1478 | + if (is_thp) { |
---|
| 1479 | + thp_retry++; |
---|
| 1480 | + break; |
---|
| 1481 | + } |
---|
1450 | 1482 | retry++; |
---|
1451 | 1483 | break; |
---|
1452 | 1484 | case MIGRATEPAGE_SUCCESS: |
---|
| 1485 | + if (is_thp) { |
---|
| 1486 | + nr_thp_succeeded++; |
---|
| 1487 | + nr_succeeded += nr_subpages; |
---|
| 1488 | + break; |
---|
| 1489 | + } |
---|
1453 | 1490 | nr_succeeded++; |
---|
1454 | 1491 | break; |
---|
1455 | 1492 | default: |
---|
.. | .. |
---|
1459 | 1496 | * removed from migration page list and not |
---|
1460 | 1497 | * retried in the next outer loop. |
---|
1461 | 1498 | */ |
---|
| 1499 | + if (is_thp) { |
---|
| 1500 | + nr_thp_failed++; |
---|
| 1501 | + nr_failed += nr_subpages; |
---|
| 1502 | + break; |
---|
| 1503 | + } |
---|
1462 | 1504 | nr_failed++; |
---|
1463 | 1505 | break; |
---|
1464 | 1506 | } |
---|
1465 | 1507 | } |
---|
1466 | 1508 | } |
---|
1467 | | - nr_failed += retry; |
---|
| 1509 | + nr_failed += retry + thp_retry; |
---|
| 1510 | + nr_thp_failed += thp_retry; |
---|
1468 | 1511 | rc = nr_failed; |
---|
1469 | 1512 | out: |
---|
1470 | | - if (nr_succeeded) |
---|
1471 | | - count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded); |
---|
1472 | | - if (nr_failed) |
---|
1473 | | - count_vm_events(PGMIGRATE_FAIL, nr_failed); |
---|
1474 | | - trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason); |
---|
| 1513 | + count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded); |
---|
| 1514 | + count_vm_events(PGMIGRATE_FAIL, nr_failed); |
---|
| 1515 | + count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded); |
---|
| 1516 | + count_vm_events(THP_MIGRATION_FAIL, nr_thp_failed); |
---|
| 1517 | + count_vm_events(THP_MIGRATION_SPLIT, nr_thp_split); |
---|
| 1518 | + trace_mm_migrate_pages(nr_succeeded, nr_failed, nr_thp_succeeded, |
---|
| 1519 | + nr_thp_failed, nr_thp_split, mode, reason); |
---|
1475 | 1520 | |
---|
1476 | 1521 | if (!swapwrite) |
---|
1477 | 1522 | current->flags &= ~PF_SWAPWRITE; |
---|
1478 | 1523 | |
---|
1479 | 1524 | return rc; |
---|
| 1525 | +} |
---|
| 1526 | +EXPORT_SYMBOL_GPL(migrate_pages); |
---|
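migrate_pages() now accounts THPs separately: nr_thp_split, nr_thp_succeeded and nr_thp_failed feed the new THP_MIGRATION_* vm events and the extended trace_mm_migrate_pages tracepoint, while the per-page PGMIGRATE_* counters are bumped by the subpage count captured in nr_subpages before the page can be split or freed. As a worked example (assuming x86-64, where a PMD-sized THP has 512 subpages): one successfully migrated THP adds 512 to PGMIGRATE_SUCCESS and 1 to THP_MIGRATION_SUCCESS; a THP that has to be split first adds 1 to THP_MIGRATION_SPLIT, after which its subpages are retried and counted as ordinary pages; a THP that can be neither migrated nor split adds 512 to PGMIGRATE_FAIL and 1 to THP_MIGRATION_FAIL.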
| 1527 | + |
---|
| 1528 | +struct page *alloc_migration_target(struct page *page, unsigned long private) |
---|
| 1529 | +{ |
---|
| 1530 | + struct migration_target_control *mtc; |
---|
| 1531 | + gfp_t gfp_mask; |
---|
| 1532 | + unsigned int order = 0; |
---|
| 1533 | + struct page *new_page = NULL; |
---|
| 1534 | + int nid; |
---|
| 1535 | + int zidx; |
---|
| 1536 | + |
---|
| 1537 | + mtc = (struct migration_target_control *)private; |
---|
| 1538 | + gfp_mask = mtc->gfp_mask; |
---|
| 1539 | + nid = mtc->nid; |
---|
| 1540 | + if (nid == NUMA_NO_NODE) |
---|
| 1541 | + nid = page_to_nid(page); |
---|
| 1542 | + |
---|
| 1543 | + if (PageHuge(page)) { |
---|
| 1544 | + struct hstate *h = page_hstate(compound_head(page)); |
---|
| 1545 | + |
---|
| 1546 | + gfp_mask = htlb_modify_alloc_mask(h, gfp_mask); |
---|
| 1547 | + return alloc_huge_page_nodemask(h, nid, mtc->nmask, gfp_mask); |
---|
| 1548 | + } |
---|
| 1549 | + |
---|
| 1550 | + if (PageTransHuge(page)) { |
---|
| 1551 | + /* |
---|
| 1552 | + * clear __GFP_RECLAIM to make the migration callback |
---|
| 1553 | + * consistent with regular THP allocations. |
---|
| 1554 | + */ |
---|
| 1555 | + gfp_mask &= ~__GFP_RECLAIM; |
---|
| 1556 | + gfp_mask |= GFP_TRANSHUGE; |
---|
| 1557 | + order = HPAGE_PMD_ORDER; |
---|
| 1558 | + } |
---|
| 1559 | + zidx = zone_idx(page_zone(page)); |
---|
| 1560 | + if (is_highmem_idx(zidx) || zidx == ZONE_MOVABLE) |
---|
| 1561 | + gfp_mask |= __GFP_HIGHMEM; |
---|
| 1562 | + |
---|
| 1563 | + new_page = __alloc_pages_nodemask(gfp_mask, order, nid, mtc->nmask); |
---|
| 1564 | + |
---|
| 1565 | + if (new_page && PageTransHuge(new_page)) |
---|
| 1566 | + prep_transhuge_page(new_page); |
---|
| 1567 | + |
---|
| 1568 | + return new_page; |
---|
1480 | 1569 | } |
---|
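alloc_migration_target() becomes the common allocation callback for migrate_pages(): the node, nodemask and gfp mask travel in a struct migration_target_control whose address is passed through the opaque private argument, and the callback then picks the hugetlb, THP or order-0 allocator as appropriate, falling back to the page's own node when nid is NUMA_NO_NODE and adding __GFP_HIGHMEM for highmem or movable source zones. The do_move_pages_to_node() hunk just below is the in-tree user; a minimal sketch of the same calling pattern, where the wrapper name and the assumption that the caller owns the isolated page list are the only inventions:

```c
/* Sketch: drive migrate_pages() with the new unified target allocator. */
static int migrate_list_to_node(struct list_head *pagelist, int node)
{
	struct migration_target_control mtc = {
		.nid = node,	/* NUMA_NO_NODE would mean "each page's current node" */
		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
	};
	int err;

	err = migrate_pages(pagelist, alloc_migration_target, NULL,
			    (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
	if (err)
		putback_movable_pages(pagelist);	/* caller keeps ownership on failure */
	return err;
}
```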
1481 | 1570 | |
---|
1482 | 1571 | #ifdef CONFIG_NUMA |
---|
.. | .. |
---|
1496 | 1585 | struct list_head *pagelist, int node) |
---|
1497 | 1586 | { |
---|
1498 | 1587 | int err; |
---|
| 1588 | + struct migration_target_control mtc = { |
---|
| 1589 | + .nid = node, |
---|
| 1590 | + .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, |
---|
| 1591 | + }; |
---|
1499 | 1592 | |
---|
1500 | | - if (list_empty(pagelist)) |
---|
1501 | | - return 0; |
---|
1502 | | - |
---|
1503 | | - err = migrate_pages(pagelist, alloc_new_node_page, NULL, node, |
---|
1504 | | - MIGRATE_SYNC, MR_SYSCALL); |
---|
| 1593 | + err = migrate_pages(pagelist, alloc_migration_target, NULL, |
---|
| 1594 | + (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL); |
---|
1505 | 1595 | if (err) |
---|
1506 | 1596 | putback_movable_pages(pagelist); |
---|
1507 | 1597 | return err; |
---|
.. | .. |
---|
1524 | 1614 | unsigned int follflags; |
---|
1525 | 1615 | int err; |
---|
1526 | 1616 | |
---|
1527 | | - down_read(&mm->mmap_sem); |
---|
| 1617 | + mmap_read_lock(mm); |
---|
1528 | 1618 | err = -EFAULT; |
---|
1529 | 1619 | vma = find_vma(mm, addr); |
---|
1530 | 1620 | if (!vma || addr < vma->vm_start || !vma_migratable(vma)) |
---|
.. | .. |
---|
1552 | 1642 | |
---|
1553 | 1643 | if (PageHuge(page)) { |
---|
1554 | 1644 | if (PageHead(page)) { |
---|
1555 | | - isolate_huge_page(page, pagelist); |
---|
1556 | | - err = 1; |
---|
| 1645 | + err = isolate_hugetlb(page, pagelist); |
---|
| 1646 | + if (!err) |
---|
| 1647 | + err = 1; |
---|
1557 | 1648 | } |
---|
1558 | 1649 | } else { |
---|
1559 | 1650 | struct page *head; |
---|
.. | .. |
---|
1566 | 1657 | err = 1; |
---|
1567 | 1658 | list_add_tail(&head->lru, pagelist); |
---|
1568 | 1659 | mod_node_page_state(page_pgdat(head), |
---|
1569 | | - NR_ISOLATED_ANON + page_is_file_cache(head), |
---|
1570 | | - hpage_nr_pages(head)); |
---|
| 1660 | + NR_ISOLATED_ANON + page_is_file_lru(head), |
---|
| 1661 | + thp_nr_pages(head)); |
---|
1571 | 1662 | } |
---|
1572 | 1663 | out_putpage: |
---|
1573 | 1664 | /* |
---|
.. | .. |
---|
1575 | 1666 | * isolate_lru_page() or drop the page ref if it was |
---|
1576 | 1667 | * not isolated. |
---|
1577 | 1668 | */ |
---|
1578 | | - put_page(page); |
---|
| 1669 | + put_user_page(page); |
---|
1579 | 1670 | out: |
---|
1580 | | - up_read(&mm->mmap_sem); |
---|
| 1671 | + mmap_read_unlock(mm); |
---|
1581 | 1672 | return err; |
---|
| 1673 | +} |
---|
| 1674 | + |
---|
| 1675 | +static int move_pages_and_store_status(struct mm_struct *mm, int node, |
---|
| 1676 | + struct list_head *pagelist, int __user *status, |
---|
| 1677 | + int start, int i, unsigned long nr_pages) |
---|
| 1678 | +{ |
---|
| 1679 | + int err; |
---|
| 1680 | + |
---|
| 1681 | + if (list_empty(pagelist)) |
---|
| 1682 | + return 0; |
---|
| 1683 | + |
---|
| 1684 | + err = do_move_pages_to_node(mm, pagelist, node); |
---|
| 1685 | + if (err) { |
---|
| 1686 | + /* |
---|
| 1687 | + * Positive err means the number of failed |
---|
| 1688 | + * pages to migrate. Since we are going to |
---|
| 1689 | + * abort and return the number of non-migrated |
---|
| 1690 | + * pages, so need to incude the rest of the |
---|
| 1691 | + * nr_pages that have not been attempted as |
---|
| 1692 | + * well. |
---|
| 1693 | + */ |
---|
| 1694 | + if (err > 0) |
---|
| 1695 | + err += nr_pages - i - 1; |
---|
| 1696 | + return err; |
---|
| 1697 | + } |
---|
| 1698 | + return store_status(status, start, node, i - start); |
---|
1582 | 1699 | } |
---|
1583 | 1700 | |
---|
1584 | 1701 | /* |
---|
.. | .. |
---|
1596 | 1713 | int start, i; |
---|
1597 | 1714 | int err = 0, err1; |
---|
1598 | 1715 | |
---|
1599 | | - migrate_prep(); |
---|
| 1716 | + lru_cache_disable(); |
---|
1600 | 1717 | |
---|
1601 | 1718 | for (i = start = 0; i < nr_pages; i++) { |
---|
1602 | 1719 | const void __user *p; |
---|
.. | .. |
---|
1624 | 1741 | current_node = node; |
---|
1625 | 1742 | start = i; |
---|
1626 | 1743 | } else if (node != current_node) { |
---|
1627 | | - err = do_move_pages_to_node(mm, &pagelist, current_node); |
---|
1628 | | - if (err) { |
---|
1629 | | - /* |
---|
1630 | | - * Positive err means the number of failed |
---|
1631 | | - * pages to migrate. Since we are going to |
---|
1632 | | - * abort and return the number of non-migrated |
---|
1633 | | - * pages, so need to incude the rest of the |
---|
1634 | | - * nr_pages that have not been attempted as |
---|
1635 | | - * well. |
---|
1636 | | - */ |
---|
1637 | | - if (err > 0) |
---|
1638 | | - err += nr_pages - i - 1; |
---|
1639 | | - goto out; |
---|
1640 | | - } |
---|
1641 | | - err = store_status(status, start, current_node, i - start); |
---|
| 1744 | + err = move_pages_and_store_status(mm, current_node, |
---|
| 1745 | + &pagelist, status, start, i, nr_pages); |
---|
1642 | 1746 | if (err) |
---|
1643 | 1747 | goto out; |
---|
1644 | 1748 | start = i; |
---|
.. | .. |
---|
1652 | 1756 | err = add_page_for_migration(mm, addr, current_node, |
---|
1653 | 1757 | &pagelist, flags & MPOL_MF_MOVE_ALL); |
---|
1654 | 1758 | |
---|
1655 | | - if (!err) { |
---|
1656 | | - /* The page is already on the target node */ |
---|
1657 | | - err = store_status(status, i, current_node, 1); |
---|
1658 | | - if (err) |
---|
1659 | | - goto out_flush; |
---|
1660 | | - continue; |
---|
1661 | | - } else if (err > 0) { |
---|
| 1759 | + if (err > 0) { |
---|
1662 | 1760 | /* The page is successfully queued for migration */ |
---|
1663 | 1761 | continue; |
---|
1664 | 1762 | } |
---|
1665 | 1763 | |
---|
1666 | | - err = store_status(status, i, err, 1); |
---|
| 1764 | + /* |
---|
| 1765 | + * If the page is already on the target node (!err), store the |
---|
| 1766 | + * node, otherwise, store the err. |
---|
| 1767 | + */ |
---|
| 1768 | + err = store_status(status, i, err ? : current_node, 1); |
---|
1667 | 1769 | if (err) |
---|
1668 | 1770 | goto out_flush; |
---|
1669 | 1771 | |
---|
1670 | | - err = do_move_pages_to_node(mm, &pagelist, current_node); |
---|
1671 | | - if (err) { |
---|
1672 | | - if (err > 0) |
---|
1673 | | - err += nr_pages - i - 1; |
---|
| 1772 | + err = move_pages_and_store_status(mm, current_node, &pagelist, |
---|
| 1773 | + status, start, i, nr_pages); |
---|
| 1774 | + if (err) |
---|
1674 | 1775 | goto out; |
---|
1675 | | - } |
---|
1676 | | - if (i > start) { |
---|
1677 | | - err = store_status(status, start, current_node, i - start); |
---|
1678 | | - if (err) |
---|
1679 | | - goto out; |
---|
1680 | | - } |
---|
1681 | 1776 | current_node = NUMA_NO_NODE; |
---|
1682 | 1777 | } |
---|
1683 | 1778 | out_flush: |
---|
1684 | | - if (list_empty(&pagelist)) |
---|
1685 | | - return err; |
---|
1686 | | - |
---|
1687 | 1779 | /* Make sure we do not overwrite the existing error */ |
---|
1688 | | - err1 = do_move_pages_to_node(mm, &pagelist, current_node); |
---|
1689 | | - /* |
---|
1690 | | - * Don't have to report non-attempted pages here since: |
---|
1691 | | - * - If the above loop is done gracefully all pages have been |
---|
1692 | | - * attempted. |
---|
1693 | | - * - If the above loop is aborted it means a fatal error |
---|
1694 | | - * happened, should return ret. |
---|
1695 | | - */ |
---|
1696 | | - if (!err1) |
---|
1697 | | - err1 = store_status(status, start, current_node, i - start); |
---|
| 1780 | + err1 = move_pages_and_store_status(mm, current_node, &pagelist, |
---|
| 1781 | + status, start, i, nr_pages); |
---|
1698 | 1782 | if (err >= 0) |
---|
1699 | 1783 | err = err1; |
---|
1700 | 1784 | out: |
---|
| 1785 | + lru_cache_enable(); |
---|
1701 | 1786 | return err; |
---|
1702 | 1787 | } |
---|
1703 | 1788 | |
---|
.. | .. |
---|
1709 | 1794 | { |
---|
1710 | 1795 | unsigned long i; |
---|
1711 | 1796 | |
---|
1712 | | - down_read(&mm->mmap_sem); |
---|
| 1797 | + mmap_read_lock(mm); |
---|
1713 | 1798 | |
---|
1714 | 1799 | for (i = 0; i < nr_pages; i++) { |
---|
1715 | 1800 | unsigned long addr = (unsigned long)(*pages); |
---|
.. | .. |
---|
1736 | 1821 | status++; |
---|
1737 | 1822 | } |
---|
1738 | 1823 | |
---|
1739 | | - up_read(&mm->mmap_sem); |
---|
| 1824 | + mmap_read_unlock(mm); |
---|
1740 | 1825 | } |
---|
1741 | 1826 | |
---|
1742 | 1827 | /* |
---|
.. | .. |
---|
1773 | 1858 | return nr_pages ? -EFAULT : 0; |
---|
1774 | 1859 | } |
---|
1775 | 1860 | |
---|
| 1861 | +static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes) |
---|
| 1862 | +{ |
---|
| 1863 | + struct task_struct *task; |
---|
| 1864 | + struct mm_struct *mm; |
---|
| 1865 | + |
---|
| 1866 | + /* |
---|
| 1867 | + * There is no need to check if current process has the right to modify |
---|
| 1868 | + * the specified process when they are same. |
---|
| 1869 | + */ |
---|
| 1870 | + if (!pid) { |
---|
| 1871 | + mmget(current->mm); |
---|
| 1872 | + *mem_nodes = cpuset_mems_allowed(current); |
---|
| 1873 | + return current->mm; |
---|
| 1874 | + } |
---|
| 1875 | + |
---|
| 1876 | + /* Find the mm_struct */ |
---|
| 1877 | + rcu_read_lock(); |
---|
| 1878 | + task = find_task_by_vpid(pid); |
---|
| 1879 | + if (!task) { |
---|
| 1880 | + rcu_read_unlock(); |
---|
| 1881 | + return ERR_PTR(-ESRCH); |
---|
| 1882 | + } |
---|
| 1883 | + get_task_struct(task); |
---|
| 1884 | + |
---|
| 1885 | + /* |
---|
| 1886 | + * Check if this process has the right to modify the specified |
---|
| 1887 | + * process. Use the regular "ptrace_may_access()" checks. |
---|
| 1888 | + */ |
---|
| 1889 | + if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) { |
---|
| 1890 | + rcu_read_unlock(); |
---|
| 1891 | + mm = ERR_PTR(-EPERM); |
---|
| 1892 | + goto out; |
---|
| 1893 | + } |
---|
| 1894 | + rcu_read_unlock(); |
---|
| 1895 | + |
---|
| 1896 | + mm = ERR_PTR(security_task_movememory(task)); |
---|
| 1897 | + if (IS_ERR(mm)) |
---|
| 1898 | + goto out; |
---|
| 1899 | + *mem_nodes = cpuset_mems_allowed(task); |
---|
| 1900 | + mm = get_task_mm(task); |
---|
| 1901 | +out: |
---|
| 1902 | + put_task_struct(task); |
---|
| 1903 | + if (!mm) |
---|
| 1904 | + mm = ERR_PTR(-EINVAL); |
---|
| 1905 | + return mm; |
---|
| 1906 | +} |
---|
| 1907 | + |
---|
1776 | 1908 | /* |
---|
1777 | 1909 | * Move a list of pages in the address space of the currently executing |
---|
1778 | 1910 | * process. |
---|
.. | .. |
---|
1782 | 1914 | const int __user *nodes, |
---|
1783 | 1915 | int __user *status, int flags) |
---|
1784 | 1916 | { |
---|
1785 | | - struct task_struct *task; |
---|
1786 | 1917 | struct mm_struct *mm; |
---|
1787 | 1918 | int err; |
---|
1788 | 1919 | nodemask_t task_nodes; |
---|
.. | .. |
---|
1794 | 1925 | if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) |
---|
1795 | 1926 | return -EPERM; |
---|
1796 | 1927 | |
---|
1797 | | - /* Find the mm_struct */ |
---|
1798 | | - rcu_read_lock(); |
---|
1799 | | - task = pid ? find_task_by_vpid(pid) : current; |
---|
1800 | | - if (!task) { |
---|
1801 | | - rcu_read_unlock(); |
---|
1802 | | - return -ESRCH; |
---|
1803 | | - } |
---|
1804 | | - get_task_struct(task); |
---|
1805 | | - |
---|
1806 | | - /* |
---|
1807 | | - * Check if this process has the right to modify the specified |
---|
1808 | | - * process. Use the regular "ptrace_may_access()" checks. |
---|
1809 | | - */ |
---|
1810 | | - if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) { |
---|
1811 | | - rcu_read_unlock(); |
---|
1812 | | - err = -EPERM; |
---|
1813 | | - goto out; |
---|
1814 | | - } |
---|
1815 | | - rcu_read_unlock(); |
---|
1816 | | - |
---|
1817 | | - err = security_task_movememory(task); |
---|
1818 | | - if (err) |
---|
1819 | | - goto out; |
---|
1820 | | - |
---|
1821 | | - task_nodes = cpuset_mems_allowed(task); |
---|
1822 | | - mm = get_task_mm(task); |
---|
1823 | | - put_task_struct(task); |
---|
1824 | | - |
---|
1825 | | - if (!mm) |
---|
1826 | | - return -EINVAL; |
---|
| 1928 | + mm = find_mm_struct(pid, &task_nodes); |
---|
| 1929 | + if (IS_ERR(mm)) |
---|
| 1930 | + return PTR_ERR(mm); |
---|
1827 | 1931 | |
---|
1828 | 1932 | if (nodes) |
---|
1829 | 1933 | err = do_pages_move(mm, task_nodes, nr_pages, pages, |
---|
.. | .. |
---|
1832 | 1936 | err = do_pages_stat(mm, nr_pages, pages, status); |
---|
1833 | 1937 | |
---|
1834 | 1938 | mmput(mm); |
---|
1835 | | - return err; |
---|
1836 | | - |
---|
1837 | | -out: |
---|
1838 | | - put_task_struct(task); |
---|
1839 | 1939 | return err; |
---|
1840 | 1940 | } |
---|
1841 | 1941 | |
---|
.. | .. |
---|
1889 | 1989 | if (!zone_watermark_ok(zone, 0, |
---|
1890 | 1990 | high_wmark_pages(zone) + |
---|
1891 | 1991 | nr_migrate_pages, |
---|
1892 | | - 0, 0)) |
---|
| 1992 | + ZONE_MOVABLE, 0)) |
---|
1893 | 1993 | continue; |
---|
1894 | 1994 | return true; |
---|
1895 | 1995 | } |
---|
.. | .. |
---|
1918 | 2018 | VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page); |
---|
1919 | 2019 | |
---|
1920 | 2020 | /* Avoid migrating to a node that is nearly full */ |
---|
1921 | | - if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page))) |
---|
| 2021 | + if (!migrate_balanced_pgdat(pgdat, compound_nr(page))) |
---|
1922 | 2022 | return 0; |
---|
1923 | 2023 | |
---|
1924 | 2024 | if (isolate_lru_page(page)) |
---|
.. | .. |
---|
1936 | 2036 | return 0; |
---|
1937 | 2037 | } |
---|
1938 | 2038 | |
---|
1939 | | - page_lru = page_is_file_cache(page); |
---|
| 2039 | + page_lru = page_is_file_lru(page); |
---|
1940 | 2040 | mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru, |
---|
1941 | | - hpage_nr_pages(page)); |
---|
| 2041 | + thp_nr_pages(page)); |
---|
1942 | 2042 | |
---|
1943 | 2043 | /* |
---|
1944 | 2044 | * Isolating the page has taken another reference, so the |
---|
.. | .. |
---|
1960 | 2060 | * node. Caller is expected to have an elevated reference count on |
---|
1961 | 2061 | * the page that will be dropped by this function before returning. |
---|
1962 | 2062 | */ |
---|
1963 | | -int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, |
---|
| 2063 | +int migrate_misplaced_page(struct page *page, struct vm_fault *vmf, |
---|
1964 | 2064 | int node) |
---|
1965 | 2065 | { |
---|
1966 | 2066 | pg_data_t *pgdat = NODE_DATA(node); |
---|
.. | .. |
---|
1972 | 2072 | * Don't migrate file pages that are mapped in multiple processes |
---|
1973 | 2073 | * with execute permissions as they are probably shared libraries. |
---|
1974 | 2074 | */ |
---|
1975 | | - if (page_mapcount(page) != 1 && page_is_file_cache(page) && |
---|
1976 | | - (vma->vm_flags & VM_EXEC)) |
---|
| 2075 | + if (page_mapcount(page) != 1 && page_is_file_lru(page) && |
---|
| 2076 | + (vmf->vma_flags & VM_EXEC)) |
---|
1977 | 2077 | goto out; |
---|
1978 | 2078 | |
---|
1979 | 2079 | /* |
---|
1980 | 2080 | * Also do not migrate dirty pages as not all filesystems can move |
---|
1981 | 2081 | * dirty pages in MIGRATE_ASYNC mode which is a waste of cycles. |
---|
1982 | 2082 | */ |
---|
1983 | | - if (page_is_file_cache(page) && PageDirty(page)) |
---|
| 2083 | + if (page_is_file_lru(page) && PageDirty(page)) |
---|
1984 | 2084 | goto out; |
---|
1985 | 2085 | |
---|
1986 | 2086 | isolated = numamigrate_isolate_page(pgdat, page); |
---|
.. | .. |
---|
1995 | 2095 | if (!list_empty(&migratepages)) { |
---|
1996 | 2096 | list_del(&page->lru); |
---|
1997 | 2097 | dec_node_page_state(page, NR_ISOLATED_ANON + |
---|
1998 | | - page_is_file_cache(page)); |
---|
| 2098 | + page_is_file_lru(page)); |
---|
1999 | 2099 | putback_lru_page(page); |
---|
2000 | 2100 | } |
---|
2001 | 2101 | isolated = 0; |
---|
.. | .. |
---|
2025 | 2125 | pg_data_t *pgdat = NODE_DATA(node); |
---|
2026 | 2126 | int isolated = 0; |
---|
2027 | 2127 | struct page *new_page = NULL; |
---|
2028 | | - int page_lru = page_is_file_cache(page); |
---|
2029 | | - unsigned long mmun_start = address & HPAGE_PMD_MASK; |
---|
2030 | | - unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE; |
---|
| 2128 | + int page_lru = page_is_file_lru(page); |
---|
| 2129 | + unsigned long start = address & HPAGE_PMD_MASK; |
---|
2031 | 2130 | |
---|
2032 | 2131 | new_page = alloc_pages_node(node, |
---|
2033 | 2132 | (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE), |
---|
.. | .. |
---|
2050 | 2149 | /* anon mapping, we can simply copy page->mapping to the new page: */ |
---|
2051 | 2150 | new_page->mapping = page->mapping; |
---|
2052 | 2151 | new_page->index = page->index; |
---|
| 2152 | + /* flush the cache before copying using the kernel virtual address */ |
---|
| 2153 | + flush_cache_range(vma, start, start + HPAGE_PMD_SIZE); |
---|
2053 | 2154 | migrate_page_copy(new_page, page); |
---|
2054 | 2155 | WARN_ON(PageLRU(new_page)); |
---|
2055 | 2156 | |
---|
2056 | 2157 | /* Recheck the target PMD */ |
---|
2057 | | - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); |
---|
2058 | 2158 | ptl = pmd_lock(mm, pmd); |
---|
2059 | 2159 | if (unlikely(!pmd_same(*pmd, entry) || !page_ref_freeze(page, 2))) { |
---|
2060 | 2160 | spin_unlock(ptl); |
---|
2061 | | - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
---|
2062 | 2161 | |
---|
2063 | 2162 | /* Reverse changes made by migrate_page_copy() */ |
---|
2064 | 2163 | if (TestClearPageActive(new_page)) |
---|
.. | .. |
---|
2089 | 2188 | * new page and page_add_new_anon_rmap guarantee the copy is |
---|
2090 | 2189 | * visible before the pagetable update. |
---|
2091 | 2190 | */ |
---|
2092 | | - flush_cache_range(vma, mmun_start, mmun_end); |
---|
2093 | | - page_add_anon_rmap(new_page, vma, mmun_start, true); |
---|
| 2191 | + page_add_anon_rmap(new_page, vma, start, true); |
---|
2094 | 2192 | /* |
---|
2095 | 2193 | * At this point the pmd is numa/protnone (i.e. non present) and the TLB |
---|
2096 | 2194 | * has already been flushed globally. So no TLB can be currently |
---|
.. | .. |
---|
2098 | 2196 | * pmd before doing set_pmd_at(), nor to flush the TLB after |
---|
2099 | 2197 | * set_pmd_at(). Clearing the pmd here would introduce a race |
---|
2100 | 2198 | * condition against MADV_DONTNEED, because MADV_DONTNEED only holds the |
---|
2101 | | - * mmap_sem for reading. If the pmd is set to NULL at any given time, |
---|
| 2199 | + * mmap_lock for reading. If the pmd is set to NULL at any given time, |
---|
2102 | 2200 | * MADV_DONTNEED won't wait on the pmd lock and it'll skip clearing this |
---|
2103 | 2201 | * pmd. |
---|
2104 | 2202 | */ |
---|
2105 | | - set_pmd_at(mm, mmun_start, pmd, entry); |
---|
| 2203 | + set_pmd_at(mm, start, pmd, entry); |
---|
2106 | 2204 | update_mmu_cache_pmd(vma, address, &entry); |
---|
2107 | 2205 | |
---|
2108 | 2206 | page_ref_unfreeze(page, 2); |
---|
.. | .. |
---|
2111 | 2209 | set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED); |
---|
2112 | 2210 | |
---|
2113 | 2211 | spin_unlock(ptl); |
---|
2114 | | - /* |
---|
2115 | | - * No need to double call mmu_notifier->invalidate_range() callback as |
---|
2116 | | - * the above pmdp_huge_clear_flush_notify() did already call it. |
---|
2117 | | - */ |
---|
2118 | | - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
---|
2119 | 2212 | |
---|
2120 | 2213 | /* Take an "isolate" reference and put new page on the LRU. */ |
---|
2121 | 2214 | get_page(new_page); |
---|
.. | .. |
---|
2139 | 2232 | ptl = pmd_lock(mm, pmd); |
---|
2140 | 2233 | if (pmd_same(*pmd, entry)) { |
---|
2141 | 2234 | entry = pmd_modify(entry, vma->vm_page_prot); |
---|
2142 | | - set_pmd_at(mm, mmun_start, pmd, entry); |
---|
| 2235 | + set_pmd_at(mm, start, pmd, entry); |
---|
2143 | 2236 | update_mmu_cache_pmd(vma, address, &entry); |
---|
2144 | 2237 | } |
---|
2145 | 2238 | spin_unlock(ptl); |
---|
.. | .. |
---|
2153 | 2246 | |
---|
2154 | 2247 | #endif /* CONFIG_NUMA */ |
---|
2155 | 2248 | |
---|
2156 | | -#if defined(CONFIG_MIGRATE_VMA_HELPER) |
---|
2157 | | -struct migrate_vma { |
---|
2158 | | - struct vm_area_struct *vma; |
---|
2159 | | - unsigned long *dst; |
---|
2160 | | - unsigned long *src; |
---|
2161 | | - unsigned long cpages; |
---|
2162 | | - unsigned long npages; |
---|
2163 | | - unsigned long start; |
---|
2164 | | - unsigned long end; |
---|
2165 | | -}; |
---|
2166 | | - |
---|
| 2249 | +#ifdef CONFIG_DEVICE_PRIVATE |
---|
2167 | 2250 | static int migrate_vma_collect_hole(unsigned long start, |
---|
2168 | 2251 | unsigned long end, |
---|
| 2252 | + __always_unused int depth, |
---|
2169 | 2253 | struct mm_walk *walk) |
---|
2170 | 2254 | { |
---|
2171 | 2255 | struct migrate_vma *migrate = walk->private; |
---|
2172 | 2256 | unsigned long addr; |
---|
2173 | 2257 | |
---|
2174 | | - for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) { |
---|
| 2258 | + /* Only allow populating anonymous memory. */ |
---|
| 2259 | + if (!vma_is_anonymous(walk->vma)) { |
---|
| 2260 | + for (addr = start; addr < end; addr += PAGE_SIZE) { |
---|
| 2261 | + migrate->src[migrate->npages] = 0; |
---|
| 2262 | + migrate->dst[migrate->npages] = 0; |
---|
| 2263 | + migrate->npages++; |
---|
| 2264 | + } |
---|
| 2265 | + return 0; |
---|
| 2266 | + } |
---|
| 2267 | + |
---|
| 2268 | + for (addr = start; addr < end; addr += PAGE_SIZE) { |
---|
2175 | 2269 | migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE; |
---|
2176 | 2270 | migrate->dst[migrate->npages] = 0; |
---|
2177 | 2271 | migrate->npages++; |
---|
.. | .. |
---|
2188 | 2282 | struct migrate_vma *migrate = walk->private; |
---|
2189 | 2283 | unsigned long addr; |
---|
2190 | 2284 | |
---|
2191 | | - for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) { |
---|
| 2285 | + for (addr = start; addr < end; addr += PAGE_SIZE) { |
---|
2192 | 2286 | migrate->dst[migrate->npages] = 0; |
---|
2193 | 2287 | migrate->src[migrate->npages++] = 0; |
---|
2194 | 2288 | } |
---|
.. | .. |
---|
2210 | 2304 | |
---|
2211 | 2305 | again: |
---|
2212 | 2306 | if (pmd_none(*pmdp)) |
---|
2213 | | - return migrate_vma_collect_hole(start, end, walk); |
---|
| 2307 | + return migrate_vma_collect_hole(start, end, -1, walk); |
---|
2214 | 2308 | |
---|
2215 | 2309 | if (pmd_trans_huge(*pmdp)) { |
---|
2216 | 2310 | struct page *page; |
---|
.. | .. |
---|
2243 | 2337 | return migrate_vma_collect_skip(start, end, |
---|
2244 | 2338 | walk); |
---|
2245 | 2339 | if (pmd_none(*pmdp)) |
---|
2246 | | - return migrate_vma_collect_hole(start, end, |
---|
| 2340 | + return migrate_vma_collect_hole(start, end, -1, |
---|
2247 | 2341 | walk); |
---|
2248 | 2342 | } |
---|
2249 | 2343 | } |
---|
.. | .. |
---|
2255 | 2349 | arch_enter_lazy_mmu_mode(); |
---|
2256 | 2350 | |
---|
2257 | 2351 | for (; addr < end; addr += PAGE_SIZE, ptep++) { |
---|
2258 | | - unsigned long mpfn, pfn; |
---|
| 2352 | + unsigned long mpfn = 0, pfn; |
---|
2259 | 2353 | struct page *page; |
---|
2260 | 2354 | swp_entry_t entry; |
---|
2261 | 2355 | pte_t pte; |
---|
2262 | 2356 | |
---|
2263 | 2357 | pte = *ptep; |
---|
2264 | | - pfn = pte_pfn(pte); |
---|
2265 | 2358 | |
---|
2266 | 2359 | if (pte_none(pte)) { |
---|
2267 | | - mpfn = MIGRATE_PFN_MIGRATE; |
---|
2268 | | - migrate->cpages++; |
---|
2269 | | - pfn = 0; |
---|
| 2360 | + if (vma_is_anonymous(vma)) { |
---|
| 2361 | + mpfn = MIGRATE_PFN_MIGRATE; |
---|
| 2362 | + migrate->cpages++; |
---|
| 2363 | + } |
---|
2270 | 2364 | goto next; |
---|
2271 | 2365 | } |
---|
2272 | 2366 | |
---|
2273 | 2367 | if (!pte_present(pte)) { |
---|
2274 | | - mpfn = pfn = 0; |
---|
2275 | | - |
---|
2276 | 2368 | /* |
---|
2277 | 2369 | * Only care about unaddressable device page special |
---|
2278 | 2370 | * page table entry. Other special swap entries are not |
---|
.. | .. |
---|
2283 | 2375 | goto next; |
---|
2284 | 2376 | |
---|
2285 | 2377 | page = device_private_entry_to_page(entry); |
---|
2286 | | - mpfn = migrate_pfn(page_to_pfn(page))| |
---|
2287 | | - MIGRATE_PFN_DEVICE | MIGRATE_PFN_MIGRATE; |
---|
| 2378 | + if (!(migrate->flags & |
---|
| 2379 | + MIGRATE_VMA_SELECT_DEVICE_PRIVATE) || |
---|
| 2380 | + page->pgmap->owner != migrate->pgmap_owner) |
---|
| 2381 | + goto next; |
---|
| 2382 | + |
---|
| 2383 | + mpfn = migrate_pfn(page_to_pfn(page)) | |
---|
| 2384 | + MIGRATE_PFN_MIGRATE; |
---|
2288 | 2385 | if (is_write_device_private_entry(entry)) |
---|
2289 | 2386 | mpfn |= MIGRATE_PFN_WRITE; |
---|
2290 | 2387 | } else { |
---|
| 2388 | + if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) |
---|
| 2389 | + goto next; |
---|
| 2390 | + pfn = pte_pfn(pte); |
---|
2291 | 2391 | if (is_zero_pfn(pfn)) { |
---|
2292 | 2392 | mpfn = MIGRATE_PFN_MIGRATE; |
---|
2293 | 2393 | migrate->cpages++; |
---|
2294 | | - pfn = 0; |
---|
2295 | 2394 | goto next; |
---|
2296 | 2395 | } |
---|
2297 | | - page = _vm_normal_page(migrate->vma, addr, pte, true); |
---|
| 2396 | + page = vm_normal_page(migrate->vma, addr, pte); |
---|
2298 | 2397 | mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; |
---|
2299 | 2398 | mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0; |
---|
2300 | 2399 | } |
---|
2301 | 2400 | |
---|
2302 | 2401 | /* FIXME support THP */ |
---|
2303 | 2402 | if (!page || !page->mapping || PageTransCompound(page)) { |
---|
2304 | | - mpfn = pfn = 0; |
---|
| 2403 | + mpfn = 0; |
---|
2305 | 2404 | goto next; |
---|
2306 | 2405 | } |
---|
2307 | | - pfn = page_to_pfn(page); |
---|
2308 | 2406 | |
---|
2309 | 2407 | /* |
---|
2310 | 2408 | * By getting a reference on the page we pin it and that blocks |
---|
.. | .. |
---|
2333 | 2431 | entry = make_migration_entry(page, mpfn & |
---|
2334 | 2432 | MIGRATE_PFN_WRITE); |
---|
2335 | 2433 | swp_pte = swp_entry_to_pte(entry); |
---|
2336 | | - if (pte_soft_dirty(pte)) |
---|
2337 | | - swp_pte = pte_swp_mksoft_dirty(swp_pte); |
---|
| 2434 | + if (pte_present(pte)) { |
---|
| 2435 | + if (pte_soft_dirty(pte)) |
---|
| 2436 | + swp_pte = pte_swp_mksoft_dirty(swp_pte); |
---|
| 2437 | + if (pte_uffd_wp(pte)) |
---|
| 2438 | + swp_pte = pte_swp_mkuffd_wp(swp_pte); |
---|
| 2439 | + } else { |
---|
| 2440 | + if (pte_swp_soft_dirty(pte)) |
---|
| 2441 | + swp_pte = pte_swp_mksoft_dirty(swp_pte); |
---|
| 2442 | + if (pte_swp_uffd_wp(pte)) |
---|
| 2443 | + swp_pte = pte_swp_mkuffd_wp(swp_pte); |
---|
| 2444 | + } |
---|
2338 | 2445 | set_pte_at(mm, addr, ptep, swp_pte); |
---|
2339 | 2446 | |
---|
2340 | 2447 | /* |
---|
.. | .. |
---|
2353 | 2460 | migrate->dst[migrate->npages] = 0; |
---|
2354 | 2461 | migrate->src[migrate->npages++] = mpfn; |
---|
2355 | 2462 | } |
---|
2356 | | - arch_leave_lazy_mmu_mode(); |
---|
2357 | | - pte_unmap_unlock(ptep - 1, ptl); |
---|
2358 | 2463 | |
---|
2359 | 2464 | /* Only flush the TLB if we actually modified any entries */ |
---|
2360 | 2465 | if (unmapped) |
---|
2361 | 2466 | flush_tlb_range(walk->vma, start, end); |
---|
2362 | 2467 | |
---|
| 2468 | + arch_leave_lazy_mmu_mode(); |
---|
| 2469 | + pte_unmap_unlock(ptep - 1, ptl); |
---|
| 2470 | + |
---|
2363 | 2471 | return 0; |
---|
2364 | 2472 | } |
---|
| 2473 | + |
---|
| 2474 | +static const struct mm_walk_ops migrate_vma_walk_ops = { |
---|
| 2475 | + .pmd_entry = migrate_vma_collect_pmd, |
---|
| 2476 | + .pte_hole = migrate_vma_collect_hole, |
---|
| 2477 | +}; |
---|
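migrate_vma_collect() below now drives the collection through the generic pagewalk API: the callbacks live in a const mm_walk_ops table and walk_page_range() receives the mm, the range, the ops and a private pointer, rather than an on-stack struct mm_walk that bundled callbacks and state. As a minimal, self-contained illustration of the same pattern (the walker itself is hypothetical and simply counts present PTEs in a range):

```c
#include <linux/pagewalk.h>

static int count_pte_entry(pte_t *pte, unsigned long addr,
			   unsigned long next, struct mm_walk *walk)
{
	unsigned long *count = walk->private;

	if (pte_present(*pte))
		(*count)++;
	return 0;
}

static const struct mm_walk_ops count_walk_ops = {
	.pte_entry = count_pte_entry,
};

/* Caller must hold mmap_lock; returns the number of present PTEs in the range. */
static unsigned long count_present_ptes(struct mm_struct *mm,
					unsigned long start, unsigned long end)
{
	unsigned long count = 0;

	walk_page_range(mm, start, end, &count_walk_ops, &count);
	return count;
}
```

Keeping the ops table const and passing per-call state through the private pointer is what lets the single migrate_vma_walk_ops above be shared by every invocation of migrate_vma_collect().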
2365 | 2478 | |
---|
2366 | 2479 | /* |
---|
2367 | 2480 | * migrate_vma_collect() - collect pages over a range of virtual addresses |
---|
.. | .. |
---|
2373 | 2486 | */ |
---|
2374 | 2487 | static void migrate_vma_collect(struct migrate_vma *migrate) |
---|
2375 | 2488 | { |
---|
2376 | | - struct mm_walk mm_walk = { |
---|
2377 | | - .pmd_entry = migrate_vma_collect_pmd, |
---|
2378 | | - .pte_hole = migrate_vma_collect_hole, |
---|
2379 | | - .vma = migrate->vma, |
---|
2380 | | - .mm = migrate->vma->vm_mm, |
---|
2381 | | - .private = migrate, |
---|
2382 | | - }; |
---|
| 2489 | + struct mmu_notifier_range range; |
---|
2383 | 2490 | |
---|
2384 | | - mmu_notifier_invalidate_range_start(mm_walk.mm, |
---|
2385 | | - migrate->start, |
---|
2386 | | - migrate->end); |
---|
2387 | | - walk_page_range(migrate->start, migrate->end, &mm_walk); |
---|
2388 | | - mmu_notifier_invalidate_range_end(mm_walk.mm, |
---|
2389 | | - migrate->start, |
---|
2390 | | - migrate->end); |
---|
| 2491 | + /* |
---|
| 2492 | + * Note that the pgmap_owner is passed to the mmu notifier callback so |
---|
| 2493 | + * that the registered device driver can skip invalidating device |
---|
| 2494 | + * private page mappings that won't be migrated. |
---|
| 2495 | + */ |
---|
| 2496 | + mmu_notifier_range_init_migrate(&range, 0, migrate->vma, |
---|
| 2497 | + migrate->vma->vm_mm, migrate->start, migrate->end, |
---|
| 2498 | + migrate->pgmap_owner); |
---|
| 2499 | + mmu_notifier_invalidate_range_start(&range); |
---|
2391 | 2500 | |
---|
| 2501 | + walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end, |
---|
| 2502 | + &migrate_vma_walk_ops, migrate); |
---|
| 2503 | + |
---|
| 2504 | + mmu_notifier_invalidate_range_end(&range); |
---|
2392 | 2505 | migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT); |
---|
2393 | 2506 | } |
---|
2394 | 2507 | |
---|
.. | .. |
---|
2432 | 2545 | * FIXME proper solution is to rework migration_entry_wait() so |
---|
2433 | 2546 | * it does not need to take a reference on page. |
---|
2434 | 2547 | */ |
---|
2435 | | - if (is_device_private_page(page)) |
---|
2436 | | - return true; |
---|
2437 | | - |
---|
2438 | | - /* |
---|
2439 | | - * Only allow device public page to be migrated and account for |
---|
2440 | | - * the extra reference count imply by ZONE_DEVICE pages. |
---|
2441 | | - */ |
---|
2442 | | - if (!is_device_public_page(page)) |
---|
2443 | | - return false; |
---|
2444 | | - extra++; |
---|
| 2548 | + return is_device_private_page(page); |
---|
2445 | 2549 | } |
---|
2446 | 2550 | |
---|
2447 | 2551 | /* For file-backed pages */ |
---|
.. | .. |
---|
2575 | 2679 | */ |
---|
2576 | 2680 | static void migrate_vma_unmap(struct migrate_vma *migrate) |
---|
2577 | 2681 | { |
---|
2578 | | - int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; |
---|
| 2682 | + int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK; |
---|
2579 | 2683 | const unsigned long npages = migrate->npages; |
---|
2580 | 2684 | const unsigned long start = migrate->start; |
---|
2581 | 2685 | unsigned long addr, i, restore = 0; |
---|
.. | .. |
---|
2620 | 2724 | } |
---|
2621 | 2725 | } |
---|
2622 | 2726 | |
---|
| 2727 | +/** |
---|
| 2728 | + * migrate_vma_setup() - prepare to migrate a range of memory |
---|
| 2729 | + * @args: contains the vma, start, and pfns arrays for the migration |
---|
| 2730 | + * |
---|
| 2731 | + * Returns: negative errno on failures, 0 when 0 or more pages were migrated |
---|
| 2732 | + * without an error. |
---|
| 2733 | + * |
---|
| 2734 | + * Prepare to migrate a range of virtual addresses by collecting all |
---|
| 2735 | + * the pages backing each virtual address in the range, saving them inside the |
---|
| 2736 | + * src array. Then lock those pages and unmap them. Once the pages are locked |
---|
| 2737 | + * and unmapped, check whether each page is pinned or not. Pages that aren't |
---|
| 2738 | + * pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) in the |
---|
| 2739 | + * corresponding src array entry. Then restores any pages that are pinned, by |
---|
| 2740 | + * remapping and unlocking those pages. |
---|
| 2741 | + * |
---|
| 2742 | + * The caller should then allocate destination memory and copy source memory to |
---|
| 2743 | + * it for all those entries (ie with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE |
---|
| 2744 | + * flag set). Once these are allocated and copied, the caller must update each |
---|
| 2745 | + * corresponding entry in the dst array with the pfn value of the destination |
---|
| 2746 | + * page and with the MIGRATE_PFN_VALID and MIGRATE_PFN_LOCKED flags set |
---|
| 2747 | + * (destination pages must have their struct pages locked, via lock_page()). |
---|
| 2748 | + * |
---|
| 2749 | + * Note that the caller does not have to migrate all the pages that are marked |
---|
| 2750 | + * with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration from |
---|
| 2751 | + * device memory to system memory. If the caller cannot migrate a device page |
---|
| 2752 | + * back to system memory, then it must return VM_FAULT_SIGBUS, which has severe |
---|
| 2753 | + * consequences for the userspace process, so it must be avoided if at all |
---|
| 2754 | + * possible. |
---|
| 2755 | + * |
---|
| 2756 | + * For empty entries inside the CPU page table (pte_none() or pmd_none() is |
---|
| 2757 | + * true) the MIGRATE_PFN_MIGRATE flag is set in the corresponding source array |
---|
| 2758 | + * entry, allowing the caller to allocate device memory for those unbacked |
---|
| 2759 | + * virtual addresses. For this the caller simply has to allocate device memory |
---|
| 2760 | + * and properly set the destination entry like for regular migration. Note that |
---|
| 2761 | + * this can still fail, and thus the device driver must check whether the |
---|
| 2762 | + * migration was successful for those entries after calling migrate_vma_pages(), |
---|
| 2763 | + * just like for regular migration. |
---|
| 2764 | + * |
---|
| 2765 | + * After that, the callers must call migrate_vma_pages() to go over each entry |
---|
| 2766 | + * in the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag |
---|
| 2767 | + * set. If the corresponding entry in the dst array has the MIGRATE_PFN_VALID |
---|
| 2768 | + * flag set, then migrate_vma_pages() migrates the struct page information from |
---|
| 2769 | + * the source struct page to the destination struct page. If it fails to migrate the |
---|
| 2770 | + * struct page information, then it clears the MIGRATE_PFN_MIGRATE flag in the |
---|
| 2771 | + * src array. |
---|
| 2772 | + * |
---|
| 2773 | + * At this point all successfully migrated pages have an entry in the src |
---|
| 2774 | + * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst |
---|
| 2775 | + * array entry with MIGRATE_PFN_VALID flag set. |
---|
| 2776 | + * |
---|
| 2777 | + * Once migrate_vma_pages() returns the caller may inspect which pages were |
---|
| 2778 | + * successfully migrated, and which were not. Successfully migrated pages will |
---|
| 2779 | + * have the MIGRATE_PFN_MIGRATE flag set for their src array entry. |
---|
| 2780 | + * |
---|
| 2781 | + * It is safe to update device page table after migrate_vma_pages() because |
---|
| 2782 | + * both destination and source page are still locked, and the mmap_lock is held |
---|
| 2783 | + * in read mode (hence no one can unmap the range being migrated). |
---|
| 2784 | + * |
---|
| 2785 | + * Once the caller is done cleaning things up and updating its page table (if it |
---|
| 2786 | + * chose to do so, this is not an obligation), it finally calls |
---|
| 2787 | + * migrate_vma_finalize() to update the CPU page table to point to new pages |
---|
| 2788 | + * for successfully migrated pages or otherwise restore the CPU page table to |
---|
| 2789 | + * point to the original source pages. |
---|
| 2790 | + */ |
---|
| 2791 | +int migrate_vma_setup(struct migrate_vma *args) |
---|
| 2792 | +{ |
---|
| 2793 | + long nr_pages = (args->end - args->start) >> PAGE_SHIFT; |
---|
| 2794 | + |
---|
| 2795 | + args->start &= PAGE_MASK; |
---|
| 2796 | + args->end &= PAGE_MASK; |
---|
| 2797 | + if (!args->vma || is_vm_hugetlb_page(args->vma) || |
---|
| 2798 | + (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma)) |
---|
| 2799 | + return -EINVAL; |
---|
| 2800 | + if (nr_pages <= 0) |
---|
| 2801 | + return -EINVAL; |
---|
| 2802 | + if (args->start < args->vma->vm_start || |
---|
| 2803 | + args->start >= args->vma->vm_end) |
---|
| 2804 | + return -EINVAL; |
---|
| 2805 | + if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end) |
---|
| 2806 | + return -EINVAL; |
---|
| 2807 | + if (!args->src || !args->dst) |
---|
| 2808 | + return -EINVAL; |
---|
| 2809 | + |
---|
| 2810 | + memset(args->src, 0, sizeof(*args->src) * nr_pages); |
---|
| 2811 | + args->cpages = 0; |
---|
| 2812 | + args->npages = 0; |
---|
| 2813 | + |
---|
| 2814 | + migrate_vma_collect(args); |
---|
| 2815 | + |
---|
| 2816 | + if (args->cpages) |
---|
| 2817 | + migrate_vma_prepare(args); |
---|
| 2818 | + if (args->cpages) |
---|
| 2819 | + migrate_vma_unmap(args); |
---|
| 2820 | + |
---|
| 2821 | + /* |
---|
| 2822 | + * At this point pages are locked and unmapped, and thus they have |
---|
| 2823 | + * stable content and can safely be copied to destination memory that |
---|
| 2824 | + * is allocated by the drivers. |
---|
| 2825 | + */ |
---|
| 2826 | + return 0; |
---|
| 2827 | + |
---|
| 2828 | +} |
---|
| 2829 | +EXPORT_SYMBOL(migrate_vma_setup); |
---|
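To make the sequence described in the comment above concrete, here is a hedged driver-side sketch of a system-to-device migration. struct drv_device and the drv_*() helpers are hypothetical placeholders for driver-specific code; only the migrate_vma_*() calls, the MIGRATE_VMA_SELECT_* flags and the MIGRATE_PFN_* encoding come from this file and include/linux/migrate.h. The caller is assumed to hold mmap_lock for read, with vma covering [start, end).

```c
#include <linux/migrate.h>
#include <linux/mm.h>
#include <linux/slab.h>

static int drv_migrate_to_device(struct drv_device *drv,
				 struct vm_area_struct *vma,
				 unsigned long start, unsigned long end)
{
	unsigned long npages = (end - start) >> PAGE_SHIFT;
	struct migrate_vma args = {
		.vma		= vma,
		.start		= start,
		.end		= end,
		.pgmap_owner	= drv,	/* compared against page->pgmap->owner */
		.flags		= MIGRATE_VMA_SELECT_SYSTEM,
	};
	unsigned long i;
	int ret;

	/* One buffer for both pfn arrays; each must hold npages entries. */
	args.src = kcalloc(npages, 2 * sizeof(*args.src), GFP_KERNEL);
	if (!args.src)
		return -ENOMEM;
	args.dst = args.src + npages;

	ret = migrate_vma_setup(&args);
	if (ret)
		goto out_free;

	/* Allocate and fill a destination page for every migratable entry. */
	for (i = 0; i < args.npages; i++) {
		struct page *spage = migrate_pfn_to_page(args.src[i]);
		struct page *dpage;

		if (!(args.src[i] & MIGRATE_PFN_MIGRATE))
			continue;

		dpage = drv_alloc_device_page(drv);		/* hypothetical */
		if (!dpage)
			continue;	/* dst[i] stays 0, so this page is skipped */

		lock_page(dpage);
		if (spage)
			drv_copy_to_device(drv, dpage, spage);	/* hypothetical */
		else
			drv_clear_device_page(drv, dpage);	/* pte_none() hole */

		args.dst[i] = migrate_pfn(page_to_pfn(dpage)) |
			      MIGRATE_PFN_LOCKED;
	}

	/* Swap the CPU mappings; src[i] keeps MIGRATE_PFN_MIGRATE on success. */
	migrate_vma_pages(&args);

	/* Device page tables could be updated here: both pages are still locked. */

	migrate_vma_finalize(&args);
out_free:
	kfree(args.src);
	return ret;
}
```

Splitting the old single-call migrate_vma() helper (removed at the end of this file) into setup/pages/finalize is what allows a driver to allocate memory, copy data and take its own locks between the steps instead of doing all of that from callbacks.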
| 2830 | + |
---|
| 2831 | +/* |
---|
| 2832 | + * This code closely matches the code in: |
---|
| 2833 | + * __handle_mm_fault() |
---|
| 2834 | + * handle_pte_fault() |
---|
| 2835 | + * do_anonymous_page() |
---|
| 2836 | + * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE |
---|
| 2837 | + * private page. |
---|
| 2838 | + */ |
---|
2623 | 2839 | static void migrate_vma_insert_page(struct migrate_vma *migrate, |
---|
2624 | 2840 | unsigned long addr, |
---|
2625 | 2841 | struct page *page, |
---|
.. | .. |
---|
2628 | 2844 | { |
---|
2629 | 2845 | struct vm_area_struct *vma = migrate->vma; |
---|
2630 | 2846 | struct mm_struct *mm = vma->vm_mm; |
---|
2631 | | - struct mem_cgroup *memcg; |
---|
2632 | 2847 | bool flush = false; |
---|
2633 | 2848 | spinlock_t *ptl; |
---|
2634 | 2849 | pte_t entry; |
---|
.. | .. |
---|
2661 | 2876 | * pte_offset_map() on pmds where a huge pmd might be created |
---|
2662 | 2877 | * from a different thread. |
---|
2663 | 2878 | * |
---|
2664 | | - * pte_alloc_map() is safe to use under down_write(mmap_sem) or when |
---|
| 2879 | + * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when |
---|
2665 | 2880 | * parallel threads are excluded by other means. |
---|
2666 | 2881 | * |
---|
2667 | | - * Here we only have down_read(mmap_sem). |
---|
| 2882 | + * Here we only have mmap_read_lock(mm). |
---|
2668 | 2883 | */ |
---|
2669 | | - if (pte_alloc(mm, pmdp, addr)) |
---|
| 2884 | + if (pte_alloc(mm, pmdp)) |
---|
2670 | 2885 | goto abort; |
---|
2671 | 2886 | |
---|
2672 | 2887 | /* See the comment in pte_alloc_one_map() */ |
---|
.. | .. |
---|
2675 | 2890 | |
---|
2676 | 2891 | if (unlikely(anon_vma_prepare(vma))) |
---|
2677 | 2892 | goto abort; |
---|
2678 | | - if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false)) |
---|
| 2893 | + if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL)) |
---|
2679 | 2894 | goto abort; |
---|
2680 | 2895 | |
---|
2681 | 2896 | /* |
---|
.. | .. |
---|
2691 | 2906 | |
---|
2692 | 2907 | swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE); |
---|
2693 | 2908 | entry = swp_entry_to_pte(swp_entry); |
---|
2694 | | - } else if (is_device_public_page(page)) { |
---|
2695 | | - entry = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot))); |
---|
2696 | | - if (vma->vm_flags & VM_WRITE) |
---|
2697 | | - entry = pte_mkwrite(pte_mkdirty(entry)); |
---|
2698 | | - entry = pte_mkdevmap(entry); |
---|
| 2909 | + } else { |
---|
| 2910 | + /* |
---|
| 2911 | + * For now we only support migrating to un-addressable |
---|
| 2912 | + * device memory. |
---|
| 2913 | + */ |
---|
| 2914 | + pr_warn_once("Unsupported ZONE_DEVICE page type.\n"); |
---|
| 2915 | + goto abort; |
---|
2699 | 2916 | } |
---|
2700 | 2917 | } else { |
---|
2701 | 2918 | entry = mk_pte(page, vma->vm_page_prot); |
---|
.. | .. |
---|
2705 | 2922 | |
---|
2706 | 2923 | ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); |
---|
2707 | 2924 | |
---|
| 2925 | + if (check_stable_address_space(mm)) |
---|
| 2926 | + goto unlock_abort; |
---|
| 2927 | + |
---|
2708 | 2928 | if (pte_present(*ptep)) { |
---|
2709 | 2929 | unsigned long pfn = pte_pfn(*ptep); |
---|
2710 | 2930 | |
---|
2711 | | - if (!is_zero_pfn(pfn)) { |
---|
2712 | | - pte_unmap_unlock(ptep, ptl); |
---|
2713 | | - mem_cgroup_cancel_charge(page, memcg, false); |
---|
2714 | | - goto abort; |
---|
2715 | | - } |
---|
| 2931 | + if (!is_zero_pfn(pfn)) |
---|
| 2932 | + goto unlock_abort; |
---|
2716 | 2933 | flush = true; |
---|
2717 | | - } else if (!pte_none(*ptep)) { |
---|
2718 | | - pte_unmap_unlock(ptep, ptl); |
---|
2719 | | - mem_cgroup_cancel_charge(page, memcg, false); |
---|
2720 | | - goto abort; |
---|
2721 | | - } |
---|
| 2934 | + } else if (!pte_none(*ptep)) |
---|
| 2935 | + goto unlock_abort; |
---|
2722 | 2936 | |
---|
2723 | 2937 | /* |
---|
2724 | | - * Check for usefaultfd but do not deliver the fault. Instead, |
---|
| 2938 | + * Check for userfaultfd but do not deliver the fault. Instead, |
---|
2725 | 2939 | * just back off. |
---|
2726 | 2940 | */ |
---|
2727 | | - if (userfaultfd_missing(vma)) { |
---|
2728 | | - pte_unmap_unlock(ptep, ptl); |
---|
2729 | | - mem_cgroup_cancel_charge(page, memcg, false); |
---|
2730 | | - goto abort; |
---|
2731 | | - } |
---|
| 2941 | + if (userfaultfd_missing(vma)) |
---|
| 2942 | + goto unlock_abort; |
---|
2732 | 2943 | |
---|
2733 | 2944 | inc_mm_counter(mm, MM_ANONPAGES); |
---|
2734 | 2945 | page_add_new_anon_rmap(page, vma, addr, false); |
---|
2735 | | - mem_cgroup_commit_charge(page, memcg, false, false); |
---|
2736 | 2946 | if (!is_zone_device_page(page)) |
---|
2737 | | - lru_cache_add_active_or_unevictable(page, vma); |
---|
| 2947 | + lru_cache_add_inactive_or_unevictable(page, vma); |
---|
2738 | 2948 | get_page(page); |
---|
2739 | 2949 | |
---|
2740 | 2950 | if (flush) { |
---|
.. | .. |
---|
2752 | 2962 | *src = MIGRATE_PFN_MIGRATE; |
---|
2753 | 2963 | return; |
---|
2754 | 2964 | |
---|
| 2965 | +unlock_abort: |
---|
| 2966 | + pte_unmap_unlock(ptep, ptl); |
---|
2755 | 2967 | abort: |
---|
2756 | 2968 | *src &= ~MIGRATE_PFN_MIGRATE; |
---|
2757 | 2969 | } |
---|
2758 | 2970 | |
---|
2759 | | -/* |
---|
| 2971 | +/** |
---|
2760 | 2972 | * migrate_vma_pages() - migrate meta-data from src page to dst page |
---|
2761 | 2973 | * @migrate: migrate struct containing all migration information |
---|
2762 | 2974 | * |
---|
.. | .. |
---|
2764 | 2976 | * struct page. This effectively finishes the migration from source page to the |
---|
2765 | 2977 | * destination page. |
---|
2766 | 2978 | */ |
---|
2767 | | -static void migrate_vma_pages(struct migrate_vma *migrate) |
---|
| 2979 | +void migrate_vma_pages(struct migrate_vma *migrate) |
---|
2768 | 2980 | { |
---|
2769 | 2981 | const unsigned long npages = migrate->npages; |
---|
2770 | 2982 | const unsigned long start = migrate->start; |
---|
2771 | | - struct vm_area_struct *vma = migrate->vma; |
---|
2772 | | - struct mm_struct *mm = vma->vm_mm; |
---|
2773 | | - unsigned long addr, i, mmu_start; |
---|
| 2983 | + struct mmu_notifier_range range; |
---|
| 2984 | + unsigned long addr, i; |
---|
2774 | 2985 | bool notified = false; |
---|
2775 | 2986 | |
---|
2776 | 2987 | for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) { |
---|
.. | .. |
---|
2785 | 2996 | } |
---|
2786 | 2997 | |
---|
2787 | 2998 | if (!page) { |
---|
2788 | | - if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) { |
---|
| 2999 | + if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) |
---|
2789 | 3000 | continue; |
---|
2790 | | - } |
---|
2791 | 3001 | if (!notified) { |
---|
2792 | | - mmu_start = addr; |
---|
2793 | 3002 | notified = true; |
---|
2794 | | - mmu_notifier_invalidate_range_start(mm, |
---|
2795 | | - mmu_start, |
---|
2796 | | - migrate->end); |
---|
| 3003 | + |
---|
| 3004 | + mmu_notifier_range_init(&range, |
---|
| 3005 | + MMU_NOTIFY_CLEAR, 0, |
---|
| 3006 | + NULL, |
---|
| 3007 | + migrate->vma->vm_mm, |
---|
| 3008 | + addr, migrate->end); |
---|
| 3009 | + mmu_notifier_invalidate_range_start(&range); |
---|
2797 | 3010 | } |
---|
2798 | 3011 | migrate_vma_insert_page(migrate, addr, newpage, |
---|
2799 | 3012 | &migrate->src[i], |
---|
.. | .. |
---|
2813 | 3026 | migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; |
---|
2814 | 3027 | continue; |
---|
2815 | 3028 | } |
---|
2816 | | - } else if (!is_device_public_page(newpage)) { |
---|
| 3029 | + } else { |
---|
2817 | 3030 | /* |
---|
2818 | 3031 | * Other types of ZONE_DEVICE page are not |
---|
2819 | 3032 | * supported. |
---|
.. | .. |
---|
2834 | 3047 | * did already call it. |
---|
2835 | 3048 | */ |
---|
2836 | 3049 | if (notified) |
---|
2837 | | - mmu_notifier_invalidate_range_only_end(mm, mmu_start, |
---|
2838 | | - migrate->end); |
---|
| 3050 | + mmu_notifier_invalidate_range_only_end(&range); |
---|
2839 | 3051 | } |
---|
| 3052 | +EXPORT_SYMBOL(migrate_vma_pages); |
---|
2840 | 3053 | |
---|
2841 | | -/* |
---|
| 3054 | +/** |
---|
2842 | 3055 | * migrate_vma_finalize() - restore CPU page table entry |
---|
2843 | 3056 | * @migrate: migrate struct containing all migration information |
---|
2844 | 3057 | * |
---|
.. | .. |
---|
2849 | 3062 | * This also unlocks the pages and puts them back on the lru, or drops the extra |
---|
2850 | 3063 | * refcount, for device pages. |
---|
2851 | 3064 | */ |
---|
2852 | | -static void migrate_vma_finalize(struct migrate_vma *migrate) |
---|
| 3065 | +void migrate_vma_finalize(struct migrate_vma *migrate) |
---|
2853 | 3066 | { |
---|
2854 | 3067 | const unsigned long npages = migrate->npages; |
---|
2855 | 3068 | unsigned long i; |
---|
.. | .. |
---|
2876 | 3089 | |
---|
2877 | 3090 | remove_migration_ptes(page, newpage, false); |
---|
2878 | 3091 | unlock_page(page); |
---|
2879 | | - migrate->cpages--; |
---|
2880 | 3092 | |
---|
2881 | 3093 | if (is_zone_device_page(page)) |
---|
2882 | 3094 | put_page(page); |
---|
.. | .. |
---|
2892 | 3104 | } |
---|
2893 | 3105 | } |
---|
2894 | 3106 | } |
---|
2895 | | - |
---|
2896 | | -/* |
---|
2897 | | - * migrate_vma() - migrate a range of memory inside vma |
---|
2898 | | - * |
---|
2899 | | - * @ops: migration callback for allocating destination memory and copying |
---|
2900 | | - * @vma: virtual memory area containing the range to be migrated |
---|
2901 | | - * @start: start address of the range to migrate (inclusive) |
---|
2902 | | - * @end: end address of the range to migrate (exclusive) |
---|
2903 | | - * @src: array of hmm_pfn_t containing source pfns |
---|
2904 | | - * @dst: array of hmm_pfn_t containing destination pfns |
---|
2905 | | - * @private: pointer passed back to each of the callback |
---|
2906 | | - * Returns: 0 on success, error code otherwise |
---|
2907 | | - * |
---|
2908 | | - * This function tries to migrate a range of memory virtual address range, using |
---|
2909 | | - * callbacks to allocate and copy memory from source to destination. First it |
---|
2910 | | - * collects all the pages backing each virtual address in the range, saving this |
---|
2911 | | - * inside the src array. Then it locks those pages and unmaps them. Once the pages |
---|
2912 | | - * are locked and unmapped, it checks whether each page is pinned or not. Pages |
---|
2913 | | - * that aren't pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) |
---|
2914 | | - * in the corresponding src array entry. It then restores any pages that are |
---|
2915 | | - * pinned, by remapping and unlocking those pages. |
---|
2916 | | - * |
---|
2917 | | - * At this point it calls the alloc_and_copy() callback. For documentation on |
---|
2918 | | - * what is expected from that callback, see struct migrate_vma_ops comments in |
---|
2919 | | - * include/linux/migrate.h |
---|
2920 | | - * |
---|
2921 | | - * After the alloc_and_copy() callback, this function goes over each entry in |
---|
2922 | | - * the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag |
---|
2923 | | - * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set, |
---|
2924 | | - * then the function tries to migrate struct page information from the source |
---|
2925 | | - * struct page to the destination struct page. If it fails to migrate the struct |
---|
2926 | | - * page information, then it clears the MIGRATE_PFN_MIGRATE flag in the src |
---|
2927 | | - * array. |
---|
2928 | | - * |
---|
2929 | | - * At this point all successfully migrated pages have an entry in the src |
---|
2930 | | - * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst |
---|
2931 | | - * array entry with MIGRATE_PFN_VALID flag set. |
---|
2932 | | - * |
---|
2933 | | - * It then calls the finalize_and_map() callback. See comments for "struct |
---|
2934 | | - * migrate_vma_ops", in include/linux/migrate.h for details about |
---|
2935 | | - * finalize_and_map() behavior. |
---|
2936 | | - * |
---|
2937 | | - * After the finalize_and_map() callback, for successfully migrated pages, this |
---|
2938 | | - * function updates the CPU page table to point to new pages, otherwise it |
---|
2939 | | - * restores the CPU page table to point to the original source pages. |
---|
2940 | | - * |
---|
2941 | | - * Function returns 0 after the above steps, even if no pages were migrated |
---|
2942 | | - * (The function only returns an error if any of the arguments are invalid.) |
---|
2943 | | - * |
---|
2944 | | - * Both src and dst array must be big enough for (end - start) >> PAGE_SHIFT |
---|
2945 | | - * unsigned long entries. |
---|
2946 | | - */ |
---|
2947 | | -int migrate_vma(const struct migrate_vma_ops *ops, |
---|
2948 | | - struct vm_area_struct *vma, |
---|
2949 | | - unsigned long start, |
---|
2950 | | - unsigned long end, |
---|
2951 | | - unsigned long *src, |
---|
2952 | | - unsigned long *dst, |
---|
2953 | | - void *private) |
---|
2954 | | -{ |
---|
2955 | | - struct migrate_vma migrate; |
---|
2956 | | - |
---|
2957 | | - /* Sanity check the arguments */ |
---|
2958 | | - start &= PAGE_MASK; |
---|
2959 | | - end &= PAGE_MASK; |
---|
2960 | | - if (!vma || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) || |
---|
2961 | | - vma_is_dax(vma)) |
---|
2962 | | - return -EINVAL; |
---|
2963 | | - if (start < vma->vm_start || start >= vma->vm_end) |
---|
2964 | | - return -EINVAL; |
---|
2965 | | - if (end <= vma->vm_start || end > vma->vm_end) |
---|
2966 | | - return -EINVAL; |
---|
2967 | | - if (!ops || !src || !dst || start >= end) |
---|
2968 | | - return -EINVAL; |
---|
2969 | | - |
---|
2970 | | - memset(src, 0, sizeof(*src) * ((end - start) >> PAGE_SHIFT)); |
---|
2971 | | - migrate.src = src; |
---|
2972 | | - migrate.dst = dst; |
---|
2973 | | - migrate.start = start; |
---|
2974 | | - migrate.npages = 0; |
---|
2975 | | - migrate.cpages = 0; |
---|
2976 | | - migrate.end = end; |
---|
2977 | | - migrate.vma = vma; |
---|
2978 | | - |
---|
2979 | | - /* Collect, and try to unmap source pages */ |
---|
2980 | | - migrate_vma_collect(&migrate); |
---|
2981 | | - if (!migrate.cpages) |
---|
2982 | | - return 0; |
---|
2983 | | - |
---|
2984 | | - /* Lock and isolate page */ |
---|
2985 | | - migrate_vma_prepare(&migrate); |
---|
2986 | | - if (!migrate.cpages) |
---|
2987 | | - return 0; |
---|
2988 | | - |
---|
2989 | | - /* Unmap pages */ |
---|
2990 | | - migrate_vma_unmap(&migrate); |
---|
2991 | | - if (!migrate.cpages) |
---|
2992 | | - return 0; |
---|
2993 | | - |
---|
2994 | | - /* |
---|
2995 | | - * At this point pages are locked and unmapped, and thus they have |
---|
2996 | | - * stable content and can safely be copied to destination memory that |
---|
2997 | | - * is allocated by the callback. |
---|
2998 | | - * |
---|
2999 | | - * Note that migration can fail in migrate_vma_struct_page() for each |
---|
3000 | | - * individual page. |
---|
3001 | | - */ |
---|
3002 | | - ops->alloc_and_copy(vma, src, dst, start, end, private); |
---|
3003 | | - |
---|
3004 | | - /* This does the real migration of struct page */ |
---|
3005 | | - migrate_vma_pages(&migrate); |
---|
3006 | | - |
---|
3007 | | - ops->finalize_and_map(vma, src, dst, start, end, private); |
---|
3008 | | - |
---|
3009 | | - /* Unlock and remap pages */ |
---|
3010 | | - migrate_vma_finalize(&migrate); |
---|
3011 | | - |
---|
3012 | | - return 0; |
---|
3013 | | -} |
---|
3014 | | -EXPORT_SYMBOL(migrate_vma); |
---|
3015 | | -#endif /* defined(MIGRATE_VMA_HELPER) */ |
---|
| 3107 | +EXPORT_SYMBOL(migrate_vma_finalize); |
---|
| 3108 | +#endif /* CONFIG_DEVICE_PRIVATE */ |
---|