| .. | .. |
|---|
| 36 | 36 | #include <linux/uio.h> |
|---|
| 37 | 37 | #include <linux/khugepaged.h> |
|---|
| 38 | 38 | #include <linux/hugetlb.h> |
|---|
| 39 | +#include <linux/frontswap.h> |
|---|
| 40 | +#include <linux/fs_parser.h> |
|---|
| 41 | +#include <linux/mm_inline.h> |
|---|
| 39 | 42 | |
|---|
| 40 | 43 | #include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */ |
|---|
| 44 | + |
|---|
| 45 | +#include "internal.h" |
|---|
| 46 | + |
|---|
| 47 | +#undef CREATE_TRACE_POINTS |
|---|
| 48 | +#include <trace/hooks/shmem_fs.h> |
|---|
| 49 | +#include <trace/hooks/mm.h> |
|---|
| 41 | 50 | |
|---|
| 42 | 51 | static struct vfsmount *shm_mnt; |
|---|
| 43 | 52 | |
|---|
| .. | .. |
|---|
| 80 | 89 | #include <linux/uuid.h> |
|---|
| 81 | 90 | |
|---|
| 82 | 91 | #include <linux/uaccess.h> |
|---|
| 83 | | -#include <asm/pgtable.h> |
|---|
| 84 | 92 | |
|---|
| 85 | 93 | #include "internal.h" |
|---|
| 86 | 94 | |
|---|
| .. | .. |
|---|
| 106 | 114 | pgoff_t nr_unswapped; /* how often writepage refused to swap out */ |
|---|
| 107 | 115 | }; |
|---|
| 108 | 116 | |
|---|
| 117 | +struct shmem_options { |
|---|
| 118 | + unsigned long long blocks; |
|---|
| 119 | + unsigned long long inodes; |
|---|
| 120 | + struct mempolicy *mpol; |
|---|
| 121 | + kuid_t uid; |
|---|
| 122 | + kgid_t gid; |
|---|
| 123 | + umode_t mode; |
|---|
| 124 | + bool full_inums; |
|---|
| 125 | + int huge; |
|---|
| 126 | + int seen; |
|---|
| 127 | +#define SHMEM_SEEN_BLOCKS 1 |
|---|
| 128 | +#define SHMEM_SEEN_INODES 2 |
|---|
| 129 | +#define SHMEM_SEEN_HUGE 4 |
|---|
| 130 | +#define SHMEM_SEEN_INUMS 8 |
|---|
| 131 | +}; |
|---|
| 132 | + |
|---|
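The `seen` bitmask records which options were given explicitly, so a remount can override exactly those fields and leave the rest of the superblock untouched. A minimal sketch of how such a bitmask is consumed (the helper name `shmem_apply_options` is invented here for illustration; the real logic lives in the fs_context parse/reconfigure paths):

```c
/* Hypothetical sketch: copy in only the options the user actually set. */
static void shmem_apply_options(struct shmem_sb_info *sbinfo,
				const struct shmem_options *ctx)
{
	if (ctx->seen & SHMEM_SEEN_BLOCKS)	/* size= or nr_blocks= */
		sbinfo->max_blocks = ctx->blocks;
	if (ctx->seen & SHMEM_SEEN_INODES)	/* nr_inodes= */
		sbinfo->max_inodes = ctx->inodes;
	if (ctx->seen & SHMEM_SEEN_HUGE)	/* huge= */
		sbinfo->huge = ctx->huge;
	if (ctx->seen & SHMEM_SEEN_INUMS)	/* inode32 / inode64 */
		sbinfo->full_inums = ctx->full_inums;
	/* Options never seen keep their current values across remount. */
}
```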
| 109 | 133 | #ifdef CONFIG_TMPFS |
|---|
| 110 | 134 | static unsigned long shmem_default_max_blocks(void) |
|---|
| 111 | 135 | { |
|---|
| 112 | | - return totalram_pages / 2; |
|---|
| 136 | + return totalram_pages() / 2; |
|---|
| 113 | 137 | } |
|---|
| 114 | 138 | |
|---|
| 115 | 139 | static unsigned long shmem_default_max_inodes(void) |
|---|
| 116 | 140 | { |
|---|
| 117 | | - return min(totalram_pages - totalhigh_pages, totalram_pages / 2); |
|---|
| 141 | + unsigned long nr_pages = totalram_pages(); |
|---|
| 142 | + |
|---|
| 143 | + return min(nr_pages - totalhigh_pages(), nr_pages / 2); |
|---|
| 118 | 144 | } |
|---|
| 119 | 145 | #endif |
|---|
| 120 | 146 | |
|---|
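For concreteness, a worked example (not part of the patch): on an 8 GiB machine with 4 KiB pages and no highmem, totalram_pages() is 2097152, so the defaults come out as:

```c
/* totalram_pages() = 2097152 (8 GiB / 4 KiB), totalhigh_pages() = 0   */
shmem_default_max_blocks();	/* 2097152 / 2 = 1048576 pages = 4 GiB */
shmem_default_max_inodes();	/* min(2097152 - 0, 1048576) = 1048576 */
```

The totalhigh_pages() term only bites on 32-bit highmem configurations, where inode metadata must fit in lowmem.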
| 121 | 147 | static bool shmem_should_replace_page(struct page *page, gfp_t gfp); |
|---|
| 122 | 148 | static int shmem_replace_page(struct page **pagep, gfp_t gfp, |
|---|
| 123 | 149 | struct shmem_inode_info *info, pgoff_t index); |
|---|
| 150 | +static int shmem_swapin_page(struct inode *inode, pgoff_t index, |
|---|
| 151 | + struct page **pagep, enum sgp_type sgp, |
|---|
| 152 | + gfp_t gfp, struct vm_area_struct *vma, |
|---|
| 153 | + vm_fault_t *fault_type); |
|---|
| 124 | 154 | static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, |
|---|
| 125 | 155 | struct page **pagep, enum sgp_type sgp, |
|---|
| 126 | 156 | gfp_t gfp, struct vm_area_struct *vma, |
|---|
| .. | .. |
|---|
| 239 | 269 | static LIST_HEAD(shmem_swaplist); |
|---|
| 240 | 270 | static DEFINE_MUTEX(shmem_swaplist_mutex); |
|---|
| 241 | 271 | |
|---|
| 242 | | -static int shmem_reserve_inode(struct super_block *sb) |
|---|
| 272 | +/* |
|---|
| 273 | + * shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and |
|---|
| 274 | + * produces a novel ino for the newly allocated inode. |
|---|
| 275 | + * |
|---|
| 276 | + * It may also be called when making a hard link to permit the space needed by |
|---|
| 277 | + * each dentry. However, in that case, no new inode number is needed since that |
|---|
| 278 | + * internally draws from another pool of inode numbers (currently global |
|---|
| 279 | + * get_next_ino()). This case is indicated by passing NULL as inop. |
|---|
| 280 | + */ |
|---|
| 281 | +#define SHMEM_INO_BATCH 1024 |
|---|
| 282 | +static int shmem_reserve_inode(struct super_block *sb, ino_t *inop) |
|---|
| 243 | 283 | { |
|---|
| 244 | 284 | struct shmem_sb_info *sbinfo = SHMEM_SB(sb); |
|---|
| 245 | | - if (sbinfo->max_inodes) { |
|---|
| 285 | + ino_t ino; |
|---|
| 286 | + |
|---|
| 287 | + if (!(sb->s_flags & SB_KERNMOUNT)) { |
|---|
| 246 | 288 | spin_lock(&sbinfo->stat_lock); |
|---|
| 247 | | - if (!sbinfo->free_inodes) { |
|---|
| 248 | | - spin_unlock(&sbinfo->stat_lock); |
|---|
| 249 | | - return -ENOSPC; |
|---|
| 289 | + if (sbinfo->max_inodes) { |
|---|
| 290 | + if (!sbinfo->free_inodes) { |
|---|
| 291 | + spin_unlock(&sbinfo->stat_lock); |
|---|
| 292 | + return -ENOSPC; |
|---|
| 293 | + } |
|---|
| 294 | + sbinfo->free_inodes--; |
|---|
| 250 | 295 | } |
|---|
| 251 | | - sbinfo->free_inodes--; |
|---|
| 296 | + if (inop) { |
|---|
| 297 | + ino = sbinfo->next_ino++; |
|---|
| 298 | + if (unlikely(is_zero_ino(ino))) |
|---|
| 299 | + ino = sbinfo->next_ino++; |
|---|
| 300 | + if (unlikely(!sbinfo->full_inums && |
|---|
| 301 | + ino > UINT_MAX)) { |
|---|
| 302 | + /* |
|---|
| 303 | + * Emulate get_next_ino uint wraparound for |
|---|
| 304 | + * compatibility |
|---|
| 305 | + */ |
|---|
| 306 | + if (IS_ENABLED(CONFIG_64BIT)) |
|---|
| 307 | + pr_warn("%s: inode number overflow on device %d, consider using inode64 mount option\n", |
|---|
| 308 | + __func__, MINOR(sb->s_dev)); |
|---|
| 309 | + sbinfo->next_ino = 1; |
|---|
| 310 | + ino = sbinfo->next_ino++; |
|---|
| 311 | + } |
|---|
| 312 | + *inop = ino; |
|---|
| 313 | + } |
|---|
| 252 | 314 | spin_unlock(&sbinfo->stat_lock); |
|---|
| 315 | + } else if (inop) { |
|---|
| 316 | + /* |
|---|
| 317 | + * __shmem_file_setup, one of our callers, is lock-free: it |
|---|
| 318 | + * doesn't hold stat_lock in shmem_reserve_inode since |
|---|
| 319 | + * max_inodes is always 0, and is called from potentially |
|---|
| 320 | + * unknown contexts. As such, use a per-cpu batched allocator |
|---|
| 321 | + * which doesn't require the per-sb stat_lock unless we are at |
|---|
| 322 | + * the batch boundary. |
|---|
| 323 | + * |
|---|
| 324 | + * We don't need to worry about inode{32,64} since SB_KERNMOUNT |
|---|
| 325 | + * shmem mounts are not exposed to userspace, so we don't need |
|---|
| 326 | + * to worry about things like glibc compatibility. |
|---|
| 327 | + */ |
|---|
| 328 | + ino_t *next_ino; |
|---|
| 329 | + next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu()); |
|---|
| 330 | + ino = *next_ino; |
|---|
| 331 | + if (unlikely(ino % SHMEM_INO_BATCH == 0)) { |
|---|
| 332 | + spin_lock(&sbinfo->stat_lock); |
|---|
| 333 | + ino = sbinfo->next_ino; |
|---|
| 334 | + sbinfo->next_ino += SHMEM_INO_BATCH; |
|---|
| 335 | + spin_unlock(&sbinfo->stat_lock); |
|---|
| 336 | + if (unlikely(is_zero_ino(ino))) |
|---|
| 337 | + ino++; |
|---|
| 338 | + } |
|---|
| 339 | + *inop = ino; |
|---|
| 340 | + *next_ino = ++ino; |
|---|
| 341 | + put_cpu(); |
|---|
| 253 | 342 | } |
|---|
| 343 | + |
|---|
| 254 | 344 | return 0; |
|---|
| 255 | 345 | } |
|---|
| 256 | 346 | |
|---|
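The SB_KERNMOUNT branch above takes stat_lock only once per SHMEM_INO_BATCH (1024) allocations on each CPU; in between, a CPU hands out numbers from its private batch with nothing more than get_cpu()/put_cpu() around the cursor update. A distilled, self-contained sketch of the same pattern (the type and function names are invented for illustration):

```c
#include <linux/percpu.h>
#include <linux/spinlock.h>

#define INO_BATCH 1024

struct ino_pool {
	spinlock_t lock;	/* guards next_ino */
	ino_t next_ino;		/* start of the next unclaimed batch */
	ino_t __percpu *cache;	/* each CPU's cursor into its batch */
};

static ino_t ino_pool_get(struct ino_pool *pool)
{
	ino_t *cursor = per_cpu_ptr(pool->cache, get_cpu());
	ino_t ino = *cursor;

	if (ino % INO_BATCH == 0) {	/* batch exhausted (or first use) */
		spin_lock(&pool->lock);
		ino = pool->next_ino;
		pool->next_ino += INO_BATCH;
		spin_unlock(&pool->lock);
		if (ino == 0)		/* ino 0 is reserved: skip it */
			ino++;
	}
	*cursor = ino + 1;
	put_cpu();
	return ino;
}
```

As the patch's comment notes, wraparound is a non-issue on this path because SB_KERNMOUNT inode numbers are never exposed to 32-bit userspace.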
| .. | .. |
|---|
| 326 | 416 | } |
|---|
| 327 | 417 | |
|---|
| 328 | 418 | /* |
|---|
| 329 | | - * Replace item expected in radix tree by a new item, while holding tree lock. |
|---|
| 419 | + * Replace item expected in xarray by a new item, while holding xa_lock. |
|---|
| 330 | 420 | */ |
|---|
| 331 | | -static int shmem_radix_tree_replace(struct address_space *mapping, |
|---|
| 421 | +static int shmem_replace_entry(struct address_space *mapping, |
|---|
| 332 | 422 | pgoff_t index, void *expected, void *replacement) |
|---|
| 333 | 423 | { |
|---|
| 334 | | - struct radix_tree_node *node; |
|---|
| 335 | | - void __rcu **pslot; |
|---|
| 424 | + XA_STATE(xas, &mapping->i_pages, index); |
|---|
| 336 | 425 | void *item; |
|---|
| 337 | 426 | |
|---|
| 338 | 427 | VM_BUG_ON(!expected); |
|---|
| 339 | 428 | VM_BUG_ON(!replacement); |
|---|
| 340 | | - item = __radix_tree_lookup(&mapping->i_pages, index, &node, &pslot); |
|---|
| 341 | | - if (!item) |
|---|
| 342 | | - return -ENOENT; |
|---|
| 429 | + item = xas_load(&xas); |
|---|
| 343 | 430 | if (item != expected) |
|---|
| 344 | 431 | return -ENOENT; |
|---|
| 345 | | - __radix_tree_replace(&mapping->i_pages, node, pslot, |
|---|
| 346 | | - replacement, NULL); |
|---|
| 432 | + xas_store(&xas, replacement); |
|---|
| 347 | 433 | return 0; |
|---|
| 348 | 434 | } |
|---|
| 349 | 435 | |
|---|
| .. | .. |
|---|
| 357 | 443 | static bool shmem_confirm_swap(struct address_space *mapping, |
|---|
| 358 | 444 | pgoff_t index, swp_entry_t swap) |
|---|
| 359 | 445 | { |
|---|
| 360 | | - void *item; |
|---|
| 361 | | - |
|---|
| 362 | | - rcu_read_lock(); |
|---|
| 363 | | - item = radix_tree_lookup(&mapping->i_pages, index); |
|---|
| 364 | | - rcu_read_unlock(); |
|---|
| 365 | | - return item == swp_to_radix_entry(swap); |
|---|
| 446 | + return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap); |
|---|
| 366 | 447 | } |
|---|
| 367 | 448 | |
|---|
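Both conversions above are mechanical: xa_load() takes the RCU read lock internally, so shmem_confirm_swap() no longer needs the explicit rcu_read_lock() pair, and shmem_replace_entry() becomes a load-compare-store on an XA_STATE while the caller holds xa_lock. A minimal sketch of the locked replace, assuming the slot already holds a non-NULL entry (so the store cannot need to allocate):

```c
/* Sketch: swap one entry for another; caller must hold xa_lock. */
static int replace_entry_locked(struct xarray *xa, unsigned long index,
				void *expected, void *replacement)
{
	XA_STATE(xas, xa, index);

	if (xas_load(&xas) != expected)	/* slot changed under us */
		return -ENOENT;
	xas_store(&xas, replacement);	/* in-place: no allocation needed */
	return 0;
}
```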
| 368 | 449 | /* |
|---|
| .. | .. |
|---|
| 397 | 478 | #define SHMEM_HUGE_DENY (-1) |
|---|
| 398 | 479 | #define SHMEM_HUGE_FORCE (-2) |
|---|
| 399 | 480 | |
|---|
| 400 | | -#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE |
|---|
| 481 | +#ifdef CONFIG_TRANSPARENT_HUGEPAGE |
|---|
| 401 | 482 | /* ifdef here to avoid bloating shmem.o when not necessary */ |
|---|
| 402 | 483 | |
|---|
| 403 | 484 | static int shmem_huge __read_mostly; |
|---|
| 404 | 485 | |
|---|
| 405 | | -#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS) |
|---|
| 486 | +#if defined(CONFIG_SYSFS) |
|---|
| 406 | 487 | static int shmem_parse_huge(const char *str) |
|---|
| 407 | 488 | { |
|---|
| 408 | 489 | if (!strcmp(str, "never")) |
|---|
| .. | .. |
|---|
| 419 | 500 | return SHMEM_HUGE_FORCE; |
|---|
| 420 | 501 | return -EINVAL; |
|---|
| 421 | 502 | } |
|---|
| 503 | +#endif |
|---|
| 422 | 504 | |
|---|
| 505 | +#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS) |
|---|
| 423 | 506 | static const char *shmem_format_huge(int huge) |
|---|
| 424 | 507 | { |
|---|
| 425 | 508 | switch (huge) { |
|---|
| .. | .. |
|---|
| 570 | 653 | struct shmem_sb_info *sbinfo = SHMEM_SB(sb); |
|---|
| 571 | 654 | return READ_ONCE(sbinfo->shrinklist_len); |
|---|
| 572 | 655 | } |
|---|
| 573 | | -#else /* !CONFIG_TRANSPARENT_HUGE_PAGECACHE */ |
|---|
| 656 | +#else /* !CONFIG_TRANSPARENT_HUGEPAGE */ |
|---|
| 574 | 657 | |
|---|
| 575 | 658 | #define shmem_huge SHMEM_HUGE_DENY |
|---|
| 576 | 659 | |
|---|
| .. | .. |
|---|
| 579 | 662 | { |
|---|
| 580 | 663 | return 0; |
|---|
| 581 | 664 | } |
|---|
| 582 | | -#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */ |
|---|
| 665 | +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
|---|
| 583 | 666 | |
|---|
| 584 | 667 | static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo) |
|---|
| 585 | 668 | { |
|---|
| 586 | | - if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && |
|---|
| 669 | + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && |
|---|
| 587 | 670 | (shmem_huge == SHMEM_HUGE_FORCE || sbinfo->huge) && |
|---|
| 588 | 671 | shmem_huge != SHMEM_HUGE_DENY) |
|---|
| 589 | 672 | return true; |
|---|
| .. | .. |
|---|
| 595 | 678 | */ |
|---|
| 596 | 679 | static int shmem_add_to_page_cache(struct page *page, |
|---|
| 597 | 680 | struct address_space *mapping, |
|---|
| 598 | | - pgoff_t index, void *expected) |
|---|
| 681 | + pgoff_t index, void *expected, gfp_t gfp, |
|---|
| 682 | + struct mm_struct *charge_mm) |
|---|
| 599 | 683 | { |
|---|
| 600 | | - int error, nr = hpage_nr_pages(page); |
|---|
| 684 | + XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page)); |
|---|
| 685 | + unsigned long i = 0; |
|---|
| 686 | + unsigned long nr = compound_nr(page); |
|---|
| 687 | + int error; |
|---|
| 601 | 688 | |
|---|
| 602 | 689 | VM_BUG_ON_PAGE(PageTail(page), page); |
|---|
| 603 | 690 | VM_BUG_ON_PAGE(index != round_down(index, nr), page); |
|---|
| .. | .. |
|---|
| 609 | 696 | page->mapping = mapping; |
|---|
| 610 | 697 | page->index = index; |
|---|
| 611 | 698 | |
|---|
| 612 | | - xa_lock_irq(&mapping->i_pages); |
|---|
| 613 | | - if (PageTransHuge(page)) { |
|---|
| 614 | | - void __rcu **results; |
|---|
| 615 | | - pgoff_t idx; |
|---|
| 616 | | - int i; |
|---|
| 617 | | - |
|---|
| 618 | | - error = 0; |
|---|
| 619 | | - if (radix_tree_gang_lookup_slot(&mapping->i_pages, |
|---|
| 620 | | - &results, &idx, index, 1) && |
|---|
| 621 | | - idx < index + HPAGE_PMD_NR) { |
|---|
| 622 | | - error = -EEXIST; |
|---|
| 623 | | - } |
|---|
| 624 | | - |
|---|
| 625 | | - if (!error) { |
|---|
| 626 | | - for (i = 0; i < HPAGE_PMD_NR; i++) { |
|---|
| 627 | | - error = radix_tree_insert(&mapping->i_pages, |
|---|
| 628 | | - index + i, page + i); |
|---|
| 629 | | - VM_BUG_ON(error); |
|---|
| 699 | + if (!PageSwapCache(page)) { |
|---|
| 700 | + error = mem_cgroup_charge(page, charge_mm, gfp); |
|---|
| 701 | + if (error) { |
|---|
| 702 | + if (PageTransHuge(page)) { |
|---|
| 703 | + count_vm_event(THP_FILE_FALLBACK); |
|---|
| 704 | + count_vm_event(THP_FILE_FALLBACK_CHARGE); |
|---|
| 630 | 705 | } |
|---|
| 631 | | - count_vm_event(THP_FILE_ALLOC); |
|---|
| 706 | + goto error; |
|---|
| 632 | 707 | } |
|---|
| 633 | | - } else if (!expected) { |
|---|
| 634 | | - error = radix_tree_insert(&mapping->i_pages, index, page); |
|---|
| 635 | | - } else { |
|---|
| 636 | | - error = shmem_radix_tree_replace(mapping, index, expected, |
|---|
| 637 | | - page); |
|---|
| 708 | + } |
|---|
| 709 | + cgroup_throttle_swaprate(page, gfp); |
|---|
| 710 | + |
|---|
| 711 | + do { |
|---|
| 712 | + void *entry; |
|---|
| 713 | + xas_lock_irq(&xas); |
|---|
| 714 | + entry = xas_find_conflict(&xas); |
|---|
| 715 | + if (entry != expected) |
|---|
| 716 | + xas_set_err(&xas, -EEXIST); |
|---|
| 717 | + xas_create_range(&xas); |
|---|
| 718 | + if (xas_error(&xas)) |
|---|
| 719 | + goto unlock; |
|---|
| 720 | +next: |
|---|
| 721 | + xas_store(&xas, page); |
|---|
| 722 | + if (++i < nr) { |
|---|
| 723 | + xas_next(&xas); |
|---|
| 724 | + goto next; |
|---|
| 725 | + } |
|---|
| 726 | + if (PageTransHuge(page)) { |
|---|
| 727 | + count_vm_event(THP_FILE_ALLOC); |
|---|
| 728 | + __inc_node_page_state(page, NR_SHMEM_THPS); |
|---|
| 729 | + } |
|---|
| 730 | + mapping->nrpages += nr; |
|---|
| 731 | + __mod_lruvec_page_state(page, NR_FILE_PAGES, nr); |
|---|
| 732 | + __mod_lruvec_page_state(page, NR_SHMEM, nr); |
|---|
| 733 | +unlock: |
|---|
| 734 | + xas_unlock_irq(&xas); |
|---|
| 735 | + } while (xas_nomem(&xas, gfp)); |
|---|
| 736 | + |
|---|
| 737 | + if (xas_error(&xas)) { |
|---|
| 738 | + error = xas_error(&xas); |
|---|
| 739 | + goto error; |
|---|
| 638 | 740 | } |
|---|
| 639 | 741 | |
|---|
| 640 | | - if (!error) { |
|---|
| 641 | | - mapping->nrpages += nr; |
|---|
| 642 | | - if (PageTransHuge(page)) |
|---|
| 643 | | - __inc_node_page_state(page, NR_SHMEM_THPS); |
|---|
| 644 | | - __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); |
|---|
| 645 | | - __mod_node_page_state(page_pgdat(page), NR_SHMEM, nr); |
|---|
| 646 | | - xa_unlock_irq(&mapping->i_pages); |
|---|
| 647 | | - } else { |
|---|
| 648 | | - page->mapping = NULL; |
|---|
| 649 | | - xa_unlock_irq(&mapping->i_pages); |
|---|
| 650 | | - page_ref_sub(page, nr); |
|---|
| 651 | | - } |
|---|
| 742 | + return 0; |
|---|
| 743 | +error: |
|---|
| 744 | + page->mapping = NULL; |
|---|
| 745 | + page_ref_sub(page, nr); |
|---|
| 652 | 746 | return error; |
|---|
| 653 | 747 | } |
|---|
| 654 | 748 | |
|---|
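The do/while in the rewritten shmem_add_to_page_cache() is the canonical XArray insertion idiom: attempt the store under xas_lock_irq(), and if it failed for lack of tree nodes, xas_nomem() allocates memory outside the lock and asks for a retry. Stripped to its skeleton (a sketch, not the patch's exact code):

```c
/* Sketch of the xas_nomem() retry idiom used above. */
XA_STATE(xas, &mapping->i_pages, index);

do {
	xas_lock_irq(&xas);
	xas_store(&xas, page);	/* may record -ENOMEM inside xas */
	xas_unlock_irq(&xas);
} while (xas_nomem(&xas, gfp));	/* preallocate unlocked, then retry */

return xas_error(&xas);		/* 0, or the error that stuck */
```

This is also why the mem_cgroup_charge() call moved ahead of the store loop: charging can sleep, so it has to happen before the irq-disabled critical section rather than inside it.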
| .. | .. |
|---|
| 663 | 757 | VM_BUG_ON_PAGE(PageCompound(page), page); |
|---|
| 664 | 758 | |
|---|
| 665 | 759 | xa_lock_irq(&mapping->i_pages); |
|---|
| 666 | | - error = shmem_radix_tree_replace(mapping, page->index, page, radswap); |
|---|
| 760 | + error = shmem_replace_entry(mapping, page->index, page, radswap); |
|---|
| 667 | 761 | page->mapping = NULL; |
|---|
| 668 | 762 | mapping->nrpages--; |
|---|
| 669 | | - __dec_node_page_state(page, NR_FILE_PAGES); |
|---|
| 670 | | - __dec_node_page_state(page, NR_SHMEM); |
|---|
| 763 | + __dec_lruvec_page_state(page, NR_FILE_PAGES); |
|---|
| 764 | + __dec_lruvec_page_state(page, NR_SHMEM); |
|---|
| 671 | 765 | xa_unlock_irq(&mapping->i_pages); |
|---|
| 672 | 766 | put_page(page); |
|---|
| 673 | 767 | BUG_ON(error); |
|---|
| 674 | 768 | } |
|---|
| 675 | 769 | |
|---|
| 676 | 770 | /* |
|---|
| 677 | | - * Remove swap entry from radix tree, free the swap and its page cache. |
|---|
| 771 | + * Remove swap entry from page cache, free the swap and its page cache. |
|---|
| 678 | 772 | */ |
|---|
| 679 | 773 | static int shmem_free_swap(struct address_space *mapping, |
|---|
| 680 | 774 | pgoff_t index, void *radswap) |
|---|
| 681 | 775 | { |
|---|
| 682 | 776 | void *old; |
|---|
| 683 | 777 | |
|---|
| 684 | | - xa_lock_irq(&mapping->i_pages); |
|---|
| 685 | | - old = radix_tree_delete_item(&mapping->i_pages, index, radswap); |
|---|
| 686 | | - xa_unlock_irq(&mapping->i_pages); |
|---|
| 778 | + old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0); |
|---|
| 687 | 779 | if (old != radswap) |
|---|
| 688 | 780 | return -ENOENT; |
|---|
| 689 | 781 | free_swap_and_cache(radix_to_swp_entry(radswap)); |
|---|
| .. | .. |
|---|
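shmem_free_swap() above shrinks from a lock/delete-item/unlock triple to a single xa_cmpxchg_irq(): the slot is cleared only if it still holds the expected swap entry, and the prior value comes back for inspection. Passing 0 as the gfp mask is safe because storing NULL can only shrink the tree, never grow it. Roughly (a sketch fragment):

```c
/* Sketch: conditionally erase a swap entry in one atomic step. */
void *old = xa_cmpxchg_irq(&mapping->i_pages, index,
			   swp_to_radix_entry(swap), NULL, 0);
if (old != swp_to_radix_entry(swap))
	return -ENOENT;		/* lost a race: entry already changed */
```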
| 700 | 792 | unsigned long shmem_partial_swap_usage(struct address_space *mapping, |
|---|
| 701 | 793 | pgoff_t start, pgoff_t end) |
|---|
| 702 | 794 | { |
|---|
| 703 | | - struct radix_tree_iter iter; |
|---|
| 704 | | - void __rcu **slot; |
|---|
| 795 | + XA_STATE(xas, &mapping->i_pages, start); |
|---|
| 705 | 796 | struct page *page; |
|---|
| 706 | 797 | unsigned long swapped = 0; |
|---|
| 707 | 798 | |
|---|
| 708 | 799 | rcu_read_lock(); |
|---|
| 709 | | - |
|---|
| 710 | | - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { |
|---|
| 711 | | - if (iter.index >= end) |
|---|
| 712 | | - break; |
|---|
| 713 | | - |
|---|
| 714 | | - page = radix_tree_deref_slot(slot); |
|---|
| 715 | | - |
|---|
| 716 | | - if (radix_tree_deref_retry(page)) { |
|---|
| 717 | | - slot = radix_tree_iter_retry(&iter); |
|---|
| 800 | + xas_for_each(&xas, page, end - 1) { |
|---|
| 801 | + if (xas_retry(&xas, page)) |
|---|
| 718 | 802 | continue; |
|---|
| 719 | | - } |
|---|
| 720 | | - |
|---|
| 721 | | - if (radix_tree_exceptional_entry(page)) |
|---|
| 803 | + if (xa_is_value(page)) |
|---|
| 722 | 804 | swapped++; |
|---|
| 723 | 805 | |
|---|
| 724 | 806 | if (need_resched()) { |
|---|
| 725 | | - slot = radix_tree_iter_resume(slot, &iter); |
|---|
| 807 | + xas_pause(&xas); |
|---|
| 726 | 808 | cond_resched_rcu(); |
|---|
| 727 | 809 | } |
|---|
| 728 | 810 | } |
|---|
| .. | .. |
|---|
| 797 | 879 | } |
|---|
| 798 | 880 | |
|---|
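The iteration pattern in shmem_partial_swap_usage() above generalizes to any long XArray walk under RCU: xas_retry() handles entries that moved during a concurrent node operation, and xas_pause() checkpoints the cursor so that cond_resched_rcu() can legally drop and retake the read lock. In outline (a sketch with a placeholder body):

```c
XA_STATE(xas, &mapping->i_pages, start);
struct page *page;

rcu_read_lock();
xas_for_each(&xas, page, end - 1) {
	if (xas_retry(&xas, page))
		continue;		/* transient entry: look again */
	/* examine page here, or xa_is_value(page) for swap entries */
	if (need_resched()) {
		xas_pause(&xas);	/* remember position across unlock */
		cond_resched_rcu();	/* drops, reschedules, re-locks */
	}
}
rcu_read_unlock();
```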
| 799 | 881 | /* |
|---|
| 800 | | - * Remove range of pages and swap entries from radix tree, and free them. |
|---|
| 882 | + * Check whether a hole-punch or truncation needs to split a huge page, |
|---|
| 883 | + * returning true if no split was required, or the split has been successful. |
|---|
| 884 | + * |
|---|
| 885 | + * Eviction (or truncation to 0 size) should never need to split a huge page; |
|---|
| 886 | + * but in rare cases might do so, if shmem_undo_range() failed to trylock on |
|---|
| 887 | + * head, and then succeeded to trylock on tail. |
|---|
| 888 | + * |
|---|
| 889 | + * A split can only succeed when there are no additional references on the |
|---|
| 890 | + * huge page: so the split below relies upon find_get_entries() having stopped |
|---|
| 891 | + * when it found a subpage of the huge page, without getting further references. |
|---|
| 892 | + */ |
|---|
| 893 | +static bool shmem_punch_compound(struct page *page, pgoff_t start, pgoff_t end) |
|---|
| 894 | +{ |
|---|
| 895 | + if (!PageTransCompound(page)) |
|---|
| 896 | + return true; |
|---|
| 897 | + |
|---|
| 898 | + /* Just proceed to delete a huge page wholly within the range punched */ |
|---|
| 899 | + if (PageHead(page) && |
|---|
| 900 | + page->index >= start && page->index + HPAGE_PMD_NR <= end) |
|---|
| 901 | + return true; |
|---|
| 902 | + |
|---|
| 903 | + /* Try to split huge page, so we can truly punch the hole or truncate */ |
|---|
| 904 | + return split_huge_page(page) >= 0; |
|---|
| 905 | +} |
|---|
| 906 | + |
|---|
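A worked example of the range test, assuming 4 KiB pages and HPAGE_PMD_NR == 512 (x86-64 values, not stated in the patch itself):

```c
/* Huge page with head at index 512 covers subpages [512, 1024). */
shmem_punch_compound(head, 512, 1024);	/* 512 >= 512 && 1024 <= 1024:
					 * wholly inside, delete as one */
shmem_punch_compound(head, 600, 1024);	/* head straddles the boundary:
					 * split_huge_page() is attempted */
```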
| 907 | +/* |
|---|
| 908 | + * Remove range of pages and swap entries from page cache, and free them. |
|---|
| 801 | 909 | * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate. |
|---|
| 802 | 910 | */ |
|---|
| 803 | 911 | static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, |
|---|
| .. | .. |
|---|
| 833 | 941 | if (index >= end) |
|---|
| 834 | 942 | break; |
|---|
| 835 | 943 | |
|---|
| 836 | | - if (radix_tree_exceptional_entry(page)) { |
|---|
| 944 | + if (xa_is_value(page)) { |
|---|
| 837 | 945 | if (unfalloc) |
|---|
| 838 | 946 | continue; |
|---|
| 839 | 947 | nr_swaps_freed += !shmem_free_swap(mapping, |
|---|
| .. | .. |
|---|
| 846 | 954 | if (!trylock_page(page)) |
|---|
| 847 | 955 | continue; |
|---|
| 848 | 956 | |
|---|
| 849 | | - if (PageTransTail(page)) { |
|---|
| 850 | | - /* Middle of THP: zero out the page */ |
|---|
| 851 | | - clear_highpage(page); |
|---|
| 852 | | - unlock_page(page); |
|---|
| 853 | | - continue; |
|---|
| 854 | | - } else if (PageTransHuge(page)) { |
|---|
| 855 | | - if (index == round_down(end, HPAGE_PMD_NR)) { |
|---|
| 856 | | - /* |
|---|
| 857 | | - * Range ends in the middle of THP: |
|---|
| 858 | | - * zero out the page |
|---|
| 859 | | - */ |
|---|
| 860 | | - clear_highpage(page); |
|---|
| 861 | | - unlock_page(page); |
|---|
| 862 | | - continue; |
|---|
| 863 | | - } |
|---|
| 864 | | - index += HPAGE_PMD_NR - 1; |
|---|
| 865 | | - i += HPAGE_PMD_NR - 1; |
|---|
| 866 | | - } |
|---|
| 867 | | - |
|---|
| 868 | | - if (!unfalloc || !PageUptodate(page)) { |
|---|
| 869 | | - VM_BUG_ON_PAGE(PageTail(page), page); |
|---|
| 870 | | - if (page_mapping(page) == mapping) { |
|---|
| 871 | | - VM_BUG_ON_PAGE(PageWriteback(page), page); |
|---|
| 957 | + if ((!unfalloc || !PageUptodate(page)) && |
|---|
| 958 | + page_mapping(page) == mapping) { |
|---|
| 959 | + VM_BUG_ON_PAGE(PageWriteback(page), page); |
|---|
| 960 | + if (shmem_punch_compound(page, start, end)) |
|---|
| 872 | 961 | truncate_inode_page(mapping, page); |
|---|
| 873 | | - } |
|---|
| 874 | 962 | } |
|---|
| 875 | 963 | unlock_page(page); |
|---|
| 876 | 964 | } |
|---|
| .. | .. |
|---|
| 930 | 1018 | if (index >= end) |
|---|
| 931 | 1019 | break; |
|---|
| 932 | 1020 | |
|---|
| 933 | | - if (radix_tree_exceptional_entry(page)) { |
|---|
| 1021 | + if (xa_is_value(page)) { |
|---|
| 934 | 1022 | if (unfalloc) |
|---|
| 935 | 1023 | continue; |
|---|
| 936 | 1024 | if (shmem_free_swap(mapping, index, page)) { |
|---|
| .. | .. |
|---|
| 944 | 1032 | |
|---|
| 945 | 1033 | lock_page(page); |
|---|
| 946 | 1034 | |
|---|
| 947 | | - if (PageTransTail(page)) { |
|---|
| 948 | | - /* Middle of THP: zero out the page */ |
|---|
| 949 | | - clear_highpage(page); |
|---|
| 950 | | - unlock_page(page); |
|---|
| 951 | | - /* |
|---|
| 952 | | - * Partial thp truncate due 'start' in middle |
|---|
| 953 | | - * of THP: don't need to look on these pages |
|---|
| 954 | | - * again on !pvec.nr restart. |
|---|
| 955 | | - */ |
|---|
| 956 | | - if (index != round_down(end, HPAGE_PMD_NR)) |
|---|
| 957 | | - start++; |
|---|
| 958 | | - continue; |
|---|
| 959 | | - } else if (PageTransHuge(page)) { |
|---|
| 960 | | - if (index == round_down(end, HPAGE_PMD_NR)) { |
|---|
| 961 | | - /* |
|---|
| 962 | | - * Range ends in the middle of THP: |
|---|
| 963 | | - * zero out the page |
|---|
| 964 | | - */ |
|---|
| 965 | | - clear_highpage(page); |
|---|
| 966 | | - unlock_page(page); |
|---|
| 967 | | - continue; |
|---|
| 968 | | - } |
|---|
| 969 | | - index += HPAGE_PMD_NR - 1; |
|---|
| 970 | | - i += HPAGE_PMD_NR - 1; |
|---|
| 971 | | - } |
|---|
| 972 | | - |
|---|
| 973 | 1035 | if (!unfalloc || !PageUptodate(page)) { |
|---|
| 974 | | - VM_BUG_ON_PAGE(PageTail(page), page); |
|---|
| 975 | | - if (page_mapping(page) == mapping) { |
|---|
| 976 | | - VM_BUG_ON_PAGE(PageWriteback(page), page); |
|---|
| 977 | | - truncate_inode_page(mapping, page); |
|---|
| 978 | | - } else { |
|---|
| 1036 | + if (page_mapping(page) != mapping) { |
|---|
| 979 | 1037 | /* Page was replaced by swap: retry */ |
|---|
| 980 | 1038 | unlock_page(page); |
|---|
| 981 | 1039 | index--; |
|---|
| 982 | 1040 | break; |
|---|
| 1041 | + } |
|---|
| 1042 | + VM_BUG_ON_PAGE(PageWriteback(page), page); |
|---|
| 1043 | + if (shmem_punch_compound(page, start, end)) |
|---|
| 1044 | + truncate_inode_page(mapping, page); |
|---|
| 1045 | + else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { |
|---|
| 1046 | + /* Wipe the page and don't get stuck */ |
|---|
| 1047 | + clear_highpage(page); |
|---|
| 1048 | + flush_dcache_page(page); |
|---|
| 1049 | + set_page_dirty(page); |
|---|
| 1050 | + if (index < |
|---|
| 1051 | + round_up(start, HPAGE_PMD_NR)) |
|---|
| 1052 | + start = index + 1; |
|---|
| 983 | 1053 | } |
|---|
| 984 | 1054 | } |
|---|
| 985 | 1055 | unlock_page(page); |
|---|
| .. | .. |
|---|
| 1067 | 1137 | * Part of the huge page can be beyond i_size: subject |
|---|
| 1068 | 1138 | * to shrink under memory pressure. |
|---|
| 1069 | 1139 | */ |
|---|
| 1070 | | - if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) { |
|---|
| 1140 | + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { |
|---|
| 1071 | 1141 | spin_lock(&sbinfo->shrinklist_lock); |
|---|
| 1072 | 1142 | /* |
|---|
| 1073 | 1143 | * _careful to defend against unlocked access to |
|---|
| .. | .. |
|---|
| 1106 | 1176 | } |
|---|
| 1107 | 1177 | spin_unlock(&sbinfo->shrinklist_lock); |
|---|
| 1108 | 1178 | } |
|---|
| 1109 | | - if (!list_empty(&info->swaplist)) { |
|---|
| 1179 | + while (!list_empty(&info->swaplist)) { |
|---|
| 1180 | + /* Wait while shmem_unuse() is scanning this inode... */ |
|---|
| 1181 | + wait_var_event(&info->stop_eviction, |
|---|
| 1182 | + !atomic_read(&info->stop_eviction)); |
|---|
| 1110 | 1183 | mutex_lock(&shmem_swaplist_mutex); |
|---|
| 1111 | | - list_del_init(&info->swaplist); |
|---|
| 1184 | + /* ...but beware of the race if we peeked too early */ |
|---|
| 1185 | + if (!atomic_read(&info->stop_eviction)) |
|---|
| 1186 | + list_del_init(&info->swaplist); |
|---|
| 1112 | 1187 | mutex_unlock(&shmem_swaplist_mutex); |
|---|
| 1113 | 1188 | } |
|---|
| 1114 | 1189 | } |
|---|
| .. | .. |
|---|
| 1119 | 1194 | clear_inode(inode); |
|---|
| 1120 | 1195 | } |
|---|
| 1121 | 1196 | |
|---|
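The new swaplist loop in shmem_evict_inode() pairs with the stop_eviction counter that shmem_unuse() (further below) bumps before dropping the swaplist mutex. The handshake is the standard wait_var_event()/wake_up_var() pattern; sketched here with the names from this patch:

```c
/* Scanner side (shmem_unuse): pin the inode without holding a lock. */
atomic_inc(&info->stop_eviction);
/* ... walk the inode's swap entries with the swaplist mutex dropped ... */
if (atomic_dec_and_test(&info->stop_eviction))
	wake_up_var(&info->stop_eviction);	/* unblock any evictor */

/* Evictor side (shmem_evict_inode): sleep until no scanner holds a pin. */
wait_var_event(&info->stop_eviction,
	       !atomic_read(&info->stop_eviction));
```

As the "peeked too early" comment warns, the evictor must re-check the counter after taking the mutex, since a scanner can slip in between the wait and the lock.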
| 1122 | | -static unsigned long find_swap_entry(struct radix_tree_root *root, void *item) |
|---|
| 1197 | +extern struct swap_info_struct *swap_info[]; |
|---|
| 1198 | + |
|---|
| 1199 | +static int shmem_find_swap_entries(struct address_space *mapping, |
|---|
| 1200 | + pgoff_t start, unsigned int nr_entries, |
|---|
| 1201 | + struct page **entries, pgoff_t *indices, |
|---|
| 1202 | + unsigned int type, bool frontswap) |
|---|
| 1123 | 1203 | { |
|---|
| 1124 | | - struct radix_tree_iter iter; |
|---|
| 1125 | | - void __rcu **slot; |
|---|
| 1126 | | - unsigned long found = -1; |
|---|
| 1127 | | - unsigned int checked = 0; |
|---|
| 1204 | + XA_STATE(xas, &mapping->i_pages, start); |
|---|
| 1205 | + struct page *page; |
|---|
| 1206 | + swp_entry_t entry; |
|---|
| 1207 | + unsigned int ret = 0; |
|---|
| 1208 | + |
|---|
| 1209 | + if (!nr_entries) |
|---|
| 1210 | + return 0; |
|---|
| 1128 | 1211 | |
|---|
| 1129 | 1212 | rcu_read_lock(); |
|---|
| 1130 | | - radix_tree_for_each_slot(slot, root, &iter, 0) { |
|---|
| 1131 | | - void *entry = radix_tree_deref_slot(slot); |
|---|
| 1132 | | - |
|---|
| 1133 | | - if (radix_tree_deref_retry(entry)) { |
|---|
| 1134 | | - slot = radix_tree_iter_retry(&iter); |
|---|
| 1213 | + xas_for_each(&xas, page, ULONG_MAX) { |
|---|
| 1214 | + if (xas_retry(&xas, page)) |
|---|
| 1135 | 1215 | continue; |
|---|
| 1216 | + |
|---|
| 1217 | + if (!xa_is_value(page)) |
|---|
| 1218 | + continue; |
|---|
| 1219 | + |
|---|
| 1220 | + entry = radix_to_swp_entry(page); |
|---|
| 1221 | + if (swp_type(entry) != type) |
|---|
| 1222 | + continue; |
|---|
| 1223 | + if (frontswap && |
|---|
| 1224 | + !frontswap_test(swap_info[type], swp_offset(entry))) |
|---|
| 1225 | + continue; |
|---|
| 1226 | + |
|---|
| 1227 | + indices[ret] = xas.xa_index; |
|---|
| 1228 | + entries[ret] = page; |
|---|
| 1229 | + |
|---|
| 1230 | + if (need_resched()) { |
|---|
| 1231 | + xas_pause(&xas); |
|---|
| 1232 | + cond_resched_rcu(); |
|---|
| 1136 | 1233 | } |
|---|
| 1137 | | - if (entry == item) { |
|---|
| 1138 | | - found = iter.index; |
|---|
| 1234 | + if (++ret == nr_entries) |
|---|
| 1139 | 1235 | break; |
|---|
| 1140 | | - } |
|---|
| 1141 | | - checked++; |
|---|
| 1142 | | - if ((checked % 4096) != 0) |
|---|
| 1143 | | - continue; |
|---|
| 1144 | | - slot = radix_tree_iter_resume(slot, &iter); |
|---|
| 1145 | | - cond_resched_rcu(); |
|---|
| 1146 | 1236 | } |
|---|
| 1147 | | - |
|---|
| 1148 | 1237 | rcu_read_unlock(); |
|---|
| 1149 | | - return found; |
|---|
| 1238 | + |
|---|
| 1239 | + return ret; |
|---|
| 1240 | +} |
|---|
| 1241 | + |
|---|
| 1242 | +/* |
|---|
| 1243 | + * Move the swapped pages for an inode to page cache. Returns the count |
|---|
| 1244 | + * of pages swapped in, or the error in case of failure. |
|---|
| 1245 | + */ |
|---|
| 1246 | +static int shmem_unuse_swap_entries(struct inode *inode, struct pagevec pvec, |
|---|
| 1247 | + pgoff_t *indices) |
|---|
| 1248 | +{ |
|---|
| 1249 | + int i = 0; |
|---|
| 1250 | + int ret = 0; |
|---|
| 1251 | + int error = 0; |
|---|
| 1252 | + struct address_space *mapping = inode->i_mapping; |
|---|
| 1253 | + |
|---|
| 1254 | + for (i = 0; i < pvec.nr; i++) { |
|---|
| 1255 | + struct page *page = pvec.pages[i]; |
|---|
| 1256 | + |
|---|
| 1257 | + if (!xa_is_value(page)) |
|---|
| 1258 | + continue; |
|---|
| 1259 | + error = shmem_swapin_page(inode, indices[i], |
|---|
| 1260 | + &page, SGP_CACHE, |
|---|
| 1261 | + mapping_gfp_mask(mapping), |
|---|
| 1262 | + NULL, NULL); |
|---|
| 1263 | + if (error == 0) { |
|---|
| 1264 | + unlock_page(page); |
|---|
| 1265 | + put_page(page); |
|---|
| 1266 | + ret++; |
|---|
| 1267 | + } |
|---|
| 1268 | + if (error == -ENOMEM) |
|---|
| 1269 | + break; |
|---|
| 1270 | + error = 0; |
|---|
| 1271 | + } |
|---|
| 1272 | + return error ? error : ret; |
|---|
| 1150 | 1273 | } |
|---|
| 1151 | 1274 | |
|---|
| 1152 | 1275 | /* |
|---|
| 1153 | 1276 | * If swap found in inode, free it and move page from swapcache to filecache. |
|---|
| 1154 | 1277 | */ |
|---|
| 1155 | | -static int shmem_unuse_inode(struct shmem_inode_info *info, |
|---|
| 1156 | | - swp_entry_t swap, struct page **pagep) |
|---|
| 1278 | +static int shmem_unuse_inode(struct inode *inode, unsigned int type, |
|---|
| 1279 | + bool frontswap, unsigned long *fs_pages_to_unuse) |
|---|
| 1157 | 1280 | { |
|---|
| 1158 | | - struct address_space *mapping = info->vfs_inode.i_mapping; |
|---|
| 1159 | | - void *radswap; |
|---|
| 1160 | | - pgoff_t index; |
|---|
| 1161 | | - gfp_t gfp; |
|---|
| 1162 | | - int error = 0; |
|---|
| 1281 | + struct address_space *mapping = inode->i_mapping; |
|---|
| 1282 | + pgoff_t start = 0; |
|---|
| 1283 | + struct pagevec pvec; |
|---|
| 1284 | + pgoff_t indices[PAGEVEC_SIZE]; |
|---|
| 1285 | + bool frontswap_partial = (frontswap && *fs_pages_to_unuse > 0); |
|---|
| 1286 | + int ret = 0; |
|---|
| 1163 | 1287 | |
|---|
| 1164 | | - radswap = swp_to_radix_entry(swap); |
|---|
| 1165 | | - index = find_swap_entry(&mapping->i_pages, radswap); |
|---|
| 1166 | | - if (index == -1) |
|---|
| 1167 | | - return -EAGAIN; /* tell shmem_unuse we found nothing */ |
|---|
| 1288 | + pagevec_init(&pvec); |
|---|
| 1289 | + do { |
|---|
| 1290 | + unsigned int nr_entries = PAGEVEC_SIZE; |
|---|
| 1168 | 1291 | |
|---|
| 1169 | | - /* |
|---|
| 1170 | | - * Move _head_ to start search for next from here. |
|---|
| 1171 | | - * But be careful: shmem_evict_inode checks list_empty without taking |
|---|
| 1172 | | - * mutex, and there's an instant in list_move_tail when info->swaplist |
|---|
| 1173 | | - * would appear empty, if it were the only one on shmem_swaplist. |
|---|
| 1174 | | - */ |
|---|
| 1175 | | - if (shmem_swaplist.next != &info->swaplist) |
|---|
| 1176 | | - list_move_tail(&shmem_swaplist, &info->swaplist); |
|---|
| 1292 | + if (frontswap_partial && *fs_pages_to_unuse < PAGEVEC_SIZE) |
|---|
| 1293 | + nr_entries = *fs_pages_to_unuse; |
|---|
| 1177 | 1294 | |
|---|
| 1178 | | - gfp = mapping_gfp_mask(mapping); |
|---|
| 1179 | | - if (shmem_should_replace_page(*pagep, gfp)) { |
|---|
| 1180 | | - mutex_unlock(&shmem_swaplist_mutex); |
|---|
| 1181 | | - error = shmem_replace_page(pagep, gfp, info, index); |
|---|
| 1182 | | - mutex_lock(&shmem_swaplist_mutex); |
|---|
| 1183 | | - /* |
|---|
| 1184 | | - * We needed to drop mutex to make that restrictive page |
|---|
| 1185 | | - * allocation, but the inode might have been freed while we |
|---|
| 1186 | | - * dropped it: although a racing shmem_evict_inode() cannot |
|---|
| 1187 | | - * complete without emptying the radix_tree, our page lock |
|---|
| 1188 | | - * on this swapcache page is not enough to prevent that - |
|---|
| 1189 | | - * free_swap_and_cache() of our swap entry will only |
|---|
| 1190 | | - * trylock_page(), removing swap from radix_tree whatever. |
|---|
| 1191 | | - * |
|---|
| 1192 | | - * We must not proceed to shmem_add_to_page_cache() if the |
|---|
| 1193 | | - * inode has been freed, but of course we cannot rely on |
|---|
| 1194 | | - * inode or mapping or info to check that. However, we can |
|---|
| 1195 | | - * safely check if our swap entry is still in use (and here |
|---|
| 1196 | | - * it can't have got reused for another page): if it's still |
|---|
| 1197 | | - * in use, then the inode cannot have been freed yet, and we |
|---|
| 1198 | | - * can safely proceed (if it's no longer in use, that tells |
|---|
| 1199 | | - * nothing about the inode, but we don't need to unuse swap). |
|---|
| 1200 | | - */ |
|---|
| 1201 | | - if (!page_swapcount(*pagep)) |
|---|
| 1202 | | - error = -ENOENT; |
|---|
| 1203 | | - } |
|---|
| 1204 | | - |
|---|
| 1205 | | - /* |
|---|
| 1206 | | - * We rely on shmem_swaplist_mutex, not only to protect the swaplist, |
|---|
| 1207 | | - * but also to hold up shmem_evict_inode(): so inode cannot be freed |
|---|
| 1208 | | - * beneath us (pagelock doesn't help until the page is in pagecache). |
|---|
| 1209 | | - */ |
|---|
| 1210 | | - if (!error) |
|---|
| 1211 | | - error = shmem_add_to_page_cache(*pagep, mapping, index, |
|---|
| 1212 | | - radswap); |
|---|
| 1213 | | - if (error != -ENOMEM) { |
|---|
| 1214 | | - /* |
|---|
| 1215 | | - * Truncation and eviction use free_swap_and_cache(), which |
|---|
| 1216 | | - * only does trylock page: if we raced, best clean up here. |
|---|
| 1217 | | - */ |
|---|
| 1218 | | - delete_from_swap_cache(*pagep); |
|---|
| 1219 | | - set_page_dirty(*pagep); |
|---|
| 1220 | | - if (!error) { |
|---|
| 1221 | | - spin_lock_irq(&info->lock); |
|---|
| 1222 | | - info->swapped--; |
|---|
| 1223 | | - spin_unlock_irq(&info->lock); |
|---|
| 1224 | | - swap_free(swap); |
|---|
| 1295 | + pvec.nr = shmem_find_swap_entries(mapping, start, nr_entries, |
|---|
| 1296 | + pvec.pages, indices, |
|---|
| 1297 | + type, frontswap); |
|---|
| 1298 | + if (pvec.nr == 0) { |
|---|
| 1299 | + ret = 0; |
|---|
| 1300 | + break; |
|---|
| 1225 | 1301 | } |
|---|
| 1226 | | - } |
|---|
| 1227 | | - return error; |
|---|
| 1302 | + |
|---|
| 1303 | + ret = shmem_unuse_swap_entries(inode, pvec, indices); |
|---|
| 1304 | + if (ret < 0) |
|---|
| 1305 | + break; |
|---|
| 1306 | + |
|---|
| 1307 | + if (frontswap_partial) { |
|---|
| 1308 | + *fs_pages_to_unuse -= ret; |
|---|
| 1309 | + if (*fs_pages_to_unuse == 0) { |
|---|
| 1310 | + ret = FRONTSWAP_PAGES_UNUSED; |
|---|
| 1311 | + break; |
|---|
| 1312 | + } |
|---|
| 1313 | + } |
|---|
| 1314 | + |
|---|
| 1315 | + start = indices[pvec.nr - 1]; |
|---|
| 1316 | + } while (true); |
|---|
| 1317 | + |
|---|
| 1318 | + return ret; |
|---|
| 1228 | 1319 | } |
|---|
| 1229 | 1320 | |
|---|
| 1230 | 1321 | /* |
|---|
| 1231 | | - * Search through swapped inodes to find and replace swap by page. |
|---|
| 1322 | + * Read all the shared memory data that resides in the swap |
|---|
| 1323 | + * device 'type' back into memory, so the swap device can be |
|---|
| 1324 | + * unused. |
|---|
| 1232 | 1325 | */ |
|---|
| 1233 | | -int shmem_unuse(swp_entry_t swap, struct page *page) |
|---|
| 1326 | +int shmem_unuse(unsigned int type, bool frontswap, |
|---|
| 1327 | + unsigned long *fs_pages_to_unuse) |
|---|
| 1234 | 1328 | { |
|---|
| 1235 | | - struct list_head *this, *next; |
|---|
| 1236 | | - struct shmem_inode_info *info; |
|---|
| 1237 | | - struct mem_cgroup *memcg; |
|---|
| 1329 | + struct shmem_inode_info *info, *next; |
|---|
| 1238 | 1330 | int error = 0; |
|---|
| 1239 | 1331 | |
|---|
| 1240 | | - /* |
|---|
| 1241 | | - * There's a faint possibility that swap page was replaced before |
|---|
| 1242 | | - * caller locked it: caller will come back later with the right page. |
|---|
| 1243 | | - */ |
|---|
| 1244 | | - if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val)) |
|---|
| 1245 | | - goto out; |
|---|
| 1246 | | - |
|---|
| 1247 | | - /* |
|---|
| 1248 | | - * Charge page using GFP_KERNEL while we can wait, before taking |
|---|
| 1249 | | - * the shmem_swaplist_mutex which might hold up shmem_writepage(). |
|---|
| 1250 | | - * Charged back to the user (not to caller) when swap account is used. |
|---|
| 1251 | | - */ |
|---|
| 1252 | | - error = mem_cgroup_try_charge_delay(page, current->mm, GFP_KERNEL, |
|---|
| 1253 | | - &memcg, false); |
|---|
| 1254 | | - if (error) |
|---|
| 1255 | | - goto out; |
|---|
| 1256 | | - /* No radix_tree_preload: swap entry keeps a place for page in tree */ |
|---|
| 1257 | | - error = -EAGAIN; |
|---|
| 1332 | + if (list_empty(&shmem_swaplist)) |
|---|
| 1333 | + return 0; |
|---|
| 1258 | 1334 | |
|---|
| 1259 | 1335 | mutex_lock(&shmem_swaplist_mutex); |
|---|
| 1260 | | - list_for_each_safe(this, next, &shmem_swaplist) { |
|---|
| 1261 | | - info = list_entry(this, struct shmem_inode_info, swaplist); |
|---|
| 1262 | | - if (info->swapped) |
|---|
| 1263 | | - error = shmem_unuse_inode(info, swap, &page); |
|---|
| 1264 | | - else |
|---|
| 1336 | + list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) { |
|---|
| 1337 | + if (!info->swapped) { |
|---|
| 1265 | 1338 | list_del_init(&info->swaplist); |
|---|
| 1339 | + continue; |
|---|
| 1340 | + } |
|---|
| 1341 | + /* |
|---|
| 1342 | + * Drop the swaplist mutex while searching the inode for swap; |
|---|
| 1343 | + * but before doing so, make sure shmem_evict_inode() will not |
|---|
| 1344 | + * remove placeholder inode from swaplist, nor let it be freed |
|---|
| 1345 | + * (igrab() would protect from unlink, but not from unmount). |
|---|
| 1346 | + */ |
|---|
| 1347 | + atomic_inc(&info->stop_eviction); |
|---|
| 1348 | + mutex_unlock(&shmem_swaplist_mutex); |
|---|
| 1349 | + |
|---|
| 1350 | + error = shmem_unuse_inode(&info->vfs_inode, type, frontswap, |
|---|
| 1351 | + fs_pages_to_unuse); |
|---|
| 1266 | 1352 | cond_resched(); |
|---|
| 1267 | | - if (error != -EAGAIN) |
|---|
| 1353 | + |
|---|
| 1354 | + mutex_lock(&shmem_swaplist_mutex); |
|---|
| 1355 | + next = list_next_entry(info, swaplist); |
|---|
| 1356 | + if (!info->swapped) |
|---|
| 1357 | + list_del_init(&info->swaplist); |
|---|
| 1358 | + if (atomic_dec_and_test(&info->stop_eviction)) |
|---|
| 1359 | + wake_up_var(&info->stop_eviction); |
|---|
| 1360 | + if (error) |
|---|
| 1268 | 1361 | break; |
|---|
| 1269 | | - /* found nothing in this: move on to search the next */ |
|---|
| 1270 | 1362 | } |
|---|
| 1271 | 1363 | mutex_unlock(&shmem_swaplist_mutex); |
|---|
| 1272 | 1364 | |
|---|
| 1273 | | - if (error) { |
|---|
| 1274 | | - if (error != -ENOMEM) |
|---|
| 1275 | | - error = 0; |
|---|
| 1276 | | - mem_cgroup_cancel_charge(page, memcg, false); |
|---|
| 1277 | | - } else |
|---|
| 1278 | | - mem_cgroup_commit_charge(page, memcg, true, false); |
|---|
| 1279 | | -out: |
|---|
| 1280 | | - unlock_page(page); |
|---|
| 1281 | | - put_page(page); |
|---|
| 1282 | 1365 | return error; |
|---|
| 1283 | 1366 | } |
|---|
| 1284 | 1367 | |
|---|
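One subtlety in the rewritten shmem_unuse() loop above: the next pointer cached by list_for_each_entry_safe() goes stale the moment the swaplist mutex is dropped, because other inodes may be unlinked meanwhile. Hence the explicit list_next_entry() re-fetch after relocking; info itself stays valid only because of the stop_eviction pin. Schematically (a sketch, not complete code):

```c
list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
	atomic_inc(&info->stop_eviction);	/* keep 'info' alive */
	mutex_unlock(&shmem_swaplist_mutex);

	/* long swapin work here; 'next' may be unlinked concurrently */

	mutex_lock(&shmem_swaplist_mutex);
	next = list_next_entry(info, swaplist);	/* refresh the stale cursor */
	if (atomic_dec_and_test(&info->stop_eviction))
		wake_up_var(&info->stop_eviction);
}
```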
| .. | .. |
|---|
| 1348 | 1431 | SetPageUptodate(page); |
|---|
| 1349 | 1432 | } |
|---|
| 1350 | 1433 | |
|---|
| 1434 | + trace_android_vh_set_shmem_page_flag(page); |
|---|
| 1351 | 1435 | swap = get_swap_page(page); |
|---|
| 1352 | 1436 | if (!swap.val) |
|---|
| 1353 | 1437 | goto redirty; |
|---|
| .. | .. |
|---|
| 1362 | 1446 | */ |
|---|
| 1363 | 1447 | mutex_lock(&shmem_swaplist_mutex); |
|---|
| 1364 | 1448 | if (list_empty(&info->swaplist)) |
|---|
| 1365 | | - list_add_tail(&info->swaplist, &shmem_swaplist); |
|---|
| 1449 | + list_add(&info->swaplist, &shmem_swaplist); |
|---|
| 1366 | 1450 | |
|---|
| 1367 | | - if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { |
|---|
| 1451 | + if (add_to_swap_cache(page, swap, |
|---|
| 1452 | + __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN, |
|---|
| 1453 | + NULL) == 0) { |
|---|
| 1368 | 1454 | spin_lock_irq(&info->lock); |
|---|
| 1369 | 1455 | shmem_recalc_inode(inode); |
|---|
| 1370 | 1456 | info->swapped++; |
|---|
| .. | .. |
|---|
| 1447 | 1533 | { |
|---|
| 1448 | 1534 | struct vm_area_struct pvma; |
|---|
| 1449 | 1535 | struct page *page; |
|---|
| 1450 | | - struct vm_fault vmf; |
|---|
| 1536 | + struct vm_fault vmf = { |
|---|
| 1537 | + .vma = &pvma, |
|---|
| 1538 | + }; |
|---|
| 1451 | 1539 | |
|---|
| 1452 | 1540 | shmem_pseudo_vma_init(&pvma, info, index); |
|---|
| 1453 | | - vmf.vma = &pvma; |
|---|
| 1454 | | - vmf.address = 0; |
|---|
| 1455 | 1541 | page = swap_cluster_readahead(swap, gfp, &vmf); |
|---|
| 1456 | 1542 | shmem_pseudo_vma_destroy(&pvma); |
|---|
| 1457 | 1543 | |
|---|
| .. | .. |
|---|
| 1462 | 1548 | struct shmem_inode_info *info, pgoff_t index) |
|---|
| 1463 | 1549 | { |
|---|
| 1464 | 1550 | struct vm_area_struct pvma; |
|---|
| 1465 | | - struct inode *inode = &info->vfs_inode; |
|---|
| 1466 | | - struct address_space *mapping = inode->i_mapping; |
|---|
| 1467 | | - pgoff_t idx, hindex; |
|---|
| 1468 | | - void __rcu **results; |
|---|
| 1551 | + struct address_space *mapping = info->vfs_inode.i_mapping; |
|---|
| 1552 | + pgoff_t hindex; |
|---|
| 1469 | 1553 | struct page *page; |
|---|
| 1470 | 1554 | |
|---|
| 1471 | | - if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) |
|---|
| 1472 | | - return NULL; |
|---|
| 1473 | | - |
|---|
| 1474 | 1555 | hindex = round_down(index, HPAGE_PMD_NR); |
|---|
| 1475 | | - rcu_read_lock(); |
|---|
| 1476 | | - if (radix_tree_gang_lookup_slot(&mapping->i_pages, &results, &idx, |
|---|
| 1477 | | - hindex, 1) && idx < hindex + HPAGE_PMD_NR) { |
|---|
| 1478 | | - rcu_read_unlock(); |
|---|
| 1556 | + if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1, |
|---|
| 1557 | + XA_PRESENT)) |
|---|
| 1479 | 1558 | return NULL; |
|---|
| 1480 | | - } |
|---|
| 1481 | | - rcu_read_unlock(); |
|---|
| 1482 | 1559 | |
|---|
| 1483 | 1560 | shmem_pseudo_vma_init(&pvma, info, hindex); |
|---|
| 1484 | 1561 | page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN, |
|---|
| .. | .. |
|---|
| 1486 | 1563 | shmem_pseudo_vma_destroy(&pvma); |
|---|
| 1487 | 1564 | if (page) |
|---|
| 1488 | 1565 | prep_transhuge_page(page); |
|---|
| 1566 | + else |
|---|
| 1567 | + count_vm_event(THP_FILE_FALLBACK); |
|---|
| 1489 | 1568 | return page; |
|---|
| 1490 | 1569 | } |
|---|
| 1491 | 1570 | |
|---|
| .. | .. |
|---|
| 1493 | 1572 | struct shmem_inode_info *info, pgoff_t index) |
|---|
| 1494 | 1573 | { |
|---|
| 1495 | 1574 | struct vm_area_struct pvma; |
|---|
| 1496 | | - struct page *page; |
|---|
| 1575 | + struct page *page = NULL; |
|---|
| 1576 | + |
|---|
| 1577 | + trace_android_vh_shmem_alloc_page(&page); |
|---|
| 1578 | + if (page) |
|---|
| 1579 | + return page; |
|---|
| 1497 | 1580 | |
|---|
| 1498 | 1581 | shmem_pseudo_vma_init(&pvma, info, index); |
|---|
| 1499 | 1582 | page = alloc_page_vma(gfp, &pvma, 0); |
|---|
| .. | .. |
|---|
| 1511 | 1594 | int nr; |
|---|
| 1512 | 1595 | int err = -ENOSPC; |
|---|
| 1513 | 1596 | |
|---|
| 1514 | | - if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) |
|---|
| 1597 | + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) |
|---|
| 1515 | 1598 | huge = false; |
|---|
| 1516 | 1599 | nr = huge ? HPAGE_PMD_NR : 1; |
|---|
| 1517 | 1600 | |
|---|
| .. | .. |
|---|
| 1589 | 1672 | * a nice clean interface for us to replace oldpage by newpage there. |
|---|
| 1590 | 1673 | */ |
|---|
| 1591 | 1674 | xa_lock_irq(&swap_mapping->i_pages); |
|---|
| 1592 | | - error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage, |
|---|
| 1593 | | - newpage); |
|---|
| 1675 | + error = shmem_replace_entry(swap_mapping, swap_index, oldpage, newpage); |
|---|
| 1594 | 1676 | if (!error) { |
|---|
| 1595 | | - __inc_node_page_state(newpage, NR_FILE_PAGES); |
|---|
| 1596 | | - __dec_node_page_state(oldpage, NR_FILE_PAGES); |
|---|
| 1677 | + mem_cgroup_migrate(oldpage, newpage); |
|---|
| 1678 | + __inc_lruvec_page_state(newpage, NR_FILE_PAGES); |
|---|
| 1679 | + __dec_lruvec_page_state(oldpage, NR_FILE_PAGES); |
|---|
| 1597 | 1680 | } |
|---|
| 1598 | 1681 | xa_unlock_irq(&swap_mapping->i_pages); |
|---|
| 1599 | 1682 | |
|---|
| .. | .. |
|---|
| 1605 | 1688 | */ |
|---|
| 1606 | 1689 | oldpage = newpage; |
|---|
| 1607 | 1690 | } else { |
|---|
| 1608 | | - mem_cgroup_migrate(oldpage, newpage); |
|---|
| 1609 | | - lru_cache_add_anon(newpage); |
|---|
| 1691 | + lru_cache_add(newpage); |
|---|
| 1610 | 1692 | *pagep = newpage; |
|---|
| 1611 | 1693 | } |
|---|
| 1612 | 1694 | |
|---|
| .. | .. |
|---|
| 1620 | 1702 | } |
|---|
| 1621 | 1703 | |
|---|
| 1622 | 1704 | /* |
|---|
| 1705 | + * Swap in the page pointed to by *pagep. |
|---|
| 1706 | + * Caller has to make sure that *pagep contains a valid swapped page. |
|---|
| 1707 | + * Returns 0 and the page in pagep if success. On failure, returns the |
|---|
| 1708 | + * error code and NULL in *pagep. |
|---|
| 1709 | + */ |
|---|
| 1710 | +static int shmem_swapin_page(struct inode *inode, pgoff_t index, |
|---|
| 1711 | + struct page **pagep, enum sgp_type sgp, |
|---|
| 1712 | + gfp_t gfp, struct vm_area_struct *vma, |
|---|
| 1713 | + vm_fault_t *fault_type) |
|---|
| 1714 | +{ |
|---|
| 1715 | + struct address_space *mapping = inode->i_mapping; |
|---|
| 1716 | + struct shmem_inode_info *info = SHMEM_I(inode); |
|---|
| 1717 | + struct mm_struct *charge_mm = vma ? vma->vm_mm : current->mm; |
|---|
| 1718 | + struct page *page; |
|---|
| 1719 | + swp_entry_t swap; |
|---|
| 1720 | + int error; |
|---|
| 1721 | + |
|---|
| 1722 | + VM_BUG_ON(!*pagep || !xa_is_value(*pagep)); |
|---|
| 1723 | + swap = radix_to_swp_entry(*pagep); |
|---|
| 1724 | + *pagep = NULL; |
|---|
| 1725 | + |
|---|
| 1726 | + /* Look it up and read it in.. */ |
|---|
| 1727 | + page = lookup_swap_cache(swap, NULL, 0); |
|---|
| 1728 | + if (!page) { |
|---|
| 1729 | + /* Or update major stats only when swapin succeeds?? */ |
|---|
| 1730 | + if (fault_type) { |
|---|
| 1731 | + *fault_type |= VM_FAULT_MAJOR; |
|---|
| 1732 | + count_vm_event(PGMAJFAULT); |
|---|
| 1733 | + count_memcg_event_mm(charge_mm, PGMAJFAULT); |
|---|
| 1734 | + } |
|---|
| 1735 | + /* Here we actually start the io */ |
|---|
| 1736 | + page = shmem_swapin(swap, gfp, info, index); |
|---|
| 1737 | + if (!page) { |
|---|
| 1738 | + error = -ENOMEM; |
|---|
| 1739 | + goto failed; |
|---|
| 1740 | + } |
|---|
| 1741 | + } |
|---|
| 1742 | + |
|---|
| 1743 | + /* We have to do this with page locked to prevent races */ |
|---|
| 1744 | + lock_page(page); |
|---|
| 1745 | + if (!PageSwapCache(page) || page_private(page) != swap.val || |
|---|
| 1746 | + !shmem_confirm_swap(mapping, index, swap)) { |
|---|
| 1747 | + error = -EEXIST; |
|---|
| 1748 | + goto unlock; |
|---|
| 1749 | + } |
|---|
| 1750 | + if (!PageUptodate(page)) { |
|---|
| 1751 | + error = -EIO; |
|---|
| 1752 | + goto failed; |
|---|
| 1753 | + } |
|---|
| 1754 | + wait_on_page_writeback(page); |
|---|
| 1755 | + |
|---|
| 1756 | + /* |
|---|
| 1757 | + * Some architectures may have to restore extra metadata to the |
|---|
| 1758 | + * physical page after reading from swap. |
|---|
| 1759 | + */ |
|---|
| 1760 | + arch_swap_restore(swap, page); |
|---|
| 1761 | + |
|---|
| 1762 | + if (shmem_should_replace_page(page, gfp)) { |
|---|
| 1763 | + error = shmem_replace_page(&page, gfp, info, index); |
|---|
| 1764 | + if (error) |
|---|
| 1765 | + goto failed; |
|---|
| 1766 | + } |
|---|
| 1767 | + |
|---|
| 1768 | + error = shmem_add_to_page_cache(page, mapping, index, |
|---|
| 1769 | + swp_to_radix_entry(swap), gfp, |
|---|
| 1770 | + charge_mm); |
|---|
| 1771 | + if (error) |
|---|
| 1772 | + goto failed; |
|---|
| 1773 | + |
|---|
| 1774 | + spin_lock_irq(&info->lock); |
|---|
| 1775 | + info->swapped--; |
|---|
| 1776 | + shmem_recalc_inode(inode); |
|---|
| 1777 | + spin_unlock_irq(&info->lock); |
|---|
| 1778 | + |
|---|
| 1779 | + if (sgp == SGP_WRITE) |
|---|
| 1780 | + mark_page_accessed(page); |
|---|
| 1781 | + |
|---|
| 1782 | + delete_from_swap_cache(page); |
|---|
| 1783 | + set_page_dirty(page); |
|---|
| 1784 | + swap_free(swap); |
|---|
| 1785 | + |
|---|
| 1786 | + *pagep = page; |
|---|
| 1787 | + return 0; |
|---|
| 1788 | +failed: |
|---|
| 1789 | + if (!shmem_confirm_swap(mapping, index, swap)) |
|---|
| 1790 | + error = -EEXIST; |
|---|
| 1791 | +unlock: |
|---|
| 1792 | + if (page) { |
|---|
| 1793 | + unlock_page(page); |
|---|
| 1794 | + put_page(page); |
|---|
| 1795 | + } |
|---|
| 1796 | + |
|---|
| 1797 | + return error; |
|---|
| 1798 | +} |
|---|
| 1799 | + |
|---|
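The error protocol of shmem_swapin_page() is worth spelling out: every failure path re-runs shmem_confirm_swap(), and if the slot no longer holds the expected swap entry the function reports -EEXIST, which the caller treats as "rescan the slot" rather than a hard failure. The consuming side, as it appears in shmem_getpage_gfp() below, reduces to this sketch:

```c
/* Caller-side sketch of the -EEXIST retry contract. */
repeat:
	page = find_lock_entry(mapping, index);
	if (xa_is_value(page)) {	/* a swap entry, not a real page */
		error = shmem_swapin_page(inode, index, &page, sgp,
					  gfp, vma, fault_type);
		if (error == -EEXIST)	/* slot changed underneath us */
			goto repeat;
		*pagep = page;
		return error;
	}
```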
| 1800 | +/* |
|---|
| 1623 | 1801 | * shmem_getpage_gfp - find page in cache, or get from swap, or allocate |
|---|
| 1624 | 1802 | * |
|---|
| 1625 | 1803 | * If we allocate a new one we do not mark it dirty. That's up to the |
|---|
| 1626 | 1804 | * vm. If we swap it in we mark it dirty since we also free the swap |
|---|
| 1627 | 1805 | * entry since a page cannot live in both the swap and page cache. |
|---|
| 1628 | 1806 | * |
|---|
| 1629 | | - * fault_mm and fault_type are only supplied by shmem_fault: |
|---|
| 1807 | + * vma, vmf, and fault_type are only supplied by shmem_fault: |
|---|
| 1630 | 1808 | * otherwise they are NULL. |
|---|
| 1631 | 1809 | */ |
|---|
| 1632 | 1810 | static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, |
|---|
| .. | .. |
|---|
| 1638 | 1816 | struct shmem_inode_info *info = SHMEM_I(inode); |
|---|
| 1639 | 1817 | struct shmem_sb_info *sbinfo; |
|---|
| 1640 | 1818 | struct mm_struct *charge_mm; |
|---|
| 1641 | | - struct mem_cgroup *memcg; |
|---|
| 1642 | 1819 | struct page *page; |
|---|
| 1643 | | - swp_entry_t swap; |
|---|
| 1644 | 1820 | enum sgp_type sgp_huge = sgp; |
|---|
| 1645 | 1821 | pgoff_t hindex = index; |
|---|
| 1646 | 1822 | int error; |
|---|
| .. | .. |
|---|
| 1652 | 1828 | if (sgp == SGP_NOHUGE || sgp == SGP_HUGE) |
|---|
| 1653 | 1829 | sgp = SGP_CACHE; |
|---|
| 1654 | 1830 | repeat: |
|---|
| 1655 | | - swap.val = 0; |
|---|
| 1656 | | - page = find_lock_entry(mapping, index); |
|---|
| 1657 | | - if (radix_tree_exceptional_entry(page)) { |
|---|
| 1658 | | - swap = radix_to_swp_entry(page); |
|---|
| 1659 | | - page = NULL; |
|---|
| 1660 | | - } |
|---|
| 1661 | | - |
|---|
| 1662 | 1831 | if (sgp <= SGP_CACHE && |
|---|
| 1663 | 1832 | ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { |
|---|
| 1664 | | - error = -EINVAL; |
|---|
| 1665 | | - goto unlock; |
|---|
| 1833 | + return -EINVAL; |
|---|
| 1666 | 1834 | } |
|---|
| 1667 | 1835 | |
|---|
| 1836 | + sbinfo = SHMEM_SB(inode->i_sb); |
|---|
| 1837 | + charge_mm = vma ? vma->vm_mm : current->mm; |
|---|
| 1838 | + |
|---|
| 1839 | + page = find_lock_entry(mapping, index); |
|---|
| 1840 | + |
|---|
| 1841 | + if (page && vma && userfaultfd_minor(vma)) { |
|---|
| 1842 | + if (!xa_is_value(page)) { |
|---|
| 1843 | + unlock_page(page); |
|---|
| 1844 | + put_page(page); |
|---|
| 1845 | + } |
|---|
| 1846 | + *fault_type = handle_userfault(vmf, VM_UFFD_MINOR); |
|---|
| 1847 | + return 0; |
|---|
| 1848 | + } |
|---|
| 1849 | + |
|---|
| 1850 | + if (xa_is_value(page)) { |
|---|
| 1851 | + error = shmem_swapin_page(inode, index, &page, |
|---|
| 1852 | + sgp, gfp, vma, fault_type); |
|---|
| 1853 | + if (error == -EEXIST) |
|---|
| 1854 | + goto repeat; |
|---|
| 1855 | + |
|---|
| 1856 | + *pagep = page; |
|---|
| 1857 | + return error; |
|---|
| 1858 | + } |
|---|
| 1859 | + |
|---|
| 1860 | + if (page) |
|---|
| 1861 | + hindex = page->index; |
|---|
| 1668 | 1862 | if (page && sgp == SGP_WRITE) |
|---|
| 1669 | 1863 | mark_page_accessed(page); |
|---|
| 1670 | 1864 | |
|---|
| .. | .. |
|---|
| 1675 | 1869 | unlock_page(page); |
|---|
| 1676 | 1870 | put_page(page); |
|---|
| 1677 | 1871 | page = NULL; |
|---|
| 1872 | + hindex = index; |
|---|
| 1678 | 1873 | } |
|---|
| 1679 | | - if (page || (sgp == SGP_READ && !swap.val)) { |
|---|
| 1680 | | - *pagep = page; |
|---|
| 1681 | | - return 0; |
|---|
| 1682 | | - } |
|---|
| 1874 | + if (page || sgp == SGP_READ) |
|---|
| 1875 | + goto out; |
|---|
| 1683 | 1876 | |
|---|
| 1684 | 1877 | /* |
|---|
| 1685 | 1878 | * Fast cache lookup did not find it: |
|---|
| 1686 | 1879 | * bring it back from swap or allocate. |
|---|
| 1687 | 1880 | */ |
|---|
| 1688 | | - sbinfo = SHMEM_SB(inode->i_sb); |
|---|
| 1689 | | - charge_mm = vma ? vma->vm_mm : current->mm; |
|---|
| 1690 | 1881 | |
|---|
| 1691 | | - if (swap.val) { |
|---|
| 1692 | | - /* Look it up and read it in.. */ |
|---|
| 1693 | | - page = lookup_swap_cache(swap, NULL, 0); |
|---|
| 1694 | | - if (!page) { |
|---|
| 1695 | | - /* Or update major stats only when swapin succeeds?? */ |
|---|
| 1696 | | - if (fault_type) { |
|---|
| 1697 | | - *fault_type |= VM_FAULT_MAJOR; |
|---|
| 1698 | | - count_vm_event(PGMAJFAULT); |
|---|
| 1699 | | - count_memcg_event_mm(charge_mm, PGMAJFAULT); |
|---|
| 1700 | | - } |
|---|
| 1701 | | - /* Here we actually start the io */ |
|---|
| 1702 | | - page = shmem_swapin(swap, gfp, info, index); |
|---|
| 1703 | | - if (!page) { |
|---|
| 1704 | | - error = -ENOMEM; |
|---|
| 1705 | | - goto failed; |
|---|
| 1706 | | - } |
|---|
| 1707 | | - } |
|---|
| 1882 | + if (vma && userfaultfd_missing(vma)) { |
|---|
| 1883 | + *fault_type = handle_userfault(vmf, VM_UFFD_MISSING); |
|---|
| 1884 | + return 0; |
|---|
| 1885 | + } |
|---|
| 1708 | 1886 | |
|---|
| 1709 | | - /* We have to do this with page locked to prevent races */ |
|---|
| 1710 | | - lock_page(page); |
|---|
| 1711 | | - if (!PageSwapCache(page) || page_private(page) != swap.val || |
|---|
| 1712 | | - !shmem_confirm_swap(mapping, index, swap)) { |
|---|
| 1713 | | - error = -EEXIST; /* try again */ |
|---|
| 1714 | | - goto unlock; |
|---|
| 1715 | | - } |
|---|
| 1716 | | - if (!PageUptodate(page)) { |
|---|
| 1717 | | - error = -EIO; |
|---|
| 1718 | | - goto failed; |
|---|
| 1719 | | - } |
|---|
| 1720 | | - wait_on_page_writeback(page); |
|---|
| 1887 | + /* shmem_symlink() */ |
|---|
| 1888 | + if (mapping->a_ops != &shmem_aops) |
|---|
| 1889 | + goto alloc_nohuge; |
|---|
| 1890 | + if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE) |
|---|
| 1891 | + goto alloc_nohuge; |
|---|
| 1892 | + if (shmem_huge == SHMEM_HUGE_FORCE) |
|---|
| 1893 | + goto alloc_huge; |
|---|
| 1894 | + switch (sbinfo->huge) { |
|---|
| 1895 | + case SHMEM_HUGE_NEVER: |
|---|
| 1896 | + goto alloc_nohuge; |
|---|
| 1897 | + case SHMEM_HUGE_WITHIN_SIZE: { |
|---|
| 1898 | + loff_t i_size; |
|---|
| 1899 | + pgoff_t off; |
|---|
| 1721 | 1900 | |
|---|
| 1722 | | - if (shmem_should_replace_page(page, gfp)) { |
|---|
| 1723 | | - error = shmem_replace_page(&page, gfp, info, index); |
|---|
| 1724 | | - if (error) |
|---|
| 1725 | | - goto failed; |
|---|
| 1726 | | - } |
|---|
| 1727 | | - |
|---|
| 1728 | | - error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg, |
|---|
| 1729 | | - false); |
|---|
| 1730 | | - if (!error) { |
|---|
| 1731 | | - error = shmem_add_to_page_cache(page, mapping, index, |
|---|
| 1732 | | - swp_to_radix_entry(swap)); |
|---|
| 1733 | | - /* |
|---|
| 1734 | | - * We already confirmed swap under page lock, and make |
|---|
| 1735 | | - * no memory allocation here, so usually no possibility |
|---|
| 1736 | | - * of error; but free_swap_and_cache() only trylocks a |
|---|
| 1737 | | - * page, so it is just possible that the entry has been |
|---|
| 1738 | | - * truncated or holepunched since swap was confirmed. |
|---|
| 1739 | | - * shmem_undo_range() will have done some of the |
|---|
| 1740 | | - * unaccounting, now delete_from_swap_cache() will do |
|---|
| 1741 | | - * the rest. |
|---|
| 1742 | | - * Reset swap.val? No, leave it so "failed" goes back to |
|---|
| 1743 | | - * "repeat": reading a hole and writing should succeed. |
|---|
| 1744 | | - */ |
|---|
| 1745 | | - if (error) { |
|---|
| 1746 | | - mem_cgroup_cancel_charge(page, memcg, false); |
|---|
| 1747 | | - delete_from_swap_cache(page); |
|---|
| 1748 | | - } |
|---|
| 1749 | | - } |
|---|
| 1750 | | - if (error) |
|---|
| 1751 | | - goto failed; |
|---|
| 1752 | | - |
|---|
| 1753 | | - mem_cgroup_commit_charge(page, memcg, true, false); |
|---|
| 1754 | | - |
|---|
| 1755 | | - spin_lock_irq(&info->lock); |
|---|
| 1756 | | - info->swapped--; |
|---|
| 1757 | | - shmem_recalc_inode(inode); |
|---|
| 1758 | | - spin_unlock_irq(&info->lock); |
|---|
| 1759 | | - |
|---|
| 1760 | | - if (sgp == SGP_WRITE) |
|---|
| 1761 | | - mark_page_accessed(page); |
|---|
| 1762 | | - |
|---|
| 1763 | | - delete_from_swap_cache(page); |
|---|
| 1764 | | - set_page_dirty(page); |
|---|
| 1765 | | - swap_free(swap); |
|---|
| 1766 | | - |
|---|
| 1767 | | - } else { |
|---|
| 1768 | | - if (vma && userfaultfd_missing(vma)) { |
|---|
| 1769 | | - *fault_type = handle_userfault(vmf, VM_UFFD_MISSING); |
|---|
| 1770 | | - return 0; |
|---|
| 1771 | | - } |
|---|
| 1772 | | - |
|---|
| 1773 | | - /* shmem_symlink() */ |
|---|
| 1774 | | - if (mapping->a_ops != &shmem_aops) |
|---|
| 1775 | | - goto alloc_nohuge; |
|---|
| 1776 | | - if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE) |
|---|
| 1777 | | - goto alloc_nohuge; |
|---|
| 1778 | | - if (shmem_huge == SHMEM_HUGE_FORCE) |
|---|
| 1901 | + off = round_up(index, HPAGE_PMD_NR); |
|---|
| 1902 | + i_size = round_up(i_size_read(inode), PAGE_SIZE); |
|---|
| 1903 | + if (i_size >= HPAGE_PMD_SIZE && |
|---|
| 1904 | + i_size >> PAGE_SHIFT >= off) |
|---|
| 1779 | 1905 | goto alloc_huge; |
|---|
| 1780 | | - switch (sbinfo->huge) { |
|---|
| 1781 | | - loff_t i_size; |
|---|
| 1782 | | - pgoff_t off; |
|---|
| 1783 | | - case SHMEM_HUGE_NEVER: |
|---|
| 1784 | | - goto alloc_nohuge; |
|---|
| 1785 | | - case SHMEM_HUGE_WITHIN_SIZE: |
|---|
| 1786 | | - off = round_up(index, HPAGE_PMD_NR); |
|---|
| 1787 | | - i_size = round_up(i_size_read(inode), PAGE_SIZE); |
|---|
| 1788 | | - if (i_size >= HPAGE_PMD_SIZE && |
|---|
| 1789 | | - i_size >> PAGE_SHIFT >= off) |
|---|
| 1790 | | - goto alloc_huge; |
|---|
| 1791 | | - /* fallthrough */ |
|---|
| 1792 | | - case SHMEM_HUGE_ADVISE: |
|---|
| 1793 | | - if (sgp_huge == SGP_HUGE) |
|---|
| 1794 | | - goto alloc_huge; |
|---|
| 1795 | | - /* TODO: implement fadvise() hints */ |
|---|
| 1796 | | - goto alloc_nohuge; |
|---|
| 1797 | | - } |
|---|
| 1906 | + |
|---|
| 1907 | + fallthrough; |
|---|
| 1908 | + } |
|---|
| 1909 | + case SHMEM_HUGE_ADVISE: |
|---|
| 1910 | + if (sgp_huge == SGP_HUGE) |
|---|
| 1911 | + goto alloc_huge; |
|---|
| 1912 | + /* TODO: implement fadvise() hints */ |
|---|
| 1913 | + goto alloc_nohuge; |
|---|
| 1914 | + } |
|---|
| 1798 | 1915 | |
|---|
| 1799 | 1916 | alloc_huge: |
|---|
| 1800 | | - page = shmem_alloc_and_acct_page(gfp, inode, index, true); |
|---|
| 1801 | | - if (IS_ERR(page)) { |
|---|
| 1802 | | -alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, inode, |
|---|
| 1803 | | - index, false); |
|---|
| 1804 | | - } |
|---|
| 1805 | | - if (IS_ERR(page)) { |
|---|
| 1806 | | - int retry = 5; |
|---|
| 1807 | | - error = PTR_ERR(page); |
|---|
| 1808 | | - page = NULL; |
|---|
| 1809 | | - if (error != -ENOSPC) |
|---|
| 1810 | | - goto failed; |
|---|
| 1811 | | - /* |
|---|
| 1812 | | - * Try to reclaim some spece by splitting a huge page |
|---|
| 1813 | | - * beyond i_size on the filesystem. |
|---|
| 1814 | | - */ |
|---|
| 1815 | | - while (retry--) { |
|---|
| 1816 | | - int ret; |
|---|
| 1817 | | - ret = shmem_unused_huge_shrink(sbinfo, NULL, 1); |
|---|
| 1818 | | - if (ret == SHRINK_STOP) |
|---|
| 1819 | | - break; |
|---|
| 1820 | | - if (ret) |
|---|
| 1821 | | - goto alloc_nohuge; |
|---|
| 1822 | | - } |
|---|
| 1823 | | - goto failed; |
|---|
| 1824 | | - } |
|---|
| 1917 | + page = shmem_alloc_and_acct_page(gfp, inode, index, true); |
|---|
| 1918 | + if (IS_ERR(page)) { |
|---|
| 1919 | +alloc_nohuge: |
|---|
| 1920 | + page = shmem_alloc_and_acct_page(gfp, inode, |
|---|
| 1921 | + index, false); |
|---|
| 1922 | + } |
|---|
| 1923 | + if (IS_ERR(page)) { |
|---|
| 1924 | + int retry = 5; |
|---|
| 1825 | 1925 | |
|---|
| 1826 | | - if (PageTransHuge(page)) |
|---|
| 1827 | | - hindex = round_down(index, HPAGE_PMD_NR); |
|---|
| 1828 | | - else |
|---|
| 1829 | | - hindex = index; |
|---|
| 1830 | | - |
|---|
| 1831 | | - if (sgp == SGP_WRITE) |
|---|
| 1832 | | - __SetPageReferenced(page); |
|---|
| 1833 | | - |
|---|
| 1834 | | - error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg, |
|---|
| 1835 | | - PageTransHuge(page)); |
|---|
| 1836 | | - if (error) |
|---|
| 1837 | | - goto unacct; |
|---|
| 1838 | | - error = radix_tree_maybe_preload_order(gfp & GFP_RECLAIM_MASK, |
|---|
| 1839 | | - compound_order(page)); |
|---|
| 1840 | | - if (!error) { |
|---|
| 1841 | | - error = shmem_add_to_page_cache(page, mapping, hindex, |
|---|
| 1842 | | - NULL); |
|---|
| 1843 | | - radix_tree_preload_end(); |
|---|
| 1844 | | - } |
|---|
| 1845 | | - if (error) { |
|---|
| 1846 | | - mem_cgroup_cancel_charge(page, memcg, |
|---|
| 1847 | | - PageTransHuge(page)); |
|---|
| 1848 | | - goto unacct; |
|---|
| 1849 | | - } |
|---|
| 1850 | | - mem_cgroup_commit_charge(page, memcg, false, |
|---|
| 1851 | | - PageTransHuge(page)); |
|---|
| 1852 | | - lru_cache_add_anon(page); |
|---|
| 1853 | | - |
|---|
| 1854 | | - spin_lock_irq(&info->lock); |
|---|
| 1855 | | - info->alloced += 1 << compound_order(page); |
|---|
| 1856 | | - inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page); |
|---|
| 1857 | | - shmem_recalc_inode(inode); |
|---|
| 1858 | | - spin_unlock_irq(&info->lock); |
|---|
| 1859 | | - alloced = true; |
|---|
| 1860 | | - |
|---|
| 1861 | | - if (PageTransHuge(page) && |
|---|
| 1862 | | - DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) < |
|---|
| 1863 | | - hindex + HPAGE_PMD_NR - 1) { |
|---|
| 1864 | | - /* |
|---|
| 1865 | | - * Part of the huge page is beyond i_size: subject |
|---|
| 1866 | | - * to shrink under memory pressure. |
|---|
| 1867 | | - */ |
|---|
| 1868 | | - spin_lock(&sbinfo->shrinklist_lock); |
|---|
| 1869 | | - /* |
|---|
| 1870 | | - * _careful to defend against unlocked access to |
|---|
| 1871 | | - * ->shrink_list in shmem_unused_huge_shrink() |
|---|
| 1872 | | - */ |
|---|
| 1873 | | - if (list_empty_careful(&info->shrinklist)) { |
|---|
| 1874 | | - list_add_tail(&info->shrinklist, |
|---|
| 1875 | | - &sbinfo->shrinklist); |
|---|
| 1876 | | - sbinfo->shrinklist_len++; |
|---|
| 1877 | | - } |
|---|
| 1878 | | - spin_unlock(&sbinfo->shrinklist_lock); |
|---|
| 1879 | | - } |
|---|
| 1880 | | - |
|---|
| 1926 | + error = PTR_ERR(page); |
|---|
| 1927 | + page = NULL; |
|---|
| 1928 | + if (error != -ENOSPC) |
|---|
| 1929 | + goto unlock; |
|---|
| 1881 | 1930 | /* |
|---|
| 1882 | | - * Let SGP_FALLOC use the SGP_WRITE optimization on a new page. |
|---|
| 1931 | + * Try to reclaim some space by splitting a huge page |
|---|
| 1932 | + * beyond i_size on the filesystem. |
|---|
| 1883 | 1933 | */ |
|---|
| 1884 | | - if (sgp == SGP_FALLOC) |
|---|
| 1885 | | - sgp = SGP_WRITE; |
|---|
| 1934 | + while (retry--) { |
|---|
| 1935 | + int ret; |
|---|
| 1936 | + |
|---|
| 1937 | + ret = shmem_unused_huge_shrink(sbinfo, NULL, 1); |
|---|
| 1938 | + if (ret == SHRINK_STOP) |
|---|
| 1939 | + break; |
|---|
| 1940 | + if (ret) |
|---|
| 1941 | + goto alloc_nohuge; |
|---|
| 1942 | + } |
|---|
| 1943 | + goto unlock; |
|---|
| 1944 | + } |
|---|
| 1945 | + |
|---|
| 1946 | + if (PageTransHuge(page)) |
|---|
| 1947 | + hindex = round_down(index, HPAGE_PMD_NR); |
|---|
| 1948 | + else |
|---|
| 1949 | + hindex = index; |
|---|
| 1950 | + |
|---|
| 1951 | + if (sgp == SGP_WRITE) |
|---|
| 1952 | + __SetPageReferenced(page); |
|---|
| 1953 | + |
|---|
| 1954 | + error = shmem_add_to_page_cache(page, mapping, hindex, |
|---|
| 1955 | + NULL, gfp & GFP_RECLAIM_MASK, |
|---|
| 1956 | + charge_mm); |
|---|
| 1957 | + if (error) |
|---|
| 1958 | + goto unacct; |
|---|
| 1959 | + lru_cache_add(page); |
|---|
| 1960 | + |
|---|
| 1961 | + spin_lock_irq(&info->lock); |
|---|
| 1962 | + info->alloced += compound_nr(page); |
|---|
| 1963 | + inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page); |
|---|
| 1964 | + shmem_recalc_inode(inode); |
|---|
| 1965 | + spin_unlock_irq(&info->lock); |
|---|
| 1966 | + alloced = true; |
|---|
| 1967 | + |
|---|
| 1968 | + if (PageTransHuge(page) && |
|---|
| 1969 | + DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) < |
|---|
| 1970 | + hindex + HPAGE_PMD_NR - 1) { |
|---|
| 1971 | + /* |
|---|
| 1972 | + * Part of the huge page is beyond i_size: subject |
|---|
| 1973 | + * to shrink under memory pressure. |
|---|
| 1974 | + */ |
|---|
| 1975 | + spin_lock(&sbinfo->shrinklist_lock); |
|---|
| 1976 | + /* |
|---|
| 1977 | + * _careful to defend against unlocked access to |
|---|
| 1978 | + * ->shrink_list in shmem_unused_huge_shrink() |
|---|
| 1979 | + */ |
|---|
| 1980 | + if (list_empty_careful(&info->shrinklist)) { |
|---|
| 1981 | + list_add_tail(&info->shrinklist, |
|---|
| 1982 | + &sbinfo->shrinklist); |
|---|
| 1983 | + sbinfo->shrinklist_len++; |
|---|
| 1984 | + } |
|---|
| 1985 | + spin_unlock(&sbinfo->shrinklist_lock); |
|---|
| 1986 | + } |
|---|
| 1987 | + |
|---|
| 1988 | + /* |
|---|
| 1989 | + * Let SGP_FALLOC use the SGP_WRITE optimization on a new page. |
|---|
| 1990 | + */ |
|---|
| 1991 | + if (sgp == SGP_FALLOC) |
|---|
| 1992 | + sgp = SGP_WRITE; |
|---|
| 1886 | 1993 | clear: |
|---|
| 1887 | | - /* |
|---|
| 1888 | | - * Let SGP_WRITE caller clear ends if write does not fill page; |
|---|
| 1889 | | - * but SGP_FALLOC on a page fallocated earlier must initialize |
|---|
| 1890 | | - * it now, lest undo on failure cancel our earlier guarantee. |
|---|
| 1891 | | - */ |
|---|
| 1892 | | - if (sgp != SGP_WRITE && !PageUptodate(page)) { |
|---|
| 1893 | | - struct page *head = compound_head(page); |
|---|
| 1894 | | - int i; |
|---|
| 1994 | + /* |
|---|
| 1995 | + * Let SGP_WRITE caller clear ends if write does not fill page; |
|---|
| 1996 | + * but SGP_FALLOC on a page fallocated earlier must initialize |
|---|
| 1997 | + * it now, lest undo on failure cancel our earlier guarantee. |
|---|
| 1998 | + */ |
|---|
| 1999 | + if (sgp != SGP_WRITE && !PageUptodate(page)) { |
|---|
| 2000 | + int i; |
|---|
| 1895 | 2001 | |
|---|
| 1896 | | - for (i = 0; i < (1 << compound_order(head)); i++) { |
|---|
| 1897 | | - clear_highpage(head + i); |
|---|
| 1898 | | - flush_dcache_page(head + i); |
|---|
| 1899 | | - } |
|---|
| 1900 | | - SetPageUptodate(head); |
|---|
| 2002 | + for (i = 0; i < compound_nr(page); i++) { |
|---|
| 2003 | + clear_highpage(page + i); |
|---|
| 2004 | + flush_dcache_page(page + i); |
|---|
| 1901 | 2005 | } |
|---|
| 2006 | + SetPageUptodate(page); |
|---|
| 1902 | 2007 | } |
|---|
| 1903 | 2008 | |
|---|
| 1904 | 2009 | /* Perhaps the file has been truncated since we checked */ |
|---|
| .. | .. |
|---|
| 1914 | 2019 | error = -EINVAL; |
|---|
| 1915 | 2020 | goto unlock; |
|---|
| 1916 | 2021 | } |
|---|
| 2022 | +out: |
|---|
| 1917 | 2023 | *pagep = page + index - hindex; |
|---|
| 1918 | 2024 | return 0; |
|---|
| 1919 | 2025 | |
|---|
| .. | .. |
|---|
| 1921 | 2027 | * Error recovery. |
|---|
| 1922 | 2028 | */ |
|---|
| 1923 | 2029 | unacct: |
|---|
| 1924 | | - shmem_inode_unacct_blocks(inode, 1 << compound_order(page)); |
|---|
| 2030 | + shmem_inode_unacct_blocks(inode, compound_nr(page)); |
|---|
| 1925 | 2031 | |
|---|
| 1926 | 2032 | if (PageTransHuge(page)) { |
|---|
| 1927 | 2033 | unlock_page(page); |
|---|
| 1928 | 2034 | put_page(page); |
|---|
| 1929 | 2035 | goto alloc_nohuge; |
|---|
| 1930 | 2036 | } |
|---|
| 1931 | | -failed: |
|---|
| 1932 | | - if (swap.val && !shmem_confirm_swap(mapping, index, swap)) |
|---|
| 1933 | | - error = -EEXIST; |
|---|
| 1934 | 2037 | unlock: |
|---|
| 1935 | 2038 | if (page) { |
|---|
| 1936 | 2039 | unlock_page(page); |
|---|
| .. | .. |
|---|
| 1942 | 2045 | spin_unlock_irq(&info->lock); |
|---|
| 1943 | 2046 | goto repeat; |
|---|
| 1944 | 2047 | } |
|---|
| 1945 | | - if (error == -EEXIST) /* from above or from radix_tree_insert */ |
|---|
| 2048 | + if (error == -EEXIST) |
|---|
| 1946 | 2049 | goto repeat; |
|---|
| 1947 | 2050 | return error; |
|---|
| 1948 | 2051 | } |
|---|
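
The reworked `SHMEM_HUGE_WITHIN_SIZE` branch above allocates a huge page only when the page-rounded `i_size` reaches the end of the PMD-sized block containing the faulting index. A minimal userspace sketch of that test, with illustrative 4K-page/2M-PMD constants; the helper names here are ours, not the kernel's:

```c
#include <stdbool.h>
#include <stdint.h>

#define PAGE_SIZE      4096ULL
#define PAGE_SHIFT     12
#define HPAGE_PMD_NR   512ULL                 /* 2M huge page / 4K base page */
#define HPAGE_PMD_SIZE (HPAGE_PMD_NR * PAGE_SIZE)

/* round_up() equivalent for the sketch */
static uint64_t round_up_u64(uint64_t x, uint64_t to)
{
	return (x + to - 1) / to * to;
}

/*
 * Mirrors the SHMEM_HUGE_WITHIN_SIZE test: allocate huge only if
 * i_size (rounded up to a page) covers index's whole PMD-sized block.
 */
static bool within_size_wants_huge(uint64_t index, uint64_t i_size)
{
	uint64_t off = round_up_u64(index, HPAGE_PMD_NR);  /* in pages */

	i_size = round_up_u64(i_size, PAGE_SIZE);          /* in bytes */
	return i_size >= HPAGE_PMD_SIZE && (i_size >> PAGE_SHIFT) >= off;
}
```
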
| .. | .. |
|---|
| 1994 | 2097 | shmem_falloc->waitq && |
|---|
| 1995 | 2098 | vmf->pgoff >= shmem_falloc->start && |
|---|
| 1996 | 2099 | vmf->pgoff < shmem_falloc->next) { |
|---|
| 2100 | + struct file *fpin; |
|---|
| 1997 | 2101 | wait_queue_head_t *shmem_falloc_waitq; |
|---|
| 1998 | 2102 | DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function); |
|---|
| 1999 | 2103 | |
|---|
| 2000 | 2104 | ret = VM_FAULT_NOPAGE; |
|---|
| 2001 | | - if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) && |
|---|
| 2002 | | - !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) { |
|---|
| 2003 | | - /* It's polite to up mmap_sem if we can */ |
|---|
| 2004 | | - up_read(&vma->vm_mm->mmap_sem); |
|---|
| 2105 | + fpin = maybe_unlock_mmap_for_io(vmf, NULL); |
|---|
| 2106 | + if (fpin) |
|---|
| 2005 | 2107 | ret = VM_FAULT_RETRY; |
|---|
| 2006 | | - } |
|---|
| 2007 | 2108 | |
|---|
| 2008 | 2109 | shmem_falloc_waitq = shmem_falloc->waitq; |
|---|
| 2009 | 2110 | prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait, |
|---|
| .. | .. |
|---|
| 2021 | 2122 | spin_lock(&inode->i_lock); |
|---|
| 2022 | 2123 | finish_wait(shmem_falloc_waitq, &shmem_fault_wait); |
|---|
| 2023 | 2124 | spin_unlock(&inode->i_lock); |
|---|
| 2125 | + |
|---|
| 2126 | + if (fpin) |
|---|
| 2127 | + fput(fpin); |
|---|
| 2024 | 2128 | return ret; |
|---|
| 2025 | 2129 | } |
|---|
| 2026 | 2130 | spin_unlock(&inode->i_lock); |
|---|
| .. | .. |
|---|
| 2059 | 2163 | get_area = current->mm->get_unmapped_area; |
|---|
| 2060 | 2164 | addr = get_area(file, uaddr, len, pgoff, flags); |
|---|
| 2061 | 2165 | |
|---|
| 2062 | | - if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) |
|---|
| 2166 | + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) |
|---|
| 2063 | 2167 | return addr; |
|---|
| 2064 | 2168 | if (IS_ERR_VALUE(addr)) |
|---|
| 2065 | 2169 | return addr; |
|---|
| .. | .. |
|---|
| 2179 | 2283 | static int shmem_mmap(struct file *file, struct vm_area_struct *vma) |
|---|
| 2180 | 2284 | { |
|---|
| 2181 | 2285 | struct shmem_inode_info *info = SHMEM_I(file_inode(file)); |
|---|
| 2286 | + int ret; |
|---|
| 2182 | 2287 | |
|---|
| 2183 | | - if (info->seals & F_SEAL_FUTURE_WRITE) { |
|---|
| 2184 | | - /* |
|---|
| 2185 | | - * New PROT_WRITE and MAP_SHARED mmaps are not allowed when |
|---|
| 2186 | | - * "future write" seal active. |
|---|
| 2187 | | - */ |
|---|
| 2188 | | - if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) |
|---|
| 2189 | | - return -EPERM; |
|---|
| 2288 | + ret = seal_check_future_write(info->seals, vma); |
|---|
| 2289 | + if (ret) |
|---|
| 2290 | + return ret; |
|---|
| 2190 | 2291 | |
|---|
| 2191 | | - /* |
|---|
| 2192 | | - * Since the F_SEAL_FUTURE_WRITE seals allow for a MAP_SHARED |
|---|
| 2193 | | - * read-only mapping, take care to not allow mprotect to revert |
|---|
| 2194 | | - * protections. |
|---|
| 2195 | | - */ |
|---|
| 2196 | | - vma->vm_flags &= ~(VM_MAYWRITE); |
|---|
| 2197 | | - } |
|---|
| 2292 | + /* arm64 - allow memory tagging on RAM-based files */ |
|---|
| 2293 | + vma->vm_flags |= VM_MTE_ALLOWED; |
|---|
| 2198 | 2294 | |
|---|
| 2199 | 2295 | file_accessed(file); |
|---|
| 2200 | 2296 | vma->vm_ops = &shmem_vm_ops; |
|---|
| 2201 | | - if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && |
|---|
| 2297 | + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && |
|---|
| 2202 | 2298 | ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) < |
|---|
| 2203 | 2299 | (vma->vm_end & HPAGE_PMD_MASK)) { |
|---|
| 2204 | 2300 | khugepaged_enter(vma, vma->vm_flags); |
|---|
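
The open-coded `F_SEAL_FUTURE_WRITE` checks in `shmem_mmap()` are replaced by the shared `seal_check_future_write()` helper. From userspace, the behaviour it enforces looks like this hedged example, assuming a libc that exposes `memfd_create()` and `F_SEAL_FUTURE_WRITE`:

```c
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = memfd_create("sealed", MFD_ALLOW_SEALING);

	if (fd < 0 || ftruncate(fd, 4096) < 0)
		return 1;
	if (fcntl(fd, F_ADD_SEALS, F_SEAL_FUTURE_WRITE) < 0)
		return 1;

	/* Must fail with EPERM: new writable shared mapping after the seal. */
	if (mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0) ==
	    MAP_FAILED)
		perror("mmap(PROT_WRITE)");

	/* Still fine: read-only shared mapping (VM_MAYWRITE is cleared). */
	if (mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0) != MAP_FAILED)
		puts("read-only mapping OK");

	close(fd);
	return 0;
}
```
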
| .. | .. |
|---|
| 2212 | 2308 | struct inode *inode; |
|---|
| 2213 | 2309 | struct shmem_inode_info *info; |
|---|
| 2214 | 2310 | struct shmem_sb_info *sbinfo = SHMEM_SB(sb); |
|---|
| 2311 | + ino_t ino; |
|---|
| 2215 | 2312 | |
|---|
| 2216 | | - if (shmem_reserve_inode(sb)) |
|---|
| 2313 | + if (shmem_reserve_inode(sb, &ino)) |
|---|
| 2217 | 2314 | return NULL; |
|---|
| 2218 | 2315 | |
|---|
| 2219 | 2316 | inode = new_inode(sb); |
|---|
| 2220 | 2317 | if (inode) { |
|---|
| 2221 | | - inode->i_ino = get_next_ino(); |
|---|
| 2318 | + inode->i_ino = ino; |
|---|
| 2222 | 2319 | inode_init_owner(inode, dir, mode); |
|---|
| 2223 | 2320 | inode->i_blocks = 0; |
|---|
| 2224 | 2321 | inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); |
|---|
| .. | .. |
|---|
| 2226 | 2323 | info = SHMEM_I(inode); |
|---|
| 2227 | 2324 | memset(info, 0, (char *)inode - (char *)info); |
|---|
| 2228 | 2325 | spin_lock_init(&info->lock); |
|---|
| 2326 | + atomic_set(&info->stop_eviction, 0); |
|---|
| 2229 | 2327 | info->seals = F_SEAL_SEAL; |
|---|
| 2230 | 2328 | info->flags = flags & VM_NORESERVE; |
|---|
| 2231 | 2329 | INIT_LIST_HEAD(&info->shrinklist); |
|---|
| .. | .. |
|---|
| 2272 | 2370 | return mapping->a_ops == &shmem_aops; |
|---|
| 2273 | 2371 | } |
|---|
| 2274 | 2372 | |
|---|
| 2275 | | -static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, |
|---|
| 2276 | | - pmd_t *dst_pmd, |
|---|
| 2277 | | - struct vm_area_struct *dst_vma, |
|---|
| 2278 | | - unsigned long dst_addr, |
|---|
| 2279 | | - unsigned long src_addr, |
|---|
| 2280 | | - bool zeropage, |
|---|
| 2281 | | - struct page **pagep) |
|---|
| 2373 | +#ifdef CONFIG_USERFAULTFD |
|---|
| 2374 | +int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, |
|---|
| 2375 | + pmd_t *dst_pmd, |
|---|
| 2376 | + struct vm_area_struct *dst_vma, |
|---|
| 2377 | + unsigned long dst_addr, |
|---|
| 2378 | + unsigned long src_addr, |
|---|
| 2379 | + bool zeropage, |
|---|
| 2380 | + struct page **pagep) |
|---|
| 2282 | 2381 | { |
|---|
| 2283 | 2382 | struct inode *inode = file_inode(dst_vma->vm_file); |
|---|
| 2284 | 2383 | struct shmem_inode_info *info = SHMEM_I(inode); |
|---|
| 2285 | 2384 | struct address_space *mapping = inode->i_mapping; |
|---|
| 2286 | 2385 | gfp_t gfp = mapping_gfp_mask(mapping); |
|---|
| 2287 | 2386 | pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); |
|---|
| 2288 | | - struct mem_cgroup *memcg; |
|---|
| 2289 | | - spinlock_t *ptl; |
|---|
| 2290 | 2387 | void *page_kaddr; |
|---|
| 2291 | 2388 | struct page *page; |
|---|
| 2292 | | - pte_t _dst_pte, *dst_pte; |
|---|
| 2293 | 2389 | int ret; |
|---|
| 2294 | | - pgoff_t offset, max_off; |
|---|
| 2390 | + pgoff_t max_off; |
|---|
| 2295 | 2391 | |
|---|
| 2296 | | - ret = -ENOMEM; |
|---|
| 2297 | 2392 | if (!shmem_inode_acct_block(inode, 1)) { |
|---|
| 2298 | 2393 | /* |
|---|
| 2299 | 2394 | * We may have got a page, returned -ENOENT triggering a retry, |
|---|
| .. | .. |
|---|
| 2304 | 2399 | put_page(*pagep); |
|---|
| 2305 | 2400 | *pagep = NULL; |
|---|
| 2306 | 2401 | } |
|---|
| 2307 | | - goto out; |
|---|
| 2402 | + return -ENOMEM; |
|---|
| 2308 | 2403 | } |
|---|
| 2309 | 2404 | |
|---|
| 2310 | 2405 | if (!*pagep) { |
|---|
| 2406 | + ret = -ENOMEM; |
|---|
| 2311 | 2407 | page = shmem_alloc_page(gfp, info, pgoff); |
|---|
| 2312 | 2408 | if (!page) |
|---|
| 2313 | 2409 | goto out_unacct_blocks; |
|---|
| 2314 | 2410 | |
|---|
| 2315 | | - if (!zeropage) { /* mcopy_atomic */ |
|---|
| 2411 | + if (!zeropage) { /* COPY */ |
|---|
| 2316 | 2412 | page_kaddr = kmap_atomic(page); |
|---|
| 2317 | 2413 | ret = copy_from_user(page_kaddr, |
|---|
| 2318 | 2414 | (const void __user *)src_addr, |
|---|
| 2319 | 2415 | PAGE_SIZE); |
|---|
| 2320 | 2416 | kunmap_atomic(page_kaddr); |
|---|
| 2321 | 2417 | |
|---|
| 2322 | | - /* fallback to copy_from_user outside mmap_sem */ |
|---|
| 2418 | + /* fallback to copy_from_user outside mmap_lock */ |
|---|
| 2323 | 2419 | if (unlikely(ret)) { |
|---|
| 2324 | 2420 | *pagep = page; |
|---|
| 2325 | | - shmem_inode_unacct_blocks(inode, 1); |
|---|
| 2421 | + ret = -ENOENT; |
|---|
| 2326 | 2422 | /* don't free the page */ |
|---|
| 2327 | | - return -ENOENT; |
|---|
| 2423 | + goto out_unacct_blocks; |
|---|
| 2328 | 2424 | } |
|---|
| 2329 | | - } else { /* mfill_zeropage_atomic */ |
|---|
| 2425 | + } else { /* ZEROPAGE */ |
|---|
| 2330 | 2426 | clear_highpage(page); |
|---|
| 2331 | 2427 | } |
|---|
| 2332 | 2428 | } else { |
|---|
| .. | .. |
|---|
| 2334 | 2430 | *pagep = NULL; |
|---|
| 2335 | 2431 | } |
|---|
| 2336 | 2432 | |
|---|
| 2337 | | - VM_BUG_ON(PageLocked(page) || PageSwapBacked(page)); |
|---|
| 2433 | + VM_BUG_ON(PageLocked(page)); |
|---|
| 2434 | + VM_BUG_ON(PageSwapBacked(page)); |
|---|
| 2338 | 2435 | __SetPageLocked(page); |
|---|
| 2339 | 2436 | __SetPageSwapBacked(page); |
|---|
| 2340 | 2437 | __SetPageUptodate(page); |
|---|
| 2341 | 2438 | |
|---|
| 2342 | 2439 | ret = -EFAULT; |
|---|
| 2343 | | - offset = linear_page_index(dst_vma, dst_addr); |
|---|
| 2344 | 2440 | max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); |
|---|
| 2345 | | - if (unlikely(offset >= max_off)) |
|---|
| 2441 | + if (unlikely(pgoff >= max_off)) |
|---|
| 2346 | 2442 | goto out_release; |
|---|
| 2347 | 2443 | |
|---|
| 2348 | | - ret = mem_cgroup_try_charge_delay(page, dst_mm, gfp, &memcg, false); |
|---|
| 2444 | + ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL, |
|---|
| 2445 | + gfp & GFP_RECLAIM_MASK, dst_mm); |
|---|
| 2349 | 2446 | if (ret) |
|---|
| 2350 | 2447 | goto out_release; |
|---|
| 2351 | 2448 | |
|---|
| 2352 | | - ret = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK); |
|---|
| 2353 | | - if (!ret) { |
|---|
| 2354 | | - ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL); |
|---|
| 2355 | | - radix_tree_preload_end(); |
|---|
| 2356 | | - } |
|---|
| 2449 | + ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr, |
|---|
| 2450 | + page, true, false); |
|---|
| 2357 | 2451 | if (ret) |
|---|
| 2358 | | - goto out_release_uncharge; |
|---|
| 2359 | | - |
|---|
| 2360 | | - mem_cgroup_commit_charge(page, memcg, false, false); |
|---|
| 2361 | | - |
|---|
| 2362 | | - _dst_pte = mk_pte(page, dst_vma->vm_page_prot); |
|---|
| 2363 | | - if (dst_vma->vm_flags & VM_WRITE) |
|---|
| 2364 | | - _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte)); |
|---|
| 2365 | | - else { |
|---|
| 2366 | | - /* |
|---|
| 2367 | | - * We don't set the pte dirty if the vma has no |
|---|
| 2368 | | - * VM_WRITE permission, so mark the page dirty or it |
|---|
| 2369 | | - * could be freed from under us. We could do it |
|---|
| 2370 | | - * unconditionally before unlock_page(), but doing it |
|---|
| 2371 | | - * only if VM_WRITE is not set is faster. |
|---|
| 2372 | | - */ |
|---|
| 2373 | | - set_page_dirty(page); |
|---|
| 2374 | | - } |
|---|
| 2375 | | - |
|---|
| 2376 | | - dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); |
|---|
| 2377 | | - |
|---|
| 2378 | | - ret = -EFAULT; |
|---|
| 2379 | | - max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); |
|---|
| 2380 | | - if (unlikely(offset >= max_off)) |
|---|
| 2381 | | - goto out_release_uncharge_unlock; |
|---|
| 2382 | | - |
|---|
| 2383 | | - ret = -EEXIST; |
|---|
| 2384 | | - if (!pte_none(*dst_pte)) |
|---|
| 2385 | | - goto out_release_uncharge_unlock; |
|---|
| 2386 | | - |
|---|
| 2387 | | - lru_cache_add_anon(page); |
|---|
| 2452 | + goto out_delete_from_cache; |
|---|
| 2388 | 2453 | |
|---|
| 2389 | 2454 | spin_lock_irq(&info->lock); |
|---|
| 2390 | 2455 | info->alloced++; |
|---|
| .. | .. |
|---|
| 2392 | 2457 | shmem_recalc_inode(inode); |
|---|
| 2393 | 2458 | spin_unlock_irq(&info->lock); |
|---|
| 2394 | 2459 | |
|---|
| 2395 | | - inc_mm_counter(dst_mm, mm_counter_file(page)); |
|---|
| 2396 | | - page_add_file_rmap(page, false); |
|---|
| 2397 | | - set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); |
|---|
| 2398 | | - |
|---|
| 2399 | | - /* No need to invalidate - it was non-present before */ |
|---|
| 2400 | | - update_mmu_cache(dst_vma, dst_addr, dst_pte); |
|---|
| 2401 | | - pte_unmap_unlock(dst_pte, ptl); |
|---|
| 2460 | + SetPageDirty(page); |
|---|
| 2402 | 2461 | unlock_page(page); |
|---|
| 2403 | | - ret = 0; |
|---|
| 2404 | | -out: |
|---|
| 2405 | | - return ret; |
|---|
| 2406 | | -out_release_uncharge_unlock: |
|---|
| 2407 | | - pte_unmap_unlock(dst_pte, ptl); |
|---|
| 2408 | | - ClearPageDirty(page); |
|---|
| 2462 | + return 0; |
|---|
| 2463 | +out_delete_from_cache: |
|---|
| 2409 | 2464 | delete_from_page_cache(page); |
|---|
| 2410 | | -out_release_uncharge: |
|---|
| 2411 | | - mem_cgroup_cancel_charge(page, memcg, false); |
|---|
| 2412 | 2465 | out_release: |
|---|
| 2413 | 2466 | unlock_page(page); |
|---|
| 2414 | 2467 | put_page(page); |
|---|
| 2415 | 2468 | out_unacct_blocks: |
|---|
| 2416 | 2469 | shmem_inode_unacct_blocks(inode, 1); |
|---|
| 2417 | | - goto out; |
|---|
| 2470 | + return ret; |
|---|
| 2418 | 2471 | } |
|---|
| 2419 | | - |
|---|
| 2420 | | -int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, |
|---|
| 2421 | | - pmd_t *dst_pmd, |
|---|
| 2422 | | - struct vm_area_struct *dst_vma, |
|---|
| 2423 | | - unsigned long dst_addr, |
|---|
| 2424 | | - unsigned long src_addr, |
|---|
| 2425 | | - struct page **pagep) |
|---|
| 2426 | | -{ |
|---|
| 2427 | | - return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, |
|---|
| 2428 | | - dst_addr, src_addr, false, pagep); |
|---|
| 2429 | | -} |
|---|
| 2430 | | - |
|---|
| 2431 | | -int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm, |
|---|
| 2432 | | - pmd_t *dst_pmd, |
|---|
| 2433 | | - struct vm_area_struct *dst_vma, |
|---|
| 2434 | | - unsigned long dst_addr) |
|---|
| 2435 | | -{ |
|---|
| 2436 | | - struct page *page = NULL; |
|---|
| 2437 | | - |
|---|
| 2438 | | - return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, |
|---|
| 2439 | | - dst_addr, 0, true, &page); |
|---|
| 2440 | | -} |
|---|
| 2472 | +#endif /* CONFIG_USERFAULTFD */ |
|---|
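
`shmem_mfill_atomic_pte()` is now built only under `CONFIG_USERFAULTFD` and installs the PTE through `mfill_atomic_install_pte()` instead of open-coding it. A hedged sketch of the userspace call that reaches this path; `uffd` is assumed to be a userfaultfd already registered in missing mode over the destination range:

```c
#include <linux/userfaultfd.h>
#include <sys/ioctl.h>

static long resolve_missing(int uffd, unsigned long dst,
			    void *src, unsigned long len)
{
	struct uffdio_copy copy = {
		.dst  = dst,                   /* page-aligned fault address */
		.src  = (unsigned long)src,
		.len  = len,                   /* multiple of the page size */
		.mode = 0,
	};

	/* Lands in shmem_mfill_atomic_pte() with zeropage == false;
	 * UFFDIO_ZEROPAGE would take the zeropage == true branch. */
	return ioctl(uffd, UFFDIO_COPY, &copy);
}
```
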
| 2441 | 2473 | |
|---|
| 2442 | 2474 | #ifdef CONFIG_TMPFS |
|---|
| 2443 | 2475 | static const struct inode_operations shmem_symlink_inode_operations; |
|---|
| .. | .. |
|---|
| 2617 | 2649 | } |
|---|
| 2618 | 2650 | |
|---|
| 2619 | 2651 | /* |
|---|
| 2620 | | - * llseek SEEK_DATA or SEEK_HOLE through the radix_tree. |
|---|
| 2652 | + * llseek SEEK_DATA or SEEK_HOLE through the page cache. |
|---|
| 2621 | 2653 | */ |
|---|
| 2622 | 2654 | static pgoff_t shmem_seek_hole_data(struct address_space *mapping, |
|---|
| 2623 | 2655 | pgoff_t index, pgoff_t end, int whence) |
|---|
| .. | .. |
|---|
| 2647 | 2679 | index = indices[i]; |
|---|
| 2648 | 2680 | } |
|---|
| 2649 | 2681 | page = pvec.pages[i]; |
|---|
| 2650 | | - if (page && !radix_tree_exceptional_entry(page)) { |
|---|
| 2682 | + if (page && !xa_is_value(page)) { |
|---|
| 2651 | 2683 | if (!PageUptodate(page)) |
|---|
| 2652 | 2684 | page = NULL; |
|---|
| 2653 | 2685 | } |
|---|
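
The seek code now tests `xa_is_value()` for swap entries instead of radix-tree exceptional entries; either way, swapped-out pages still count as data. A short illustration of the user-visible interface (`SEEK_DATA`/`SEEK_HOLE` need `_GNU_SOURCE`):

```c
#define _GNU_SOURCE
#include <unistd.h>

/* Offset of the first data at or past 'from', or -1 with errno ENXIO
 * when the rest of the file is one hole. */
static off_t next_data(int fd, off_t from)
{
	return lseek(fd, from, SEEK_DATA);
}
```
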
| .. | .. |
|---|
| 2943 | 2975 | * first link must skip that, to get the accounting right. |
|---|
| 2944 | 2976 | */ |
|---|
| 2945 | 2977 | if (inode->i_nlink) { |
|---|
| 2946 | | - ret = shmem_reserve_inode(inode->i_sb); |
|---|
| 2978 | + ret = shmem_reserve_inode(inode->i_sb, NULL); |
|---|
| 2947 | 2979 | if (ret) |
|---|
| 2948 | 2980 | goto out; |
|---|
| 2949 | 2981 | } |
|---|
| .. | .. |
|---|
| 3095 | 3127 | |
|---|
| 3096 | 3128 | error = security_inode_init_security(inode, dir, &dentry->d_name, |
|---|
| 3097 | 3129 | shmem_initxattrs, NULL); |
|---|
| 3098 | | - if (error) { |
|---|
| 3099 | | - if (error != -EOPNOTSUPP) { |
|---|
| 3100 | | - iput(inode); |
|---|
| 3101 | | - return error; |
|---|
| 3102 | | - } |
|---|
| 3103 | | - error = 0; |
|---|
| 3130 | + if (error && error != -EOPNOTSUPP) { |
|---|
| 3131 | + iput(inode); |
|---|
| 3132 | + return error; |
|---|
| 3104 | 3133 | } |
|---|
| 3105 | 3134 | |
|---|
| 3106 | 3135 | inode->i_size = len-1; |
|---|
| .. | .. |
|---|
| 3192 | 3221 | new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len, |
|---|
| 3193 | 3222 | GFP_KERNEL); |
|---|
| 3194 | 3223 | if (!new_xattr->name) { |
|---|
| 3195 | | - kfree(new_xattr); |
|---|
| 3224 | + kvfree(new_xattr); |
|---|
| 3196 | 3225 | return -ENOMEM; |
|---|
| 3197 | 3226 | } |
|---|
| 3198 | 3227 | |
|---|
| .. | .. |
|---|
| 3209 | 3238 | |
|---|
| 3210 | 3239 | static int shmem_xattr_handler_get(const struct xattr_handler *handler, |
|---|
| 3211 | 3240 | struct dentry *unused, struct inode *inode, |
|---|
| 3212 | | - const char *name, void *buffer, size_t size) |
|---|
| 3241 | + const char *name, void *buffer, size_t size, |
|---|
| 3242 | + int flags) |
|---|
| 3213 | 3243 | { |
|---|
| 3214 | 3244 | struct shmem_inode_info *info = SHMEM_I(inode); |
|---|
| 3215 | 3245 | |
|---|
| .. | .. |
|---|
| 3225 | 3255 | struct shmem_inode_info *info = SHMEM_I(inode); |
|---|
| 3226 | 3256 | |
|---|
| 3227 | 3257 | name = xattr_full_name(handler, name); |
|---|
| 3228 | | - return simple_xattr_set(&info->xattrs, name, value, size, flags); |
|---|
| 3258 | + return simple_xattr_set(&info->xattrs, name, value, size, flags, NULL); |
|---|
| 3229 | 3259 | } |
|---|
| 3230 | 3260 | |
|---|
| 3231 | 3261 | static const struct xattr_handler shmem_security_xattr_handler = { |
|---|
| .. | .. |
|---|
| 3352 | 3382 | .fh_to_dentry = shmem_fh_to_dentry, |
|---|
| 3353 | 3383 | }; |
|---|
| 3354 | 3384 | |
|---|
| 3355 | | -static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, |
|---|
| 3356 | | - bool remount) |
|---|
| 3385 | +enum shmem_param { |
|---|
| 3386 | + Opt_gid, |
|---|
| 3387 | + Opt_huge, |
|---|
| 3388 | + Opt_mode, |
|---|
| 3389 | + Opt_mpol, |
|---|
| 3390 | + Opt_nr_blocks, |
|---|
| 3391 | + Opt_nr_inodes, |
|---|
| 3392 | + Opt_size, |
|---|
| 3393 | + Opt_uid, |
|---|
| 3394 | + Opt_inode32, |
|---|
| 3395 | + Opt_inode64, |
|---|
| 3396 | +}; |
|---|
| 3397 | + |
|---|
| 3398 | +static const struct constant_table shmem_param_enums_huge[] = { |
|---|
| 3399 | + {"never", SHMEM_HUGE_NEVER }, |
|---|
| 3400 | + {"always", SHMEM_HUGE_ALWAYS }, |
|---|
| 3401 | + {"within_size", SHMEM_HUGE_WITHIN_SIZE }, |
|---|
| 3402 | + {"advise", SHMEM_HUGE_ADVISE }, |
|---|
| 3403 | + {} |
|---|
| 3404 | +}; |
|---|
| 3405 | + |
|---|
| 3406 | +const struct fs_parameter_spec shmem_fs_parameters[] = { |
|---|
| 3407 | + fsparam_u32 ("gid", Opt_gid), |
|---|
| 3408 | + fsparam_enum ("huge", Opt_huge, shmem_param_enums_huge), |
|---|
| 3409 | + fsparam_u32oct("mode", Opt_mode), |
|---|
| 3410 | + fsparam_string("mpol", Opt_mpol), |
|---|
| 3411 | + fsparam_string("nr_blocks", Opt_nr_blocks), |
|---|
| 3412 | + fsparam_string("nr_inodes", Opt_nr_inodes), |
|---|
| 3413 | + fsparam_string("size", Opt_size), |
|---|
| 3414 | + fsparam_u32 ("uid", Opt_uid), |
|---|
| 3415 | + fsparam_flag ("inode32", Opt_inode32), |
|---|
| 3416 | + fsparam_flag ("inode64", Opt_inode64), |
|---|
| 3417 | + {} |
|---|
| 3418 | +}; |
|---|
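
The `shmem_fs_parameters` table above is what `fs_parse()` matches against, and the same keys remain usable as classic comma-separated mount options. A hedged example; the mount point is made up:

```c
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* size takes k/m/g suffixes or a trailing '%' of RAM; mode is octal. */
	if (mount("tmpfs", "/mnt/scratch", "tmpfs", 0,
		  "size=50%,huge=within_size,mode=1777,inode64") < 0) {
		perror("mount");
		return 1;
	}
	return 0;
}
```
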
| 3419 | + |
|---|
| 3420 | +static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) |
|---|
| 3357 | 3421 | { |
|---|
| 3358 | | - char *this_char, *value, *rest; |
|---|
| 3359 | | - struct mempolicy *mpol = NULL; |
|---|
| 3360 | | - uid_t uid; |
|---|
| 3361 | | - gid_t gid; |
|---|
| 3422 | + struct shmem_options *ctx = fc->fs_private; |
|---|
| 3423 | + struct fs_parse_result result; |
|---|
| 3424 | + unsigned long long size; |
|---|
| 3425 | + char *rest; |
|---|
| 3426 | + int opt; |
|---|
| 3427 | + kuid_t kuid; |
|---|
| 3428 | + kgid_t kgid; |
|---|
| 3429 | + |
|---|
| 3430 | + opt = fs_parse(fc, shmem_fs_parameters, param, &result); |
|---|
| 3431 | + if (opt < 0) |
|---|
| 3432 | + return opt; |
|---|
| 3433 | + |
|---|
| 3434 | + switch (opt) { |
|---|
| 3435 | + case Opt_size: |
|---|
| 3436 | + size = memparse(param->string, &rest); |
|---|
| 3437 | + if (*rest == '%') { |
|---|
| 3438 | + size <<= PAGE_SHIFT; |
|---|
| 3439 | + size *= totalram_pages(); |
|---|
| 3440 | + do_div(size, 100); |
|---|
| 3441 | + rest++; |
|---|
| 3442 | + } |
|---|
| 3443 | + if (*rest) |
|---|
| 3444 | + goto bad_value; |
|---|
| 3445 | + ctx->blocks = DIV_ROUND_UP(size, PAGE_SIZE); |
|---|
| 3446 | + ctx->seen |= SHMEM_SEEN_BLOCKS; |
|---|
| 3447 | + break; |
|---|
| 3448 | + case Opt_nr_blocks: |
|---|
| 3449 | + ctx->blocks = memparse(param->string, &rest); |
|---|
| 3450 | + if (*rest) |
|---|
| 3451 | + goto bad_value; |
|---|
| 3452 | + ctx->seen |= SHMEM_SEEN_BLOCKS; |
|---|
| 3453 | + break; |
|---|
| 3454 | + case Opt_nr_inodes: |
|---|
| 3455 | + ctx->inodes = memparse(param->string, &rest); |
|---|
| 3456 | + if (*rest) |
|---|
| 3457 | + goto bad_value; |
|---|
| 3458 | + ctx->seen |= SHMEM_SEEN_INODES; |
|---|
| 3459 | + break; |
|---|
| 3460 | + case Opt_mode: |
|---|
| 3461 | + ctx->mode = result.uint_32 & 07777; |
|---|
| 3462 | + break; |
|---|
| 3463 | + case Opt_uid: |
|---|
| 3464 | + kuid = make_kuid(current_user_ns(), result.uint_32); |
|---|
| 3465 | + if (!uid_valid(kuid)) |
|---|
| 3466 | + goto bad_value; |
|---|
| 3467 | + |
|---|
| 3468 | + /* |
|---|
| 3469 | + * The requested uid must be representable in the |
|---|
| 3470 | + * filesystem's idmapping. |
|---|
| 3471 | + */ |
|---|
| 3472 | + if (!kuid_has_mapping(fc->user_ns, kuid)) |
|---|
| 3473 | + goto bad_value; |
|---|
| 3474 | + |
|---|
| 3475 | + ctx->uid = kuid; |
|---|
| 3476 | + break; |
|---|
| 3477 | + case Opt_gid: |
|---|
| 3478 | + kgid = make_kgid(current_user_ns(), result.uint_32); |
|---|
| 3479 | + if (!gid_valid(kgid)) |
|---|
| 3480 | + goto bad_value; |
|---|
| 3481 | + |
|---|
| 3482 | + /* |
|---|
| 3483 | + * The requested gid must be representable in the |
|---|
| 3484 | + * filesystem's idmapping. |
|---|
| 3485 | + */ |
|---|
| 3486 | + if (!kgid_has_mapping(fc->user_ns, kgid)) |
|---|
| 3487 | + goto bad_value; |
|---|
| 3488 | + |
|---|
| 3489 | + ctx->gid = kgid; |
|---|
| 3490 | + break; |
|---|
| 3491 | + case Opt_huge: |
|---|
| 3492 | + ctx->huge = result.uint_32; |
|---|
| 3493 | + if (ctx->huge != SHMEM_HUGE_NEVER && |
|---|
| 3494 | + !(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && |
|---|
| 3495 | + has_transparent_hugepage())) |
|---|
| 3496 | + goto unsupported_parameter; |
|---|
| 3497 | + ctx->seen |= SHMEM_SEEN_HUGE; |
|---|
| 3498 | + break; |
|---|
| 3499 | + case Opt_mpol: |
|---|
| 3500 | + if (IS_ENABLED(CONFIG_NUMA)) { |
|---|
| 3501 | + mpol_put(ctx->mpol); |
|---|
| 3502 | + ctx->mpol = NULL; |
|---|
| 3503 | + if (mpol_parse_str(param->string, &ctx->mpol)) |
|---|
| 3504 | + goto bad_value; |
|---|
| 3505 | + break; |
|---|
| 3506 | + } |
|---|
| 3507 | + goto unsupported_parameter; |
|---|
| 3508 | + case Opt_inode32: |
|---|
| 3509 | + ctx->full_inums = false; |
|---|
| 3510 | + ctx->seen |= SHMEM_SEEN_INUMS; |
|---|
| 3511 | + break; |
|---|
| 3512 | + case Opt_inode64: |
|---|
| 3513 | + if (sizeof(ino_t) < 8) { |
|---|
| 3514 | + return invalfc(fc, |
|---|
| 3515 | + "Cannot use inode64 with <64bit inums in kernel\n"); |
|---|
| 3516 | + } |
|---|
| 3517 | + ctx->full_inums = true; |
|---|
| 3518 | + ctx->seen |= SHMEM_SEEN_INUMS; |
|---|
| 3519 | + break; |
|---|
| 3520 | + } |
|---|
| 3521 | + return 0; |
|---|
| 3522 | + |
|---|
| 3523 | +unsupported_parameter: |
|---|
| 3524 | + return invalfc(fc, "Unsupported parameter '%s'", param->key); |
|---|
| 3525 | +bad_value: |
|---|
| 3526 | + return invalfc(fc, "Bad value for '%s'", param->key); |
|---|
| 3527 | +} |
|---|
| 3528 | + |
|---|
| 3529 | +static int shmem_parse_options(struct fs_context *fc, void *data) |
|---|
| 3530 | +{ |
|---|
| 3531 | + char *options = data; |
|---|
| 3532 | + |
|---|
| 3533 | + if (options) { |
|---|
| 3534 | + int err = security_sb_eat_lsm_opts(options, &fc->security); |
|---|
| 3535 | + if (err) |
|---|
| 3536 | + return err; |
|---|
| 3537 | + } |
|---|
| 3362 | 3538 | |
|---|
| 3363 | 3539 | while (options != NULL) { |
|---|
| 3364 | | - this_char = options; |
|---|
| 3540 | + char *this_char = options; |
|---|
| 3365 | 3541 | for (;;) { |
|---|
| 3366 | 3542 | /* |
|---|
| 3367 | 3543 | * NUL-terminate this option: unfortunately, |
|---|
| .. | .. |
|---|
| 3377 | 3553 | break; |
|---|
| 3378 | 3554 | } |
|---|
| 3379 | 3555 | } |
|---|
| 3380 | | - if (!*this_char) |
|---|
| 3381 | | - continue; |
|---|
| 3382 | | - if ((value = strchr(this_char,'=')) != NULL) { |
|---|
| 3383 | | - *value++ = 0; |
|---|
| 3384 | | - } else { |
|---|
| 3385 | | - pr_err("tmpfs: No value for mount option '%s'\n", |
|---|
| 3386 | | - this_char); |
|---|
| 3387 | | - goto error; |
|---|
| 3388 | | - } |
|---|
| 3556 | + if (*this_char) { |
|---|
| 3557 | + char *value = strchr(this_char,'='); |
|---|
| 3558 | + size_t len = 0; |
|---|
| 3559 | + int err; |
|---|
| 3389 | 3560 | |
|---|
| 3390 | | - if (!strcmp(this_char,"size")) { |
|---|
| 3391 | | - unsigned long long size; |
|---|
| 3392 | | - size = memparse(value,&rest); |
|---|
| 3393 | | - if (*rest == '%') { |
|---|
| 3394 | | - size <<= PAGE_SHIFT; |
|---|
| 3395 | | - size *= totalram_pages; |
|---|
| 3396 | | - do_div(size, 100); |
|---|
| 3397 | | - rest++; |
|---|
| 3561 | + if (value) { |
|---|
| 3562 | + *value++ = '\0'; |
|---|
| 3563 | + len = strlen(value); |
|---|
| 3398 | 3564 | } |
|---|
| 3399 | | - if (*rest) |
|---|
| 3400 | | - goto bad_val; |
|---|
| 3401 | | - sbinfo->max_blocks = |
|---|
| 3402 | | - DIV_ROUND_UP(size, PAGE_SIZE); |
|---|
| 3403 | | - } else if (!strcmp(this_char,"nr_blocks")) { |
|---|
| 3404 | | - sbinfo->max_blocks = memparse(value, &rest); |
|---|
| 3405 | | - if (*rest) |
|---|
| 3406 | | - goto bad_val; |
|---|
| 3407 | | - } else if (!strcmp(this_char,"nr_inodes")) { |
|---|
| 3408 | | - sbinfo->max_inodes = memparse(value, &rest); |
|---|
| 3409 | | - if (*rest) |
|---|
| 3410 | | - goto bad_val; |
|---|
| 3411 | | - } else if (!strcmp(this_char,"mode")) { |
|---|
| 3412 | | - if (remount) |
|---|
| 3413 | | - continue; |
|---|
| 3414 | | - sbinfo->mode = simple_strtoul(value, &rest, 8) & 07777; |
|---|
| 3415 | | - if (*rest) |
|---|
| 3416 | | - goto bad_val; |
|---|
| 3417 | | - } else if (!strcmp(this_char,"uid")) { |
|---|
| 3418 | | - if (remount) |
|---|
| 3419 | | - continue; |
|---|
| 3420 | | - uid = simple_strtoul(value, &rest, 0); |
|---|
| 3421 | | - if (*rest) |
|---|
| 3422 | | - goto bad_val; |
|---|
| 3423 | | - sbinfo->uid = make_kuid(current_user_ns(), uid); |
|---|
| 3424 | | - if (!uid_valid(sbinfo->uid)) |
|---|
| 3425 | | - goto bad_val; |
|---|
| 3426 | | - } else if (!strcmp(this_char,"gid")) { |
|---|
| 3427 | | - if (remount) |
|---|
| 3428 | | - continue; |
|---|
| 3429 | | - gid = simple_strtoul(value, &rest, 0); |
|---|
| 3430 | | - if (*rest) |
|---|
| 3431 | | - goto bad_val; |
|---|
| 3432 | | - sbinfo->gid = make_kgid(current_user_ns(), gid); |
|---|
| 3433 | | - if (!gid_valid(sbinfo->gid)) |
|---|
| 3434 | | - goto bad_val; |
|---|
| 3435 | | -#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE |
|---|
| 3436 | | - } else if (!strcmp(this_char, "huge")) { |
|---|
| 3437 | | - int huge; |
|---|
| 3438 | | - huge = shmem_parse_huge(value); |
|---|
| 3439 | | - if (huge < 0) |
|---|
| 3440 | | - goto bad_val; |
|---|
| 3441 | | - if (!has_transparent_hugepage() && |
|---|
| 3442 | | - huge != SHMEM_HUGE_NEVER) |
|---|
| 3443 | | - goto bad_val; |
|---|
| 3444 | | - sbinfo->huge = huge; |
|---|
| 3445 | | -#endif |
|---|
| 3446 | | -#ifdef CONFIG_NUMA |
|---|
| 3447 | | - } else if (!strcmp(this_char,"mpol")) { |
|---|
| 3448 | | - mpol_put(mpol); |
|---|
| 3449 | | - mpol = NULL; |
|---|
| 3450 | | - if (mpol_parse_str(value, &mpol)) |
|---|
| 3451 | | - goto bad_val; |
|---|
| 3452 | | -#endif |
|---|
| 3453 | | - } else { |
|---|
| 3454 | | - pr_err("tmpfs: Bad mount option %s\n", this_char); |
|---|
| 3455 | | - goto error; |
|---|
| 3565 | + err = vfs_parse_fs_string(fc, this_char, value, len); |
|---|
| 3566 | + if (err < 0) |
|---|
| 3567 | + return err; |
|---|
| 3456 | 3568 | } |
|---|
| 3457 | 3569 | } |
|---|
| 3458 | | - sbinfo->mpol = mpol; |
|---|
| 3459 | 3570 | return 0; |
|---|
| 3460 | | - |
|---|
| 3461 | | -bad_val: |
|---|
| 3462 | | - pr_err("tmpfs: Bad value '%s' for mount option '%s'\n", |
|---|
| 3463 | | - value, this_char); |
|---|
| 3464 | | -error: |
|---|
| 3465 | | - mpol_put(mpol); |
|---|
| 3466 | | - return 1; |
|---|
| 3467 | | - |
|---|
| 3468 | 3571 | } |
|---|
| 3469 | 3572 | |
|---|
| 3470 | | -static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) |
|---|
| 3573 | +/* |
|---|
| 3574 | + * Reconfigure a shmem filesystem. |
|---|
| 3575 | + * |
|---|
| 3576 | + * Note that we disallow change from limited->unlimited blocks/inodes while any |
|---|
| 3577 | + * are in use; but we must separately disallow unlimited->limited, because in |
|---|
| 3578 | + * that case we have no record of how much is already in use. |
|---|
| 3579 | + */ |
|---|
| 3580 | +static int shmem_reconfigure(struct fs_context *fc) |
|---|
| 3471 | 3581 | { |
|---|
| 3472 | | - struct shmem_sb_info *sbinfo = SHMEM_SB(sb); |
|---|
| 3473 | | - struct shmem_sb_info config = *sbinfo; |
|---|
| 3582 | + struct shmem_options *ctx = fc->fs_private; |
|---|
| 3583 | + struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb); |
|---|
| 3474 | 3584 | unsigned long inodes; |
|---|
| 3475 | | - int error = -EINVAL; |
|---|
| 3476 | | - |
|---|
| 3477 | | - config.mpol = NULL; |
|---|
| 3478 | | - if (shmem_parse_options(data, &config, true)) |
|---|
| 3479 | | - return error; |
|---|
| 3585 | + const char *err; |
|---|
| 3480 | 3586 | |
|---|
| 3481 | 3587 | spin_lock(&sbinfo->stat_lock); |
|---|
| 3482 | 3588 | inodes = sbinfo->max_inodes - sbinfo->free_inodes; |
|---|
| 3483 | | - if (percpu_counter_compare(&sbinfo->used_blocks, config.max_blocks) > 0) |
|---|
| 3484 | | - goto out; |
|---|
| 3485 | | - if (config.max_inodes < inodes) |
|---|
| 3486 | | - goto out; |
|---|
| 3487 | | - /* |
|---|
| 3488 | | - * Those tests disallow limited->unlimited while any are in use; |
|---|
| 3489 | | - * but we must separately disallow unlimited->limited, because |
|---|
| 3490 | | - * in that case we have no record of how much is already in use. |
|---|
| 3491 | | - */ |
|---|
| 3492 | | - if (config.max_blocks && !sbinfo->max_blocks) |
|---|
| 3493 | | - goto out; |
|---|
| 3494 | | - if (config.max_inodes && !sbinfo->max_inodes) |
|---|
| 3495 | | - goto out; |
|---|
| 3589 | + if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) { |
|---|
| 3590 | + if (!sbinfo->max_blocks) { |
|---|
| 3591 | + err = "Cannot retroactively limit size"; |
|---|
| 3592 | + goto out; |
|---|
| 3593 | + } |
|---|
| 3594 | + if (percpu_counter_compare(&sbinfo->used_blocks, |
|---|
| 3595 | + ctx->blocks) > 0) { |
|---|
| 3596 | + err = "Too small a size for current use"; |
|---|
| 3597 | + goto out; |
|---|
| 3598 | + } |
|---|
| 3599 | + } |
|---|
| 3600 | + if ((ctx->seen & SHMEM_SEEN_INODES) && ctx->inodes) { |
|---|
| 3601 | + if (!sbinfo->max_inodes) { |
|---|
| 3602 | + err = "Cannot retroactively limit inodes"; |
|---|
| 3603 | + goto out; |
|---|
| 3604 | + } |
|---|
| 3605 | + if (ctx->inodes < inodes) { |
|---|
| 3606 | + err = "Too few inodes for current use"; |
|---|
| 3607 | + goto out; |
|---|
| 3608 | + } |
|---|
| 3609 | + } |
|---|
| 3496 | 3610 | |
|---|
| 3497 | | - error = 0; |
|---|
| 3498 | | - sbinfo->huge = config.huge; |
|---|
| 3499 | | - sbinfo->max_blocks = config.max_blocks; |
|---|
| 3500 | | - sbinfo->max_inodes = config.max_inodes; |
|---|
| 3501 | | - sbinfo->free_inodes = config.max_inodes - inodes; |
|---|
| 3611 | + if ((ctx->seen & SHMEM_SEEN_INUMS) && !ctx->full_inums && |
|---|
| 3612 | + sbinfo->next_ino > UINT_MAX) { |
|---|
| 3613 | + err = "Current inum too high to switch to 32-bit inums"; |
|---|
| 3614 | + goto out; |
|---|
| 3615 | + } |
|---|
| 3616 | + |
|---|
| 3617 | + if (ctx->seen & SHMEM_SEEN_HUGE) |
|---|
| 3618 | + sbinfo->huge = ctx->huge; |
|---|
| 3619 | + if (ctx->seen & SHMEM_SEEN_INUMS) |
|---|
| 3620 | + sbinfo->full_inums = ctx->full_inums; |
|---|
| 3621 | + if (ctx->seen & SHMEM_SEEN_BLOCKS) |
|---|
| 3622 | + sbinfo->max_blocks = ctx->blocks; |
|---|
| 3623 | + if (ctx->seen & SHMEM_SEEN_INODES) { |
|---|
| 3624 | + sbinfo->max_inodes = ctx->inodes; |
|---|
| 3625 | + sbinfo->free_inodes = ctx->inodes - inodes; |
|---|
| 3626 | + } |
|---|
| 3502 | 3627 | |
|---|
| 3503 | 3628 | /* |
|---|
| 3504 | 3629 | * Preserve previous mempolicy unless mpol remount option was specified. |
|---|
| 3505 | 3630 | */ |
|---|
| 3506 | | - if (config.mpol) { |
|---|
| 3631 | + if (ctx->mpol) { |
|---|
| 3507 | 3632 | mpol_put(sbinfo->mpol); |
|---|
| 3508 | | - sbinfo->mpol = config.mpol; /* transfers initial ref */ |
|---|
| 3633 | + sbinfo->mpol = ctx->mpol; /* transfers initial ref */ |
|---|
| 3634 | + ctx->mpol = NULL; |
|---|
| 3509 | 3635 | } |
|---|
| 3636 | + spin_unlock(&sbinfo->stat_lock); |
|---|
| 3637 | + return 0; |
|---|
| 3510 | 3638 | out: |
|---|
| 3511 | 3639 | spin_unlock(&sbinfo->stat_lock); |
|---|
| 3512 | | - return error; |
|---|
| 3640 | + return invalfc(fc, "%s", err); |
|---|
| 3513 | 3641 | } |
|---|
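
A hedged illustration of the reconfigure rules just above: shrinking below current usage, or imposing a limit on a mount created unlimited, is rejected with `EINVAL` plus one of the `invalfc()` messages. The mount point is again illustrative:

```c
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mount.h>

int shrink_tmpfs(const char *target)
{
	if (mount(NULL, target, NULL, MS_REMOUNT, "size=64m") < 0) {
		fprintf(stderr, "remount: %s\n", strerror(errno));
		return -1;
	}
	return 0;
}
```
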
| 3514 | 3642 | |
|---|
| 3515 | 3643 | static int shmem_show_options(struct seq_file *seq, struct dentry *root) |
|---|
| .. | .. |
|---|
| 3529 | 3657 | if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID)) |
|---|
| 3530 | 3658 | seq_printf(seq, ",gid=%u", |
|---|
| 3531 | 3659 | from_kgid_munged(&init_user_ns, sbinfo->gid)); |
|---|
| 3532 | | -#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE |
|---|
| 3660 | + |
|---|
| 3661 | + /* |
|---|
| 3662 | + * Showing inode{64,32} might be useful even if it's the system default, |
|---|
| 3663 | + * since then people don't have to resort to checking both here and |
|---|
| 3664 | + * /proc/config.gz to confirm 64-bit inums were successfully applied |
|---|
| 3665 | + * (which may not even exist if IKCONFIG_PROC isn't enabled). |
|---|
| 3666 | + * |
|---|
| 3667 | + * We hide it when inode64 isn't the default and we are using 32-bit |
|---|
| 3668 | + * inodes, since that probably just means the feature isn't even under |
|---|
| 3669 | + * consideration. |
|---|
| 3670 | + * |
|---|
| 3671 | + * As such: |
|---|
| 3672 | + * |
|---|
| 3673 | + *                     +-----------------+-----------------+ |
|---|
| 3674 | + *                     | TMPFS_INODE64=y | TMPFS_INODE64=n | |
|---|
| 3675 | + *  +------------------+-----------------+-----------------+ |
|---|
| 3676 | + *  | full_inums=true  | show            | show            | |
|---|
| 3677 | + *  | full_inums=false | show            | hide            | |
|---|
| 3678 | + *  +------------------+-----------------+-----------------+ |
|---|
| 3679 | + * |
|---|
| 3680 | + */ |
|---|
| 3681 | + if (IS_ENABLED(CONFIG_TMPFS_INODE64) || sbinfo->full_inums) |
|---|
| 3682 | + seq_printf(seq, ",inode%d", (sbinfo->full_inums ? 64 : 32)); |
|---|
| 3683 | +#ifdef CONFIG_TRANSPARENT_HUGEPAGE |
|---|
| 3533 | 3684 | /* Rightly or wrongly, show huge mount option unmasked by shmem_huge */ |
|---|
| 3534 | 3685 | if (sbinfo->huge) |
|---|
| 3535 | 3686 | seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge)); |
|---|
| .. | .. |
|---|
| 3544 | 3695 | { |
|---|
| 3545 | 3696 | struct shmem_sb_info *sbinfo = SHMEM_SB(sb); |
|---|
| 3546 | 3697 | |
|---|
| 3698 | + free_percpu(sbinfo->ino_batch); |
|---|
| 3547 | 3699 | percpu_counter_destroy(&sbinfo->used_blocks); |
|---|
| 3548 | 3700 | mpol_put(sbinfo->mpol); |
|---|
| 3549 | 3701 | kfree(sbinfo); |
|---|
| 3550 | 3702 | sb->s_fs_info = NULL; |
|---|
| 3551 | 3703 | } |
|---|
| 3552 | 3704 | |
|---|
| 3553 | | -int shmem_fill_super(struct super_block *sb, void *data, int silent) |
|---|
| 3705 | +static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) |
|---|
| 3554 | 3706 | { |
|---|
| 3707 | + struct shmem_options *ctx = fc->fs_private; |
|---|
| 3555 | 3708 | struct inode *inode; |
|---|
| 3556 | 3709 | struct shmem_sb_info *sbinfo; |
|---|
| 3557 | 3710 | int err = -ENOMEM; |
|---|
| .. | .. |
|---|
| 3562 | 3715 | if (!sbinfo) |
|---|
| 3563 | 3716 | return -ENOMEM; |
|---|
| 3564 | 3717 | |
|---|
| 3565 | | - sbinfo->mode = 0777 | S_ISVTX; |
|---|
| 3566 | | - sbinfo->uid = current_fsuid(); |
|---|
| 3567 | | - sbinfo->gid = current_fsgid(); |
|---|
| 3568 | 3718 | sb->s_fs_info = sbinfo; |
|---|
| 3569 | 3719 | |
|---|
| 3570 | 3720 | #ifdef CONFIG_TMPFS |
|---|
| .. | .. |
|---|
| 3574 | 3724 | * but the internal instance is left unlimited. |
|---|
| 3575 | 3725 | */ |
|---|
| 3576 | 3726 | if (!(sb->s_flags & SB_KERNMOUNT)) { |
|---|
| 3577 | | - sbinfo->max_blocks = shmem_default_max_blocks(); |
|---|
| 3578 | | - sbinfo->max_inodes = shmem_default_max_inodes(); |
|---|
| 3579 | | - if (shmem_parse_options(data, sbinfo, false)) { |
|---|
| 3580 | | - err = -EINVAL; |
|---|
| 3581 | | - goto failed; |
|---|
| 3582 | | - } |
|---|
| 3727 | + if (!(ctx->seen & SHMEM_SEEN_BLOCKS)) |
|---|
| 3728 | + ctx->blocks = shmem_default_max_blocks(); |
|---|
| 3729 | + if (!(ctx->seen & SHMEM_SEEN_INODES)) |
|---|
| 3730 | + ctx->inodes = shmem_default_max_inodes(); |
|---|
| 3731 | + if (!(ctx->seen & SHMEM_SEEN_INUMS)) |
|---|
| 3732 | + ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64); |
|---|
| 3583 | 3733 | } else { |
|---|
| 3584 | 3734 | sb->s_flags |= SB_NOUSER; |
|---|
| 3585 | 3735 | } |
|---|
| .. | .. |
|---|
| 3588 | 3738 | #else |
|---|
| 3589 | 3739 | sb->s_flags |= SB_NOUSER; |
|---|
| 3590 | 3740 | #endif |
|---|
| 3741 | + sbinfo->max_blocks = ctx->blocks; |
|---|
| 3742 | + sbinfo->free_inodes = sbinfo->max_inodes = ctx->inodes; |
|---|
| 3743 | + if (sb->s_flags & SB_KERNMOUNT) { |
|---|
| 3744 | + sbinfo->ino_batch = alloc_percpu(ino_t); |
|---|
| 3745 | + if (!sbinfo->ino_batch) |
|---|
| 3746 | + goto failed; |
|---|
| 3747 | + } |
|---|
| 3748 | + sbinfo->uid = ctx->uid; |
|---|
| 3749 | + sbinfo->gid = ctx->gid; |
|---|
| 3750 | + sbinfo->full_inums = ctx->full_inums; |
|---|
| 3751 | + sbinfo->mode = ctx->mode; |
|---|
| 3752 | + sbinfo->huge = ctx->huge; |
|---|
| 3753 | + sbinfo->mpol = ctx->mpol; |
|---|
| 3754 | + ctx->mpol = NULL; |
|---|
| 3591 | 3755 | |
|---|
| 3592 | 3756 | spin_lock_init(&sbinfo->stat_lock); |
|---|
| 3593 | 3757 | if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL)) |
|---|
| 3594 | 3758 | goto failed; |
|---|
| 3595 | | - sbinfo->free_inodes = sbinfo->max_inodes; |
|---|
| 3596 | 3759 | spin_lock_init(&sbinfo->shrinklist_lock); |
|---|
| 3597 | 3760 | INIT_LIST_HEAD(&sbinfo->shrinklist); |
|---|
| 3598 | 3761 | |
|---|
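
For kernel-internal mounts, the `sbinfo->ino_batch` allocated above lets `shmem_reserve_inode()` hand out inode numbers from a per-cpu batch, taking `stat_lock` only once per batch. A simplified userspace model of that scheme, using a per-thread counter in place of the percpu variable; batch size and names are illustrative:

```c
#include <pthread.h>
#include <stdint.h>

#define INO_BATCH 1024			/* stand-in for SHMEM_INO_BATCH */

static pthread_mutex_t stat_lock = PTHREAD_MUTEX_INITIALIZER;
static uint64_t next_ino = 2;		/* shared, like sbinfo->next_ino */
static __thread uint64_t thread_ino;	/* stand-in for the percpu batch */

static uint64_t reserve_ino(void)
{
	uint64_t ino = thread_ino;

	/* Batch exhausted (or never filled): grab a fresh range once. */
	if (ino % INO_BATCH == 0) {
		pthread_mutex_lock(&stat_lock);
		ino = next_ino;
		next_ino += INO_BATCH;
		pthread_mutex_unlock(&stat_lock);
	}
	thread_ino = ino + 1;
	return ino;
}
```
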
| .. | .. |
|---|
| 3625 | 3788 | return err; |
|---|
| 3626 | 3789 | } |
|---|
| 3627 | 3790 | |
|---|
| 3791 | +static int shmem_get_tree(struct fs_context *fc) |
|---|
| 3792 | +{ |
|---|
| 3793 | + return get_tree_nodev(fc, shmem_fill_super); |
|---|
| 3794 | +} |
|---|
| 3795 | + |
|---|
| 3796 | +static void shmem_free_fc(struct fs_context *fc) |
|---|
| 3797 | +{ |
|---|
| 3798 | + struct shmem_options *ctx = fc->fs_private; |
|---|
| 3799 | + |
|---|
| 3800 | + if (ctx) { |
|---|
| 3801 | + mpol_put(ctx->mpol); |
|---|
| 3802 | + kfree(ctx); |
|---|
| 3803 | + } |
|---|
| 3804 | +} |
|---|
| 3805 | + |
|---|
| 3806 | +static const struct fs_context_operations shmem_fs_context_ops = { |
|---|
| 3807 | + .free = shmem_free_fc, |
|---|
| 3808 | + .get_tree = shmem_get_tree, |
|---|
| 3809 | +#ifdef CONFIG_TMPFS |
|---|
| 3810 | + .parse_monolithic = shmem_parse_options, |
|---|
| 3811 | + .parse_param = shmem_parse_one, |
|---|
| 3812 | + .reconfigure = shmem_reconfigure, |
|---|
| 3813 | +#endif |
|---|
| 3814 | +}; |
|---|
| 3815 | + |
|---|
| 3628 | 3816 | static struct kmem_cache *shmem_inode_cachep; |
|---|
| 3629 | 3817 | |
|---|
| 3630 | 3818 | static struct inode *shmem_alloc_inode(struct super_block *sb) |
|---|
| .. | .. |
|---|
| 3636 | 3824 | return &info->vfs_inode; |
|---|
| 3637 | 3825 | } |
|---|
| 3638 | 3826 | |
|---|
| 3639 | | -static void shmem_destroy_callback(struct rcu_head *head) |
|---|
| 3827 | +static void shmem_free_in_core_inode(struct inode *inode) |
|---|
| 3640 | 3828 | { |
|---|
| 3641 | | - struct inode *inode = container_of(head, struct inode, i_rcu); |
|---|
| 3642 | 3829 | if (S_ISLNK(inode->i_mode)) |
|---|
| 3643 | 3830 | kfree(inode->i_link); |
|---|
| 3644 | 3831 | kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); |
|---|
| .. | .. |
|---|
| 3648 | 3835 | { |
|---|
| 3649 | 3836 | if (S_ISREG(inode->i_mode)) |
|---|
| 3650 | 3837 | mpol_free_shared_policy(&SHMEM_I(inode)->policy); |
|---|
| 3651 | | - call_rcu(&inode->i_rcu, shmem_destroy_callback); |
|---|
| 3652 | 3838 | } |
|---|
| 3653 | 3839 | |
|---|
| 3654 | 3840 | static void shmem_init_inode(void *foo) |
|---|
| .. | .. |
|---|
| 3739 | 3925 | |
|---|
| 3740 | 3926 | static const struct super_operations shmem_ops = { |
|---|
| 3741 | 3927 | .alloc_inode = shmem_alloc_inode, |
|---|
| 3928 | + .free_inode = shmem_free_in_core_inode, |
|---|
| 3742 | 3929 | .destroy_inode = shmem_destroy_inode, |
|---|
| 3743 | 3930 | #ifdef CONFIG_TMPFS |
|---|
| 3744 | 3931 | .statfs = shmem_statfs, |
|---|
| 3745 | | - .remount_fs = shmem_remount_fs, |
|---|
| 3746 | 3932 | .show_options = shmem_show_options, |
|---|
| 3747 | 3933 | #endif |
|---|
| 3748 | 3934 | .evict_inode = shmem_evict_inode, |
|---|
| 3749 | 3935 | .drop_inode = generic_delete_inode, |
|---|
| 3750 | 3936 | .put_super = shmem_put_super, |
|---|
| 3751 | | -#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE |
|---|
| 3937 | +#ifdef CONFIG_TRANSPARENT_HUGEPAGE |
|---|
| 3752 | 3938 | .nr_cached_objects = shmem_unused_huge_count, |
|---|
| 3753 | 3939 | .free_cached_objects = shmem_unused_huge_scan, |
|---|
| 3754 | 3940 | #endif |
|---|
| .. | .. |
|---|
| 3761 | 3947 | .set_policy = shmem_set_policy, |
|---|
| 3762 | 3948 | .get_policy = shmem_get_policy, |
|---|
| 3763 | 3949 | #endif |
|---|
| 3950 | +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT |
|---|
| 3951 | + .allow_speculation = filemap_allow_speculation, |
|---|
| 3952 | +#endif |
|---|
| 3764 | 3953 | }; |
|---|
| 3765 | 3954 | |
|---|
| 3766 | | -static struct dentry *shmem_mount(struct file_system_type *fs_type, |
|---|
| 3767 | | - int flags, const char *dev_name, void *data) |
|---|
| 3955 | +int shmem_init_fs_context(struct fs_context *fc) |
|---|
| 3768 | 3956 | { |
|---|
| 3769 | | - return mount_nodev(fs_type, flags, data, shmem_fill_super); |
|---|
| 3957 | + struct shmem_options *ctx; |
|---|
| 3958 | + |
|---|
| 3959 | + ctx = kzalloc(sizeof(struct shmem_options), GFP_KERNEL); |
|---|
| 3960 | + if (!ctx) |
|---|
| 3961 | + return -ENOMEM; |
|---|
| 3962 | + |
|---|
| 3963 | + ctx->mode = 0777 | S_ISVTX; |
|---|
| 3964 | + ctx->uid = current_fsuid(); |
|---|
| 3965 | + ctx->gid = current_fsgid(); |
|---|
| 3966 | + |
|---|
| 3967 | + fc->fs_private = ctx; |
|---|
| 3968 | + fc->ops = &shmem_fs_context_ops; |
|---|
| 3969 | + return 0; |
|---|
| 3770 | 3970 | } |
|---|
| 3771 | 3971 | |
|---|
| 3772 | 3972 | static struct file_system_type shmem_fs_type = { |
|---|
| 3773 | 3973 | .owner = THIS_MODULE, |
|---|
| 3774 | 3974 | .name = "tmpfs", |
|---|
| 3775 | | - .mount = shmem_mount, |
|---|
| 3975 | + .init_fs_context = shmem_init_fs_context, |
|---|
| 3976 | +#ifdef CONFIG_TMPFS |
|---|
| 3977 | + .parameters = shmem_fs_parameters, |
|---|
| 3978 | +#endif |
|---|
| 3776 | 3979 | .kill_sb = kill_litter_super, |
|---|
| 3777 | | - .fs_flags = FS_USERNS_MOUNT, |
|---|
| 3980 | + .fs_flags = FS_USERNS_MOUNT | FS_THP_SUPPORT, |
|---|
| 3778 | 3981 | }; |
|---|
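
These `fs_context_operations` are what the new mount API drives step by step. A hedged sketch using the raw syscalls (needs kernel ≥ 5.2 UAPI headers for `SYS_fsopen` and friends; error handling trimmed):

```c
#include <linux/mount.h>	/* FSCONFIG_* constants */
#include <sys/syscall.h>
#include <unistd.h>

static int mount_tmpfs_new_api(void)
{
	/* fsopen() -> shmem_init_fs_context() allocates shmem_options */
	int fsfd = syscall(SYS_fsopen, "tmpfs", 0);

	if (fsfd < 0)
		return -1;
	/* each key/value is fed to shmem_parse_one() via .parse_param */
	syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_STRING, "size", "64m", 0);
	syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_FLAG, "inode64", NULL, 0);
	/* FSCONFIG_CMD_CREATE -> .get_tree -> shmem_fill_super() */
	if (syscall(SYS_fsconfig, fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0)
		return -1;
	return syscall(SYS_fsmount, fsfd, 0, 0);	/* detached mount fd */
}
```
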
| 3779 | 3982 | |
|---|
| 3780 | 3983 | int __init shmem_init(void) |
|---|
| 3781 | 3984 | { |
|---|
| 3782 | 3985 | int error; |
|---|
| 3783 | | - |
|---|
| 3784 | | - /* If rootfs called this, don't re-init */ |
|---|
| 3785 | | - if (shmem_inode_cachep) |
|---|
| 3786 | | - return 0; |
|---|
| 3787 | 3986 | |
|---|
| 3788 | 3987 | shmem_init_inodecache(); |
|---|
| 3789 | 3988 | |
|---|
| .. | .. |
|---|
| 3800 | 3999 | goto out1; |
|---|
| 3801 | 4000 | } |
|---|
| 3802 | 4001 | |
|---|
| 3803 | | -#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE |
|---|
| 4002 | +#ifdef CONFIG_TRANSPARENT_HUGEPAGE |
|---|
| 3804 | 4003 | if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY) |
|---|
| 3805 | 4004 | SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; |
|---|
| 3806 | 4005 | else |
|---|
| .. | .. |
|---|
| 3816 | 4015 | return error; |
|---|
| 3817 | 4016 | } |
|---|
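The `shmem_huge > SHMEM_HUGE_DENY` test in `shmem_init()` reads naturally once the sentinel ordering is in view. These are the upstream definitions from earlier in shmem.c, reproduced here for reference:

```c
/* User-settable policies via huge= or sysfs are non-negative. */
#define SHMEM_HUGE_NEVER	0
#define SHMEM_HUGE_ALWAYS	1
#define SHMEM_HUGE_WITHIN_SIZE	2
#define SHMEM_HUGE_ADVISE	3
/* Testing/emergency overrides are negative, so the comparison
 * (shmem_huge > SHMEM_HUGE_DENY) accepts exactly the four
 * policies above and rejects both deny and force. */
#define SHMEM_HUGE_DENY		(-1)
#define SHMEM_HUGE_FORCE	(-2)
```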
| 3818 | 4017 | |
|---|
| 3819 | | -#if defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && defined(CONFIG_SYSFS) |
|---|
| 4018 | +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS) |
|---|
| 3820 | 4019 | static ssize_t shmem_enabled_show(struct kobject *kobj, |
|---|
| 3821 | 4020 | struct kobj_attribute *attr, char *buf) |
|---|
| 3822 | 4021 | { |
|---|
| 3823 | | - int values[] = { |
|---|
| 4022 | + static const int values[] = { |
|---|
| 3824 | 4023 | SHMEM_HUGE_ALWAYS, |
|---|
| 3825 | 4024 | SHMEM_HUGE_WITHIN_SIZE, |
|---|
| 3826 | 4025 | SHMEM_HUGE_ADVISE, |
|---|
| .. | .. |
|---|
| 3868 | 4067 | |
|---|
| 3869 | 4068 | struct kobj_attribute shmem_enabled_attr = |
|---|
| 3870 | 4069 | __ATTR(shmem_enabled, 0644, shmem_enabled_show, shmem_enabled_store); |
|---|
| 3871 | | -#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE && CONFIG_SYSFS */ |
|---|
| 4070 | +#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */ |
|---|
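`shmem_enabled` backs `/sys/kernel/mm/transparent_hugepage/shmem_enabled`; making the lookup array `static const` moves it out of the handler's stack frame into rodata. For context, a sketch of how such a multi-choice `_show` handler is typically written, bracketing the active value (`shmem_format_huge()` standing in for the value-to-name mapping, an assumption here):

```c
static ssize_t example_enabled_show(struct kobject *kobj,
				    struct kobj_attribute *attr, char *buf)
{
	static const int values[] = {
		SHMEM_HUGE_ALWAYS,
		SHMEM_HUGE_WITHIN_SIZE,
		SHMEM_HUGE_ADVISE,
		SHMEM_HUGE_NEVER,
		SHMEM_HUGE_DENY,
		SHMEM_HUGE_FORCE,
	};
	int i, count = 0;

	for (i = 0; i < ARRAY_SIZE(values); i++)
		/* Active policy rendered as "[always]", others bare. */
		count += sprintf(buf + count,
				 shmem_huge == values[i] ? "[%s] " : "%s ",
				 shmem_format_huge(values[i]));
	buf[count - 1] = '\n';	/* overwrite the trailing space */
	return count;
}
```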
| 3872 | 4071 | |
|---|
| 3873 | | -#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE |
|---|
| 4072 | +#ifdef CONFIG_TRANSPARENT_HUGEPAGE |
|---|
| 3874 | 4073 | bool shmem_huge_enabled(struct vm_area_struct *vma) |
|---|
| 3875 | 4074 | { |
|---|
| 3876 | 4075 | struct inode *inode = file_inode(vma->vm_file); |
|---|
| .. | .. |
|---|
| 3878 | 4077 | loff_t i_size; |
|---|
| 3879 | 4078 | pgoff_t off; |
|---|
| 3880 | 4079 | |
|---|
| 4080 | + if (!transhuge_vma_enabled(vma, vma->vm_flags)) |
|---|
| 4081 | + return false; |
|---|
| 3881 | 4082 | if (shmem_huge == SHMEM_HUGE_FORCE) |
|---|
| 3882 | 4083 | return true; |
|---|
| 3883 | 4084 | if (shmem_huge == SHMEM_HUGE_DENY) |
|---|
| .. | .. |
|---|
| 3893 | 4094 | if (i_size >= HPAGE_PMD_SIZE && |
|---|
| 3894 | 4095 | i_size >> PAGE_SHIFT >= off) |
|---|
| 3895 | 4096 | return true; |
|---|
| 3896 | | - /* fall through */ |
|---|
| 4097 | + fallthrough; |
|---|
| 3897 | 4098 | case SHMEM_HUGE_ADVISE: |
|---|
| 3898 | 4099 | /* TODO: implement fadvise() hints */ |
|---|
| 3899 | 4100 | return (vma->vm_flags & VM_HUGEPAGE); |
|---|
| .. | .. |
|---|
| 3902 | 4103 | return false; |
|---|
| 3903 | 4104 | } |
|---|
| 3904 | 4105 | } |
|---|
| 3905 | | -#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */ |
|---|
| 4106 | +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
|---|
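The added `transhuge_vma_enabled()` early-out makes per-VMA and per-process THP opt-outs take precedence over every shmem policy, including `SHMEM_HUGE_FORCE`. The helper's believed shape (an assumption based on its name and this call site):

```c
#include <linux/mm.h>
#include <linux/sched/coredump.h>	/* MMF_DISABLE_THP */

/* False when huge pages were refused for this VMA via
 * MADV_NOHUGEPAGE, or for the whole mm via PR_SET_THP_DISABLE. */
static inline bool example_transhuge_vma_enabled(struct vm_area_struct *vma,
						 unsigned long vm_flags)
{
	if ((vm_flags & VM_NOHUGEPAGE) ||
	    test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
		return false;
	return true;
}
```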
| 3906 | 4107 | |
|---|
| 3907 | 4108 | #else /* !CONFIG_SHMEM */ |
|---|
| 3908 | 4109 | |
|---|
| .. | .. |
|---|
| 3917 | 4118 | |
|---|
| 3918 | 4119 | static struct file_system_type shmem_fs_type = { |
|---|
| 3919 | 4120 | .name = "tmpfs", |
|---|
| 3920 | | - .mount = ramfs_mount, |
|---|
| 3921 | | - .kill_sb = kill_litter_super, |
|---|
| 4121 | + .init_fs_context = ramfs_init_fs_context, |
|---|
| 4122 | + .parameters = ramfs_fs_parameters, |
|---|
| 4123 | + .kill_sb = ramfs_kill_sb, |
|---|
| 3922 | 4124 | .fs_flags = FS_USERNS_MOUNT, |
|---|
| 3923 | 4125 | }; |
|---|
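The `!CONFIG_SHMEM` fallback follows ramfs through the same fs_context conversion. `kill_litter_super()` alone no longer suffices because ramfs now stashes its parsed options in `sb->s_fs_info`, so its kill_sb must free that first, roughly:

```c
#include <linux/fs.h>
#include <linux/slab.h>

/* Believed shape of ramfs_kill_sb() after the conversion. */
static void example_kill_sb(struct super_block *sb)
{
	kfree(sb->s_fs_info);	/* parsed mount options */
	kill_litter_super(sb);
}
```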
| 3924 | 4126 | |
|---|
| .. | .. |
|---|
| 3932 | 4134 | return 0; |
|---|
| 3933 | 4135 | } |
|---|
| 3934 | 4136 | |
|---|
| 3935 | | -int shmem_unuse(swp_entry_t swap, struct page *page) |
|---|
| 4137 | +int shmem_unuse(unsigned int type, bool frontswap, |
|---|
| 4138 | + unsigned long *fs_pages_to_unuse) |
|---|
| 3936 | 4139 | { |
|---|
| 3937 | 4140 | return 0; |
|---|
| 3938 | 4141 | } |
|---|
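The stub's new signature mirrors the swapoff rework: instead of being invoked once per swapped page with a `(swp_entry_t, page)` pair, `shmem_unuse()` now runs once per swap device, walking `shmem_swaplist` itself and bringing every entry of that `type` back in; `frontswap` and `fs_pages_to_unuse` let partial frontswap shrinking stop after a quota. A hedged sketch of the walk (the real code also drops the mutex around per-inode work and pins inodes against eviction; `example_unuse_inode()` is a hypothetical helper):

```c
int example_unuse(unsigned int type, bool frontswap,
		  unsigned long *fs_pages_to_unuse)
{
	struct shmem_inode_info *info, *next;
	int error = 0;

	mutex_lock(&shmem_swaplist_mutex);
	list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
		if (!info->swapped) {
			/* Fully swapped back in: retire from the list. */
			list_del_init(&info->swaplist);
			continue;
		}
		error = example_unuse_inode(&info->vfs_inode, type,
					    frontswap, fs_pages_to_unuse);
		if (error)
			break;
	}
	mutex_unlock(&shmem_swaplist_mutex);
	return error;
}
```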
| .. | .. |
|---|
| 4047 | 4250 | |
|---|
| 4048 | 4251 | /** |
|---|
| 4049 | 4252 | * shmem_zero_setup - setup a shared anonymous mapping |
|---|
| 4050 | | - * @vma: the vma to be mmapped is prepared by do_mmap_pgoff |
|---|
| 4253 | + * @vma: the vma to be mmapped is prepared by do_mmap |
|---|
| 4051 | 4254 | */ |
|---|
| 4052 | 4255 | int shmem_zero_setup(struct vm_area_struct *vma) |
|---|
| 4053 | 4256 | { |
|---|
| .. | .. |
|---|
| 4055 | 4258 | loff_t size = vma->vm_end - vma->vm_start; |
|---|
| 4056 | 4259 | |
|---|
| 4057 | 4260 | /* |
|---|
| 4058 | | - * Cloning a new file under mmap_sem leads to a lock ordering conflict |
|---|
| 4261 | + * Cloning a new file under mmap_lock leads to a lock ordering conflict |
|---|
| 4059 | 4262 | * between XFS directory reading and selinux: since this file is only |
|---|
| 4060 | 4263 | * accessible to the user through its mapping, use S_PRIVATE flag to |
|---|
| 4061 | 4264 | * bypass file security, in the same way as shmem_kernel_file_setup(). |
|---|
| .. | .. |
|---|
| 4069 | 4272 | vma->vm_file = file; |
|---|
| 4070 | 4273 | vma->vm_ops = &shmem_vm_ops; |
|---|
| 4071 | 4274 | |
|---|
| 4072 | | - if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && |
|---|
| 4275 | + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && |
|---|
| 4073 | 4276 | ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) < |
|---|
| 4074 | 4277 | (vma->vm_end & HPAGE_PMD_MASK)) { |
|---|
| 4075 | 4278 | khugepaged_enter(vma, vma->vm_flags); |
|---|
| .. | .. |
|---|
| 4117 | 4320 | #endif |
|---|
| 4118 | 4321 | } |
|---|
| 4119 | 4322 | EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp); |
|---|
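`shmem_read_mapping_page_gfp()` stays exported through both halves of the `#ifdef` above: with CONFIG_SHMEM it goes through shmem's own getpage path, and without it the ramfs fallback can be served by the plain page cache, so driver callers (GPU drivers are the classic users) see one API either way. A hypothetical caller:

```c
#include <linux/shmem_fs.h>
#include <linux/pagemap.h>

/* Hypothetical driver helper: pin one page of a shmem-backed
 * object for CPU access; the caller must put_page() it later. */
static struct page *example_pin_object_page(struct file *filp, pgoff_t index)
{
	struct address_space *mapping = filp->f_mapping;

	return shmem_read_mapping_page_gfp(mapping, index,
					   mapping_gfp_mask(mapping));
}
```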
| 4323 | + |
|---|
| 4324 | +void shmem_mark_page_lazyfree(struct page *page, bool tail) |
|---|
| 4325 | +{ |
|---|
| 4326 | + mark_page_lazyfree_movetail(page, tail); |
|---|
| 4327 | +} |
|---|
| 4328 | +EXPORT_SYMBOL_GPL(shmem_mark_page_lazyfree); |
|---|
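`shmem_mark_page_lazyfree()` is a vendor (Android) addition: a thin exported wrapper that lets modules feed shmem pages into `mark_page_lazyfree_movetail()`, with `tail` selecting the LRU placement (the exact placement semantics are vendor-defined and not visible in this hunk). A hypothetical module-side use:

```c
#include <linux/mm.h>
#include <linux/shmem_fs.h>	/* assumed home of the new export */

/* Hypothetical driver helper: the page's contents are disposable,
 * so hint it lazy-free and drop our reference. Assumes the caller
 * holds a reference on @page. */
static void example_discard_hint(struct page *page)
{
	shmem_mark_page_lazyfree(page, true);
	put_page(page);
}
```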
| 4329 | + |
|---|
| 4330 | +int reclaim_shmem_address_space(struct address_space *mapping) |
|---|
| 4331 | +{ |
|---|
| 4332 | +#ifdef CONFIG_SHMEM |
|---|
| 4333 | + pgoff_t start = 0; |
|---|
| 4334 | + struct page *page; |
|---|
| 4335 | + LIST_HEAD(page_list); |
|---|
| 4336 | + XA_STATE(xas, &mapping->i_pages, start); |
|---|
| 4337 | + |
|---|
| 4338 | + if (!shmem_mapping(mapping)) |
|---|
| 4339 | + return -EINVAL; |
|---|
| 4340 | + |
|---|
| 4341 | + lru_add_drain(); |
|---|
| 4342 | + |
|---|
| 4343 | + rcu_read_lock(); |
|---|
| 4344 | + xas_for_each(&xas, page, ULONG_MAX) { |
|---|
| 4345 | + if (xas_retry(&xas, page)) |
|---|
| 4346 | + continue; |
|---|
| 4347 | + if (xa_is_value(page)) |
|---|
| 4348 | + continue; |
|---|
| 4349 | + if (isolate_lru_page(page)) |
|---|
| 4350 | + continue; |
|---|
| 4351 | + |
|---|
| 4352 | + list_add(&page->lru, &page_list); |
|---|
| 4353 | + |
|---|
| 4354 | + if (need_resched()) { |
|---|
| 4355 | + xas_pause(&xas); |
|---|
| 4356 | + cond_resched_rcu(); |
|---|
| 4357 | + } |
|---|
| 4358 | + } |
|---|
| 4359 | + rcu_read_unlock(); |
|---|
| 4360 | + |
|---|
| 4361 | + return reclaim_pages(&page_list); |
|---|
| 4362 | +#else |
|---|
| 4363 | + return 0; |
|---|
| 4364 | +#endif |
|---|
| 4365 | +} |
|---|
| 4366 | +EXPORT_SYMBOL_GPL(reclaim_shmem_address_space); |
|---|
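`reclaim_shmem_address_space()` (also vendor-specific) force-reclaims a whole shmem mapping: under `rcu_read_lock()` it walks `i_pages`, skips swap entries (`xa_is_value()`), isolates whatever it can from the LRU onto a private list, and hands the batch to `reclaim_pages()`. Note the `xas_pause()` before `cond_resched_rcu()`: the XArray walk must be parked before the RCU read lock is dropped so it can revalidate and continue afterwards. A hypothetical caller:

```c
#include <linux/fs.h>
#include <linux/shmem_fs.h>

/* Hypothetical per-process trim path: given a file known to be
 * shmem-backed (e.g. obtained via memfd_create()), push its
 * resident pages into reclaim. Returns pages freed or -EINVAL. */
static int example_trim_memfd(struct file *memfd)
{
	return reclaim_shmem_address_space(memfd->f_mapping);
}
```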