From 102a0743326a03cd1a1202ceda21e175b7d3575c Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Tue, 20 Feb 2024 01:20:52 +0000
Subject: [PATCH] mm: update shmem.c

---
 kernel/mm/shmem.c | 1875 +++++++++++++++++++++++++++++++++-------------------------
 1 file changed, 1061 insertions(+), 814 deletions(-)

diff --git a/kernel/mm/shmem.c b/kernel/mm/shmem.c
index bbb8780..b812f26 100644
--- a/kernel/mm/shmem.c
+++ b/kernel/mm/shmem.c
@@ -36,8 +36,17 @@
 #include <linux/uio.h>
 #include <linux/khugepaged.h>
 #include <linux/hugetlb.h>
+#include <linux/frontswap.h>
+#include <linux/fs_parser.h>
+#include <linux/mm_inline.h>
 #include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */
+
+#include "internal.h"
+
+#undef CREATE_TRACE_POINTS
+#include <trace/hooks/shmem_fs.h>
+#include <trace/hooks/mm.h>
 
 static struct vfsmount *shm_mnt;
 
@@ -80,7 +89,6 @@
 #include <linux/uuid.h>
 
 #include <linux/uaccess.h>
-#include <asm/pgtable.h>
 
 #include "internal.h"
 
@@ -106,21 +114,43 @@
 	pgoff_t nr_unswapped;	/* how often writepage refused to swap out */
 };
 
+struct shmem_options {
+	unsigned long long blocks;
+	unsigned long long inodes;
+	struct mempolicy *mpol;
+	kuid_t uid;
+	kgid_t gid;
+	umode_t mode;
+	bool full_inums;
+	int huge;
+	int seen;
+#define SHMEM_SEEN_BLOCKS 1
+#define SHMEM_SEEN_INODES 2
+#define SHMEM_SEEN_HUGE 4
+#define SHMEM_SEEN_INUMS 8
+};
+
 #ifdef CONFIG_TMPFS
 static unsigned long shmem_default_max_blocks(void)
 {
-	return totalram_pages / 2;
+	return totalram_pages() / 2;
 }
 
 static unsigned long shmem_default_max_inodes(void)
 {
-	return min(totalram_pages - totalhigh_pages, totalram_pages / 2);
+	unsigned long nr_pages = totalram_pages();
+
+	return min(nr_pages - totalhigh_pages(), nr_pages / 2);
 }
 #endif
 
 static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
 static int shmem_replace_page(struct page **pagep, gfp_t gfp,
 				struct shmem_inode_info *info, pgoff_t index);
+static int shmem_swapin_page(struct inode *inode, pgoff_t index,
+			     struct page **pagep, enum sgp_type sgp,
+			     gfp_t gfp, struct vm_area_struct *vma,
+			     vm_fault_t *fault_type);
 static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 		struct page **pagep, enum sgp_type sgp,
 		gfp_t gfp, struct vm_area_struct *vma,
@@ -239,18 +269,78 @@
 static LIST_HEAD(shmem_swaplist);
 static DEFINE_MUTEX(shmem_swaplist_mutex);
 
-static int shmem_reserve_inode(struct super_block *sb)
+/*
+ * shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and
+ * produces a novel ino for the newly allocated inode.
+ *
+ * It may also be called when making a hard link to permit the space needed by
+ * each dentry. However, in that case, no new inode number is needed since that
+ * internally draws from another pool of inode numbers (currently global
+ * get_next_ino()). This case is indicated by passing NULL as inop.
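+ *
+ * A minimal usage sketch (illustrative only; "sb" is the shmem superblock):
+ *
+ *	ino_t ino;
+ *	if (shmem_reserve_inode(sb, &ino))	- new inode: number wanted
+ *		return NULL;
+ *	...
+ *	if (shmem_reserve_inode(sb, NULL))	- hard link: bookkeeping only
+ *		goto out;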
+ */ +#define SHMEM_INO_BATCH 1024 +static int shmem_reserve_inode(struct super_block *sb, ino_t *inop) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); - if (sbinfo->max_inodes) { + ino_t ino; + + if (!(sb->s_flags & SB_KERNMOUNT)) { spin_lock(&sbinfo->stat_lock); - if (!sbinfo->free_inodes) { - spin_unlock(&sbinfo->stat_lock); - return -ENOSPC; + if (sbinfo->max_inodes) { + if (!sbinfo->free_inodes) { + spin_unlock(&sbinfo->stat_lock); + return -ENOSPC; + } + sbinfo->free_inodes--; } - sbinfo->free_inodes--; + if (inop) { + ino = sbinfo->next_ino++; + if (unlikely(is_zero_ino(ino))) + ino = sbinfo->next_ino++; + if (unlikely(!sbinfo->full_inums && + ino > UINT_MAX)) { + /* + * Emulate get_next_ino uint wraparound for + * compatibility + */ + if (IS_ENABLED(CONFIG_64BIT)) + pr_warn("%s: inode number overflow on device %d, consider using inode64 mount option\n", + __func__, MINOR(sb->s_dev)); + sbinfo->next_ino = 1; + ino = sbinfo->next_ino++; + } + *inop = ino; + } spin_unlock(&sbinfo->stat_lock); + } else if (inop) { + /* + * __shmem_file_setup, one of our callers, is lock-free: it + * doesn't hold stat_lock in shmem_reserve_inode since + * max_inodes is always 0, and is called from potentially + * unknown contexts. As such, use a per-cpu batched allocator + * which doesn't require the per-sb stat_lock unless we are at + * the batch boundary. + * + * We don't need to worry about inode{32,64} since SB_KERNMOUNT + * shmem mounts are not exposed to userspace, so we don't need + * to worry about things like glibc compatibility. + */ + ino_t *next_ino; + next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu()); + ino = *next_ino; + if (unlikely(ino % SHMEM_INO_BATCH == 0)) { + spin_lock(&sbinfo->stat_lock); + ino = sbinfo->next_ino; + sbinfo->next_ino += SHMEM_INO_BATCH; + spin_unlock(&sbinfo->stat_lock); + if (unlikely(is_zero_ino(ino))) + ino++; + } + *inop = ino; + *next_ino = ++ino; + put_cpu(); } + return 0; } @@ -326,24 +416,20 @@ } /* - * Replace item expected in radix tree by a new item, while holding tree lock. + * Replace item expected in xarray by a new item, while holding xa_lock. 
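+ *
+ * Roughly the open-coded equivalent of the function below (sketch only):
+ *
+ *	XA_STATE(xas, &mapping->i_pages, index);
+ *	if (xas_load(&xas) != expected)
+ *		return -ENOENT;
+ *	xas_store(&xas, replacement);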
*/ -static int shmem_radix_tree_replace(struct address_space *mapping, +static int shmem_replace_entry(struct address_space *mapping, pgoff_t index, void *expected, void *replacement) { - struct radix_tree_node *node; - void __rcu **pslot; + XA_STATE(xas, &mapping->i_pages, index); void *item; VM_BUG_ON(!expected); VM_BUG_ON(!replacement); - item = __radix_tree_lookup(&mapping->i_pages, index, &node, &pslot); - if (!item) - return -ENOENT; + item = xas_load(&xas); if (item != expected) return -ENOENT; - __radix_tree_replace(&mapping->i_pages, node, pslot, - replacement, NULL); + xas_store(&xas, replacement); return 0; } @@ -357,12 +443,7 @@ static bool shmem_confirm_swap(struct address_space *mapping, pgoff_t index, swp_entry_t swap) { - void *item; - - rcu_read_lock(); - item = radix_tree_lookup(&mapping->i_pages, index); - rcu_read_unlock(); - return item == swp_to_radix_entry(swap); + return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap); } /* @@ -397,12 +478,12 @@ #define SHMEM_HUGE_DENY (-1) #define SHMEM_HUGE_FORCE (-2) -#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE +#ifdef CONFIG_TRANSPARENT_HUGEPAGE /* ifdef here to avoid bloating shmem.o when not necessary */ static int shmem_huge __read_mostly; -#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS) +#if defined(CONFIG_SYSFS) static int shmem_parse_huge(const char *str) { if (!strcmp(str, "never")) @@ -419,7 +500,9 @@ return SHMEM_HUGE_FORCE; return -EINVAL; } +#endif +#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS) static const char *shmem_format_huge(int huge) { switch (huge) { @@ -570,7 +653,7 @@ struct shmem_sb_info *sbinfo = SHMEM_SB(sb); return READ_ONCE(sbinfo->shrinklist_len); } -#else /* !CONFIG_TRANSPARENT_HUGE_PAGECACHE */ +#else /* !CONFIG_TRANSPARENT_HUGEPAGE */ #define shmem_huge SHMEM_HUGE_DENY @@ -579,11 +662,11 @@ { return 0; } -#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */ +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo) { - if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && (shmem_huge == SHMEM_HUGE_FORCE || sbinfo->huge) && shmem_huge != SHMEM_HUGE_DENY) return true; @@ -595,9 +678,13 @@ */ static int shmem_add_to_page_cache(struct page *page, struct address_space *mapping, - pgoff_t index, void *expected) + pgoff_t index, void *expected, gfp_t gfp, + struct mm_struct *charge_mm) { - int error, nr = hpage_nr_pages(page); + XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page)); + unsigned long i = 0; + unsigned long nr = compound_nr(page); + int error; VM_BUG_ON_PAGE(PageTail(page), page); VM_BUG_ON_PAGE(index != round_down(index, nr), page); @@ -609,46 +696,53 @@ page->mapping = mapping; page->index = index; - xa_lock_irq(&mapping->i_pages); - if (PageTransHuge(page)) { - void __rcu **results; - pgoff_t idx; - int i; - - error = 0; - if (radix_tree_gang_lookup_slot(&mapping->i_pages, - &results, &idx, index, 1) && - idx < index + HPAGE_PMD_NR) { - error = -EEXIST; - } - - if (!error) { - for (i = 0; i < HPAGE_PMD_NR; i++) { - error = radix_tree_insert(&mapping->i_pages, - index + i, page + i); - VM_BUG_ON(error); + if (!PageSwapCache(page)) { + error = mem_cgroup_charge(page, charge_mm, gfp); + if (error) { + if (PageTransHuge(page)) { + count_vm_event(THP_FILE_FALLBACK); + count_vm_event(THP_FILE_FALLBACK_CHARGE); } - count_vm_event(THP_FILE_ALLOC); + goto error; } - } else if (!expected) { - error = radix_tree_insert(&mapping->i_pages, index, page); - } else { - error = 
shmem_radix_tree_replace(mapping, index, expected, - page); + } + cgroup_throttle_swaprate(page, gfp); + + do { + void *entry; + xas_lock_irq(&xas); + entry = xas_find_conflict(&xas); + if (entry != expected) + xas_set_err(&xas, -EEXIST); + xas_create_range(&xas); + if (xas_error(&xas)) + goto unlock; +next: + xas_store(&xas, page); + if (++i < nr) { + xas_next(&xas); + goto next; + } + if (PageTransHuge(page)) { + count_vm_event(THP_FILE_ALLOC); + __inc_node_page_state(page, NR_SHMEM_THPS); + } + mapping->nrpages += nr; + __mod_lruvec_page_state(page, NR_FILE_PAGES, nr); + __mod_lruvec_page_state(page, NR_SHMEM, nr); +unlock: + xas_unlock_irq(&xas); + } while (xas_nomem(&xas, gfp)); + + if (xas_error(&xas)) { + error = xas_error(&xas); + goto error; } - if (!error) { - mapping->nrpages += nr; - if (PageTransHuge(page)) - __inc_node_page_state(page, NR_SHMEM_THPS); - __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); - __mod_node_page_state(page_pgdat(page), NR_SHMEM, nr); - xa_unlock_irq(&mapping->i_pages); - } else { - page->mapping = NULL; - xa_unlock_irq(&mapping->i_pages); - page_ref_sub(page, nr); - } + return 0; +error: + page->mapping = NULL; + page_ref_sub(page, nr); return error; } @@ -663,27 +757,25 @@ VM_BUG_ON_PAGE(PageCompound(page), page); xa_lock_irq(&mapping->i_pages); - error = shmem_radix_tree_replace(mapping, page->index, page, radswap); + error = shmem_replace_entry(mapping, page->index, page, radswap); page->mapping = NULL; mapping->nrpages--; - __dec_node_page_state(page, NR_FILE_PAGES); - __dec_node_page_state(page, NR_SHMEM); + __dec_lruvec_page_state(page, NR_FILE_PAGES); + __dec_lruvec_page_state(page, NR_SHMEM); xa_unlock_irq(&mapping->i_pages); put_page(page); BUG_ON(error); } /* - * Remove swap entry from radix tree, free the swap and its page cache. + * Remove swap entry from page cache, free the swap and its page cache. */ static int shmem_free_swap(struct address_space *mapping, pgoff_t index, void *radswap) { void *old; - xa_lock_irq(&mapping->i_pages); - old = radix_tree_delete_item(&mapping->i_pages, index, radswap); - xa_unlock_irq(&mapping->i_pages); + old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0); if (old != radswap) return -ENOENT; free_swap_and_cache(radix_to_swp_entry(radswap)); @@ -700,29 +792,19 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end) { - struct radix_tree_iter iter; - void __rcu **slot; + XA_STATE(xas, &mapping->i_pages, start); struct page *page; unsigned long swapped = 0; rcu_read_lock(); - - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { - if (iter.index >= end) - break; - - page = radix_tree_deref_slot(slot); - - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); + xas_for_each(&xas, page, end - 1) { + if (xas_retry(&xas, page)) continue; - } - - if (radix_tree_exceptional_entry(page)) + if (xa_is_value(page)) swapped++; if (need_resched()) { - slot = radix_tree_iter_resume(slot, &iter); + xas_pause(&xas); cond_resched_rcu(); } } @@ -797,7 +879,33 @@ } /* - * Remove range of pages and swap entries from radix tree, and free them. + * Check whether a hole-punch or truncation needs to split a huge page, + * returning true if no split was required, or the split has been successful. + * + * Eviction (or truncation to 0 size) should never need to split a huge page; + * but in rare cases might do so, if shmem_undo_range() failed to trylock on + * head, and then succeeded to trylock on tail. 
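+ *
+ * For example (sketch, assuming HPAGE_PMD_NR == 512 as on x86_64 with 4K
+ * pages): punching pages [0, 512) deletes the huge page at index 0 wholly,
+ * while punching [100, 300) must split it first so that only the subpages
+ * inside the range are freed.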
+ * + * A split can only succeed when there are no additional references on the + * huge page: so the split below relies upon find_get_entries() having stopped + * when it found a subpage of the huge page, without getting further references. + */ +static bool shmem_punch_compound(struct page *page, pgoff_t start, pgoff_t end) +{ + if (!PageTransCompound(page)) + return true; + + /* Just proceed to delete a huge page wholly within the range punched */ + if (PageHead(page) && + page->index >= start && page->index + HPAGE_PMD_NR <= end) + return true; + + /* Try to split huge page, so we can truly punch the hole or truncate */ + return split_huge_page(page) >= 0; +} + +/* + * Remove range of pages and swap entries from page cache, and free them. * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate. */ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, @@ -833,7 +941,7 @@ if (index >= end) break; - if (radix_tree_exceptional_entry(page)) { + if (xa_is_value(page)) { if (unfalloc) continue; nr_swaps_freed += !shmem_free_swap(mapping, @@ -846,31 +954,11 @@ if (!trylock_page(page)) continue; - if (PageTransTail(page)) { - /* Middle of THP: zero out the page */ - clear_highpage(page); - unlock_page(page); - continue; - } else if (PageTransHuge(page)) { - if (index == round_down(end, HPAGE_PMD_NR)) { - /* - * Range ends in the middle of THP: - * zero out the page - */ - clear_highpage(page); - unlock_page(page); - continue; - } - index += HPAGE_PMD_NR - 1; - i += HPAGE_PMD_NR - 1; - } - - if (!unfalloc || !PageUptodate(page)) { - VM_BUG_ON_PAGE(PageTail(page), page); - if (page_mapping(page) == mapping) { - VM_BUG_ON_PAGE(PageWriteback(page), page); + if ((!unfalloc || !PageUptodate(page)) && + page_mapping(page) == mapping) { + VM_BUG_ON_PAGE(PageWriteback(page), page); + if (shmem_punch_compound(page, start, end)) truncate_inode_page(mapping, page); - } } unlock_page(page); } @@ -930,7 +1018,7 @@ if (index >= end) break; - if (radix_tree_exceptional_entry(page)) { + if (xa_is_value(page)) { if (unfalloc) continue; if (shmem_free_swap(mapping, index, page)) { @@ -944,42 +1032,24 @@ lock_page(page); - if (PageTransTail(page)) { - /* Middle of THP: zero out the page */ - clear_highpage(page); - unlock_page(page); - /* - * Partial thp truncate due 'start' in middle - * of THP: don't need to look on these pages - * again on !pvec.nr restart. 
- */ - if (index != round_down(end, HPAGE_PMD_NR)) - start++; - continue; - } else if (PageTransHuge(page)) { - if (index == round_down(end, HPAGE_PMD_NR)) { - /* - * Range ends in the middle of THP: - * zero out the page - */ - clear_highpage(page); - unlock_page(page); - continue; - } - index += HPAGE_PMD_NR - 1; - i += HPAGE_PMD_NR - 1; - } - if (!unfalloc || !PageUptodate(page)) { - VM_BUG_ON_PAGE(PageTail(page), page); - if (page_mapping(page) == mapping) { - VM_BUG_ON_PAGE(PageWriteback(page), page); - truncate_inode_page(mapping, page); - } else { + if (page_mapping(page) != mapping) { /* Page was replaced by swap: retry */ unlock_page(page); index--; break; + } + VM_BUG_ON_PAGE(PageWriteback(page), page); + if (shmem_punch_compound(page, start, end)) + truncate_inode_page(mapping, page); + else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { + /* Wipe the page and don't get stuck */ + clear_highpage(page); + flush_dcache_page(page); + set_page_dirty(page); + if (index < + round_up(start, HPAGE_PMD_NR)) + start = index + 1; } } unlock_page(page); @@ -1067,7 +1137,7 @@ * Part of the huge page can be beyond i_size: subject * to shrink under memory pressure. */ - if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) { + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { spin_lock(&sbinfo->shrinklist_lock); /* * _careful to defend against unlocked access to @@ -1106,9 +1176,14 @@ } spin_unlock(&sbinfo->shrinklist_lock); } - if (!list_empty(&info->swaplist)) { + while (!list_empty(&info->swaplist)) { + /* Wait while shmem_unuse() is scanning this inode... */ + wait_var_event(&info->stop_eviction, + !atomic_read(&info->stop_eviction)); mutex_lock(&shmem_swaplist_mutex); - list_del_init(&info->swaplist); + /* ...but beware of the race if we peeked too early */ + if (!atomic_read(&info->stop_eviction)) + list_del_init(&info->swaplist); mutex_unlock(&shmem_swaplist_mutex); } } @@ -1119,166 +1194,174 @@ clear_inode(inode); } -static unsigned long find_swap_entry(struct radix_tree_root *root, void *item) +extern struct swap_info_struct *swap_info[]; + +static int shmem_find_swap_entries(struct address_space *mapping, + pgoff_t start, unsigned int nr_entries, + struct page **entries, pgoff_t *indices, + unsigned int type, bool frontswap) { - struct radix_tree_iter iter; - void __rcu **slot; - unsigned long found = -1; - unsigned int checked = 0; + XA_STATE(xas, &mapping->i_pages, start); + struct page *page; + swp_entry_t entry; + unsigned int ret = 0; + + if (!nr_entries) + return 0; rcu_read_lock(); - radix_tree_for_each_slot(slot, root, &iter, 0) { - void *entry = radix_tree_deref_slot(slot); - - if (radix_tree_deref_retry(entry)) { - slot = radix_tree_iter_retry(&iter); + xas_for_each(&xas, page, ULONG_MAX) { + if (xas_retry(&xas, page)) continue; + + if (!xa_is_value(page)) + continue; + + entry = radix_to_swp_entry(page); + if (swp_type(entry) != type) + continue; + if (frontswap && + !frontswap_test(swap_info[type], swp_offset(entry))) + continue; + + indices[ret] = xas.xa_index; + entries[ret] = page; + + if (need_resched()) { + xas_pause(&xas); + cond_resched_rcu(); } - if (entry == item) { - found = iter.index; + if (++ret == nr_entries) break; - } - checked++; - if ((checked % 4096) != 0) - continue; - slot = radix_tree_iter_resume(slot, &iter); - cond_resched_rcu(); } - rcu_read_unlock(); - return found; + + return ret; +} + +/* + * Move the swapped pages for an inode to page cache. Returns the count + * of pages swapped in, or the error in case of failure. 
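+ *
+ * Expected caller pattern (sketch; see shmem_unuse_inode() below):
+ *
+ *	ret = shmem_unuse_swap_entries(inode, pvec, indices);
+ *	if (ret < 0)		- typically -ENOMEM: stop scanning
+ *		break;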
+ */ +static int shmem_unuse_swap_entries(struct inode *inode, struct pagevec pvec, + pgoff_t *indices) +{ + int i = 0; + int ret = 0; + int error = 0; + struct address_space *mapping = inode->i_mapping; + + for (i = 0; i < pvec.nr; i++) { + struct page *page = pvec.pages[i]; + + if (!xa_is_value(page)) + continue; + error = shmem_swapin_page(inode, indices[i], + &page, SGP_CACHE, + mapping_gfp_mask(mapping), + NULL, NULL); + if (error == 0) { + unlock_page(page); + put_page(page); + ret++; + } + if (error == -ENOMEM) + break; + error = 0; + } + return error ? error : ret; } /* * If swap found in inode, free it and move page from swapcache to filecache. */ -static int shmem_unuse_inode(struct shmem_inode_info *info, - swp_entry_t swap, struct page **pagep) +static int shmem_unuse_inode(struct inode *inode, unsigned int type, + bool frontswap, unsigned long *fs_pages_to_unuse) { - struct address_space *mapping = info->vfs_inode.i_mapping; - void *radswap; - pgoff_t index; - gfp_t gfp; - int error = 0; + struct address_space *mapping = inode->i_mapping; + pgoff_t start = 0; + struct pagevec pvec; + pgoff_t indices[PAGEVEC_SIZE]; + bool frontswap_partial = (frontswap && *fs_pages_to_unuse > 0); + int ret = 0; - radswap = swp_to_radix_entry(swap); - index = find_swap_entry(&mapping->i_pages, radswap); - if (index == -1) - return -EAGAIN; /* tell shmem_unuse we found nothing */ + pagevec_init(&pvec); + do { + unsigned int nr_entries = PAGEVEC_SIZE; - /* - * Move _head_ to start search for next from here. - * But be careful: shmem_evict_inode checks list_empty without taking - * mutex, and there's an instant in list_move_tail when info->swaplist - * would appear empty, if it were the only one on shmem_swaplist. - */ - if (shmem_swaplist.next != &info->swaplist) - list_move_tail(&shmem_swaplist, &info->swaplist); + if (frontswap_partial && *fs_pages_to_unuse < PAGEVEC_SIZE) + nr_entries = *fs_pages_to_unuse; - gfp = mapping_gfp_mask(mapping); - if (shmem_should_replace_page(*pagep, gfp)) { - mutex_unlock(&shmem_swaplist_mutex); - error = shmem_replace_page(pagep, gfp, info, index); - mutex_lock(&shmem_swaplist_mutex); - /* - * We needed to drop mutex to make that restrictive page - * allocation, but the inode might have been freed while we - * dropped it: although a racing shmem_evict_inode() cannot - * complete without emptying the radix_tree, our page lock - * on this swapcache page is not enough to prevent that - - * free_swap_and_cache() of our swap entry will only - * trylock_page(), removing swap from radix_tree whatever. - * - * We must not proceed to shmem_add_to_page_cache() if the - * inode has been freed, but of course we cannot rely on - * inode or mapping or info to check that. However, we can - * safely check if our swap entry is still in use (and here - * it can't have got reused for another page): if it's still - * in use, then the inode cannot have been freed yet, and we - * can safely proceed (if it's no longer in use, that tells - * nothing about the inode, but we don't need to unuse swap). - */ - if (!page_swapcount(*pagep)) - error = -ENOENT; - } - - /* - * We rely on shmem_swaplist_mutex, not only to protect the swaplist, - * but also to hold up shmem_evict_inode(): so inode cannot be freed - * beneath us (pagelock doesn't help until the page is in pagecache). 
- */ - if (!error) - error = shmem_add_to_page_cache(*pagep, mapping, index, - radswap); - if (error != -ENOMEM) { - /* - * Truncation and eviction use free_swap_and_cache(), which - * only does trylock page: if we raced, best clean up here. - */ - delete_from_swap_cache(*pagep); - set_page_dirty(*pagep); - if (!error) { - spin_lock_irq(&info->lock); - info->swapped--; - spin_unlock_irq(&info->lock); - swap_free(swap); + pvec.nr = shmem_find_swap_entries(mapping, start, nr_entries, + pvec.pages, indices, + type, frontswap); + if (pvec.nr == 0) { + ret = 0; + break; } - } - return error; + + ret = shmem_unuse_swap_entries(inode, pvec, indices); + if (ret < 0) + break; + + if (frontswap_partial) { + *fs_pages_to_unuse -= ret; + if (*fs_pages_to_unuse == 0) { + ret = FRONTSWAP_PAGES_UNUSED; + break; + } + } + + start = indices[pvec.nr - 1]; + } while (true); + + return ret; } /* - * Search through swapped inodes to find and replace swap by page. + * Read all the shared memory data that resides in the swap + * device 'type' back into memory, so the swap device can be + * unused. */ -int shmem_unuse(swp_entry_t swap, struct page *page) +int shmem_unuse(unsigned int type, bool frontswap, + unsigned long *fs_pages_to_unuse) { - struct list_head *this, *next; - struct shmem_inode_info *info; - struct mem_cgroup *memcg; + struct shmem_inode_info *info, *next; int error = 0; - /* - * There's a faint possibility that swap page was replaced before - * caller locked it: caller will come back later with the right page. - */ - if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val)) - goto out; - - /* - * Charge page using GFP_KERNEL while we can wait, before taking - * the shmem_swaplist_mutex which might hold up shmem_writepage(). - * Charged back to the user (not to caller) when swap account is used. - */ - error = mem_cgroup_try_charge_delay(page, current->mm, GFP_KERNEL, - &memcg, false); - if (error) - goto out; - /* No radix_tree_preload: swap entry keeps a place for page in tree */ - error = -EAGAIN; + if (list_empty(&shmem_swaplist)) + return 0; mutex_lock(&shmem_swaplist_mutex); - list_for_each_safe(this, next, &shmem_swaplist) { - info = list_entry(this, struct shmem_inode_info, swaplist); - if (info->swapped) - error = shmem_unuse_inode(info, swap, &page); - else + list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) { + if (!info->swapped) { list_del_init(&info->swaplist); + continue; + } + /* + * Drop the swaplist mutex while searching the inode for swap; + * but before doing so, make sure shmem_evict_inode() will not + * remove placeholder inode from swaplist, nor let it be freed + * (igrab() would protect from unlink, but not from unmount). 
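+ *
+ * The matching side is in shmem_evict_inode() above, which uses
+ * wait_var_event(&info->stop_eviction, ...) to wait until this count
+ * returns to zero before it takes the inode off the swaplist.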
+ */ + atomic_inc(&info->stop_eviction); + mutex_unlock(&shmem_swaplist_mutex); + + error = shmem_unuse_inode(&info->vfs_inode, type, frontswap, + fs_pages_to_unuse); cond_resched(); - if (error != -EAGAIN) + + mutex_lock(&shmem_swaplist_mutex); + next = list_next_entry(info, swaplist); + if (!info->swapped) + list_del_init(&info->swaplist); + if (atomic_dec_and_test(&info->stop_eviction)) + wake_up_var(&info->stop_eviction); + if (error) break; - /* found nothing in this: move on to search the next */ } mutex_unlock(&shmem_swaplist_mutex); - if (error) { - if (error != -ENOMEM) - error = 0; - mem_cgroup_cancel_charge(page, memcg, false); - } else - mem_cgroup_commit_charge(page, memcg, true, false); -out: - unlock_page(page); - put_page(page); return error; } @@ -1348,6 +1431,7 @@ SetPageUptodate(page); } + trace_android_vh_set_shmem_page_flag(page); swap = get_swap_page(page); if (!swap.val) goto redirty; @@ -1362,9 +1446,11 @@ */ mutex_lock(&shmem_swaplist_mutex); if (list_empty(&info->swaplist)) - list_add_tail(&info->swaplist, &shmem_swaplist); + list_add(&info->swaplist, &shmem_swaplist); - if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { + if (add_to_swap_cache(page, swap, + __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN, + NULL) == 0) { spin_lock_irq(&info->lock); shmem_recalc_inode(inode); info->swapped++; @@ -1447,11 +1533,11 @@ { struct vm_area_struct pvma; struct page *page; - struct vm_fault vmf; + struct vm_fault vmf = { + .vma = &pvma, + }; shmem_pseudo_vma_init(&pvma, info, index); - vmf.vma = &pvma; - vmf.address = 0; page = swap_cluster_readahead(swap, gfp, &vmf); shmem_pseudo_vma_destroy(&pvma); @@ -1462,23 +1548,14 @@ struct shmem_inode_info *info, pgoff_t index) { struct vm_area_struct pvma; - struct inode *inode = &info->vfs_inode; - struct address_space *mapping = inode->i_mapping; - pgoff_t idx, hindex; - void __rcu **results; + struct address_space *mapping = info->vfs_inode.i_mapping; + pgoff_t hindex; struct page *page; - if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) - return NULL; - hindex = round_down(index, HPAGE_PMD_NR); - rcu_read_lock(); - if (radix_tree_gang_lookup_slot(&mapping->i_pages, &results, &idx, - hindex, 1) && idx < hindex + HPAGE_PMD_NR) { - rcu_read_unlock(); + if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1, + XA_PRESENT)) return NULL; - } - rcu_read_unlock(); shmem_pseudo_vma_init(&pvma, info, hindex); page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN, @@ -1486,6 +1563,8 @@ shmem_pseudo_vma_destroy(&pvma); if (page) prep_transhuge_page(page); + else + count_vm_event(THP_FILE_FALLBACK); return page; } @@ -1493,7 +1572,11 @@ struct shmem_inode_info *info, pgoff_t index) { struct vm_area_struct pvma; - struct page *page; + struct page *page = NULL; + + trace_android_vh_shmem_alloc_page(&page); + if (page) + return page; shmem_pseudo_vma_init(&pvma, info, index); page = alloc_page_vma(gfp, &pvma, 0); @@ -1511,7 +1594,7 @@ int nr; int err = -ENOSPC; - if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) huge = false; nr = huge ? HPAGE_PMD_NR : 1; @@ -1589,11 +1672,11 @@ * a nice clean interface for us to replace oldpage by newpage there. 
*/ xa_lock_irq(&swap_mapping->i_pages); - error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage, - newpage); + error = shmem_replace_entry(swap_mapping, swap_index, oldpage, newpage); if (!error) { - __inc_node_page_state(newpage, NR_FILE_PAGES); - __dec_node_page_state(oldpage, NR_FILE_PAGES); + mem_cgroup_migrate(oldpage, newpage); + __inc_lruvec_page_state(newpage, NR_FILE_PAGES); + __dec_lruvec_page_state(oldpage, NR_FILE_PAGES); } xa_unlock_irq(&swap_mapping->i_pages); @@ -1605,8 +1688,7 @@ */ oldpage = newpage; } else { - mem_cgroup_migrate(oldpage, newpage); - lru_cache_add_anon(newpage); + lru_cache_add(newpage); *pagep = newpage; } @@ -1620,13 +1702,109 @@ } /* + * Swap in the page pointed to by *pagep. + * Caller has to make sure that *pagep contains a valid swapped page. + * Returns 0 and the page in pagep if success. On failure, returns the + * error code and NULL in *pagep. + */ +static int shmem_swapin_page(struct inode *inode, pgoff_t index, + struct page **pagep, enum sgp_type sgp, + gfp_t gfp, struct vm_area_struct *vma, + vm_fault_t *fault_type) +{ + struct address_space *mapping = inode->i_mapping; + struct shmem_inode_info *info = SHMEM_I(inode); + struct mm_struct *charge_mm = vma ? vma->vm_mm : current->mm; + struct page *page; + swp_entry_t swap; + int error; + + VM_BUG_ON(!*pagep || !xa_is_value(*pagep)); + swap = radix_to_swp_entry(*pagep); + *pagep = NULL; + + /* Look it up and read it in.. */ + page = lookup_swap_cache(swap, NULL, 0); + if (!page) { + /* Or update major stats only when swapin succeeds?? */ + if (fault_type) { + *fault_type |= VM_FAULT_MAJOR; + count_vm_event(PGMAJFAULT); + count_memcg_event_mm(charge_mm, PGMAJFAULT); + } + /* Here we actually start the io */ + page = shmem_swapin(swap, gfp, info, index); + if (!page) { + error = -ENOMEM; + goto failed; + } + } + + /* We have to do this with page locked to prevent races */ + lock_page(page); + if (!PageSwapCache(page) || page_private(page) != swap.val || + !shmem_confirm_swap(mapping, index, swap)) { + error = -EEXIST; + goto unlock; + } + if (!PageUptodate(page)) { + error = -EIO; + goto failed; + } + wait_on_page_writeback(page); + + /* + * Some architectures may have to restore extra metadata to the + * physical page after reading from swap. + */ + arch_swap_restore(swap, page); + + if (shmem_should_replace_page(page, gfp)) { + error = shmem_replace_page(&page, gfp, info, index); + if (error) + goto failed; + } + + error = shmem_add_to_page_cache(page, mapping, index, + swp_to_radix_entry(swap), gfp, + charge_mm); + if (error) + goto failed; + + spin_lock_irq(&info->lock); + info->swapped--; + shmem_recalc_inode(inode); + spin_unlock_irq(&info->lock); + + if (sgp == SGP_WRITE) + mark_page_accessed(page); + + delete_from_swap_cache(page); + set_page_dirty(page); + swap_free(swap); + + *pagep = page; + return 0; +failed: + if (!shmem_confirm_swap(mapping, index, swap)) + error = -EEXIST; +unlock: + if (page) { + unlock_page(page); + put_page(page); + } + + return error; +} + +/* * shmem_getpage_gfp - find page in cache, or get from swap, or allocate * * If we allocate a new one we do not mark it dirty. That's up to the * vm. If we swap it in we mark it dirty since we also free the swap * entry since a page cannot live in both the swap and page cache. * - * fault_mm and fault_type are only supplied by shmem_fault: + * vma, vmf, and fault_type are only supplied by shmem_fault: * otherwise they are NULL. 
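+ *
+ * In outline (a sketch of the flow below, not exact):
+ *
+ *	page = find_lock_entry(mapping, index);
+ *	if (xa_is_value(page))			- a swap entry
+ *		return shmem_swapin_page(...);
+ *	if (page || sgp == SGP_READ)
+ *		goto out;
+ *	page = shmem_alloc_and_acct_page(...);	- allocate a new page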
*/ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, @@ -1638,9 +1816,7 @@ struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo; struct mm_struct *charge_mm; - struct mem_cgroup *memcg; struct page *page; - swp_entry_t swap; enum sgp_type sgp_huge = sgp; pgoff_t hindex = index; int error; @@ -1652,19 +1828,37 @@ if (sgp == SGP_NOHUGE || sgp == SGP_HUGE) sgp = SGP_CACHE; repeat: - swap.val = 0; - page = find_lock_entry(mapping, index); - if (radix_tree_exceptional_entry(page)) { - swap = radix_to_swp_entry(page); - page = NULL; - } - if (sgp <= SGP_CACHE && ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { - error = -EINVAL; - goto unlock; + return -EINVAL; } + sbinfo = SHMEM_SB(inode->i_sb); + charge_mm = vma ? vma->vm_mm : current->mm; + + page = find_lock_entry(mapping, index); + + if (page && vma && userfaultfd_minor(vma)) { + if (!xa_is_value(page)) { + unlock_page(page); + put_page(page); + } + *fault_type = handle_userfault(vmf, VM_UFFD_MINOR); + return 0; + } + + if (xa_is_value(page)) { + error = shmem_swapin_page(inode, index, &page, + sgp, gfp, vma, fault_type); + if (error == -EEXIST) + goto repeat; + + *pagep = page; + return error; + } + + if (page) + hindex = page->index; if (page && sgp == SGP_WRITE) mark_page_accessed(page); @@ -1675,230 +1869,141 @@ unlock_page(page); put_page(page); page = NULL; + hindex = index; } - if (page || (sgp == SGP_READ && !swap.val)) { - *pagep = page; - return 0; - } + if (page || sgp == SGP_READ) + goto out; /* * Fast cache lookup did not find it: * bring it back from swap or allocate. */ - sbinfo = SHMEM_SB(inode->i_sb); - charge_mm = vma ? vma->vm_mm : current->mm; - if (swap.val) { - /* Look it up and read it in.. */ - page = lookup_swap_cache(swap, NULL, 0); - if (!page) { - /* Or update major stats only when swapin succeeds?? */ - if (fault_type) { - *fault_type |= VM_FAULT_MAJOR; - count_vm_event(PGMAJFAULT); - count_memcg_event_mm(charge_mm, PGMAJFAULT); - } - /* Here we actually start the io */ - page = shmem_swapin(swap, gfp, info, index); - if (!page) { - error = -ENOMEM; - goto failed; - } - } + if (vma && userfaultfd_missing(vma)) { + *fault_type = handle_userfault(vmf, VM_UFFD_MISSING); + return 0; + } - /* We have to do this with page locked to prevent races */ - lock_page(page); - if (!PageSwapCache(page) || page_private(page) != swap.val || - !shmem_confirm_swap(mapping, index, swap)) { - error = -EEXIST; /* try again */ - goto unlock; - } - if (!PageUptodate(page)) { - error = -EIO; - goto failed; - } - wait_on_page_writeback(page); + /* shmem_symlink() */ + if (mapping->a_ops != &shmem_aops) + goto alloc_nohuge; + if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE) + goto alloc_nohuge; + if (shmem_huge == SHMEM_HUGE_FORCE) + goto alloc_huge; + switch (sbinfo->huge) { + case SHMEM_HUGE_NEVER: + goto alloc_nohuge; + case SHMEM_HUGE_WITHIN_SIZE: { + loff_t i_size; + pgoff_t off; - if (shmem_should_replace_page(page, gfp)) { - error = shmem_replace_page(&page, gfp, info, index); - if (error) - goto failed; - } - - error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg, - false); - if (!error) { - error = shmem_add_to_page_cache(page, mapping, index, - swp_to_radix_entry(swap)); - /* - * We already confirmed swap under page lock, and make - * no memory allocation here, so usually no possibility - * of error; but free_swap_and_cache() only trylocks a - * page, so it is just possible that the entry has been - * truncated or holepunched since swap was confirmed. 
- * shmem_undo_range() will have done some of the - * unaccounting, now delete_from_swap_cache() will do - * the rest. - * Reset swap.val? No, leave it so "failed" goes back to - * "repeat": reading a hole and writing should succeed. - */ - if (error) { - mem_cgroup_cancel_charge(page, memcg, false); - delete_from_swap_cache(page); - } - } - if (error) - goto failed; - - mem_cgroup_commit_charge(page, memcg, true, false); - - spin_lock_irq(&info->lock); - info->swapped--; - shmem_recalc_inode(inode); - spin_unlock_irq(&info->lock); - - if (sgp == SGP_WRITE) - mark_page_accessed(page); - - delete_from_swap_cache(page); - set_page_dirty(page); - swap_free(swap); - - } else { - if (vma && userfaultfd_missing(vma)) { - *fault_type = handle_userfault(vmf, VM_UFFD_MISSING); - return 0; - } - - /* shmem_symlink() */ - if (mapping->a_ops != &shmem_aops) - goto alloc_nohuge; - if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE) - goto alloc_nohuge; - if (shmem_huge == SHMEM_HUGE_FORCE) + off = round_up(index, HPAGE_PMD_NR); + i_size = round_up(i_size_read(inode), PAGE_SIZE); + if (i_size >= HPAGE_PMD_SIZE && + i_size >> PAGE_SHIFT >= off) goto alloc_huge; - switch (sbinfo->huge) { - loff_t i_size; - pgoff_t off; - case SHMEM_HUGE_NEVER: - goto alloc_nohuge; - case SHMEM_HUGE_WITHIN_SIZE: - off = round_up(index, HPAGE_PMD_NR); - i_size = round_up(i_size_read(inode), PAGE_SIZE); - if (i_size >= HPAGE_PMD_SIZE && - i_size >> PAGE_SHIFT >= off) - goto alloc_huge; - /* fallthrough */ - case SHMEM_HUGE_ADVISE: - if (sgp_huge == SGP_HUGE) - goto alloc_huge; - /* TODO: implement fadvise() hints */ - goto alloc_nohuge; - } + + fallthrough; + } + case SHMEM_HUGE_ADVISE: + if (sgp_huge == SGP_HUGE) + goto alloc_huge; + /* TODO: implement fadvise() hints */ + goto alloc_nohuge; + } alloc_huge: - page = shmem_alloc_and_acct_page(gfp, inode, index, true); - if (IS_ERR(page)) { -alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, inode, - index, false); - } - if (IS_ERR(page)) { - int retry = 5; - error = PTR_ERR(page); - page = NULL; - if (error != -ENOSPC) - goto failed; - /* - * Try to reclaim some spece by splitting a huge page - * beyond i_size on the filesystem. 
- */ - while (retry--) { - int ret; - ret = shmem_unused_huge_shrink(sbinfo, NULL, 1); - if (ret == SHRINK_STOP) - break; - if (ret) - goto alloc_nohuge; - } - goto failed; - } + page = shmem_alloc_and_acct_page(gfp, inode, index, true); + if (IS_ERR(page)) { +alloc_nohuge: + page = shmem_alloc_and_acct_page(gfp, inode, + index, false); + } + if (IS_ERR(page)) { + int retry = 5; - if (PageTransHuge(page)) - hindex = round_down(index, HPAGE_PMD_NR); - else - hindex = index; - - if (sgp == SGP_WRITE) - __SetPageReferenced(page); - - error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg, - PageTransHuge(page)); - if (error) - goto unacct; - error = radix_tree_maybe_preload_order(gfp & GFP_RECLAIM_MASK, - compound_order(page)); - if (!error) { - error = shmem_add_to_page_cache(page, mapping, hindex, - NULL); - radix_tree_preload_end(); - } - if (error) { - mem_cgroup_cancel_charge(page, memcg, - PageTransHuge(page)); - goto unacct; - } - mem_cgroup_commit_charge(page, memcg, false, - PageTransHuge(page)); - lru_cache_add_anon(page); - - spin_lock_irq(&info->lock); - info->alloced += 1 << compound_order(page); - inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page); - shmem_recalc_inode(inode); - spin_unlock_irq(&info->lock); - alloced = true; - - if (PageTransHuge(page) && - DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) < - hindex + HPAGE_PMD_NR - 1) { - /* - * Part of the huge page is beyond i_size: subject - * to shrink under memory pressure. - */ - spin_lock(&sbinfo->shrinklist_lock); - /* - * _careful to defend against unlocked access to - * ->shrink_list in shmem_unused_huge_shrink() - */ - if (list_empty_careful(&info->shrinklist)) { - list_add_tail(&info->shrinklist, - &sbinfo->shrinklist); - sbinfo->shrinklist_len++; - } - spin_unlock(&sbinfo->shrinklist_lock); - } - + error = PTR_ERR(page); + page = NULL; + if (error != -ENOSPC) + goto unlock; /* - * Let SGP_FALLOC use the SGP_WRITE optimization on a new page. + * Try to reclaim some space by splitting a huge page + * beyond i_size on the filesystem. */ - if (sgp == SGP_FALLOC) - sgp = SGP_WRITE; + while (retry--) { + int ret; + + ret = shmem_unused_huge_shrink(sbinfo, NULL, 1); + if (ret == SHRINK_STOP) + break; + if (ret) + goto alloc_nohuge; + } + goto unlock; + } + + if (PageTransHuge(page)) + hindex = round_down(index, HPAGE_PMD_NR); + else + hindex = index; + + if (sgp == SGP_WRITE) + __SetPageReferenced(page); + + error = shmem_add_to_page_cache(page, mapping, hindex, + NULL, gfp & GFP_RECLAIM_MASK, + charge_mm); + if (error) + goto unacct; + lru_cache_add(page); + + spin_lock_irq(&info->lock); + info->alloced += compound_nr(page); + inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page); + shmem_recalc_inode(inode); + spin_unlock_irq(&info->lock); + alloced = true; + + if (PageTransHuge(page) && + DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) < + hindex + HPAGE_PMD_NR - 1) { + /* + * Part of the huge page is beyond i_size: subject + * to shrink under memory pressure. + */ + spin_lock(&sbinfo->shrinklist_lock); + /* + * _careful to defend against unlocked access to + * ->shrink_list in shmem_unused_huge_shrink() + */ + if (list_empty_careful(&info->shrinklist)) { + list_add_tail(&info->shrinklist, + &sbinfo->shrinklist); + sbinfo->shrinklist_len++; + } + spin_unlock(&sbinfo->shrinklist_lock); + } + + /* + * Let SGP_FALLOC use the SGP_WRITE optimization on a new page. 
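+ * (That is: a freshly allocated page is left !PageUptodate and is not
+ * cleared under "clear:" below; only a page fallocated earlier must be
+ * initialized there.)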
+ */ + if (sgp == SGP_FALLOC) + sgp = SGP_WRITE; clear: - /* - * Let SGP_WRITE caller clear ends if write does not fill page; - * but SGP_FALLOC on a page fallocated earlier must initialize - * it now, lest undo on failure cancel our earlier guarantee. - */ - if (sgp != SGP_WRITE && !PageUptodate(page)) { - struct page *head = compound_head(page); - int i; + /* + * Let SGP_WRITE caller clear ends if write does not fill page; + * but SGP_FALLOC on a page fallocated earlier must initialize + * it now, lest undo on failure cancel our earlier guarantee. + */ + if (sgp != SGP_WRITE && !PageUptodate(page)) { + int i; - for (i = 0; i < (1 << compound_order(head)); i++) { - clear_highpage(head + i); - flush_dcache_page(head + i); - } - SetPageUptodate(head); + for (i = 0; i < compound_nr(page); i++) { + clear_highpage(page + i); + flush_dcache_page(page + i); } + SetPageUptodate(page); } /* Perhaps the file has been truncated since we checked */ @@ -1914,6 +2019,7 @@ error = -EINVAL; goto unlock; } +out: *pagep = page + index - hindex; return 0; @@ -1921,16 +2027,13 @@ * Error recovery. */ unacct: - shmem_inode_unacct_blocks(inode, 1 << compound_order(page)); + shmem_inode_unacct_blocks(inode, compound_nr(page)); if (PageTransHuge(page)) { unlock_page(page); put_page(page); goto alloc_nohuge; } -failed: - if (swap.val && !shmem_confirm_swap(mapping, index, swap)) - error = -EEXIST; unlock: if (page) { unlock_page(page); @@ -1942,7 +2045,7 @@ spin_unlock_irq(&info->lock); goto repeat; } - if (error == -EEXIST) /* from above or from radix_tree_insert */ + if (error == -EEXIST) goto repeat; return error; } @@ -1994,16 +2097,14 @@ shmem_falloc->waitq && vmf->pgoff >= shmem_falloc->start && vmf->pgoff < shmem_falloc->next) { + struct file *fpin; wait_queue_head_t *shmem_falloc_waitq; DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function); ret = VM_FAULT_NOPAGE; - if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) && - !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) { - /* It's polite to up mmap_sem if we can */ - up_read(&vma->vm_mm->mmap_sem); + fpin = maybe_unlock_mmap_for_io(vmf, NULL); + if (fpin) ret = VM_FAULT_RETRY; - } shmem_falloc_waitq = shmem_falloc->waitq; prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait, @@ -2021,6 +2122,9 @@ spin_lock(&inode->i_lock); finish_wait(shmem_falloc_waitq, &shmem_fault_wait); spin_unlock(&inode->i_lock); + + if (fpin) + fput(fpin); return ret; } spin_unlock(&inode->i_lock); @@ -2059,7 +2163,7 @@ get_area = current->mm->get_unmapped_area; addr = get_area(file, uaddr, len, pgoff, flags); - if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return addr; if (IS_ERR_VALUE(addr)) return addr; @@ -2179,26 +2283,18 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) { struct shmem_inode_info *info = SHMEM_I(file_inode(file)); + int ret; - if (info->seals & F_SEAL_FUTURE_WRITE) { - /* - * New PROT_WRITE and MAP_SHARED mmaps are not allowed when - * "future write" seal active. - */ - if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) - return -EPERM; + ret = seal_check_future_write(info->seals, vma); + if (ret) + return ret; - /* - * Since the F_SEAL_FUTURE_WRITE seals allow for a MAP_SHARED - * read-only mapping, take care to not allow mprotect to revert - * protections. 
- */ - vma->vm_flags &= ~(VM_MAYWRITE); - } + /* arm64 - allow memory tagging on RAM-based files */ + vma->vm_flags |= VM_MTE_ALLOWED; file_accessed(file); vma->vm_ops = &shmem_vm_ops; - if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) < (vma->vm_end & HPAGE_PMD_MASK)) { khugepaged_enter(vma, vma->vm_flags); @@ -2212,13 +2308,14 @@ struct inode *inode; struct shmem_inode_info *info; struct shmem_sb_info *sbinfo = SHMEM_SB(sb); + ino_t ino; - if (shmem_reserve_inode(sb)) + if (shmem_reserve_inode(sb, &ino)) return NULL; inode = new_inode(sb); if (inode) { - inode->i_ino = get_next_ino(); + inode->i_ino = ino; inode_init_owner(inode, dir, mode); inode->i_blocks = 0; inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); @@ -2226,6 +2323,7 @@ info = SHMEM_I(inode); memset(info, 0, (char *)inode - (char *)info); spin_lock_init(&info->lock); + atomic_set(&info->stop_eviction, 0); info->seals = F_SEAL_SEAL; info->flags = flags & VM_NORESERVE; INIT_LIST_HEAD(&info->shrinklist); @@ -2272,28 +2370,25 @@ return mapping->a_ops == &shmem_aops; } -static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, - pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - unsigned long src_addr, - bool zeropage, - struct page **pagep) +#ifdef CONFIG_USERFAULTFD +int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, + pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + unsigned long src_addr, + bool zeropage, + struct page **pagep) { struct inode *inode = file_inode(dst_vma->vm_file); struct shmem_inode_info *info = SHMEM_I(inode); struct address_space *mapping = inode->i_mapping; gfp_t gfp = mapping_gfp_mask(mapping); pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); - struct mem_cgroup *memcg; - spinlock_t *ptl; void *page_kaddr; struct page *page; - pte_t _dst_pte, *dst_pte; int ret; - pgoff_t offset, max_off; + pgoff_t max_off; - ret = -ENOMEM; if (!shmem_inode_acct_block(inode, 1)) { /* * We may have got a page, returned -ENOENT triggering a retry, @@ -2304,29 +2399,30 @@ put_page(*pagep); *pagep = NULL; } - goto out; + return -ENOMEM; } if (!*pagep) { + ret = -ENOMEM; page = shmem_alloc_page(gfp, info, pgoff); if (!page) goto out_unacct_blocks; - if (!zeropage) { /* mcopy_atomic */ + if (!zeropage) { /* COPY */ page_kaddr = kmap_atomic(page); ret = copy_from_user(page_kaddr, (const void __user *)src_addr, PAGE_SIZE); kunmap_atomic(page_kaddr); - /* fallback to copy_from_user outside mmap_sem */ + /* fallback to copy_from_user outside mmap_lock */ if (unlikely(ret)) { *pagep = page; - shmem_inode_unacct_blocks(inode, 1); + ret = -ENOENT; /* don't free the page */ - return -ENOENT; + goto out_unacct_blocks; } - } else { /* mfill_zeropage_atomic */ + } else { /* ZEROPAGE */ clear_highpage(page); } } else { @@ -2334,57 +2430,26 @@ *pagep = NULL; } - VM_BUG_ON(PageLocked(page) || PageSwapBacked(page)); + VM_BUG_ON(PageLocked(page)); + VM_BUG_ON(PageSwapBacked(page)); __SetPageLocked(page); __SetPageSwapBacked(page); __SetPageUptodate(page); ret = -EFAULT; - offset = linear_page_index(dst_vma, dst_addr); max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); - if (unlikely(offset >= max_off)) + if (unlikely(pgoff >= max_off)) goto out_release; - ret = mem_cgroup_try_charge_delay(page, dst_mm, gfp, &memcg, false); + ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL, + gfp & GFP_RECLAIM_MASK, dst_mm); if (ret) goto out_release; - ret = 
radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK); - if (!ret) { - ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL); - radix_tree_preload_end(); - } + ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr, + page, true, false); if (ret) - goto out_release_uncharge; - - mem_cgroup_commit_charge(page, memcg, false, false); - - _dst_pte = mk_pte(page, dst_vma->vm_page_prot); - if (dst_vma->vm_flags & VM_WRITE) - _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte)); - else { - /* - * We don't set the pte dirty if the vma has no - * VM_WRITE permission, so mark the page dirty or it - * could be freed from under us. We could do it - * unconditionally before unlock_page(), but doing it - * only if VM_WRITE is not set is faster. - */ - set_page_dirty(page); - } - - dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); - - ret = -EFAULT; - max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); - if (unlikely(offset >= max_off)) - goto out_release_uncharge_unlock; - - ret = -EEXIST; - if (!pte_none(*dst_pte)) - goto out_release_uncharge_unlock; - - lru_cache_add_anon(page); + goto out_delete_from_cache; spin_lock_irq(&info->lock); info->alloced++; @@ -2392,52 +2457,19 @@ shmem_recalc_inode(inode); spin_unlock_irq(&info->lock); - inc_mm_counter(dst_mm, mm_counter_file(page)); - page_add_file_rmap(page, false); - set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); - - /* No need to invalidate - it was non-present before */ - update_mmu_cache(dst_vma, dst_addr, dst_pte); - pte_unmap_unlock(dst_pte, ptl); + SetPageDirty(page); unlock_page(page); - ret = 0; -out: - return ret; -out_release_uncharge_unlock: - pte_unmap_unlock(dst_pte, ptl); - ClearPageDirty(page); + return 0; +out_delete_from_cache: delete_from_page_cache(page); -out_release_uncharge: - mem_cgroup_cancel_charge(page, memcg, false); out_release: unlock_page(page); put_page(page); out_unacct_blocks: shmem_inode_unacct_blocks(inode, 1); - goto out; + return ret; } - -int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, - pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - unsigned long src_addr, - struct page **pagep) -{ - return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, - dst_addr, src_addr, false, pagep); -} - -int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm, - pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr) -{ - struct page *page = NULL; - - return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, - dst_addr, 0, true, &page); -} +#endif /* CONFIG_USERFAULTFD */ #ifdef CONFIG_TMPFS static const struct inode_operations shmem_symlink_inode_operations; @@ -2617,7 +2649,7 @@ } /* - * llseek SEEK_DATA or SEEK_HOLE through the radix_tree. + * llseek SEEK_DATA or SEEK_HOLE through the page cache. */ static pgoff_t shmem_seek_hole_data(struct address_space *mapping, pgoff_t index, pgoff_t end, int whence) @@ -2647,7 +2679,7 @@ index = indices[i]; } page = pvec.pages[i]; - if (page && !radix_tree_exceptional_entry(page)) { + if (page && !xa_is_value(page)) { if (!PageUptodate(page)) page = NULL; } @@ -2943,7 +2975,7 @@ * first link must skip that, to get the accounting right. 
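+ * (So open(O_TMPFILE) followed by linkat() ends up charging one inode
+ * reservation in total, the same as a plain creat() would.)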
*/ if (inode->i_nlink) { - ret = shmem_reserve_inode(inode->i_sb); + ret = shmem_reserve_inode(inode->i_sb, NULL); if (ret) goto out; } @@ -3095,12 +3127,9 @@ error = security_inode_init_security(inode, dir, &dentry->d_name, shmem_initxattrs, NULL); - if (error) { - if (error != -EOPNOTSUPP) { - iput(inode); - return error; - } - error = 0; + if (error && error != -EOPNOTSUPP) { + iput(inode); + return error; } inode->i_size = len-1; @@ -3192,7 +3221,7 @@ new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len, GFP_KERNEL); if (!new_xattr->name) { - kfree(new_xattr); + kvfree(new_xattr); return -ENOMEM; } @@ -3209,7 +3238,8 @@ static int shmem_xattr_handler_get(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, - const char *name, void *buffer, size_t size) + const char *name, void *buffer, size_t size, + int flags) { struct shmem_inode_info *info = SHMEM_I(inode); @@ -3225,7 +3255,7 @@ struct shmem_inode_info *info = SHMEM_I(inode); name = xattr_full_name(handler, name); - return simple_xattr_set(&info->xattrs, name, value, size, flags); + return simple_xattr_set(&info->xattrs, name, value, size, flags, NULL); } static const struct xattr_handler shmem_security_xattr_handler = { @@ -3352,16 +3382,162 @@ .fh_to_dentry = shmem_fh_to_dentry, }; -static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, - bool remount) +enum shmem_param { + Opt_gid, + Opt_huge, + Opt_mode, + Opt_mpol, + Opt_nr_blocks, + Opt_nr_inodes, + Opt_size, + Opt_uid, + Opt_inode32, + Opt_inode64, +}; + +static const struct constant_table shmem_param_enums_huge[] = { + {"never", SHMEM_HUGE_NEVER }, + {"always", SHMEM_HUGE_ALWAYS }, + {"within_size", SHMEM_HUGE_WITHIN_SIZE }, + {"advise", SHMEM_HUGE_ADVISE }, + {} +}; + +const struct fs_parameter_spec shmem_fs_parameters[] = { + fsparam_u32 ("gid", Opt_gid), + fsparam_enum ("huge", Opt_huge, shmem_param_enums_huge), + fsparam_u32oct("mode", Opt_mode), + fsparam_string("mpol", Opt_mpol), + fsparam_string("nr_blocks", Opt_nr_blocks), + fsparam_string("nr_inodes", Opt_nr_inodes), + fsparam_string("size", Opt_size), + fsparam_u32 ("uid", Opt_uid), + fsparam_flag ("inode32", Opt_inode32), + fsparam_flag ("inode64", Opt_inode64), + {} +}; + +static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) { - char *this_char, *value, *rest; - struct mempolicy *mpol = NULL; - uid_t uid; - gid_t gid; + struct shmem_options *ctx = fc->fs_private; + struct fs_parse_result result; + unsigned long long size; + char *rest; + int opt; + kuid_t kuid; + kgid_t kgid; + + opt = fs_parse(fc, shmem_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_size: + size = memparse(param->string, &rest); + if (*rest == '%') { + size <<= PAGE_SHIFT; + size *= totalram_pages(); + do_div(size, 100); + rest++; + } + if (*rest) + goto bad_value; + ctx->blocks = DIV_ROUND_UP(size, PAGE_SIZE); + ctx->seen |= SHMEM_SEEN_BLOCKS; + break; + case Opt_nr_blocks: + ctx->blocks = memparse(param->string, &rest); + if (*rest) + goto bad_value; + ctx->seen |= SHMEM_SEEN_BLOCKS; + break; + case Opt_nr_inodes: + ctx->inodes = memparse(param->string, &rest); + if (*rest) + goto bad_value; + ctx->seen |= SHMEM_SEEN_INODES; + break; + case Opt_mode: + ctx->mode = result.uint_32 & 07777; + break; + case Opt_uid: + kuid = make_kuid(current_user_ns(), result.uint_32); + if (!uid_valid(kuid)) + goto bad_value; + + /* + * The requested uid must be representable in the + * filesystem's idmapping. 
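+ * For example, mounting with "uid=1000" in a user namespace that has no
+ * mapping for uid 1000 must fail here, rather than create files with an
+ * owner the filesystem cannot represent.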
+ */ + if (!kuid_has_mapping(fc->user_ns, kuid)) + goto bad_value; + + ctx->uid = kuid; + break; + case Opt_gid: + kgid = make_kgid(current_user_ns(), result.uint_32); + if (!gid_valid(kgid)) + goto bad_value; + + /* + * The requested gid must be representable in the + * filesystem's idmapping. + */ + if (!kgid_has_mapping(fc->user_ns, kgid)) + goto bad_value; + + ctx->gid = kgid; + break; + case Opt_huge: + ctx->huge = result.uint_32; + if (ctx->huge != SHMEM_HUGE_NEVER && + !(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && + has_transparent_hugepage())) + goto unsupported_parameter; + ctx->seen |= SHMEM_SEEN_HUGE; + break; + case Opt_mpol: + if (IS_ENABLED(CONFIG_NUMA)) { + mpol_put(ctx->mpol); + ctx->mpol = NULL; + if (mpol_parse_str(param->string, &ctx->mpol)) + goto bad_value; + break; + } + goto unsupported_parameter; + case Opt_inode32: + ctx->full_inums = false; + ctx->seen |= SHMEM_SEEN_INUMS; + break; + case Opt_inode64: + if (sizeof(ino_t) < 8) { + return invalfc(fc, + "Cannot use inode64 with <64bit inums in kernel\n"); + } + ctx->full_inums = true; + ctx->seen |= SHMEM_SEEN_INUMS; + break; + } + return 0; + +unsupported_parameter: + return invalfc(fc, "Unsupported parameter '%s'", param->key); +bad_value: + return invalfc(fc, "Bad value for '%s'", param->key); +} + +static int shmem_parse_options(struct fs_context *fc, void *data) +{ + char *options = data; + + if (options) { + int err = security_sb_eat_lsm_opts(options, &fc->security); + if (err) + return err; + } while (options != NULL) { - this_char = options; + char *this_char = options; for (;;) { /* * NUL-terminate this option: unfortunately, @@ -3377,139 +3553,91 @@ break; } } - if (!*this_char) - continue; - if ((value = strchr(this_char,'=')) != NULL) { - *value++ = 0; - } else { - pr_err("tmpfs: No value for mount option '%s'\n", - this_char); - goto error; - } + if (*this_char) { + char *value = strchr(this_char,'='); + size_t len = 0; + int err; - if (!strcmp(this_char,"size")) { - unsigned long long size; - size = memparse(value,&rest); - if (*rest == '%') { - size <<= PAGE_SHIFT; - size *= totalram_pages; - do_div(size, 100); - rest++; + if (value) { + *value++ = '\0'; + len = strlen(value); } - if (*rest) - goto bad_val; - sbinfo->max_blocks = - DIV_ROUND_UP(size, PAGE_SIZE); - } else if (!strcmp(this_char,"nr_blocks")) { - sbinfo->max_blocks = memparse(value, &rest); - if (*rest) - goto bad_val; - } else if (!strcmp(this_char,"nr_inodes")) { - sbinfo->max_inodes = memparse(value, &rest); - if (*rest) - goto bad_val; - } else if (!strcmp(this_char,"mode")) { - if (remount) - continue; - sbinfo->mode = simple_strtoul(value, &rest, 8) & 07777; - if (*rest) - goto bad_val; - } else if (!strcmp(this_char,"uid")) { - if (remount) - continue; - uid = simple_strtoul(value, &rest, 0); - if (*rest) - goto bad_val; - sbinfo->uid = make_kuid(current_user_ns(), uid); - if (!uid_valid(sbinfo->uid)) - goto bad_val; - } else if (!strcmp(this_char,"gid")) { - if (remount) - continue; - gid = simple_strtoul(value, &rest, 0); - if (*rest) - goto bad_val; - sbinfo->gid = make_kgid(current_user_ns(), gid); - if (!gid_valid(sbinfo->gid)) - goto bad_val; -#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE - } else if (!strcmp(this_char, "huge")) { - int huge; - huge = shmem_parse_huge(value); - if (huge < 0) - goto bad_val; - if (!has_transparent_hugepage() && - huge != SHMEM_HUGE_NEVER) - goto bad_val; - sbinfo->huge = huge; -#endif -#ifdef CONFIG_NUMA - } else if (!strcmp(this_char,"mpol")) { - mpol_put(mpol); - mpol = NULL; - if 
(mpol_parse_str(value, &mpol)) - goto bad_val; -#endif - } else { - pr_err("tmpfs: Bad mount option %s\n", this_char); - goto error; + err = vfs_parse_fs_string(fc, this_char, value, len); + if (err < 0) + return err; } } - sbinfo->mpol = mpol; return 0; - -bad_val: - pr_err("tmpfs: Bad value '%s' for mount option '%s'\n", - value, this_char); -error: - mpol_put(mpol); - return 1; - } -static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) +/* + * Reconfigure a shmem filesystem. + * + * Note that we disallow change from limited->unlimited blocks/inodes while any + * are in use; but we must separately disallow unlimited->limited, because in + * that case we have no record of how much is already in use. + */ +static int shmem_reconfigure(struct fs_context *fc) { - struct shmem_sb_info *sbinfo = SHMEM_SB(sb); - struct shmem_sb_info config = *sbinfo; + struct shmem_options *ctx = fc->fs_private; + struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb); unsigned long inodes; - int error = -EINVAL; - - config.mpol = NULL; - if (shmem_parse_options(data, &config, true)) - return error; + const char *err; spin_lock(&sbinfo->stat_lock); inodes = sbinfo->max_inodes - sbinfo->free_inodes; - if (percpu_counter_compare(&sbinfo->used_blocks, config.max_blocks) > 0) - goto out; - if (config.max_inodes < inodes) - goto out; - /* - * Those tests disallow limited->unlimited while any are in use; - * but we must separately disallow unlimited->limited, because - * in that case we have no record of how much is already in use. - */ - if (config.max_blocks && !sbinfo->max_blocks) - goto out; - if (config.max_inodes && !sbinfo->max_inodes) - goto out; + if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) { + if (!sbinfo->max_blocks) { + err = "Cannot retroactively limit size"; + goto out; + } + if (percpu_counter_compare(&sbinfo->used_blocks, + ctx->blocks) > 0) { + err = "Too small a size for current use"; + goto out; + } + } + if ((ctx->seen & SHMEM_SEEN_INODES) && ctx->inodes) { + if (!sbinfo->max_inodes) { + err = "Cannot retroactively limit inodes"; + goto out; + } + if (ctx->inodes < inodes) { + err = "Too few inodes for current use"; + goto out; + } + } - error = 0; - sbinfo->huge = config.huge; - sbinfo->max_blocks = config.max_blocks; - sbinfo->max_inodes = config.max_inodes; - sbinfo->free_inodes = config.max_inodes - inodes; + if ((ctx->seen & SHMEM_SEEN_INUMS) && !ctx->full_inums && + sbinfo->next_ino > UINT_MAX) { + err = "Current inum too high to switch to 32-bit inums"; + goto out; + } + + if (ctx->seen & SHMEM_SEEN_HUGE) + sbinfo->huge = ctx->huge; + if (ctx->seen & SHMEM_SEEN_INUMS) + sbinfo->full_inums = ctx->full_inums; + if (ctx->seen & SHMEM_SEEN_BLOCKS) + sbinfo->max_blocks = ctx->blocks; + if (ctx->seen & SHMEM_SEEN_INODES) { + sbinfo->max_inodes = ctx->inodes; + sbinfo->free_inodes = ctx->inodes - inodes; + } /* * Preserve previous mempolicy unless mpol remount option was specified. 
 */
- if (config.mpol) {
+ if (ctx->mpol) {
 mpol_put(sbinfo->mpol);
- sbinfo->mpol = config.mpol; /* transfers initial ref */
+ sbinfo->mpol = ctx->mpol; /* transfers initial ref */
+ ctx->mpol = NULL;
 }
+ spin_unlock(&sbinfo->stat_lock);
+ return 0;
 out:
 spin_unlock(&sbinfo->stat_lock);
- return error;
+ return invalfc(fc, "%s", err);
 }
 
 static int shmem_show_options(struct seq_file *seq, struct dentry *root)
@@ -3529,7 +3657,30 @@
 if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
 seq_printf(seq, ",gid=%u",
 from_kgid_munged(&init_user_ns, sbinfo->gid));
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
+
+ /*
+ * Showing inode{64,32} might be useful even if it's the system default,
+ * since then people don't have to resort to checking both here and
+ * /proc/config.gz to confirm 64-bit inums were successfully applied
+ * (which may not even exist if IKCONFIG_PROC isn't enabled).
+ *
+ * We hide it when inode64 isn't the default and we are using 32-bit
+ * inodes, since that probably just means the feature isn't even under
+ * consideration.
+ *
+ * As such:
+ *
+ *                    +-----------------+-----------------+
+ *                    | TMPFS_INODE64=y | TMPFS_INODE64=n |
+ * +------------------+-----------------+-----------------+
+ * | full_inums=true  | show            | show            |
+ * | full_inums=false | show            | hide            |
+ * +------------------+-----------------+-----------------+
+ *
+ */
+ if (IS_ENABLED(CONFIG_TMPFS_INODE64) || sbinfo->full_inums)
+ seq_printf(seq, ",inode%d", (sbinfo->full_inums ? 64 : 32));
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 /* Rightly or wrongly, show huge mount option unmasked by shmem_huge */
 if (sbinfo->huge)
 seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge));
@@ -3544,14 +3695,16 @@
 {
 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
 
+ free_percpu(sbinfo->ino_batch);
 percpu_counter_destroy(&sbinfo->used_blocks);
 mpol_put(sbinfo->mpol);
 kfree(sbinfo);
 sb->s_fs_info = NULL;
 }
 
-int shmem_fill_super(struct super_block *sb, void *data, int silent)
+static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
 {
+ struct shmem_options *ctx = fc->fs_private;
 struct inode *inode;
 struct shmem_sb_info *sbinfo;
 int err = -ENOMEM;
 
@@ -3562,9 +3715,6 @@
 if (!sbinfo)
 return -ENOMEM;
 
- sbinfo->mode = 0777 | S_ISVTX;
- sbinfo->uid = current_fsuid();
- sbinfo->gid = current_fsgid();
 sb->s_fs_info = sbinfo;
 
 #ifdef CONFIG_TMPFS
@@ -3574,12 +3724,12 @@
 * but the internal instance is left unlimited. 
*/ if (!(sb->s_flags & SB_KERNMOUNT)) { - sbinfo->max_blocks = shmem_default_max_blocks(); - sbinfo->max_inodes = shmem_default_max_inodes(); - if (shmem_parse_options(data, sbinfo, false)) { - err = -EINVAL; - goto failed; - } + if (!(ctx->seen & SHMEM_SEEN_BLOCKS)) + ctx->blocks = shmem_default_max_blocks(); + if (!(ctx->seen & SHMEM_SEEN_INODES)) + ctx->inodes = shmem_default_max_inodes(); + if (!(ctx->seen & SHMEM_SEEN_INUMS)) + ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64); } else { sb->s_flags |= SB_NOUSER; } @@ -3588,11 +3738,24 @@ #else sb->s_flags |= SB_NOUSER; #endif + sbinfo->max_blocks = ctx->blocks; + sbinfo->free_inodes = sbinfo->max_inodes = ctx->inodes; + if (sb->s_flags & SB_KERNMOUNT) { + sbinfo->ino_batch = alloc_percpu(ino_t); + if (!sbinfo->ino_batch) + goto failed; + } + sbinfo->uid = ctx->uid; + sbinfo->gid = ctx->gid; + sbinfo->full_inums = ctx->full_inums; + sbinfo->mode = ctx->mode; + sbinfo->huge = ctx->huge; + sbinfo->mpol = ctx->mpol; + ctx->mpol = NULL; spin_lock_init(&sbinfo->stat_lock); if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL)) goto failed; - sbinfo->free_inodes = sbinfo->max_inodes; spin_lock_init(&sbinfo->shrinklist_lock); INIT_LIST_HEAD(&sbinfo->shrinklist); @@ -3625,6 +3788,31 @@ return err; } +static int shmem_get_tree(struct fs_context *fc) +{ + return get_tree_nodev(fc, shmem_fill_super); +} + +static void shmem_free_fc(struct fs_context *fc) +{ + struct shmem_options *ctx = fc->fs_private; + + if (ctx) { + mpol_put(ctx->mpol); + kfree(ctx); + } +} + +static const struct fs_context_operations shmem_fs_context_ops = { + .free = shmem_free_fc, + .get_tree = shmem_get_tree, +#ifdef CONFIG_TMPFS + .parse_monolithic = shmem_parse_options, + .parse_param = shmem_parse_one, + .reconfigure = shmem_reconfigure, +#endif +}; + static struct kmem_cache *shmem_inode_cachep; static struct inode *shmem_alloc_inode(struct super_block *sb) @@ -3636,9 +3824,8 @@ return &info->vfs_inode; } -static void shmem_destroy_callback(struct rcu_head *head) +static void shmem_free_in_core_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); if (S_ISLNK(inode->i_mode)) kfree(inode->i_link); kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); @@ -3648,7 +3835,6 @@ { if (S_ISREG(inode->i_mode)) mpol_free_shared_policy(&SHMEM_I(inode)->policy); - call_rcu(&inode->i_rcu, shmem_destroy_callback); } static void shmem_init_inode(void *foo) @@ -3739,16 +3925,16 @@ static const struct super_operations shmem_ops = { .alloc_inode = shmem_alloc_inode, + .free_inode = shmem_free_in_core_inode, .destroy_inode = shmem_destroy_inode, #ifdef CONFIG_TMPFS .statfs = shmem_statfs, - .remount_fs = shmem_remount_fs, .show_options = shmem_show_options, #endif .evict_inode = shmem_evict_inode, .drop_inode = generic_delete_inode, .put_super = shmem_put_super, -#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE +#ifdef CONFIG_TRANSPARENT_HUGEPAGE .nr_cached_objects = shmem_unused_huge_count, .free_cached_objects = shmem_unused_huge_scan, #endif @@ -3761,29 +3947,42 @@ .set_policy = shmem_set_policy, .get_policy = shmem_get_policy, #endif +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT + .allow_speculation = filemap_allow_speculation, +#endif }; -static struct dentry *shmem_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +int shmem_init_fs_context(struct fs_context *fc) { - return mount_nodev(fs_type, flags, data, shmem_fill_super); + struct shmem_options *ctx; + + ctx = kzalloc(sizeof(struct shmem_options), GFP_KERNEL); + 
if (!ctx) + return -ENOMEM; + + ctx->mode = 0777 | S_ISVTX; + ctx->uid = current_fsuid(); + ctx->gid = current_fsgid(); + + fc->fs_private = ctx; + fc->ops = &shmem_fs_context_ops; + return 0; } static struct file_system_type shmem_fs_type = { .owner = THIS_MODULE, .name = "tmpfs", - .mount = shmem_mount, + .init_fs_context = shmem_init_fs_context, +#ifdef CONFIG_TMPFS + .parameters = shmem_fs_parameters, +#endif .kill_sb = kill_litter_super, - .fs_flags = FS_USERNS_MOUNT, + .fs_flags = FS_USERNS_MOUNT | FS_THP_SUPPORT, }; int __init shmem_init(void) { int error; - - /* If rootfs called this, don't re-init */ - if (shmem_inode_cachep) - return 0; shmem_init_inodecache(); @@ -3800,7 +3999,7 @@ goto out1; } -#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE +#ifdef CONFIG_TRANSPARENT_HUGEPAGE if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY) SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; else @@ -3816,11 +4015,11 @@ return error; } -#if defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && defined(CONFIG_SYSFS) +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS) static ssize_t shmem_enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - int values[] = { + static const int values[] = { SHMEM_HUGE_ALWAYS, SHMEM_HUGE_WITHIN_SIZE, SHMEM_HUGE_ADVISE, @@ -3868,9 +4067,9 @@ struct kobj_attribute shmem_enabled_attr = __ATTR(shmem_enabled, 0644, shmem_enabled_show, shmem_enabled_store); -#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE && CONFIG_SYSFS */ +#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */ -#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE +#ifdef CONFIG_TRANSPARENT_HUGEPAGE bool shmem_huge_enabled(struct vm_area_struct *vma) { struct inode *inode = file_inode(vma->vm_file); @@ -3878,6 +4077,8 @@ loff_t i_size; pgoff_t off; + if (!transhuge_vma_enabled(vma, vma->vm_flags)) + return false; if (shmem_huge == SHMEM_HUGE_FORCE) return true; if (shmem_huge == SHMEM_HUGE_DENY) @@ -3893,7 +4094,7 @@ if (i_size >= HPAGE_PMD_SIZE && i_size >> PAGE_SHIFT >= off) return true; - /* fall through */ + fallthrough; case SHMEM_HUGE_ADVISE: /* TODO: implement fadvise() hints */ return (vma->vm_flags & VM_HUGEPAGE); @@ -3902,7 +4103,7 @@ return false; } } -#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */ +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #else /* !CONFIG_SHMEM */ @@ -3917,8 +4118,9 @@ static struct file_system_type shmem_fs_type = { .name = "tmpfs", - .mount = ramfs_mount, - .kill_sb = kill_litter_super, + .init_fs_context = ramfs_init_fs_context, + .parameters = ramfs_fs_parameters, + .kill_sb = ramfs_kill_sb, .fs_flags = FS_USERNS_MOUNT, }; @@ -3932,7 +4134,8 @@ return 0; } -int shmem_unuse(swp_entry_t swap, struct page *page) +int shmem_unuse(unsigned int type, bool frontswap, + unsigned long *fs_pages_to_unuse) { return 0; } @@ -4047,7 +4250,7 @@ /** * shmem_zero_setup - setup a shared anonymous mapping - * @vma: the vma to be mmapped is prepared by do_mmap_pgoff + * @vma: the vma to be mmapped is prepared by do_mmap */ int shmem_zero_setup(struct vm_area_struct *vma) { @@ -4055,7 +4258,7 @@ loff_t size = vma->vm_end - vma->vm_start; /* - * Cloning a new file under mmap_sem leads to a lock ordering conflict + * Cloning a new file under mmap_lock leads to a lock ordering conflict * between XFS directory reading and selinux: since this file is only * accessible to the user through its mapping, use S_PRIVATE flag to * bypass file security, in the same way as shmem_kernel_file_setup(). 
@@ -4069,7 +4272,7 @@ vma->vm_file = file; vma->vm_ops = &shmem_vm_ops; - if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) < (vma->vm_end & HPAGE_PMD_MASK)) { khugepaged_enter(vma, vma->vm_flags); @@ -4117,3 +4320,47 @@ #endif } EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp); + +void shmem_mark_page_lazyfree(struct page *page, bool tail) +{ + mark_page_lazyfree_movetail(page, tail); +} +EXPORT_SYMBOL_GPL(shmem_mark_page_lazyfree); + +int reclaim_shmem_address_space(struct address_space *mapping) +{ +#ifdef CONFIG_SHMEM + pgoff_t start = 0; + struct page *page; + LIST_HEAD(page_list); + XA_STATE(xas, &mapping->i_pages, start); + + if (!shmem_mapping(mapping)) + return -EINVAL; + + lru_add_drain(); + + rcu_read_lock(); + xas_for_each(&xas, page, ULONG_MAX) { + if (xas_retry(&xas, page)) + continue; + if (xa_is_value(page)) + continue; + if (isolate_lru_page(page)) + continue; + + list_add(&page->lru, &page_list); + + if (need_resched()) { + xas_pause(&xas); + cond_resched_rcu(); + } + } + rcu_read_unlock(); + + return reclaim_pages(&page_list); +#else + return 0; +#endif +} +EXPORT_SYMBOL_GPL(reclaim_shmem_address_space); -- Gitblit v1.6.2
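
For reference, the user-visible path through the fs_context code added above:
fsopen("tmpfs") lands in shmem_init_fs_context(), every fsconfig() key in
shmem_parse_one() (classic mount(2) option strings instead go through the
parse_monolithic hook, shmem_parse_options()), and FSCONFIG_CMD_CREATE in
shmem_get_tree() -> shmem_fill_super(). A minimal user-space sketch follows,
assuming an x86-64 kernel with the new mount API; the raw syscall numbers and
the /mnt target are illustrative, not taken from this patch:

    #define _GNU_SOURCE
    #include <fcntl.h>              /* AT_FDCWD */
    #include <stdio.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* uapi constants from <linux/mount.h>, repeated to stay self-contained */
    #define FSCONFIG_SET_FLAG       0   /* flag option, e.g. inode64 */
    #define FSCONFIG_SET_STRING     1   /* key=value option */
    #define FSCONFIG_CMD_CREATE     6   /* create the superblock */
    #define MOVE_MOUNT_F_EMPTY_PATH 0x4

    int main(void)
    {
            /* fsopen("tmpfs") -> shmem_init_fs_context() */
            int fsfd = syscall(430 /* __NR_fsopen */, "tmpfs", 0);

            /* each key below is parsed by shmem_parse_one() */
            syscall(431 /* __NR_fsconfig */, fsfd,
                    FSCONFIG_SET_STRING, "size", "64m", 0);
            syscall(431, fsfd, FSCONFIG_SET_STRING, "huge", "within_size", 0);
            syscall(431, fsfd, FSCONFIG_SET_FLAG, "inode64", NULL, 0);

            /* FSCONFIG_CMD_CREATE -> shmem_get_tree() -> shmem_fill_super() */
            if (syscall(431, fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) == -1) {
                    perror("FSCONFIG_CMD_CREATE");
                    return 1;
            }

            int mfd = syscall(432 /* __NR_fsmount */, fsfd, 0, 0);
            syscall(429 /* __NR_move_mount */, mfd, "", AT_FDCWD, "/mnt",
                    MOVE_MOUNT_F_EMPTY_PATH);
            close(mfd);
            close(fsfd);
            return 0;
    }

On a parse failure, the string passed to invalfc() (e.g. "Bad value for
'size'") can be read back from fsfd before closing it.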
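
shmem_reconfigure() sits on the same plumbing, reached through fspick() plus
FSCONFIG_CMD_RECONFIGURE rather than mount(2)'s old MS_REMOUNT path. A sketch
under the same assumptions (raw x86-64 syscall numbers, an existing tmpfs on
/mnt with more than 1 MiB in use):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    #define FSCONFIG_SET_STRING      1
    #define FSCONFIG_CMD_RECONFIGURE 7

    int main(void)
    {
            /* fspick() binds a context to the superblock mounted on /mnt */
            int fsfd = syscall(433 /* __NR_fspick */, AT_FDCWD, "/mnt", 0);

            /* staged into ctx->blocks, setting SHMEM_SEEN_BLOCKS */
            syscall(431 /* __NR_fsconfig */, fsfd,
                    FSCONFIG_SET_STRING, "size", "1m", 0);

            /*
             * shmem_reconfigure() checks the staged limit against
             * used_blocks: expect "Too small a size for current use" here,
             * or "Cannot retroactively limit size" if the instance was
             * originally mounted unlimited (size=0).
             */
            if (syscall(431, fsfd, FSCONFIG_CMD_RECONFIGURE, NULL, NULL, 0))
                    perror("FSCONFIG_CMD_RECONFIGURE");

            close(fsfd);
            return 0;
    }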
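
The two exports added at the bottom, shmem_mark_page_lazyfree() and
reclaim_shmem_address_space(), have no callers in this file; they exist for
vendor modules. A hypothetical in-kernel sketch of a driver forcing a
shmem-backed buffer out to swap: the demo_* name and 4 MiB size are invented,
and it assumes the new export is declared in a header the module can reach;
only shmem_file_setup(), fput() and the exports themselves come from the
kernel or this patch.

    #include <linux/err.h>
    #include <linux/file.h>
    #include <linux/fs.h>
    #include <linux/mm.h>
    #include <linux/printk.h>
    #include <linux/shmem_fs.h>

    static int demo_evict_buffer(void)
    {
            struct file *filp;
            int reclaimed;

            /* a 4 MiB shmem-backed buffer, ION/dmabuf-heap style */
            filp = shmem_file_setup("demo-buffer", 4 << 20, VM_NORESERVE);
            if (IS_ERR(filp))
                    return PTR_ERR(filp);

            /* ... fault pages in (e.g. shmem_read_mapping_page_gfp()) ... */

            /*
             * Walks the mapping's xarray, isolates every resident page off
             * the LRU and hands the batch to reclaim_pages(); returns the
             * number of pages actually reclaimed.
             */
            reclaimed = reclaim_shmem_address_space(filp->f_mapping);
            pr_info("demo: reclaimed %d pages\n", reclaimed);

            fput(filp);
            return 0;
    }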