From 08f87f769b595151be1afeff53e144f543faa614 Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Wed, 06 Dec 2023 09:51:13 +0000
Subject: [PATCH] add dts config
---
kernel/mm/shmem.c | 1876 ++++++++++++++++++++++++++++++++-------------------------
1 files changed, 1,053 insertions(+), 823 deletions(-)
diff --git a/kernel/mm/shmem.c b/kernel/mm/shmem.c
index bbb8780..682a3e9 100644
--- a/kernel/mm/shmem.c
+++ b/kernel/mm/shmem.c
@@ -36,8 +36,17 @@
#include <linux/uio.h>
#include <linux/khugepaged.h>
#include <linux/hugetlb.h>
+#include <linux/frontswap.h>
+#include <linux/fs_parser.h>
+#include <linux/mm_inline.h>
#include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */
+
+#include "internal.h"
+
+#undef CREATE_TRACE_POINTS
+#include <trace/hooks/shmem_fs.h>
+#include <trace/hooks/mm.h>
static struct vfsmount *shm_mnt;
@@ -80,7 +89,6 @@
#include <linux/uuid.h>
#include <linux/uaccess.h>
-#include <asm/pgtable.h>
#include "internal.h"
@@ -106,21 +114,43 @@
pgoff_t nr_unswapped; /* how often writepage refused to swap out */
};
+struct shmem_options {
+ unsigned long long blocks;
+ unsigned long long inodes;
+ struct mempolicy *mpol;
+ kuid_t uid;
+ kgid_t gid;
+ umode_t mode;
+ bool full_inums;
+ int huge;
+ int seen;
+#define SHMEM_SEEN_BLOCKS 1
+#define SHMEM_SEEN_INODES 2
+#define SHMEM_SEEN_HUGE 4
+#define SHMEM_SEEN_INUMS 8
+};
+
#ifdef CONFIG_TMPFS
static unsigned long shmem_default_max_blocks(void)
{
- return totalram_pages / 2;
+ return totalram_pages() / 2;
}
static unsigned long shmem_default_max_inodes(void)
{
- return min(totalram_pages - totalhigh_pages, totalram_pages / 2);
+ unsigned long nr_pages = totalram_pages();
+
+ return min(nr_pages - totalhigh_pages(), nr_pages / 2);
}
#endif
static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
static int shmem_replace_page(struct page **pagep, gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index);
+static int shmem_swapin_page(struct inode *inode, pgoff_t index,
+ struct page **pagep, enum sgp_type sgp,
+ gfp_t gfp, struct vm_area_struct *vma,
+ vm_fault_t *fault_type);
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
struct page **pagep, enum sgp_type sgp,
gfp_t gfp, struct vm_area_struct *vma,
@@ -239,18 +269,79 @@
static LIST_HEAD(shmem_swaplist);
static DEFINE_MUTEX(shmem_swaplist_mutex);
-static int shmem_reserve_inode(struct super_block *sb)
+/*
+ * shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and
+ * produces a novel ino for the newly allocated inode.
+ *
+ * It may also be called when making a hard link to permit the space needed by
+ * each dentry. However, in that case, no new inode number is needed since that
+ * internally draws from another pool of inode numbers (currently global
+ * get_next_ino()). This case is indicated by passing NULL as inop.
+ */
+#define SHMEM_INO_BATCH 1024
+static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
{
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
- if (sbinfo->max_inodes) {
- spin_lock(&sbinfo->stat_lock);
- if (!sbinfo->free_inodes) {
- spin_unlock(&sbinfo->stat_lock);
- return -ENOSPC;
+ ino_t ino;
+
+ if (!(sb->s_flags & SB_KERNMOUNT)) {
+ raw_spin_lock(&sbinfo->stat_lock);
+ if (sbinfo->max_inodes) {
+ if (!sbinfo->free_inodes) {
+ raw_spin_unlock(&sbinfo->stat_lock);
+ return -ENOSPC;
+ }
+ sbinfo->free_inodes--;
}
- sbinfo->free_inodes--;
- spin_unlock(&sbinfo->stat_lock);
+ if (inop) {
+ ino = sbinfo->next_ino++;
+ if (unlikely(is_zero_ino(ino)))
+ ino = sbinfo->next_ino++;
+ if (unlikely(!sbinfo->full_inums &&
+ ino > UINT_MAX)) {
+ /*
+ * Emulate get_next_ino uint wraparound for
+ * compatibility
+ */
+ if (IS_ENABLED(CONFIG_64BIT))
+ pr_warn("%s: inode number overflow on device %d, consider using inode64 mount option\n",
+ __func__, MINOR(sb->s_dev));
+ sbinfo->next_ino = 1;
+ ino = sbinfo->next_ino++;
+ }
+ *inop = ino;
+ }
+ raw_spin_unlock(&sbinfo->stat_lock);
+ } else if (inop) {
+ /*
+ * __shmem_file_setup, one of our callers, is lock-free: it
+ * doesn't hold stat_lock in shmem_reserve_inode since
+ * max_inodes is always 0, and is called from potentially
+ * unknown contexts. As such, use a per-cpu batched allocator
+ * which doesn't require the per-sb stat_lock unless we are at
+ * the batch boundary.
+ *
+ * We don't need to worry about inode{32,64} since SB_KERNMOUNT
+ * shmem mounts are not exposed to userspace, so we don't need
+ * to worry about things like glibc compatibility.
+ */
+ ino_t *next_ino;
+
+ next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu());
+ ino = *next_ino;
+ if (unlikely(ino % SHMEM_INO_BATCH == 0)) {
+ raw_spin_lock(&sbinfo->stat_lock);
+ ino = sbinfo->next_ino;
+ sbinfo->next_ino += SHMEM_INO_BATCH;
+ raw_spin_unlock(&sbinfo->stat_lock);
+ if (unlikely(is_zero_ino(ino)))
+ ino++;
+ }
+ *inop = ino;
+ *next_ino = ++ino;
+ put_cpu();
}
+
return 0;
}
@@ -258,9 +349,9 @@
{
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
if (sbinfo->max_inodes) {
- spin_lock(&sbinfo->stat_lock);
+ raw_spin_lock(&sbinfo->stat_lock);
sbinfo->free_inodes++;
- spin_unlock(&sbinfo->stat_lock);
+ raw_spin_unlock(&sbinfo->stat_lock);
}
}
@@ -326,24 +417,20 @@
}
/*
- * Replace item expected in radix tree by a new item, while holding tree lock.
+ * Replace item expected in xarray by a new item, while holding xa_lock.
*/
-static int shmem_radix_tree_replace(struct address_space *mapping,
+static int shmem_replace_entry(struct address_space *mapping,
pgoff_t index, void *expected, void *replacement)
{
- struct radix_tree_node *node;
- void __rcu **pslot;
+ XA_STATE(xas, &mapping->i_pages, index);
void *item;
VM_BUG_ON(!expected);
VM_BUG_ON(!replacement);
- item = __radix_tree_lookup(&mapping->i_pages, index, &node, &pslot);
- if (!item)
- return -ENOENT;
+ item = xas_load(&xas);
if (item != expected)
return -ENOENT;
- __radix_tree_replace(&mapping->i_pages, node, pslot,
- replacement, NULL);
+ xas_store(&xas, replacement);
return 0;
}
@@ -357,12 +444,7 @@
static bool shmem_confirm_swap(struct address_space *mapping,
pgoff_t index, swp_entry_t swap)
{
- void *item;
-
- rcu_read_lock();
- item = radix_tree_lookup(&mapping->i_pages, index);
- rcu_read_unlock();
- return item == swp_to_radix_entry(swap);
+ return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap);
}
/*
@@ -397,12 +479,12 @@
#define SHMEM_HUGE_DENY (-1)
#define SHMEM_HUGE_FORCE (-2)
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* ifdef here to avoid bloating shmem.o when not necessary */
static int shmem_huge __read_mostly;
-#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
+#if defined(CONFIG_SYSFS)
static int shmem_parse_huge(const char *str)
{
if (!strcmp(str, "never"))
@@ -419,7 +501,9 @@
return SHMEM_HUGE_FORCE;
return -EINVAL;
}
+#endif
+#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
static const char *shmem_format_huge(int huge)
{
switch (huge) {
@@ -570,7 +654,7 @@
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
return READ_ONCE(sbinfo->shrinklist_len);
}
-#else /* !CONFIG_TRANSPARENT_HUGE_PAGECACHE */
+#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
#define shmem_huge SHMEM_HUGE_DENY
@@ -579,11 +663,11 @@
{
return 0;
}
-#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo)
{
- if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
(shmem_huge == SHMEM_HUGE_FORCE || sbinfo->huge) &&
shmem_huge != SHMEM_HUGE_DENY)
return true;
@@ -595,9 +679,13 @@
*/
static int shmem_add_to_page_cache(struct page *page,
struct address_space *mapping,
- pgoff_t index, void *expected)
+ pgoff_t index, void *expected, gfp_t gfp,
+ struct mm_struct *charge_mm)
{
- int error, nr = hpage_nr_pages(page);
+ XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page));
+ unsigned long i = 0;
+ unsigned long nr = compound_nr(page);
+ int error;
VM_BUG_ON_PAGE(PageTail(page), page);
VM_BUG_ON_PAGE(index != round_down(index, nr), page);
@@ -609,46 +697,53 @@
page->mapping = mapping;
page->index = index;
- xa_lock_irq(&mapping->i_pages);
- if (PageTransHuge(page)) {
- void __rcu **results;
- pgoff_t idx;
- int i;
-
- error = 0;
- if (radix_tree_gang_lookup_slot(&mapping->i_pages,
- &results, &idx, index, 1) &&
- idx < index + HPAGE_PMD_NR) {
- error = -EEXIST;
- }
-
- if (!error) {
- for (i = 0; i < HPAGE_PMD_NR; i++) {
- error = radix_tree_insert(&mapping->i_pages,
- index + i, page + i);
- VM_BUG_ON(error);
+ if (!PageSwapCache(page)) {
+ error = mem_cgroup_charge(page, charge_mm, gfp);
+ if (error) {
+ if (PageTransHuge(page)) {
+ count_vm_event(THP_FILE_FALLBACK);
+ count_vm_event(THP_FILE_FALLBACK_CHARGE);
}
- count_vm_event(THP_FILE_ALLOC);
+ goto error;
}
- } else if (!expected) {
- error = radix_tree_insert(&mapping->i_pages, index, page);
- } else {
- error = shmem_radix_tree_replace(mapping, index, expected,
- page);
+ }
+ cgroup_throttle_swaprate(page, gfp);
+
+ do {
+ void *entry;
+ xas_lock_irq(&xas);
+ entry = xas_find_conflict(&xas);
+ if (entry != expected)
+ xas_set_err(&xas, -EEXIST);
+ xas_create_range(&xas);
+ if (xas_error(&xas))
+ goto unlock;
+next:
+ xas_store(&xas, page);
+ if (++i < nr) {
+ xas_next(&xas);
+ goto next;
+ }
+ if (PageTransHuge(page)) {
+ count_vm_event(THP_FILE_ALLOC);
+ __inc_node_page_state(page, NR_SHMEM_THPS);
+ }
+ mapping->nrpages += nr;
+ __mod_lruvec_page_state(page, NR_FILE_PAGES, nr);
+ __mod_lruvec_page_state(page, NR_SHMEM, nr);
+unlock:
+ xas_unlock_irq(&xas);
+ } while (xas_nomem(&xas, gfp));
+
+ if (xas_error(&xas)) {
+ error = xas_error(&xas);
+ goto error;
}
- if (!error) {
- mapping->nrpages += nr;
- if (PageTransHuge(page))
- __inc_node_page_state(page, NR_SHMEM_THPS);
- __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
- __mod_node_page_state(page_pgdat(page), NR_SHMEM, nr);
- xa_unlock_irq(&mapping->i_pages);
- } else {
- page->mapping = NULL;
- xa_unlock_irq(&mapping->i_pages);
- page_ref_sub(page, nr);
- }
+ return 0;
+error:
+ page->mapping = NULL;
+ page_ref_sub(page, nr);
return error;
}
@@ -663,27 +758,25 @@
VM_BUG_ON_PAGE(PageCompound(page), page);
xa_lock_irq(&mapping->i_pages);
- error = shmem_radix_tree_replace(mapping, page->index, page, radswap);
+ error = shmem_replace_entry(mapping, page->index, page, radswap);
page->mapping = NULL;
mapping->nrpages--;
- __dec_node_page_state(page, NR_FILE_PAGES);
- __dec_node_page_state(page, NR_SHMEM);
+ __dec_lruvec_page_state(page, NR_FILE_PAGES);
+ __dec_lruvec_page_state(page, NR_SHMEM);
xa_unlock_irq(&mapping->i_pages);
put_page(page);
BUG_ON(error);
}
/*
- * Remove swap entry from radix tree, free the swap and its page cache.
+ * Remove swap entry from page cache, free the swap and its page cache.
*/
static int shmem_free_swap(struct address_space *mapping,
pgoff_t index, void *radswap)
{
void *old;
- xa_lock_irq(&mapping->i_pages);
- old = radix_tree_delete_item(&mapping->i_pages, index, radswap);
- xa_unlock_irq(&mapping->i_pages);
+ old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0);
if (old != radswap)
return -ENOENT;
free_swap_and_cache(radix_to_swp_entry(radswap));
@@ -700,29 +793,19 @@
unsigned long shmem_partial_swap_usage(struct address_space *mapping,
pgoff_t start, pgoff_t end)
{
- struct radix_tree_iter iter;
- void __rcu **slot;
+ XA_STATE(xas, &mapping->i_pages, start);
struct page *page;
unsigned long swapped = 0;
rcu_read_lock();
-
- radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
- if (iter.index >= end)
- break;
-
- page = radix_tree_deref_slot(slot);
-
- if (radix_tree_deref_retry(page)) {
- slot = radix_tree_iter_retry(&iter);
+ xas_for_each(&xas, page, end - 1) {
+ if (xas_retry(&xas, page))
continue;
- }
-
- if (radix_tree_exceptional_entry(page))
+ if (xa_is_value(page))
swapped++;
if (need_resched()) {
- slot = radix_tree_iter_resume(slot, &iter);
+ xas_pause(&xas);
cond_resched_rcu();
}
}
@@ -797,7 +880,33 @@
}
/*
- * Remove range of pages and swap entries from radix tree, and free them.
+ * Check whether a hole-punch or truncation needs to split a huge page,
+ * returning true if no split was required, or the split has been successful.
+ *
+ * Eviction (or truncation to 0 size) should never need to split a huge page;
+ * but in rare cases might do so, if shmem_undo_range() failed to trylock on
+ * head, and then succeeded to trylock on tail.
+ *
+ * A split can only succeed when there are no additional references on the
+ * huge page: so the split below relies upon find_get_entries() having stopped
+ * when it found a subpage of the huge page, without getting further references.
+ */
+static bool shmem_punch_compound(struct page *page, pgoff_t start, pgoff_t end)
+{
+ if (!PageTransCompound(page))
+ return true;
+
+ /* Just proceed to delete a huge page wholly within the range punched */
+ if (PageHead(page) &&
+ page->index >= start && page->index + HPAGE_PMD_NR <= end)
+ return true;
+
+ /* Try to split huge page, so we can truly punch the hole or truncate */
+ return split_huge_page(page) >= 0;
+}
+
+/*
+ * Remove range of pages and swap entries from page cache, and free them.
* If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
*/
static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
@@ -833,7 +942,7 @@
if (index >= end)
break;
- if (radix_tree_exceptional_entry(page)) {
+ if (xa_is_value(page)) {
if (unfalloc)
continue;
nr_swaps_freed += !shmem_free_swap(mapping,
@@ -846,31 +955,11 @@
if (!trylock_page(page))
continue;
- if (PageTransTail(page)) {
- /* Middle of THP: zero out the page */
- clear_highpage(page);
- unlock_page(page);
- continue;
- } else if (PageTransHuge(page)) {
- if (index == round_down(end, HPAGE_PMD_NR)) {
- /*
- * Range ends in the middle of THP:
- * zero out the page
- */
- clear_highpage(page);
- unlock_page(page);
- continue;
- }
- index += HPAGE_PMD_NR - 1;
- i += HPAGE_PMD_NR - 1;
- }
-
- if (!unfalloc || !PageUptodate(page)) {
- VM_BUG_ON_PAGE(PageTail(page), page);
- if (page_mapping(page) == mapping) {
- VM_BUG_ON_PAGE(PageWriteback(page), page);
+ if ((!unfalloc || !PageUptodate(page)) &&
+ page_mapping(page) == mapping) {
+ VM_BUG_ON_PAGE(PageWriteback(page), page);
+ if (shmem_punch_compound(page, start, end))
truncate_inode_page(mapping, page);
- }
}
unlock_page(page);
}
@@ -930,7 +1019,7 @@
if (index >= end)
break;
- if (radix_tree_exceptional_entry(page)) {
+ if (xa_is_value(page)) {
if (unfalloc)
continue;
if (shmem_free_swap(mapping, index, page)) {
@@ -944,42 +1033,24 @@
lock_page(page);
- if (PageTransTail(page)) {
- /* Middle of THP: zero out the page */
- clear_highpage(page);
- unlock_page(page);
- /*
- * Partial thp truncate due 'start' in middle
- * of THP: don't need to look on these pages
- * again on !pvec.nr restart.
- */
- if (index != round_down(end, HPAGE_PMD_NR))
- start++;
- continue;
- } else if (PageTransHuge(page)) {
- if (index == round_down(end, HPAGE_PMD_NR)) {
- /*
- * Range ends in the middle of THP:
- * zero out the page
- */
- clear_highpage(page);
- unlock_page(page);
- continue;
- }
- index += HPAGE_PMD_NR - 1;
- i += HPAGE_PMD_NR - 1;
- }
-
if (!unfalloc || !PageUptodate(page)) {
- VM_BUG_ON_PAGE(PageTail(page), page);
- if (page_mapping(page) == mapping) {
- VM_BUG_ON_PAGE(PageWriteback(page), page);
- truncate_inode_page(mapping, page);
- } else {
+ if (page_mapping(page) != mapping) {
/* Page was replaced by swap: retry */
unlock_page(page);
index--;
break;
+ }
+ VM_BUG_ON_PAGE(PageWriteback(page), page);
+ if (shmem_punch_compound(page, start, end))
+ truncate_inode_page(mapping, page);
+ else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
+ /* Wipe the page and don't get stuck */
+ clear_highpage(page);
+ flush_dcache_page(page);
+ set_page_dirty(page);
+ if (index <
+ round_up(start, HPAGE_PMD_NR))
+ start = index + 1;
}
}
unlock_page(page);
@@ -1067,7 +1138,7 @@
* Part of the huge page can be beyond i_size: subject
* to shrink under memory pressure.
*/
- if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
spin_lock(&sbinfo->shrinklist_lock);
/*
* _careful to defend against unlocked access to
@@ -1106,9 +1177,14 @@
}
spin_unlock(&sbinfo->shrinklist_lock);
}
- if (!list_empty(&info->swaplist)) {
+ while (!list_empty(&info->swaplist)) {
+ /* Wait while shmem_unuse() is scanning this inode... */
+ wait_var_event(&info->stop_eviction,
+ !atomic_read(&info->stop_eviction));
mutex_lock(&shmem_swaplist_mutex);
- list_del_init(&info->swaplist);
+ /* ...but beware of the race if we peeked too early */
+ if (!atomic_read(&info->stop_eviction))
+ list_del_init(&info->swaplist);
mutex_unlock(&shmem_swaplist_mutex);
}
}
@@ -1119,166 +1195,174 @@
clear_inode(inode);
}
-static unsigned long find_swap_entry(struct radix_tree_root *root, void *item)
+extern struct swap_info_struct *swap_info[];
+
+static int shmem_find_swap_entries(struct address_space *mapping,
+ pgoff_t start, unsigned int nr_entries,
+ struct page **entries, pgoff_t *indices,
+ unsigned int type, bool frontswap)
{
- struct radix_tree_iter iter;
- void __rcu **slot;
- unsigned long found = -1;
- unsigned int checked = 0;
+ XA_STATE(xas, &mapping->i_pages, start);
+ struct page *page;
+ swp_entry_t entry;
+ unsigned int ret = 0;
+
+ if (!nr_entries)
+ return 0;
rcu_read_lock();
- radix_tree_for_each_slot(slot, root, &iter, 0) {
- void *entry = radix_tree_deref_slot(slot);
-
- if (radix_tree_deref_retry(entry)) {
- slot = radix_tree_iter_retry(&iter);
+ xas_for_each(&xas, page, ULONG_MAX) {
+ if (xas_retry(&xas, page))
continue;
+
+ if (!xa_is_value(page))
+ continue;
+
+ entry = radix_to_swp_entry(page);
+ if (swp_type(entry) != type)
+ continue;
+ if (frontswap &&
+ !frontswap_test(swap_info[type], swp_offset(entry)))
+ continue;
+
+ indices[ret] = xas.xa_index;
+ entries[ret] = page;
+
+ if (need_resched()) {
+ xas_pause(&xas);
+ cond_resched_rcu();
}
- if (entry == item) {
- found = iter.index;
+ if (++ret == nr_entries)
break;
- }
- checked++;
- if ((checked % 4096) != 0)
- continue;
- slot = radix_tree_iter_resume(slot, &iter);
- cond_resched_rcu();
}
-
rcu_read_unlock();
- return found;
+
+ return ret;
+}
+
+/*
+ * Move the swapped pages for an inode to page cache. Returns the count
+ * of pages swapped in, or the error in case of failure.
+ */
+static int shmem_unuse_swap_entries(struct inode *inode, struct pagevec pvec,
+ pgoff_t *indices)
+{
+ int i = 0;
+ int ret = 0;
+ int error = 0;
+ struct address_space *mapping = inode->i_mapping;
+
+ for (i = 0; i < pvec.nr; i++) {
+ struct page *page = pvec.pages[i];
+
+ if (!xa_is_value(page))
+ continue;
+ error = shmem_swapin_page(inode, indices[i],
+ &page, SGP_CACHE,
+ mapping_gfp_mask(mapping),
+ NULL, NULL);
+ if (error == 0) {
+ unlock_page(page);
+ put_page(page);
+ ret++;
+ }
+ if (error == -ENOMEM)
+ break;
+ error = 0;
+ }
+ return error ? error : ret;
}
/*
* If swap found in inode, free it and move page from swapcache to filecache.
*/
-static int shmem_unuse_inode(struct shmem_inode_info *info,
- swp_entry_t swap, struct page **pagep)
+static int shmem_unuse_inode(struct inode *inode, unsigned int type,
+ bool frontswap, unsigned long *fs_pages_to_unuse)
{
- struct address_space *mapping = info->vfs_inode.i_mapping;
- void *radswap;
- pgoff_t index;
- gfp_t gfp;
- int error = 0;
+ struct address_space *mapping = inode->i_mapping;
+ pgoff_t start = 0;
+ struct pagevec pvec;
+ pgoff_t indices[PAGEVEC_SIZE];
+ bool frontswap_partial = (frontswap && *fs_pages_to_unuse > 0);
+ int ret = 0;
- radswap = swp_to_radix_entry(swap);
- index = find_swap_entry(&mapping->i_pages, radswap);
- if (index == -1)
- return -EAGAIN; /* tell shmem_unuse we found nothing */
+ pagevec_init(&pvec);
+ do {
+ unsigned int nr_entries = PAGEVEC_SIZE;
- /*
- * Move _head_ to start search for next from here.
- * But be careful: shmem_evict_inode checks list_empty without taking
- * mutex, and there's an instant in list_move_tail when info->swaplist
- * would appear empty, if it were the only one on shmem_swaplist.
- */
- if (shmem_swaplist.next != &info->swaplist)
- list_move_tail(&shmem_swaplist, &info->swaplist);
+ if (frontswap_partial && *fs_pages_to_unuse < PAGEVEC_SIZE)
+ nr_entries = *fs_pages_to_unuse;
- gfp = mapping_gfp_mask(mapping);
- if (shmem_should_replace_page(*pagep, gfp)) {
- mutex_unlock(&shmem_swaplist_mutex);
- error = shmem_replace_page(pagep, gfp, info, index);
- mutex_lock(&shmem_swaplist_mutex);
- /*
- * We needed to drop mutex to make that restrictive page
- * allocation, but the inode might have been freed while we
- * dropped it: although a racing shmem_evict_inode() cannot
- * complete without emptying the radix_tree, our page lock
- * on this swapcache page is not enough to prevent that -
- * free_swap_and_cache() of our swap entry will only
- * trylock_page(), removing swap from radix_tree whatever.
- *
- * We must not proceed to shmem_add_to_page_cache() if the
- * inode has been freed, but of course we cannot rely on
- * inode or mapping or info to check that. However, we can
- * safely check if our swap entry is still in use (and here
- * it can't have got reused for another page): if it's still
- * in use, then the inode cannot have been freed yet, and we
- * can safely proceed (if it's no longer in use, that tells
- * nothing about the inode, but we don't need to unuse swap).
- */
- if (!page_swapcount(*pagep))
- error = -ENOENT;
- }
-
- /*
- * We rely on shmem_swaplist_mutex, not only to protect the swaplist,
- * but also to hold up shmem_evict_inode(): so inode cannot be freed
- * beneath us (pagelock doesn't help until the page is in pagecache).
- */
- if (!error)
- error = shmem_add_to_page_cache(*pagep, mapping, index,
- radswap);
- if (error != -ENOMEM) {
- /*
- * Truncation and eviction use free_swap_and_cache(), which
- * only does trylock page: if we raced, best clean up here.
- */
- delete_from_swap_cache(*pagep);
- set_page_dirty(*pagep);
- if (!error) {
- spin_lock_irq(&info->lock);
- info->swapped--;
- spin_unlock_irq(&info->lock);
- swap_free(swap);
+ pvec.nr = shmem_find_swap_entries(mapping, start, nr_entries,
+ pvec.pages, indices,
+ type, frontswap);
+ if (pvec.nr == 0) {
+ ret = 0;
+ break;
}
- }
- return error;
+
+ ret = shmem_unuse_swap_entries(inode, pvec, indices);
+ if (ret < 0)
+ break;
+
+ if (frontswap_partial) {
+ *fs_pages_to_unuse -= ret;
+ if (*fs_pages_to_unuse == 0) {
+ ret = FRONTSWAP_PAGES_UNUSED;
+ break;
+ }
+ }
+
+ start = indices[pvec.nr - 1];
+ } while (true);
+
+ return ret;
}
/*
- * Search through swapped inodes to find and replace swap by page.
+ * Read all the shared memory data that resides in the swap
+ * device 'type' back into memory, so the swap device can be
+ * unused.
*/
-int shmem_unuse(swp_entry_t swap, struct page *page)
+int shmem_unuse(unsigned int type, bool frontswap,
+ unsigned long *fs_pages_to_unuse)
{
- struct list_head *this, *next;
- struct shmem_inode_info *info;
- struct mem_cgroup *memcg;
+ struct shmem_inode_info *info, *next;
int error = 0;
- /*
- * There's a faint possibility that swap page was replaced before
- * caller locked it: caller will come back later with the right page.
- */
- if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val))
- goto out;
-
- /*
- * Charge page using GFP_KERNEL while we can wait, before taking
- * the shmem_swaplist_mutex which might hold up shmem_writepage().
- * Charged back to the user (not to caller) when swap account is used.
- */
- error = mem_cgroup_try_charge_delay(page, current->mm, GFP_KERNEL,
- &memcg, false);
- if (error)
- goto out;
- /* No radix_tree_preload: swap entry keeps a place for page in tree */
- error = -EAGAIN;
+ if (list_empty(&shmem_swaplist))
+ return 0;
mutex_lock(&shmem_swaplist_mutex);
- list_for_each_safe(this, next, &shmem_swaplist) {
- info = list_entry(this, struct shmem_inode_info, swaplist);
- if (info->swapped)
- error = shmem_unuse_inode(info, swap, &page);
- else
+ list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
+ if (!info->swapped) {
list_del_init(&info->swaplist);
+ continue;
+ }
+ /*
+ * Drop the swaplist mutex while searching the inode for swap;
+ * but before doing so, make sure shmem_evict_inode() will not
+ * remove placeholder inode from swaplist, nor let it be freed
+ * (igrab() would protect from unlink, but not from unmount).
+ */
+ atomic_inc(&info->stop_eviction);
+ mutex_unlock(&shmem_swaplist_mutex);
+
+ error = shmem_unuse_inode(&info->vfs_inode, type, frontswap,
+ fs_pages_to_unuse);
cond_resched();
- if (error != -EAGAIN)
+
+ mutex_lock(&shmem_swaplist_mutex);
+ next = list_next_entry(info, swaplist);
+ if (!info->swapped)
+ list_del_init(&info->swaplist);
+ if (atomic_dec_and_test(&info->stop_eviction))
+ wake_up_var(&info->stop_eviction);
+ if (error)
break;
- /* found nothing in this: move on to search the next */
}
mutex_unlock(&shmem_swaplist_mutex);
- if (error) {
- if (error != -ENOMEM)
- error = 0;
- mem_cgroup_cancel_charge(page, memcg, false);
- } else
- mem_cgroup_commit_charge(page, memcg, true, false);
-out:
- unlock_page(page);
- put_page(page);
return error;
}
@@ -1348,6 +1432,7 @@
SetPageUptodate(page);
}
+ trace_android_vh_set_shmem_page_flag(page);
swap = get_swap_page(page);
if (!swap.val)
goto redirty;
@@ -1362,9 +1447,11 @@
*/
mutex_lock(&shmem_swaplist_mutex);
if (list_empty(&info->swaplist))
- list_add_tail(&info->swaplist, &shmem_swaplist);
+ list_add(&info->swaplist, &shmem_swaplist);
- if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
+ if (add_to_swap_cache(page, swap,
+ __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN,
+ NULL) == 0) {
spin_lock_irq(&info->lock);
shmem_recalc_inode(inode);
info->swapped++;
@@ -1406,10 +1493,10 @@
{
struct mempolicy *mpol = NULL;
if (sbinfo->mpol) {
- spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */
+ raw_spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */
mpol = sbinfo->mpol;
mpol_get(mpol);
- spin_unlock(&sbinfo->stat_lock);
+ raw_spin_unlock(&sbinfo->stat_lock);
}
return mpol;
}
@@ -1447,11 +1534,11 @@
{
struct vm_area_struct pvma;
struct page *page;
- struct vm_fault vmf;
+ struct vm_fault vmf = {
+ .vma = &pvma,
+ };
shmem_pseudo_vma_init(&pvma, info, index);
- vmf.vma = &pvma;
- vmf.address = 0;
page = swap_cluster_readahead(swap, gfp, &vmf);
shmem_pseudo_vma_destroy(&pvma);
@@ -1462,23 +1549,14 @@
struct shmem_inode_info *info, pgoff_t index)
{
struct vm_area_struct pvma;
- struct inode *inode = &info->vfs_inode;
- struct address_space *mapping = inode->i_mapping;
- pgoff_t idx, hindex;
- void __rcu **results;
+ struct address_space *mapping = info->vfs_inode.i_mapping;
+ pgoff_t hindex;
struct page *page;
- if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
- return NULL;
-
hindex = round_down(index, HPAGE_PMD_NR);
- rcu_read_lock();
- if (radix_tree_gang_lookup_slot(&mapping->i_pages, &results, &idx,
- hindex, 1) && idx < hindex + HPAGE_PMD_NR) {
- rcu_read_unlock();
+ if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1,
+ XA_PRESENT))
return NULL;
- }
- rcu_read_unlock();
shmem_pseudo_vma_init(&pvma, info, hindex);
page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN,
@@ -1486,6 +1564,8 @@
shmem_pseudo_vma_destroy(&pvma);
if (page)
prep_transhuge_page(page);
+ else
+ count_vm_event(THP_FILE_FALLBACK);
return page;
}
@@ -1493,7 +1573,11 @@
struct shmem_inode_info *info, pgoff_t index)
{
struct vm_area_struct pvma;
- struct page *page;
+ struct page *page = NULL;
+
+ trace_android_vh_shmem_alloc_page(&page);
+ if (page)
+ return page;
shmem_pseudo_vma_init(&pvma, info, index);
page = alloc_page_vma(gfp, &pvma, 0);
@@ -1511,7 +1595,7 @@
int nr;
int err = -ENOSPC;
- if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
+ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
huge = false;
nr = huge ? HPAGE_PMD_NR : 1;
@@ -1589,11 +1673,11 @@
* a nice clean interface for us to replace oldpage by newpage there.
*/
xa_lock_irq(&swap_mapping->i_pages);
- error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage,
- newpage);
+ error = shmem_replace_entry(swap_mapping, swap_index, oldpage, newpage);
if (!error) {
- __inc_node_page_state(newpage, NR_FILE_PAGES);
- __dec_node_page_state(oldpage, NR_FILE_PAGES);
+ mem_cgroup_migrate(oldpage, newpage);
+ __inc_lruvec_page_state(newpage, NR_FILE_PAGES);
+ __dec_lruvec_page_state(oldpage, NR_FILE_PAGES);
}
xa_unlock_irq(&swap_mapping->i_pages);
@@ -1605,8 +1689,7 @@
*/
oldpage = newpage;
} else {
- mem_cgroup_migrate(oldpage, newpage);
- lru_cache_add_anon(newpage);
+ lru_cache_add(newpage);
*pagep = newpage;
}
@@ -1620,13 +1703,109 @@
}
/*
+ * Swap in the page pointed to by *pagep.
+ * Caller has to make sure that *pagep contains a valid swapped page.
+ * Returns 0 and the page in pagep if success. On failure, returns the
+ * error code and NULL in *pagep.
+ */
+static int shmem_swapin_page(struct inode *inode, pgoff_t index,
+ struct page **pagep, enum sgp_type sgp,
+ gfp_t gfp, struct vm_area_struct *vma,
+ vm_fault_t *fault_type)
+{
+ struct address_space *mapping = inode->i_mapping;
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ struct mm_struct *charge_mm = vma ? vma->vm_mm : current->mm;
+ struct page *page;
+ swp_entry_t swap;
+ int error;
+
+ VM_BUG_ON(!*pagep || !xa_is_value(*pagep));
+ swap = radix_to_swp_entry(*pagep);
+ *pagep = NULL;
+
+ /* Look it up and read it in.. */
+ page = lookup_swap_cache(swap, NULL, 0);
+ if (!page) {
+ /* Or update major stats only when swapin succeeds?? */
+ if (fault_type) {
+ *fault_type |= VM_FAULT_MAJOR;
+ count_vm_event(PGMAJFAULT);
+ count_memcg_event_mm(charge_mm, PGMAJFAULT);
+ }
+ /* Here we actually start the io */
+ page = shmem_swapin(swap, gfp, info, index);
+ if (!page) {
+ error = -ENOMEM;
+ goto failed;
+ }
+ }
+
+ /* We have to do this with page locked to prevent races */
+ lock_page(page);
+ if (!PageSwapCache(page) || page_private(page) != swap.val ||
+ !shmem_confirm_swap(mapping, index, swap)) {
+ error = -EEXIST;
+ goto unlock;
+ }
+ if (!PageUptodate(page)) {
+ error = -EIO;
+ goto failed;
+ }
+ wait_on_page_writeback(page);
+
+ /*
+ * Some architectures may have to restore extra metadata to the
+ * physical page after reading from swap.
+ */
+ arch_swap_restore(swap, page);
+
+ if (shmem_should_replace_page(page, gfp)) {
+ error = shmem_replace_page(&page, gfp, info, index);
+ if (error)
+ goto failed;
+ }
+
+ error = shmem_add_to_page_cache(page, mapping, index,
+ swp_to_radix_entry(swap), gfp,
+ charge_mm);
+ if (error)
+ goto failed;
+
+ spin_lock_irq(&info->lock);
+ info->swapped--;
+ shmem_recalc_inode(inode);
+ spin_unlock_irq(&info->lock);
+
+ if (sgp == SGP_WRITE)
+ mark_page_accessed(page);
+
+ delete_from_swap_cache(page);
+ set_page_dirty(page);
+ swap_free(swap);
+
+ *pagep = page;
+ return 0;
+failed:
+ if (!shmem_confirm_swap(mapping, index, swap))
+ error = -EEXIST;
+unlock:
+ if (page) {
+ unlock_page(page);
+ put_page(page);
+ }
+
+ return error;
+}
+
+/*
* shmem_getpage_gfp - find page in cache, or get from swap, or allocate
*
* If we allocate a new one we do not mark it dirty. That's up to the
* vm. If we swap it in we mark it dirty since we also free the swap
* entry since a page cannot live in both the swap and page cache.
*
- * fault_mm and fault_type are only supplied by shmem_fault:
+ * vma, vmf, and fault_type are only supplied by shmem_fault:
* otherwise they are NULL.
*/
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
@@ -1638,9 +1817,7 @@
struct shmem_inode_info *info = SHMEM_I(inode);
struct shmem_sb_info *sbinfo;
struct mm_struct *charge_mm;
- struct mem_cgroup *memcg;
struct page *page;
- swp_entry_t swap;
enum sgp_type sgp_huge = sgp;
pgoff_t hindex = index;
int error;
@@ -1652,19 +1829,37 @@
if (sgp == SGP_NOHUGE || sgp == SGP_HUGE)
sgp = SGP_CACHE;
repeat:
- swap.val = 0;
- page = find_lock_entry(mapping, index);
- if (radix_tree_exceptional_entry(page)) {
- swap = radix_to_swp_entry(page);
- page = NULL;
- }
-
if (sgp <= SGP_CACHE &&
((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
- error = -EINVAL;
- goto unlock;
+ return -EINVAL;
}
+ sbinfo = SHMEM_SB(inode->i_sb);
+ charge_mm = vma ? vma->vm_mm : current->mm;
+
+ page = find_lock_entry(mapping, index);
+
+ if (page && vma && userfaultfd_minor(vma)) {
+ if (!xa_is_value(page)) {
+ unlock_page(page);
+ put_page(page);
+ }
+ *fault_type = handle_userfault(vmf, VM_UFFD_MINOR);
+ return 0;
+ }
+
+ if (xa_is_value(page)) {
+ error = shmem_swapin_page(inode, index, &page,
+ sgp, gfp, vma, fault_type);
+ if (error == -EEXIST)
+ goto repeat;
+
+ *pagep = page;
+ return error;
+ }
+
+ if (page)
+ hindex = page->index;
if (page && sgp == SGP_WRITE)
mark_page_accessed(page);
@@ -1675,230 +1870,141 @@
unlock_page(page);
put_page(page);
page = NULL;
+ hindex = index;
}
- if (page || (sgp == SGP_READ && !swap.val)) {
- *pagep = page;
- return 0;
- }
+ if (page || sgp == SGP_READ)
+ goto out;
/*
* Fast cache lookup did not find it:
* bring it back from swap or allocate.
*/
- sbinfo = SHMEM_SB(inode->i_sb);
- charge_mm = vma ? vma->vm_mm : current->mm;
- if (swap.val) {
- /* Look it up and read it in.. */
- page = lookup_swap_cache(swap, NULL, 0);
- if (!page) {
- /* Or update major stats only when swapin succeeds?? */
- if (fault_type) {
- *fault_type |= VM_FAULT_MAJOR;
- count_vm_event(PGMAJFAULT);
- count_memcg_event_mm(charge_mm, PGMAJFAULT);
- }
- /* Here we actually start the io */
- page = shmem_swapin(swap, gfp, info, index);
- if (!page) {
- error = -ENOMEM;
- goto failed;
- }
- }
+ if (vma && userfaultfd_missing(vma)) {
+ *fault_type = handle_userfault(vmf, VM_UFFD_MISSING);
+ return 0;
+ }
- /* We have to do this with page locked to prevent races */
- lock_page(page);
- if (!PageSwapCache(page) || page_private(page) != swap.val ||
- !shmem_confirm_swap(mapping, index, swap)) {
- error = -EEXIST; /* try again */
- goto unlock;
- }
- if (!PageUptodate(page)) {
- error = -EIO;
- goto failed;
- }
- wait_on_page_writeback(page);
+ /* shmem_symlink() */
+ if (mapping->a_ops != &shmem_aops)
+ goto alloc_nohuge;
+ if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE)
+ goto alloc_nohuge;
+ if (shmem_huge == SHMEM_HUGE_FORCE)
+ goto alloc_huge;
+ switch (sbinfo->huge) {
+ case SHMEM_HUGE_NEVER:
+ goto alloc_nohuge;
+ case SHMEM_HUGE_WITHIN_SIZE: {
+ loff_t i_size;
+ pgoff_t off;
- if (shmem_should_replace_page(page, gfp)) {
- error = shmem_replace_page(&page, gfp, info, index);
- if (error)
- goto failed;
- }
-
- error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
- false);
- if (!error) {
- error = shmem_add_to_page_cache(page, mapping, index,
- swp_to_radix_entry(swap));
- /*
- * We already confirmed swap under page lock, and make
- * no memory allocation here, so usually no possibility
- * of error; but free_swap_and_cache() only trylocks a
- * page, so it is just possible that the entry has been
- * truncated or holepunched since swap was confirmed.
- * shmem_undo_range() will have done some of the
- * unaccounting, now delete_from_swap_cache() will do
- * the rest.
- * Reset swap.val? No, leave it so "failed" goes back to
- * "repeat": reading a hole and writing should succeed.
- */
- if (error) {
- mem_cgroup_cancel_charge(page, memcg, false);
- delete_from_swap_cache(page);
- }
- }
- if (error)
- goto failed;
-
- mem_cgroup_commit_charge(page, memcg, true, false);
-
- spin_lock_irq(&info->lock);
- info->swapped--;
- shmem_recalc_inode(inode);
- spin_unlock_irq(&info->lock);
-
- if (sgp == SGP_WRITE)
- mark_page_accessed(page);
-
- delete_from_swap_cache(page);
- set_page_dirty(page);
- swap_free(swap);
-
- } else {
- if (vma && userfaultfd_missing(vma)) {
- *fault_type = handle_userfault(vmf, VM_UFFD_MISSING);
- return 0;
- }
-
- /* shmem_symlink() */
- if (mapping->a_ops != &shmem_aops)
- goto alloc_nohuge;
- if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE)
- goto alloc_nohuge;
- if (shmem_huge == SHMEM_HUGE_FORCE)
+ off = round_up(index, HPAGE_PMD_NR);
+ i_size = round_up(i_size_read(inode), PAGE_SIZE);
+ if (i_size >= HPAGE_PMD_SIZE &&
+ i_size >> PAGE_SHIFT >= off)
goto alloc_huge;
- switch (sbinfo->huge) {
- loff_t i_size;
- pgoff_t off;
- case SHMEM_HUGE_NEVER:
- goto alloc_nohuge;
- case SHMEM_HUGE_WITHIN_SIZE:
- off = round_up(index, HPAGE_PMD_NR);
- i_size = round_up(i_size_read(inode), PAGE_SIZE);
- if (i_size >= HPAGE_PMD_SIZE &&
- i_size >> PAGE_SHIFT >= off)
- goto alloc_huge;
- /* fallthrough */
- case SHMEM_HUGE_ADVISE:
- if (sgp_huge == SGP_HUGE)
- goto alloc_huge;
- /* TODO: implement fadvise() hints */
- goto alloc_nohuge;
- }
+
+ fallthrough;
+ }
+ case SHMEM_HUGE_ADVISE:
+ if (sgp_huge == SGP_HUGE)
+ goto alloc_huge;
+ /* TODO: implement fadvise() hints */
+ goto alloc_nohuge;
+ }
alloc_huge:
- page = shmem_alloc_and_acct_page(gfp, inode, index, true);
- if (IS_ERR(page)) {
-alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, inode,
- index, false);
- }
- if (IS_ERR(page)) {
- int retry = 5;
- error = PTR_ERR(page);
- page = NULL;
- if (error != -ENOSPC)
- goto failed;
- /*
- * Try to reclaim some spece by splitting a huge page
- * beyond i_size on the filesystem.
- */
- while (retry--) {
- int ret;
- ret = shmem_unused_huge_shrink(sbinfo, NULL, 1);
- if (ret == SHRINK_STOP)
- break;
- if (ret)
- goto alloc_nohuge;
- }
- goto failed;
- }
+ page = shmem_alloc_and_acct_page(gfp, inode, index, true);
+ if (IS_ERR(page)) {
+alloc_nohuge:
+ page = shmem_alloc_and_acct_page(gfp, inode,
+ index, false);
+ }
+ if (IS_ERR(page)) {
+ int retry = 5;
- if (PageTransHuge(page))
- hindex = round_down(index, HPAGE_PMD_NR);
- else
- hindex = index;
-
- if (sgp == SGP_WRITE)
- __SetPageReferenced(page);
-
- error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
- PageTransHuge(page));
- if (error)
- goto unacct;
- error = radix_tree_maybe_preload_order(gfp & GFP_RECLAIM_MASK,
- compound_order(page));
- if (!error) {
- error = shmem_add_to_page_cache(page, mapping, hindex,
- NULL);
- radix_tree_preload_end();
- }
- if (error) {
- mem_cgroup_cancel_charge(page, memcg,
- PageTransHuge(page));
- goto unacct;
- }
- mem_cgroup_commit_charge(page, memcg, false,
- PageTransHuge(page));
- lru_cache_add_anon(page);
-
- spin_lock_irq(&info->lock);
- info->alloced += 1 << compound_order(page);
- inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page);
- shmem_recalc_inode(inode);
- spin_unlock_irq(&info->lock);
- alloced = true;
-
- if (PageTransHuge(page) &&
- DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
- hindex + HPAGE_PMD_NR - 1) {
- /*
- * Part of the huge page is beyond i_size: subject
- * to shrink under memory pressure.
- */
- spin_lock(&sbinfo->shrinklist_lock);
- /*
- * _careful to defend against unlocked access to
- * ->shrink_list in shmem_unused_huge_shrink()
- */
- if (list_empty_careful(&info->shrinklist)) {
- list_add_tail(&info->shrinklist,
- &sbinfo->shrinklist);
- sbinfo->shrinklist_len++;
- }
- spin_unlock(&sbinfo->shrinklist_lock);
- }
-
+ error = PTR_ERR(page);
+ page = NULL;
+ if (error != -ENOSPC)
+ goto unlock;
/*
- * Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
+ * Try to reclaim some space by splitting a huge page
+ * beyond i_size on the filesystem.
*/
- if (sgp == SGP_FALLOC)
- sgp = SGP_WRITE;
+ while (retry--) {
+ int ret;
+
+ ret = shmem_unused_huge_shrink(sbinfo, NULL, 1);
+ if (ret == SHRINK_STOP)
+ break;
+ if (ret)
+ goto alloc_nohuge;
+ }
+ goto unlock;
+ }
+
+ if (PageTransHuge(page))
+ hindex = round_down(index, HPAGE_PMD_NR);
+ else
+ hindex = index;
+
+ if (sgp == SGP_WRITE)
+ __SetPageReferenced(page);
+
+ error = shmem_add_to_page_cache(page, mapping, hindex,
+ NULL, gfp & GFP_RECLAIM_MASK,
+ charge_mm);
+ if (error)
+ goto unacct;
+ lru_cache_add(page);
+
+ spin_lock_irq(&info->lock);
+ info->alloced += compound_nr(page);
+ inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page);
+ shmem_recalc_inode(inode);
+ spin_unlock_irq(&info->lock);
+ alloced = true;
+
+ if (PageTransHuge(page) &&
+ DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
+ hindex + HPAGE_PMD_NR - 1) {
+ /*
+ * Part of the huge page is beyond i_size: subject
+ * to shrink under memory pressure.
+ */
+ spin_lock(&sbinfo->shrinklist_lock);
+ /*
+ * _careful to defend against unlocked access to
+ * ->shrink_list in shmem_unused_huge_shrink()
+ */
+ if (list_empty_careful(&info->shrinklist)) {
+ list_add_tail(&info->shrinklist,
+ &sbinfo->shrinklist);
+ sbinfo->shrinklist_len++;
+ }
+ spin_unlock(&sbinfo->shrinklist_lock);
+ }
+
+ /*
+ * Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
+ */
+ if (sgp == SGP_FALLOC)
+ sgp = SGP_WRITE;
clear:
- /*
- * Let SGP_WRITE caller clear ends if write does not fill page;
- * but SGP_FALLOC on a page fallocated earlier must initialize
- * it now, lest undo on failure cancel our earlier guarantee.
- */
- if (sgp != SGP_WRITE && !PageUptodate(page)) {
- struct page *head = compound_head(page);
- int i;
+ /*
+ * Let SGP_WRITE caller clear ends if write does not fill page;
+ * but SGP_FALLOC on a page fallocated earlier must initialize
+ * it now, lest undo on failure cancel our earlier guarantee.
+ */
+ if (sgp != SGP_WRITE && !PageUptodate(page)) {
+ int i;
- for (i = 0; i < (1 << compound_order(head)); i++) {
- clear_highpage(head + i);
- flush_dcache_page(head + i);
- }
- SetPageUptodate(head);
+ for (i = 0; i < compound_nr(page); i++) {
+ clear_highpage(page + i);
+ flush_dcache_page(page + i);
}
+ SetPageUptodate(page);
}
/* Perhaps the file has been truncated since we checked */
@@ -1914,6 +2020,7 @@
error = -EINVAL;
goto unlock;
}
+out:
*pagep = page + index - hindex;
return 0;
@@ -1921,16 +2028,13 @@
* Error recovery.
*/
unacct:
- shmem_inode_unacct_blocks(inode, 1 << compound_order(page));
+ shmem_inode_unacct_blocks(inode, compound_nr(page));
if (PageTransHuge(page)) {
unlock_page(page);
put_page(page);
goto alloc_nohuge;
}
-failed:
- if (swap.val && !shmem_confirm_swap(mapping, index, swap))
- error = -EEXIST;
unlock:
if (page) {
unlock_page(page);
@@ -1942,7 +2046,7 @@
spin_unlock_irq(&info->lock);
goto repeat;
}
- if (error == -EEXIST) /* from above or from radix_tree_insert */
+ if (error == -EEXIST)
goto repeat;
return error;
}
@@ -1994,16 +2098,14 @@
shmem_falloc->waitq &&
vmf->pgoff >= shmem_falloc->start &&
vmf->pgoff < shmem_falloc->next) {
+ struct file *fpin;
wait_queue_head_t *shmem_falloc_waitq;
DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);
ret = VM_FAULT_NOPAGE;
- if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) &&
- !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
- /* It's polite to up mmap_sem if we can */
- up_read(&vma->vm_mm->mmap_sem);
+ fpin = maybe_unlock_mmap_for_io(vmf, NULL);
+ if (fpin)
ret = VM_FAULT_RETRY;
- }
shmem_falloc_waitq = shmem_falloc->waitq;
prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
@@ -2021,6 +2123,9 @@
spin_lock(&inode->i_lock);
finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
spin_unlock(&inode->i_lock);
+
+ if (fpin)
+ fput(fpin);
return ret;
}
spin_unlock(&inode->i_lock);
@@ -2059,7 +2164,7 @@
get_area = current->mm->get_unmapped_area;
addr = get_area(file, uaddr, len, pgoff, flags);
- if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
+ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
return addr;
if (IS_ERR_VALUE(addr))
return addr;
@@ -2179,26 +2284,18 @@
static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
{
struct shmem_inode_info *info = SHMEM_I(file_inode(file));
+ int ret;
- if (info->seals & F_SEAL_FUTURE_WRITE) {
- /*
- * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
- * "future write" seal active.
- */
- if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
- return -EPERM;
+ ret = seal_check_future_write(info->seals, vma);
+ if (ret)
+ return ret;
- /*
- * Since the F_SEAL_FUTURE_WRITE seals allow for a MAP_SHARED
- * read-only mapping, take care to not allow mprotect to revert
- * protections.
- */
- vma->vm_flags &= ~(VM_MAYWRITE);
- }
+ /* arm64 - allow memory tagging on RAM-based files */
+ vma->vm_flags |= VM_MTE_ALLOWED;
file_accessed(file);
vma->vm_ops = &shmem_vm_ops;
- if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
(vma->vm_end & HPAGE_PMD_MASK)) {
khugepaged_enter(vma, vma->vm_flags);
@@ -2212,13 +2309,14 @@
struct inode *inode;
struct shmem_inode_info *info;
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+ ino_t ino;
- if (shmem_reserve_inode(sb))
+ if (shmem_reserve_inode(sb, &ino))
return NULL;
inode = new_inode(sb);
if (inode) {
- inode->i_ino = get_next_ino();
+ inode->i_ino = ino;
inode_init_owner(inode, dir, mode);
inode->i_blocks = 0;
inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
@@ -2226,6 +2324,7 @@
info = SHMEM_I(inode);
memset(info, 0, (char *)inode - (char *)info);
spin_lock_init(&info->lock);
+ atomic_set(&info->stop_eviction, 0);
info->seals = F_SEAL_SEAL;
info->flags = flags & VM_NORESERVE;
INIT_LIST_HEAD(&info->shrinklist);
@@ -2272,28 +2371,25 @@
return mapping->a_ops == &shmem_aops;
}
-static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
- pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- unsigned long src_addr,
- bool zeropage,
- struct page **pagep)
+#ifdef CONFIG_USERFAULTFD
+int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
+ pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr,
+ unsigned long src_addr,
+ bool zeropage,
+ struct page **pagep)
{
struct inode *inode = file_inode(dst_vma->vm_file);
struct shmem_inode_info *info = SHMEM_I(inode);
struct address_space *mapping = inode->i_mapping;
gfp_t gfp = mapping_gfp_mask(mapping);
pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
- struct mem_cgroup *memcg;
- spinlock_t *ptl;
void *page_kaddr;
struct page *page;
- pte_t _dst_pte, *dst_pte;
int ret;
- pgoff_t offset, max_off;
+ pgoff_t max_off;
- ret = -ENOMEM;
if (!shmem_inode_acct_block(inode, 1)) {
/*
* We may have got a page, returned -ENOENT triggering a retry,
@@ -2304,29 +2400,30 @@
put_page(*pagep);
*pagep = NULL;
}
- goto out;
+ return -ENOMEM;
}
if (!*pagep) {
+ ret = -ENOMEM;
page = shmem_alloc_page(gfp, info, pgoff);
if (!page)
goto out_unacct_blocks;
- if (!zeropage) { /* mcopy_atomic */
+ if (!zeropage) { /* COPY */
page_kaddr = kmap_atomic(page);
ret = copy_from_user(page_kaddr,
(const void __user *)src_addr,
PAGE_SIZE);
kunmap_atomic(page_kaddr);
- /* fallback to copy_from_user outside mmap_sem */
+ /* fallback to copy_from_user outside mmap_lock */
if (unlikely(ret)) {
*pagep = page;
- shmem_inode_unacct_blocks(inode, 1);
+ ret = -ENOENT;
/* don't free the page */
- return -ENOENT;
+ goto out_unacct_blocks;
}
- } else { /* mfill_zeropage_atomic */
+ } else { /* ZEROPAGE */
clear_highpage(page);
}
} else {
@@ -2334,57 +2431,26 @@
*pagep = NULL;
}
- VM_BUG_ON(PageLocked(page) || PageSwapBacked(page));
+ VM_BUG_ON(PageLocked(page));
+ VM_BUG_ON(PageSwapBacked(page));
__SetPageLocked(page);
__SetPageSwapBacked(page);
__SetPageUptodate(page);
ret = -EFAULT;
- offset = linear_page_index(dst_vma, dst_addr);
max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
- if (unlikely(offset >= max_off))
+ if (unlikely(pgoff >= max_off))
goto out_release;
- ret = mem_cgroup_try_charge_delay(page, dst_mm, gfp, &memcg, false);
+ ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL,
+ gfp & GFP_RECLAIM_MASK, dst_mm);
if (ret)
goto out_release;
- ret = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK);
- if (!ret) {
- ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL);
- radix_tree_preload_end();
- }
+ ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
+ page, true, false);
if (ret)
- goto out_release_uncharge;
-
- mem_cgroup_commit_charge(page, memcg, false, false);
-
- _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
- if (dst_vma->vm_flags & VM_WRITE)
- _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));
- else {
- /*
- * We don't set the pte dirty if the vma has no
- * VM_WRITE permission, so mark the page dirty or it
- * could be freed from under us. We could do it
- * unconditionally before unlock_page(), but doing it
- * only if VM_WRITE is not set is faster.
- */
- set_page_dirty(page);
- }
-
- dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
-
- ret = -EFAULT;
- max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
- if (unlikely(offset >= max_off))
- goto out_release_uncharge_unlock;
-
- ret = -EEXIST;
- if (!pte_none(*dst_pte))
- goto out_release_uncharge_unlock;
-
- lru_cache_add_anon(page);
+ goto out_delete_from_cache;
spin_lock_irq(&info->lock);
info->alloced++;
@@ -2392,52 +2458,19 @@
shmem_recalc_inode(inode);
spin_unlock_irq(&info->lock);
- inc_mm_counter(dst_mm, mm_counter_file(page));
- page_add_file_rmap(page, false);
- set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
-
- /* No need to invalidate - it was non-present before */
- update_mmu_cache(dst_vma, dst_addr, dst_pte);
- pte_unmap_unlock(dst_pte, ptl);
+ SetPageDirty(page);
unlock_page(page);
- ret = 0;
-out:
- return ret;
-out_release_uncharge_unlock:
- pte_unmap_unlock(dst_pte, ptl);
- ClearPageDirty(page);
+ return 0;
+out_delete_from_cache:
delete_from_page_cache(page);
-out_release_uncharge:
- mem_cgroup_cancel_charge(page, memcg, false);
out_release:
unlock_page(page);
put_page(page);
out_unacct_blocks:
shmem_inode_unacct_blocks(inode, 1);
- goto out;
+ return ret;
}
-
-int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
- pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- unsigned long src_addr,
- struct page **pagep)
-{
- return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
- dst_addr, src_addr, false, pagep);
-}
-
-int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm,
- pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr)
-{
- struct page *page = NULL;
-
- return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
- dst_addr, 0, true, &page);
-}
+#endif /* CONFIG_USERFAULTFD */
#ifdef CONFIG_TMPFS
static const struct inode_operations shmem_symlink_inode_operations;
@@ -2617,7 +2650,7 @@
}
/*
- * llseek SEEK_DATA or SEEK_HOLE through the radix_tree.
+ * llseek SEEK_DATA or SEEK_HOLE through the page cache.
*/
static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
pgoff_t index, pgoff_t end, int whence)
@@ -2647,7 +2680,7 @@
index = indices[i];
}
page = pvec.pages[i];
- if (page && !radix_tree_exceptional_entry(page)) {
+ if (page && !xa_is_value(page)) {
if (!PageUptodate(page))
page = NULL;
}
@@ -2943,7 +2976,7 @@
* first link must skip that, to get the accounting right.
*/
if (inode->i_nlink) {
- ret = shmem_reserve_inode(inode->i_sb);
+ ret = shmem_reserve_inode(inode->i_sb, NULL);
if (ret)
goto out;
}
@@ -3095,12 +3128,9 @@
error = security_inode_init_security(inode, dir, &dentry->d_name,
shmem_initxattrs, NULL);
- if (error) {
- if (error != -EOPNOTSUPP) {
- iput(inode);
- return error;
- }
- error = 0;
+ if (error && error != -EOPNOTSUPP) {
+ iput(inode);
+ return error;
}
inode->i_size = len-1;
@@ -3192,7 +3222,7 @@
new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len,
GFP_KERNEL);
if (!new_xattr->name) {
- kfree(new_xattr);
+ kvfree(new_xattr);
return -ENOMEM;
}
@@ -3209,7 +3239,8 @@
static int shmem_xattr_handler_get(const struct xattr_handler *handler,
struct dentry *unused, struct inode *inode,
- const char *name, void *buffer, size_t size)
+ const char *name, void *buffer, size_t size,
+ int flags)
{
struct shmem_inode_info *info = SHMEM_I(inode);
@@ -3225,7 +3256,7 @@
struct shmem_inode_info *info = SHMEM_I(inode);
name = xattr_full_name(handler, name);
- return simple_xattr_set(&info->xattrs, name, value, size, flags);
+ return simple_xattr_set(&info->xattrs, name, value, size, flags, NULL);
}
static const struct xattr_handler shmem_security_xattr_handler = {
@@ -3352,16 +3383,142 @@
.fh_to_dentry = shmem_fh_to_dentry,
};
-static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
- bool remount)
+enum shmem_param {
+ Opt_gid,
+ Opt_huge,
+ Opt_mode,
+ Opt_mpol,
+ Opt_nr_blocks,
+ Opt_nr_inodes,
+ Opt_size,
+ Opt_uid,
+ Opt_inode32,
+ Opt_inode64,
+};
+
+static const struct constant_table shmem_param_enums_huge[] = {
+ {"never", SHMEM_HUGE_NEVER },
+ {"always", SHMEM_HUGE_ALWAYS },
+ {"within_size", SHMEM_HUGE_WITHIN_SIZE },
+ {"advise", SHMEM_HUGE_ADVISE },
+ {}
+};
+
+const struct fs_parameter_spec shmem_fs_parameters[] = {
+ fsparam_u32 ("gid", Opt_gid),
+ fsparam_enum ("huge", Opt_huge, shmem_param_enums_huge),
+ fsparam_u32oct("mode", Opt_mode),
+ fsparam_string("mpol", Opt_mpol),
+ fsparam_string("nr_blocks", Opt_nr_blocks),
+ fsparam_string("nr_inodes", Opt_nr_inodes),
+ fsparam_string("size", Opt_size),
+ fsparam_u32 ("uid", Opt_uid),
+ fsparam_flag ("inode32", Opt_inode32),
+ fsparam_flag ("inode64", Opt_inode64),
+ {}
+};
+
+static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
{
- char *this_char, *value, *rest;
- struct mempolicy *mpol = NULL;
- uid_t uid;
- gid_t gid;
+ struct shmem_options *ctx = fc->fs_private;
+ struct fs_parse_result result;
+ unsigned long long size;
+ char *rest;
+ int opt;
+
+ opt = fs_parse(fc, shmem_fs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_size:
+ size = memparse(param->string, &rest);
+ if (*rest == '%') {
+ size <<= PAGE_SHIFT;
+ size *= totalram_pages();
+ do_div(size, 100);
+ rest++;
+ }
+ if (*rest)
+ goto bad_value;
+ ctx->blocks = DIV_ROUND_UP(size, PAGE_SIZE);
+ ctx->seen |= SHMEM_SEEN_BLOCKS;
+ break;
+ case Opt_nr_blocks:
+ ctx->blocks = memparse(param->string, &rest);
+ if (*rest)
+ goto bad_value;
+ ctx->seen |= SHMEM_SEEN_BLOCKS;
+ break;
+ case Opt_nr_inodes:
+ ctx->inodes = memparse(param->string, &rest);
+ if (*rest)
+ goto bad_value;
+ ctx->seen |= SHMEM_SEEN_INODES;
+ break;
+ case Opt_mode:
+ ctx->mode = result.uint_32 & 07777;
+ break;
+ case Opt_uid:
+ ctx->uid = make_kuid(current_user_ns(), result.uint_32);
+ if (!uid_valid(ctx->uid))
+ goto bad_value;
+ break;
+ case Opt_gid:
+ ctx->gid = make_kgid(current_user_ns(), result.uint_32);
+ if (!gid_valid(ctx->gid))
+ goto bad_value;
+ break;
+ case Opt_huge:
+ ctx->huge = result.uint_32;
+ if (ctx->huge != SHMEM_HUGE_NEVER &&
+ !(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
+ has_transparent_hugepage()))
+ goto unsupported_parameter;
+ ctx->seen |= SHMEM_SEEN_HUGE;
+ break;
+ case Opt_mpol:
+ if (IS_ENABLED(CONFIG_NUMA)) {
+ mpol_put(ctx->mpol);
+ ctx->mpol = NULL;
+ if (mpol_parse_str(param->string, &ctx->mpol))
+ goto bad_value;
+ break;
+ }
+ goto unsupported_parameter;
+ case Opt_inode32:
+ ctx->full_inums = false;
+ ctx->seen |= SHMEM_SEEN_INUMS;
+ break;
+ case Opt_inode64:
+ if (sizeof(ino_t) < 8) {
+ return invalfc(fc,
+ "Cannot use inode64 with <64bit inums in kernel\n");
+ }
+ ctx->full_inums = true;
+ ctx->seen |= SHMEM_SEEN_INUMS;
+ break;
+ }
+ return 0;
+
+unsupported_parameter:
+ return invalfc(fc, "Unsupported parameter '%s'", param->key);
+bad_value:
+ return invalfc(fc, "Bad value for '%s'", param->key);
+}
+
+static int shmem_parse_options(struct fs_context *fc, void *data)
+{
+ char *options = data;
+
+ if (options) {
+ int err = security_sb_eat_lsm_opts(options, &fc->security);
+ if (err)
+ return err;
+ }
while (options != NULL) {
- this_char = options;
+ char *this_char = options;
for (;;) {
/*
* NUL-terminate this option: unfortunately,
@@ -3377,139 +3534,93 @@
break;
}
}
- if (!*this_char)
- continue;
- if ((value = strchr(this_char,'=')) != NULL) {
- *value++ = 0;
- } else {
- pr_err("tmpfs: No value for mount option '%s'\n",
- this_char);
- goto error;
- }
+ if (*this_char) {
+ char *value = strchr(this_char,'=');
+ size_t len = 0;
+ int err;
- if (!strcmp(this_char,"size")) {
- unsigned long long size;
- size = memparse(value,&rest);
- if (*rest == '%') {
- size <<= PAGE_SHIFT;
- size *= totalram_pages;
- do_div(size, 100);
- rest++;
+ if (value) {
+ *value++ = '\0';
+ len = strlen(value);
}
- if (*rest)
- goto bad_val;
- sbinfo->max_blocks =
- DIV_ROUND_UP(size, PAGE_SIZE);
- } else if (!strcmp(this_char,"nr_blocks")) {
- sbinfo->max_blocks = memparse(value, &rest);
- if (*rest)
- goto bad_val;
- } else if (!strcmp(this_char,"nr_inodes")) {
- sbinfo->max_inodes = memparse(value, &rest);
- if (*rest)
- goto bad_val;
- } else if (!strcmp(this_char,"mode")) {
- if (remount)
- continue;
- sbinfo->mode = simple_strtoul(value, &rest, 8) & 07777;
- if (*rest)
- goto bad_val;
- } else if (!strcmp(this_char,"uid")) {
- if (remount)
- continue;
- uid = simple_strtoul(value, &rest, 0);
- if (*rest)
- goto bad_val;
- sbinfo->uid = make_kuid(current_user_ns(), uid);
- if (!uid_valid(sbinfo->uid))
- goto bad_val;
- } else if (!strcmp(this_char,"gid")) {
- if (remount)
- continue;
- gid = simple_strtoul(value, &rest, 0);
- if (*rest)
- goto bad_val;
- sbinfo->gid = make_kgid(current_user_ns(), gid);
- if (!gid_valid(sbinfo->gid))
- goto bad_val;
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
- } else if (!strcmp(this_char, "huge")) {
- int huge;
- huge = shmem_parse_huge(value);
- if (huge < 0)
- goto bad_val;
- if (!has_transparent_hugepage() &&
- huge != SHMEM_HUGE_NEVER)
- goto bad_val;
- sbinfo->huge = huge;
-#endif
-#ifdef CONFIG_NUMA
- } else if (!strcmp(this_char,"mpol")) {
- mpol_put(mpol);
- mpol = NULL;
- if (mpol_parse_str(value, &mpol))
- goto bad_val;
-#endif
- } else {
- pr_err("tmpfs: Bad mount option %s\n", this_char);
- goto error;
+ err = vfs_parse_fs_string(fc, this_char, value, len);
+ if (err < 0)
+ return err;
}
}
- sbinfo->mpol = mpol;
return 0;
-
-bad_val:
- pr_err("tmpfs: Bad value '%s' for mount option '%s'\n",
- value, this_char);
-error:
- mpol_put(mpol);
- return 1;
-
}
-static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
+/*
+ * Reconfigure a shmem filesystem.
+ *
+ * Note that we disallow change from limited->unlimited blocks/inodes while any
+ * are in use; but we must separately disallow unlimited->limited, because in
+ * that case we have no record of how much is already in use.
+ */
+static int shmem_reconfigure(struct fs_context *fc)
{
- struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
- struct shmem_sb_info config = *sbinfo;
+ struct shmem_options *ctx = fc->fs_private;
+ struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb);
unsigned long inodes;
- int error = -EINVAL;
+ struct mempolicy *mpol = NULL;
+ const char *err;
- config.mpol = NULL;
- if (shmem_parse_options(data, &config, true))
- return error;
-
- spin_lock(&sbinfo->stat_lock);
+ raw_spin_lock(&sbinfo->stat_lock);
inodes = sbinfo->max_inodes - sbinfo->free_inodes;
- if (percpu_counter_compare(&sbinfo->used_blocks, config.max_blocks) > 0)
- goto out;
- if (config.max_inodes < inodes)
- goto out;
- /*
- * Those tests disallow limited->unlimited while any are in use;
- * but we must separately disallow unlimited->limited, because
- * in that case we have no record of how much is already in use.
- */
- if (config.max_blocks && !sbinfo->max_blocks)
- goto out;
- if (config.max_inodes && !sbinfo->max_inodes)
- goto out;
+ if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) {
+ if (!sbinfo->max_blocks) {
+ err = "Cannot retroactively limit size";
+ goto out;
+ }
+ if (percpu_counter_compare(&sbinfo->used_blocks,
+ ctx->blocks) > 0) {
+ err = "Too small a size for current use";
+ goto out;
+ }
+ }
+ if ((ctx->seen & SHMEM_SEEN_INODES) && ctx->inodes) {
+ if (!sbinfo->max_inodes) {
+ err = "Cannot retroactively limit inodes";
+ goto out;
+ }
+ if (ctx->inodes < inodes) {
+ err = "Too few inodes for current use";
+ goto out;
+ }
+ }
- error = 0;
- sbinfo->huge = config.huge;
- sbinfo->max_blocks = config.max_blocks;
- sbinfo->max_inodes = config.max_inodes;
- sbinfo->free_inodes = config.max_inodes - inodes;
+ if ((ctx->seen & SHMEM_SEEN_INUMS) && !ctx->full_inums &&
+ sbinfo->next_ino > UINT_MAX) {
+ err = "Current inum too high to switch to 32-bit inums";
+ goto out;
+ }
+
+ if (ctx->seen & SHMEM_SEEN_HUGE)
+ sbinfo->huge = ctx->huge;
+ if (ctx->seen & SHMEM_SEEN_INUMS)
+ sbinfo->full_inums = ctx->full_inums;
+ if (ctx->seen & SHMEM_SEEN_BLOCKS)
+ sbinfo->max_blocks = ctx->blocks;
+ if (ctx->seen & SHMEM_SEEN_INODES) {
+ sbinfo->max_inodes = ctx->inodes;
+ sbinfo->free_inodes = ctx->inodes - inodes;
+ }
/*
* Preserve previous mempolicy unless mpol remount option was specified.
*/
- if (config.mpol) {
- mpol_put(sbinfo->mpol);
- sbinfo->mpol = config.mpol; /* transfers initial ref */
+ if (ctx->mpol) {
+ mpol = sbinfo->mpol;
+ sbinfo->mpol = ctx->mpol; /* transfers initial ref */
+ ctx->mpol = NULL;
}
+ raw_spin_unlock(&sbinfo->stat_lock);
+ mpol_put(mpol);
+ return 0;
out:
- spin_unlock(&sbinfo->stat_lock);
- return error;
+ raw_spin_unlock(&sbinfo->stat_lock);
+ return invalfc(fc, "%s", err);
}
static int shmem_show_options(struct seq_file *seq, struct dentry *root)
@@ -3529,7 +3640,30 @@
if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
seq_printf(seq, ",gid=%u",
from_kgid_munged(&init_user_ns, sbinfo->gid));
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
+
+ /*
+ * Showing inode{64,32} might be useful even if it's the system default,
+ * since then people don't have to resort to checking both here and
+ * /proc/config.gz to confirm 64-bit inums were successfully applied
+ * (which may not even exist if IKCONFIG_PROC isn't enabled).
+ *
+ * We hide it when inode64 isn't the default and we are using 32-bit
+ * inodes, since that probably just means the feature isn't even under
+ * consideration.
+ *
+ * As such:
+ *
+ * +-----------------+-----------------+
+ * | TMPFS_INODE64=y | TMPFS_INODE64=n |
+ * +------------------+-----------------+-----------------+
+ * | full_inums=true | show | show |
+ * | full_inums=false | show | hide |
+ * +------------------+-----------------+-----------------+
+ *
+ */
+ if (IS_ENABLED(CONFIG_TMPFS_INODE64) || sbinfo->full_inums)
+ seq_printf(seq, ",inode%d", (sbinfo->full_inums ? 64 : 32));
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* Rightly or wrongly, show huge mount option unmasked by shmem_huge */
if (sbinfo->huge)
seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge));
@@ -3544,14 +3678,16 @@
{
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+ free_percpu(sbinfo->ino_batch);
percpu_counter_destroy(&sbinfo->used_blocks);
mpol_put(sbinfo->mpol);
kfree(sbinfo);
sb->s_fs_info = NULL;
}
-int shmem_fill_super(struct super_block *sb, void *data, int silent)
+static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
{
+ struct shmem_options *ctx = fc->fs_private;
struct inode *inode;
struct shmem_sb_info *sbinfo;
int err = -ENOMEM;
@@ -3562,9 +3698,6 @@
if (!sbinfo)
return -ENOMEM;
- sbinfo->mode = 0777 | S_ISVTX;
- sbinfo->uid = current_fsuid();
- sbinfo->gid = current_fsgid();
sb->s_fs_info = sbinfo;
#ifdef CONFIG_TMPFS
@@ -3574,12 +3707,12 @@
* but the internal instance is left unlimited.
*/
if (!(sb->s_flags & SB_KERNMOUNT)) {
- sbinfo->max_blocks = shmem_default_max_blocks();
- sbinfo->max_inodes = shmem_default_max_inodes();
- if (shmem_parse_options(data, sbinfo, false)) {
- err = -EINVAL;
- goto failed;
- }
+ if (!(ctx->seen & SHMEM_SEEN_BLOCKS))
+ ctx->blocks = shmem_default_max_blocks();
+ if (!(ctx->seen & SHMEM_SEEN_INODES))
+ ctx->inodes = shmem_default_max_inodes();
+ if (!(ctx->seen & SHMEM_SEEN_INUMS))
+ ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64);
} else {
sb->s_flags |= SB_NOUSER;
}
@@ -3588,11 +3721,24 @@
#else
sb->s_flags |= SB_NOUSER;
#endif
+ sbinfo->max_blocks = ctx->blocks;
+ sbinfo->free_inodes = sbinfo->max_inodes = ctx->inodes;
+ if (sb->s_flags & SB_KERNMOUNT) {
+ sbinfo->ino_batch = alloc_percpu(ino_t);
+ if (!sbinfo->ino_batch)
+ goto failed;
+ }
+ sbinfo->uid = ctx->uid;
+ sbinfo->gid = ctx->gid;
+ sbinfo->full_inums = ctx->full_inums;
+ sbinfo->mode = ctx->mode;
+ sbinfo->huge = ctx->huge;
+ sbinfo->mpol = ctx->mpol;
+ ctx->mpol = NULL;
- spin_lock_init(&sbinfo->stat_lock);
+ raw_spin_lock_init(&sbinfo->stat_lock);
if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL))
goto failed;
- sbinfo->free_inodes = sbinfo->max_inodes;
spin_lock_init(&sbinfo->shrinklist_lock);
INIT_LIST_HEAD(&sbinfo->shrinklist);
@@ -3625,6 +3771,31 @@
return err;
}
+static int shmem_get_tree(struct fs_context *fc)
+{
+ return get_tree_nodev(fc, shmem_fill_super);
+}
+
+static void shmem_free_fc(struct fs_context *fc)
+{
+ struct shmem_options *ctx = fc->fs_private;
+
+ if (ctx) {
+ mpol_put(ctx->mpol);
+ kfree(ctx);
+ }
+}
+
+static const struct fs_context_operations shmem_fs_context_ops = {
+ .free = shmem_free_fc,
+ .get_tree = shmem_get_tree,
+#ifdef CONFIG_TMPFS
+ .parse_monolithic = shmem_parse_options,
+ .parse_param = shmem_parse_one,
+ .reconfigure = shmem_reconfigure,
+#endif
+};
+
static struct kmem_cache *shmem_inode_cachep;
static struct inode *shmem_alloc_inode(struct super_block *sb)
@@ -3636,9 +3807,8 @@
return &info->vfs_inode;
}
-static void shmem_destroy_callback(struct rcu_head *head)
+static void shmem_free_in_core_inode(struct inode *inode)
{
- struct inode *inode = container_of(head, struct inode, i_rcu);
if (S_ISLNK(inode->i_mode))
kfree(inode->i_link);
kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
@@ -3648,7 +3818,6 @@
{
if (S_ISREG(inode->i_mode))
mpol_free_shared_policy(&SHMEM_I(inode)->policy);
- call_rcu(&inode->i_rcu, shmem_destroy_callback);
}
static void shmem_init_inode(void *foo)
@@ -3739,16 +3908,16 @@
static const struct super_operations shmem_ops = {
.alloc_inode = shmem_alloc_inode,
+ .free_inode = shmem_free_in_core_inode,
.destroy_inode = shmem_destroy_inode,
#ifdef CONFIG_TMPFS
.statfs = shmem_statfs,
- .remount_fs = shmem_remount_fs,
.show_options = shmem_show_options,
#endif
.evict_inode = shmem_evict_inode,
.drop_inode = generic_delete_inode,
.put_super = shmem_put_super,
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
.nr_cached_objects = shmem_unused_huge_count,
.free_cached_objects = shmem_unused_huge_scan,
#endif
@@ -3761,29 +3930,42 @@
.set_policy = shmem_set_policy,
.get_policy = shmem_get_policy,
#endif
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+ .allow_speculation = filemap_allow_speculation,
+#endif
};
-static struct dentry *shmem_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+int shmem_init_fs_context(struct fs_context *fc)
{
- return mount_nodev(fs_type, flags, data, shmem_fill_super);
+ struct shmem_options *ctx;
+
+ ctx = kzalloc(sizeof(struct shmem_options), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->mode = 0777 | S_ISVTX;
+ ctx->uid = current_fsuid();
+ ctx->gid = current_fsgid();
+
+ fc->fs_private = ctx;
+ fc->ops = &shmem_fs_context_ops;
+ return 0;
}
static struct file_system_type shmem_fs_type = {
.owner = THIS_MODULE,
.name = "tmpfs",
- .mount = shmem_mount,
+ .init_fs_context = shmem_init_fs_context,
+#ifdef CONFIG_TMPFS
+ .parameters = shmem_fs_parameters,
+#endif
.kill_sb = kill_litter_super,
- .fs_flags = FS_USERNS_MOUNT,
+ .fs_flags = FS_USERNS_MOUNT | FS_THP_SUPPORT,
};
int __init shmem_init(void)
{
int error;
-
- /* If rootfs called this, don't re-init */
- if (shmem_inode_cachep)
- return 0;
shmem_init_inodecache();
@@ -3800,7 +3982,7 @@
goto out1;
}
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY)
SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
else
@@ -3816,11 +3998,11 @@
return error;
}
-#if defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && defined(CONFIG_SYSFS)
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS)
static ssize_t shmem_enabled_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- int values[] = {
+ static const int values[] = {
SHMEM_HUGE_ALWAYS,
SHMEM_HUGE_WITHIN_SIZE,
SHMEM_HUGE_ADVISE,
@@ -3868,9 +4050,9 @@
struct kobj_attribute shmem_enabled_attr =
__ATTR(shmem_enabled, 0644, shmem_enabled_show, shmem_enabled_store);
-#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE && CONFIG_SYSFS */
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
bool shmem_huge_enabled(struct vm_area_struct *vma)
{
struct inode *inode = file_inode(vma->vm_file);
@@ -3878,6 +4060,8 @@
loff_t i_size;
pgoff_t off;
+ if (!transhuge_vma_enabled(vma, vma->vm_flags))
+ return false;
if (shmem_huge == SHMEM_HUGE_FORCE)
return true;
if (shmem_huge == SHMEM_HUGE_DENY)
@@ -3893,7 +4077,7 @@
if (i_size >= HPAGE_PMD_SIZE &&
i_size >> PAGE_SHIFT >= off)
return true;
- /* fall through */
+ fallthrough;
case SHMEM_HUGE_ADVISE:
/* TODO: implement fadvise() hints */
return (vma->vm_flags & VM_HUGEPAGE);
@@ -3902,7 +4086,7 @@
return false;
}
}
-#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#else /* !CONFIG_SHMEM */
@@ -3917,7 +4101,8 @@
static struct file_system_type shmem_fs_type = {
.name = "tmpfs",
- .mount = ramfs_mount,
+ .init_fs_context = ramfs_init_fs_context,
+ .parameters = ramfs_fs_parameters,
.kill_sb = kill_litter_super,
.fs_flags = FS_USERNS_MOUNT,
};
@@ -3932,7 +4117,8 @@
return 0;
}
-int shmem_unuse(swp_entry_t swap, struct page *page)
+int shmem_unuse(unsigned int type, bool frontswap,
+ unsigned long *fs_pages_to_unuse)
{
return 0;
}
@@ -4047,7 +4233,7 @@
/**
* shmem_zero_setup - setup a shared anonymous mapping
- * @vma: the vma to be mmapped is prepared by do_mmap_pgoff
+ * @vma: the vma to be mmapped is prepared by do_mmap
*/
int shmem_zero_setup(struct vm_area_struct *vma)
{
@@ -4055,7 +4241,7 @@
loff_t size = vma->vm_end - vma->vm_start;
/*
- * Cloning a new file under mmap_sem leads to a lock ordering conflict
+ * Cloning a new file under mmap_lock leads to a lock ordering conflict
* between XFS directory reading and selinux: since this file is only
* accessible to the user through its mapping, use S_PRIVATE flag to
* bypass file security, in the same way as shmem_kernel_file_setup().
@@ -4069,7 +4255,7 @@
vma->vm_file = file;
vma->vm_ops = &shmem_vm_ops;
- if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
(vma->vm_end & HPAGE_PMD_MASK)) {
khugepaged_enter(vma, vma->vm_flags);
@@ -4117,3 +4303,47 @@
#endif
}
EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
+
+void shmem_mark_page_lazyfree(struct page *page, bool tail)
+{
+ mark_page_lazyfree_movetail(page, tail);
+}
+EXPORT_SYMBOL_GPL(shmem_mark_page_lazyfree);
+
+int reclaim_shmem_address_space(struct address_space *mapping)
+{
+#ifdef CONFIG_SHMEM
+ pgoff_t start = 0;
+ struct page *page;
+ LIST_HEAD(page_list);
+ XA_STATE(xas, &mapping->i_pages, start);
+
+ if (!shmem_mapping(mapping))
+ return -EINVAL;
+
+ lru_add_drain();
+
+ rcu_read_lock();
+ xas_for_each(&xas, page, ULONG_MAX) {
+ if (xas_retry(&xas, page))
+ continue;
+ if (xa_is_value(page))
+ continue;
+ if (isolate_lru_page(page))
+ continue;
+
+ list_add(&page->lru, &page_list);
+
+ if (need_resched()) {
+ xas_pause(&xas);
+ cond_resched_rcu();
+ }
+ }
+ rcu_read_unlock();
+
+ return reclaim_pages(&page_list);
+#else
+ return 0;
+#endif
+}
+EXPORT_SYMBOL_GPL(reclaim_shmem_address_space);
--
Gitblit v1.6.2