From 151fecfb72a0d602dfe79790602ef64b4e241574 Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Mon, 19 Feb 2024 01:51:07 +0000
Subject: [PATCH] export RK_PA3
---
kernel/fs/ext4/mballoc.c | 1728 +++++++++++++++++++++++++++++++++++++++++++----------------
1 file changed, 1248 insertions(+), 480 deletions(-)
diff --git a/kernel/fs/ext4/mballoc.c b/kernel/fs/ext4/mballoc.c
index db82436..b35d59d 100644
--- a/kernel/fs/ext4/mballoc.c
+++ b/kernel/fs/ext4/mballoc.c
@@ -16,14 +16,8 @@
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/backing-dev.h>
+#include <linux/freezer.h>
#include <trace/events/ext4.h>
-
-#ifdef CONFIG_EXT4_DEBUG
-ushort ext4_mballoc_debug __read_mostly;
-
-module_param_named(mballoc_debug, ext4_mballoc_debug, ushort, 0644);
-MODULE_PARM_DESC(mballoc_debug, "Debugging level for ext4's mballoc");
-#endif
/*
* MUSTDO:
@@ -131,7 +125,7 @@
* /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in
* terms of number of blocks. If we have mounted the file system with -O
* stripe=<value> option the group prealloc request is normalized to the
- * the smallest multiple of the stripe value (sbi->s_stripe) which is
+ * smallest multiple of the stripe value (sbi->s_stripe) which is
* greater than the default mb_group_prealloc.
*
* The regular allocator (using the buddy cache) supports a few tunables.
@@ -356,6 +350,36 @@
ext4_group_t group);
static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
ext4_group_t group);
+static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac);
+
+/*
+ * The algorithm using this percpu seq counter is as follows:
+ * 1. We sample the percpu discard_pa_seq counter before trying for block
+ * allocation in ext4_mb_new_blocks().
+ * 2. We increment this percpu discard_pa_seq counter when we either allocate
+ * or free these blocks i.e. while marking those blocks as used/free in
+ * mb_mark_used()/mb_free_blocks().
+ * 3. We also increment this percpu seq counter when we successfully identify
+ * that the bb_prealloc_list is not empty and hence proceed for discarding
+ * of those PAs inside ext4_mb_discard_group_preallocations().
+ *
+ * Now to make sure that the regular fast path of block allocation is not
+ * affected, as a small optimization we only sample the percpu seq counter
+ * on that cpu. Only when the block allocation fails and no freed blocks
+ * were found do we sample the percpu seq counter for all cpus, using the
+ * function ext4_get_discard_pa_seq_sum() below. This happens after making
+ * sure that all the PAs on grp->bb_prealloc_list got freed or the list
+ * was already empty.
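+ *
+ * For example: if a free on another cpu bumps that cpu's discard_pa_seq
+ * after we took our sample, the sum returned by
+ * ext4_get_discard_pa_seq_sum() will differ from the sampled value,
+ * which tells the retry logic that newly freed blocks may now be
+ * available and a retry of the allocation is worthwhile.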
+ */
+static DEFINE_PER_CPU(u64, discard_pa_seq);
+static inline u64 ext4_get_discard_pa_seq_sum(void)
+{
+ int __cpu;
+ u64 __seq = 0;
+
+ for_each_possible_cpu(__cpu)
+ __seq += per_cpu(discard_pa_seq, __cpu);
+ return __seq;
+}
static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
{
@@ -493,6 +517,8 @@
static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
{
+ if (unlikely(e4b->bd_info->bb_bitmap == NULL))
+ return;
if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) {
unsigned char *b1, *b2;
int i;
@@ -511,6 +537,31 @@
}
}
+static void mb_group_bb_bitmap_alloc(struct super_block *sb,
+ struct ext4_group_info *grp, ext4_group_t group)
+{
+ struct buffer_head *bh;
+
+ grp->bb_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS);
+ if (!grp->bb_bitmap)
+ return;
+
+ bh = ext4_read_block_bitmap(sb, group);
+ if (IS_ERR_OR_NULL(bh)) {
+ kfree(grp->bb_bitmap);
+ grp->bb_bitmap = NULL;
+ return;
+ }
+
+ memcpy(grp->bb_bitmap, bh->b_data, sb->s_blocksize);
+ put_bh(bh);
+}
+
+static void mb_group_bb_bitmap_free(struct ext4_group_info *grp)
+{
+ kfree(grp->bb_bitmap);
+}
+
#else
static inline void mb_free_blocks_double(struct inode *inode,
struct ext4_buddy *e4b, int first, int count)
@@ -523,6 +574,17 @@
return;
}
static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
+{
+ return;
+}
+
+static inline void mb_group_bb_bitmap_alloc(struct super_block *sb,
+ struct ext4_group_info *grp, ext4_group_t group)
+{
+ return;
+}
+
+static inline void mb_group_bb_bitmap_free(struct ext4_group_info *grp)
{
return;
}
@@ -558,11 +620,8 @@
void *buddy;
void *buddy2;
- {
- static int mb_check_counter;
- if (mb_check_counter++ % 100 != 0)
- return 0;
- }
+ if (e4b->bd_info->bb_check_counter++ % 10)
+ return 0;
while (order > 1) {
buddy = mb_find_buddy(e4b, order, &max);
@@ -626,6 +685,8 @@
MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);
grp = ext4_get_group_info(sb, e4b->bd_group);
+ if (!grp)
+ return NULL;
list_for_each(cur, &grp->bb_prealloc_list) {
ext4_group_t groupnr;
struct ext4_prealloc_space *pa;
@@ -709,9 +770,9 @@
static noinline_for_stack
void ext4_mb_generate_buddy(struct super_block *sb,
- void *buddy, void *bitmap, ext4_group_t group)
+ void *buddy, void *bitmap, ext4_group_t group,
+ struct ext4_group_info *grp)
{
- struct ext4_group_info *grp = ext4_get_group_info(sb, group);
struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
ext4_grpblk_t i = 0;
@@ -758,28 +819,8 @@
clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
period = get_cycles() - period;
- spin_lock(&sbi->s_bal_lock);
- sbi->s_mb_buddies_generated++;
- sbi->s_mb_generation_time += period;
- spin_unlock(&sbi->s_bal_lock);
-}
-
-static void mb_regenerate_buddy(struct ext4_buddy *e4b)
-{
- int count;
- int order = 1;
- void *buddy;
-
- while ((buddy = mb_find_buddy(e4b, order++, &count))) {
- ext4_set_bits(buddy, 0, count);
- }
- e4b->bd_info->bb_fragments = 0;
- memset(e4b->bd_info->bb_counters, 0,
- sizeof(*e4b->bd_info->bb_counters) *
- (e4b->bd_sb->s_blocksize_bits + 2));
-
- ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy,
- e4b->bd_bitmap, e4b->bd_group);
+ atomic_inc(&sbi->s_mb_buddies_generated);
+ atomic64_add(period, &sbi->s_mb_generation_time);
}
/* The buddy information is attached the buddy cache inode
@@ -820,13 +861,13 @@
char *bitmap;
struct ext4_group_info *grinfo;
- mb_debug(1, "init page %lu\n", page->index);
-
inode = page->mapping->host;
sb = inode->i_sb;
ngroups = ext4_get_groups_count(sb);
blocksize = i_blocksize(inode);
blocks_per_page = PAGE_SIZE / blocksize;
+
+ mb_debug(sb, "init page %lu\n", page->index);
groups_per_page = blocks_per_page >> 1;
if (groups_per_page == 0)
@@ -851,6 +892,8 @@
break;
grinfo = ext4_get_group_info(sb, group);
+ if (!grinfo)
+ continue;
/*
* If page is uptodate then we came here after online resize
* which added some new uninitialized group info structs, so
@@ -861,13 +904,13 @@
bh[i] = NULL;
continue;
}
- bh[i] = ext4_read_block_bitmap_nowait(sb, group);
+ bh[i] = ext4_read_block_bitmap_nowait(sb, group, false);
if (IS_ERR(bh[i])) {
err = PTR_ERR(bh[i]);
bh[i] = NULL;
goto out;
}
- mb_debug(1, "read bitmap for group %u\n", group);
+ mb_debug(sb, "read bitmap for group %u\n", group);
}
/* wait for I/O completion */
@@ -912,10 +955,14 @@
if ((first_block + i) & 1) {
/* this is block of buddy */
BUG_ON(incore == NULL);
- mb_debug(1, "put buddy for group %u in page %lu/%x\n",
+ mb_debug(sb, "put buddy for group %u in page %lu/%x\n",
group, page->index, i * blocksize);
trace_ext4_mb_buddy_bitmap_load(sb, group);
grinfo = ext4_get_group_info(sb, group);
+ if (!grinfo) {
+ err = -EFSCORRUPTED;
+ goto out;
+ }
grinfo->bb_fragments = 0;
memset(grinfo->bb_counters, 0,
sizeof(*grinfo->bb_counters) *
@@ -926,13 +973,13 @@
ext4_lock_group(sb, group);
/* init the buddy */
memset(data, 0xff, blocksize);
- ext4_mb_generate_buddy(sb, data, incore, group);
+ ext4_mb_generate_buddy(sb, data, incore, group, grinfo);
ext4_unlock_group(sb, group);
incore = NULL;
} else {
/* this is block of bitmap */
BUG_ON(incore != NULL);
- mb_debug(1, "put bitmap for group %u in page %lu/%x\n",
+ mb_debug(sb, "put bitmap for group %u in page %lu/%x\n",
group, page->index, i * blocksize);
trace_ext4_mb_bitmap_load(sb, group);
@@ -1038,8 +1085,11 @@
int ret = 0;
might_sleep();
- mb_debug(1, "init group %u\n", group);
+ mb_debug(sb, "init group %u\n", group);
this_grp = ext4_get_group_info(sb, group);
+ if (!this_grp)
+ return -EFSCORRUPTED;
+
/*
* This ensures that we don't reinit the buddy cache
* page which map to the group from which we are already
@@ -1110,10 +1160,12 @@
struct inode *inode = sbi->s_buddy_cache;
might_sleep();
- mb_debug(1, "load group %u\n", group);
+ mb_debug(sb, "load group %u\n", group);
blocks_per_page = PAGE_SIZE / sb->s_blocksize;
grp = ext4_get_group_info(sb, group);
+ if (!grp)
+ return -EFSCORRUPTED;
e4b->bd_blkbits = sb->s_blocksize_bits;
e4b->bd_info = grp;
@@ -1217,9 +1269,6 @@
/* Pages marked accessed already */
e4b->bd_buddy_page = page;
e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
-
- BUG_ON(e4b->bd_bitmap_page == NULL);
- BUG_ON(e4b->bd_buddy_page == NULL);
return 0;
@@ -1336,9 +1385,6 @@
}
}
-/*
- * _________________________________________________________________ */
-
static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side)
{
if (mb_test_bit(*bit + side, bitmap)) {
@@ -1430,6 +1476,7 @@
mb_check_buddy(e4b);
mb_free_blocks_double(inode, e4b, first, count);
+ this_cpu_inc(discard_pa_seq);
e4b->bd_info->bb_free += count;
if (first < e4b->bd_info->bb_first_free)
e4b->bd_info->bb_first_free = first;
@@ -1449,15 +1496,16 @@
blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
blocknr += EXT4_C2B(sbi, block);
- ext4_grp_locked_error(sb, e4b->bd_group,
- inode ? inode->i_ino : 0,
- blocknr,
- "freeing already freed block "
- "(bit %u); block bitmap corrupt.",
- block);
- ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
+ if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
+ ext4_grp_locked_error(sb, e4b->bd_group,
+ inode ? inode->i_ino : 0,
+ blocknr,
+ "freeing already freed block (bit %u); block bitmap corrupt.",
+ block);
+ ext4_mark_group_bitmap_corrupted(
+ sb, e4b->bd_group,
EXT4_GROUP_INFO_BBITMAP_CORRUPT);
- mb_regenerate_buddy(e4b);
+ }
goto done;
}
@@ -1572,6 +1620,7 @@
mb_check_buddy(e4b);
mb_mark_used_double(e4b, start, len);
+ this_cpu_inc(discard_pa_seq);
e4b->bd_info->bb_free -= len;
if (e4b->bd_info->bb_first_free == start)
e4b->bd_info->bb_first_free += len;
@@ -1671,11 +1720,15 @@
sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
spin_unlock(&sbi->s_md_lock);
}
-}
+ /*
+ * As we've just preallocated more space than the
+ * user originally requested, we store the allocated
+ * space in a special descriptor.
+ */
+ if (ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
+ ext4_mb_new_preallocation(ac);
-/*
- * regular allocator, for general purposes allocation
- */
+}
static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
struct ext4_buddy *e4b,
@@ -1825,7 +1878,9 @@
struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
struct ext4_free_extent ex;
- if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
+ if (!grp)
+ return -EFSCORRUPTED;
+ if (!(ac->ac_flags & (EXT4_MB_HINT_TRY_GOAL | EXT4_MB_HINT_GOAL_ONLY)))
return 0;
if (grp->bb_free == 0)
return 0;
@@ -1919,7 +1974,7 @@
ext4_mb_use_best_found(ac, e4b);
- BUG_ON(ac->ac_b_ex.fe_len != ac->ac_g_ex.fe_len);
+ BUG_ON(ac->ac_f_ex.fe_len != ac->ac_g_ex.fe_len);
if (EXT4_SB(sb)->s_mb_stats)
atomic_inc(&EXT4_SB(sb)->s_bal_2orders);
@@ -1956,7 +2011,7 @@
/*
* IF we have corrupt bitmap, we won't find any
* free blocks even though group info says we
- * we have free blocks
+ * have free blocks
*/
ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
"%d free clusters as per "
@@ -2036,39 +2091,29 @@
}
/*
- * This is now called BEFORE we load the buddy bitmap.
+ * This is also called BEFORE we load the buddy bitmap.
* Returns either 1 or 0 indicating that the group is either suitable
- * for the allocation or not. In addition it can also return negative
- * error code when something goes wrong.
+ * for the allocation or not.
*/
-static int ext4_mb_good_group(struct ext4_allocation_context *ac,
+static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
ext4_group_t group, int cr)
{
- unsigned free, fragments;
+ ext4_grpblk_t free, fragments;
int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
BUG_ON(cr < 0 || cr >= 4);
+ if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
+ return false;
+
free = grp->bb_free;
if (free == 0)
- return 0;
- if (cr <= 2 && free < ac->ac_g_ex.fe_len)
- return 0;
-
- if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
- return 0;
-
- /* We only do this if the grp has never been initialized */
- if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
- int ret = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS);
- if (ret)
- return ret;
- }
+ return false;
fragments = grp->bb_fragments;
if (fragments == 0)
- return 0;
+ return false;
switch (cr) {
case 0:
@@ -2078,42 +2123,189 @@
if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
(flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
((group % flex_size) == 0))
- return 0;
+ return false;
- if ((ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) ||
- (free / fragments) >= ac->ac_g_ex.fe_len)
- return 1;
+ if (free < ac->ac_g_ex.fe_len)
+ return false;
+
+ if (ac->ac_2order > ac->ac_sb->s_blocksize_bits+1)
+ return true;
if (grp->bb_largest_free_order < ac->ac_2order)
- return 0;
+ return false;
- return 1;
+ return true;
case 1:
if ((free / fragments) >= ac->ac_g_ex.fe_len)
- return 1;
+ return true;
break;
case 2:
if (free >= ac->ac_g_ex.fe_len)
- return 1;
+ return true;
break;
case 3:
- return 1;
+ return true;
default:
BUG();
}
- return 0;
+ return false;
+}
+
+/*
+ * This could return negative error code if something goes wrong
+ * during ext4_mb_init_group(). This should not be called with
+ * ext4_lock_group() held.
+ */
+static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
+ ext4_group_t group, int cr)
+{
+ struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
+ struct super_block *sb = ac->ac_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ bool should_lock = ac->ac_flags & EXT4_MB_STRICT_CHECK;
+ ext4_grpblk_t free;
+ int ret = 0;
+
+ if (!grp)
+ return -EFSCORRUPTED;
+ if (sbi->s_mb_stats)
+ atomic64_inc(&sbi->s_bal_cX_groups_considered[ac->ac_criteria]);
+ if (should_lock)
+ ext4_lock_group(sb, group);
+ free = grp->bb_free;
+ if (free == 0)
+ goto out;
+ if (cr <= 2 && free < ac->ac_g_ex.fe_len)
+ goto out;
+ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
+ goto out;
+ if (should_lock)
+ ext4_unlock_group(sb, group);
+
+ /* We only do this if the grp has never been initialized */
+ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
+ struct ext4_group_desc *gdp =
+ ext4_get_group_desc(sb, group, NULL);
+ int ret;
+
+ /* cr=0/1 is a very optimistic search to find large
+ * good chunks almost for free. If buddy data is not
+ * ready, then this optimization makes no sense. But
+ * we never skip the first block group in a flex_bg,
+ * since this gets used for metadata block allocation,
+ * and we want to make sure we locate metadata blocks
+ * in the first block group in the flex_bg if possible.
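+ *
+ * Worked example (assuming 16 groups per flex_bg): group 32 is a
+ * flex_bg leader, so it is still initialized here even at cr 0/1;
+ * group 33, if its buddy is not set up yet and its bitmap is not
+ * marked BLOCK_UNINIT, is skipped until cr >= 2.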
+ */
+ if (cr < 2 &&
+ (!sbi->s_log_groups_per_flex ||
+ ((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) &&
+ !(ext4_has_group_desc_csum(sb) &&
+ (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))))
+ return 0;
+ ret = ext4_mb_init_group(sb, group, GFP_NOFS);
+ if (ret)
+ return ret;
+ }
+
+ if (should_lock)
+ ext4_lock_group(sb, group);
+ ret = ext4_mb_good_group(ac, group, cr);
+out:
+ if (should_lock)
+ ext4_unlock_group(sb, group);
+ return ret;
+}
+
+/*
+ * Start prefetching @nr block bitmaps starting at @group.
+ * Return the next group which needs to be prefetched.
+ */
+ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group,
+ unsigned int nr, int *cnt)
+{
+ ext4_group_t ngroups = ext4_get_groups_count(sb);
+ struct buffer_head *bh;
+ struct blk_plug plug;
+
+ blk_start_plug(&plug);
+ while (nr-- > 0) {
+ struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group,
+ NULL);
+ struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+
+ /*
+ * Prefetch block groups with free blocks; but don't
+ * bother if it is marked uninitialized on disk, since
+ * it won't require I/O to read. Also only try to
+ * prefetch once, so we avoid the getblk() call, which can
+ * be expensive.
+ */
+ if (gdp && grp && !EXT4_MB_GRP_TEST_AND_SET_READ(grp) &&
+ EXT4_MB_GRP_NEED_INIT(grp) &&
+ ext4_free_group_clusters(sb, gdp) > 0 &&
+ !(ext4_has_group_desc_csum(sb) &&
+ (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) {
+ bh = ext4_read_block_bitmap_nowait(sb, group, true);
+ if (bh && !IS_ERR(bh)) {
+ if (!buffer_uptodate(bh) && cnt)
+ (*cnt)++;
+ brelse(bh);
+ }
+ }
+ if (++group >= ngroups)
+ group = 0;
+ }
+ blk_finish_plug(&plug);
+ return group;
+}
+
+/*
+ * Prefetching reads the block bitmap into the buffer cache; but we
+ * need to make sure that the buddy bitmap in the page cache has been
+ * initialized. Note that ext4_mb_init_group() will block if the I/O
+ * is not yet completed, or indeed if ext4_mb_prefetch() did not
+ * initiate the I/O at all.
+ *
+ * TODO: We should actually kick off the buddy bitmap setup in a work
+ * queue when the buffer I/O is completed, so that we don't block
+ * waiting for the block allocation bitmap read to finish when
+ * ext4_mb_prefetch_fini is called from ext4_mb_regular_allocator().
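+ *
+ * Usage sketch: ext4_mb_prefetch() returns the group following the last
+ * one it touched, so this function is handed that group and walks
+ * backwards over the @nr groups that were just prefetched, wrapping from
+ * group 0 around to the last group when needed.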
+ */
+void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group,
+ unsigned int nr)
+{
+ while (nr-- > 0) {
+ struct ext4_group_desc *gdp;
+ struct ext4_group_info *grp;
+
+ if (!group)
+ group = ext4_get_groups_count(sb);
+ group--;
+ gdp = ext4_get_group_desc(sb, group, NULL);
+ grp = ext4_get_group_info(sb, group);
+
+ if (grp && gdp && EXT4_MB_GRP_NEED_INIT(grp) &&
+ ext4_free_group_clusters(sb, gdp) > 0 &&
+ !(ext4_has_group_desc_csum(sb) &&
+ (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) {
+ if (ext4_mb_init_group(sb, group, GFP_NOFS))
+ break;
+ }
+ }
}
static noinline_for_stack int
ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
{
- ext4_group_t ngroups, group, i;
- int cr;
+ ext4_group_t prefetch_grp = 0, ngroups, group, i;
+ int cr = -1;
int err = 0, first_err = 0;
+ unsigned int nr = 0, prefetch_ios = 0;
struct ext4_sb_info *sbi;
struct super_block *sb;
struct ext4_buddy e4b;
+ int lost;
sb = ac->ac_sb;
sbi = EXT4_SB(sb);
@@ -2133,8 +2325,8 @@
goto out;
/*
- * ac->ac2_order is set only if the fe_len is a power of 2
- * if ac2_order is set we also set criteria to 0 so that we
+ * ac->ac_2order is set only if the fe_len is a power of 2
+ * if ac->ac_2order is set we also set criteria to 0 so that we
* try exact allocation using buddy.
*/
i = fls(ac->ac_g_ex.fe_len);
@@ -2178,6 +2370,7 @@
* from the goal value specified
*/
group = ac->ac_g_ex.fe_group;
+ prefetch_grp = group;
for (i = 0; i < ngroups; group++, i++) {
int ret = 0;
@@ -2189,8 +2382,31 @@
if (group >= ngroups)
group = 0;
+ /*
+ * Batch reads of the block allocation bitmaps
+ * to get multiple READs in flight; limit
+ * prefetching at cr=0/1, otherwise mballoc can
+ * spend a lot of time loading imperfect groups
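+ * (e.g. with 16 groups per flex group and group 35, nr below becomes
+ * 16 - (35 & 15) = 13, capped at sbi->s_mb_prefetch)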
+ */
+ if ((prefetch_grp == group) &&
+ (cr > 1 ||
+ prefetch_ios < sbi->s_mb_prefetch_limit)) {
+ unsigned int curr_ios = prefetch_ios;
+
+ nr = sbi->s_mb_prefetch;
+ if (ext4_has_feature_flex_bg(sb)) {
+ nr = 1 << sbi->s_log_groups_per_flex;
+ nr -= group & (nr - 1);
+ nr = min(nr, sbi->s_mb_prefetch);
+ }
+ prefetch_grp = ext4_mb_prefetch(sb, group,
+ nr, &prefetch_ios);
+ if (prefetch_ios == curr_ios)
+ nr = 0;
+ }
+
/* This now checks without needing the buddy page */
- ret = ext4_mb_good_group(ac, group, cr);
+ ret = ext4_mb_good_group_nolock(ac, group, cr);
if (ret <= 0) {
if (!first_err)
first_err = ret;
@@ -2208,11 +2424,9 @@
* block group
*/
ret = ext4_mb_good_group(ac, group, cr);
- if (ret <= 0) {
+ if (ret == 0) {
ext4_unlock_group(sb, group);
ext4_mb_unload_buddy(&e4b);
- if (!first_err)
- first_err = ret;
continue;
}
@@ -2231,6 +2445,9 @@
if (ac->ac_status != AC_STATUS_CONTINUE)
break;
}
+ /* Processed all groups and haven't found blocks */
+ if (sbi->s_mb_stats && i == ngroups)
+ atomic64_inc(&sbi->s_bal_cX_failed[cr]);
}
if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
@@ -2239,28 +2456,41 @@
* We've been searching too long. Let's try to allocate
* the best chunk we've found so far
*/
-
ext4_mb_try_best_found(ac, &e4b);
if (ac->ac_status != AC_STATUS_FOUND) {
/*
* Someone more lucky has already allocated it.
* The only thing we can do is just take first
* found block(s)
- printk(KERN_DEBUG "EXT4-fs: someone won our chunk\n");
*/
+ lost = atomic_inc_return(&sbi->s_mb_lost_chunks);
+ mb_debug(sb, "lost chunk, group: %u, start: %d, len: %d, lost: %d\n",
+ ac->ac_b_ex.fe_group, ac->ac_b_ex.fe_start,
+ ac->ac_b_ex.fe_len, lost);
+
ac->ac_b_ex.fe_group = 0;
ac->ac_b_ex.fe_start = 0;
ac->ac_b_ex.fe_len = 0;
ac->ac_status = AC_STATUS_CONTINUE;
ac->ac_flags |= EXT4_MB_HINT_FIRST;
cr = 3;
- atomic_inc(&sbi->s_mb_lost_chunks);
goto repeat;
}
}
+
+ if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND)
+ atomic64_inc(&sbi->s_bal_cX_hits[ac->ac_criteria]);
out:
if (!err && ac->ac_status != AC_STATUS_FOUND && first_err)
err = first_err;
+
+ mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n",
+ ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status,
+ ac->ac_flags, cr, err);
+
+ if (nr)
+ ext4_mb_prefetch_fini(sb, prefetch_grp, nr);
+
return err;
}
@@ -2313,6 +2543,8 @@
sizeof(struct ext4_group_info);
grinfo = ext4_get_group_info(sb, group);
+ if (!grinfo)
+ return 0;
/* Load the group info in memory only if not already loaded. */
if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) {
err = ext4_mb_load_buddy(sb, group, &e4b);
@@ -2323,7 +2555,7 @@
buddy_loaded = 1;
}
- memcpy(&sg, ext4_get_group_info(sb, group), i);
+ memcpy(&sg, grinfo, i);
if (buddy_loaded)
ext4_mb_unload_buddy(&e4b);
@@ -2333,7 +2565,7 @@
for (i = 0; i <= 13; i++)
seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ?
sg.info.bb_counters[i] : 0);
- seq_printf(seq, " ]\n");
+ seq_puts(seq, " ]\n");
return 0;
}
@@ -2348,6 +2580,67 @@
.stop = ext4_mb_seq_groups_stop,
.show = ext4_mb_seq_groups_show,
};
+
+int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset)
+{
+ struct super_block *sb = (struct super_block *)seq->private;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ seq_puts(seq, "mballoc:\n");
+ if (!sbi->s_mb_stats) {
+ seq_puts(seq, "\tmb stats collection turned off.\n");
+ seq_puts(seq, "\tTo enable, please write \"1\" to sysfs file mb_stats.\n");
+ return 0;
+ }
+ seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs));
+ seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success));
+
+ seq_printf(seq, "\tgroups_scanned: %u\n", atomic_read(&sbi->s_bal_groups_scanned));
+
+ seq_puts(seq, "\tcr0_stats:\n");
+ seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[0]));
+ seq_printf(seq, "\t\tgroups_considered: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_groups_considered[0]));
+ seq_printf(seq, "\t\tuseless_loops: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_failed[0]));
+
+ seq_puts(seq, "\tcr1_stats:\n");
+ seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[1]));
+ seq_printf(seq, "\t\tgroups_considered: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_groups_considered[1]));
+ seq_printf(seq, "\t\tuseless_loops: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_failed[1]));
+
+ seq_puts(seq, "\tcr2_stats:\n");
+ seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[2]));
+ seq_printf(seq, "\t\tgroups_considered: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_groups_considered[2]));
+ seq_printf(seq, "\t\tuseless_loops: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_failed[2]));
+
+ seq_puts(seq, "\tcr3_stats:\n");
+ seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[3]));
+ seq_printf(seq, "\t\tgroups_considered: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_groups_considered[3]));
+ seq_printf(seq, "\t\tuseless_loops: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_failed[3]));
+ seq_printf(seq, "\textents_scanned: %u\n", atomic_read(&sbi->s_bal_ex_scanned));
+ seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals));
+ seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders));
+ seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks));
+ seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks));
+
+ seq_printf(seq, "\tbuddies_generated: %u/%u\n",
+ atomic_read(&sbi->s_mb_buddies_generated),
+ ext4_get_groups_count(sb));
+ seq_printf(seq, "\tbuddies_time_used: %llu\n",
+ atomic64_read(&sbi->s_mb_generation_time));
+ seq_printf(seq, "\tpreallocated: %u\n",
+ atomic_read(&sbi->s_mb_preallocated));
+ seq_printf(seq, "\tdiscarded: %u\n",
+ atomic_read(&sbi->s_mb_discarded));
+ return 0;
+}
static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
{
@@ -2453,20 +2746,7 @@
meta_group_info[i]->bb_free_root = RB_ROOT;
meta_group_info[i]->bb_largest_free_order = -1; /* uninit */
-#ifdef DOUBLE_CHECK
- {
- struct buffer_head *bh;
- meta_group_info[i]->bb_bitmap =
- kmalloc(sb->s_blocksize, GFP_NOFS);
- BUG_ON(meta_group_info[i]->bb_bitmap == NULL);
- bh = ext4_read_block_bitmap(sb, group);
- BUG_ON(IS_ERR_OR_NULL(bh));
- memcpy(meta_group_info[i]->bb_bitmap, bh->b_data,
- sb->s_blocksize);
- put_bh(bh);
- }
-#endif
-
+ mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group);
return 0;
exit_group_info:
@@ -2510,6 +2790,7 @@
sbi->s_buddy_cache->i_ino = EXT4_BAD_INO;
EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
for (i = 0; i < ngroups; i++) {
+ cond_resched();
desc = ext4_get_group_desc(sb, i, NULL);
if (desc == NULL) {
ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i);
@@ -2519,12 +2800,44 @@
goto err_freebuddy;
}
+ if (ext4_has_feature_flex_bg(sb)) {
+ /* a single flex group is supposed to be read by a single IO.
+ * 2 ^ s_log_groups_per_flex != UINT_MAX as s_mb_prefetch is an
+ * unsigned integer, so the maximum shift is 32.
+ */
+ if (sbi->s_es->s_log_groups_per_flex >= 32) {
+ ext4_msg(sb, KERN_ERR, "too many log groups per flexible block group");
+ goto err_freebuddy;
+ }
+ sbi->s_mb_prefetch = min_t(uint, 1 << sbi->s_es->s_log_groups_per_flex,
+ BLK_MAX_SEGMENT_SIZE >> (sb->s_blocksize_bits - 9));
+ sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */
+ } else {
+ sbi->s_mb_prefetch = 32;
+ }
+ if (sbi->s_mb_prefetch > ext4_get_groups_count(sb))
+ sbi->s_mb_prefetch = ext4_get_groups_count(sb);
+ /* how many real IOs to prefetch within a single allocation at cr=0.
+ * Given cr=0 is a CPU-related optimization we shouldn't try to
+ * load too many groups, at some point we should start to use what
+ * we've got in memory.
+ * with an average random access time 5ms, it'd take a second to get
+ * 200 groups (* N with flex_bg), so let's make this limit 4
+ */
+ sbi->s_mb_prefetch_limit = sbi->s_mb_prefetch * 4;
+ if (sbi->s_mb_prefetch_limit > ext4_get_groups_count(sb))
+ sbi->s_mb_prefetch_limit = ext4_get_groups_count(sb);
+
return 0;
err_freebuddy:
cachep = get_groupinfo_cache(sb->s_blocksize_bits);
- while (i-- > 0)
- kmem_cache_free(cachep, ext4_get_group_info(sb, i));
+ while (i-- > 0) {
+ struct ext4_group_info *grp = ext4_get_group_info(sb, i);
+
+ if (grp)
+ kmem_cache_free(cachep, grp);
+ }
i = sbi->s_group_info_size;
rcu_read_lock();
group_info = rcu_dereference(sbi->s_group_info);
@@ -2633,7 +2946,6 @@
} while (i <= sb->s_blocksize_bits + 1);
spin_lock_init(&sbi->s_md_lock);
- spin_lock_init(&sbi->s_bal_lock);
sbi->s_mb_free_pending = 0;
INIT_LIST_HEAD(&sbi->s_freed_data_list);
@@ -2642,6 +2954,7 @@
sbi->s_mb_stats = MB_DEFAULT_STATS;
sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
+ sbi->s_mb_max_inode_prealloc = MB_DEFAULT_MAX_INODE_PREALLOC;
/*
* The default group preallocation is 512, which for 4k block
* sizes translates to 2 megabytes. However for bigalloc file
@@ -2702,7 +3015,7 @@
}
/* need to called with the ext4 group lock held */
-static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
+static int ext4_mb_cleanup_pa(struct ext4_group_info *grp)
{
struct ext4_prealloc_space *pa;
struct list_head *cur, *tmp;
@@ -2714,9 +3027,7 @@
count++;
kmem_cache_free(ext4_pspace_cachep, pa);
}
- if (count)
- mb_debug(1, "mballoc: %u PAs left\n", count);
-
+ return count;
}
int ext4_mb_release(struct super_block *sb)
@@ -2727,15 +3038,20 @@
struct ext4_group_info *grinfo, ***group_info;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
+ int count;
if (sbi->s_group_info) {
for (i = 0; i < ngroups; i++) {
+ cond_resched();
grinfo = ext4_get_group_info(sb, i);
-#ifdef DOUBLE_CHECK
- kfree(grinfo->bb_bitmap);
-#endif
+ if (!grinfo)
+ continue;
+ mb_group_bb_bitmap_free(grinfo);
ext4_lock_group(sb, i);
- ext4_mb_cleanup_pa(grinfo);
+ count = ext4_mb_cleanup_pa(grinfo);
+ if (count)
+ mb_debug(sb, "mballoc: %d PAs left\n",
+ count);
ext4_unlock_group(sb, i);
kmem_cache_free(cachep, grinfo);
}
@@ -2759,17 +3075,18 @@
atomic_read(&sbi->s_bal_reqs),
atomic_read(&sbi->s_bal_success));
ext4_msg(sb, KERN_INFO,
- "mballoc: %u extents scanned, %u goal hits, "
+ "mballoc: %u extents scanned, %u groups scanned, %u goal hits, "
"%u 2^N hits, %u breaks, %u lost",
atomic_read(&sbi->s_bal_ex_scanned),
+ atomic_read(&sbi->s_bal_groups_scanned),
atomic_read(&sbi->s_bal_goals),
atomic_read(&sbi->s_bal_2orders),
atomic_read(&sbi->s_bal_breaks),
atomic_read(&sbi->s_mb_lost_chunks));
ext4_msg(sb, KERN_INFO,
- "mballoc: %lu generated and it took %Lu",
- sbi->s_mb_buddies_generated,
- sbi->s_mb_generation_time);
+ "mballoc: %u generated and it took %llu",
+ atomic_read(&sbi->s_mb_buddies_generated),
+ atomic64_read(&sbi->s_mb_generation_time));
ext4_msg(sb, KERN_INFO,
"mballoc: %u preallocated, %u discarded",
atomic_read(&sbi->s_mb_preallocated),
@@ -2808,7 +3125,7 @@
struct ext4_group_info *db;
int err, count = 0, count2 = 0;
- mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
+ mb_debug(sb, "gonna free %u blocks in group %u (0x%p):",
entry->efd_count, entry->efd_group, entry);
err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
@@ -2848,7 +3165,8 @@
kmem_cache_free(ext4_free_data_cachep, entry);
ext4_mb_unload_buddy(&e4b);
- mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
+ mb_debug(sb, "freed %d blocks in %d structures\n", count,
+ count2);
}
/*
@@ -2908,23 +3226,26 @@
ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
SLAB_RECLAIM_ACCOUNT);
if (ext4_pspace_cachep == NULL)
- return -ENOMEM;
+ goto out;
ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context,
SLAB_RECLAIM_ACCOUNT);
- if (ext4_ac_cachep == NULL) {
- kmem_cache_destroy(ext4_pspace_cachep);
- return -ENOMEM;
- }
+ if (ext4_ac_cachep == NULL)
+ goto out_pa_free;
ext4_free_data_cachep = KMEM_CACHE(ext4_free_data,
SLAB_RECLAIM_ACCOUNT);
- if (ext4_free_data_cachep == NULL) {
- kmem_cache_destroy(ext4_pspace_cachep);
- kmem_cache_destroy(ext4_ac_cachep);
- return -ENOMEM;
- }
+ if (ext4_free_data_cachep == NULL)
+ goto out_ac_free;
+
return 0;
+
+out_ac_free:
+ kmem_cache_destroy(ext4_ac_cachep);
+out_pa_free:
+ kmem_cache_destroy(ext4_pspace_cachep);
+out:
+ return -ENOMEM;
}
void ext4_exit_mballoc(void)
@@ -3061,6 +3382,110 @@
}
/*
+ * Idempotent helper for Ext4 fast commit replay path to set the state of
+ * blocks in bitmaps and update counters.
+ */
+void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
+ int len, int state)
+{
+ struct buffer_head *bitmap_bh = NULL;
+ struct ext4_group_desc *gdp;
+ struct buffer_head *gdp_bh;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ ext4_group_t group;
+ ext4_grpblk_t blkoff;
+ int i, err;
+ int already;
+ unsigned int clen, clen_changed, thisgrp_len;
+
+ while (len > 0) {
+ ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
+
+ /*
+ * Check to see if we are freeing blocks across a group
+ * boundary.
+ * In case of flex_bg, (block, len) may span across more than one
+ * group. In that case we need to get the corresponding group
+ * metadata to work with, which is why we loop over the range one
+ * group at a time.
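+ * For example, with 8192 blocks per group and one block per
+ * cluster, freeing 100 blocks that start 10 blocks before a group
+ * boundary takes two passes of this loop: 10 blocks in the first
+ * group, then the remaining 90 in the next.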
+ */
+ thisgrp_len = min_t(unsigned int, (unsigned int)len,
+ EXT4_BLOCKS_PER_GROUP(sb) - EXT4_C2B(sbi, blkoff));
+ clen = EXT4_NUM_B2C(sbi, thisgrp_len);
+
+ bitmap_bh = ext4_read_block_bitmap(sb, group);
+ if (IS_ERR(bitmap_bh)) {
+ err = PTR_ERR(bitmap_bh);
+ bitmap_bh = NULL;
+ break;
+ }
+
+ err = -EIO;
+ gdp = ext4_get_group_desc(sb, group, &gdp_bh);
+ if (!gdp)
+ break;
+
+ ext4_lock_group(sb, group);
+ already = 0;
+ for (i = 0; i < clen; i++)
+ if (!mb_test_bit(blkoff + i, bitmap_bh->b_data) ==
+ !state)
+ already++;
+
+ clen_changed = clen - already;
+ if (state)
+ ext4_set_bits(bitmap_bh->b_data, blkoff, clen);
+ else
+ mb_test_and_clear_bits(bitmap_bh->b_data, blkoff, clen);
+ if (ext4_has_group_desc_csum(sb) &&
+ (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
+ gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
+ ext4_free_group_clusters_set(sb, gdp,
+ ext4_free_clusters_after_init(sb, group, gdp));
+ }
+ if (state)
+ clen = ext4_free_group_clusters(sb, gdp) - clen_changed;
+ else
+ clen = ext4_free_group_clusters(sb, gdp) + clen_changed;
+
+ ext4_free_group_clusters_set(sb, gdp, clen);
+ ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh);
+ ext4_group_desc_csum_set(sb, group, gdp);
+
+ ext4_unlock_group(sb, group);
+
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group = ext4_flex_group(sbi, group);
+ struct flex_groups *fg = sbi_array_rcu_deref(sbi,
+ s_flex_groups, flex_group);
+
+ if (state)
+ atomic64_sub(clen_changed, &fg->free_clusters);
+ else
+ atomic64_add(clen_changed, &fg->free_clusters);
+
+ }
+
+ err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
+ if (err)
+ break;
+ sync_dirty_buffer(bitmap_bh);
+ err = ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
+ sync_dirty_buffer(gdp_bh);
+ if (err)
+ break;
+
+ block += thisgrp_len;
+ len -= thisgrp_len;
+ brelse(bitmap_bh);
+ BUG_ON(len < 0);
+ }
+
+ if (err)
+ brelse(bitmap_bh);
+}
+
+/*
* here we normalize request for locality group
* Group request are normalized to s_mb_group_prealloc, which goes to
* s_strip if we set the same via mount option.
@@ -3076,8 +3501,7 @@
BUG_ON(lg == NULL);
ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
- mb_debug(1, "#%u: goal %u blocks for locality group\n",
- current->pid, ac->ac_g_ex.fe_len);
+ mb_debug(sb, "goal %u blocks for locality group\n", ac->ac_g_ex.fe_len);
}
/*
@@ -3089,6 +3513,7 @@
struct ext4_allocation_request *ar)
{
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+ struct ext4_super_block *es = sbi->s_es;
int bsbits, max;
ext4_lblk_t end;
loff_t size, start_off;
@@ -3169,6 +3594,15 @@
}
size = size >> bsbits;
start = start_off >> bsbits;
+
+ /*
+ * For tiny groups (smaller than 8MB) the chosen allocation
+ * alignment may be larger than group size. Make sure the
+ * alignment does not move allocation to a different group which
+ * makes mballoc fail assertions later.
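+ * (e.g. with 2048-block groups, an original logical block 5000 whose
+ * normalized start would otherwise drop to 0 is clamped to 4096, the
+ * first block of its own group)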
+ */
+ start = max(start, rounddown(ac->ac_o_ex.fe_logical,
+ (ext4_lblk_t)EXT4_BLOCKS_PER_GROUP(ac->ac_sb)));
/* don't cover already allocated blocks in selected range */
if (ar->pleft && start <= ar->lleft) {
@@ -3260,35 +3694,39 @@
ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size);
/* define goal start in order to merge */
- if (ar->pright && (ar->lright == (start + size))) {
+ if (ar->pright && (ar->lright == (start + size)) &&
+ ar->pright >= size &&
+ ar->pright - size >= le32_to_cpu(es->s_first_data_block)) {
/* merge to the right */
ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size,
- &ac->ac_f_ex.fe_group,
- &ac->ac_f_ex.fe_start);
+ &ac->ac_g_ex.fe_group,
+ &ac->ac_g_ex.fe_start);
ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
}
- if (ar->pleft && (ar->lleft + 1 == start)) {
+ if (ar->pleft && (ar->lleft + 1 == start) &&
+ ar->pleft + 1 < ext4_blocks_count(es)) {
/* merge to the left */
ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1,
- &ac->ac_f_ex.fe_group,
- &ac->ac_f_ex.fe_start);
+ &ac->ac_g_ex.fe_group,
+ &ac->ac_g_ex.fe_start);
ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
}
- mb_debug(1, "goal: %u(was %u) blocks at %u\n", (unsigned) size,
- (unsigned) orig_size, (unsigned) start);
+ mb_debug(ac->ac_sb, "goal: %lld(was %lld) blocks at %u\n", size,
+ orig_size, start);
}
static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
{
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
- if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) {
+ if (sbi->s_mb_stats && ac->ac_g_ex.fe_len >= 1) {
atomic_inc(&sbi->s_bal_reqs);
atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
atomic_inc(&sbi->s_bal_success);
atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
+ atomic_add(ac->ac_groups_scanned, &sbi->s_bal_groups_scanned);
if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
atomic_inc(&sbi->s_bal_goals);
@@ -3363,9 +3801,10 @@
BUG_ON(start < pa->pa_pstart);
BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len));
BUG_ON(pa->pa_free < len);
+ BUG_ON(ac->ac_b_ex.fe_len <= 0);
pa->pa_free -= len;
- mb_debug(1, "use %llu/%u from inode pa %p\n", start, len, pa);
+ mb_debug(ac->ac_sb, "use %llu/%d from inode pa %p\n", start, len, pa);
}
/*
@@ -3389,7 +3828,8 @@
* in on-disk bitmap -- see ext4_mb_release_context()
* Other CPUs are prevented from allocating from this pa by lg_mutex
*/
- mb_debug(1, "use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa);
+ mb_debug(ac->ac_sb, "use %u/%u from group pa %p\n",
+ pa->pa_lstart-len, len, pa);
}
/*
@@ -3424,7 +3864,7 @@
/*
* search goal blocks in preallocated space
*/
-static noinline_for_stack int
+static noinline_for_stack bool
ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
{
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
@@ -3436,7 +3876,7 @@
/* only data can be preallocated */
if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
- return 0;
+ return false;
/* first, try per-file preallocation */
rcu_read_lock();
@@ -3463,7 +3903,7 @@
spin_unlock(&pa->pa_lock);
ac->ac_criteria = 10;
rcu_read_unlock();
- return 1;
+ return true;
}
spin_unlock(&pa->pa_lock);
}
@@ -3471,12 +3911,12 @@
/* can we use group allocation? */
if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC))
- return 0;
+ return false;
/* inode may have no locality group for some reason */
lg = ac->ac_lg;
if (lg == NULL)
- return 0;
+ return false;
order = fls(ac->ac_o_ex.fe_len) - 1;
if (order > PREALLOC_TB_SIZE - 1)
/* The max size of hash table is PREALLOC_TB_SIZE */
@@ -3505,9 +3945,9 @@
if (cpa) {
ext4_mb_use_group_pa(ac, cpa);
ac->ac_criteria = 20;
- return 1;
+ return true;
}
- return 0;
+ return false;
}
/*
@@ -3524,6 +3964,8 @@
struct ext4_free_data *entry;
grp = ext4_get_group_info(sb, group);
+ if (!grp)
+ return;
n = rb_first(&(grp->bb_free_root));
while (n) {
@@ -3551,6 +3993,9 @@
int preallocated = 0;
int len;
+ if (!grp)
+ return;
+
/* all form of preallocation discards first load group,
* so the only competing code is preallocation use.
* we don't need any locking here
@@ -3572,7 +4017,27 @@
ext4_set_bits(bitmap, start, len);
preallocated += len;
}
- mb_debug(1, "preallocated %u for group %u\n", preallocated, group);
+ mb_debug(sb, "preallocated %d for group %u\n", preallocated, group);
+}
+
+static void ext4_mb_mark_pa_deleted(struct super_block *sb,
+ struct ext4_prealloc_space *pa)
+{
+ struct ext4_inode_info *ei;
+
+ if (pa->pa_deleted) {
+ ext4_warning(sb, "deleted pa, type:%d, pblk:%llu, lblk:%u, len:%d\n",
+ pa->pa_type, pa->pa_pstart, pa->pa_lstart,
+ pa->pa_len);
+ return;
+ }
+
+ pa->pa_deleted = 1;
+
+ if (pa->pa_type == MB_INODE_PA) {
+ ei = EXT4_I(pa->pa_inode);
+ atomic_dec(&ei->i_prealloc_active);
+ }
}
static void ext4_mb_pa_callback(struct rcu_head *head)
@@ -3607,7 +4072,7 @@
return;
}
- pa->pa_deleted = 1;
+ ext4_mb_mark_pa_deleted(sb, pa);
spin_unlock(&pa->pa_lock);
grp_blk = pa->pa_pstart;
@@ -3648,7 +4113,7 @@
/*
* creates new preallocated space for given inode
*/
-static noinline_for_stack int
+static noinline_for_stack void
ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
{
struct super_block *sb = ac->ac_sb;
@@ -3661,16 +4126,13 @@
BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
BUG_ON(ac->ac_status != AC_STATUS_FOUND);
BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
+ BUG_ON(ac->ac_pa == NULL);
- pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
- if (pa == NULL)
- return -ENOMEM;
+ pa = ac->ac_pa;
if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) {
- int winl;
- int wins;
- int win;
- int offs;
+ int new_bex_start;
+ int new_bex_end;
/* we can't allocate as much as normalizer wants.
* so, found space must get proper lstart
@@ -3678,26 +4140,40 @@
BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical);
BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len);
- /* we're limited by original request in that
- * logical block must be covered any way
- * winl is window we can move our chunk within */
- winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical;
+ /*
+ * Use the below logic for adjusting best extent as it keeps
+ * fragmentation in check while ensuring logical range of best
+ * extent doesn't overflow out of goal extent:
+ *
+ * 1. Check if best ex can be kept at end of goal and still
+ * cover original start
+ * 2. Else, check if best ex can be kept at start of goal and
+ * still cover original start
+ * 3. Else, keep the best ex at start of original request.
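+ *
+ * Worked example (one block per cluster): if the goal extent covers
+ * logical blocks [0, 32), the best extent is 8 blocks long and the
+ * original request starts at block 5, case 1 would place the best
+ * extent at [24, 32) and miss block 5, so case 2 places it at [0, 8),
+ * which still covers the original start.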
+ */
+ new_bex_end = ac->ac_g_ex.fe_logical +
+ EXT4_C2B(sbi, ac->ac_g_ex.fe_len);
+ new_bex_start = new_bex_end - EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
+ if (ac->ac_o_ex.fe_logical >= new_bex_start)
+ goto adjust_bex;
- /* also, we should cover whole original request */
- wins = EXT4_C2B(sbi, ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len);
+ new_bex_start = ac->ac_g_ex.fe_logical;
+ new_bex_end =
+ new_bex_start + EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
+ if (ac->ac_o_ex.fe_logical < new_bex_end)
+ goto adjust_bex;
- /* the smallest one defines real window */
- win = min(winl, wins);
+ new_bex_start = ac->ac_o_ex.fe_logical;
+ new_bex_end =
+ new_bex_start + EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
- offs = ac->ac_o_ex.fe_logical %
- EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
- if (offs && offs < win)
- win = offs;
+adjust_bex:
+ ac->ac_b_ex.fe_logical = new_bex_start;
- ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical -
- EXT4_NUM_B2C(sbi, win);
BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
+ BUG_ON(new_bex_end > (ac->ac_g_ex.fe_logical +
+ EXT4_C2B(sbi, ac->ac_g_ex.fe_len)));
}
/* preallocation can change ac_b_ex, thus we store actually
@@ -3708,15 +4184,14 @@
pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
pa->pa_len = ac->ac_b_ex.fe_len;
pa->pa_free = pa->pa_len;
- atomic_set(&pa->pa_count, 1);
spin_lock_init(&pa->pa_lock);
INIT_LIST_HEAD(&pa->pa_inode_list);
INIT_LIST_HEAD(&pa->pa_group_list);
pa->pa_deleted = 0;
pa->pa_type = MB_INODE_PA;
- mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa,
- pa->pa_pstart, pa->pa_len, pa->pa_lstart);
+ mb_debug(sb, "new inode pa %p: %llu/%d for %u\n", pa, pa->pa_pstart,
+ pa->pa_len, pa->pa_lstart);
trace_ext4_mb_new_inode_pa(ac, pa);
ext4_mb_use_inode_pa(ac, pa);
@@ -3724,25 +4199,24 @@
ei = EXT4_I(ac->ac_inode);
grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
+ if (!grp)
+ return;
pa->pa_obj_lock = &ei->i_prealloc_lock;
pa->pa_inode = ac->ac_inode;
- ext4_lock_group(sb, ac->ac_b_ex.fe_group);
list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
- ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
spin_lock(pa->pa_obj_lock);
list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list);
spin_unlock(pa->pa_obj_lock);
-
- return 0;
+ atomic_inc(&ei->i_prealloc_active);
}
/*
* creates new preallocated space for locality group inodes belongs to
*/
-static noinline_for_stack int
+static noinline_for_stack void
ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
{
struct super_block *sb = ac->ac_sb;
@@ -3754,11 +4228,9 @@
BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
BUG_ON(ac->ac_status != AC_STATUS_FOUND);
BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
+ BUG_ON(ac->ac_pa == NULL);
- BUG_ON(ext4_pspace_cachep == NULL);
- pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
- if (pa == NULL)
- return -ENOMEM;
+ pa = ac->ac_pa;
/* preallocation can change ac_b_ex, thus we store actually
* allocated blocks for history */
@@ -3768,47 +4240,42 @@
pa->pa_lstart = pa->pa_pstart;
pa->pa_len = ac->ac_b_ex.fe_len;
pa->pa_free = pa->pa_len;
- atomic_set(&pa->pa_count, 1);
spin_lock_init(&pa->pa_lock);
INIT_LIST_HEAD(&pa->pa_inode_list);
INIT_LIST_HEAD(&pa->pa_group_list);
pa->pa_deleted = 0;
pa->pa_type = MB_GROUP_PA;
- mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa,
- pa->pa_pstart, pa->pa_len, pa->pa_lstart);
+ mb_debug(sb, "new group pa %p: %llu/%d for %u\n", pa, pa->pa_pstart,
+ pa->pa_len, pa->pa_lstart);
trace_ext4_mb_new_group_pa(ac, pa);
ext4_mb_use_group_pa(ac, pa);
atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
+ if (!grp)
+ return;
lg = ac->ac_lg;
BUG_ON(lg == NULL);
pa->pa_obj_lock = &lg->lg_prealloc_lock;
pa->pa_inode = NULL;
- ext4_lock_group(sb, ac->ac_b_ex.fe_group);
list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
- ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
/*
* We will later add the new pa to the right bucket
* after updating the pa_free in ext4_mb_release_context
*/
- return 0;
}
-static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
+static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
{
- int err;
-
if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
- err = ext4_mb_new_group_pa(ac);
+ ext4_mb_new_group_pa(ac);
else
- err = ext4_mb_new_inode_pa(ac);
- return err;
+ ext4_mb_new_inode_pa(ac);
}
/*
@@ -3843,7 +4310,7 @@
if (bit >= end)
break;
next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
- mb_debug(1, " free preallocated %u/%u in group %u\n",
+ mb_debug(sb, "free preallocated %u/%u in group %u\n",
(unsigned) ext4_group_first_block_no(sb, group) + bit,
(unsigned) next - bit, (unsigned) group);
free += next - bit;
@@ -3857,10 +4324,10 @@
}
if (free != pa->pa_free) {
ext4_msg(e4b->bd_sb, KERN_CRIT,
- "pa %p: logic %lu, phys. %lu, len %lu",
+ "pa %p: logic %lu, phys. %lu, len %d",
pa, (unsigned long) pa->pa_lstart,
(unsigned long) pa->pa_pstart,
- (unsigned long) pa->pa_len);
+ pa->pa_len);
ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u",
free, pa->pa_free);
/*
@@ -3884,7 +4351,11 @@
trace_ext4_mb_release_group_pa(sb, pa);
BUG_ON(pa->pa_deleted == 0);
ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
- BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
+ if (unlikely(group != e4b->bd_group && pa->pa_len != 0)) {
+ ext4_warning(sb, "bad group: expected %u, group %u, pa_start %llu",
+ e4b->bd_group, group, pa->pa_pstart);
+ return 0;
+ }
mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len);
@@ -3903,7 +4374,7 @@
*/
static noinline_for_stack int
ext4_mb_discard_group_preallocations(struct super_block *sb,
- ext4_group_t group, int needed)
+ ext4_group_t group, int *busy)
{
struct ext4_group_info *grp = ext4_get_group_info(sb, group);
struct buffer_head *bitmap_bh = NULL;
@@ -3911,20 +4382,21 @@
struct list_head list;
struct ext4_buddy e4b;
int err;
- int busy = 0;
int free = 0;
- mb_debug(1, "discard preallocation for group %u\n", group);
-
- if (list_empty(&grp->bb_prealloc_list))
+ if (!grp)
return 0;
+ mb_debug(sb, "discard preallocation for group %u\n", group);
+ if (list_empty(&grp->bb_prealloc_list))
+ goto out_dbg;
bitmap_bh = ext4_read_block_bitmap(sb, group);
if (IS_ERR(bitmap_bh)) {
err = PTR_ERR(bitmap_bh);
- ext4_error(sb, "Error %d reading block bitmap for %u",
- err, group);
- return 0;
+ ext4_error_err(sb, -err,
+ "Error %d reading block bitmap for %u",
+ err, group);
+ goto out_dbg;
}
err = ext4_mb_load_buddy(sb, group, &e4b);
@@ -3932,21 +4404,17 @@
ext4_warning(sb, "Error %d loading buddy information for %u",
err, group);
put_bh(bitmap_bh);
- return 0;
+ goto out_dbg;
}
- if (needed == 0)
- needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1;
-
INIT_LIST_HEAD(&list);
-repeat:
ext4_lock_group(sb, group);
list_for_each_entry_safe(pa, tmp,
&grp->bb_prealloc_list, pa_group_list) {
spin_lock(&pa->pa_lock);
if (atomic_read(&pa->pa_count)) {
spin_unlock(&pa->pa_lock);
- busy = 1;
+ *busy = 1;
continue;
}
if (pa->pa_deleted) {
@@ -3955,7 +4423,10 @@
}
/* seems this one can be freed ... */
- pa->pa_deleted = 1;
+ ext4_mb_mark_pa_deleted(sb, pa);
+
+ if (!free)
+ this_cpu_inc(discard_pa_seq);
/* we can trust pa_free ... */
free += pa->pa_free;
@@ -3964,20 +4435,6 @@
list_del(&pa->pa_group_list);
list_add(&pa->u.pa_tmp_list, &list);
- }
-
- /* if we still need more blocks and some PAs were used, try again */
- if (free < needed && busy) {
- busy = 0;
- ext4_unlock_group(sb, group);
- cond_resched();
- goto repeat;
- }
-
- /* found anything to free? */
- if (list_empty(&list)) {
- BUG_ON(free != 0);
- goto out;
}
/* now free all selected PAs */
@@ -3997,10 +4454,12 @@
call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
}
-out:
ext4_unlock_group(sb, group);
ext4_mb_unload_buddy(&e4b);
put_bh(bitmap_bh);
+out_dbg:
+ mb_debug(sb, "discarded (%d) blocks preallocated for group %u bb_free (%d)\n",
+ free, group, grp->bb_free);
return free;
}
@@ -4013,7 +4472,7 @@
*
* FIXME!! Make sure it is valid at all the call sites
*/
-void ext4_discard_preallocations(struct inode *inode)
+void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
{
struct ext4_inode_info *ei = EXT4_I(inode);
struct super_block *sb = inode->i_sb;
@@ -4029,16 +4488,24 @@
return;
}
- mb_debug(1, "discard preallocation for inode %lu\n", inode->i_ino);
- trace_ext4_discard_preallocations(inode);
+ if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)
+ return;
+
+ mb_debug(sb, "discard preallocation for inode %lu\n",
+ inode->i_ino);
+ trace_ext4_discard_preallocations(inode,
+ atomic_read(&ei->i_prealloc_active), needed);
INIT_LIST_HEAD(&list);
+
+ if (needed == 0)
+ needed = UINT_MAX;
repeat:
/* first, collect all pa's in the inode */
spin_lock(&ei->i_prealloc_lock);
- while (!list_empty(&ei->i_prealloc_list)) {
- pa = list_entry(ei->i_prealloc_list.next,
+ while (!list_empty(&ei->i_prealloc_list) && needed) {
+ pa = list_entry(ei->i_prealloc_list.prev,
struct ext4_prealloc_space, pa_inode_list);
BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock);
spin_lock(&pa->pa_lock);
@@ -4055,10 +4522,11 @@
}
if (pa->pa_deleted == 0) {
- pa->pa_deleted = 1;
+ ext4_mb_mark_pa_deleted(sb, pa);
spin_unlock(&pa->pa_lock);
list_del_rcu(&pa->pa_inode_list);
list_add(&pa->u.pa_tmp_list, &list);
+ needed--;
continue;
}
@@ -4090,16 +4558,16 @@
err = ext4_mb_load_buddy_gfp(sb, group, &e4b,
GFP_NOFS|__GFP_NOFAIL);
if (err) {
- ext4_error(sb, "Error %d loading buddy information for %u",
- err, group);
+ ext4_error_err(sb, -err, "Error %d loading buddy information for %u",
+ err, group);
continue;
}
bitmap_bh = ext4_read_block_bitmap(sb, group);
if (IS_ERR(bitmap_bh)) {
err = PTR_ERR(bitmap_bh);
- ext4_error(sb, "Error %d reading block bitmap for %u",
- err, group);
+ ext4_error_err(sb, -err, "Error %d reading block bitmap for %u",
+ err, group);
ext4_mb_unload_buddy(&e4b);
continue;
}
@@ -4117,22 +4585,77 @@
}
}
+static int ext4_mb_pa_alloc(struct ext4_allocation_context *ac)
+{
+ struct ext4_prealloc_space *pa;
+
+ BUG_ON(ext4_pspace_cachep == NULL);
+ pa = kmem_cache_zalloc(ext4_pspace_cachep, GFP_NOFS);
+ if (!pa)
+ return -ENOMEM;
+ atomic_set(&pa->pa_count, 1);
+ ac->ac_pa = pa;
+ return 0;
+}
+
+static void ext4_mb_pa_free(struct ext4_allocation_context *ac)
+{
+ struct ext4_prealloc_space *pa = ac->ac_pa;
+
+ BUG_ON(!pa);
+ ac->ac_pa = NULL;
+ WARN_ON(!atomic_dec_and_test(&pa->pa_count));
+ kmem_cache_free(ext4_pspace_cachep, pa);
+}
+
#ifdef CONFIG_EXT4_DEBUG
+static inline void ext4_mb_show_pa(struct super_block *sb)
+{
+ ext4_group_t i, ngroups;
+
+ if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
+ return;
+
+ ngroups = ext4_get_groups_count(sb);
+ mb_debug(sb, "groups: ");
+ for (i = 0; i < ngroups; i++) {
+ struct ext4_group_info *grp = ext4_get_group_info(sb, i);
+ struct ext4_prealloc_space *pa;
+ ext4_grpblk_t start;
+ struct list_head *cur;
+
+ if (!grp)
+ continue;
+ ext4_lock_group(sb, i);
+ list_for_each(cur, &grp->bb_prealloc_list) {
+ pa = list_entry(cur, struct ext4_prealloc_space,
+ pa_group_list);
+ spin_lock(&pa->pa_lock);
+ ext4_get_group_no_and_offset(sb, pa->pa_pstart,
+ NULL, &start);
+ spin_unlock(&pa->pa_lock);
+ mb_debug(sb, "PA:%u:%d:%d\n", i, start,
+ pa->pa_len);
+ }
+ ext4_unlock_group(sb, i);
+ mb_debug(sb, "%u: %d/%d\n", i, grp->bb_free,
+ grp->bb_fragments);
+ }
+}
+
static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
{
struct super_block *sb = ac->ac_sb;
- ext4_group_t ngroups, i;
- if (!ext4_mballoc_debug ||
- (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED))
+ if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
return;
- ext4_msg(ac->ac_sb, KERN_ERR, "Can't allocate:"
+ mb_debug(sb, "Can't allocate:"
" Allocation context details:");
- ext4_msg(ac->ac_sb, KERN_ERR, "status %d flags %d",
+ mb_debug(sb, "status %u flags 0x%x",
ac->ac_status, ac->ac_flags);
- ext4_msg(ac->ac_sb, KERN_ERR, "orig %lu/%lu/%lu@%lu, "
- "goal %lu/%lu/%lu@%lu, "
+ mb_debug(sb, "orig %lu/%lu/%lu@%lu, "
+ "goal %lu/%lu/%lu@%lu, "
"best %lu/%lu/%lu@%lu cr %d",
(unsigned long)ac->ac_o_ex.fe_group,
(unsigned long)ac->ac_o_ex.fe_start,
@@ -4147,37 +4670,17 @@
(unsigned long)ac->ac_b_ex.fe_len,
(unsigned long)ac->ac_b_ex.fe_logical,
(int)ac->ac_criteria);
- ext4_msg(ac->ac_sb, KERN_ERR, "%d found", ac->ac_found);
- ext4_msg(ac->ac_sb, KERN_ERR, "groups: ");
- ngroups = ext4_get_groups_count(sb);
- for (i = 0; i < ngroups; i++) {
- struct ext4_group_info *grp = ext4_get_group_info(sb, i);
- struct ext4_prealloc_space *pa;
- ext4_grpblk_t start;
- struct list_head *cur;
- ext4_lock_group(sb, i);
- list_for_each(cur, &grp->bb_prealloc_list) {
- pa = list_entry(cur, struct ext4_prealloc_space,
- pa_group_list);
- spin_lock(&pa->pa_lock);
- ext4_get_group_no_and_offset(sb, pa->pa_pstart,
- NULL, &start);
- spin_unlock(&pa->pa_lock);
- printk(KERN_ERR "PA:%u:%d:%u \n", i,
- start, pa->pa_len);
- }
- ext4_unlock_group(sb, i);
-
- if (grp->bb_free == 0)
- continue;
- printk(KERN_ERR "%u: %d/%d \n",
- i, grp->bb_free, grp->bb_fragments);
- }
- printk(KERN_ERR "\n");
+ mb_debug(sb, "%u found", ac->ac_found);
+ ext4_mb_show_pa(sb);
}
#else
+static inline void ext4_mb_show_pa(struct super_block *sb)
+{
+ return;
+}
static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac)
{
+ ext4_mb_show_pa(ac->ac_sb);
return;
}
#endif
@@ -4205,9 +4708,8 @@
isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
>> bsbits;
- if ((size == isize) &&
- !ext4_fs_is_busy(sbi) &&
- (atomic_read(&ac->ac_inode->i_writecount) == 0)) {
+ if ((size == isize) && !ext4_fs_is_busy(sbi) &&
+ !inode_is_open_for_write(ac->ac_inode)) {
ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
return;
}
@@ -4277,17 +4779,17 @@
ac->ac_g_ex = ac->ac_o_ex;
ac->ac_flags = ar->flags;
- /* we have to define context: we'll we work with a file or
+ /* we have to define context: we'll work with a file or
* locality group. this is a policy, actually */
ext4_mb_group_or_file(ac);
- mb_debug(1, "init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, "
+ mb_debug(sb, "init ac: %u blocks @ %u, goal %u, flags 0x%x, 2^%d, "
"left: %u/%u, right %u/%u to %swritable\n",
(unsigned) ar->len, (unsigned) ar->logical,
(unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
(unsigned) ar->lleft, (unsigned) ar->pleft,
(unsigned) ar->lright, (unsigned) ar->pright,
- atomic_read(&ar->inode->i_writecount) ? "" : "non-");
+ inode_is_open_for_write(ar->inode) ? "" : "non-");
return 0;
}
@@ -4302,13 +4804,14 @@
struct list_head discard_list;
struct ext4_prealloc_space *pa, *tmp;
- mb_debug(1, "discard locality group preallocation\n");
+ mb_debug(sb, "discard locality group preallocation\n");
INIT_LIST_HEAD(&discard_list);
spin_lock(&lg->lg_prealloc_lock);
list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
- pa_inode_list) {
+ pa_inode_list,
+ lockdep_is_held(&lg->lg_prealloc_lock)) {
spin_lock(&pa->pa_lock);
if (atomic_read(&pa->pa_count)) {
/*
@@ -4327,7 +4830,7 @@
BUG_ON(pa->pa_type != MB_GROUP_PA);
/* seems this one can be freed ... */
- pa->pa_deleted = 1;
+ ext4_mb_mark_pa_deleted(sb, pa);
spin_unlock(&pa->pa_lock);
list_del_rcu(&pa->pa_inode_list);
@@ -4353,8 +4856,8 @@
err = ext4_mb_load_buddy_gfp(sb, group, &e4b,
GFP_NOFS|__GFP_NOFAIL);
if (err) {
- ext4_error(sb, "Error %d loading buddy information for %u",
- err, group);
+ ext4_error_err(sb, -err, "Error %d loading buddy information for %u",
+ err, group);
continue;
}
ext4_lock_group(sb, group);
@@ -4391,7 +4894,8 @@
/* Add the prealloc space to lg */
spin_lock(&lg->lg_prealloc_lock);
list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
- pa_inode_list) {
+ pa_inode_list,
+ lockdep_is_held(&lg->lg_prealloc_lock)) {
spin_lock(&tmp_pa->pa_lock);
if (tmp_pa->pa_deleted) {
spin_unlock(&tmp_pa->pa_lock);
@@ -4425,10 +4929,29 @@
}
/*
+ * if the per-inode prealloc list is too long, trim some PAs
+ */
+static void ext4_mb_trim_inode_pa(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ int count, delta;
+
+ count = atomic_read(&ei->i_prealloc_active);
+ delta = (sbi->s_mb_max_inode_prealloc >> 2) + 1;
+ if (count > sbi->s_mb_max_inode_prealloc + delta) {
+ count -= sbi->s_mb_max_inode_prealloc;
+ ext4_discard_preallocations(inode, count);
+ }
+}
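As a rough illustration of the hysteresis this helper applies: trimming only starts once the PA count exceeds sbi->s_mb_max_inode_prealloc by a quarter of that limit plus one, and then only the excess over the limit is discarded. The sketch below is stand-alone user-space C; the limit value 512 is an assumed example, not something taken from this patch.

/* Illustrative sketch of the trim threshold used above (assumed values). */
#include <stdio.h>

int main(void)
{
	int max_prealloc = 512;               /* assumed s_mb_max_inode_prealloc */
	int delta = (max_prealloc >> 2) + 1;  /* 129: slack before trimming kicks in */
	int counts[] = { 512, 600, 641, 642, 700 };
	int i;

	for (i = 0; i < 5; i++) {
		int count = counts[i];
		int to_trim = count > max_prealloc + delta ? count - max_prealloc : 0;

		printf("PAs=%d -> discard %d\n", count, to_trim);
	}
	return 0;
}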
+
+/*
 * release all resources we used in allocation
*/
static int ext4_mb_release_context(struct ext4_allocation_context *ac)
{
+ struct inode *inode = ac->ac_inode;
+ struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
struct ext4_prealloc_space *pa = ac->ac_pa;
if (pa) {
@@ -4440,21 +4963,31 @@
pa->pa_free -= ac->ac_b_ex.fe_len;
pa->pa_len -= ac->ac_b_ex.fe_len;
spin_unlock(&pa->pa_lock);
+
+ /*
+ * We want to add the pa to the right bucket.
+ * Remove it from the list and while adding
+ * make sure the list to which we are adding
+ * doesn't grow big.
+ */
+ if (likely(pa->pa_free)) {
+ spin_lock(pa->pa_obj_lock);
+ list_del_rcu(&pa->pa_inode_list);
+ spin_unlock(pa->pa_obj_lock);
+ ext4_mb_add_n_trim(ac);
+ }
}
- }
- if (pa) {
- /*
- * We want to add the pa to the right bucket.
- * Remove it from the list and while adding
- * make sure the list to which we are adding
- * doesn't grow big.
- */
- if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) {
+
+ if (pa->pa_type == MB_INODE_PA) {
+ /*
+ * treat the per-inode prealloc list as an LRU list, then try
+ * to trim the least recently used PA.
+ */
spin_lock(pa->pa_obj_lock);
- list_del_rcu(&pa->pa_inode_list);
+ list_move(&pa->pa_inode_list, &ei->i_prealloc_list);
spin_unlock(pa->pa_obj_lock);
- ext4_mb_add_n_trim(ac);
}
+
ext4_mb_put_pa(ac, ac->ac_sb, pa);
}
if (ac->ac_bitmap_page)
@@ -4464,6 +4997,7 @@
if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
mutex_unlock(&ac->ac_lg->lg_mutex);
ext4_mb_collect_stats(ac);
+ ext4_mb_trim_inode_pa(inode);
return 0;
}
@@ -4471,17 +5005,55 @@
{
ext4_group_t i, ngroups = ext4_get_groups_count(sb);
int ret;
- int freed = 0;
+ int freed = 0, busy = 0;
+ int retry = 0;
trace_ext4_mb_discard_preallocations(sb, needed);
+
+ if (needed == 0)
+ needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1;
+ repeat:
for (i = 0; i < ngroups && needed > 0; i++) {
- ret = ext4_mb_discard_group_preallocations(sb, i, needed);
+ ret = ext4_mb_discard_group_preallocations(sb, i, &busy);
freed += ret;
needed -= ret;
+ cond_resched();
+ }
+
+ if (needed > 0 && busy && ++retry < 3) {
+ busy = 0;
+ goto repeat;
}
return freed;
}
+
+static bool ext4_mb_discard_preallocations_should_retry(struct super_block *sb,
+ struct ext4_allocation_context *ac, u64 *seq)
+{
+ int freed;
+ u64 seq_retry = 0;
+ bool ret = false;
+
+ freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len);
+ if (freed) {
+ ret = true;
+ goto out_dbg;
+ }
+ seq_retry = ext4_get_discard_pa_seq_sum();
+ if (!(ac->ac_flags & EXT4_MB_STRICT_CHECK) || seq_retry != *seq) {
+ ac->ac_flags |= EXT4_MB_STRICT_CHECK;
+ *seq = seq_retry;
+ ret = true;
+ }
+
+out_dbg:
+ mb_debug(sb, "freed %d, retry ? %s\n", freed, ret ? "yes" : "no");
+ return ret;
+}
+
+static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle,
+ struct ext4_allocation_request *ar, int *errp);
/*
* Main entry point into mballoc to allocate blocks
@@ -4491,19 +5063,22 @@
ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
struct ext4_allocation_request *ar, int *errp)
{
- int freed;
struct ext4_allocation_context *ac = NULL;
struct ext4_sb_info *sbi;
struct super_block *sb;
ext4_fsblk_t block = 0;
unsigned int inquota = 0;
unsigned int reserv_clstrs = 0;
+ int retries = 0;
+ u64 seq;
might_sleep();
sb = ar->inode->i_sb;
sbi = EXT4_SB(sb);
trace_ext4_request_blocks(ar);
+ if (sbi->s_mount_state & EXT4_FC_REPLAY)
+ return ext4_mb_new_blocks_simple(handle, ar, errp);
/* Allow to use superuser reservation for quota file */
if (ext4_is_quota_file(ar->inode))
@@ -4522,6 +5097,7 @@
ar->len = ar->len >> 1;
}
if (!ar->len) {
+ ext4_mb_show_pa(sb);
*errp = -ENOSPC;
return 0;
}
@@ -4559,26 +5135,32 @@
}
ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
+ seq = this_cpu_read(discard_pa_seq);
if (!ext4_mb_use_preallocated(ac)) {
ac->ac_op = EXT4_MB_HISTORY_ALLOC;
ext4_mb_normalize_request(ac, ar);
+
+ *errp = ext4_mb_pa_alloc(ac);
+ if (*errp)
+ goto errout;
repeat:
/* allocate space in core */
*errp = ext4_mb_regular_allocator(ac);
- if (*errp)
- goto discard_and_exit;
-
- /* as we've just preallocated more space than
- * user requested originally, we store allocated
- * space in a special descriptor */
- if (ac->ac_status == AC_STATUS_FOUND &&
- ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
- *errp = ext4_mb_new_preallocation(ac);
+ /*
+ * pa allocated above is added to grp->bb_prealloc_list only
+ * when we were able to allocate some blocks, i.e. when
+ * ac->ac_status == AC_STATUS_FOUND.
+ * An error returned above means ac->ac_status != AC_STATUS_FOUND,
+ * so we have to free this pa here itself.
+ */
if (*errp) {
- discard_and_exit:
+ ext4_mb_pa_free(ac);
ext4_discard_allocated_blocks(ac);
goto errout;
}
+ if (ac->ac_status == AC_STATUS_FOUND &&
+ ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len)
+ ext4_mb_pa_free(ac);
}
if (likely(ac->ac_status == AC_STATUS_FOUND)) {
*errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
@@ -4590,9 +5172,14 @@
ar->len = ac->ac_b_ex.fe_len;
}
} else {
- freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len);
- if (freed)
+ if (++retries < 3 &&
+ ext4_mb_discard_preallocations_should_retry(sb, ac, &seq))
goto repeat;
+ /*
+ * If block allocation fails, the pa allocated above
+ * needs to be freed here.
+ */
+ ext4_mb_pa_free(ac);
*errp = -ENOSPC;
}
@@ -4721,21 +5308,128 @@
return 0;
}
+/*
+ * Simple allocator for the ext4 fast commit replay path. It searches
+ * linearly for free blocks starting at the goal block, skipping blocks
+ * that will be in use after fast commit replay.
+ */
+static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle,
+ struct ext4_allocation_request *ar, int *errp)
+{
+ struct buffer_head *bitmap_bh;
+ struct super_block *sb = ar->inode->i_sb;
+ ext4_group_t group;
+ ext4_grpblk_t blkoff;
+ ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
+ ext4_grpblk_t i = 0;
+ ext4_fsblk_t goal, block;
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+
+ goal = ar->goal;
+ if (goal < le32_to_cpu(es->s_first_data_block) ||
+ goal >= ext4_blocks_count(es))
+ goal = le32_to_cpu(es->s_first_data_block);
+
+ ar->len = 0;
+ ext4_get_group_no_and_offset(sb, goal, &group, &blkoff);
+ for (; group < ext4_get_groups_count(sb); group++) {
+ bitmap_bh = ext4_read_block_bitmap(sb, group);
+ if (IS_ERR(bitmap_bh)) {
+ *errp = PTR_ERR(bitmap_bh);
+ pr_warn("Failed to read block bitmap\n");
+ return 0;
+ }
+
+ ext4_get_group_no_and_offset(sb,
+ max(ext4_group_first_block_no(sb, group), goal),
+ NULL, &blkoff);
+ while (1) {
+ i = mb_find_next_zero_bit(bitmap_bh->b_data, max,
+ blkoff);
+ if (i >= max)
+ break;
+ if (ext4_fc_replay_check_excluded(sb,
+ ext4_group_first_block_no(sb, group) + i)) {
+ blkoff = i + 1;
+ } else
+ break;
+ }
+ brelse(bitmap_bh);
+ if (i < max)
+ break;
+ }
+
+ if (group >= ext4_get_groups_count(sb) || i >= max) {
+ *errp = -ENOSPC;
+ return 0;
+ }
+
+ block = ext4_group_first_block_no(sb, group) + i;
+ ext4_mb_mark_bb(sb, block, 1, 1);
+ ar->len = 1;
+
+ return block;
+}
+
+static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block,
+ unsigned long count)
+{
+ struct buffer_head *bitmap_bh;
+ struct super_block *sb = inode->i_sb;
+ struct ext4_group_desc *gdp;
+ struct buffer_head *gdp_bh;
+ ext4_group_t group;
+ ext4_grpblk_t blkoff;
+ int already_freed = 0, err, i;
+
+ ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
+ bitmap_bh = ext4_read_block_bitmap(sb, group);
+ if (IS_ERR(bitmap_bh)) {
+ err = PTR_ERR(bitmap_bh);
+ pr_warn("Failed to read block bitmap\n");
+ return;
+ }
+ gdp = ext4_get_group_desc(sb, group, &gdp_bh);
+ if (!gdp)
+ return;
+
+ for (i = 0; i < count; i++) {
+ if (!mb_test_bit(blkoff + i, bitmap_bh->b_data))
+ already_freed++;
+ }
+ mb_clear_bits(bitmap_bh->b_data, blkoff, count);
+ err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
+ if (err)
+ return;
+ ext4_free_group_clusters_set(
+ sb, gdp, ext4_free_group_clusters(sb, gdp) +
+ count - already_freed);
+ ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh);
+ ext4_group_desc_csum_set(sb, group, gdp);
+ ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
+ sync_dirty_buffer(bitmap_bh);
+ sync_dirty_buffer(gdp_bh);
+ brelse(bitmap_bh);
+}
+
/**
- * ext4_free_blocks() -- Free given blocks and update quota
+ * ext4_mb_clear_bb() -- helper function for freeing blocks.
+ * Used by ext4_free_blocks()
* @handle: handle for this transaction
* @inode: inode
- * @block: start physical block to free
- * @count: number of blocks to count
+ * @block: starting physical block to be freed
+ * @count: number of blocks to be freed
* @flags: flags used by ext4_free_blocks
*/
-void ext4_free_blocks(handle_t *handle, struct inode *inode,
- struct buffer_head *bh, ext4_fsblk_t block,
- unsigned long count, int flags)
+static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode,
+ ext4_fsblk_t block, unsigned long count,
+ int flags)
{
struct buffer_head *bitmap_bh = NULL;
struct super_block *sb = inode->i_sb;
struct ext4_group_desc *gdp;
+ struct ext4_group_info *grp;
unsigned int overflow;
ext4_grpblk_t bit;
struct buffer_head *gd_bh;
@@ -4746,82 +5440,23 @@
int err = 0;
int ret;
- might_sleep();
- if (bh) {
- if (block)
- BUG_ON(block != bh->b_blocknr);
- else
- block = bh->b_blocknr;
- }
-
sbi = EXT4_SB(sb);
+
if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
!ext4_inode_block_valid(inode, block, count)) {
- ext4_error(sb, "Freeing blocks not in datazone - "
- "block = %llu, count = %lu", block, count);
+ ext4_error(sb, "Freeing blocks in system zone - "
+ "Block = %llu, count = %lu", block, count);
+ /* err = 0. ext4_std_error should be a no op */
goto error_return;
}
-
- ext4_debug("freeing block %llu\n", block);
- trace_ext4_free_blocks(inode, block, count, flags);
-
- if (bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
- BUG_ON(count > 1);
-
- ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
- inode, bh, block);
- }
-
- /*
- * If the extent to be freed does not begin on a cluster
- * boundary, we need to deal with partial clusters at the
- * beginning and end of the extent. Normally we will free
- * blocks at the beginning or the end unless we are explicitly
- * requested to avoid doing so.
- */
- overflow = EXT4_PBLK_COFF(sbi, block);
- if (overflow) {
- if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
- overflow = sbi->s_cluster_ratio - overflow;
- block += overflow;
- if (count > overflow)
- count -= overflow;
- else
- return;
- } else {
- block -= overflow;
- count += overflow;
- }
- }
- overflow = EXT4_LBLK_COFF(sbi, count);
- if (overflow) {
- if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
- if (count > overflow)
- count -= overflow;
- else
- return;
- } else
- count += sbi->s_cluster_ratio - overflow;
- }
-
- if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
- int i;
- int is_metadata = flags & EXT4_FREE_BLOCKS_METADATA;
-
- for (i = 0; i < count; i++) {
- cond_resched();
- if (is_metadata)
- bh = sb_find_get_block(inode->i_sb, block + i);
- ext4_forget(handle, is_metadata, inode, bh, block + i);
- }
- }
+ flags |= EXT4_FREE_BLOCKS_VALIDATED;
do_more:
overflow = 0;
ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
- if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(
- ext4_get_group_info(sb, block_group))))
+ grp = ext4_get_group_info(sb, block_group);
+ if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
return;
/*
@@ -4832,6 +5467,8 @@
overflow = EXT4_C2B(sbi, bit) + count -
EXT4_BLOCKS_PER_GROUP(sb);
count -= overflow;
+ /* The range changed so it's no longer validated */
+ flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
}
count_clusters = EXT4_NUM_B2C(sbi, count);
bitmap_bh = ext4_read_block_bitmap(sb, block_group);
@@ -4846,13 +5483,8 @@
goto error_return;
}
- if (in_range(ext4_block_bitmap(sb, gdp), block, count) ||
- in_range(ext4_inode_bitmap(sb, gdp), block, count) ||
- in_range(block, ext4_inode_table(sb, gdp),
- sbi->s_itb_per_group) ||
- in_range(block + count - 1, ext4_inode_table(sb, gdp),
- sbi->s_itb_per_group)) {
-
+ if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
+ !ext4_inode_block_valid(inode, block, count)) {
ext4_error(sb, "Freeing blocks in system zone - "
"Block = %llu, count = %lu", block, count);
/* err = 0. ext4_std_error should be a no op */
@@ -4918,11 +5550,11 @@
* them with group lock_held
*/
if (test_opt(sb, DISCARD)) {
- err = ext4_issue_discard(sb, block_group, bit, count,
- NULL);
+ err = ext4_issue_discard(sb, block_group, bit,
+ count_clusters, NULL);
if (err && err != -EOPNOTSUPP)
ext4_msg(sb, KERN_WARNING, "discard request in"
- " group:%d block:%d count:%lu failed"
+ " group:%u block:%d count:%lu failed"
" with %d", block_group, bit, count,
err);
} else
@@ -4946,9 +5578,17 @@
flex_group)->free_clusters);
}
- if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
- dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
- percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters);
+ /*
+ * on a bigalloc file system, defer the s_freeclusters_counter
+ * update to the caller (ext4_remove_space and friends) so they
+ * can determine if a cluster freed here should be rereserved
+ */
+ if (!(flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)) {
+ if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
+ dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
+ percpu_counter_add(&sbi->s_freeclusters_counter,
+ count_clusters);
+ }
ext4_mb_unload_buddy(&e4b);
@@ -4966,11 +5606,116 @@
block += count;
count = overflow;
put_bh(bitmap_bh);
+ /* The range changed so it's no longer validated */
+ flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
goto do_more;
}
error_return:
brelse(bitmap_bh);
ext4_std_error(sb, err);
+ return;
+}
+
+/**
+ * ext4_free_blocks() -- Free given blocks and update quota
+ * @handle: handle for this transaction
+ * @inode: inode
+ * @bh: optional buffer of the block to be freed
+ * @block: starting physical block to be freed
+ * @count: number of blocks to be freed
+ * @flags: flags used by ext4_free_blocks
+ */
+void ext4_free_blocks(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, ext4_fsblk_t block,
+ unsigned long count, int flags)
+{
+ struct super_block *sb = inode->i_sb;
+ unsigned int overflow;
+ struct ext4_sb_info *sbi;
+
+ sbi = EXT4_SB(sb);
+
+ if (bh) {
+ if (block)
+ BUG_ON(block != bh->b_blocknr);
+ else
+ block = bh->b_blocknr;
+ }
+
+ if (sbi->s_mount_state & EXT4_FC_REPLAY) {
+ ext4_free_blocks_simple(inode, block, EXT4_NUM_B2C(sbi, count));
+ return;
+ }
+
+ might_sleep();
+
+ if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
+ !ext4_inode_block_valid(inode, block, count)) {
+ ext4_error(sb, "Freeing blocks not in datazone - "
+ "block = %llu, count = %lu", block, count);
+ return;
+ }
+ flags |= EXT4_FREE_BLOCKS_VALIDATED;
+
+ ext4_debug("freeing block %llu\n", block);
+ trace_ext4_free_blocks(inode, block, count, flags);
+
+ if (bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
+ BUG_ON(count > 1);
+
+ ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
+ inode, bh, block);
+ }
+
+ /*
+ * If the extent to be freed does not begin on a cluster
+ * boundary, we need to deal with partial clusters at the
+ * beginning and end of the extent. Normally we will free
+ * blocks at the beginning or the end unless we are explicitly
+ * requested to avoid doing so.
+ */
+ overflow = EXT4_PBLK_COFF(sbi, block);
+ if (overflow) {
+ if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
+ overflow = sbi->s_cluster_ratio - overflow;
+ block += overflow;
+ if (count > overflow)
+ count -= overflow;
+ else
+ return;
+ } else {
+ block -= overflow;
+ count += overflow;
+ }
+ /* The range changed so it's no longer validated */
+ flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
+ }
+ overflow = EXT4_LBLK_COFF(sbi, count);
+ if (overflow) {
+ if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
+ if (count > overflow)
+ count -= overflow;
+ else
+ return;
+ } else
+ count += sbi->s_cluster_ratio - overflow;
+ /* The range changed so it's no longer validated */
+ flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
+ }
+
+ if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
+ int i;
+ int is_metadata = flags & EXT4_FREE_BLOCKS_METADATA;
+
+ for (i = 0; i < count; i++) {
+ cond_resched();
+ if (is_metadata)
+ bh = sb_find_get_block(inode->i_sb, block + i);
+ ext4_forget(handle, is_metadata, inode, bh, block + i);
+ }
+ }
+
+ ext4_mb_clear_bb(handle, inode, block, count, flags);
return;
}
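To make the partial-cluster handling above concrete, here is a stand-alone sketch of the default round-out behaviour. The cluster ratio and block range are made-up numbers, and EXT4_PBLK_COFF/EXT4_LBLK_COFF are modelled with a plain modulo; the NOFREE_FIRST/LAST_CLUSTER flags would instead shrink the range rather than widen it.

/* Illustrative sketch of the cluster rounding in ext4_free_blocks() (assumed values). */
#include <stdio.h>

int main(void)
{
	unsigned long cluster_ratio = 16;      /* blocks per cluster, assumed */
	unsigned long block = 18, count = 30;  /* extent to free, assumed */
	unsigned long off;

	off = block % cluster_ratio;           /* ~EXT4_PBLK_COFF: start not cluster aligned? */
	if (off) {
		block -= off;                  /* move the start down to the cluster boundary */
		count += off;
	}
	off = count % cluster_ratio;           /* ~EXT4_LBLK_COFF: length not a whole cluster? */
	if (off)
		count += cluster_ratio - off;  /* extend the end to the next boundary */

	printf("freeing blocks [%lu, %lu)\n", block, block + count);
	return 0;
}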
@@ -5030,11 +5775,7 @@
goto error_return;
}
- if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
- in_range(ext4_inode_bitmap(sb, desc), block, count) ||
- in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
- in_range(block + count - 1, ext4_inode_table(sb, desc),
- sbi->s_itb_per_group)) {
+ if (!ext4_sb_block_valid(sb, NULL, block, count)) {
ext4_error(sb, "Adding blocks in system zones - "
"Block = %llu, count = %lu",
block, count);
@@ -5119,19 +5860,19 @@
* @sb: super block for the file system
* @start: starting block of the free extent in the alloc. group
* @count: number of blocks to TRIM
- * @group: alloc. group we are working with
* @e4b: ext4 buddy for the group
*
 * Trim "count" blocks starting at "start" in the "group". To assure that no
 * one will allocate those blocks, mark them as used in the buddy bitmap. This
 * must be called under the group lock.
*/
-static int ext4_trim_extent(struct super_block *sb, int start, int count,
- ext4_group_t group, struct ext4_buddy *e4b)
+static int ext4_trim_extent(struct super_block *sb,
+ int start, int count, struct ext4_buddy *e4b)
__releases(bitlock)
__acquires(bitlock)
{
struct ext4_free_extent ex;
+ ext4_group_t group = e4b->bd_group;
int ret = 0;
trace_ext4_trim_extent(sb, group, start, count);
@@ -5152,6 +5893,71 @@
ext4_lock_group(sb, group);
mb_free_blocks(NULL, e4b, start, ex.fe_len);
return ret;
+}
+
+static ext4_grpblk_t ext4_last_grp_cluster(struct super_block *sb,
+ ext4_group_t grp)
+{
+ if (grp < ext4_get_groups_count(sb))
+ return EXT4_CLUSTERS_PER_GROUP(sb) - 1;
+ return (ext4_blocks_count(EXT4_SB(sb)->s_es) -
+ ext4_group_first_block_no(sb, grp) - 1) >>
+ EXT4_CLUSTER_BITS(sb);
+}
+
+static bool ext4_trim_interrupted(void)
+{
+ return fatal_signal_pending(current) || freezing(current);
+}
+
+static int ext4_try_to_trim_range(struct super_block *sb,
+ struct ext4_buddy *e4b, ext4_grpblk_t start,
+ ext4_grpblk_t max, ext4_grpblk_t minblocks)
+{
+ ext4_grpblk_t next, count, free_count;
+ bool set_trimmed = false;
+ void *bitmap;
+
+ bitmap = e4b->bd_bitmap;
+ if (start == 0 && max >= ext4_last_grp_cluster(sb, e4b->bd_group))
+ set_trimmed = true;
+ start = max(e4b->bd_info->bb_first_free, start);
+ count = 0;
+ free_count = 0;
+
+ while (start <= max) {
+ start = mb_find_next_zero_bit(bitmap, max + 1, start);
+ if (start > max)
+ break;
+ next = mb_find_next_bit(bitmap, max + 1, start);
+
+ if ((next - start) >= minblocks) {
+ int ret = ext4_trim_extent(sb, start, next - start, e4b);
+
+ if (ret && ret != -EOPNOTSUPP)
+ return count;
+ count += next - start;
+ }
+ free_count += next - start;
+ start = next + 1;
+
+ if (ext4_trim_interrupted())
+ return count;
+
+ if (need_resched()) {
+ ext4_unlock_group(sb, e4b->bd_group);
+ cond_resched();
+ ext4_lock_group(sb, e4b->bd_group);
+ }
+
+ if ((e4b->bd_info->bb_free - free_count) < minblocks)
+ break;
+ }
+
+ if (set_trimmed)
+ EXT4_MB_GRP_SET_TRIMMED(e4b->bd_info);
+
+ return count;
}
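The scan in ext4_try_to_trim_range() boils down to walking runs of zero bits (via mb_find_next_zero_bit()/mb_find_next_bit()) and only trimming runs of at least minblocks clusters. Below is a minimal user-space sketch of that pattern; the bitmap is a plain byte array and "trimming" is just a printf, so nothing here reflects the actual kernel API.

#include <stdio.h>

/* Return the first index in [start, max) where bm[index] == val, or max. */
static int find_next(const unsigned char *bm, int max, int start, int val)
{
	while (start < max && bm[start] != val)
		start++;
	return start;
}

int main(void)
{
	/* 1 = cluster in use, 0 = free; values are made up for the example. */
	unsigned char used[16] = { 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0 };
	int max = 16, minblocks = 3, start = 0, next;

	while (start < max) {
		start = find_next(used, max, start, 0);  /* next free cluster */
		if (start >= max)
			break;
		next = find_next(used, max, start, 1);   /* end of the free run */
		if (next - start >= minblocks)
			printf("trim clusters [%d, %d)\n", start, next);
		start = next + 1;
	}
	return 0;
}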
/**
@@ -5177,10 +5983,8 @@
ext4_grpblk_t start, ext4_grpblk_t max,
ext4_grpblk_t minblocks)
{
- void *bitmap;
- ext4_grpblk_t next, count = 0, free_count = 0;
struct ext4_buddy e4b;
- int ret = 0;
+ int ret;
trace_ext4_trim_all_free(sb, group, start, max);
@@ -5190,58 +5994,20 @@
ret, group);
return ret;
}
- bitmap = e4b.bd_bitmap;
ext4_lock_group(sb, group);
- if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) &&
- minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks))
- goto out;
- start = (e4b.bd_info->bb_first_free > start) ?
- e4b.bd_info->bb_first_free : start;
+ if (!EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) ||
+ minblocks < EXT4_SB(sb)->s_last_trim_minblks)
+ ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks);
+ else
+ ret = 0;
- while (start <= max) {
- start = mb_find_next_zero_bit(bitmap, max + 1, start);
- if (start > max)
- break;
- next = mb_find_next_bit(bitmap, max + 1, start);
-
- if ((next - start) >= minblocks) {
- ret = ext4_trim_extent(sb, start,
- next - start, group, &e4b);
- if (ret && ret != -EOPNOTSUPP)
- break;
- ret = 0;
- count += next - start;
- }
- free_count += next - start;
- start = next + 1;
-
- if (fatal_signal_pending(current)) {
- count = -ERESTARTSYS;
- break;
- }
-
- if (need_resched()) {
- ext4_unlock_group(sb, group);
- cond_resched();
- ext4_lock_group(sb, group);
- }
-
- if ((e4b.bd_info->bb_free - free_count) < minblocks)
- break;
- }
-
- if (!ret) {
- ret = count;
- EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
- }
-out:
ext4_unlock_group(sb, group);
ext4_mb_unload_buddy(&e4b);
ext4_debug("trimmed %d blocks in the group %d\n",
- count, group);
+ ret, group);
return ret;
}
@@ -5286,7 +6052,7 @@
if (minlen > EXT4_CLUSTERS_PER_GROUP(sb))
goto out;
}
- if (end >= max_blks)
+ if (end >= max_blks - 1)
end = max_blks - 1;
if (end <= first_data_blk)
goto out;
@@ -5303,7 +6069,11 @@
end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
for (group = first_group; group <= last_group; group++) {
+ if (ext4_trim_interrupted())
+ break;
grp = ext4_get_group_info(sb, group);
+ if (!grp)
+ continue;
/* We only do this if the grp has never been initialized */
if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
ret = ext4_mb_init_group(sb, group, GFP_NOFS);
@@ -5319,10 +6089,9 @@
*/
if (group == last_group)
end = last_cluster;
-
if (grp->bb_free >= minlen) {
cnt = ext4_trim_all_free(sb, group, first_cluster,
- end, minlen);
+ end, minlen);
if (cnt < 0) {
ret = cnt;
break;
@@ -5338,7 +6107,7 @@
}
if (!ret)
- atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen);
+ EXT4_SB(sb)->s_last_trim_minblks = minlen;
out:
range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits;
@@ -5367,8 +6136,7 @@
ext4_lock_group(sb, group);
- start = (e4b.bd_info->bb_first_free > start) ?
- e4b.bd_info->bb_first_free : start;
+ start = max(e4b.bd_info->bb_first_free, start);
if (end >= EXT4_CLUSTERS_PER_GROUP(sb))
end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
--
Gitblit v1.6.2