2024-12-19 9370bb92b2d16684ee45cf24e879c93c509162da
kernel/fs/ext4/mballoc.c
....@@ -16,14 +16,8 @@
1616 #include <linux/slab.h>
1717 #include <linux/nospec.h>
1818 #include <linux/backing-dev.h>
19
+#include <linux/freezer.h>
1920 #include <trace/events/ext4.h>
20
-
21
-#ifdef CONFIG_EXT4_DEBUG
22
-ushort ext4_mballoc_debug __read_mostly;
23
-
24
-module_param_named(mballoc_debug, ext4_mballoc_debug, ushort, 0644);
25
-MODULE_PARM_DESC(mballoc_debug, "Debugging level for ext4's mballoc");
26
-#endif
2721
2822 /*
2923 * MUSTDO:
....@@ -131,7 +125,7 @@
131125 * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in
132126 * terms of number of blocks. If we have mounted the file system with -O
133127 * stripe=<value> option the group prealloc request is normalized to the
134
- * the smallest multiple of the stripe value (sbi->s_stripe) which is
128
+ * smallest multiple of the stripe value (sbi->s_stripe) which is
135129 * greater than the default mb_group_prealloc.
136130 *
137131 * The regular allocator (using the buddy cache) supports a few tunables.
....@@ -356,6 +350,36 @@
356350 ext4_group_t group);
357351 static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
358352 ext4_group_t group);
353
+static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac);
354
+
355
+/*
356
+ * The algorithm using this percpu seq counter is as follows:
357
+ * 1. We sample the percpu discard_pa_seq counter before trying for block
358
+ * allocation in ext4_mb_new_blocks().
359
+ * 2. We increment this percpu discard_pa_seq counter when we either allocate
360
+ * or free these blocks i.e. while marking those blocks as used/free in
361
+ * mb_mark_used()/mb_free_blocks().
362
+ * 3. We also increment this percpu seq counter when we successfully identify
363
+ * that the bb_prealloc_list is not empty and hence proceed for discarding
364
+ * of those PAs inside ext4_mb_discard_group_preallocations().
365
+ *
366
+ * Now to make sure that the regular fast path of block allocation is not
367
+ * affected, as a small optimization we only sample the percpu seq counter
368
+ * on that cpu. Only when the block allocation fails and when freed blocks
369
+ * found were 0, that is when we sample percpu seq counter for all cpus using
370
+ * below function ext4_get_discard_pa_seq_sum(). This happens after making
371
+ * sure that all the PAs on grp->bb_prealloc_list got freed or if it's empty.
372
+ */
373
+static DEFINE_PER_CPU(u64, discard_pa_seq);
374
+static inline u64 ext4_get_discard_pa_seq_sum(void)
375
+{
376
+ int __cpu;
377
+ u64 __seq = 0;
378
+
379
+ for_each_possible_cpu(__cpu)
380
+ __seq += per_cpu(discard_pa_seq, __cpu);
381
+ return __seq;
382
+}
359383
360384 static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
361385 {
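The comment above documents a sample/compare protocol, but its consumer (the retry decision taken in ext4_mb_new_blocks() after a failed allocation) is outside this hunk. A minimal sketch of that protocol follows; the helper name and the simplified control flow are assumptions for illustration, not part of the patch:

static bool mb_should_retry_after_discard(struct ext4_allocation_context *ac,
					  u64 *seq)
{
	u64 seq_retry;

	/* step 1 happened in the caller: *seq = this_cpu_read(discard_pa_seq) */
	if (ext4_mb_discard_preallocations(ac->ac_sb, ac->ac_o_ex.fe_len))
		return true;	/* some PAs were discarded, retry the allocation */

	/*
	 * Nothing was freed, but steps 2 and 3 bump discard_pa_seq whenever
	 * blocks are marked used/free or a group's PA list is discarded, so
	 * a changed all-cpu sum means another task raced with us and it is
	 * worth retrying once more.
	 */
	seq_retry = ext4_get_discard_pa_seq_sum();
	if (seq_retry != *seq) {
		*seq = seq_retry;
		return true;
	}
	return false;
}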
....@@ -493,6 +517,8 @@
493517
494518 static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
495519 {
520
+ if (unlikely(e4b->bd_info->bb_bitmap == NULL))
521
+ return;
496522 if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) {
497523 unsigned char *b1, *b2;
498524 int i;
....@@ -511,6 +537,31 @@
511537 }
512538 }
513539
540
+static void mb_group_bb_bitmap_alloc(struct super_block *sb,
541
+ struct ext4_group_info *grp, ext4_group_t group)
542
+{
543
+ struct buffer_head *bh;
544
+
545
+ grp->bb_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS);
546
+ if (!grp->bb_bitmap)
547
+ return;
548
+
549
+ bh = ext4_read_block_bitmap(sb, group);
550
+ if (IS_ERR_OR_NULL(bh)) {
551
+ kfree(grp->bb_bitmap);
552
+ grp->bb_bitmap = NULL;
553
+ return;
554
+ }
555
+
556
+ memcpy(grp->bb_bitmap, bh->b_data, sb->s_blocksize);
557
+ put_bh(bh);
558
+}
559
+
560
+static void mb_group_bb_bitmap_free(struct ext4_group_info *grp)
561
+{
562
+ kfree(grp->bb_bitmap);
563
+}
564
+
514565 #else
515566 static inline void mb_free_blocks_double(struct inode *inode,
516567 struct ext4_buddy *e4b, int first, int count)
....@@ -523,6 +574,17 @@
523574 return;
524575 }
525576 static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
577
+{
578
+ return;
579
+}
580
+
581
+static inline void mb_group_bb_bitmap_alloc(struct super_block *sb,
582
+ struct ext4_group_info *grp, ext4_group_t group)
583
+{
584
+ return;
585
+}
586
+
587
+static inline void mb_group_bb_bitmap_free(struct ext4_group_info *grp)
526588 {
527589 return;
528590 }
....@@ -558,11 +620,8 @@
558620 void *buddy;
559621 void *buddy2;
560622
561
- {
562
- static int mb_check_counter;
563
- if (mb_check_counter++ % 100 != 0)
564
- return 0;
565
- }
623
+ if (e4b->bd_info->bb_check_counter++ % 10)
624
+ return 0;
566625
567626 while (order > 1) {
568627 buddy = mb_find_buddy(e4b, order, &max);
....@@ -626,6 +685,8 @@
626685 MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);
627686
628687 grp = ext4_get_group_info(sb, e4b->bd_group);
688
+ if (!grp)
689
+ return NULL;
629690 list_for_each(cur, &grp->bb_prealloc_list) {
630691 ext4_group_t groupnr;
631692 struct ext4_prealloc_space *pa;
....@@ -709,9 +770,9 @@
709770
710771 static noinline_for_stack
711772 void ext4_mb_generate_buddy(struct super_block *sb,
712
- void *buddy, void *bitmap, ext4_group_t group)
773
+ void *buddy, void *bitmap, ext4_group_t group,
774
+ struct ext4_group_info *grp)
713775 {
714
- struct ext4_group_info *grp = ext4_get_group_info(sb, group);
715776 struct ext4_sb_info *sbi = EXT4_SB(sb);
716777 ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
717778 ext4_grpblk_t i = 0;
....@@ -758,28 +819,8 @@
758819 clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
759820
760821 period = get_cycles() - period;
761
- spin_lock(&sbi->s_bal_lock);
762
- sbi->s_mb_buddies_generated++;
763
- sbi->s_mb_generation_time += period;
764
- spin_unlock(&sbi->s_bal_lock);
765
-}
766
-
767
-static void mb_regenerate_buddy(struct ext4_buddy *e4b)
768
-{
769
- int count;
770
- int order = 1;
771
- void *buddy;
772
-
773
- while ((buddy = mb_find_buddy(e4b, order++, &count))) {
774
- ext4_set_bits(buddy, 0, count);
775
- }
776
- e4b->bd_info->bb_fragments = 0;
777
- memset(e4b->bd_info->bb_counters, 0,
778
- sizeof(*e4b->bd_info->bb_counters) *
779
- (e4b->bd_sb->s_blocksize_bits + 2));
780
-
781
- ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy,
782
- e4b->bd_bitmap, e4b->bd_group);
822
+ atomic_inc(&sbi->s_mb_buddies_generated);
823
+ atomic64_add(period, &sbi->s_mb_generation_time);
783824 }
784825
785826 /* The buddy information is attached the buddy cache inode
....@@ -820,13 +861,13 @@
820861 char *bitmap;
821862 struct ext4_group_info *grinfo;
822863
823
- mb_debug(1, "init page %lu\n", page->index);
824
-
825864 inode = page->mapping->host;
826865 sb = inode->i_sb;
827866 ngroups = ext4_get_groups_count(sb);
828867 blocksize = i_blocksize(inode);
829868 blocks_per_page = PAGE_SIZE / blocksize;
869
+
870
+ mb_debug(sb, "init page %lu\n", page->index);
830871
831872 groups_per_page = blocks_per_page >> 1;
832873 if (groups_per_page == 0)
....@@ -851,6 +892,8 @@
851892 break;
852893
853894 grinfo = ext4_get_group_info(sb, group);
895
+ if (!grinfo)
896
+ continue;
854897 /*
855898 * If page is uptodate then we came here after online resize
856899 * which added some new uninitialized group info structs, so
....@@ -861,13 +904,13 @@
861904 bh[i] = NULL;
862905 continue;
863906 }
864
- bh[i] = ext4_read_block_bitmap_nowait(sb, group);
907
+ bh[i] = ext4_read_block_bitmap_nowait(sb, group, false);
865908 if (IS_ERR(bh[i])) {
866909 err = PTR_ERR(bh[i]);
867910 bh[i] = NULL;
868911 goto out;
869912 }
870
- mb_debug(1, "read bitmap for group %u\n", group);
913
+ mb_debug(sb, "read bitmap for group %u\n", group);
871914 }
872915
873916 /* wait for I/O completion */
....@@ -912,10 +955,14 @@
912955 if ((first_block + i) & 1) {
913956 /* this is block of buddy */
914957 BUG_ON(incore == NULL);
915
- mb_debug(1, "put buddy for group %u in page %lu/%x\n",
958
+ mb_debug(sb, "put buddy for group %u in page %lu/%x\n",
916959 group, page->index, i * blocksize);
917960 trace_ext4_mb_buddy_bitmap_load(sb, group);
918961 grinfo = ext4_get_group_info(sb, group);
962
+ if (!grinfo) {
963
+ err = -EFSCORRUPTED;
964
+ goto out;
965
+ }
919966 grinfo->bb_fragments = 0;
920967 memset(grinfo->bb_counters, 0,
921968 sizeof(*grinfo->bb_counters) *
....@@ -926,13 +973,13 @@
926973 ext4_lock_group(sb, group);
927974 /* init the buddy */
928975 memset(data, 0xff, blocksize);
929
- ext4_mb_generate_buddy(sb, data, incore, group);
976
+ ext4_mb_generate_buddy(sb, data, incore, group, grinfo);
930977 ext4_unlock_group(sb, group);
931978 incore = NULL;
932979 } else {
933980 /* this is block of bitmap */
934981 BUG_ON(incore != NULL);
935
- mb_debug(1, "put bitmap for group %u in page %lu/%x\n",
982
+ mb_debug(sb, "put bitmap for group %u in page %lu/%x\n",
936983 group, page->index, i * blocksize);
937984 trace_ext4_mb_bitmap_load(sb, group);
938985
....@@ -1038,8 +1085,11 @@
10381085 int ret = 0;
10391086
10401087 might_sleep();
1041
- mb_debug(1, "init group %u\n", group);
1088
+ mb_debug(sb, "init group %u\n", group);
10421089 this_grp = ext4_get_group_info(sb, group);
1090
+ if (!this_grp)
1091
+ return -EFSCORRUPTED;
1092
+
10431093 /*
10441094 * This ensures that we don't reinit the buddy cache
10451095 * page which map to the group from which we are already
....@@ -1110,10 +1160,12 @@
11101160 struct inode *inode = sbi->s_buddy_cache;
11111161
11121162 might_sleep();
1113
- mb_debug(1, "load group %u\n", group);
1163
+ mb_debug(sb, "load group %u\n", group);
11141164
11151165 blocks_per_page = PAGE_SIZE / sb->s_blocksize;
11161166 grp = ext4_get_group_info(sb, group);
1167
+ if (!grp)
1168
+ return -EFSCORRUPTED;
11171169
11181170 e4b->bd_blkbits = sb->s_blocksize_bits;
11191171 e4b->bd_info = grp;
....@@ -1217,9 +1269,6 @@
12171269 /* Pages marked accessed already */
12181270 e4b->bd_buddy_page = page;
12191271 e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
1220
-
1221
- BUG_ON(e4b->bd_bitmap_page == NULL);
1222
- BUG_ON(e4b->bd_buddy_page == NULL);
12231272
12241273 return 0;
12251274
....@@ -1336,9 +1385,6 @@
13361385 }
13371386 }
13381387
1339
-/*
1340
- * _________________________________________________________________ */
1341
-
13421388 static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side)
13431389 {
13441390 if (mb_test_bit(*bit + side, bitmap)) {
....@@ -1430,6 +1476,7 @@
14301476 mb_check_buddy(e4b);
14311477 mb_free_blocks_double(inode, e4b, first, count);
14321478
1479
+ this_cpu_inc(discard_pa_seq);
14331480 e4b->bd_info->bb_free += count;
14341481 if (first < e4b->bd_info->bb_first_free)
14351482 e4b->bd_info->bb_first_free = first;
....@@ -1449,15 +1496,16 @@
14491496
14501497 blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
14511498 blocknr += EXT4_C2B(sbi, block);
1452
- ext4_grp_locked_error(sb, e4b->bd_group,
1453
- inode ? inode->i_ino : 0,
1454
- blocknr,
1455
- "freeing already freed block "
1456
- "(bit %u); block bitmap corrupt.",
1457
- block);
1458
- ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
1499
+ if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
1500
+ ext4_grp_locked_error(sb, e4b->bd_group,
1501
+ inode ? inode->i_ino : 0,
1502
+ blocknr,
1503
+ "freeing already freed block (bit %u); block bitmap corrupt.",
1504
+ block);
1505
+ ext4_mark_group_bitmap_corrupted(
1506
+ sb, e4b->bd_group,
14591507 EXT4_GROUP_INFO_BBITMAP_CORRUPT);
1460
- mb_regenerate_buddy(e4b);
1508
+ }
14611509 goto done;
14621510 }
14631511
....@@ -1572,6 +1620,7 @@
15721620 mb_check_buddy(e4b);
15731621 mb_mark_used_double(e4b, start, len);
15741622
1623
+ this_cpu_inc(discard_pa_seq);
15751624 e4b->bd_info->bb_free -= len;
15761625 if (e4b->bd_info->bb_first_free == start)
15771626 e4b->bd_info->bb_first_free += len;
....@@ -1671,11 +1720,15 @@
16711720 sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
16721721 spin_unlock(&sbi->s_md_lock);
16731722 }
1674
-}
1723
+ /*
1724
+ * As we've just preallocated more space than
1725
+ * user requested originally, we store allocated
1726
+ * space in a special descriptor.
1727
+ */
1728
+ if (ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
1729
+ ext4_mb_new_preallocation(ac);
16751730
1676
-/*
1677
- * regular allocator, for general purposes allocation
1678
- */
1731
+}
16791732
16801733 static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
16811734 struct ext4_buddy *e4b,
....@@ -1825,7 +1878,9 @@
18251878 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
18261879 struct ext4_free_extent ex;
18271880
1828
- if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
1881
+ if (!grp)
1882
+ return -EFSCORRUPTED;
1883
+ if (!(ac->ac_flags & (EXT4_MB_HINT_TRY_GOAL | EXT4_MB_HINT_GOAL_ONLY)))
18291884 return 0;
18301885 if (grp->bb_free == 0)
18311886 return 0;
....@@ -1919,7 +1974,7 @@
19191974
19201975 ext4_mb_use_best_found(ac, e4b);
19211976
1922
- BUG_ON(ac->ac_b_ex.fe_len != ac->ac_g_ex.fe_len);
1977
+ BUG_ON(ac->ac_f_ex.fe_len != ac->ac_g_ex.fe_len);
19231978
19241979 if (EXT4_SB(sb)->s_mb_stats)
19251980 atomic_inc(&EXT4_SB(sb)->s_bal_2orders);
....@@ -1956,7 +2011,7 @@
19562011 /*
19572012 * IF we have corrupt bitmap, we won't find any
19582013 * free blocks even though group info says we
1959
- * we have free blocks
2014
+ * have free blocks
19602015 */
19612016 ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
19622017 "%d free clusters as per "
....@@ -2036,39 +2091,29 @@
20362091 }
20372092
20382093 /*
2039
- * This is now called BEFORE we load the buddy bitmap.
2094
+ * This is also called BEFORE we load the buddy bitmap.
20402095 * Returns either 1 or 0 indicating that the group is either suitable
2041
- * for the allocation or not. In addition it can also return negative
2042
- * error code when something goes wrong.
2096
+ * for the allocation or not.
20432097 */
2044
-static int ext4_mb_good_group(struct ext4_allocation_context *ac,
2098
+static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
20452099 ext4_group_t group, int cr)
20462100 {
2047
- unsigned free, fragments;
2101
+ ext4_grpblk_t free, fragments;
20482102 int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
20492103 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
20502104
20512105 BUG_ON(cr < 0 || cr >= 4);
20522106
2107
+ if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
2108
+ return false;
2109
+
20532110 free = grp->bb_free;
20542111 if (free == 0)
2055
- return 0;
2056
- if (cr <= 2 && free < ac->ac_g_ex.fe_len)
2057
- return 0;
2058
-
2059
- if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
2060
- return 0;
2061
-
2062
- /* We only do this if the grp has never been initialized */
2063
- if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
2064
- int ret = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS);
2065
- if (ret)
2066
- return ret;
2067
- }
2112
+ return false;
20682113
20692114 fragments = grp->bb_fragments;
20702115 if (fragments == 0)
2071
- return 0;
2116
+ return false;
20722117
20732118 switch (cr) {
20742119 case 0:
....@@ -2078,42 +2123,189 @@
20782123 if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
20792124 (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
20802125 ((group % flex_size) == 0))
2081
- return 0;
2126
+ return false;
20822127
2083
- if ((ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) ||
2084
- (free / fragments) >= ac->ac_g_ex.fe_len)
2085
- return 1;
2128
+ if (free < ac->ac_g_ex.fe_len)
2129
+ return false;
2130
+
2131
+ if (ac->ac_2order > ac->ac_sb->s_blocksize_bits+1)
2132
+ return true;
20862133
20872134 if (grp->bb_largest_free_order < ac->ac_2order)
2088
- return 0;
2135
+ return false;
20892136
2090
- return 1;
2137
+ return true;
20912138 case 1:
20922139 if ((free / fragments) >= ac->ac_g_ex.fe_len)
2093
- return 1;
2140
+ return true;
20942141 break;
20952142 case 2:
20962143 if (free >= ac->ac_g_ex.fe_len)
2097
- return 1;
2144
+ return true;
20982145 break;
20992146 case 3:
2100
- return 1;
2147
+ return true;
21012148 default:
21022149 BUG();
21032150 }
21042151
2105
- return 0;
2152
+ return false;
2153
+}
2154
+
2155
+/*
2156
+ * This could return negative error code if something goes wrong
2157
+ * during ext4_mb_init_group(). This should not be called with
2158
+ * ext4_lock_group() held.
2159
+ */
2160
+static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
2161
+ ext4_group_t group, int cr)
2162
+{
2163
+ struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
2164
+ struct super_block *sb = ac->ac_sb;
2165
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
2166
+ bool should_lock = ac->ac_flags & EXT4_MB_STRICT_CHECK;
2167
+ ext4_grpblk_t free;
2168
+ int ret = 0;
2169
+
2170
+ if (!grp)
2171
+ return -EFSCORRUPTED;
2172
+ if (sbi->s_mb_stats)
2173
+ atomic64_inc(&sbi->s_bal_cX_groups_considered[ac->ac_criteria]);
2174
+ if (should_lock)
2175
+ ext4_lock_group(sb, group);
2176
+ free = grp->bb_free;
2177
+ if (free == 0)
2178
+ goto out;
2179
+ if (cr <= 2 && free < ac->ac_g_ex.fe_len)
2180
+ goto out;
2181
+ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
2182
+ goto out;
2183
+ if (should_lock)
2184
+ ext4_unlock_group(sb, group);
2185
+
2186
+ /* We only do this if the grp has never been initialized */
2187
+ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
2188
+ struct ext4_group_desc *gdp =
2189
+ ext4_get_group_desc(sb, group, NULL);
2190
+ int ret;
2191
+
2192
+ /* cr=0/1 is a very optimistic search to find large
2193
+ * good chunks almost for free. If buddy data is not
2194
+ * ready, then this optimization makes no sense. But
2195
+ * we never skip the first block group in a flex_bg,
2196
+ * since this gets used for metadata block allocation,
2197
+ * and we want to make sure we locate metadata blocks
2198
+ * in the first block group in the flex_bg if possible.
2199
+ */
2200
+ if (cr < 2 &&
2201
+ (!sbi->s_log_groups_per_flex ||
2202
+ ((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) &&
2203
+ !(ext4_has_group_desc_csum(sb) &&
2204
+ (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))))
2205
+ return 0;
2206
+ ret = ext4_mb_init_group(sb, group, GFP_NOFS);
2207
+ if (ret)
2208
+ return ret;
2209
+ }
2210
+
2211
+ if (should_lock)
2212
+ ext4_lock_group(sb, group);
2213
+ ret = ext4_mb_good_group(ac, group, cr);
2214
+out:
2215
+ if (should_lock)
2216
+ ext4_unlock_group(sb, group);
2217
+ return ret;
2218
+}
2219
+
2220
+/*
2221
+ * Start prefetching @nr block bitmaps starting at @group.
2222
+ * Return the next group which needs to be prefetched.
2223
+ */
2224
+ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group,
2225
+ unsigned int nr, int *cnt)
2226
+{
2227
+ ext4_group_t ngroups = ext4_get_groups_count(sb);
2228
+ struct buffer_head *bh;
2229
+ struct blk_plug plug;
2230
+
2231
+ blk_start_plug(&plug);
2232
+ while (nr-- > 0) {
2233
+ struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group,
2234
+ NULL);
2235
+ struct ext4_group_info *grp = ext4_get_group_info(sb, group);
2236
+
2237
+ /*
2238
+ * Prefetch block groups with free blocks; but don't
2239
+ * bother if it is marked uninitialized on disk, since
2240
+ * it won't require I/O to read. Also only try to
2241
+ * prefetch once, so we avoid getblk() call, which can
2242
+ * be expensive.
2243
+ */
2244
+ if (gdp && grp && !EXT4_MB_GRP_TEST_AND_SET_READ(grp) &&
2245
+ EXT4_MB_GRP_NEED_INIT(grp) &&
2246
+ ext4_free_group_clusters(sb, gdp) > 0 &&
2247
+ !(ext4_has_group_desc_csum(sb) &&
2248
+ (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) {
2249
+ bh = ext4_read_block_bitmap_nowait(sb, group, true);
2250
+ if (bh && !IS_ERR(bh)) {
2251
+ if (!buffer_uptodate(bh) && cnt)
2252
+ (*cnt)++;
2253
+ brelse(bh);
2254
+ }
2255
+ }
2256
+ if (++group >= ngroups)
2257
+ group = 0;
2258
+ }
2259
+ blk_finish_plug(&plug);
2260
+ return group;
2261
+}
2262
+
2263
+/*
2264
+ * Prefetching reads the block bitmap into the buffer cache; but we
2265
+ * need to make sure that the buddy bitmap in the page cache has been
2266
+ * initialized. Note that ext4_mb_init_group() will block if the I/O
2267
+ * is not yet completed, or indeed if it was not initiated by
2268
+ * ext4_mb_prefetch at all.
2269
+ *
2270
+ * TODO: We should actually kick off the buddy bitmap setup in a work
2271
+ * queue when the buffer I/O is completed, so that we don't block
2272
+ * waiting for the block allocation bitmap read to finish when
2273
+ * ext4_mb_prefetch_fini is called from ext4_mb_regular_allocator().
2274
+ */
2275
+void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group,
2276
+ unsigned int nr)
2277
+{
2278
+ while (nr-- > 0) {
2279
+ struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group,
2280
+ NULL);
2281
+ struct ext4_group_info *grp = ext4_get_group_info(sb, group);
2282
+
2283
+ if (!group)
2284
+ group = ext4_get_groups_count(sb);
2285
+ group--;
2286
+ grp = ext4_get_group_info(sb, group);
2287
+
2288
+ if (grp && gdp && EXT4_MB_GRP_NEED_INIT(grp) &&
2289
+ ext4_free_group_clusters(sb, gdp) > 0 &&
2290
+ !(ext4_has_group_desc_csum(sb) &&
2291
+ (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) {
2292
+ if (ext4_mb_init_group(sb, group, GFP_NOFS))
2293
+ break;
2294
+ }
2295
+ }
21062296 }
21072297
21082298 static noinline_for_stack int
21092299 ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
21102300 {
2111
- ext4_group_t ngroups, group, i;
2112
- int cr;
2301
+ ext4_group_t prefetch_grp = 0, ngroups, group, i;
2302
+ int cr = -1;
21132303 int err = 0, first_err = 0;
2304
+ unsigned int nr = 0, prefetch_ios = 0;
21142305 struct ext4_sb_info *sbi;
21152306 struct super_block *sb;
21162307 struct ext4_buddy e4b;
2308
+ int lost;
21172309
21182310 sb = ac->ac_sb;
21192311 sbi = EXT4_SB(sb);
....@@ -2133,8 +2325,8 @@
21332325 goto out;
21342326
21352327 /*
2136
- * ac->ac2_order is set only if the fe_len is a power of 2
2137
- * if ac2_order is set we also set criteria to 0 so that we
2328
+ * ac->ac_2order is set only if the fe_len is a power of 2
2329
+ * if ac->ac_2order is set we also set criteria to 0 so that we
21382330 * try exact allocation using buddy.
21392331 */
21402332 i = fls(ac->ac_g_ex.fe_len);
....@@ -2178,6 +2370,7 @@
21782370 * from the goal value specified
21792371 */
21802372 group = ac->ac_g_ex.fe_group;
2373
+ prefetch_grp = group;
21812374
21822375 for (i = 0; i < ngroups; group++, i++) {
21832376 int ret = 0;
....@@ -2189,8 +2382,31 @@
21892382 if (group >= ngroups)
21902383 group = 0;
21912384
2385
+ /*
2386
+ * Batch reads of the block allocation bitmaps
2387
+ * to get multiple READs in flight; limit
2388
+ * prefetching at cr=0/1, otherwise mballoc can
2389
+ * spend a lot of time loading imperfect groups
2390
+ */
2391
+ if ((prefetch_grp == group) &&
2392
+ (cr > 1 ||
2393
+ prefetch_ios < sbi->s_mb_prefetch_limit)) {
2394
+ unsigned int curr_ios = prefetch_ios;
2395
+
2396
+ nr = sbi->s_mb_prefetch;
2397
+ if (ext4_has_feature_flex_bg(sb)) {
2398
+ nr = 1 << sbi->s_log_groups_per_flex;
2399
+ nr -= group & (nr - 1);
2400
+ nr = min(nr, sbi->s_mb_prefetch);
2401
+ }
2402
+ prefetch_grp = ext4_mb_prefetch(sb, group,
2403
+ nr, &prefetch_ios);
2404
+ if (prefetch_ios == curr_ios)
2405
+ nr = 0;
2406
+ }
2407
+
21922408 /* This now checks without needing the buddy page */
2193
- ret = ext4_mb_good_group(ac, group, cr);
2409
+ ret = ext4_mb_good_group_nolock(ac, group, cr);
21942410 if (ret <= 0) {
21952411 if (!first_err)
21962412 first_err = ret;
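A worked example of the prefetch batching above, assuming s_log_groups_per_flex = 4 (16 groups per flex_bg): for group 35, nr starts at 16, then nr -= 35 & 15 leaves 13, and min(13, s_mb_prefetch) is submitted, so the batch reaches exactly the end of the current flex_bg and never spills into the next one.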
....@@ -2208,11 +2424,9 @@
22082424 * block group
22092425 */
22102426 ret = ext4_mb_good_group(ac, group, cr);
2211
- if (ret <= 0) {
2427
+ if (ret == 0) {
22122428 ext4_unlock_group(sb, group);
22132429 ext4_mb_unload_buddy(&e4b);
2214
- if (!first_err)
2215
- first_err = ret;
22162430 continue;
22172431 }
22182432
....@@ -2231,6 +2445,9 @@
22312445 if (ac->ac_status != AC_STATUS_CONTINUE)
22322446 break;
22332447 }
2448
+ /* Processed all groups and haven't found blocks */
2449
+ if (sbi->s_mb_stats && i == ngroups)
2450
+ atomic64_inc(&sbi->s_bal_cX_failed[cr]);
22342451 }
22352452
22362453 if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
....@@ -2239,28 +2456,41 @@
22392456 * We've been searching too long. Let's try to allocate
22402457 * the best chunk we've found so far
22412458 */
2242
-
22432459 ext4_mb_try_best_found(ac, &e4b);
22442460 if (ac->ac_status != AC_STATUS_FOUND) {
22452461 /*
22462462 * Someone more lucky has already allocated it.
22472463 * The only thing we can do is just take first
22482464 * found block(s)
2249
- printk(KERN_DEBUG "EXT4-fs: someone won our chunk\n");
22502465 */
2466
+ lost = atomic_inc_return(&sbi->s_mb_lost_chunks);
2467
+ mb_debug(sb, "lost chunk, group: %u, start: %d, len: %d, lost: %d\n",
2468
+ ac->ac_b_ex.fe_group, ac->ac_b_ex.fe_start,
2469
+ ac->ac_b_ex.fe_len, lost);
2470
+
22512471 ac->ac_b_ex.fe_group = 0;
22522472 ac->ac_b_ex.fe_start = 0;
22532473 ac->ac_b_ex.fe_len = 0;
22542474 ac->ac_status = AC_STATUS_CONTINUE;
22552475 ac->ac_flags |= EXT4_MB_HINT_FIRST;
22562476 cr = 3;
2257
- atomic_inc(&sbi->s_mb_lost_chunks);
22582477 goto repeat;
22592478 }
22602479 }
2480
+
2481
+ if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND)
2482
+ atomic64_inc(&sbi->s_bal_cX_hits[ac->ac_criteria]);
22612483 out:
22622484 if (!err && ac->ac_status != AC_STATUS_FOUND && first_err)
22632485 err = first_err;
2486
+
2487
+ mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n",
2488
+ ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status,
2489
+ ac->ac_flags, cr, err);
2490
+
2491
+ if (nr)
2492
+ ext4_mb_prefetch_fini(sb, prefetch_grp, nr);
2493
+
22642494 return err;
22652495 }
22662496
....@@ -2313,6 +2543,8 @@
23132543 sizeof(struct ext4_group_info);
23142544
23152545 grinfo = ext4_get_group_info(sb, group);
2546
+ if (!grinfo)
2547
+ return 0;
23162548 /* Load the group info in memory only if not already loaded. */
23172549 if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) {
23182550 err = ext4_mb_load_buddy(sb, group, &e4b);
....@@ -2323,7 +2555,7 @@
23232555 buddy_loaded = 1;
23242556 }
23252557
2326
- memcpy(&sg, ext4_get_group_info(sb, group), i);
2558
+ memcpy(&sg, grinfo, i);
23272559
23282560 if (buddy_loaded)
23292561 ext4_mb_unload_buddy(&e4b);
....@@ -2333,7 +2565,7 @@
23332565 for (i = 0; i <= 13; i++)
23342566 seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ?
23352567 sg.info.bb_counters[i] : 0);
2336
- seq_printf(seq, " ]\n");
2568
+ seq_puts(seq, " ]\n");
23372569
23382570 return 0;
23392571 }
....@@ -2348,6 +2580,67 @@
23482580 .stop = ext4_mb_seq_groups_stop,
23492581 .show = ext4_mb_seq_groups_show,
23502582 };
2583
+
2584
+int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset)
2585
+{
2586
+ struct super_block *sb = (struct super_block *)seq->private;
2587
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
2588
+
2589
+ seq_puts(seq, "mballoc:\n");
2590
+ if (!sbi->s_mb_stats) {
2591
+ seq_puts(seq, "\tmb stats collection turned off.\n");
2592
+ seq_puts(seq, "\tTo enable, please write \"1\" to sysfs file mb_stats.\n");
2593
+ return 0;
2594
+ }
2595
+ seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs));
2596
+ seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success));
2597
+
2598
+ seq_printf(seq, "\tgroups_scanned: %u\n", atomic_read(&sbi->s_bal_groups_scanned));
2599
+
2600
+ seq_puts(seq, "\tcr0_stats:\n");
2601
+ seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[0]));
2602
+ seq_printf(seq, "\t\tgroups_considered: %llu\n",
2603
+ atomic64_read(&sbi->s_bal_cX_groups_considered[0]));
2604
+ seq_printf(seq, "\t\tuseless_loops: %llu\n",
2605
+ atomic64_read(&sbi->s_bal_cX_failed[0]));
2606
+
2607
+ seq_puts(seq, "\tcr1_stats:\n");
2608
+ seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[1]));
2609
+ seq_printf(seq, "\t\tgroups_considered: %llu\n",
2610
+ atomic64_read(&sbi->s_bal_cX_groups_considered[1]));
2611
+ seq_printf(seq, "\t\tuseless_loops: %llu\n",
2612
+ atomic64_read(&sbi->s_bal_cX_failed[1]));
2613
+
2614
+ seq_puts(seq, "\tcr2_stats:\n");
2615
+ seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[2]));
2616
+ seq_printf(seq, "\t\tgroups_considered: %llu\n",
2617
+ atomic64_read(&sbi->s_bal_cX_groups_considered[2]));
2618
+ seq_printf(seq, "\t\tuseless_loops: %llu\n",
2619
+ atomic64_read(&sbi->s_bal_cX_failed[2]));
2620
+
2621
+ seq_puts(seq, "\tcr3_stats:\n");
2622
+ seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[3]));
2623
+ seq_printf(seq, "\t\tgroups_considered: %llu\n",
2624
+ atomic64_read(&sbi->s_bal_cX_groups_considered[3]));
2625
+ seq_printf(seq, "\t\tuseless_loops: %llu\n",
2626
+ atomic64_read(&sbi->s_bal_cX_failed[3]));
2627
+ seq_printf(seq, "\textents_scanned: %u\n", atomic_read(&sbi->s_bal_ex_scanned));
2628
+ seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals));
2629
+ seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders));
2630
+ seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks));
2631
+ seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks));
2632
+
2633
+ seq_printf(seq, "\tbuddies_generated: %u/%u\n",
2634
+ atomic_read(&sbi->s_mb_buddies_generated),
2635
+ ext4_get_groups_count(sb));
2636
+ seq_printf(seq, "\tbuddies_time_used: %llu\n",
2637
+ atomic64_read(&sbi->s_mb_generation_time));
2638
+ seq_printf(seq, "\tpreallocated: %u\n",
2639
+ atomic_read(&sbi->s_mb_preallocated));
2640
+ seq_printf(seq, "\tdiscarded: %u\n",
2641
+ atomic_read(&sbi->s_mb_discarded));
2642
+ return 0;
2643
+}
23512644
23522645 static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
23532646 {
....@@ -2453,20 +2746,7 @@
24532746 meta_group_info[i]->bb_free_root = RB_ROOT;
24542747 meta_group_info[i]->bb_largest_free_order = -1; /* uninit */
24552748
2456
-#ifdef DOUBLE_CHECK
2457
- {
2458
- struct buffer_head *bh;
2459
- meta_group_info[i]->bb_bitmap =
2460
- kmalloc(sb->s_blocksize, GFP_NOFS);
2461
- BUG_ON(meta_group_info[i]->bb_bitmap == NULL);
2462
- bh = ext4_read_block_bitmap(sb, group);
2463
- BUG_ON(IS_ERR_OR_NULL(bh));
2464
- memcpy(meta_group_info[i]->bb_bitmap, bh->b_data,
2465
- sb->s_blocksize);
2466
- put_bh(bh);
2467
- }
2468
-#endif
2469
-
2749
+ mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group);
24702750 return 0;
24712751
24722752 exit_group_info:
....@@ -2510,6 +2790,7 @@
25102790 sbi->s_buddy_cache->i_ino = EXT4_BAD_INO;
25112791 EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
25122792 for (i = 0; i < ngroups; i++) {
2793
+ cond_resched();
25132794 desc = ext4_get_group_desc(sb, i, NULL);
25142795 if (desc == NULL) {
25152796 ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i);
....@@ -2519,12 +2800,44 @@
25192800 goto err_freebuddy;
25202801 }
25212802
2803
+ if (ext4_has_feature_flex_bg(sb)) {
2804
+ /* a single flex group is supposed to be read by a single IO.
2805
+ * 1 << s_log_groups_per_flex must not overflow s_mb_prefetch, an
2806
+ * unsigned int, so the shift has to stay below 32.
2807
+ */
2808
+ if (sbi->s_es->s_log_groups_per_flex >= 32) {
2809
+ ext4_msg(sb, KERN_ERR, "too many log groups per flexible block group");
2810
+ goto err_freebuddy;
2811
+ }
2812
+ sbi->s_mb_prefetch = min_t(uint, 1 << sbi->s_es->s_log_groups_per_flex,
2813
+ BLK_MAX_SEGMENT_SIZE >> (sb->s_blocksize_bits - 9));
2814
+ sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */
2815
+ } else {
2816
+ sbi->s_mb_prefetch = 32;
2817
+ }
2818
+ if (sbi->s_mb_prefetch > ext4_get_groups_count(sb))
2819
+ sbi->s_mb_prefetch = ext4_get_groups_count(sb);
2820
+ /* how many real IOs to prefetch within a single allocation at cr=0;
2821
+ * given cr=0 is a CPU-related optimization we shouldn't try to
2822
+ * load too many groups, at some point we should start to use what
2823
+ * we've got in memory.
2824
+ * with an average random access time 5ms, it'd take a second to get
2825
+ * 200 groups (* N with flex_bg), so let's make this limit 4
2826
+ */
2827
+ sbi->s_mb_prefetch_limit = sbi->s_mb_prefetch * 4;
2828
+ if (sbi->s_mb_prefetch_limit > ext4_get_groups_count(sb))
2829
+ sbi->s_mb_prefetch_limit = ext4_get_groups_count(sb);
2830
+
25222831 return 0;
25232832
25242833 err_freebuddy:
25252834 cachep = get_groupinfo_cache(sb->s_blocksize_bits);
2526
- while (i-- > 0)
2527
- kmem_cache_free(cachep, ext4_get_group_info(sb, i));
2835
+ while (i-- > 0) {
2836
+ struct ext4_group_info *grp = ext4_get_group_info(sb, i);
2837
+
2838
+ if (grp)
2839
+ kmem_cache_free(cachep, grp);
2840
+ }
25282841 i = sbi->s_group_info_size;
25292842 rcu_read_lock();
25302843 group_info = rcu_dereference(sbi->s_group_info);
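A worked example of the sizing just above, assuming 4 KiB blocks, s_log_groups_per_flex = 4 and the block layer's default BLK_MAX_SEGMENT_SIZE of 64 KiB: one flex_bg holds 1 << 4 = 16 groups and one segment can carry 65536 >> (12 - 9) = 8192 bitmap blocks, so s_mb_prefetch = min(16, 8192) * 8 = 128 groups and s_mb_prefetch_limit = 128 * 4 = 512, both of which are then clamped to the filesystem's total group count.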
....@@ -2633,7 +2946,6 @@
26332946 } while (i <= sb->s_blocksize_bits + 1);
26342947
26352948 spin_lock_init(&sbi->s_md_lock);
2636
- spin_lock_init(&sbi->s_bal_lock);
26372949 sbi->s_mb_free_pending = 0;
26382950 INIT_LIST_HEAD(&sbi->s_freed_data_list);
26392951
....@@ -2642,6 +2954,7 @@
26422954 sbi->s_mb_stats = MB_DEFAULT_STATS;
26432955 sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
26442956 sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
2957
+ sbi->s_mb_max_inode_prealloc = MB_DEFAULT_MAX_INODE_PREALLOC;
26452958 /*
26462959 * The default group preallocation is 512, which for 4k block
26472960 * sizes translates to 2 megabytes. However for bigalloc file
....@@ -2702,7 +3015,7 @@
27023015 }
27033016
27043017 /* need to called with the ext4 group lock held */
2705
-static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
3018
+static int ext4_mb_cleanup_pa(struct ext4_group_info *grp)
27063019 {
27073020 struct ext4_prealloc_space *pa;
27083021 struct list_head *cur, *tmp;
....@@ -2714,9 +3027,7 @@
27143027 count++;
27153028 kmem_cache_free(ext4_pspace_cachep, pa);
27163029 }
2717
- if (count)
2718
- mb_debug(1, "mballoc: %u PAs left\n", count);
2719
-
3030
+ return count;
27203031 }
27213032
27223033 int ext4_mb_release(struct super_block *sb)
....@@ -2727,15 +3038,20 @@
27273038 struct ext4_group_info *grinfo, ***group_info;
27283039 struct ext4_sb_info *sbi = EXT4_SB(sb);
27293040 struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
3041
+ int count;
27303042
27313043 if (sbi->s_group_info) {
27323044 for (i = 0; i < ngroups; i++) {
3045
+ cond_resched();
27333046 grinfo = ext4_get_group_info(sb, i);
2734
-#ifdef DOUBLE_CHECK
2735
- kfree(grinfo->bb_bitmap);
2736
-#endif
3047
+ if (!grinfo)
3048
+ continue;
3049
+ mb_group_bb_bitmap_free(grinfo);
27373050 ext4_lock_group(sb, i);
2738
- ext4_mb_cleanup_pa(grinfo);
3051
+ count = ext4_mb_cleanup_pa(grinfo);
3052
+ if (count)
3053
+ mb_debug(sb, "mballoc: %d PAs left\n",
3054
+ count);
27393055 ext4_unlock_group(sb, i);
27403056 kmem_cache_free(cachep, grinfo);
27413057 }
....@@ -2759,17 +3075,18 @@
27593075 atomic_read(&sbi->s_bal_reqs),
27603076 atomic_read(&sbi->s_bal_success));
27613077 ext4_msg(sb, KERN_INFO,
2762
- "mballoc: %u extents scanned, %u goal hits, "
3078
+ "mballoc: %u extents scanned, %u groups scanned, %u goal hits, "
27633079 "%u 2^N hits, %u breaks, %u lost",
27643080 atomic_read(&sbi->s_bal_ex_scanned),
3081
+ atomic_read(&sbi->s_bal_groups_scanned),
27653082 atomic_read(&sbi->s_bal_goals),
27663083 atomic_read(&sbi->s_bal_2orders),
27673084 atomic_read(&sbi->s_bal_breaks),
27683085 atomic_read(&sbi->s_mb_lost_chunks));
27693086 ext4_msg(sb, KERN_INFO,
2770
- "mballoc: %lu generated and it took %Lu",
2771
- sbi->s_mb_buddies_generated,
2772
- sbi->s_mb_generation_time);
3087
+ "mballoc: %u generated and it took %llu",
3088
+ atomic_read(&sbi->s_mb_buddies_generated),
3089
+ atomic64_read(&sbi->s_mb_generation_time));
27733090 ext4_msg(sb, KERN_INFO,
27743091 "mballoc: %u preallocated, %u discarded",
27753092 atomic_read(&sbi->s_mb_preallocated),
....@@ -2808,7 +3125,7 @@
28083125 struct ext4_group_info *db;
28093126 int err, count = 0, count2 = 0;
28103127
2811
- mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
3128
+ mb_debug(sb, "gonna free %u blocks in group %u (0x%p):",
28123129 entry->efd_count, entry->efd_group, entry);
28133130
28143131 err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
....@@ -2848,7 +3165,8 @@
28483165 kmem_cache_free(ext4_free_data_cachep, entry);
28493166 ext4_mb_unload_buddy(&e4b);
28503167
2851
- mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
3168
+ mb_debug(sb, "freed %d blocks in %d structures\n", count,
3169
+ count2);
28523170 }
28533171
28543172 /*
....@@ -2908,23 +3226,26 @@
29083226 ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
29093227 SLAB_RECLAIM_ACCOUNT);
29103228 if (ext4_pspace_cachep == NULL)
2911
- return -ENOMEM;
3229
+ goto out;
29123230
29133231 ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context,
29143232 SLAB_RECLAIM_ACCOUNT);
2915
- if (ext4_ac_cachep == NULL) {
2916
- kmem_cache_destroy(ext4_pspace_cachep);
2917
- return -ENOMEM;
2918
- }
3233
+ if (ext4_ac_cachep == NULL)
3234
+ goto out_pa_free;
29193235
29203236 ext4_free_data_cachep = KMEM_CACHE(ext4_free_data,
29213237 SLAB_RECLAIM_ACCOUNT);
2922
- if (ext4_free_data_cachep == NULL) {
2923
- kmem_cache_destroy(ext4_pspace_cachep);
2924
- kmem_cache_destroy(ext4_ac_cachep);
2925
- return -ENOMEM;
2926
- }
3238
+ if (ext4_free_data_cachep == NULL)
3239
+ goto out_ac_free;
3240
+
29273241 return 0;
3242
+
3243
+out_ac_free:
3244
+ kmem_cache_destroy(ext4_ac_cachep);
3245
+out_pa_free:
3246
+ kmem_cache_destroy(ext4_pspace_cachep);
3247
+out:
3248
+ return -ENOMEM;
29283249 }
29293250
29303251 void ext4_exit_mballoc(void)
....@@ -3061,6 +3382,110 @@
30613382 }
30623383
30633384 /*
3385
+ * Idempotent helper for Ext4 fast commit replay path to set the state of
3386
+ * blocks in bitmaps and update counters.
3387
+ */
3388
+void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
3389
+ int len, int state)
3390
+{
3391
+ struct buffer_head *bitmap_bh = NULL;
3392
+ struct ext4_group_desc *gdp;
3393
+ struct buffer_head *gdp_bh;
3394
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
3395
+ ext4_group_t group;
3396
+ ext4_grpblk_t blkoff;
3397
+ int i, err;
3398
+ int already;
3399
+ unsigned int clen, clen_changed, thisgrp_len;
3400
+
3401
+ while (len > 0) {
3402
+ ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
3403
+
3404
+ /*
3405
+ * Check to see if we are freeing blocks across a group
3406
+ * boundary.
3407
+ * In case of flex_bg, it can happen that (block, len) spans
3408
+ * across more than one group. In that case we need to get the
3409
+ * corresponding group metadata to work with, which is why the
3410
+ * range is processed one group per loop iteration.
3411
+ */
3412
+ thisgrp_len = min_t(unsigned int, (unsigned int)len,
3413
+ EXT4_BLOCKS_PER_GROUP(sb) - EXT4_C2B(sbi, blkoff));
3414
+ clen = EXT4_NUM_B2C(sbi, thisgrp_len);
3415
+
3416
+ bitmap_bh = ext4_read_block_bitmap(sb, group);
3417
+ if (IS_ERR(bitmap_bh)) {
3418
+ err = PTR_ERR(bitmap_bh);
3419
+ bitmap_bh = NULL;
3420
+ break;
3421
+ }
3422
+
3423
+ err = -EIO;
3424
+ gdp = ext4_get_group_desc(sb, group, &gdp_bh);
3425
+ if (!gdp)
3426
+ break;
3427
+
3428
+ ext4_lock_group(sb, group);
3429
+ already = 0;
3430
+ for (i = 0; i < clen; i++)
3431
+ if (!mb_test_bit(blkoff + i, bitmap_bh->b_data) ==
3432
+ !state)
3433
+ already++;
3434
+
3435
+ clen_changed = clen - already;
3436
+ if (state)
3437
+ ext4_set_bits(bitmap_bh->b_data, blkoff, clen);
3438
+ else
3439
+ mb_test_and_clear_bits(bitmap_bh->b_data, blkoff, clen);
3440
+ if (ext4_has_group_desc_csum(sb) &&
3441
+ (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
3442
+ gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
3443
+ ext4_free_group_clusters_set(sb, gdp,
3444
+ ext4_free_clusters_after_init(sb, group, gdp));
3445
+ }
3446
+ if (state)
3447
+ clen = ext4_free_group_clusters(sb, gdp) - clen_changed;
3448
+ else
3449
+ clen = ext4_free_group_clusters(sb, gdp) + clen_changed;
3450
+
3451
+ ext4_free_group_clusters_set(sb, gdp, clen);
3452
+ ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh);
3453
+ ext4_group_desc_csum_set(sb, group, gdp);
3454
+
3455
+ ext4_unlock_group(sb, group);
3456
+
3457
+ if (sbi->s_log_groups_per_flex) {
3458
+ ext4_group_t flex_group = ext4_flex_group(sbi, group);
3459
+ struct flex_groups *fg = sbi_array_rcu_deref(sbi,
3460
+ s_flex_groups, flex_group);
3461
+
3462
+ if (state)
3463
+ atomic64_sub(clen_changed, &fg->free_clusters);
3464
+ else
3465
+ atomic64_add(clen_changed, &fg->free_clusters);
3466
+
3467
+ }
3468
+
3469
+ err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
3470
+ if (err)
3471
+ break;
3472
+ sync_dirty_buffer(bitmap_bh);
3473
+ err = ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
3474
+ sync_dirty_buffer(gdp_bh);
3475
+ if (err)
3476
+ break;
3477
+
3478
+ block += thisgrp_len;
3479
+ len -= thisgrp_len;
3480
+ brelse(bitmap_bh);
3481
+ BUG_ON(len < 0);
3482
+ }
3483
+
3484
+ if (err)
3485
+ brelse(bitmap_bh);
3486
+}
3487
+
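To make the group-boundary handling in ext4_mb_mark_bb() above concrete, here is a standalone sketch of how a (block, len) range is split per loop iteration; the 32768 blocks-per-group figure and the 1:1 block-to-cluster ratio are assumptions chosen for the example, not values taken from the patch:

#include <stdio.h>

/* Standalone illustration, not kernel code. */
static void split_range(unsigned long long block, unsigned int len)
{
	const unsigned int blocks_per_group = 32768;	/* assumed geometry */

	while (len > 0) {
		unsigned int blkoff = block % blocks_per_group;
		unsigned int left_in_group = blocks_per_group - blkoff;
		unsigned int thisgrp_len = len < left_in_group ? len : left_in_group;

		printf("group %llu: offset %u, %u blocks\n",
		       block / blocks_per_group, blkoff, thisgrp_len);
		block += thisgrp_len;
		len -= thisgrp_len;
	}
}

int main(void)
{
	/* 20 blocks starting 8 blocks short of a group boundary:
	 * group 0 gets 8 blocks, group 1 gets the remaining 12 */
	split_range(32760, 20);
	return 0;
}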
3488
+/*
30643489 * here we normalize request for locality group
30653490 * Group request are normalized to s_mb_group_prealloc, which goes to
30663491 * s_strip if we set the same via mount option.
....@@ -3076,8 +3501,7 @@
30763501
30773502 BUG_ON(lg == NULL);
30783503 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
3079
- mb_debug(1, "#%u: goal %u blocks for locality group\n",
3080
- current->pid, ac->ac_g_ex.fe_len);
3504
+ mb_debug(sb, "goal %u blocks for locality group\n", ac->ac_g_ex.fe_len);
30813505 }
30823506
30833507 /*
....@@ -3089,6 +3513,7 @@
30893513 struct ext4_allocation_request *ar)
30903514 {
30913515 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
3516
+ struct ext4_super_block *es = sbi->s_es;
30923517 int bsbits, max;
30933518 ext4_lblk_t end;
30943519 loff_t size, start_off;
....@@ -3169,6 +3594,15 @@
31693594 }
31703595 size = size >> bsbits;
31713596 start = start_off >> bsbits;
3597
+
3598
+ /*
3599
+ * For tiny groups (smaller than 8MB) the chosen allocation
3600
+ * alignment may be larger than group size. Make sure the
3601
+ * alignment does not move allocation to a different group which
3602
+ * makes mballoc fail assertions later.
3603
+ */
3604
+ start = max(start, rounddown(ac->ac_o_ex.fe_logical,
3605
+ (ext4_lblk_t)EXT4_BLOCKS_PER_GROUP(ac->ac_sb)));
31723606
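A worked example of the clamp above, assuming 1 KiB blocks and tiny 1024-block groups (e.g. mkfs.ext4 -g 1024): for a request at logical block 9500, normalization can pick a 4 MiB chunk and compute start = round_down(9500, 4096) = 8192, which falls in a different group's logical window than block 9500; rounddown(9500, 1024) = 9216, so the max() pulls start back into the window that contains the original request.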
31733607 /* don't cover already allocated blocks in selected range */
31743608 if (ar->pleft && start <= ar->lleft) {
....@@ -3260,35 +3694,39 @@
32603694 ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size);
32613695
32623696 /* define goal start in order to merge */
3263
- if (ar->pright && (ar->lright == (start + size))) {
3697
+ if (ar->pright && (ar->lright == (start + size)) &&
3698
+ ar->pright >= size &&
3699
+ ar->pright - size >= le32_to_cpu(es->s_first_data_block)) {
32643700 /* merge to the right */
32653701 ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size,
3266
- &ac->ac_f_ex.fe_group,
3267
- &ac->ac_f_ex.fe_start);
3702
+ &ac->ac_g_ex.fe_group,
3703
+ &ac->ac_g_ex.fe_start);
32683704 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
32693705 }
3270
- if (ar->pleft && (ar->lleft + 1 == start)) {
3706
+ if (ar->pleft && (ar->lleft + 1 == start) &&
3707
+ ar->pleft + 1 < ext4_blocks_count(es)) {
32713708 /* merge to the left */
32723709 ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1,
3273
- &ac->ac_f_ex.fe_group,
3274
- &ac->ac_f_ex.fe_start);
3710
+ &ac->ac_g_ex.fe_group,
3711
+ &ac->ac_g_ex.fe_start);
32753712 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
32763713 }
32773714
3278
- mb_debug(1, "goal: %u(was %u) blocks at %u\n", (unsigned) size,
3279
- (unsigned) orig_size, (unsigned) start);
3715
+ mb_debug(ac->ac_sb, "goal: %lld(was %lld) blocks at %u\n", size,
3716
+ orig_size, start);
32803717 }
32813718
32823719 static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
32833720 {
32843721 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
32853722
3286
- if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) {
3723
+ if (sbi->s_mb_stats && ac->ac_g_ex.fe_len >= 1) {
32873724 atomic_inc(&sbi->s_bal_reqs);
32883725 atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
32893726 if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
32903727 atomic_inc(&sbi->s_bal_success);
32913728 atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
3729
+ atomic_add(ac->ac_groups_scanned, &sbi->s_bal_groups_scanned);
32923730 if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
32933731 ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
32943732 atomic_inc(&sbi->s_bal_goals);
....@@ -3363,9 +3801,10 @@
33633801 BUG_ON(start < pa->pa_pstart);
33643802 BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len));
33653803 BUG_ON(pa->pa_free < len);
3804
+ BUG_ON(ac->ac_b_ex.fe_len <= 0);
33663805 pa->pa_free -= len;
33673806
3368
- mb_debug(1, "use %llu/%u from inode pa %p\n", start, len, pa);
3807
+ mb_debug(ac->ac_sb, "use %llu/%d from inode pa %p\n", start, len, pa);
33693808 }
33703809
33713810 /*
....@@ -3389,7 +3828,8 @@
33893828 * in on-disk bitmap -- see ext4_mb_release_context()
33903829 * Other CPUs are prevented from allocating from this pa by lg_mutex
33913830 */
3392
- mb_debug(1, "use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa);
3831
+ mb_debug(ac->ac_sb, "use %u/%u from group pa %p\n",
3832
+ pa->pa_lstart-len, len, pa);
33933833 }
33943834
33953835 /*
....@@ -3424,7 +3864,7 @@
34243864 /*
34253865 * search goal blocks in preallocated space
34263866 */
3427
-static noinline_for_stack int
3867
+static noinline_for_stack bool
34283868 ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
34293869 {
34303870 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
....@@ -3436,7 +3876,7 @@
34363876
34373877 /* only data can be preallocated */
34383878 if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
3439
- return 0;
3879
+ return false;
34403880
34413881 /* first, try per-file preallocation */
34423882 rcu_read_lock();
....@@ -3463,7 +3903,7 @@
34633903 spin_unlock(&pa->pa_lock);
34643904 ac->ac_criteria = 10;
34653905 rcu_read_unlock();
3466
- return 1;
3906
+ return true;
34673907 }
34683908 spin_unlock(&pa->pa_lock);
34693909 }
....@@ -3471,12 +3911,12 @@
34713911
34723912 /* can we use group allocation? */
34733913 if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC))
3474
- return 0;
3914
+ return false;
34753915
34763916 /* inode may have no locality group for some reason */
34773917 lg = ac->ac_lg;
34783918 if (lg == NULL)
3479
- return 0;
3919
+ return false;
34803920 order = fls(ac->ac_o_ex.fe_len) - 1;
34813921 if (order > PREALLOC_TB_SIZE - 1)
34823922 /* The max size of hash table is PREALLOC_TB_SIZE */
....@@ -3505,9 +3945,9 @@
35053945 if (cpa) {
35063946 ext4_mb_use_group_pa(ac, cpa);
35073947 ac->ac_criteria = 20;
3508
- return 1;
3948
+ return true;
35093949 }
3510
- return 0;
3950
+ return false;
35113951 }
35123952
35133953 /*
....@@ -3524,6 +3964,8 @@
35243964 struct ext4_free_data *entry;
35253965
35263966 grp = ext4_get_group_info(sb, group);
3967
+ if (!grp)
3968
+ return;
35273969 n = rb_first(&(grp->bb_free_root));
35283970
35293971 while (n) {
....@@ -3551,6 +3993,9 @@
35513993 int preallocated = 0;
35523994 int len;
35533995
3996
+ if (!grp)
3997
+ return;
3998
+
35543999 /* all form of preallocation discards first load group,
35554000 * so the only competing code is preallocation use.
35564001 * we don't need any locking here
....@@ -3572,7 +4017,27 @@
35724017 ext4_set_bits(bitmap, start, len);
35734018 preallocated += len;
35744019 }
3575
- mb_debug(1, "preallocated %u for group %u\n", preallocated, group);
4020
+ mb_debug(sb, "preallocated %d for group %u\n", preallocated, group);
4021
+}
4022
+
4023
+static void ext4_mb_mark_pa_deleted(struct super_block *sb,
4024
+ struct ext4_prealloc_space *pa)
4025
+{
4026
+ struct ext4_inode_info *ei;
4027
+
4028
+ if (pa->pa_deleted) {
4029
+ ext4_warning(sb, "deleted pa, type:%d, pblk:%llu, lblk:%u, len:%d\n",
4030
+ pa->pa_type, pa->pa_pstart, pa->pa_lstart,
4031
+ pa->pa_len);
4032
+ return;
4033
+ }
4034
+
4035
+ pa->pa_deleted = 1;
4036
+
4037
+ if (pa->pa_type == MB_INODE_PA) {
4038
+ ei = EXT4_I(pa->pa_inode);
4039
+ atomic_dec(&ei->i_prealloc_active);
4040
+ }
35764041 }
35774042
35784043 static void ext4_mb_pa_callback(struct rcu_head *head)
....@@ -3607,7 +4072,7 @@
36074072 return;
36084073 }
36094074
3610
- pa->pa_deleted = 1;
4075
+ ext4_mb_mark_pa_deleted(sb, pa);
36114076 spin_unlock(&pa->pa_lock);
36124077
36134078 grp_blk = pa->pa_pstart;
....@@ -3648,7 +4113,7 @@
36484113 /*
36494114 * creates new preallocated space for given inode
36504115 */
3651
-static noinline_for_stack int
4116
+static noinline_for_stack void
36524117 ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
36534118 {
36544119 struct super_block *sb = ac->ac_sb;
....@@ -3661,16 +4126,13 @@
36614126 BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
36624127 BUG_ON(ac->ac_status != AC_STATUS_FOUND);
36634128 BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
4129
+ BUG_ON(ac->ac_pa == NULL);
36644130
3665
- pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
3666
- if (pa == NULL)
3667
- return -ENOMEM;
4131
+ pa = ac->ac_pa;
36684132
36694133 if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) {
3670
- int winl;
3671
- int wins;
3672
- int win;
3673
- int offs;
4134
+ int new_bex_start;
4135
+ int new_bex_end;
36744136
36754137 /* we can't allocate as much as normalizer wants.
36764138 * so, found space must get proper lstart
....@@ -3678,26 +4140,40 @@
36784140 BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical);
36794141 BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len);
36804142
3681
- /* we're limited by original request in that
3682
- * logical block must be covered any way
3683
- * winl is window we can move our chunk within */
3684
- winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical;
4143
+ /*
4144
+ * Use the below logic for adjusting best extent as it keeps
4145
+ * fragmentation in check while ensuring logical range of best
4146
+ * extent doesn't overflow out of goal extent:
4147
+ *
4148
+ * 1. Check if best ex can be kept at end of goal and still
4149
+ * cover original start
4150
+ * 2. Else, check if best ex can be kept at start of goal and
4151
+ * still cover original start
4152
+ * 3. Else, keep the best ex at start of original request.
4153
+ */
4154
+ new_bex_end = ac->ac_g_ex.fe_logical +
4155
+ EXT4_C2B(sbi, ac->ac_g_ex.fe_len);
4156
+ new_bex_start = new_bex_end - EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
4157
+ if (ac->ac_o_ex.fe_logical >= new_bex_start)
4158
+ goto adjust_bex;
36854159
3686
- /* also, we should cover whole original request */
3687
- wins = EXT4_C2B(sbi, ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len);
4160
+ new_bex_start = ac->ac_g_ex.fe_logical;
4161
+ new_bex_end =
4162
+ new_bex_start + EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
4163
+ if (ac->ac_o_ex.fe_logical < new_bex_end)
4164
+ goto adjust_bex;
36884165
3689
- /* the smallest one defines real window */
3690
- win = min(winl, wins);
4166
+ new_bex_start = ac->ac_o_ex.fe_logical;
4167
+ new_bex_end =
4168
+ new_bex_start + EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
36914169
3692
- offs = ac->ac_o_ex.fe_logical %
3693
- EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
3694
- if (offs && offs < win)
3695
- win = offs;
4170
+adjust_bex:
4171
+ ac->ac_b_ex.fe_logical = new_bex_start;
36964172
3697
- ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical -
3698
- EXT4_NUM_B2C(sbi, win);
36994173 BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
37004174 BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
4175
+ BUG_ON(new_bex_end > (ac->ac_g_ex.fe_logical +
4176
+ EXT4_C2B(sbi, ac->ac_g_ex.fe_len)));
37014177 }
37024178
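A worked example of the three adjustment cases above, assuming a goal extent covering logical blocks [0, 256), a best extent of 64 blocks and a 1:1 cluster ratio: with the original request at logical 250, case 1 keeps the best extent at the end of the goal, [192, 256), which still covers 250; with the original at logical 10, case 1 fails (10 < 192) but case 2 keeps it at the start of the goal, [0, 64); with the original at logical 100, neither end works, so case 3 starts the best extent at the original block, giving [100, 164). In all three cases the result stays inside the goal extent and covers the originally requested block.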
37034179 /* preallocation can change ac_b_ex, thus we store actually
....@@ -3708,15 +4184,14 @@
37084184 pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
37094185 pa->pa_len = ac->ac_b_ex.fe_len;
37104186 pa->pa_free = pa->pa_len;
3711
- atomic_set(&pa->pa_count, 1);
37124187 spin_lock_init(&pa->pa_lock);
37134188 INIT_LIST_HEAD(&pa->pa_inode_list);
37144189 INIT_LIST_HEAD(&pa->pa_group_list);
37154190 pa->pa_deleted = 0;
37164191 pa->pa_type = MB_INODE_PA;
37174192
3718
- mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa,
3719
- pa->pa_pstart, pa->pa_len, pa->pa_lstart);
4193
+ mb_debug(sb, "new inode pa %p: %llu/%d for %u\n", pa, pa->pa_pstart,
4194
+ pa->pa_len, pa->pa_lstart);
37204195 trace_ext4_mb_new_inode_pa(ac, pa);
37214196
37224197 ext4_mb_use_inode_pa(ac, pa);
....@@ -3724,25 +4199,24 @@
37244199
37254200 ei = EXT4_I(ac->ac_inode);
37264201 grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
4202
+ if (!grp)
4203
+ return;
37274204
37284205 pa->pa_obj_lock = &ei->i_prealloc_lock;
37294206 pa->pa_inode = ac->ac_inode;
37304207
3731
- ext4_lock_group(sb, ac->ac_b_ex.fe_group);
37324208 list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
3733
- ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
37344209
37354210 spin_lock(pa->pa_obj_lock);
37364211 list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list);
37374212 spin_unlock(pa->pa_obj_lock);
3738
-
3739
- return 0;
4213
+ atomic_inc(&ei->i_prealloc_active);
37404214 }
37414215
37424216 /*
37434217 * creates new preallocated space for locality group inodes belongs to
37444218 */
3745
-static noinline_for_stack int
4219
+static noinline_for_stack void
37464220 ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
37474221 {
37484222 struct super_block *sb = ac->ac_sb;
....@@ -3754,11 +4228,9 @@
37544228 BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
37554229 BUG_ON(ac->ac_status != AC_STATUS_FOUND);
37564230 BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
4231
+ BUG_ON(ac->ac_pa == NULL);
37574232
3758
- BUG_ON(ext4_pspace_cachep == NULL);
3759
- pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
3760
- if (pa == NULL)
3761
- return -ENOMEM;
4233
+ pa = ac->ac_pa;
37624234
37634235 /* preallocation can change ac_b_ex, thus we store actually
37644236 * allocated blocks for history */
....@@ -3768,47 +4240,42 @@
37684240 pa->pa_lstart = pa->pa_pstart;
37694241 pa->pa_len = ac->ac_b_ex.fe_len;
37704242 pa->pa_free = pa->pa_len;
3771
- atomic_set(&pa->pa_count, 1);
37724243 spin_lock_init(&pa->pa_lock);
37734244 INIT_LIST_HEAD(&pa->pa_inode_list);
37744245 INIT_LIST_HEAD(&pa->pa_group_list);
37754246 pa->pa_deleted = 0;
37764247 pa->pa_type = MB_GROUP_PA;
37774248
3778
- mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa,
3779
- pa->pa_pstart, pa->pa_len, pa->pa_lstart);
4249
+ mb_debug(sb, "new group pa %p: %llu/%d for %u\n", pa, pa->pa_pstart,
4250
+ pa->pa_len, pa->pa_lstart);
37804251 trace_ext4_mb_new_group_pa(ac, pa);
37814252
37824253 ext4_mb_use_group_pa(ac, pa);
37834254 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
37844255
37854256 grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
4257
+ if (!grp)
4258
+ return;
37864259 lg = ac->ac_lg;
37874260 BUG_ON(lg == NULL);
37884261
37894262 pa->pa_obj_lock = &lg->lg_prealloc_lock;
37904263 pa->pa_inode = NULL;
37914264
3792
- ext4_lock_group(sb, ac->ac_b_ex.fe_group);
37934265 list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
3794
- ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
37954266
37964267 /*
37974268 * We will later add the new pa to the right bucket
37984269 * after updating the pa_free in ext4_mb_release_context
37994270 */
3800
- return 0;
38014271 }
38024272
3803
-static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
4273
+static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
38044274 {
3805
- int err;
3806
-
38074275 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
3808
- err = ext4_mb_new_group_pa(ac);
4276
+ ext4_mb_new_group_pa(ac);
38094277 else
3810
- err = ext4_mb_new_inode_pa(ac);
3811
- return err;
4278
+ ext4_mb_new_inode_pa(ac);
38124279 }
38134280
38144281 /*
....@@ -3843,7 +4310,7 @@
38434310 if (bit >= end)
38444311 break;
38454312 next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
3846
- mb_debug(1, " free preallocated %u/%u in group %u\n",
4313
+ mb_debug(sb, "free preallocated %u/%u in group %u\n",
38474314 (unsigned) ext4_group_first_block_no(sb, group) + bit,
38484315 (unsigned) next - bit, (unsigned) group);
38494316 free += next - bit;
....@@ -3857,10 +4324,10 @@
38574324 }
38584325 if (free != pa->pa_free) {
38594326 ext4_msg(e4b->bd_sb, KERN_CRIT,
3860
- "pa %p: logic %lu, phys. %lu, len %lu",
4327
+ "pa %p: logic %lu, phys. %lu, len %d",
38614328 pa, (unsigned long) pa->pa_lstart,
38624329 (unsigned long) pa->pa_pstart,
3863
- (unsigned long) pa->pa_len);
4330
+ pa->pa_len);
38644331 ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u",
38654332 free, pa->pa_free);
38664333 /*
....@@ -3884,7 +4351,11 @@
38844351 trace_ext4_mb_release_group_pa(sb, pa);
38854352 BUG_ON(pa->pa_deleted == 0);
38864353 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3887
- BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
4354
+ if (unlikely(group != e4b->bd_group && pa->pa_len != 0)) {
4355
+ ext4_warning(sb, "bad group: expected %u, group %u, pa_start %llu",
4356
+ e4b->bd_group, group, pa->pa_pstart);
4357
+ return 0;
4358
+ }
38884359 mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
38894360 atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
38904361 trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len);
....@@ -3903,7 +4374,7 @@
39034374 */
39044375 static noinline_for_stack int
39054376 ext4_mb_discard_group_preallocations(struct super_block *sb,
3906
- ext4_group_t group, int needed)
4377
+ ext4_group_t group, int *busy)
39074378 {
39084379 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
39094380 struct buffer_head *bitmap_bh = NULL;
....@@ -3911,20 +4382,21 @@
39114382 struct list_head list;
39124383 struct ext4_buddy e4b;
39134384 int err;
3914
- int busy = 0;
39154385 int free = 0;
39164386
3917
- mb_debug(1, "discard preallocation for group %u\n", group);
3918
-
3919
- if (list_empty(&grp->bb_prealloc_list))
4387
+ if (!grp)
39204388 return 0;
4389
+ mb_debug(sb, "discard preallocation for group %u\n", group);
4390
+ if (list_empty(&grp->bb_prealloc_list))
4391
+ goto out_dbg;
39214392
39224393 bitmap_bh = ext4_read_block_bitmap(sb, group);
39234394 if (IS_ERR(bitmap_bh)) {
39244395 err = PTR_ERR(bitmap_bh);
3925
- ext4_error(sb, "Error %d reading block bitmap for %u",
3926
- err, group);
3927
- return 0;
4396
+ ext4_error_err(sb, -err,
4397
+ "Error %d reading block bitmap for %u",
4398
+ err, group);
4399
+ goto out_dbg;
39284400 }
39294401
39304402 err = ext4_mb_load_buddy(sb, group, &e4b);
....@@ -3932,21 +4404,17 @@
39324404 ext4_warning(sb, "Error %d loading buddy information for %u",
39334405 err, group);
39344406 put_bh(bitmap_bh);
3935
- return 0;
4407
+ goto out_dbg;
39364408 }
39374409
3938
- if (needed == 0)
3939
- needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1;
3940
-
39414410 INIT_LIST_HEAD(&list);
3942
-repeat:
39434411 ext4_lock_group(sb, group);
39444412 list_for_each_entry_safe(pa, tmp,
39454413 &grp->bb_prealloc_list, pa_group_list) {
39464414 spin_lock(&pa->pa_lock);
39474415 if (atomic_read(&pa->pa_count)) {
39484416 spin_unlock(&pa->pa_lock);
3949
- busy = 1;
4417
+ *busy = 1;
39504418 continue;
39514419 }
39524420 if (pa->pa_deleted) {
....@@ -3955,7 +4423,10 @@
39554423 }
39564424
39574425 /* seems this one can be freed ... */
3958
- pa->pa_deleted = 1;
4426
+ ext4_mb_mark_pa_deleted(sb, pa);
4427
+
4428
+ if (!free)
4429
+ this_cpu_inc(discard_pa_seq);
39594430
39604431 /* we can trust pa_free ... */
39614432 free += pa->pa_free;
....@@ -3964,20 +4435,6 @@
39644435
39654436 list_del(&pa->pa_group_list);
39664437 list_add(&pa->u.pa_tmp_list, &list);
3967
- }
3968
-
3969
- /* if we still need more blocks and some PAs were used, try again */
3970
- if (free < needed && busy) {
3971
- busy = 0;
3972
- ext4_unlock_group(sb, group);
3973
- cond_resched();
3974
- goto repeat;
3975
- }
3976
-
3977
- /* found anything to free? */
3978
- if (list_empty(&list)) {
3979
- BUG_ON(free != 0);
3980
- goto out;
39814438 }
39824439
39834440 /* now free all selected PAs */
....@@ -3997,10 +4454,12 @@
39974454 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
39984455 }
39994456
4000
-out:
40014457 ext4_unlock_group(sb, group);
40024458 ext4_mb_unload_buddy(&e4b);
40034459 put_bh(bitmap_bh);
4460
+out_dbg:
4461
+ mb_debug(sb, "discarded (%d) blocks preallocated for group %u bb_free (%d)\n",
4462
+ free, group, grp->bb_free);
40044463 return free;
40054464 }
40064465
....@@ -4013,7 +4472,7 @@
40134472 *
40144473 * FIXME!! Make sure it is valid at all the call sites
40154474 */
4016
-void ext4_discard_preallocations(struct inode *inode)
4475
+void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
40174476 {
40184477 struct ext4_inode_info *ei = EXT4_I(inode);
40194478 struct super_block *sb = inode->i_sb;
....@@ -4029,16 +4488,24 @@
40294488 return;
40304489 }
40314490
4032
- mb_debug(1, "discard preallocation for inode %lu\n", inode->i_ino);
4033
- trace_ext4_discard_preallocations(inode);
4491
+ if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)
4492
+ return;
4493
+
4494
+ mb_debug(sb, "discard preallocation for inode %lu\n",
4495
+ inode->i_ino);
4496
+ trace_ext4_discard_preallocations(inode,
4497
+ atomic_read(&ei->i_prealloc_active), needed);
40344498
40354499 INIT_LIST_HEAD(&list);
4500
+
4501
+ if (needed == 0)
4502
+ needed = UINT_MAX;
40364503
40374504 repeat:
40384505 /* first, collect all pa's in the inode */
40394506 spin_lock(&ei->i_prealloc_lock);
4040
- while (!list_empty(&ei->i_prealloc_list)) {
4041
- pa = list_entry(ei->i_prealloc_list.next,
4507
+ while (!list_empty(&ei->i_prealloc_list) && needed) {
4508
+ pa = list_entry(ei->i_prealloc_list.prev,
40424509 struct ext4_prealloc_space, pa_inode_list);
40434510 BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock);
40444511 spin_lock(&pa->pa_lock);
....@@ -4055,10 +4522,11 @@
40554522
40564523 }
40574524 if (pa->pa_deleted == 0) {
4058
- pa->pa_deleted = 1;
4525
+ ext4_mb_mark_pa_deleted(sb, pa);
40594526 spin_unlock(&pa->pa_lock);
40604527 list_del_rcu(&pa->pa_inode_list);
40614528 list_add(&pa->u.pa_tmp_list, &list);
4529
+ needed--;
40624530 continue;
40634531 }
40644532
....@@ -4090,16 +4558,16 @@
40904558 err = ext4_mb_load_buddy_gfp(sb, group, &e4b,
40914559 GFP_NOFS|__GFP_NOFAIL);
40924560 if (err) {
4093
- ext4_error(sb, "Error %d loading buddy information for %u",
4094
- err, group);
4561
+ ext4_error_err(sb, -err, "Error %d loading buddy information for %u",
4562
+ err, group);
40954563 continue;
40964564 }
40974565
40984566 bitmap_bh = ext4_read_block_bitmap(sb, group);
40994567 if (IS_ERR(bitmap_bh)) {
41004568 err = PTR_ERR(bitmap_bh);
4101
- ext4_error(sb, "Error %d reading block bitmap for %u",
4102
- err, group);
4569
+ ext4_error_err(sb, -err, "Error %d reading block bitmap for %u",
4570
+ err, group);
41034571 ext4_mb_unload_buddy(&e4b);
41044572 continue;
41054573 }
....@@ -4117,22 +4585,77 @@
41174585 }
41184586 }
41194587
4588
+static int ext4_mb_pa_alloc(struct ext4_allocation_context *ac)
4589
+{
4590
+ struct ext4_prealloc_space *pa;
4591
+
4592
+ BUG_ON(ext4_pspace_cachep == NULL);
4593
+ pa = kmem_cache_zalloc(ext4_pspace_cachep, GFP_NOFS);
4594
+ if (!pa)
4595
+ return -ENOMEM;
4596
+ atomic_set(&pa->pa_count, 1);
4597
+ ac->ac_pa = pa;
4598
+ return 0;
4599
+}
4600
+
4601
+static void ext4_mb_pa_free(struct ext4_allocation_context *ac)
4602
+{
4603
+ struct ext4_prealloc_space *pa = ac->ac_pa;
4604
+
4605
+ BUG_ON(!pa);
4606
+ ac->ac_pa = NULL;
4607
+ WARN_ON(!atomic_dec_and_test(&pa->pa_count));
4608
+ kmem_cache_free(ext4_pspace_cachep, pa);
4609
+}
4610
+
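Illustrative sketch, not part of the patch: ext4_mb_pa_alloc() and ext4_mb_pa_free() above split the preallocation descriptor's lifetime into allocate, attach and free steps that ext4_mb_new_blocks() follows further down in this diff (allocate up front, attach only when the regular allocator found space, free explicitly on every failure path). The user-space C sketch below only mirrors that ownership pattern; every name in it is a hypothetical stand-in, not an ext4 symbol.

    #include <stdio.h>
    #include <stdlib.h>

    struct pa { int refcount; int len; };

    /* roughly what ext4_mb_pa_alloc() does: a zeroed descriptor, refcount pinned to 1 */
    static struct pa *pa_alloc(void)
    {
        struct pa *pa = calloc(1, sizeof(*pa));
        if (pa)
            pa->refcount = 1;
        return pa;
    }

    /* roughly what ext4_mb_pa_free() does on the failure paths */
    static void pa_free(struct pa *pa)
    {
        free(pa);
    }

    /* stand-in for the regular allocator; returns 0 when space was found */
    static int try_allocate(struct pa *pa)
    {
        pa->len = 8;    /* pretend 8 blocks were preallocated */
        return 0;
    }

    int main(void)
    {
        struct pa *pa = pa_alloc();

        if (!pa)
            return 1;           /* -ENOMEM in the kernel */
        if (try_allocate(pa)) {
            pa_free(pa);        /* allocation failed: the caller frees the unused pa */
            return 1;
        }
        printf("pa attached, %d blocks preallocated\n", pa->len);
        pa_free(pa);            /* in ext4 this happens via the refcount and RCU */
        return 0;
    }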
41204611 #ifdef CONFIG_EXT4_DEBUG
4612
+static inline void ext4_mb_show_pa(struct super_block *sb)
4613
+{
4614
+ ext4_group_t i, ngroups;
4615
+
4616
+ if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
4617
+ return;
4618
+
4619
+ ngroups = ext4_get_groups_count(sb);
4620
+ mb_debug(sb, "groups: ");
4621
+ for (i = 0; i < ngroups; i++) {
4622
+ struct ext4_group_info *grp = ext4_get_group_info(sb, i);
4623
+ struct ext4_prealloc_space *pa;
4624
+ ext4_grpblk_t start;
4625
+ struct list_head *cur;
4626
+
4627
+ if (!grp)
4628
+ continue;
4629
+ ext4_lock_group(sb, i);
4630
+ list_for_each(cur, &grp->bb_prealloc_list) {
4631
+ pa = list_entry(cur, struct ext4_prealloc_space,
4632
+ pa_group_list);
4633
+ spin_lock(&pa->pa_lock);
4634
+ ext4_get_group_no_and_offset(sb, pa->pa_pstart,
4635
+ NULL, &start);
4636
+ spin_unlock(&pa->pa_lock);
4637
+ mb_debug(sb, "PA:%u:%d:%d\n", i, start,
4638
+ pa->pa_len);
4639
+ }
4640
+ ext4_unlock_group(sb, i);
4641
+ mb_debug(sb, "%u: %d/%d\n", i, grp->bb_free,
4642
+ grp->bb_fragments);
4643
+ }
4644
+}
4645
+
41214646 static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
41224647 {
41234648 struct super_block *sb = ac->ac_sb;
4124
- ext4_group_t ngroups, i;
41254649
4126
- if (!ext4_mballoc_debug ||
4127
- (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED))
4650
+ if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
41284651 return;
41294652
4130
- ext4_msg(ac->ac_sb, KERN_ERR, "Can't allocate:"
4653
+ mb_debug(sb, "Can't allocate:"
41314654 " Allocation context details:");
4132
- ext4_msg(ac->ac_sb, KERN_ERR, "status %d flags %d",
4655
+ mb_debug(sb, "status %u flags 0x%x",
41334656 ac->ac_status, ac->ac_flags);
4134
- ext4_msg(ac->ac_sb, KERN_ERR, "orig %lu/%lu/%lu@%lu, "
4135
- "goal %lu/%lu/%lu@%lu, "
4657
+ mb_debug(sb, "orig %lu/%lu/%lu@%lu, "
4658
+ "goal %lu/%lu/%lu@%lu, "
41364659 "best %lu/%lu/%lu@%lu cr %d",
41374660 (unsigned long)ac->ac_o_ex.fe_group,
41384661 (unsigned long)ac->ac_o_ex.fe_start,
....@@ -4147,37 +4670,17 @@
41474670 (unsigned long)ac->ac_b_ex.fe_len,
41484671 (unsigned long)ac->ac_b_ex.fe_logical,
41494672 (int)ac->ac_criteria);
4150
- ext4_msg(ac->ac_sb, KERN_ERR, "%d found", ac->ac_found);
4151
- ext4_msg(ac->ac_sb, KERN_ERR, "groups: ");
4152
- ngroups = ext4_get_groups_count(sb);
4153
- for (i = 0; i < ngroups; i++) {
4154
- struct ext4_group_info *grp = ext4_get_group_info(sb, i);
4155
- struct ext4_prealloc_space *pa;
4156
- ext4_grpblk_t start;
4157
- struct list_head *cur;
4158
- ext4_lock_group(sb, i);
4159
- list_for_each(cur, &grp->bb_prealloc_list) {
4160
- pa = list_entry(cur, struct ext4_prealloc_space,
4161
- pa_group_list);
4162
- spin_lock(&pa->pa_lock);
4163
- ext4_get_group_no_and_offset(sb, pa->pa_pstart,
4164
- NULL, &start);
4165
- spin_unlock(&pa->pa_lock);
4166
- printk(KERN_ERR "PA:%u:%d:%u \n", i,
4167
- start, pa->pa_len);
4168
- }
4169
- ext4_unlock_group(sb, i);
4170
-
4171
- if (grp->bb_free == 0)
4172
- continue;
4173
- printk(KERN_ERR "%u: %d/%d \n",
4174
- i, grp->bb_free, grp->bb_fragments);
4175
- }
4176
- printk(KERN_ERR "\n");
4673
+ mb_debug(sb, "%u found", ac->ac_found);
4674
+ ext4_mb_show_pa(sb);
41774675 }
41784676 #else
4677
+static inline void ext4_mb_show_pa(struct super_block *sb)
4678
+{
4679
+ return;
4680
+}
41794681 static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac)
41804682 {
4683
+ ext4_mb_show_pa(ac->ac_sb);
41814684 return;
41824685 }
41834686 #endif
....@@ -4205,9 +4708,8 @@
42054708 isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
42064709 >> bsbits;
42074710
4208
- if ((size == isize) &&
4209
- !ext4_fs_is_busy(sbi) &&
4210
- (atomic_read(&ac->ac_inode->i_writecount) == 0)) {
4711
+ if ((size == isize) && !ext4_fs_is_busy(sbi) &&
4712
+ !inode_is_open_for_write(ac->ac_inode)) {
42114713 ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
42124714 return;
42134715 }
....@@ -4277,17 +4779,17 @@
42774779 ac->ac_g_ex = ac->ac_o_ex;
42784780 ac->ac_flags = ar->flags;
42794781
4280
- /* we have to define context: we'll we work with a file or
4782
+ /* we have to define context: we'll work with a file or
42814783 * locality group. this is a policy, actually */
42824784 ext4_mb_group_or_file(ac);
42834785
4284
- mb_debug(1, "init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, "
4786
+ mb_debug(sb, "init ac: %u blocks @ %u, goal %u, flags 0x%x, 2^%d, "
42854787 "left: %u/%u, right %u/%u to %swritable\n",
42864788 (unsigned) ar->len, (unsigned) ar->logical,
42874789 (unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
42884790 (unsigned) ar->lleft, (unsigned) ar->pleft,
42894791 (unsigned) ar->lright, (unsigned) ar->pright,
4290
- atomic_read(&ar->inode->i_writecount) ? "" : "non-");
4792
+ inode_is_open_for_write(ar->inode) ? "" : "non-");
42914793 return 0;
42924794
42934795 }
....@@ -4302,13 +4804,14 @@
43024804 struct list_head discard_list;
43034805 struct ext4_prealloc_space *pa, *tmp;
43044806
4305
- mb_debug(1, "discard locality group preallocation\n");
4807
+ mb_debug(sb, "discard locality group preallocation\n");
43064808
43074809 INIT_LIST_HEAD(&discard_list);
43084810
43094811 spin_lock(&lg->lg_prealloc_lock);
43104812 list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
4311
- pa_inode_list) {
4813
+ pa_inode_list,
4814
+ lockdep_is_held(&lg->lg_prealloc_lock)) {
43124815 spin_lock(&pa->pa_lock);
43134816 if (atomic_read(&pa->pa_count)) {
43144817 /*
....@@ -4327,7 +4830,7 @@
43274830 BUG_ON(pa->pa_type != MB_GROUP_PA);
43284831
43294832 /* seems this one can be freed ... */
4330
- pa->pa_deleted = 1;
4833
+ ext4_mb_mark_pa_deleted(sb, pa);
43314834 spin_unlock(&pa->pa_lock);
43324835
43334836 list_del_rcu(&pa->pa_inode_list);
....@@ -4353,8 +4856,8 @@
43534856 err = ext4_mb_load_buddy_gfp(sb, group, &e4b,
43544857 GFP_NOFS|__GFP_NOFAIL);
43554858 if (err) {
4356
- ext4_error(sb, "Error %d loading buddy information for %u",
4357
- err, group);
4859
+ ext4_error_err(sb, -err, "Error %d loading buddy information for %u",
4860
+ err, group);
43584861 continue;
43594862 }
43604863 ext4_lock_group(sb, group);
....@@ -4391,7 +4894,8 @@
43914894 /* Add the prealloc space to lg */
43924895 spin_lock(&lg->lg_prealloc_lock);
43934896 list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
4394
- pa_inode_list) {
4897
+ pa_inode_list,
4898
+ lockdep_is_held(&lg->lg_prealloc_lock)) {
43954899 spin_lock(&tmp_pa->pa_lock);
43964900 if (tmp_pa->pa_deleted) {
43974901 spin_unlock(&tmp_pa->pa_lock);
....@@ -4425,10 +4929,29 @@
44254929 }
44264930
44274931 /*
4932
+ * if the per-inode prealloc list is too long, trim some PAs
4933
+ */
4934
+static void ext4_mb_trim_inode_pa(struct inode *inode)
4935
+{
4936
+ struct ext4_inode_info *ei = EXT4_I(inode);
4937
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
4938
+ int count, delta;
4939
+
4940
+ count = atomic_read(&ei->i_prealloc_active);
4941
+ delta = (sbi->s_mb_max_inode_prealloc >> 2) + 1;
4942
+ if (count > sbi->s_mb_max_inode_prealloc + delta) {
4943
+ count -= sbi->s_mb_max_inode_prealloc;
4944
+ ext4_discard_preallocations(inode, count);
4945
+ }
4946
+}
4947
+
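Illustrative sketch, not part of the patch: the trigger above has a built-in hysteresis (delta is a quarter of the limit plus one), so trimming does not fire on every allocation and only the excess over the limit is discarded. Assuming s_mb_max_inode_prealloc were 512 (an assumed value, used here only for the arithmetic):

    #include <stdio.h>

    int main(void)
    {
        int max_prealloc = 512;                 /* assumed sbi->s_mb_max_inode_prealloc */
        int delta = (max_prealloc >> 2) + 1;    /* 129: hysteresis margin */
        int active = 700;                       /* pretend ei->i_prealloc_active is 700 */

        if (active > max_prealloc + delta)      /* 700 > 641, so trimming kicks in */
            printf("discard %d least recently used PAs\n",
                   active - max_prealloc);      /* 188 */
        return 0;
    }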
4948
+/*
44284949 * release all resource we used in allocation
44294950 */
44304951 static int ext4_mb_release_context(struct ext4_allocation_context *ac)
44314952 {
4953
+ struct inode *inode = ac->ac_inode;
4954
+ struct ext4_inode_info *ei = EXT4_I(inode);
44324955 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
44334956 struct ext4_prealloc_space *pa = ac->ac_pa;
44344957 if (pa) {
....@@ -4440,21 +4963,31 @@
44404963 pa->pa_free -= ac->ac_b_ex.fe_len;
44414964 pa->pa_len -= ac->ac_b_ex.fe_len;
44424965 spin_unlock(&pa->pa_lock);
4966
+
4967
+ /*
4968
+ * We want to add the pa to the right bucket.
4969
+ * Remove it from the list and while adding
4970
+ * make sure the list to which we are adding
4971
+ * doesn't grow big.
4972
+ */
4973
+ if (likely(pa->pa_free)) {
4974
+ spin_lock(pa->pa_obj_lock);
4975
+ list_del_rcu(&pa->pa_inode_list);
4976
+ spin_unlock(pa->pa_obj_lock);
4977
+ ext4_mb_add_n_trim(ac);
4978
+ }
44434979 }
4444
- }
4445
- if (pa) {
4446
- /*
4447
- * We want to add the pa to the right bucket.
4448
- * Remove it from the list and while adding
4449
- * make sure the list to which we are adding
4450
- * doesn't grow big.
4451
- */
4452
- if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) {
4980
+
4981
+ if (pa->pa_type == MB_INODE_PA) {
4982
+ /*
4983
+ * treat the per-inode prealloc list as an LRU list, then try
4984
+ * to trim the least recently used PA.
4985
+ */
44534986 spin_lock(pa->pa_obj_lock);
4454
- list_del_rcu(&pa->pa_inode_list);
4987
+ list_move(&pa->pa_inode_list, &ei->i_prealloc_list);
44554988 spin_unlock(pa->pa_obj_lock);
4456
- ext4_mb_add_n_trim(ac);
44574989 }
4990
+
44584991 ext4_mb_put_pa(ac, ac->ac_sb, pa);
44594992 }
44604993 if (ac->ac_bitmap_page)
....@@ -4464,6 +4997,7 @@
44644997 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
44654998 mutex_unlock(&ac->ac_lg->lg_mutex);
44664999 ext4_mb_collect_stats(ac);
5000
+ ext4_mb_trim_inode_pa(inode);
44675001 return 0;
44685002 }
44695003
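Illustrative sketch, not part of the patch: the list_move() above puts a PA that just served an allocation back at the head of i_prealloc_list, while ext4_discard_preallocations() (earlier in this diff) now takes entries from the tail, so the least recently used PAs are trimmed first. A toy move-to-front / trim-from-back model, using a plain array instead of a kernel list_head:

    #include <stdio.h>

    #define NR 4

    /* move the entry at 'pos' to the front, shifting the others back */
    static void move_to_front(int list[NR], int pos)
    {
        int val = list[pos];
        int i;

        for (i = pos; i > 0; i--)
            list[i] = list[i - 1];
        list[0] = val;
    }

    int main(void)
    {
        int list[NR] = {1, 2, 3, 4};    /* head ... tail, like i_prealloc_list */

        move_to_front(list, 2);         /* PA 3 was just used for an allocation */
        /* trimming starts from the tail, so PA 4 is the first candidate */
        printf("head=%d, trim PA %d first\n", list[0], list[NR - 1]);
        return 0;
    }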
....@@ -4471,17 +5005,55 @@
44715005 {
44725006 ext4_group_t i, ngroups = ext4_get_groups_count(sb);
44735007 int ret;
4474
- int freed = 0;
5008
+ int freed = 0, busy = 0;
5009
+ int retry = 0;
44755010
44765011 trace_ext4_mb_discard_preallocations(sb, needed);
5012
+
5013
+ if (needed == 0)
5014
+ needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1;
5015
+ repeat:
44775016 for (i = 0; i < ngroups && needed > 0; i++) {
4478
- ret = ext4_mb_discard_group_preallocations(sb, i, needed);
5017
+ ret = ext4_mb_discard_group_preallocations(sb, i, &busy);
44795018 freed += ret;
44805019 needed -= ret;
5020
+ cond_resched();
5021
+ }
5022
+
5023
+ if (needed > 0 && busy && ++retry < 3) {
5024
+ busy = 0;
5025
+ goto repeat;
44815026 }
44825027
44835028 return freed;
44845029 }
5030
+
5031
+static bool ext4_mb_discard_preallocations_should_retry(struct super_block *sb,
5032
+ struct ext4_allocation_context *ac, u64 *seq)
5033
+{
5034
+ int freed;
5035
+ u64 seq_retry = 0;
5036
+ bool ret = false;
5037
+
5038
+ freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len);
5039
+ if (freed) {
5040
+ ret = true;
5041
+ goto out_dbg;
5042
+ }
5043
+ seq_retry = ext4_get_discard_pa_seq_sum();
5044
+ if (!(ac->ac_flags & EXT4_MB_STRICT_CHECK) || seq_retry != *seq) {
5045
+ ac->ac_flags |= EXT4_MB_STRICT_CHECK;
5046
+ *seq = seq_retry;
5047
+ ret = true;
5048
+ }
5049
+
5050
+out_dbg:
5051
+ mb_debug(sb, "freed %d, retry ? %s\n", freed, ret ? "yes" : "no");
5052
+ return ret;
5053
+}
5054
+
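Illustrative sketch, not part of the patch: the old per-group repeat loop is gone; each group now only reports a busy flag, ext4_mb_discard_preallocations() restarts the whole scan at most twice more, and ext4_mb_discard_preallocations_should_retry() additionally allows another attempt when the summed discard_pa_seq moved, i.e. someone else freed or discarded PAs in the meantime. The bounded-retry shape looks roughly like this (all helpers below are hypothetical stand-ins):

    #include <stdio.h>

    /* stand-in for ext4_mb_discard_group_preallocations(): busy on the first pass */
    static int discard_group(int pass, int *busy)
    {
        if (pass == 0) {
            *busy = 1;      /* PAs were in use, nothing freed yet */
            return 0;
        }
        return 4;           /* later passes free 4 clusters per group */
    }

    int main(void)
    {
        int freed = 0, busy = 0, retry = 0, needed = 16;
        int pass = 0, group;

    repeat:
        for (group = 0; group < 8 && needed > 0; group++) {
            int ret = discard_group(pass, &busy);
            freed += ret;
            needed -= ret;
        }
        if (needed > 0 && busy && ++retry < 3) {
            busy = 0;
            pass++;
            goto repeat;    /* bounded restart instead of spinning on one group */
        }
        printf("freed %d clusters after %d restart(s)\n", freed, retry);
        return 0;
    }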
5055
+static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle,
5056
+ struct ext4_allocation_request *ar, int *errp);
44855057
44865058 /*
44875059 * Main entry point into mballoc to allocate blocks
....@@ -4491,19 +5063,22 @@
44915063 ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
44925064 struct ext4_allocation_request *ar, int *errp)
44935065 {
4494
- int freed;
44955066 struct ext4_allocation_context *ac = NULL;
44965067 struct ext4_sb_info *sbi;
44975068 struct super_block *sb;
44985069 ext4_fsblk_t block = 0;
44995070 unsigned int inquota = 0;
45005071 unsigned int reserv_clstrs = 0;
5072
+ int retries = 0;
5073
+ u64 seq;
45015074
45025075 might_sleep();
45035076 sb = ar->inode->i_sb;
45045077 sbi = EXT4_SB(sb);
45055078
45065079 trace_ext4_request_blocks(ar);
5080
+ if (sbi->s_mount_state & EXT4_FC_REPLAY)
5081
+ return ext4_mb_new_blocks_simple(handle, ar, errp);
45075082
45085083 /* Allow to use superuser reservation for quota file */
45095084 if (ext4_is_quota_file(ar->inode))
....@@ -4522,6 +5097,7 @@
45225097 ar->len = ar->len >> 1;
45235098 }
45245099 if (!ar->len) {
5100
+ ext4_mb_show_pa(sb);
45255101 *errp = -ENOSPC;
45265102 return 0;
45275103 }
....@@ -4559,26 +5135,32 @@
45595135 }
45605136
45615137 ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
5138
+ seq = this_cpu_read(discard_pa_seq);
45625139 if (!ext4_mb_use_preallocated(ac)) {
45635140 ac->ac_op = EXT4_MB_HISTORY_ALLOC;
45645141 ext4_mb_normalize_request(ac, ar);
5142
+
5143
+ *errp = ext4_mb_pa_alloc(ac);
5144
+ if (*errp)
5145
+ goto errout;
45655146 repeat:
45665147 /* allocate space in core */
45675148 *errp = ext4_mb_regular_allocator(ac);
4568
- if (*errp)
4569
- goto discard_and_exit;
4570
-
4571
- /* as we've just preallocated more space than
4572
- * user requested originally, we store allocated
4573
- * space in a special descriptor */
4574
- if (ac->ac_status == AC_STATUS_FOUND &&
4575
- ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
4576
- *errp = ext4_mb_new_preallocation(ac);
5149
+ /*
5150
+ * The pa allocated above is added to grp->bb_prealloc_list only
5151
+ * when we were able to allocate some blocks, i.e. when
5152
+ * ac->ac_status == AC_STATUS_FOUND.
5153
+ * An error from above means ac->ac_status != AC_STATUS_FOUND,
5154
+ * so we have to free the pa here ourselves.
5155
+ */
45775156 if (*errp) {
4578
- discard_and_exit:
5157
+ ext4_mb_pa_free(ac);
45795158 ext4_discard_allocated_blocks(ac);
45805159 goto errout;
45815160 }
5161
+ if (ac->ac_status == AC_STATUS_FOUND &&
5162
+ ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len)
5163
+ ext4_mb_pa_free(ac);
45825164 }
45835165 if (likely(ac->ac_status == AC_STATUS_FOUND)) {
45845166 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
....@@ -4590,9 +5172,14 @@
45905172 ar->len = ac->ac_b_ex.fe_len;
45915173 }
45925174 } else {
4593
- freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len);
4594
- if (freed)
5175
+ if (++retries < 3 &&
5176
+ ext4_mb_discard_preallocations_should_retry(sb, ac, &seq))
45955177 goto repeat;
5178
+ /*
5179
+ * If block allocation fails then the pa allocated above
5180
+ * needs to be freed here itself.
5181
+ */
5182
+ ext4_mb_pa_free(ac);
45965183 *errp = -ENOSPC;
45975184 }
45985185
....@@ -4721,21 +5308,128 @@
47215308 return 0;
47225309 }
47235310
5311
+/*
5312
+ * Simple allocator for Ext4 fast commit replay path. It searches for blocks
5313
+ * linearly starting at the goal block and also excludes the blocks which
5314
+ * are going to be in use after fast commit replay.
5315
+ */
5316
+static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle,
5317
+ struct ext4_allocation_request *ar, int *errp)
5318
+{
5319
+ struct buffer_head *bitmap_bh;
5320
+ struct super_block *sb = ar->inode->i_sb;
5321
+ ext4_group_t group;
5322
+ ext4_grpblk_t blkoff;
5323
+ ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
5324
+ ext4_grpblk_t i = 0;
5325
+ ext4_fsblk_t goal, block;
5326
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
5327
+
5328
+ goal = ar->goal;
5329
+ if (goal < le32_to_cpu(es->s_first_data_block) ||
5330
+ goal >= ext4_blocks_count(es))
5331
+ goal = le32_to_cpu(es->s_first_data_block);
5332
+
5333
+ ar->len = 0;
5334
+ ext4_get_group_no_and_offset(sb, goal, &group, &blkoff);
5335
+ for (; group < ext4_get_groups_count(sb); group++) {
5336
+ bitmap_bh = ext4_read_block_bitmap(sb, group);
5337
+ if (IS_ERR(bitmap_bh)) {
5338
+ *errp = PTR_ERR(bitmap_bh);
5339
+ pr_warn("Failed to read block bitmap\n");
5340
+ return 0;
5341
+ }
5342
+
5343
+ ext4_get_group_no_and_offset(sb,
5344
+ max(ext4_group_first_block_no(sb, group), goal),
5345
+ NULL, &blkoff);
5346
+ while (1) {
5347
+ i = mb_find_next_zero_bit(bitmap_bh->b_data, max,
5348
+ blkoff);
5349
+ if (i >= max)
5350
+ break;
5351
+ if (ext4_fc_replay_check_excluded(sb,
5352
+ ext4_group_first_block_no(sb, group) + i)) {
5353
+ blkoff = i + 1;
5354
+ } else
5355
+ break;
5356
+ }
5357
+ brelse(bitmap_bh);
5358
+ if (i < max)
5359
+ break;
5360
+ }
5361
+
5362
+ if (group >= ext4_get_groups_count(sb) || i >= max) {
5363
+ *errp = -ENOSPC;
5364
+ return 0;
5365
+ }
5366
+
5367
+ block = ext4_group_first_block_no(sb, group) + i;
5368
+ ext4_mb_mark_bb(sb, block, 1, 1);
5369
+ ar->len = 1;
5370
+
5371
+ return block;
5372
+}
5373
+
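Illustrative sketch, not part of the patch: during fast-commit replay the buddy cache is not available, so the helper above simply walks the block bitmaps from the goal onwards, skipping bits that are already set or that replay has marked as excluded, and hands back a single block. The core of that scan is a find-next-zero-bit loop; a user-space model over a byte-per-bit bitmap (hypothetical helpers, not the kernel bitops):

    #include <stdio.h>

    #define NBITS 16

    /* 1 = already in use; excluded blocks are treated the same way */
    static const int used[NBITS]     = {1, 1, 0, 1, 0, 0};
    static const int excluded[NBITS] = {0, 0, 1, 0, 0, 0};

    static int find_free_from(int goal)
    {
        int i;

        for (i = goal; i < NBITS; i++) {
            if (used[i])
                continue;   /* like mb_find_next_zero_bit() skipping set bits */
            if (excluded[i])
                continue;   /* like ext4_fc_replay_check_excluded() */
            return i;
        }
        return -1;          /* -ENOSPC */
    }

    int main(void)
    {
        int goal = 1;

        printf("allocated bit %d (goal was %d)\n", find_free_from(goal), goal);  /* bit 4 */
        return 0;
    }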
5374
+static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block,
5375
+ unsigned long count)
5376
+{
5377
+ struct buffer_head *bitmap_bh;
5378
+ struct super_block *sb = inode->i_sb;
5379
+ struct ext4_group_desc *gdp;
5380
+ struct buffer_head *gdp_bh;
5381
+ ext4_group_t group;
5382
+ ext4_grpblk_t blkoff;
5383
+ int already_freed = 0, err, i;
5384
+
5385
+ ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
5386
+ bitmap_bh = ext4_read_block_bitmap(sb, group);
5387
+ if (IS_ERR(bitmap_bh)) {
5388
+ err = PTR_ERR(bitmap_bh);
5389
+ pr_warn("Failed to read block bitmap\n");
5390
+ return;
5391
+ }
5392
+ gdp = ext4_get_group_desc(sb, group, &gdp_bh);
5393
+ if (!gdp)
5394
+ return;
5395
+
5396
+ for (i = 0; i < count; i++) {
5397
+ if (!mb_test_bit(blkoff + i, bitmap_bh->b_data))
5398
+ already_freed++;
5399
+ }
5400
+ mb_clear_bits(bitmap_bh->b_data, blkoff, count);
5401
+ err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
5402
+ if (err)
5403
+ return;
5404
+ ext4_free_group_clusters_set(
5405
+ sb, gdp, ext4_free_group_clusters(sb, gdp) +
5406
+ count - already_freed);
5407
+ ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh);
5408
+ ext4_group_desc_csum_set(sb, group, gdp);
5409
+ ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
5410
+ sync_dirty_buffer(bitmap_bh);
5411
+ sync_dirty_buffer(gdp_bh);
5412
+ brelse(bitmap_bh);
5413
+}
5414
+
47245415 /**
4725
- * ext4_free_blocks() -- Free given blocks and update quota
5416
+ * ext4_mb_clear_bb() -- helper function for freeing blocks.
5417
+ * Used by ext4_free_blocks()
47265418 * @handle: handle for this transaction
47275419 * @inode: inode
4728
- * @block: start physical block to free
4729
- * @count: number of blocks to count
5420
+ * @bh: optional buffer of the block to be freed
5421
+ * @block: starting physical block to be freed
5422
+ * @count: number of blocks to be freed
47305423 * @flags: flags used by ext4_free_blocks
47315424 */
4732
-void ext4_free_blocks(handle_t *handle, struct inode *inode,
4733
- struct buffer_head *bh, ext4_fsblk_t block,
4734
- unsigned long count, int flags)
5425
+static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode,
5426
+ ext4_fsblk_t block, unsigned long count,
5427
+ int flags)
47355428 {
47365429 struct buffer_head *bitmap_bh = NULL;
47375430 struct super_block *sb = inode->i_sb;
47385431 struct ext4_group_desc *gdp;
5432
+ struct ext4_group_info *grp;
47395433 unsigned int overflow;
47405434 ext4_grpblk_t bit;
47415435 struct buffer_head *gd_bh;
....@@ -4746,82 +5440,23 @@
47465440 int err = 0;
47475441 int ret;
47485442
4749
- might_sleep();
4750
- if (bh) {
4751
- if (block)
4752
- BUG_ON(block != bh->b_blocknr);
4753
- else
4754
- block = bh->b_blocknr;
4755
- }
4756
-
47575443 sbi = EXT4_SB(sb);
5444
+
47585445 if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
47595446 !ext4_inode_block_valid(inode, block, count)) {
4760
- ext4_error(sb, "Freeing blocks not in datazone - "
4761
- "block = %llu, count = %lu", block, count);
5447
+ ext4_error(sb, "Freeing blocks in system zone - "
5448
+ "Block = %llu, count = %lu", block, count);
5449
+ /* err = 0. ext4_std_error should be a no op */
47625450 goto error_return;
47635451 }
4764
-
4765
- ext4_debug("freeing block %llu\n", block);
4766
- trace_ext4_free_blocks(inode, block, count, flags);
4767
-
4768
- if (bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
4769
- BUG_ON(count > 1);
4770
-
4771
- ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
4772
- inode, bh, block);
4773
- }
4774
-
4775
- /*
4776
- * If the extent to be freed does not begin on a cluster
4777
- * boundary, we need to deal with partial clusters at the
4778
- * beginning and end of the extent. Normally we will free
4779
- * blocks at the beginning or the end unless we are explicitly
4780
- * requested to avoid doing so.
4781
- */
4782
- overflow = EXT4_PBLK_COFF(sbi, block);
4783
- if (overflow) {
4784
- if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
4785
- overflow = sbi->s_cluster_ratio - overflow;
4786
- block += overflow;
4787
- if (count > overflow)
4788
- count -= overflow;
4789
- else
4790
- return;
4791
- } else {
4792
- block -= overflow;
4793
- count += overflow;
4794
- }
4795
- }
4796
- overflow = EXT4_LBLK_COFF(sbi, count);
4797
- if (overflow) {
4798
- if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
4799
- if (count > overflow)
4800
- count -= overflow;
4801
- else
4802
- return;
4803
- } else
4804
- count += sbi->s_cluster_ratio - overflow;
4805
- }
4806
-
4807
- if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
4808
- int i;
4809
- int is_metadata = flags & EXT4_FREE_BLOCKS_METADATA;
4810
-
4811
- for (i = 0; i < count; i++) {
4812
- cond_resched();
4813
- if (is_metadata)
4814
- bh = sb_find_get_block(inode->i_sb, block + i);
4815
- ext4_forget(handle, is_metadata, inode, bh, block + i);
4816
- }
4817
- }
5452
+ flags |= EXT4_FREE_BLOCKS_VALIDATED;
48185453
48195454 do_more:
48205455 overflow = 0;
48215456 ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
48225457
4823
- if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(
4824
- ext4_get_group_info(sb, block_group))))
5458
+ grp = ext4_get_group_info(sb, block_group);
5459
+ if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
48255460 return;
48265461
48275462 /*
....@@ -4832,6 +5467,8 @@
48325467 overflow = EXT4_C2B(sbi, bit) + count -
48335468 EXT4_BLOCKS_PER_GROUP(sb);
48345469 count -= overflow;
5470
+ /* The range changed so it's no longer validated */
5471
+ flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
48355472 }
48365473 count_clusters = EXT4_NUM_B2C(sbi, count);
48375474 bitmap_bh = ext4_read_block_bitmap(sb, block_group);
....@@ -4846,13 +5483,8 @@
48465483 goto error_return;
48475484 }
48485485
4849
- if (in_range(ext4_block_bitmap(sb, gdp), block, count) ||
4850
- in_range(ext4_inode_bitmap(sb, gdp), block, count) ||
4851
- in_range(block, ext4_inode_table(sb, gdp),
4852
- sbi->s_itb_per_group) ||
4853
- in_range(block + count - 1, ext4_inode_table(sb, gdp),
4854
- sbi->s_itb_per_group)) {
4855
-
5486
+ if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
5487
+ !ext4_inode_block_valid(inode, block, count)) {
48565488 ext4_error(sb, "Freeing blocks in system zone - "
48575489 "Block = %llu, count = %lu", block, count);
48585490 /* err = 0. ext4_std_error should be a no op */
....@@ -4918,11 +5550,11 @@
49185550 * them with group lock_held
49195551 */
49205552 if (test_opt(sb, DISCARD)) {
4921
- err = ext4_issue_discard(sb, block_group, bit, count,
4922
- NULL);
5553
+ err = ext4_issue_discard(sb, block_group, bit,
5554
+ count_clusters, NULL);
49235555 if (err && err != -EOPNOTSUPP)
49245556 ext4_msg(sb, KERN_WARNING, "discard request in"
4925
- " group:%d block:%d count:%lu failed"
5557
+ " group:%u block:%d count:%lu failed"
49265558 " with %d", block_group, bit, count,
49275559 err);
49285560 } else
....@@ -4946,9 +5578,17 @@
49465578 flex_group)->free_clusters);
49475579 }
49485580
4949
- if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
4950
- dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
4951
- percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters);
5581
+ /*
5582
+ * on a bigalloc file system, defer the s_freeclusters_counter
5583
+ * update to the caller (ext4_remove_space and friends) so they
5584
+ * can determine if a cluster freed here should be rereserved
5585
+ */
5586
+ if (!(flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)) {
5587
+ if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
5588
+ dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
5589
+ percpu_counter_add(&sbi->s_freeclusters_counter,
5590
+ count_clusters);
5591
+ }
49525592
49535593 ext4_mb_unload_buddy(&e4b);
49545594
....@@ -4966,11 +5606,116 @@
49665606 block += count;
49675607 count = overflow;
49685608 put_bh(bitmap_bh);
5609
+ /* The range changed so it's no longer validated */
5610
+ flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
49695611 goto do_more;
49705612 }
49715613 error_return:
49725614 brelse(bitmap_bh);
49735615 ext4_std_error(sb, err);
5616
+ return;
5617
+}
5618
+
5619
+/**
5620
+ * ext4_free_blocks() -- Free given blocks and update quota
5621
+ * @handle: handle for this transaction
5622
+ * @inode: inode
5623
+ * @bh: optional buffer of the block to be freed
5624
+ * @block: starting physical block to be freed
5625
+ * @count: number of blocks to be freed
5626
+ * @flags: flags used by ext4_free_blocks
5627
+ */
5628
+void ext4_free_blocks(handle_t *handle, struct inode *inode,
5629
+ struct buffer_head *bh, ext4_fsblk_t block,
5630
+ unsigned long count, int flags)
5631
+{
5632
+ struct super_block *sb = inode->i_sb;
5633
+ unsigned int overflow;
5634
+ struct ext4_sb_info *sbi;
5635
+
5636
+ sbi = EXT4_SB(sb);
5637
+
5638
+ if (bh) {
5639
+ if (block)
5640
+ BUG_ON(block != bh->b_blocknr);
5641
+ else
5642
+ block = bh->b_blocknr;
5643
+ }
5644
+
5645
+ if (sbi->s_mount_state & EXT4_FC_REPLAY) {
5646
+ ext4_free_blocks_simple(inode, block, EXT4_NUM_B2C(sbi, count));
5647
+ return;
5648
+ }
5649
+
5650
+ might_sleep();
5651
+
5652
+ if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
5653
+ !ext4_inode_block_valid(inode, block, count)) {
5654
+ ext4_error(sb, "Freeing blocks not in datazone - "
5655
+ "block = %llu, count = %lu", block, count);
5656
+ return;
5657
+ }
5658
+ flags |= EXT4_FREE_BLOCKS_VALIDATED;
5659
+
5660
+ ext4_debug("freeing block %llu\n", block);
5661
+ trace_ext4_free_blocks(inode, block, count, flags);
5662
+
5663
+ if (bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
5664
+ BUG_ON(count > 1);
5665
+
5666
+ ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
5667
+ inode, bh, block);
5668
+ }
5669
+
5670
+ /*
5671
+ * If the extent to be freed does not begin on a cluster
5672
+ * boundary, we need to deal with partial clusters at the
5673
+ * beginning and end of the extent. Normally we will free
5674
+ * blocks at the beginning or the end unless we are explicitly
5675
+ * requested to avoid doing so.
5676
+ */
5677
+ overflow = EXT4_PBLK_COFF(sbi, block);
5678
+ if (overflow) {
5679
+ if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
5680
+ overflow = sbi->s_cluster_ratio - overflow;
5681
+ block += overflow;
5682
+ if (count > overflow)
5683
+ count -= overflow;
5684
+ else
5685
+ return;
5686
+ } else {
5687
+ block -= overflow;
5688
+ count += overflow;
5689
+ }
5690
+ /* The range changed so it's no longer validated */
5691
+ flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
5692
+ }
5693
+ overflow = EXT4_LBLK_COFF(sbi, count);
5694
+ if (overflow) {
5695
+ if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
5696
+ if (count > overflow)
5697
+ count -= overflow;
5698
+ else
5699
+ return;
5700
+ } else
5701
+ count += sbi->s_cluster_ratio - overflow;
5702
+ /* The range changed so it's no longer validated */
5703
+ flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
5704
+ }
5705
+
5706
+ if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
5707
+ int i;
5708
+ int is_metadata = flags & EXT4_FREE_BLOCKS_METADATA;
5709
+
5710
+ for (i = 0; i < count; i++) {
5711
+ cond_resched();
5712
+ if (is_metadata)
5713
+ bh = sb_find_get_block(inode->i_sb, block + i);
5714
+ ext4_forget(handle, is_metadata, inode, bh, block + i);
5715
+ }
5716
+ }
5717
+
5718
+ ext4_mb_clear_bb(handle, inode, block, count, flags);
49745719 return;
49755720 }
49765721
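Illustrative sketch, not part of the patch: on a bigalloc file system the wrapper above first widens (or, with the NOFREE_*_CLUSTER flags, shrinks) the request to whole clusters, clearing EXT4_FREE_BLOCKS_VALIDATED whenever the range changes so ext4_mb_clear_bb() re-validates it. A worked example of the default rounding, assuming a cluster ratio of 8 blocks per cluster:

    #include <stdio.h>

    int main(void)
    {
        unsigned long long block = 1002;    /* start of the extent to free */
        unsigned long count = 20;           /* number of blocks to free */
        unsigned long ratio = 8;            /* assumed sbi->s_cluster_ratio */
        unsigned long overflow;

        /* like EXT4_PBLK_COFF(): offset of 'block' inside its cluster */
        overflow = block % ratio;           /* 1002 % 8 = 2 */
        if (overflow) {
            block -= overflow;              /* 1000: widen down to the boundary */
            count += overflow;              /* 22 */
        }

        /* like EXT4_LBLK_COFF(): partial cluster left at the end of the range */
        overflow = count % ratio;           /* 22 % 8 = 6 */
        if (overflow)
            count += ratio - overflow;      /* 24: three whole clusters */

        printf("free blocks %llu..%llu (%lu blocks)\n",
               block, block + count - 1, count);    /* 1000..1023 */
        return 0;
    }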
....@@ -5030,11 +5775,7 @@
50305775 goto error_return;
50315776 }
50325777
5033
- if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
5034
- in_range(ext4_inode_bitmap(sb, desc), block, count) ||
5035
- in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
5036
- in_range(block + count - 1, ext4_inode_table(sb, desc),
5037
- sbi->s_itb_per_group)) {
5778
+ if (!ext4_sb_block_valid(sb, NULL, block, count)) {
50385779 ext4_error(sb, "Adding blocks in system zones - "
50395780 "Block = %llu, count = %lu",
50405781 block, count);
....@@ -5119,19 +5860,19 @@
51195860 * @sb: super block for the file system
51205861 * @start: starting block of the free extent in the alloc. group
51215862 * @count: number of blocks to TRIM
5122
- * @group: alloc. group we are working with
51235863 * @e4b: ext4 buddy for the group
51245864 *
51255865 * Trim "count" blocks starting at "start" in the "group". To assure that no
51265866 * one will allocate those blocks, mark them as used in the buddy bitmap. This must
51275867 * be called under the group lock.
51285868 */
5129
-static int ext4_trim_extent(struct super_block *sb, int start, int count,
5130
- ext4_group_t group, struct ext4_buddy *e4b)
5869
+static int ext4_trim_extent(struct super_block *sb,
5870
+ int start, int count, struct ext4_buddy *e4b)
51315871 __releases(bitlock)
51325872 __acquires(bitlock)
51335873 {
51345874 struct ext4_free_extent ex;
5875
+ ext4_group_t group = e4b->bd_group;
51355876 int ret = 0;
51365877
51375878 trace_ext4_trim_extent(sb, group, start, count);
....@@ -5152,6 +5893,71 @@
51525893 ext4_lock_group(sb, group);
51535894 mb_free_blocks(NULL, e4b, start, ex.fe_len);
51545895 return ret;
5896
+}
5897
+
5898
+static ext4_grpblk_t ext4_last_grp_cluster(struct super_block *sb,
5899
+ ext4_group_t grp)
5900
+{
5901
+ if (grp < ext4_get_groups_count(sb))
5902
+ return EXT4_CLUSTERS_PER_GROUP(sb) - 1;
5903
+ return (ext4_blocks_count(EXT4_SB(sb)->s_es) -
5904
+ ext4_group_first_block_no(sb, grp) - 1) >>
5905
+ EXT4_CLUSTER_BITS(sb);
5906
+}
5907
+
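Illustrative sketch, not part of the patch: the second branch above derives the last usable cluster of a partial final group from the total block count rather than from EXT4_CLUSTERS_PER_GROUP(). A worked example of that arithmetic, assuming 32768 blocks per group, one block per cluster (cluster_bits = 0), a 100000-block file system, and ignoring s_first_data_block for simplicity:

    #include <stdio.h>

    int main(void)
    {
        unsigned long long blocks_count = 100000;   /* assumed file system size */
        unsigned long blocks_per_group = 32768;
        unsigned cluster_bits = 0;                  /* 1 block per cluster */
        unsigned long grp = 3;                      /* the last, partial group */

        unsigned long long first_block = grp * blocks_per_group;       /* 98304 */
        unsigned long long last_cluster =
            (blocks_count - first_block - 1) >> cluster_bits;          /* 1695 */

        printf("last group covers clusters 0..%llu\n", last_cluster);
        return 0;
    }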
5908
+static bool ext4_trim_interrupted(void)
5909
+{
5910
+ return fatal_signal_pending(current) || freezing(current);
5911
+}
5912
+
5913
+static int ext4_try_to_trim_range(struct super_block *sb,
5914
+ struct ext4_buddy *e4b, ext4_grpblk_t start,
5915
+ ext4_grpblk_t max, ext4_grpblk_t minblocks)
5916
+{
5917
+ ext4_grpblk_t next, count, free_count;
5918
+ bool set_trimmed = false;
5919
+ void *bitmap;
5920
+
5921
+ bitmap = e4b->bd_bitmap;
5922
+ if (start == 0 && max >= ext4_last_grp_cluster(sb, e4b->bd_group))
5923
+ set_trimmed = true;
5924
+ start = max(e4b->bd_info->bb_first_free, start);
5925
+ count = 0;
5926
+ free_count = 0;
5927
+
5928
+ while (start <= max) {
5929
+ start = mb_find_next_zero_bit(bitmap, max + 1, start);
5930
+ if (start > max)
5931
+ break;
5932
+ next = mb_find_next_bit(bitmap, max + 1, start);
5933
+
5934
+ if ((next - start) >= minblocks) {
5935
+ int ret = ext4_trim_extent(sb, start, next - start, e4b);
5936
+
5937
+ if (ret && ret != -EOPNOTSUPP)
5938
+ return count;
5939
+ count += next - start;
5940
+ }
5941
+ free_count += next - start;
5942
+ start = next + 1;
5943
+
5944
+ if (ext4_trim_interrupted())
5945
+ return count;
5946
+
5947
+ if (need_resched()) {
5948
+ ext4_unlock_group(sb, e4b->bd_group);
5949
+ cond_resched();
5950
+ ext4_lock_group(sb, e4b->bd_group);
5951
+ }
5952
+
5953
+ if ((e4b->bd_info->bb_free - free_count) < minblocks)
5954
+ break;
5955
+ }
5956
+
5957
+ if (set_trimmed)
5958
+ EXT4_MB_GRP_SET_TRIMMED(e4b->bd_info);
5959
+
5960
+ return count;
51555961 }
51565962
51575963 /**
....@@ -5177,10 +5983,8 @@
51775983 ext4_grpblk_t start, ext4_grpblk_t max,
51785984 ext4_grpblk_t minblocks)
51795985 {
5180
- void *bitmap;
5181
- ext4_grpblk_t next, count = 0, free_count = 0;
51825986 struct ext4_buddy e4b;
5183
- int ret = 0;
5987
+ int ret;
51845988
51855989 trace_ext4_trim_all_free(sb, group, start, max);
51865990
....@@ -5190,58 +5994,20 @@
51905994 ret, group);
51915995 return ret;
51925996 }
5193
- bitmap = e4b.bd_bitmap;
51945997
51955998 ext4_lock_group(sb, group);
5196
- if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) &&
5197
- minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks))
5198
- goto out;
51995999
5200
- start = (e4b.bd_info->bb_first_free > start) ?
5201
- e4b.bd_info->bb_first_free : start;
6000
+ if (!EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) ||
6001
+ minblocks < EXT4_SB(sb)->s_last_trim_minblks)
6002
+ ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks);
6003
+ else
6004
+ ret = 0;
52026005
5203
- while (start <= max) {
5204
- start = mb_find_next_zero_bit(bitmap, max + 1, start);
5205
- if (start > max)
5206
- break;
5207
- next = mb_find_next_bit(bitmap, max + 1, start);
5208
-
5209
- if ((next - start) >= minblocks) {
5210
- ret = ext4_trim_extent(sb, start,
5211
- next - start, group, &e4b);
5212
- if (ret && ret != -EOPNOTSUPP)
5213
- break;
5214
- ret = 0;
5215
- count += next - start;
5216
- }
5217
- free_count += next - start;
5218
- start = next + 1;
5219
-
5220
- if (fatal_signal_pending(current)) {
5221
- count = -ERESTARTSYS;
5222
- break;
5223
- }
5224
-
5225
- if (need_resched()) {
5226
- ext4_unlock_group(sb, group);
5227
- cond_resched();
5228
- ext4_lock_group(sb, group);
5229
- }
5230
-
5231
- if ((e4b.bd_info->bb_free - free_count) < minblocks)
5232
- break;
5233
- }
5234
-
5235
- if (!ret) {
5236
- ret = count;
5237
- EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
5238
- }
5239
-out:
52406006 ext4_unlock_group(sb, group);
52416007 ext4_mb_unload_buddy(&e4b);
52426008
52436009 ext4_debug("trimmed %d blocks in the group %d\n",
5244
- count, group);
6010
+ ret, group);
52456011
52466012 return ret;
52476013 }
....@@ -5286,7 +6052,7 @@
52866052 if (minlen > EXT4_CLUSTERS_PER_GROUP(sb))
52876053 goto out;
52886054 }
5289
- if (end >= max_blks)
6055
+ if (end >= max_blks - 1)
52906056 end = max_blks - 1;
52916057 if (end <= first_data_blk)
52926058 goto out;
....@@ -5303,7 +6069,11 @@
53036069 end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
53046070
53056071 for (group = first_group; group <= last_group; group++) {
6072
+ if (ext4_trim_interrupted())
6073
+ break;
53066074 grp = ext4_get_group_info(sb, group);
6075
+ if (!grp)
6076
+ continue;
53076077 /* We only do this if the grp has never been initialized */
53086078 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
53096079 ret = ext4_mb_init_group(sb, group, GFP_NOFS);
....@@ -5319,10 +6089,9 @@
53196089 */
53206090 if (group == last_group)
53216091 end = last_cluster;
5322
-
53236092 if (grp->bb_free >= minlen) {
53246093 cnt = ext4_trim_all_free(sb, group, first_cluster,
5325
- end, minlen);
6094
+ end, minlen);
53266095 if (cnt < 0) {
53276096 ret = cnt;
53286097 break;
....@@ -5338,7 +6107,7 @@
53386107 }
53396108
53406109 if (!ret)
5341
- atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen);
6110
+ EXT4_SB(sb)->s_last_trim_minblks = minlen;
53426111
53436112 out:
53446113 range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits;
....@@ -5367,8 +6136,7 @@
53676136
53686137 ext4_lock_group(sb, group);
53696138
5370
- start = (e4b.bd_info->bb_first_free > start) ?
5371
- e4b.bd_info->bb_first_free : start;
6139
+ start = max(e4b.bd_info->bb_first_free, start);
53726140 if (end >= EXT4_CLUSTERS_PER_GROUP(sb))
53736141 end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
53746142