forked from ~ljy/RK356X_SDK_RELEASE

hc
2023-12-09 958e46acc8e900e8569dd467c1af9b8d2d019394
kernel/fs/ext4/mballoc.c
....@@ -18,13 +18,6 @@
1818 #include <linux/backing-dev.h>
1919 #include <trace/events/ext4.h>
2020
21
-#ifdef CONFIG_EXT4_DEBUG
22
-ushort ext4_mballoc_debug __read_mostly;
23
-
24
-module_param_named(mballoc_debug, ext4_mballoc_debug, ushort, 0644);
25
-MODULE_PARM_DESC(mballoc_debug, "Debugging level for ext4's mballoc");
26
-#endif
27
-
2821 /*
2922 * MUSTDO:
3023 * - test ext4_ext_search_left() and ext4_ext_search_right()
....@@ -131,7 +124,7 @@
131124 * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in
132125 * terms of number of blocks. If we have mounted the file system with -O
133126 * stripe=<value> option the group prealloc request is normalized to the
134
- * the smallest multiple of the stripe value (sbi->s_stripe) which is
127
+ * smallest multiple of the stripe value (sbi->s_stripe) which is
135128 * greater than the default mb_group_prealloc.
136129 *
137130 * The regular allocator (using the buddy cache) supports a few tunables.
....@@ -356,6 +349,36 @@
356349 ext4_group_t group);
357350 static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
358351 ext4_group_t group);
352
+static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac);
353
+
354
+/*
355
+ * The algorithm using this percpu seq counter is as follows:
356
+ * 1. We sample the percpu discard_pa_seq counter before trying for block
357
+ * allocation in ext4_mb_new_blocks().
358
+ * 2. We increment this percpu discard_pa_seq counter when we either allocate
359
+ * or free these blocks i.e. while marking those blocks as used/free in
360
+ * mb_mark_used()/mb_free_blocks().
361
+ * 3. We also increment this percpu seq counter when we successfully identify
362
+ * that the bb_prealloc_list is not empty and hence proceed for discarding
363
+ * of those PAs inside ext4_mb_discard_group_preallocations().
364
+ *
365
+ * To keep the regular fast path of block allocation unaffected, as a small
366
+ * optimization we sample only the local cpu's percpu seq counter. Only when
367
+ * the block allocation fails and no freed blocks were found do we sample the
368
+ * percpu seq counter for all cpus, using ext4_get_discard_pa_seq_sum() below.
369
+ * This happens after making sure that all the PAs on grp->bb_prealloc_list
370
+ * got freed, or that the list was empty to begin with.
371
+ */
372
+static DEFINE_PER_CPU(u64, discard_pa_seq);
373
+static inline u64 ext4_get_discard_pa_seq_sum(void)
374
+{
375
+ int __cpu;
376
+ u64 __seq = 0;
377
+
378
+ for_each_possible_cpu(__cpu)
379
+ __seq += per_cpu(discard_pa_seq, __cpu);
380
+ return __seq;
381
+}
359382
360383 static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
361384 {
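
(Illustrative aside, not part of the patch: a minimal userspace C sketch of the
sampled sequence-counter pattern described in the comment above. The per-CPU
counters are modelled as a plain array; NR_CPUS, the function names and the
"allocation" outcome are made up for the example.)

#include <stdio.h>
#include <stdint.h>

#define NR_CPUS 4

/* stand-in for DEFINE_PER_CPU(u64, discard_pa_seq) */
static uint64_t discard_seq[NR_CPUS];

/* analogue of ext4_get_discard_pa_seq_sum(): sum over every "CPU" */
static uint64_t discard_seq_sum(void)
{
        uint64_t sum = 0;

        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                sum += discard_seq[cpu];
        return sum;
}

int main(void)
{
        int this_cpu = 0;

        /* 1. cheap: sample only the local counter before allocating */
        uint64_t seq = discard_seq[this_cpu];

        /* meanwhile another CPU frees blocks / discards PAs and bumps its counter */
        discard_seq[2]++;

        /* the allocation "failed" and nothing was freed locally; only now
         * pay for the expensive sum over all CPUs */
        if (discard_seq_sum() != seq)
                printf("seq changed since sampling -> worth retrying\n");
        else
                printf("seq unchanged -> no point retrying\n");
        return 0;
}
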
....@@ -493,6 +516,8 @@
493516
494517 static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
495518 {
519
+ if (unlikely(e4b->bd_info->bb_bitmap == NULL))
520
+ return;
496521 if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) {
497522 unsigned char *b1, *b2;
498523 int i;
....@@ -511,6 +536,31 @@
511536 }
512537 }
513538
539
+static void mb_group_bb_bitmap_alloc(struct super_block *sb,
540
+ struct ext4_group_info *grp, ext4_group_t group)
541
+{
542
+ struct buffer_head *bh;
543
+
544
+ grp->bb_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS);
545
+ if (!grp->bb_bitmap)
546
+ return;
547
+
548
+ bh = ext4_read_block_bitmap(sb, group);
549
+ if (IS_ERR_OR_NULL(bh)) {
550
+ kfree(grp->bb_bitmap);
551
+ grp->bb_bitmap = NULL;
552
+ return;
553
+ }
554
+
555
+ memcpy(grp->bb_bitmap, bh->b_data, sb->s_blocksize);
556
+ put_bh(bh);
557
+}
558
+
559
+static void mb_group_bb_bitmap_free(struct ext4_group_info *grp)
560
+{
561
+ kfree(grp->bb_bitmap);
562
+}
563
+
514564 #else
515565 static inline void mb_free_blocks_double(struct inode *inode,
516566 struct ext4_buddy *e4b, int first, int count)
....@@ -523,6 +573,17 @@
523573 return;
524574 }
525575 static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
576
+{
577
+ return;
578
+}
579
+
580
+static inline void mb_group_bb_bitmap_alloc(struct super_block *sb,
581
+ struct ext4_group_info *grp, ext4_group_t group)
582
+{
583
+ return;
584
+}
585
+
586
+static inline void mb_group_bb_bitmap_free(struct ext4_group_info *grp)
526587 {
527588 return;
528589 }
....@@ -558,11 +619,8 @@
558619 void *buddy;
559620 void *buddy2;
560621
561
- {
562
- static int mb_check_counter;
563
- if (mb_check_counter++ % 100 != 0)
564
- return 0;
565
- }
622
+ if (e4b->bd_info->bb_check_counter++ % 10)
623
+ return 0;
566624
567625 while (order > 1) {
568626 buddy = mb_find_buddy(e4b, order, &max);
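
(Illustrative aside, not part of the patch: the hunk above replaces a global
"run mb_check_buddy() only every 100th call" counter with a per-buddy counter
checked every 10th call. A small userspace C sketch of that rate-limiting
pattern follows; the struct and function names are invented for the example.)

#include <stdio.h>

/* per-object counter, mirroring bb_check_counter in the hunk above */
struct buddy_info {
        unsigned int check_counter;
};

/* stand-in for the expensive mb_check_buddy() consistency walk */
static int expensive_check(struct buddy_info *bi)
{
        printf("running full consistency check\n");
        return 0;
}

/* run the expensive check only on every 10th call for this object */
static int maybe_check(struct buddy_info *bi)
{
        if (bi->check_counter++ % 10)
                return 0;               /* skipped 9 times out of 10 */
        return expensive_check(bi);
}

int main(void)
{
        struct buddy_info bi = { 0 };

        for (int i = 0; i < 25; i++)
                maybe_check(&bi);       /* checks run on calls 0, 10 and 20 */
        return 0;
}
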
....@@ -820,13 +878,13 @@
820878 char *bitmap;
821879 struct ext4_group_info *grinfo;
822880
823
- mb_debug(1, "init page %lu\n", page->index);
824
-
825881 inode = page->mapping->host;
826882 sb = inode->i_sb;
827883 ngroups = ext4_get_groups_count(sb);
828884 blocksize = i_blocksize(inode);
829885 blocks_per_page = PAGE_SIZE / blocksize;
886
+
887
+ mb_debug(sb, "init page %lu\n", page->index);
830888
831889 groups_per_page = blocks_per_page >> 1;
832890 if (groups_per_page == 0)
....@@ -861,13 +919,13 @@
861919 bh[i] = NULL;
862920 continue;
863921 }
864
- bh[i] = ext4_read_block_bitmap_nowait(sb, group);
922
+ bh[i] = ext4_read_block_bitmap_nowait(sb, group, false);
865923 if (IS_ERR(bh[i])) {
866924 err = PTR_ERR(bh[i]);
867925 bh[i] = NULL;
868926 goto out;
869927 }
870
- mb_debug(1, "read bitmap for group %u\n", group);
928
+ mb_debug(sb, "read bitmap for group %u\n", group);
871929 }
872930
873931 /* wait for I/O completion */
....@@ -912,7 +970,7 @@
912970 if ((first_block + i) & 1) {
913971 /* this is block of buddy */
914972 BUG_ON(incore == NULL);
915
- mb_debug(1, "put buddy for group %u in page %lu/%x\n",
973
+ mb_debug(sb, "put buddy for group %u in page %lu/%x\n",
916974 group, page->index, i * blocksize);
917975 trace_ext4_mb_buddy_bitmap_load(sb, group);
918976 grinfo = ext4_get_group_info(sb, group);
....@@ -932,7 +990,7 @@
932990 } else {
933991 /* this is block of bitmap */
934992 BUG_ON(incore != NULL);
935
- mb_debug(1, "put bitmap for group %u in page %lu/%x\n",
993
+ mb_debug(sb, "put bitmap for group %u in page %lu/%x\n",
936994 group, page->index, i * blocksize);
937995 trace_ext4_mb_bitmap_load(sb, group);
938996
....@@ -1038,7 +1096,7 @@
10381096 int ret = 0;
10391097
10401098 might_sleep();
1041
- mb_debug(1, "init group %u\n", group);
1099
+ mb_debug(sb, "init group %u\n", group);
10421100 this_grp = ext4_get_group_info(sb, group);
10431101 /*
10441102 * This ensures that we don't reinit the buddy cache
....@@ -1110,7 +1168,7 @@
11101168 struct inode *inode = sbi->s_buddy_cache;
11111169
11121170 might_sleep();
1113
- mb_debug(1, "load group %u\n", group);
1171
+ mb_debug(sb, "load group %u\n", group);
11141172
11151173 blocks_per_page = PAGE_SIZE / sb->s_blocksize;
11161174 grp = ext4_get_group_info(sb, group);
....@@ -1217,9 +1275,6 @@
12171275 /* Pages marked accessed already */
12181276 e4b->bd_buddy_page = page;
12191277 e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
1220
-
1221
- BUG_ON(e4b->bd_bitmap_page == NULL);
1222
- BUG_ON(e4b->bd_buddy_page == NULL);
12231278
12241279 return 0;
12251280
....@@ -1336,9 +1391,6 @@
13361391 }
13371392 }
13381393
1339
-/*
1340
- * _________________________________________________________________ */
1341
-
13421394 static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side)
13431395 {
13441396 if (mb_test_bit(*bit + side, bitmap)) {
....@@ -1430,6 +1482,7 @@
14301482 mb_check_buddy(e4b);
14311483 mb_free_blocks_double(inode, e4b, first, count);
14321484
1485
+ this_cpu_inc(discard_pa_seq);
14331486 e4b->bd_info->bb_free += count;
14341487 if (first < e4b->bd_info->bb_first_free)
14351488 e4b->bd_info->bb_first_free = first;
....@@ -1449,14 +1502,16 @@
14491502
14501503 blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
14511504 blocknr += EXT4_C2B(sbi, block);
1452
- ext4_grp_locked_error(sb, e4b->bd_group,
1453
- inode ? inode->i_ino : 0,
1454
- blocknr,
1455
- "freeing already freed block "
1456
- "(bit %u); block bitmap corrupt.",
1457
- block);
1458
- ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
1505
+ if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
1506
+ ext4_grp_locked_error(sb, e4b->bd_group,
1507
+ inode ? inode->i_ino : 0,
1508
+ blocknr,
1509
+ "freeing already freed block (bit %u); block bitmap corrupt.",
1510
+ block);
1511
+ ext4_mark_group_bitmap_corrupted(
1512
+ sb, e4b->bd_group,
14591513 EXT4_GROUP_INFO_BBITMAP_CORRUPT);
1514
+ }
14601515 mb_regenerate_buddy(e4b);
14611516 goto done;
14621517 }
....@@ -1572,6 +1627,7 @@
15721627 mb_check_buddy(e4b);
15731628 mb_mark_used_double(e4b, start, len);
15741629
1630
+ this_cpu_inc(discard_pa_seq);
15751631 e4b->bd_info->bb_free -= len;
15761632 if (e4b->bd_info->bb_first_free == start)
15771633 e4b->bd_info->bb_first_free += len;
....@@ -1671,11 +1727,15 @@
16711727 sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
16721728 spin_unlock(&sbi->s_md_lock);
16731729 }
1674
-}
1730
+ /*
1731
+ * As we've just preallocated more space than
1732
+ * user requested originally, we store allocated
1733
+ * space in a special descriptor.
1734
+ */
1735
+ if (ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
1736
+ ext4_mb_new_preallocation(ac);
16751737
1676
-/*
1677
- * regular allocator, for general purposes allocation
1678
- */
1738
+}
16791739
16801740 static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
16811741 struct ext4_buddy *e4b,
....@@ -1919,7 +1979,7 @@
19191979
19201980 ext4_mb_use_best_found(ac, e4b);
19211981
1922
- BUG_ON(ac->ac_b_ex.fe_len != ac->ac_g_ex.fe_len);
1982
+ BUG_ON(ac->ac_f_ex.fe_len != ac->ac_g_ex.fe_len);
19231983
19241984 if (EXT4_SB(sb)->s_mb_stats)
19251985 atomic_inc(&EXT4_SB(sb)->s_bal_2orders);
....@@ -1956,7 +2016,7 @@
19562016 /*
19572017 * IF we have corrupt bitmap, we won't find any
19582018 * free blocks even though group info says we
1959
- * we have free blocks
2019
+ * have free blocks
19602020 */
19612021 ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
19622022 "%d free clusters as per "
....@@ -2036,39 +2096,29 @@
20362096 }
20372097
20382098 /*
2039
- * This is now called BEFORE we load the buddy bitmap.
2099
+ * This is also called BEFORE we load the buddy bitmap.
20402100 * Returns either 1 or 0 indicating that the group is either suitable
2041
- * for the allocation or not. In addition it can also return negative
2042
- * error code when something goes wrong.
2101
+ * for the allocation or not.
20432102 */
2044
-static int ext4_mb_good_group(struct ext4_allocation_context *ac,
2103
+static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
20452104 ext4_group_t group, int cr)
20462105 {
2047
- unsigned free, fragments;
2106
+ ext4_grpblk_t free, fragments;
20482107 int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
20492108 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
20502109
20512110 BUG_ON(cr < 0 || cr >= 4);
20522111
2112
+ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
2113
+ return false;
2114
+
20532115 free = grp->bb_free;
20542116 if (free == 0)
2055
- return 0;
2056
- if (cr <= 2 && free < ac->ac_g_ex.fe_len)
2057
- return 0;
2058
-
2059
- if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
2060
- return 0;
2061
-
2062
- /* We only do this if the grp has never been initialized */
2063
- if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
2064
- int ret = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS);
2065
- if (ret)
2066
- return ret;
2067
- }
2117
+ return false;
20682118
20692119 fragments = grp->bb_fragments;
20702120 if (fragments == 0)
2071
- return 0;
2121
+ return false;
20722122
20732123 switch (cr) {
20742124 case 0:
....@@ -2078,42 +2128,185 @@
20782128 if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
20792129 (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
20802130 ((group % flex_size) == 0))
2081
- return 0;
2131
+ return false;
20822132
2083
- if ((ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) ||
2084
- (free / fragments) >= ac->ac_g_ex.fe_len)
2085
- return 1;
2133
+ if (free < ac->ac_g_ex.fe_len)
2134
+ return false;
2135
+
2136
+ if (ac->ac_2order > ac->ac_sb->s_blocksize_bits+1)
2137
+ return true;
20862138
20872139 if (grp->bb_largest_free_order < ac->ac_2order)
2088
- return 0;
2140
+ return false;
20892141
2090
- return 1;
2142
+ return true;
20912143 case 1:
20922144 if ((free / fragments) >= ac->ac_g_ex.fe_len)
2093
- return 1;
2145
+ return true;
20942146 break;
20952147 case 2:
20962148 if (free >= ac->ac_g_ex.fe_len)
2097
- return 1;
2149
+ return true;
20982150 break;
20992151 case 3:
2100
- return 1;
2152
+ return true;
21012153 default:
21022154 BUG();
21032155 }
21042156
2105
- return 0;
2157
+ return false;
2158
+}
2159
+
2160
+/*
2161
+ * This could return negative error code if something goes wrong
2162
+ * during ext4_mb_init_group(). This should not be called with
2163
+ * ext4_lock_group() held.
2164
+ */
2165
+static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
2166
+ ext4_group_t group, int cr)
2167
+{
2168
+ struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
2169
+ struct super_block *sb = ac->ac_sb;
2170
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
2171
+ bool should_lock = ac->ac_flags & EXT4_MB_STRICT_CHECK;
2172
+ ext4_grpblk_t free;
2173
+ int ret = 0;
2174
+
2175
+ if (should_lock)
2176
+ ext4_lock_group(sb, group);
2177
+ free = grp->bb_free;
2178
+ if (free == 0)
2179
+ goto out;
2180
+ if (cr <= 2 && free < ac->ac_g_ex.fe_len)
2181
+ goto out;
2182
+ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
2183
+ goto out;
2184
+ if (should_lock)
2185
+ ext4_unlock_group(sb, group);
2186
+
2187
+ /* We only do this if the grp has never been initialized */
2188
+ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
2189
+ struct ext4_group_desc *gdp =
2190
+ ext4_get_group_desc(sb, group, NULL);
2191
+ int ret;
2192
+
2193
+ /* cr=0/1 is a very optimistic search to find large
2194
+ * good chunks almost for free. If buddy data is not
2195
+ * ready, then this optimization makes no sense. But
2196
+ * we never skip the first block group in a flex_bg,
2197
+ * since this gets used for metadata block allocation,
2198
+ * and we want to make sure we locate metadata blocks
2199
+ * in the first block group in the flex_bg if possible.
2200
+ */
2201
+ if (cr < 2 &&
2202
+ (!sbi->s_log_groups_per_flex ||
2203
+ ((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) &&
2204
+ !(ext4_has_group_desc_csum(sb) &&
2205
+ (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))))
2206
+ return 0;
2207
+ ret = ext4_mb_init_group(sb, group, GFP_NOFS);
2208
+ if (ret)
2209
+ return ret;
2210
+ }
2211
+
2212
+ if (should_lock)
2213
+ ext4_lock_group(sb, group);
2214
+ ret = ext4_mb_good_group(ac, group, cr);
2215
+out:
2216
+ if (should_lock)
2217
+ ext4_unlock_group(sb, group);
2218
+ return ret;
2219
+}
2220
+
2221
+/*
2222
+ * Start prefetching @nr block bitmaps starting at @group.
2223
+ * Return the next group which needs to be prefetched.
2224
+ */
2225
+ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group,
2226
+ unsigned int nr, int *cnt)
2227
+{
2228
+ ext4_group_t ngroups = ext4_get_groups_count(sb);
2229
+ struct buffer_head *bh;
2230
+ struct blk_plug plug;
2231
+
2232
+ blk_start_plug(&plug);
2233
+ while (nr-- > 0) {
2234
+ struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group,
2235
+ NULL);
2236
+ struct ext4_group_info *grp = ext4_get_group_info(sb, group);
2237
+
2238
+ /*
2239
+ * Prefetch block groups that have free blocks; but don't
2240
+ * bother if the group is marked uninitialized on disk, since
2241
+ * reading its bitmap then requires no I/O. Also only attempt
2242
+ * the prefetch once, so that we avoid the getblk() call, which
2243
+ * can be expensive.
2244
+ */
2245
+ if (!EXT4_MB_GRP_TEST_AND_SET_READ(grp) &&
2246
+ EXT4_MB_GRP_NEED_INIT(grp) &&
2247
+ ext4_free_group_clusters(sb, gdp) > 0 &&
2248
+ !(ext4_has_group_desc_csum(sb) &&
2249
+ (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) {
2250
+ bh = ext4_read_block_bitmap_nowait(sb, group, true);
2251
+ if (bh && !IS_ERR(bh)) {
2252
+ if (!buffer_uptodate(bh) && cnt)
2253
+ (*cnt)++;
2254
+ brelse(bh);
2255
+ }
2256
+ }
2257
+ if (++group >= ngroups)
2258
+ group = 0;
2259
+ }
2260
+ blk_finish_plug(&plug);
2261
+ return group;
2262
+}
2263
+
2264
+/*
2265
+ * Prefetching reads the block bitmap into the buffer cache; but we
2266
+ * need to make sure that the buddy bitmap in the page cache has been
2267
+ * initialized. Note that ext4_mb_init_group() will block if the I/O
2268
+ * is not yet completed, or indeed if the I/O was never initiated
2269
+ * because ext4_mb_prefetch() did not start it.
2270
+ *
2271
+ * TODO: We should actually kick off the buddy bitmap setup in a work
2272
+ * queue when the buffer I/O is completed, so that we don't block
2273
+ * waiting for the block allocation bitmap read to finish when
2274
+ * ext4_mb_prefetch_fini is called from ext4_mb_regular_allocator().
2275
+ */
2276
+void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group,
2277
+ unsigned int nr)
2278
+{
2279
+ while (nr-- > 0) {
2280
+ struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group,
2281
+ NULL);
2282
+ struct ext4_group_info *grp = ext4_get_group_info(sb, group);
2283
+
2284
+ if (!group)
2285
+ group = ext4_get_groups_count(sb);
2286
+ group--;
2287
+ grp = ext4_get_group_info(sb, group);
2288
+
2289
+ if (EXT4_MB_GRP_NEED_INIT(grp) &&
2290
+ ext4_free_group_clusters(sb, gdp) > 0 &&
2291
+ !(ext4_has_group_desc_csum(sb) &&
2292
+ (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) {
2293
+ if (ext4_mb_init_group(sb, group, GFP_NOFS))
2294
+ break;
2295
+ }
2296
+ }
21062297 }
21072298
21082299 static noinline_for_stack int
21092300 ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
21102301 {
2111
- ext4_group_t ngroups, group, i;
2112
- int cr;
2302
+ ext4_group_t prefetch_grp = 0, ngroups, group, i;
2303
+ int cr = -1;
21132304 int err = 0, first_err = 0;
2305
+ unsigned int nr = 0, prefetch_ios = 0;
21142306 struct ext4_sb_info *sbi;
21152307 struct super_block *sb;
21162308 struct ext4_buddy e4b;
2309
+ int lost;
21172310
21182311 sb = ac->ac_sb;
21192312 sbi = EXT4_SB(sb);
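
(Illustrative aside, not part of the patch: ext4_mb_prefetch() above walks @nr
groups starting at @group, wrapping around at the last group, and returns where
the next prefetch pass should resume. A minimal userspace C sketch of that
wrap-around walk follows; the names are invented and no I/O is issued.)

#include <stdio.h>

/* Walk "nr" group numbers starting at "group", wrapping at "ngroups",
 * and return the group where the next prefetch pass should resume. */
static unsigned int prefetch_walk(unsigned int group, unsigned int ngroups,
                                  unsigned int nr)
{
        while (nr-- > 0) {
                printf("would prefetch bitmap of group %u\n", group);
                if (++group >= ngroups)
                        group = 0;      /* wrap back to the first group */
        }
        return group;
}

int main(void)
{
        /* 12 groups in total, start near the end so the wrap is visible */
        unsigned int next = prefetch_walk(10, 12, 5);

        printf("next prefetch starts at group %u\n", next);     /* prints 3 */
        return 0;
}
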
....@@ -2133,8 +2326,8 @@
21332326 goto out;
21342327
21352328 /*
2136
- * ac->ac2_order is set only if the fe_len is a power of 2
2137
- * if ac2_order is set we also set criteria to 0 so that we
2329
+ * ac->ac_2order is set only if the fe_len is a power of 2
2330
+ * if ac->ac_2order is set we also set criteria to 0 so that we
21382331 * try exact allocation using buddy.
21392332 */
21402333 i = fls(ac->ac_g_ex.fe_len);
....@@ -2178,6 +2371,7 @@
21782371 * from the goal value specified
21792372 */
21802373 group = ac->ac_g_ex.fe_group;
2374
+ prefetch_grp = group;
21812375
21822376 for (i = 0; i < ngroups; group++, i++) {
21832377 int ret = 0;
....@@ -2189,8 +2383,31 @@
21892383 if (group >= ngroups)
21902384 group = 0;
21912385
2386
+ /*
2387
+ * Batch reads of the block allocation bitmaps
2388
+ * to get multiple READs in flight; limit
2389
+ * prefetching at cr=0/1, otherwise mballoc can
2390
+ * spend a lot of time loading imperfect groups
2391
+ */
2392
+ if ((prefetch_grp == group) &&
2393
+ (cr > 1 ||
2394
+ prefetch_ios < sbi->s_mb_prefetch_limit)) {
2395
+ unsigned int curr_ios = prefetch_ios;
2396
+
2397
+ nr = sbi->s_mb_prefetch;
2398
+ if (ext4_has_feature_flex_bg(sb)) {
2399
+ nr = 1 << sbi->s_log_groups_per_flex;
2400
+ nr -= group & (nr - 1);
2401
+ nr = min(nr, sbi->s_mb_prefetch);
2402
+ }
2403
+ prefetch_grp = ext4_mb_prefetch(sb, group,
2404
+ nr, &prefetch_ios);
2405
+ if (prefetch_ios == curr_ios)
2406
+ nr = 0;
2407
+ }
2408
+
21922409 /* This now checks without needing the buddy page */
2193
- ret = ext4_mb_good_group(ac, group, cr);
2410
+ ret = ext4_mb_good_group_nolock(ac, group, cr);
21942411 if (ret <= 0) {
21952412 if (!first_err)
21962413 first_err = ret;
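
(Illustrative aside, not part of the patch: with flex_bg, the hunk above trims
the prefetch window so it stops at the end of the current flex group, via
"nr -= group & (nr - 1)". A small userspace C example of that arithmetic
follows; the function name and constants are invented.)

#include <stdio.h>

/* How many groups remain until the next flex_bg boundary, capped at the
 * per-pass prefetch maximum. log_groups_per_flex = 4 means 16 groups
 * per flex group. */
static unsigned int groups_to_flex_end(unsigned int group,
                                       unsigned int log_groups_per_flex,
                                       unsigned int prefetch_max)
{
        unsigned int nr = 1u << log_groups_per_flex;

        nr -= group & (nr - 1);         /* distance to the next boundary */
        return nr < prefetch_max ? nr : prefetch_max;
}

int main(void)
{
        /* group 21 with 16 groups per flex_bg: 21 & 15 = 5, so 11 groups
         * remain before group 32, the start of the next flex group */
        printf("%u\n", groups_to_flex_end(21, 4, 32));  /* prints 11 */

        /* the cap still applies when the remaining span is large */
        printf("%u\n", groups_to_flex_end(0, 6, 8));    /* prints 8, not 64 */
        return 0;
}
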
....@@ -2208,11 +2425,9 @@
22082425 * block group
22092426 */
22102427 ret = ext4_mb_good_group(ac, group, cr);
2211
- if (ret <= 0) {
2428
+ if (ret == 0) {
22122429 ext4_unlock_group(sb, group);
22132430 ext4_mb_unload_buddy(&e4b);
2214
- if (!first_err)
2215
- first_err = ret;
22162431 continue;
22172432 }
22182433
....@@ -2239,28 +2454,38 @@
22392454 * We've been searching too long. Let's try to allocate
22402455 * the best chunk we've found so far
22412456 */
2242
-
22432457 ext4_mb_try_best_found(ac, &e4b);
22442458 if (ac->ac_status != AC_STATUS_FOUND) {
22452459 /*
22462460 * Someone more lucky has already allocated it.
22472461 * The only thing we can do is just take first
22482462 * found block(s)
2249
- printk(KERN_DEBUG "EXT4-fs: someone won our chunk\n");
22502463 */
2464
+ lost = atomic_inc_return(&sbi->s_mb_lost_chunks);
2465
+ mb_debug(sb, "lost chunk, group: %u, start: %d, len: %d, lost: %d\n",
2466
+ ac->ac_b_ex.fe_group, ac->ac_b_ex.fe_start,
2467
+ ac->ac_b_ex.fe_len, lost);
2468
+
22512469 ac->ac_b_ex.fe_group = 0;
22522470 ac->ac_b_ex.fe_start = 0;
22532471 ac->ac_b_ex.fe_len = 0;
22542472 ac->ac_status = AC_STATUS_CONTINUE;
22552473 ac->ac_flags |= EXT4_MB_HINT_FIRST;
22562474 cr = 3;
2257
- atomic_inc(&sbi->s_mb_lost_chunks);
22582475 goto repeat;
22592476 }
22602477 }
22612478 out:
22622479 if (!err && ac->ac_status != AC_STATUS_FOUND && first_err)
22632480 err = first_err;
2481
+
2482
+ mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n",
2483
+ ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status,
2484
+ ac->ac_flags, cr, err);
2485
+
2486
+ if (nr)
2487
+ ext4_mb_prefetch_fini(sb, prefetch_grp, nr);
2488
+
22642489 return err;
22652490 }
22662491
....@@ -2333,7 +2558,7 @@
23332558 for (i = 0; i <= 13; i++)
23342559 seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ?
23352560 sg.info.bb_counters[i] : 0);
2336
- seq_printf(seq, " ]\n");
2561
+ seq_puts(seq, " ]\n");
23372562
23382563 return 0;
23392564 }
....@@ -2453,20 +2678,7 @@
24532678 meta_group_info[i]->bb_free_root = RB_ROOT;
24542679 meta_group_info[i]->bb_largest_free_order = -1; /* uninit */
24552680
2456
-#ifdef DOUBLE_CHECK
2457
- {
2458
- struct buffer_head *bh;
2459
- meta_group_info[i]->bb_bitmap =
2460
- kmalloc(sb->s_blocksize, GFP_NOFS);
2461
- BUG_ON(meta_group_info[i]->bb_bitmap == NULL);
2462
- bh = ext4_read_block_bitmap(sb, group);
2463
- BUG_ON(IS_ERR_OR_NULL(bh));
2464
- memcpy(meta_group_info[i]->bb_bitmap, bh->b_data,
2465
- sb->s_blocksize);
2466
- put_bh(bh);
2467
- }
2468
-#endif
2469
-
2681
+ mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group);
24702682 return 0;
24712683
24722684 exit_group_info:
....@@ -2510,6 +2722,7 @@
25102722 sbi->s_buddy_cache->i_ino = EXT4_BAD_INO;
25112723 EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
25122724 for (i = 0; i < ngroups; i++) {
2725
+ cond_resched();
25132726 desc = ext4_get_group_desc(sb, i, NULL);
25142727 if (desc == NULL) {
25152728 ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i);
....@@ -2518,6 +2731,34 @@
25182731 if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
25192732 goto err_freebuddy;
25202733 }
2734
+
2735
+ if (ext4_has_feature_flex_bg(sb)) {
2736
+ /* a single flex group is supposed to be read by a single IO.
2737
+ * 2 ^ s_log_groups_per_flex must fit in s_mb_prefetch, which is an
2738
+ * unsigned integer, so any shift of 32 or more is rejected below.
2739
+ */
2740
+ if (sbi->s_es->s_log_groups_per_flex >= 32) {
2741
+ ext4_msg(sb, KERN_ERR, "too many log groups per flexible block group");
2742
+ goto err_freebuddy;
2743
+ }
2744
+ sbi->s_mb_prefetch = min_t(uint, 1 << sbi->s_es->s_log_groups_per_flex,
2745
+ BLK_MAX_SEGMENT_SIZE >> (sb->s_blocksize_bits - 9));
2746
+ sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */
2747
+ } else {
2748
+ sbi->s_mb_prefetch = 32;
2749
+ }
2750
+ if (sbi->s_mb_prefetch > ext4_get_groups_count(sb))
2751
+ sbi->s_mb_prefetch = ext4_get_groups_count(sb);
2752
+ /* How many real IOs to prefetch within a single allocation at cr=0.
2753
+ * Given that cr=0 is a CPU-related optimization we shouldn't try to
2754
+ * load too many groups; at some point we should start to use what
2755
+ * we've got in memory.
2756
+ * With an average random access time of 5ms, it'd take a second to get
2757
+ * 200 groups (* N with flex_bg), so let's make this limit 4.
2758
+ */
2759
+ sbi->s_mb_prefetch_limit = sbi->s_mb_prefetch * 4;
2760
+ if (sbi->s_mb_prefetch_limit > ext4_get_groups_count(sb))
2761
+ sbi->s_mb_prefetch_limit = ext4_get_groups_count(sb);
25212762
25222763 return 0;
25232764
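
(Illustrative aside, not part of the patch: a worked example of the
s_mb_prefetch / s_mb_prefetch_limit sizing done in the hunk above, as plain
userspace C. BLK_MAX_SEGMENT_SIZE is assumed to be 65536 bytes and the block
size 4096; both values and the variable names are only for illustration.)

#include <stdio.h>

int main(void)
{
        unsigned int log_groups_per_flex = 4;   /* 16 groups per flex_bg */
        unsigned int blocksize_bits = 12;       /* 4096-byte blocks */
        unsigned int max_seg = 65536;           /* assumed BLK_MAX_SEGMENT_SIZE */
        unsigned int ngroups = 1024;

        unsigned int groups_per_flex = 1u << log_groups_per_flex;      /* 16 */
        unsigned int seg_groups = max_seg >> (blocksize_bits - 9);     /* 8192 */
        unsigned int prefetch = groups_per_flex < seg_groups ?
                                groups_per_flex : seg_groups;          /* 16 */

        prefetch *= 8;                  /* up to 8 prefetch IOs in flight: 128 */
        if (prefetch > ngroups)
                prefetch = ngroups;

        unsigned int limit = prefetch * 4;      /* per-allocation cap at cr=0: 512 */
        if (limit > ngroups)
                limit = ngroups;

        printf("s_mb_prefetch=%u s_mb_prefetch_limit=%u\n", prefetch, limit);
        return 0;
}
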
....@@ -2642,6 +2883,7 @@
26422883 sbi->s_mb_stats = MB_DEFAULT_STATS;
26432884 sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
26442885 sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
2886
+ sbi->s_mb_max_inode_prealloc = MB_DEFAULT_MAX_INODE_PREALLOC;
26452887 /*
26462888 * The default group preallocation is 512, which for 4k block
26472889 * sizes translates to 2 megabytes. However for bigalloc file
....@@ -2702,7 +2944,7 @@
27022944 }
27032945
27042946 /* need to called with the ext4 group lock held */
2705
-static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
2947
+static int ext4_mb_cleanup_pa(struct ext4_group_info *grp)
27062948 {
27072949 struct ext4_prealloc_space *pa;
27082950 struct list_head *cur, *tmp;
....@@ -2714,9 +2956,7 @@
27142956 count++;
27152957 kmem_cache_free(ext4_pspace_cachep, pa);
27162958 }
2717
- if (count)
2718
- mb_debug(1, "mballoc: %u PAs left\n", count);
2719
-
2959
+ return count;
27202960 }
27212961
27222962 int ext4_mb_release(struct super_block *sb)
....@@ -2727,15 +2967,18 @@
27272967 struct ext4_group_info *grinfo, ***group_info;
27282968 struct ext4_sb_info *sbi = EXT4_SB(sb);
27292969 struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
2970
+ int count;
27302971
27312972 if (sbi->s_group_info) {
27322973 for (i = 0; i < ngroups; i++) {
2974
+ cond_resched();
27332975 grinfo = ext4_get_group_info(sb, i);
2734
-#ifdef DOUBLE_CHECK
2735
- kfree(grinfo->bb_bitmap);
2736
-#endif
2976
+ mb_group_bb_bitmap_free(grinfo);
27372977 ext4_lock_group(sb, i);
2738
- ext4_mb_cleanup_pa(grinfo);
2978
+ count = ext4_mb_cleanup_pa(grinfo);
2979
+ if (count)
2980
+ mb_debug(sb, "mballoc: %d PAs left\n",
2981
+ count);
27392982 ext4_unlock_group(sb, i);
27402983 kmem_cache_free(cachep, grinfo);
27412984 }
....@@ -2808,7 +3051,7 @@
28083051 struct ext4_group_info *db;
28093052 int err, count = 0, count2 = 0;
28103053
2811
- mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
3054
+ mb_debug(sb, "gonna free %u blocks in group %u (0x%p):",
28123055 entry->efd_count, entry->efd_group, entry);
28133056
28143057 err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
....@@ -2848,7 +3091,8 @@
28483091 kmem_cache_free(ext4_free_data_cachep, entry);
28493092 ext4_mb_unload_buddy(&e4b);
28503093
2851
- mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
3094
+ mb_debug(sb, "freed %d blocks in %d structures\n", count,
3095
+ count2);
28523096 }
28533097
28543098 /*
....@@ -2908,23 +3152,26 @@
29083152 ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
29093153 SLAB_RECLAIM_ACCOUNT);
29103154 if (ext4_pspace_cachep == NULL)
2911
- return -ENOMEM;
3155
+ goto out;
29123156
29133157 ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context,
29143158 SLAB_RECLAIM_ACCOUNT);
2915
- if (ext4_ac_cachep == NULL) {
2916
- kmem_cache_destroy(ext4_pspace_cachep);
2917
- return -ENOMEM;
2918
- }
3159
+ if (ext4_ac_cachep == NULL)
3160
+ goto out_pa_free;
29193161
29203162 ext4_free_data_cachep = KMEM_CACHE(ext4_free_data,
29213163 SLAB_RECLAIM_ACCOUNT);
2922
- if (ext4_free_data_cachep == NULL) {
2923
- kmem_cache_destroy(ext4_pspace_cachep);
2924
- kmem_cache_destroy(ext4_ac_cachep);
2925
- return -ENOMEM;
2926
- }
3164
+ if (ext4_free_data_cachep == NULL)
3165
+ goto out_ac_free;
3166
+
29273167 return 0;
3168
+
3169
+out_ac_free:
3170
+ kmem_cache_destroy(ext4_ac_cachep);
3171
+out_pa_free:
3172
+ kmem_cache_destroy(ext4_pspace_cachep);
3173
+out:
3174
+ return -ENOMEM;
29283175 }
29293176
29303177 void ext4_exit_mballoc(void)
....@@ -3061,6 +3308,110 @@
30613308 }
30623309
30633310 /*
3311
+ * Idempotent helper for Ext4 fast commit replay path to set the state of
3312
+ * blocks in bitmaps and update counters.
3313
+ */
3314
+void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
3315
+ int len, int state)
3316
+{
3317
+ struct buffer_head *bitmap_bh = NULL;
3318
+ struct ext4_group_desc *gdp;
3319
+ struct buffer_head *gdp_bh;
3320
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
3321
+ ext4_group_t group;
3322
+ ext4_grpblk_t blkoff;
3323
+ int i, err;
3324
+ int already;
3325
+ unsigned int clen, clen_changed, thisgrp_len;
3326
+
3327
+ while (len > 0) {
3328
+ ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
3329
+
3330
+ /*
3331
+ * Check to see if we are freeing blocks across a group
3332
+ * boundary.
3333
+ * In case of flex_bg, it can happen that (block, len) spans
3334
+ * more than one group. In that case we need to get the
3335
+ * corresponding group metadata to work with.
3336
+ * For this we loop until the whole range has been processed.
3337
+ */
3338
+ thisgrp_len = min_t(unsigned int, (unsigned int)len,
3339
+ EXT4_BLOCKS_PER_GROUP(sb) - EXT4_C2B(sbi, blkoff));
3340
+ clen = EXT4_NUM_B2C(sbi, thisgrp_len);
3341
+
3342
+ bitmap_bh = ext4_read_block_bitmap(sb, group);
3343
+ if (IS_ERR(bitmap_bh)) {
3344
+ err = PTR_ERR(bitmap_bh);
3345
+ bitmap_bh = NULL;
3346
+ break;
3347
+ }
3348
+
3349
+ err = -EIO;
3350
+ gdp = ext4_get_group_desc(sb, group, &gdp_bh);
3351
+ if (!gdp)
3352
+ break;
3353
+
3354
+ ext4_lock_group(sb, group);
3355
+ already = 0;
3356
+ for (i = 0; i < clen; i++)
3357
+ if (!mb_test_bit(blkoff + i, bitmap_bh->b_data) ==
3358
+ !state)
3359
+ already++;
3360
+
3361
+ clen_changed = clen - already;
3362
+ if (state)
3363
+ ext4_set_bits(bitmap_bh->b_data, blkoff, clen);
3364
+ else
3365
+ mb_test_and_clear_bits(bitmap_bh->b_data, blkoff, clen);
3366
+ if (ext4_has_group_desc_csum(sb) &&
3367
+ (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
3368
+ gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
3369
+ ext4_free_group_clusters_set(sb, gdp,
3370
+ ext4_free_clusters_after_init(sb, group, gdp));
3371
+ }
3372
+ if (state)
3373
+ clen = ext4_free_group_clusters(sb, gdp) - clen_changed;
3374
+ else
3375
+ clen = ext4_free_group_clusters(sb, gdp) + clen_changed;
3376
+
3377
+ ext4_free_group_clusters_set(sb, gdp, clen);
3378
+ ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh);
3379
+ ext4_group_desc_csum_set(sb, group, gdp);
3380
+
3381
+ ext4_unlock_group(sb, group);
3382
+
3383
+ if (sbi->s_log_groups_per_flex) {
3384
+ ext4_group_t flex_group = ext4_flex_group(sbi, group);
3385
+ struct flex_groups *fg = sbi_array_rcu_deref(sbi,
3386
+ s_flex_groups, flex_group);
3387
+
3388
+ if (state)
3389
+ atomic64_sub(clen_changed, &fg->free_clusters);
3390
+ else
3391
+ atomic64_add(clen_changed, &fg->free_clusters);
3392
+
3393
+ }
3394
+
3395
+ err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
3396
+ if (err)
3397
+ break;
3398
+ sync_dirty_buffer(bitmap_bh);
3399
+ err = ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
3400
+ sync_dirty_buffer(gdp_bh);
3401
+ if (err)
3402
+ break;
3403
+
3404
+ block += thisgrp_len;
3405
+ len -= thisgrp_len;
3406
+ brelse(bitmap_bh);
3407
+ BUG_ON(len < 0);
3408
+ }
3409
+
3410
+ if (err)
3411
+ brelse(bitmap_bh);
3412
+}
3413
+
3414
+/*
30643415 * here we normalize request for locality group
30653416 * Group request are normalized to s_mb_group_prealloc, which goes to
30663417 * s_strip if we set the same via mount option.
....@@ -3076,8 +3427,7 @@
30763427
30773428 BUG_ON(lg == NULL);
30783429 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
3079
- mb_debug(1, "#%u: goal %u blocks for locality group\n",
3080
- current->pid, ac->ac_g_ex.fe_len);
3430
+ mb_debug(sb, "goal %u blocks for locality group\n", ac->ac_g_ex.fe_len);
30813431 }
30823432
30833433 /*
....@@ -3169,6 +3519,15 @@
31693519 }
31703520 size = size >> bsbits;
31713521 start = start_off >> bsbits;
3522
+
3523
+ /*
3524
+ * For tiny groups (smaller than 8MB) the chosen allocation
3525
+ * alignment may be larger than group size. Make sure the
3526
+ * alignment does not move allocation to a different group which
3527
+ * makes mballoc fail assertions later.
3528
+ */
3529
+ start = max(start, rounddown(ac->ac_o_ex.fe_logical,
3530
+ (ext4_lblk_t)EXT4_BLOCKS_PER_GROUP(ac->ac_sb)));
31723531
31733532 /* don't cover already allocated blocks in selected range */
31743533 if (ar->pleft && start <= ar->lleft) {
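
(Illustrative aside, not part of the patch: the tiny-group clamp added above
keeps the normalized start inside the group of the original logical block by
rounding that block down to a group boundary. A small userspace C example of
the arithmetic follows; the constants are invented.)

#include <stdio.h>

/* rounddown() analogue: largest multiple of "align" that is <= x */
static unsigned int rounddown_u(unsigned int x, unsigned int align)
{
        return x - (x % align);
}

int main(void)
{
        unsigned int blocks_per_group = 2048;   /* a "tiny" group: 8MB of 4K blocks */
        unsigned int logical = 5000;            /* block the caller asked for */
        unsigned int start = 2048;              /* start picked by a large alignment,
                                                   one group too early */

        /* clamp so the allocation cannot move to an earlier group than the
         * original request, as the hunk above does for tiny groups */
        unsigned int floor = rounddown_u(logical, blocks_per_group);   /* 4096 */
        if (start < floor)
                start = floor;

        printf("start=%u (group %u, same group as logical block %u)\n",
               start, start / blocks_per_group, logical);
        return 0;
}
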
....@@ -3275,8 +3634,8 @@
32753634 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
32763635 }
32773636
3278
- mb_debug(1, "goal: %u(was %u) blocks at %u\n", (unsigned) size,
3279
- (unsigned) orig_size, (unsigned) start);
3637
+ mb_debug(ac->ac_sb, "goal: %lld(was %lld) blocks at %u\n", size,
3638
+ orig_size, start);
32803639 }
32813640
32823641 static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
....@@ -3365,7 +3724,7 @@
33653724 BUG_ON(pa->pa_free < len);
33663725 pa->pa_free -= len;
33673726
3368
- mb_debug(1, "use %llu/%u from inode pa %p\n", start, len, pa);
3727
+ mb_debug(ac->ac_sb, "use %llu/%d from inode pa %p\n", start, len, pa);
33693728 }
33703729
33713730 /*
....@@ -3389,7 +3748,8 @@
33893748 * in on-disk bitmap -- see ext4_mb_release_context()
33903749 * Other CPUs are prevented from allocating from this pa by lg_mutex
33913750 */
3392
- mb_debug(1, "use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa);
3751
+ mb_debug(ac->ac_sb, "use %u/%u from group pa %p\n",
3752
+ pa->pa_lstart-len, len, pa);
33933753 }
33943754
33953755 /*
....@@ -3424,7 +3784,7 @@
34243784 /*
34253785 * search goal blocks in preallocated space
34263786 */
3427
-static noinline_for_stack int
3787
+static noinline_for_stack bool
34283788 ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
34293789 {
34303790 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
....@@ -3436,7 +3796,7 @@
34363796
34373797 /* only data can be preallocated */
34383798 if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
3439
- return 0;
3799
+ return false;
34403800
34413801 /* first, try per-file preallocation */
34423802 rcu_read_lock();
....@@ -3463,7 +3823,7 @@
34633823 spin_unlock(&pa->pa_lock);
34643824 ac->ac_criteria = 10;
34653825 rcu_read_unlock();
3466
- return 1;
3826
+ return true;
34673827 }
34683828 spin_unlock(&pa->pa_lock);
34693829 }
....@@ -3471,12 +3831,12 @@
34713831
34723832 /* can we use group allocation? */
34733833 if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC))
3474
- return 0;
3834
+ return false;
34753835
34763836 /* inode may have no locality group for some reason */
34773837 lg = ac->ac_lg;
34783838 if (lg == NULL)
3479
- return 0;
3839
+ return false;
34803840 order = fls(ac->ac_o_ex.fe_len) - 1;
34813841 if (order > PREALLOC_TB_SIZE - 1)
34823842 /* The max size of hash table is PREALLOC_TB_SIZE */
....@@ -3505,9 +3865,9 @@
35053865 if (cpa) {
35063866 ext4_mb_use_group_pa(ac, cpa);
35073867 ac->ac_criteria = 20;
3508
- return 1;
3868
+ return true;
35093869 }
3510
- return 0;
3870
+ return false;
35113871 }
35123872
35133873 /*
....@@ -3572,7 +3932,27 @@
35723932 ext4_set_bits(bitmap, start, len);
35733933 preallocated += len;
35743934 }
3575
- mb_debug(1, "preallocated %u for group %u\n", preallocated, group);
3935
+ mb_debug(sb, "preallocated %d for group %u\n", preallocated, group);
3936
+}
3937
+
3938
+static void ext4_mb_mark_pa_deleted(struct super_block *sb,
3939
+ struct ext4_prealloc_space *pa)
3940
+{
3941
+ struct ext4_inode_info *ei;
3942
+
3943
+ if (pa->pa_deleted) {
3944
+ ext4_warning(sb, "deleted pa, type:%d, pblk:%llu, lblk:%u, len:%d\n",
3945
+ pa->pa_type, pa->pa_pstart, pa->pa_lstart,
3946
+ pa->pa_len);
3947
+ return;
3948
+ }
3949
+
3950
+ pa->pa_deleted = 1;
3951
+
3952
+ if (pa->pa_type == MB_INODE_PA) {
3953
+ ei = EXT4_I(pa->pa_inode);
3954
+ atomic_dec(&ei->i_prealloc_active);
3955
+ }
35763956 }
35773957
35783958 static void ext4_mb_pa_callback(struct rcu_head *head)
....@@ -3607,7 +3987,7 @@
36073987 return;
36083988 }
36093989
3610
- pa->pa_deleted = 1;
3990
+ ext4_mb_mark_pa_deleted(sb, pa);
36113991 spin_unlock(&pa->pa_lock);
36123992
36133993 grp_blk = pa->pa_pstart;
....@@ -3648,7 +4028,7 @@
36484028 /*
36494029 * creates new preallocated space for given inode
36504030 */
3651
-static noinline_for_stack int
4031
+static noinline_for_stack void
36524032 ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
36534033 {
36544034 struct super_block *sb = ac->ac_sb;
....@@ -3661,10 +4041,9 @@
36614041 BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
36624042 BUG_ON(ac->ac_status != AC_STATUS_FOUND);
36634043 BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
4044
+ BUG_ON(ac->ac_pa == NULL);
36644045
3665
- pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
3666
- if (pa == NULL)
3667
- return -ENOMEM;
4046
+ pa = ac->ac_pa;
36684047
36694048 if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) {
36704049 int winl;
....@@ -3708,15 +4087,14 @@
37084087 pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
37094088 pa->pa_len = ac->ac_b_ex.fe_len;
37104089 pa->pa_free = pa->pa_len;
3711
- atomic_set(&pa->pa_count, 1);
37124090 spin_lock_init(&pa->pa_lock);
37134091 INIT_LIST_HEAD(&pa->pa_inode_list);
37144092 INIT_LIST_HEAD(&pa->pa_group_list);
37154093 pa->pa_deleted = 0;
37164094 pa->pa_type = MB_INODE_PA;
37174095
3718
- mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa,
3719
- pa->pa_pstart, pa->pa_len, pa->pa_lstart);
4096
+ mb_debug(sb, "new inode pa %p: %llu/%d for %u\n", pa, pa->pa_pstart,
4097
+ pa->pa_len, pa->pa_lstart);
37204098 trace_ext4_mb_new_inode_pa(ac, pa);
37214099
37224100 ext4_mb_use_inode_pa(ac, pa);
....@@ -3728,21 +4106,18 @@
37284106 pa->pa_obj_lock = &ei->i_prealloc_lock;
37294107 pa->pa_inode = ac->ac_inode;
37304108
3731
- ext4_lock_group(sb, ac->ac_b_ex.fe_group);
37324109 list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
3733
- ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
37344110
37354111 spin_lock(pa->pa_obj_lock);
37364112 list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list);
37374113 spin_unlock(pa->pa_obj_lock);
3738
-
3739
- return 0;
4114
+ atomic_inc(&ei->i_prealloc_active);
37404115 }
37414116
37424117 /*
37434118 * creates new preallocated space for locality group inodes belongs to
37444119 */
3745
-static noinline_for_stack int
4120
+static noinline_for_stack void
37464121 ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
37474122 {
37484123 struct super_block *sb = ac->ac_sb;
....@@ -3754,11 +4129,9 @@
37544129 BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
37554130 BUG_ON(ac->ac_status != AC_STATUS_FOUND);
37564131 BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
4132
+ BUG_ON(ac->ac_pa == NULL);
37574133
3758
- BUG_ON(ext4_pspace_cachep == NULL);
3759
- pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
3760
- if (pa == NULL)
3761
- return -ENOMEM;
4134
+ pa = ac->ac_pa;
37624135
37634136 /* preallocation can change ac_b_ex, thus we store actually
37644137 * allocated blocks for history */
....@@ -3768,15 +4141,14 @@
37684141 pa->pa_lstart = pa->pa_pstart;
37694142 pa->pa_len = ac->ac_b_ex.fe_len;
37704143 pa->pa_free = pa->pa_len;
3771
- atomic_set(&pa->pa_count, 1);
37724144 spin_lock_init(&pa->pa_lock);
37734145 INIT_LIST_HEAD(&pa->pa_inode_list);
37744146 INIT_LIST_HEAD(&pa->pa_group_list);
37754147 pa->pa_deleted = 0;
37764148 pa->pa_type = MB_GROUP_PA;
37774149
3778
- mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa,
3779
- pa->pa_pstart, pa->pa_len, pa->pa_lstart);
4150
+ mb_debug(sb, "new group pa %p: %llu/%d for %u\n", pa, pa->pa_pstart,
4151
+ pa->pa_len, pa->pa_lstart);
37804152 trace_ext4_mb_new_group_pa(ac, pa);
37814153
37824154 ext4_mb_use_group_pa(ac, pa);
....@@ -3789,26 +4161,20 @@
37894161 pa->pa_obj_lock = &lg->lg_prealloc_lock;
37904162 pa->pa_inode = NULL;
37914163
3792
- ext4_lock_group(sb, ac->ac_b_ex.fe_group);
37934164 list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
3794
- ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
37954165
37964166 /*
37974167 * We will later add the new pa to the right bucket
37984168 * after updating the pa_free in ext4_mb_release_context
37994169 */
3800
- return 0;
38014170 }
38024171
3803
-static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
4172
+static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
38044173 {
3805
- int err;
3806
-
38074174 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
3808
- err = ext4_mb_new_group_pa(ac);
4175
+ ext4_mb_new_group_pa(ac);
38094176 else
3810
- err = ext4_mb_new_inode_pa(ac);
3811
- return err;
4177
+ ext4_mb_new_inode_pa(ac);
38124178 }
38134179
38144180 /*
....@@ -3843,7 +4209,7 @@
38434209 if (bit >= end)
38444210 break;
38454211 next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
3846
- mb_debug(1, " free preallocated %u/%u in group %u\n",
4212
+ mb_debug(sb, "free preallocated %u/%u in group %u\n",
38474213 (unsigned) ext4_group_first_block_no(sb, group) + bit,
38484214 (unsigned) next - bit, (unsigned) group);
38494215 free += next - bit;
....@@ -3857,10 +4223,10 @@
38574223 }
38584224 if (free != pa->pa_free) {
38594225 ext4_msg(e4b->bd_sb, KERN_CRIT,
3860
- "pa %p: logic %lu, phys. %lu, len %lu",
4226
+ "pa %p: logic %lu, phys. %lu, len %d",
38614227 pa, (unsigned long) pa->pa_lstart,
38624228 (unsigned long) pa->pa_pstart,
3863
- (unsigned long) pa->pa_len);
4229
+ pa->pa_len);
38644230 ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u",
38654231 free, pa->pa_free);
38664232 /*
....@@ -3903,7 +4269,7 @@
39034269 */
39044270 static noinline_for_stack int
39054271 ext4_mb_discard_group_preallocations(struct super_block *sb,
3906
- ext4_group_t group, int needed)
4272
+ ext4_group_t group, int *busy)
39074273 {
39084274 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
39094275 struct buffer_head *bitmap_bh = NULL;
....@@ -3911,20 +4277,19 @@
39114277 struct list_head list;
39124278 struct ext4_buddy e4b;
39134279 int err;
3914
- int busy = 0;
39154280 int free = 0;
39164281
3917
- mb_debug(1, "discard preallocation for group %u\n", group);
3918
-
4282
+ mb_debug(sb, "discard preallocation for group %u\n", group);
39194283 if (list_empty(&grp->bb_prealloc_list))
3920
- return 0;
4284
+ goto out_dbg;
39214285
39224286 bitmap_bh = ext4_read_block_bitmap(sb, group);
39234287 if (IS_ERR(bitmap_bh)) {
39244288 err = PTR_ERR(bitmap_bh);
3925
- ext4_error(sb, "Error %d reading block bitmap for %u",
3926
- err, group);
3927
- return 0;
4289
+ ext4_error_err(sb, -err,
4290
+ "Error %d reading block bitmap for %u",
4291
+ err, group);
4292
+ goto out_dbg;
39284293 }
39294294
39304295 err = ext4_mb_load_buddy(sb, group, &e4b);
....@@ -3932,21 +4297,17 @@
39324297 ext4_warning(sb, "Error %d loading buddy information for %u",
39334298 err, group);
39344299 put_bh(bitmap_bh);
3935
- return 0;
4300
+ goto out_dbg;
39364301 }
39374302
3938
- if (needed == 0)
3939
- needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1;
3940
-
39414303 INIT_LIST_HEAD(&list);
3942
-repeat:
39434304 ext4_lock_group(sb, group);
39444305 list_for_each_entry_safe(pa, tmp,
39454306 &grp->bb_prealloc_list, pa_group_list) {
39464307 spin_lock(&pa->pa_lock);
39474308 if (atomic_read(&pa->pa_count)) {
39484309 spin_unlock(&pa->pa_lock);
3949
- busy = 1;
4310
+ *busy = 1;
39504311 continue;
39514312 }
39524313 if (pa->pa_deleted) {
....@@ -3955,7 +4316,10 @@
39554316 }
39564317
39574318 /* seems this one can be freed ... */
3958
- pa->pa_deleted = 1;
4319
+ ext4_mb_mark_pa_deleted(sb, pa);
4320
+
4321
+ if (!free)
4322
+ this_cpu_inc(discard_pa_seq);
39594323
39604324 /* we can trust pa_free ... */
39614325 free += pa->pa_free;
....@@ -3964,20 +4328,6 @@
39644328
39654329 list_del(&pa->pa_group_list);
39664330 list_add(&pa->u.pa_tmp_list, &list);
3967
- }
3968
-
3969
- /* if we still need more blocks and some PAs were used, try again */
3970
- if (free < needed && busy) {
3971
- busy = 0;
3972
- ext4_unlock_group(sb, group);
3973
- cond_resched();
3974
- goto repeat;
3975
- }
3976
-
3977
- /* found anything to free? */
3978
- if (list_empty(&list)) {
3979
- BUG_ON(free != 0);
3980
- goto out;
39814331 }
39824332
39834333 /* now free all selected PAs */
....@@ -3997,10 +4347,12 @@
39974347 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
39984348 }
39994349
4000
-out:
40014350 ext4_unlock_group(sb, group);
40024351 ext4_mb_unload_buddy(&e4b);
40034352 put_bh(bitmap_bh);
4353
+out_dbg:
4354
+ mb_debug(sb, "discarded (%d) blocks preallocated for group %u bb_free (%d)\n",
4355
+ free, group, grp->bb_free);
40044356 return free;
40054357 }
40064358
....@@ -4013,7 +4365,7 @@
40134365 *
40144366 * FIXME!! Make sure it is valid at all the call sites
40154367 */
4016
-void ext4_discard_preallocations(struct inode *inode)
4368
+void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
40174369 {
40184370 struct ext4_inode_info *ei = EXT4_I(inode);
40194371 struct super_block *sb = inode->i_sb;
....@@ -4029,16 +4381,24 @@
40294381 return;
40304382 }
40314383
4032
- mb_debug(1, "discard preallocation for inode %lu\n", inode->i_ino);
4033
- trace_ext4_discard_preallocations(inode);
4384
+ if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)
4385
+ return;
4386
+
4387
+ mb_debug(sb, "discard preallocation for inode %lu\n",
4388
+ inode->i_ino);
4389
+ trace_ext4_discard_preallocations(inode,
4390
+ atomic_read(&ei->i_prealloc_active), needed);
40344391
40354392 INIT_LIST_HEAD(&list);
4393
+
4394
+ if (needed == 0)
4395
+ needed = UINT_MAX;
40364396
40374397 repeat:
40384398 /* first, collect all pa's in the inode */
40394399 spin_lock(&ei->i_prealloc_lock);
4040
- while (!list_empty(&ei->i_prealloc_list)) {
4041
- pa = list_entry(ei->i_prealloc_list.next,
4400
+ while (!list_empty(&ei->i_prealloc_list) && needed) {
4401
+ pa = list_entry(ei->i_prealloc_list.prev,
40424402 struct ext4_prealloc_space, pa_inode_list);
40434403 BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock);
40444404 spin_lock(&pa->pa_lock);
....@@ -4055,10 +4415,11 @@
40554415
40564416 }
40574417 if (pa->pa_deleted == 0) {
4058
- pa->pa_deleted = 1;
4418
+ ext4_mb_mark_pa_deleted(sb, pa);
40594419 spin_unlock(&pa->pa_lock);
40604420 list_del_rcu(&pa->pa_inode_list);
40614421 list_add(&pa->u.pa_tmp_list, &list);
4422
+ needed--;
40624423 continue;
40634424 }
40644425
....@@ -4090,16 +4451,16 @@
40904451 err = ext4_mb_load_buddy_gfp(sb, group, &e4b,
40914452 GFP_NOFS|__GFP_NOFAIL);
40924453 if (err) {
4093
- ext4_error(sb, "Error %d loading buddy information for %u",
4094
- err, group);
4454
+ ext4_error_err(sb, -err, "Error %d loading buddy information for %u",
4455
+ err, group);
40954456 continue;
40964457 }
40974458
40984459 bitmap_bh = ext4_read_block_bitmap(sb, group);
40994460 if (IS_ERR(bitmap_bh)) {
41004461 err = PTR_ERR(bitmap_bh);
4101
- ext4_error(sb, "Error %d reading block bitmap for %u",
4102
- err, group);
4462
+ ext4_error_err(sb, -err, "Error %d reading block bitmap for %u",
4463
+ err, group);
41034464 ext4_mb_unload_buddy(&e4b);
41044465 continue;
41054466 }
....@@ -4117,22 +4478,74 @@
41174478 }
41184479 }
41194480
4481
+static int ext4_mb_pa_alloc(struct ext4_allocation_context *ac)
4482
+{
4483
+ struct ext4_prealloc_space *pa;
4484
+
4485
+ BUG_ON(ext4_pspace_cachep == NULL);
4486
+ pa = kmem_cache_zalloc(ext4_pspace_cachep, GFP_NOFS);
4487
+ if (!pa)
4488
+ return -ENOMEM;
4489
+ atomic_set(&pa->pa_count, 1);
4490
+ ac->ac_pa = pa;
4491
+ return 0;
4492
+}
4493
+
4494
+static void ext4_mb_pa_free(struct ext4_allocation_context *ac)
4495
+{
4496
+ struct ext4_prealloc_space *pa = ac->ac_pa;
4497
+
4498
+ BUG_ON(!pa);
4499
+ ac->ac_pa = NULL;
4500
+ WARN_ON(!atomic_dec_and_test(&pa->pa_count));
4501
+ kmem_cache_free(ext4_pspace_cachep, pa);
4502
+}
4503
+
41204504 #ifdef CONFIG_EXT4_DEBUG
4505
+static inline void ext4_mb_show_pa(struct super_block *sb)
4506
+{
4507
+ ext4_group_t i, ngroups;
4508
+
4509
+ if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
4510
+ return;
4511
+
4512
+ ngroups = ext4_get_groups_count(sb);
4513
+ mb_debug(sb, "groups: ");
4514
+ for (i = 0; i < ngroups; i++) {
4515
+ struct ext4_group_info *grp = ext4_get_group_info(sb, i);
4516
+ struct ext4_prealloc_space *pa;
4517
+ ext4_grpblk_t start;
4518
+ struct list_head *cur;
4519
+ ext4_lock_group(sb, i);
4520
+ list_for_each(cur, &grp->bb_prealloc_list) {
4521
+ pa = list_entry(cur, struct ext4_prealloc_space,
4522
+ pa_group_list);
4523
+ spin_lock(&pa->pa_lock);
4524
+ ext4_get_group_no_and_offset(sb, pa->pa_pstart,
4525
+ NULL, &start);
4526
+ spin_unlock(&pa->pa_lock);
4527
+ mb_debug(sb, "PA:%u:%d:%d\n", i, start,
4528
+ pa->pa_len);
4529
+ }
4530
+ ext4_unlock_group(sb, i);
4531
+ mb_debug(sb, "%u: %d/%d\n", i, grp->bb_free,
4532
+ grp->bb_fragments);
4533
+ }
4534
+}
4535
+
41214536 static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
41224537 {
41234538 struct super_block *sb = ac->ac_sb;
4124
- ext4_group_t ngroups, i;
41254539
4126
- if (!ext4_mballoc_debug ||
4127
- (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED))
4540
+ if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
41284541 return;
41294542
4130
- ext4_msg(ac->ac_sb, KERN_ERR, "Can't allocate:"
4543
+ mb_debug(sb, "Can't allocate:"
41314544 " Allocation context details:");
4132
- ext4_msg(ac->ac_sb, KERN_ERR, "status %d flags %d",
4545
+ mb_debug(sb, "status %u flags 0x%x",
41334546 ac->ac_status, ac->ac_flags);
4134
- ext4_msg(ac->ac_sb, KERN_ERR, "orig %lu/%lu/%lu@%lu, "
4135
- "goal %lu/%lu/%lu@%lu, "
4547
+ mb_debug(sb, "orig %lu/%lu/%lu@%lu, "
4548
+ "goal %lu/%lu/%lu@%lu, "
41364549 "best %lu/%lu/%lu@%lu cr %d",
41374550 (unsigned long)ac->ac_o_ex.fe_group,
41384551 (unsigned long)ac->ac_o_ex.fe_start,
....@@ -4147,37 +4560,17 @@
41474560 (unsigned long)ac->ac_b_ex.fe_len,
41484561 (unsigned long)ac->ac_b_ex.fe_logical,
41494562 (int)ac->ac_criteria);
4150
- ext4_msg(ac->ac_sb, KERN_ERR, "%d found", ac->ac_found);
4151
- ext4_msg(ac->ac_sb, KERN_ERR, "groups: ");
4152
- ngroups = ext4_get_groups_count(sb);
4153
- for (i = 0; i < ngroups; i++) {
4154
- struct ext4_group_info *grp = ext4_get_group_info(sb, i);
4155
- struct ext4_prealloc_space *pa;
4156
- ext4_grpblk_t start;
4157
- struct list_head *cur;
4158
- ext4_lock_group(sb, i);
4159
- list_for_each(cur, &grp->bb_prealloc_list) {
4160
- pa = list_entry(cur, struct ext4_prealloc_space,
4161
- pa_group_list);
4162
- spin_lock(&pa->pa_lock);
4163
- ext4_get_group_no_and_offset(sb, pa->pa_pstart,
4164
- NULL, &start);
4165
- spin_unlock(&pa->pa_lock);
4166
- printk(KERN_ERR "PA:%u:%d:%u \n", i,
4167
- start, pa->pa_len);
4168
- }
4169
- ext4_unlock_group(sb, i);
4170
-
4171
- if (grp->bb_free == 0)
4172
- continue;
4173
- printk(KERN_ERR "%u: %d/%d \n",
4174
- i, grp->bb_free, grp->bb_fragments);
4175
- }
4176
- printk(KERN_ERR "\n");
4563
+ mb_debug(sb, "%u found", ac->ac_found);
4564
+ ext4_mb_show_pa(sb);
41774565 }
41784566 #else
4567
+static inline void ext4_mb_show_pa(struct super_block *sb)
4568
+{
4569
+ return;
4570
+}
41794571 static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac)
41804572 {
4573
+ ext4_mb_show_pa(ac->ac_sb);
41814574 return;
41824575 }
41834576 #endif
....@@ -4205,9 +4598,8 @@
42054598 isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
42064599 >> bsbits;
42074600
4208
- if ((size == isize) &&
4209
- !ext4_fs_is_busy(sbi) &&
4210
- (atomic_read(&ac->ac_inode->i_writecount) == 0)) {
4601
+ if ((size == isize) && !ext4_fs_is_busy(sbi) &&
4602
+ !inode_is_open_for_write(ac->ac_inode)) {
42114603 ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
42124604 return;
42134605 }
....@@ -4277,17 +4669,17 @@
42774669 ac->ac_g_ex = ac->ac_o_ex;
42784670 ac->ac_flags = ar->flags;
42794671
4280
- /* we have to define context: we'll we work with a file or
4672
+ /* we have to define context: we'll work with a file or
42814673 * locality group. this is a policy, actually */
42824674 ext4_mb_group_or_file(ac);
42834675
4284
- mb_debug(1, "init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, "
4676
+ mb_debug(sb, "init ac: %u blocks @ %u, goal %u, flags 0x%x, 2^%d, "
42854677 "left: %u/%u, right %u/%u to %swritable\n",
42864678 (unsigned) ar->len, (unsigned) ar->logical,
42874679 (unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
42884680 (unsigned) ar->lleft, (unsigned) ar->pleft,
42894681 (unsigned) ar->lright, (unsigned) ar->pright,
4290
- atomic_read(&ar->inode->i_writecount) ? "" : "non-");
4682
+ inode_is_open_for_write(ar->inode) ? "" : "non-");
42914683 return 0;
42924684
42934685 }
....@@ -4302,13 +4694,14 @@
43024694 struct list_head discard_list;
43034695 struct ext4_prealloc_space *pa, *tmp;
43044696
4305
- mb_debug(1, "discard locality group preallocation\n");
4697
+ mb_debug(sb, "discard locality group preallocation\n");
43064698
43074699 INIT_LIST_HEAD(&discard_list);
43084700
43094701 spin_lock(&lg->lg_prealloc_lock);
43104702 list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
4311
- pa_inode_list) {
4703
+ pa_inode_list,
4704
+ lockdep_is_held(&lg->lg_prealloc_lock)) {
43124705 spin_lock(&pa->pa_lock);
43134706 if (atomic_read(&pa->pa_count)) {
43144707 /*
....@@ -4327,7 +4720,7 @@
43274720 BUG_ON(pa->pa_type != MB_GROUP_PA);
43284721
43294722 /* seems this one can be freed ... */
4330
- pa->pa_deleted = 1;
4723
+ ext4_mb_mark_pa_deleted(sb, pa);
43314724 spin_unlock(&pa->pa_lock);
43324725
43334726 list_del_rcu(&pa->pa_inode_list);
....@@ -4353,8 +4746,8 @@
43534746 err = ext4_mb_load_buddy_gfp(sb, group, &e4b,
43544747 GFP_NOFS|__GFP_NOFAIL);
43554748 if (err) {
4356
- ext4_error(sb, "Error %d loading buddy information for %u",
4357
- err, group);
4749
+ ext4_error_err(sb, -err, "Error %d loading buddy information for %u",
4750
+ err, group);
43584751 continue;
43594752 }
43604753 ext4_lock_group(sb, group);
....@@ -4391,7 +4784,8 @@
43914784 /* Add the prealloc space to lg */
43924785 spin_lock(&lg->lg_prealloc_lock);
43934786 list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
4394
- pa_inode_list) {
4787
+ pa_inode_list,
4788
+ lockdep_is_held(&lg->lg_prealloc_lock)) {
43954789 spin_lock(&tmp_pa->pa_lock);
43964790 if (tmp_pa->pa_deleted) {
43974791 spin_unlock(&tmp_pa->pa_lock);
....@@ -4425,10 +4819,29 @@
44254819 }
44264820
44274821 /*
4822
+ * if per-inode prealloc list is too long, trim some PA
4823
+ */
4824
+static void ext4_mb_trim_inode_pa(struct inode *inode)
4825
+{
4826
+ struct ext4_inode_info *ei = EXT4_I(inode);
4827
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
4828
+ int count, delta;
4829
+
4830
+ count = atomic_read(&ei->i_prealloc_active);
4831
+ delta = (sbi->s_mb_max_inode_prealloc >> 2) + 1;
4832
+ if (count > sbi->s_mb_max_inode_prealloc + delta) {
4833
+ count -= sbi->s_mb_max_inode_prealloc;
4834
+ ext4_discard_preallocations(inode, count);
4835
+ }
4836
+}
4837
+
4838
+/*
44284839 * release all resource we used in allocation
44294840 */
44304841 static int ext4_mb_release_context(struct ext4_allocation_context *ac)
44314842 {
4843
+ struct inode *inode = ac->ac_inode;
4844
+ struct ext4_inode_info *ei = EXT4_I(inode);
44324845 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
44334846 struct ext4_prealloc_space *pa = ac->ac_pa;
44344847 if (pa) {
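
(Illustrative aside, not part of the patch: ext4_mb_trim_inode_pa() above only
starts trimming once the per-inode PA list exceeds the maximum by a quarter
plus one, and then trims back towards the maximum. A small userspace C model
of that hysteresis follows; the value 512 for the maximum is an assumption for
the example.)

#include <stdio.h>

/* How many PAs to discard for a list of "count" entries with limit "max". */
static unsigned int pas_to_trim(unsigned int count, unsigned int max)
{
        unsigned int delta = (max >> 2) + 1;

        if (count > max + delta)
                return count - max;     /* trim back towards "max" entries */
        return 0;                       /* still within the allowed slack */
}

int main(void)
{
        unsigned int max = 512;         /* assumed s_mb_max_inode_prealloc */

        printf("%u\n", pas_to_trim(600, max));  /* prints 0: 600 <= 512 + 129 */
        printf("%u\n", pas_to_trim(700, max));  /* prints 188 */
        return 0;
}
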
....@@ -4440,21 +4853,31 @@
44404853 pa->pa_free -= ac->ac_b_ex.fe_len;
44414854 pa->pa_len -= ac->ac_b_ex.fe_len;
44424855 spin_unlock(&pa->pa_lock);
4856
+
4857
+ /*
4858
+ * We want to add the pa to the right bucket.
4859
+ * Remove it from the list and while adding
4860
+ * make sure the list to which we are adding
4861
+ * doesn't grow big.
4862
+ */
4863
+ if (likely(pa->pa_free)) {
4864
+ spin_lock(pa->pa_obj_lock);
4865
+ list_del_rcu(&pa->pa_inode_list);
4866
+ spin_unlock(pa->pa_obj_lock);
4867
+ ext4_mb_add_n_trim(ac);
4868
+ }
44434869 }
4444
- }
4445
- if (pa) {
4446
- /*
4447
- * We want to add the pa to the right bucket.
4448
- * Remove it from the list and while adding
4449
- * make sure the list to which we are adding
4450
- * doesn't grow big.
4451
- */
4452
- if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) {
4870
+
4871
+ if (pa->pa_type == MB_INODE_PA) {
4872
+ /*
4873
+ * treat per-inode prealloc list as a lru list, then try
4874
+ * to trim the least recently used PA.
4875
+ */
44534876 spin_lock(pa->pa_obj_lock);
4454
- list_del_rcu(&pa->pa_inode_list);
4877
+ list_move(&pa->pa_inode_list, &ei->i_prealloc_list);
44554878 spin_unlock(pa->pa_obj_lock);
4456
- ext4_mb_add_n_trim(ac);
44574879 }
4880
+
44584881 ext4_mb_put_pa(ac, ac->ac_sb, pa);
44594882 }
44604883 if (ac->ac_bitmap_page)
....@@ -4464,6 +4887,7 @@
44644887 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
44654888 mutex_unlock(&ac->ac_lg->lg_mutex);
44664889 ext4_mb_collect_stats(ac);
4890
+ ext4_mb_trim_inode_pa(inode);
44674891 return 0;
44684892 }
44694893
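In short, the reworked ext4_mb_release_context() handles the two PA types differently: a group PA that still has free clusters is unlinked and re-bucketed via ext4_mb_add_n_trim(), while an inode PA is moved to the head of ei->i_prealloc_list so the list behaves as an LRU, and ext4_mb_trim_inode_pa() then discards the oldest entries once the list outgrows its limit.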
....@@ -4471,17 +4895,55 @@
44714895 {
44724896 ext4_group_t i, ngroups = ext4_get_groups_count(sb);
44734897 int ret;
4474
- int freed = 0;
4898
+ int freed = 0, busy = 0;
4899
+ int retry = 0;
44754900
44764901 trace_ext4_mb_discard_preallocations(sb, needed);
4902
+
4903
+ if (needed == 0)
4904
+ needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1;
4905
+ repeat:
44774906 for (i = 0; i < ngroups && needed > 0; i++) {
4478
- ret = ext4_mb_discard_group_preallocations(sb, i, needed);
4907
+ ret = ext4_mb_discard_group_preallocations(sb, i, &busy);
44794908 freed += ret;
44804909 needed -= ret;
4910
+ cond_resched();
4911
+ }
4912
+
4913
+ if (needed > 0 && busy && ++retry < 3) {
4914
+ busy = 0;
4915
+ goto repeat;
44814916 }
44824917
44834918 return freed;
44844919 }
4920
+
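Note the needed == 0 convention above: no group can hold more than EXT4_CLUSTERS_PER_GROUP(sb) free clusters in preallocations, so setting needed to EXT4_CLUSTERS_PER_GROUP(sb) + 1 effectively asks every group to drop all of its PAs, and the busy/retry logic re-scans up to three times any groups whose PAs were found busy on the previous pass.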
4921
+static bool ext4_mb_discard_preallocations_should_retry(struct super_block *sb,
4922
+ struct ext4_allocation_context *ac, u64 *seq)
4923
+{
4924
+ int freed;
4925
+ u64 seq_retry = 0;
4926
+ bool ret = false;
4927
+
4928
+ freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len);
4929
+ if (freed) {
4930
+ ret = true;
4931
+ goto out_dbg;
4932
+ }
4933
+ seq_retry = ext4_get_discard_pa_seq_sum();
4934
+ if (!(ac->ac_flags & EXT4_MB_STRICT_CHECK) || seq_retry != *seq) {
4935
+ ac->ac_flags |= EXT4_MB_STRICT_CHECK;
4936
+ *seq = seq_retry;
4937
+ ret = true;
4938
+ }
4939
+
4940
+out_dbg:
4941
+ mb_debug(sb, "freed %d, retry ? %s\n", freed, ret ? "yes" : "no");
4942
+ return ret;
4943
+}
4944
+
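The intended caller pattern (taken from the ext4_mb_new_blocks() hunk further below) samples the per-cpu sequence once before trying preallocated space and bounds the number of retries:

	seq = this_cpu_read(discard_pa_seq);
	...
	if (++retries < 3 &&
	    ext4_mb_discard_preallocations_should_retry(sb, ac, &seq))
		goto repeat;

When nothing was freed, the helper switches the context to EXT4_MB_STRICT_CHECK on its first call, so later retries are allowed only if the summed sequence has moved, i.e. some other CPU actually freed or discarded blocks in the meantime.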
4945
+static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle,
4946
+ struct ext4_allocation_request *ar, int *errp);
44854947
44864948 /*
44874949 * Main entry point into mballoc to allocate blocks
....@@ -4491,19 +4953,22 @@
44914953 ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
44924954 struct ext4_allocation_request *ar, int *errp)
44934955 {
4494
- int freed;
44954956 struct ext4_allocation_context *ac = NULL;
44964957 struct ext4_sb_info *sbi;
44974958 struct super_block *sb;
44984959 ext4_fsblk_t block = 0;
44994960 unsigned int inquota = 0;
45004961 unsigned int reserv_clstrs = 0;
4962
+ int retries = 0;
4963
+ u64 seq;
45014964
45024965 might_sleep();
45034966 sb = ar->inode->i_sb;
45044967 sbi = EXT4_SB(sb);
45054968
45064969 trace_ext4_request_blocks(ar);
4970
+ if (sbi->s_mount_state & EXT4_FC_REPLAY)
4971
+ return ext4_mb_new_blocks_simple(handle, ar, errp);
45074972
45084973 /* Allow to use superuser reservation for quota file */
45094974 if (ext4_is_quota_file(ar->inode))
....@@ -4522,6 +4987,7 @@
45224987 ar->len = ar->len >> 1;
45234988 }
45244989 if (!ar->len) {
4990
+ ext4_mb_show_pa(sb);
45254991 *errp = -ENOSPC;
45264992 return 0;
45274993 }
....@@ -4559,26 +5025,32 @@
45595025 }
45605026
45615027 ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
5028
+ seq = this_cpu_read(discard_pa_seq);
45625029 if (!ext4_mb_use_preallocated(ac)) {
45635030 ac->ac_op = EXT4_MB_HISTORY_ALLOC;
45645031 ext4_mb_normalize_request(ac, ar);
5032
+
5033
+ *errp = ext4_mb_pa_alloc(ac);
5034
+ if (*errp)
5035
+ goto errout;
45655036 repeat:
45665037 /* allocate space in core */
45675038 *errp = ext4_mb_regular_allocator(ac);
4568
- if (*errp)
4569
- goto discard_and_exit;
4570
-
4571
- /* as we've just preallocated more space than
4572
- * user requested originally, we store allocated
4573
- * space in a special descriptor */
4574
- if (ac->ac_status == AC_STATUS_FOUND &&
4575
- ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
4576
- *errp = ext4_mb_new_preallocation(ac);
5039
+ /*
5040
+ * The pa allocated above is added to grp->bb_prealloc_list only
5041
+ * when we were able to allocate some blocks, i.e. when
5042
+ * ac->ac_status == AC_STATUS_FOUND.
5043
+ * An error from above means ac->ac_status != AC_STATUS_FOUND,
5044
+ * so we have to free this pa here.
5045
+ */
45775046 if (*errp) {
4578
- discard_and_exit:
5047
+ ext4_mb_pa_free(ac);
45795048 ext4_discard_allocated_blocks(ac);
45805049 goto errout;
45815050 }
5051
+ if (ac->ac_status == AC_STATUS_FOUND &&
5052
+ ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len)
5053
+ ext4_mb_pa_free(ac);
45825054 }
45835055 if (likely(ac->ac_status == AC_STATUS_FOUND)) {
45845056 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
....@@ -4590,9 +5062,14 @@
45905062 ar->len = ac->ac_b_ex.fe_len;
45915063 }
45925064 } else {
4593
- freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len);
4594
- if (freed)
5065
+ if (++retries < 3 &&
5066
+ ext4_mb_discard_preallocations_should_retry(sb, ac, &seq))
45955067 goto repeat;
5068
+ /*
5069
+ * If block allocation fails, the pa allocated above
5070
+ * needs to be freed here.
5071
+ */
5072
+ ext4_mb_pa_free(ac);
45965073 *errp = -ENOSPC;
45975074 }
45985075
....@@ -4721,12 +5198,117 @@
47215198 return 0;
47225199 }
47235200
5201
+/*
5202
+ * Simple allocator for Ext4 fast commit replay path. It searches for blocks
5203
+ * linearly starting at the goal block and also excludes the blocks which
5204
+ * are going to be in use after fast commit replay.
5205
+ */
5206
+static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle,
5207
+ struct ext4_allocation_request *ar, int *errp)
5208
+{
5209
+ struct buffer_head *bitmap_bh;
5210
+ struct super_block *sb = ar->inode->i_sb;
5211
+ ext4_group_t group;
5212
+ ext4_grpblk_t blkoff;
5213
+ ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
5214
+ ext4_grpblk_t i = 0;
5215
+ ext4_fsblk_t goal, block;
5216
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
5217
+
5218
+ goal = ar->goal;
5219
+ if (goal < le32_to_cpu(es->s_first_data_block) ||
5220
+ goal >= ext4_blocks_count(es))
5221
+ goal = le32_to_cpu(es->s_first_data_block);
5222
+
5223
+ ar->len = 0;
5224
+ ext4_get_group_no_and_offset(sb, goal, &group, &blkoff);
5225
+ for (; group < ext4_get_groups_count(sb); group++) {
5226
+ bitmap_bh = ext4_read_block_bitmap(sb, group);
5227
+ if (IS_ERR(bitmap_bh)) {
5228
+ *errp = PTR_ERR(bitmap_bh);
5229
+ pr_warn("Failed to read block bitmap\n");
5230
+ return 0;
5231
+ }
5232
+
5233
+ ext4_get_group_no_and_offset(sb,
5234
+ max(ext4_group_first_block_no(sb, group), goal),
5235
+ NULL, &blkoff);
5236
+ while (1) {
5237
+ i = mb_find_next_zero_bit(bitmap_bh->b_data, max,
5238
+ blkoff);
5239
+ if (i >= max)
5240
+ break;
5241
+ if (ext4_fc_replay_check_excluded(sb,
5242
+ ext4_group_first_block_no(sb, group) + i)) {
5243
+ blkoff = i + 1;
5244
+ } else
5245
+ break;
5246
+ }
5247
+ brelse(bitmap_bh);
5248
+ if (i < max)
5249
+ break;
5250
+ }
5251
+
5252
+ if (group >= ext4_get_groups_count(sb) || i >= max) {
5253
+ *errp = -ENOSPC;
5254
+ return 0;
5255
+ }
5256
+
5257
+ block = ext4_group_first_block_no(sb, group) + i;
5258
+ ext4_mb_mark_bb(sb, block, 1, 1);
5259
+ ar->len = 1;
5260
+
5261
+ return block;
5262
+}
5263
+
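Note that this replay-path allocator hands back exactly one cluster per call (ar->len is forced to 1): it scans linearly from the goal block, skipping anything ext4_fc_replay_check_excluded() reports as in use after replay, and ext4_mb_new_blocks() dispatches to it whenever EXT4_FC_REPLAY is set in s_mount_state, as shown in the earlier hunk.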
5264
+static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block,
5265
+ unsigned long count)
5266
+{
5267
+ struct buffer_head *bitmap_bh;
5268
+ struct super_block *sb = inode->i_sb;
5269
+ struct ext4_group_desc *gdp;
5270
+ struct buffer_head *gdp_bh;
5271
+ ext4_group_t group;
5272
+ ext4_grpblk_t blkoff;
5273
+ int already_freed = 0, err, i;
5274
+
5275
+ ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
5276
+ bitmap_bh = ext4_read_block_bitmap(sb, group);
5277
+ if (IS_ERR(bitmap_bh)) {
5278
+ err = PTR_ERR(bitmap_bh);
5279
+ pr_warn("Failed to read block bitmap\n");
5280
+ return;
5281
+ }
5282
+ gdp = ext4_get_group_desc(sb, group, &gdp_bh);
5283
+ if (!gdp)
5284
+ return;
5285
+
5286
+ for (i = 0; i < count; i++) {
5287
+ if (!mb_test_bit(blkoff + i, bitmap_bh->b_data))
5288
+ already_freed++;
5289
+ }
5290
+ mb_clear_bits(bitmap_bh->b_data, blkoff, count);
5291
+ err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
5292
+ if (err)
5293
+ return;
5294
+ ext4_free_group_clusters_set(
5295
+ sb, gdp, ext4_free_group_clusters(sb, gdp) +
5296
+ count - already_freed);
5297
+ ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh);
5298
+ ext4_group_desc_csum_set(sb, group, gdp);
5299
+ ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
5300
+ sync_dirty_buffer(bitmap_bh);
5301
+ sync_dirty_buffer(gdp_bh);
5302
+ brelse(bitmap_bh);
5303
+}
5304
+
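One detail worth noting in ext4_free_blocks_simple(): the already_freed count means the group descriptor's free-cluster total is increased only by bits that were actually set in the bitmap, so clearing bits that were already clear (for example when the same free is replayed more than once) does not inflate the free-space accounting.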
47245305 /**
47255306 * ext4_free_blocks() -- Free given blocks and update quota
47265307 * @handle: handle for this transaction
47275308 * @inode: inode
4728
- * @block: start physical block to free
4729
- * @count: number of blocks to count
5309
+ * @bh: optional buffer of the block to be freed
5310
+ * @block: starting physical block to be freed
5311
+ * @count: number of blocks to be freed
47305312 * @flags: flags used by ext4_free_blocks
47315313 */
47325314 void ext4_free_blocks(handle_t *handle, struct inode *inode,
....@@ -4746,6 +5328,13 @@
47465328 int err = 0;
47475329 int ret;
47485330
5331
+ sbi = EXT4_SB(sb);
5332
+
5333
+ if (sbi->s_mount_state & EXT4_FC_REPLAY) {
5334
+ ext4_free_blocks_simple(inode, block, count);
5335
+ return;
5336
+ }
5337
+
47495338 might_sleep();
47505339 if (bh) {
47515340 if (block)
....@@ -4754,7 +5343,6 @@
47545343 block = bh->b_blocknr;
47555344 }
47565345
4757
- sbi = EXT4_SB(sb);
47585346 if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
47595347 !ext4_inode_block_valid(inode, block, count)) {
47605348 ext4_error(sb, "Freeing blocks not in datazone - "
....@@ -4946,9 +5534,17 @@
49465534 flex_group)->free_clusters);
49475535 }
49485536
4949
- if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
4950
- dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
4951
- percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters);
5537
+ /*
5538
+ * on a bigalloc file system, defer the s_freeclusters_counter
5539
+ * update to the caller (ext4_remove_space and friends) so they
5540
+ * can determine if a cluster freed here should be rereserved
5541
+ */
5542
+ if (!(flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)) {
5543
+ if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
5544
+ dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
5545
+ percpu_counter_add(&sbi->s_freeclusters_counter,
5546
+ count_clusters);
5547
+ }
49525548
49535549 ext4_mb_unload_buddy(&e4b);
49545550