2024-02-20 102a0743326a03cd1a1202ceda21e175b7d3575c
kernel/fs/ext4/mballoc.c
@@ -16,6 +16,7 @@
 #include <linux/slab.h>
 #include <linux/nospec.h>
 #include <linux/backing-dev.h>
+#include <linux/freezer.h>
 #include <trace/events/ext4.h>

 /*
@@ -684,6 +685,8 @@
 MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);

 grp = ext4_get_group_info(sb, e4b->bd_group);
+ if (!grp)
+ return NULL;
 list_for_each(cur, &grp->bb_prealloc_list) {
 ext4_group_t groupnr;
 struct ext4_prealloc_space *pa;
@@ -767,9 +770,9 @@

 static noinline_for_stack
 void ext4_mb_generate_buddy(struct super_block *sb,
- void *buddy, void *bitmap, ext4_group_t group)
+ void *buddy, void *bitmap, ext4_group_t group,
+ struct ext4_group_info *grp)
 {
- struct ext4_group_info *grp = ext4_get_group_info(sb, group);
 struct ext4_sb_info *sbi = EXT4_SB(sb);
 ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
 ext4_grpblk_t i = 0;
@@ -816,28 +819,8 @@
 clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));

 period = get_cycles() - period;
- spin_lock(&sbi->s_bal_lock);
- sbi->s_mb_buddies_generated++;
- sbi->s_mb_generation_time += period;
- spin_unlock(&sbi->s_bal_lock);
-}
-
-static void mb_regenerate_buddy(struct ext4_buddy *e4b)
-{
- int count;
- int order = 1;
- void *buddy;
-
- while ((buddy = mb_find_buddy(e4b, order++, &count))) {
- ext4_set_bits(buddy, 0, count);
- }
- e4b->bd_info->bb_fragments = 0;
- memset(e4b->bd_info->bb_counters, 0,
- sizeof(*e4b->bd_info->bb_counters) *
- (e4b->bd_sb->s_blocksize_bits + 2));
-
- ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy,
- e4b->bd_bitmap, e4b->bd_group);
+ atomic_inc(&sbi->s_mb_buddies_generated);
+ atomic64_add(period, &sbi->s_mb_generation_time);
 }

 /* The buddy information is attached the buddy cache inode
@@ -909,6 +892,8 @@
 break;

 grinfo = ext4_get_group_info(sb, group);
+ if (!grinfo)
+ continue;
 /*
 * If page is uptodate then we came here after online resize
 * which added some new uninitialized group info structs, so
@@ -974,6 +959,10 @@
 group, page->index, i * blocksize);
 trace_ext4_mb_buddy_bitmap_load(sb, group);
 grinfo = ext4_get_group_info(sb, group);
+ if (!grinfo) {
+ err = -EFSCORRUPTED;
+ goto out;
+ }
 grinfo->bb_fragments = 0;
 memset(grinfo->bb_counters, 0,
 sizeof(*grinfo->bb_counters) *
@@ -984,7 +973,7 @@
 ext4_lock_group(sb, group);
 /* init the buddy */
 memset(data, 0xff, blocksize);
- ext4_mb_generate_buddy(sb, data, incore, group);
+ ext4_mb_generate_buddy(sb, data, incore, group, grinfo);
 ext4_unlock_group(sb, group);
 incore = NULL;
 } else {
@@ -1098,6 +1087,9 @@
 might_sleep();
 mb_debug(sb, "init group %u\n", group);
 this_grp = ext4_get_group_info(sb, group);
+ if (!this_grp)
+ return -EFSCORRUPTED;
+
 /*
 * This ensures that we don't reinit the buddy cache
 * page which map to the group from which we are already
@@ -1172,6 +1164,8 @@

 blocks_per_page = PAGE_SIZE / sb->s_blocksize;
 grp = ext4_get_group_info(sb, group);
+ if (!grp)
+ return -EFSCORRUPTED;

 e4b->bd_blkbits = sb->s_blocksize_bits;
 e4b->bd_info = grp;
@@ -1512,7 +1506,6 @@
 sb, e4b->bd_group,
 EXT4_GROUP_INFO_BBITMAP_CORRUPT);
 }
- mb_regenerate_buddy(e4b);
 goto done;
 }

@@ -1885,7 +1878,9 @@
 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
 struct ext4_free_extent ex;

- if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
+ if (!grp)
+ return -EFSCORRUPTED;
+ if (!(ac->ac_flags & (EXT4_MB_HINT_TRY_GOAL | EXT4_MB_HINT_GOAL_ONLY)))
 return 0;
 if (grp->bb_free == 0)
 return 0;
@@ -2109,7 +2104,7 @@

 BUG_ON(cr < 0 || cr >= 4);

- if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
+ if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
 return false;

 free = grp->bb_free;
@@ -2172,6 +2167,10 @@
 ext4_grpblk_t free;
 int ret = 0;

+ if (!grp)
+ return -EFSCORRUPTED;
+ if (sbi->s_mb_stats)
+ atomic64_inc(&sbi->s_bal_cX_groups_considered[ac->ac_criteria]);
 if (should_lock)
 ext4_lock_group(sb, group);
 free = grp->bb_free;
@@ -2242,7 +2241,7 @@
 * prefetch once, so we avoid getblk() call, which can
 * be expensive.
 */
- if (!EXT4_MB_GRP_TEST_AND_SET_READ(grp) &&
+ if (gdp && grp && !EXT4_MB_GRP_TEST_AND_SET_READ(grp) &&
 EXT4_MB_GRP_NEED_INIT(grp) &&
 ext4_free_group_clusters(sb, gdp) > 0 &&
 !(ext4_has_group_desc_csum(sb) &&
@@ -2286,7 +2285,7 @@
 group--;
 grp = ext4_get_group_info(sb, group);

- if (EXT4_MB_GRP_NEED_INIT(grp) &&
+ if (grp && gdp && EXT4_MB_GRP_NEED_INIT(grp) &&
 ext4_free_group_clusters(sb, gdp) > 0 &&
 !(ext4_has_group_desc_csum(sb) &&
 (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) {
@@ -2446,6 +2445,9 @@
 if (ac->ac_status != AC_STATUS_CONTINUE)
 break;
 }
+ /* Processed all groups and haven't found blocks */
+ if (sbi->s_mb_stats && i == ngroups)
+ atomic64_inc(&sbi->s_bal_cX_failed[cr]);
 }

 if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
@@ -2475,6 +2477,9 @@
 goto repeat;
 }
 }
+
+ if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND)
+ atomic64_inc(&sbi->s_bal_cX_hits[ac->ac_criteria]);
 out:
 if (!err && ac->ac_status != AC_STATUS_FOUND && first_err)
 err = first_err;
@@ -2538,6 +2543,8 @@
 sizeof(struct ext4_group_info);

 grinfo = ext4_get_group_info(sb, group);
+ if (!grinfo)
+ return 0;
 /* Load the group info in memory only if not already loaded. */
 if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) {
 err = ext4_mb_load_buddy(sb, group, &e4b);
@@ -2548,7 +2555,7 @@
 buddy_loaded = 1;
 }

- memcpy(&sg, ext4_get_group_info(sb, group), i);
+ memcpy(&sg, grinfo, i);

 if (buddy_loaded)
 ext4_mb_unload_buddy(&e4b);
@@ -2573,6 +2580,67 @@
 .stop = ext4_mb_seq_groups_stop,
 .show = ext4_mb_seq_groups_show,
 };
+
+int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset)
+{
+ struct super_block *sb = (struct super_block *)seq->private;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ seq_puts(seq, "mballoc:\n");
+ if (!sbi->s_mb_stats) {
+ seq_puts(seq, "\tmb stats collection turned off.\n");
+ seq_puts(seq, "\tTo enable, please write \"1\" to sysfs file mb_stats.\n");
+ return 0;
+ }
+ seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs));
+ seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success));
+
+ seq_printf(seq, "\tgroups_scanned: %u\n", atomic_read(&sbi->s_bal_groups_scanned));
+
+ seq_puts(seq, "\tcr0_stats:\n");
+ seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[0]));
+ seq_printf(seq, "\t\tgroups_considered: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_groups_considered[0]));
+ seq_printf(seq, "\t\tuseless_loops: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_failed[0]));
+
+ seq_puts(seq, "\tcr1_stats:\n");
+ seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[1]));
+ seq_printf(seq, "\t\tgroups_considered: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_groups_considered[1]));
+ seq_printf(seq, "\t\tuseless_loops: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_failed[1]));
+
+ seq_puts(seq, "\tcr2_stats:\n");
+ seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[2]));
+ seq_printf(seq, "\t\tgroups_considered: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_groups_considered[2]));
+ seq_printf(seq, "\t\tuseless_loops: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_failed[2]));
+
+ seq_puts(seq, "\tcr3_stats:\n");
+ seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[3]));
+ seq_printf(seq, "\t\tgroups_considered: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_groups_considered[3]));
+ seq_printf(seq, "\t\tuseless_loops: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_failed[3]));
+ seq_printf(seq, "\textents_scanned: %u\n", atomic_read(&sbi->s_bal_ex_scanned));
+ seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals));
+ seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders));
+ seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks));
+ seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks));
+
+ seq_printf(seq, "\tbuddies_generated: %u/%u\n",
+ atomic_read(&sbi->s_mb_buddies_generated),
+ ext4_get_groups_count(sb));
+ seq_printf(seq, "\tbuddies_time_used: %llu\n",
+ atomic64_read(&sbi->s_mb_generation_time));
+ seq_printf(seq, "\tpreallocated: %u\n",
+ atomic_read(&sbi->s_mb_preallocated));
+ seq_printf(seq, "\tdiscarded: %u\n",
+ atomic_read(&sbi->s_mb_discarded));
+ return 0;
+}

 static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
 {
@@ -2764,8 +2832,12 @@

 err_freebuddy:
 cachep = get_groupinfo_cache(sb->s_blocksize_bits);
- while (i-- > 0)
- kmem_cache_free(cachep, ext4_get_group_info(sb, i));
+ while (i-- > 0) {
+ struct ext4_group_info *grp = ext4_get_group_info(sb, i);
+
+ if (grp)
+ kmem_cache_free(cachep, grp);
+ }
 i = sbi->s_group_info_size;
 rcu_read_lock();
 group_info = rcu_dereference(sbi->s_group_info);
@@ -2874,7 +2946,6 @@
 } while (i <= sb->s_blocksize_bits + 1);

 spin_lock_init(&sbi->s_md_lock);
- spin_lock_init(&sbi->s_bal_lock);
 sbi->s_mb_free_pending = 0;
 INIT_LIST_HEAD(&sbi->s_freed_data_list);

@@ -2973,6 +3044,8 @@
 for (i = 0; i < ngroups; i++) {
 cond_resched();
 grinfo = ext4_get_group_info(sb, i);
+ if (!grinfo)
+ continue;
 mb_group_bb_bitmap_free(grinfo);
 ext4_lock_group(sb, i);
 count = ext4_mb_cleanup_pa(grinfo);
@@ -3002,17 +3075,18 @@
 atomic_read(&sbi->s_bal_reqs),
 atomic_read(&sbi->s_bal_success));
 ext4_msg(sb, KERN_INFO,
- "mballoc: %u extents scanned, %u goal hits, "
+ "mballoc: %u extents scanned, %u groups scanned, %u goal hits, "
 "%u 2^N hits, %u breaks, %u lost",
 atomic_read(&sbi->s_bal_ex_scanned),
+ atomic_read(&sbi->s_bal_groups_scanned),
 atomic_read(&sbi->s_bal_goals),
 atomic_read(&sbi->s_bal_2orders),
 atomic_read(&sbi->s_bal_breaks),
 atomic_read(&sbi->s_mb_lost_chunks));
 ext4_msg(sb, KERN_INFO,
- "mballoc: %lu generated and it took %Lu",
- sbi->s_mb_buddies_generated,
- sbi->s_mb_generation_time);
+ "mballoc: %u generated and it took %llu",
+ atomic_read(&sbi->s_mb_buddies_generated),
+ atomic64_read(&sbi->s_mb_generation_time));
 ext4_msg(sb, KERN_INFO,
 "mballoc: %u preallocated, %u discarded",
 atomic_read(&sbi->s_mb_preallocated),
@@ -3439,6 +3513,7 @@
 struct ext4_allocation_request *ar)
 {
 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+ struct ext4_super_block *es = sbi->s_es;
 int bsbits, max;
 ext4_lblk_t end;
 loff_t size, start_off;
@@ -3619,18 +3694,21 @@
 ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size);

 /* define goal start in order to merge */
- if (ar->pright && (ar->lright == (start + size))) {
+ if (ar->pright && (ar->lright == (start + size)) &&
+ ar->pright >= size &&
+ ar->pright - size >= le32_to_cpu(es->s_first_data_block)) {
 /* merge to the right */
 ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size,
- &ac->ac_f_ex.fe_group,
- &ac->ac_f_ex.fe_start);
+ &ac->ac_g_ex.fe_group,
+ &ac->ac_g_ex.fe_start);
 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
 }
- if (ar->pleft && (ar->lleft + 1 == start)) {
+ if (ar->pleft && (ar->lleft + 1 == start) &&
+ ar->pleft + 1 < ext4_blocks_count(es)) {
 /* merge to the left */
 ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1,
- &ac->ac_f_ex.fe_group,
- &ac->ac_f_ex.fe_start);
+ &ac->ac_g_ex.fe_group,
+ &ac->ac_g_ex.fe_start);
 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
 }

@@ -3642,12 +3720,13 @@
 {
 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);

- if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) {
+ if (sbi->s_mb_stats && ac->ac_g_ex.fe_len >= 1) {
 atomic_inc(&sbi->s_bal_reqs);
 atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
 if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
 atomic_inc(&sbi->s_bal_success);
 atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
+ atomic_add(ac->ac_groups_scanned, &sbi->s_bal_groups_scanned);
 if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
 ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
 atomic_inc(&sbi->s_bal_goals);
@@ -3722,6 +3801,7 @@
 BUG_ON(start < pa->pa_pstart);
 BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len));
 BUG_ON(pa->pa_free < len);
+ BUG_ON(ac->ac_b_ex.fe_len <= 0);
 pa->pa_free -= len;

 mb_debug(ac->ac_sb, "use %llu/%d from inode pa %p\n", start, len, pa);
@@ -3884,6 +3964,8 @@
 struct ext4_free_data *entry;

 grp = ext4_get_group_info(sb, group);
+ if (!grp)
+ return;
 n = rb_first(&(grp->bb_free_root));

 while (n) {
@@ -3910,6 +3992,9 @@
 ext4_grpblk_t start;
 int preallocated = 0;
 int len;
+
+ if (!grp)
+ return;

 /* all form of preallocation discards first load group,
 * so the only competing code is preallocation use.
@@ -4046,10 +4131,8 @@
 pa = ac->ac_pa;

 if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) {
- int winl;
- int wins;
- int win;
- int offs;
+ int new_bex_start;
+ int new_bex_end;

 /* we can't allocate as much as normalizer wants.
 * so, found space must get proper lstart
@@ -4057,26 +4140,40 @@
 BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical);
 BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len);

- /* we're limited by original request in that
- * logical block must be covered any way
- * winl is window we can move our chunk within */
- winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical;
+ /*
+ * Use the below logic for adjusting best extent as it keeps
+ * fragmentation in check while ensuring logical range of best
+ * extent doesn't overflow out of goal extent:
+ *
+ * 1. Check if best ex can be kept at end of goal and still
+ * cover original start
+ * 2. Else, check if best ex can be kept at start of goal and
+ * still cover original start
+ * 3. Else, keep the best ex at start of original request.
+ */
+ new_bex_end = ac->ac_g_ex.fe_logical +
+ EXT4_C2B(sbi, ac->ac_g_ex.fe_len);
+ new_bex_start = new_bex_end - EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
+ if (ac->ac_o_ex.fe_logical >= new_bex_start)
+ goto adjust_bex;

- /* also, we should cover whole original request */
- wins = EXT4_C2B(sbi, ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len);
+ new_bex_start = ac->ac_g_ex.fe_logical;
+ new_bex_end =
+ new_bex_start + EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
+ if (ac->ac_o_ex.fe_logical < new_bex_end)
+ goto adjust_bex;

- /* the smallest one defines real window */
- win = min(winl, wins);
+ new_bex_start = ac->ac_o_ex.fe_logical;
+ new_bex_end =
+ new_bex_start + EXT4_C2B(sbi, ac->ac_b_ex.fe_len);

- offs = ac->ac_o_ex.fe_logical %
- EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
- if (offs && offs < win)
- win = offs;
+adjust_bex:
+ ac->ac_b_ex.fe_logical = new_bex_start;

- ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical -
- EXT4_NUM_B2C(sbi, win);
 BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
 BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
+ BUG_ON(new_bex_end > (ac->ac_g_ex.fe_logical +
+ EXT4_C2B(sbi, ac->ac_g_ex.fe_len)));
 }

 /* preallocation can change ac_b_ex, thus we store actually
@@ -4102,6 +4199,8 @@

 ei = EXT4_I(ac->ac_inode);
 grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
+ if (!grp)
+ return;

 pa->pa_obj_lock = &ei->i_prealloc_lock;
 pa->pa_inode = ac->ac_inode;
@@ -4155,6 +4254,8 @@
 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);

 grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
+ if (!grp)
+ return;
 lg = ac->ac_lg;
 BUG_ON(lg == NULL);

@@ -4250,7 +4351,11 @@
 trace_ext4_mb_release_group_pa(sb, pa);
 BUG_ON(pa->pa_deleted == 0);
 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
- BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
+ if (unlikely(group != e4b->bd_group && pa->pa_len != 0)) {
+ ext4_warning(sb, "bad group: expected %u, group %u, pa_start %llu",
+ e4b->bd_group, group, pa->pa_pstart);
+ return 0;
+ }
 mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
 atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
 trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len);
@@ -4279,6 +4384,8 @@
 int err;
 int free = 0;

+ if (!grp)
+ return 0;
 mb_debug(sb, "discard preallocation for group %u\n", group);
 if (list_empty(&grp->bb_prealloc_list))
 goto out_dbg;
@@ -4516,6 +4623,9 @@
 struct ext4_prealloc_space *pa;
 ext4_grpblk_t start;
 struct list_head *cur;
+
+ if (!grp)
+ continue;
 ext4_lock_group(sb, i);
 list_for_each(cur, &grp->bb_prealloc_list) {
 pa = list_entry(cur, struct ext4_prealloc_space,
@@ -5303,7 +5413,8 @@
 }

 /**
- * ext4_free_blocks() -- Free given blocks and update quota
+ * ext4_mb_clear_bb() -- helper function for freeing blocks.
+ * Used by ext4_free_blocks()
 * @handle: handle for this transaction
 * @inode: inode
 * @bh: optional buffer of the block to be freed
@@ -5311,13 +5422,14 @@
 * @count: number of blocks to be freed
 * @flags: flags used by ext4_free_blocks
 */
-void ext4_free_blocks(handle_t *handle, struct inode *inode,
- struct buffer_head *bh, ext4_fsblk_t block,
- unsigned long count, int flags)
+static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode,
+ ext4_fsblk_t block, unsigned long count,
+ int flags)
 {
 struct buffer_head *bitmap_bh = NULL;
 struct super_block *sb = inode->i_sb;
 struct ext4_group_desc *gdp;
+ struct ext4_group_info *grp;
 unsigned int overflow;
 ext4_grpblk_t bit;
 struct buffer_head *gd_bh;
@@ -5330,86 +5442,21 @@

 sbi = EXT4_SB(sb);

- if (sbi->s_mount_state & EXT4_FC_REPLAY) {
- ext4_free_blocks_simple(inode, block, count);
- return;
- }
-
- might_sleep();
- if (bh) {
- if (block)
- BUG_ON(block != bh->b_blocknr);
- else
- block = bh->b_blocknr;
- }
-
 if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
 !ext4_inode_block_valid(inode, block, count)) {
- ext4_error(sb, "Freeing blocks not in datazone - "
- "block = %llu, count = %lu", block, count);
+ ext4_error(sb, "Freeing blocks in system zone - "
+ "Block = %llu, count = %lu", block, count);
+ /* err = 0. ext4_std_error should be a no op */
 goto error_return;
 }
-
- ext4_debug("freeing block %llu\n", block);
- trace_ext4_free_blocks(inode, block, count, flags);
-
- if (bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
- BUG_ON(count > 1);
-
- ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
- inode, bh, block);
- }
-
- /*
- * If the extent to be freed does not begin on a cluster
- * boundary, we need to deal with partial clusters at the
- * beginning and end of the extent. Normally we will free
- * blocks at the beginning or the end unless we are explicitly
- * requested to avoid doing so.
- */
- overflow = EXT4_PBLK_COFF(sbi, block);
- if (overflow) {
- if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
- overflow = sbi->s_cluster_ratio - overflow;
- block += overflow;
- if (count > overflow)
- count -= overflow;
- else
- return;
- } else {
- block -= overflow;
- count += overflow;
- }
- }
- overflow = EXT4_LBLK_COFF(sbi, count);
- if (overflow) {
- if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
- if (count > overflow)
- count -= overflow;
- else
- return;
- } else
- count += sbi->s_cluster_ratio - overflow;
- }
-
- if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
- int i;
- int is_metadata = flags & EXT4_FREE_BLOCKS_METADATA;
-
- for (i = 0; i < count; i++) {
- cond_resched();
- if (is_metadata)
- bh = sb_find_get_block(inode->i_sb, block + i);
- ext4_forget(handle, is_metadata, inode, bh, block + i);
- }
- }
+ flags |= EXT4_FREE_BLOCKS_VALIDATED;

 do_more:
 overflow = 0;
 ext4_get_group_no_and_offset(sb, block, &block_group, &bit);

- if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(
- ext4_get_group_info(sb, block_group))))
+ grp = ext4_get_group_info(sb, block_group);
+ if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
 return;

 /*
@@ -5420,6 +5467,8 @@
 overflow = EXT4_C2B(sbi, bit) + count -
 EXT4_BLOCKS_PER_GROUP(sb);
 count -= overflow;
+ /* The range changed so it's no longer validated */
+ flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
 }
 count_clusters = EXT4_NUM_B2C(sbi, count);
 bitmap_bh = ext4_read_block_bitmap(sb, block_group);
@@ -5434,13 +5483,8 @@
 goto error_return;
 }

- if (in_range(ext4_block_bitmap(sb, gdp), block, count) ||
- in_range(ext4_inode_bitmap(sb, gdp), block, count) ||
- in_range(block, ext4_inode_table(sb, gdp),
- sbi->s_itb_per_group) ||
- in_range(block + count - 1, ext4_inode_table(sb, gdp),
- sbi->s_itb_per_group)) {
-
+ if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
+ !ext4_inode_block_valid(inode, block, count)) {
 ext4_error(sb, "Freeing blocks in system zone - "
 "Block = %llu, count = %lu", block, count);
 /* err = 0. ext4_std_error should be a no op */
@@ -5506,11 +5550,11 @@
 * them with group lock_held
 */
 if (test_opt(sb, DISCARD)) {
- err = ext4_issue_discard(sb, block_group, bit, count,
- NULL);
+ err = ext4_issue_discard(sb, block_group, bit,
+ count_clusters, NULL);
 if (err && err != -EOPNOTSUPP)
 ext4_msg(sb, KERN_WARNING, "discard request in"
- " group:%d block:%d count:%lu failed"
+ " group:%u block:%d count:%lu failed"
 " with %d", block_group, bit, count,
 err);
 } else
@@ -5562,11 +5606,116 @@
 block += count;
 count = overflow;
 put_bh(bitmap_bh);
+ /* The range changed so it's no longer validated */
+ flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
 goto do_more;
 }
 error_return:
 brelse(bitmap_bh);
 ext4_std_error(sb, err);
+ return;
+}
+
+/**
+ * ext4_free_blocks() -- Free given blocks and update quota
+ * @handle: handle for this transaction
+ * @inode: inode
+ * @bh: optional buffer of the block to be freed
+ * @block: starting physical block to be freed
+ * @count: number of blocks to be freed
+ * @flags: flags used by ext4_free_blocks
+ */
+void ext4_free_blocks(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, ext4_fsblk_t block,
+ unsigned long count, int flags)
+{
+ struct super_block *sb = inode->i_sb;
+ unsigned int overflow;
+ struct ext4_sb_info *sbi;
+
+ sbi = EXT4_SB(sb);
+
+ if (bh) {
+ if (block)
+ BUG_ON(block != bh->b_blocknr);
+ else
+ block = bh->b_blocknr;
+ }
+
+ if (sbi->s_mount_state & EXT4_FC_REPLAY) {
+ ext4_free_blocks_simple(inode, block, EXT4_NUM_B2C(sbi, count));
+ return;
+ }
+
+ might_sleep();
+
+ if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
+ !ext4_inode_block_valid(inode, block, count)) {
+ ext4_error(sb, "Freeing blocks not in datazone - "
+ "block = %llu, count = %lu", block, count);
+ return;
+ }
+ flags |= EXT4_FREE_BLOCKS_VALIDATED;
+
+ ext4_debug("freeing block %llu\n", block);
+ trace_ext4_free_blocks(inode, block, count, flags);
+
+ if (bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
+ BUG_ON(count > 1);
+
+ ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
+ inode, bh, block);
+ }
+
+ /*
+ * If the extent to be freed does not begin on a cluster
+ * boundary, we need to deal with partial clusters at the
+ * beginning and end of the extent. Normally we will free
+ * blocks at the beginning or the end unless we are explicitly
+ * requested to avoid doing so.
+ */
+ overflow = EXT4_PBLK_COFF(sbi, block);
+ if (overflow) {
+ if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
+ overflow = sbi->s_cluster_ratio - overflow;
+ block += overflow;
+ if (count > overflow)
+ count -= overflow;
+ else
+ return;
+ } else {
+ block -= overflow;
+ count += overflow;
+ }
+ /* The range changed so it's no longer validated */
+ flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
+ }
+ overflow = EXT4_LBLK_COFF(sbi, count);
+ if (overflow) {
+ if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
+ if (count > overflow)
+ count -= overflow;
+ else
+ return;
+ } else
+ count += sbi->s_cluster_ratio - overflow;
+ /* The range changed so it's no longer validated */
+ flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
+ }
+
+ if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
+ int i;
+ int is_metadata = flags & EXT4_FREE_BLOCKS_METADATA;
+
+ for (i = 0; i < count; i++) {
+ cond_resched();
+ if (is_metadata)
+ bh = sb_find_get_block(inode->i_sb, block + i);
+ ext4_forget(handle, is_metadata, inode, bh, block + i);
+ }
+ }
+
+ ext4_mb_clear_bb(handle, inode, block, count, flags);
 return;
 }

@@ -5626,11 +5775,7 @@
 goto error_return;
 }

- if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
- in_range(ext4_inode_bitmap(sb, desc), block, count) ||
- in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
- in_range(block + count - 1, ext4_inode_table(sb, desc),
- sbi->s_itb_per_group)) {
+ if (!ext4_sb_block_valid(sb, NULL, block, count)) {
 ext4_error(sb, "Adding blocks in system zones - "
 "Block = %llu, count = %lu",
 block, count);
@@ -5715,19 +5860,19 @@
 * @sb: super block for the file system
 * @start: starting block of the free extent in the alloc. group
 * @count: number of blocks to TRIM
- * @group: alloc. group we are working with
 * @e4b: ext4 buddy for the group
 *
 * Trim "count" blocks starting at "start" in the "group". To assure that no
 * one will allocate those blocks, mark it as used in buddy bitmap. This must
 * be called with under the group lock.
 */
-static int ext4_trim_extent(struct super_block *sb, int start, int count,
- ext4_group_t group, struct ext4_buddy *e4b)
+static int ext4_trim_extent(struct super_block *sb,
+ int start, int count, struct ext4_buddy *e4b)
 __releases(bitlock)
 __acquires(bitlock)
 {
 struct ext4_free_extent ex;
+ ext4_group_t group = e4b->bd_group;
 int ret = 0;

 trace_ext4_trim_extent(sb, group, start, count);
@@ -5748,6 +5893,71 @@
 ext4_lock_group(sb, group);
 mb_free_blocks(NULL, e4b, start, ex.fe_len);
 return ret;
+}
+
+static ext4_grpblk_t ext4_last_grp_cluster(struct super_block *sb,
+ ext4_group_t grp)
+{
+ if (grp < ext4_get_groups_count(sb))
+ return EXT4_CLUSTERS_PER_GROUP(sb) - 1;
+ return (ext4_blocks_count(EXT4_SB(sb)->s_es) -
+ ext4_group_first_block_no(sb, grp) - 1) >>
+ EXT4_CLUSTER_BITS(sb);
+}
+
+static bool ext4_trim_interrupted(void)
+{
+ return fatal_signal_pending(current) || freezing(current);
+}
+
+static int ext4_try_to_trim_range(struct super_block *sb,
+ struct ext4_buddy *e4b, ext4_grpblk_t start,
+ ext4_grpblk_t max, ext4_grpblk_t minblocks)
+{
+ ext4_grpblk_t next, count, free_count;
+ bool set_trimmed = false;
+ void *bitmap;
+
+ bitmap = e4b->bd_bitmap;
+ if (start == 0 && max >= ext4_last_grp_cluster(sb, e4b->bd_group))
+ set_trimmed = true;
+ start = max(e4b->bd_info->bb_first_free, start);
+ count = 0;
+ free_count = 0;
+
+ while (start <= max) {
+ start = mb_find_next_zero_bit(bitmap, max + 1, start);
+ if (start > max)
+ break;
+ next = mb_find_next_bit(bitmap, max + 1, start);
+
+ if ((next - start) >= minblocks) {
+ int ret = ext4_trim_extent(sb, start, next - start, e4b);
+
+ if (ret && ret != -EOPNOTSUPP)
+ return count;
+ count += next - start;
+ }
+ free_count += next - start;
+ start = next + 1;
+
+ if (ext4_trim_interrupted())
+ return count;
+
+ if (need_resched()) {
+ ext4_unlock_group(sb, e4b->bd_group);
+ cond_resched();
+ ext4_lock_group(sb, e4b->bd_group);
+ }
+
+ if ((e4b->bd_info->bb_free - free_count) < minblocks)
+ break;
+ }
+
+ if (set_trimmed)
+ EXT4_MB_GRP_SET_TRIMMED(e4b->bd_info);
+
+ return count;
 }

 /**
@@ -5773,10 +5983,8 @@
 ext4_grpblk_t start, ext4_grpblk_t max,
 ext4_grpblk_t minblocks)
 {
- void *bitmap;
- ext4_grpblk_t next, count = 0, free_count = 0;
 struct ext4_buddy e4b;
- int ret = 0;
+ int ret;

 trace_ext4_trim_all_free(sb, group, start, max);

@@ -5786,58 +5994,20 @@
 ret, group);
 return ret;
 }
- bitmap = e4b.bd_bitmap;

 ext4_lock_group(sb, group);
- if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) &&
- minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks))
- goto out;

- start = (e4b.bd_info->bb_first_free > start) ?
- e4b.bd_info->bb_first_free : start;
+ if (!EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) ||
+ minblocks < EXT4_SB(sb)->s_last_trim_minblks)
+ ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks);
+ else
+ ret = 0;

- while (start <= max) {
- start = mb_find_next_zero_bit(bitmap, max + 1, start);
- if (start > max)
- break;
- next = mb_find_next_bit(bitmap, max + 1, start);
-
- if ((next - start) >= minblocks) {
- ret = ext4_trim_extent(sb, start,
- next - start, group, &e4b);
- if (ret && ret != -EOPNOTSUPP)
- break;
- ret = 0;
- count += next - start;
- }
- free_count += next - start;
- start = next + 1;
-
- if (fatal_signal_pending(current)) {
- count = -ERESTARTSYS;
- break;
- }
-
- if (need_resched()) {
- ext4_unlock_group(sb, group);
- cond_resched();
- ext4_lock_group(sb, group);
- }
-
- if ((e4b.bd_info->bb_free - free_count) < minblocks)
- break;
- }
-
- if (!ret) {
- ret = count;
- EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
- }
-out:
 ext4_unlock_group(sb, group);
 ext4_mb_unload_buddy(&e4b);

 ext4_debug("trimmed %d blocks in the group %d\n",
- count, group);
+ ret, group);

 return ret;
 }
@@ -5882,7 +6052,7 @@
 if (minlen > EXT4_CLUSTERS_PER_GROUP(sb))
 goto out;
 }
- if (end >= max_blks)
+ if (end >= max_blks - 1)
 end = max_blks - 1;
 if (end <= first_data_blk)
 goto out;
@@ -5899,7 +6069,11 @@
 end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;

 for (group = first_group; group <= last_group; group++) {
+ if (ext4_trim_interrupted())
+ break;
 grp = ext4_get_group_info(sb, group);
+ if (!grp)
+ continue;
 /* We only do this if the grp has never been initialized */
 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
 ret = ext4_mb_init_group(sb, group, GFP_NOFS);
@@ -5915,10 +6089,9 @@
 */
 if (group == last_group)
 end = last_cluster;
-
 if (grp->bb_free >= minlen) {
 cnt = ext4_trim_all_free(sb, group, first_cluster,
- end, minlen);
+ end, minlen);
 if (cnt < 0) {
 ret = cnt;
 break;
@@ -5934,7 +6107,7 @@
 }

 if (!ret)
- atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen);
+ EXT4_SB(sb)->s_last_trim_minblks = minlen;

 out:
 range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits;
@@ -5963,8 +6136,7 @@

 ext4_lock_group(sb, group);

- start = (e4b.bd_info->bb_first_free > start) ?
- e4b.bd_info->bb_first_free : start;
+ start = max(e4b.bd_info->bb_first_free, start);
 if (end >= EXT4_CLUSTERS_PER_GROUP(sb))
 end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;