.. | .. |
---|
18 | 18 | #include <linux/backing-dev.h> |
---|
19 | 19 | #include <trace/events/ext4.h> |
---|
20 | 20 | |
---|
21 | | -#ifdef CONFIG_EXT4_DEBUG |
---|
22 | | -ushort ext4_mballoc_debug __read_mostly; |
---|
23 | | - |
---|
24 | | -module_param_named(mballoc_debug, ext4_mballoc_debug, ushort, 0644); |
---|
25 | | -MODULE_PARM_DESC(mballoc_debug, "Debugging level for ext4's mballoc"); |
---|
26 | | -#endif |
---|
27 | | - |
---|
28 | 21 | /* |
---|
29 | 22 | * MUSTDO: |
---|
30 | 23 | * - test ext4_ext_search_left() and ext4_ext_search_right() |
---|
.. | .. |
---|
131 | 124 | * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in |
---|
132 | 125 | * terms of number of blocks. If we have mounted the file system with -O |
---|
133 | 126 | * stripe=<value> option the group prealloc request is normalized to the |
---|
134 | | - * the smallest multiple of the stripe value (sbi->s_stripe) which is |
---|
| 127 | + * smallest multiple of the stripe value (sbi->s_stripe) which is |
---|
135 | 128 | * greater than the default mb_group_prealloc. |
---|
136 | 129 | * |
---|
137 | 130 | * The regular allocator (using the buddy cache) supports a few tunables. |
---|
.. | .. |
---|
356 | 349 | ext4_group_t group); |
---|
357 | 350 | static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, |
---|
358 | 351 | ext4_group_t group); |
---|
| 352 | +static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac); |
---|
| 353 | + |
---|
| 354 | +/* |
---|
| 355 | + * The algorithm using this percpu seq counter is as follows: |
---|
| 356 | + * 1. We sample the percpu discard_pa_seq counter before trying for block |
---|
| 357 | + * allocation in ext4_mb_new_blocks(). |
---|
| 358 | + * 2. We increment this percpu discard_pa_seq counter when we either allocate |
---|
| 359 | + * or free these blocks i.e. while marking those blocks as used/free in |
---|
| 360 | + * mb_mark_used()/mb_free_blocks(). |
---|
| 361 | + * 3. We also increment this percpu seq counter when we successfully identify |
---|
| 362 | + * that the bb_prealloc_list is not empty and hence proceed for discarding |
---|
| 363 | + * of those PAs inside ext4_mb_discard_group_preallocations(). |
---|
| 364 | + * |
---|
| 365 | + * Now, to keep the regular fast path of block allocation unaffected, as a |
---|
| 366 | + * small optimization we only sample the percpu seq counter on the local |
---|
| 367 | + * cpu. Only when block allocation fails and no freed blocks were found do |
---|
| 368 | + * we sample the percpu seq counter across all cpus, using the function |
---|
| 369 | + * ext4_get_discard_pa_seq_sum() below. This happens after making sure |
---|
| 370 | + * that all the PAs on grp->bb_prealloc_list got freed or the list is empty. |
---|
| 371 | + */ |
---|
| 372 | +static DEFINE_PER_CPU(u64, discard_pa_seq); |
---|
| 373 | +static inline u64 ext4_get_discard_pa_seq_sum(void) |
---|
| 374 | +{ |
---|
| 375 | + int __cpu; |
---|
| 376 | + u64 __seq = 0; |
---|
| 377 | + |
---|
| 378 | + for_each_possible_cpu(__cpu) |
---|
| 379 | + __seq += per_cpu(discard_pa_seq, __cpu); |
---|
| 380 | + return __seq; |
---|
| 381 | +} |
---|
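
To make the counter scheme above concrete, here is a minimal user-space sketch of the same retry pattern, with hypothetical names (`discard_seq`, `alloc_with_retry`) and C11 atomics standing in for the kernel's percpu counters; for brevity it always sums all slots rather than sampling only the local cpu first:

```c
#include <stdatomic.h>
#include <stdbool.h>

#define NCPUS 4
static _Atomic unsigned long long discard_seq[NCPUS];	/* like discard_pa_seq */

static unsigned long long seq_sum(void)	/* like ext4_get_discard_pa_seq_sum() */
{
	unsigned long long sum = 0;

	for (int i = 0; i < NCPUS; i++)
		sum += atomic_load(&discard_seq[i]);
	return sum;
}

/* Writers bump their slot on every allocate/free/discard, as in
 * mb_mark_used()/mb_free_blocks(). */
static bool alloc_with_retry(bool (*try_alloc)(void))
{
	unsigned long long before = seq_sum();

	if (try_alloc())
		return true;
	/* retry only if a concurrent free/discard bumped a counter */
	return seq_sum() != before && try_alloc();
}
```
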
359 | 382 | |
---|
360 | 383 | static inline void *mb_correct_addr_and_bit(int *bit, void *addr) |
---|
361 | 384 | { |
---|
.. | .. |
---|
493 | 516 | |
---|
494 | 517 | static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) |
---|
495 | 518 | { |
---|
| 519 | + if (unlikely(e4b->bd_info->bb_bitmap == NULL)) |
---|
| 520 | + return; |
---|
496 | 521 | if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) { |
---|
497 | 522 | unsigned char *b1, *b2; |
---|
498 | 523 | int i; |
---|
.. | .. |
---|
511 | 536 | } |
---|
512 | 537 | } |
---|
513 | 538 | |
---|
| 539 | +static void mb_group_bb_bitmap_alloc(struct super_block *sb, |
---|
| 540 | + struct ext4_group_info *grp, ext4_group_t group) |
---|
| 541 | +{ |
---|
| 542 | + struct buffer_head *bh; |
---|
| 543 | + |
---|
| 544 | + grp->bb_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS); |
---|
| 545 | + if (!grp->bb_bitmap) |
---|
| 546 | + return; |
---|
| 547 | + |
---|
| 548 | + bh = ext4_read_block_bitmap(sb, group); |
---|
| 549 | + if (IS_ERR_OR_NULL(bh)) { |
---|
| 550 | + kfree(grp->bb_bitmap); |
---|
| 551 | + grp->bb_bitmap = NULL; |
---|
| 552 | + return; |
---|
| 553 | + } |
---|
| 554 | + |
---|
| 555 | + memcpy(grp->bb_bitmap, bh->b_data, sb->s_blocksize); |
---|
| 556 | + put_bh(bh); |
---|
| 557 | +} |
---|
| 558 | + |
---|
| 559 | +static void mb_group_bb_bitmap_free(struct ext4_group_info *grp) |
---|
| 560 | +{ |
---|
| 561 | + kfree(grp->bb_bitmap); |
---|
| 562 | +} |
---|
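
These helpers factor the DOUBLE_CHECK shadow bitmap out of ext4_mb_add_groupinfo(): debug builds keep a second, independently maintained copy of each block bitmap, which mb_cmp_bitmaps() compares against. A minimal stand-alone sketch of the shadow-state technique, with hypothetical types and helpers:

```c
#include <assert.h>
#include <string.h>

struct shadowed_bitmap {
	unsigned char *real;	/* the bitmap being exercised */
	unsigned char *shadow;	/* independently maintained copy */
	size_t len;		/* length in bytes */
};

static void set_bit_checked(struct shadowed_bitmap *b, size_t bit)
{
	/* both copies must agree before and after every mutation */
	assert(!memcmp(b->real, b->shadow, b->len));
	b->real[bit / 8] |= 1u << (bit % 8);
	b->shadow[bit / 8] |= 1u << (bit % 8);
	assert(!memcmp(b->real, b->shadow, b->len));
}
```
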
| 563 | + |
---|
514 | 564 | #else |
---|
515 | 565 | static inline void mb_free_blocks_double(struct inode *inode, |
---|
516 | 566 | struct ext4_buddy *e4b, int first, int count) |
---|
.. | .. |
---|
523 | 573 | return; |
---|
524 | 574 | } |
---|
525 | 575 | static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) |
---|
| 576 | +{ |
---|
| 577 | + return; |
---|
| 578 | +} |
---|
| 579 | + |
---|
| 580 | +static inline void mb_group_bb_bitmap_alloc(struct super_block *sb, |
---|
| 581 | + struct ext4_group_info *grp, ext4_group_t group) |
---|
| 582 | +{ |
---|
| 583 | + return; |
---|
| 584 | +} |
---|
| 585 | + |
---|
| 586 | +static inline void mb_group_bb_bitmap_free(struct ext4_group_info *grp) |
---|
526 | 587 | { |
---|
527 | 588 | return; |
---|
528 | 589 | } |
---|
.. | .. |
---|
558 | 619 | void *buddy; |
---|
559 | 620 | void *buddy2; |
---|
560 | 621 | |
---|
561 | | - { |
---|
562 | | - static int mb_check_counter; |
---|
563 | | - if (mb_check_counter++ % 100 != 0) |
---|
564 | | - return 0; |
---|
565 | | - } |
---|
| 622 | + if (e4b->bd_info->bb_check_counter++ % 10) |
---|
| 623 | + return 0; |
---|
566 | 624 | |
---|
567 | 625 | while (order > 1) { |
---|
568 | 626 | buddy = mb_find_buddy(e4b, order, &max); |
---|
.. | .. |
---|
820 | 878 | char *bitmap; |
---|
821 | 879 | struct ext4_group_info *grinfo; |
---|
822 | 880 | |
---|
823 | | - mb_debug(1, "init page %lu\n", page->index); |
---|
824 | | - |
---|
825 | 881 | inode = page->mapping->host; |
---|
826 | 882 | sb = inode->i_sb; |
---|
827 | 883 | ngroups = ext4_get_groups_count(sb); |
---|
828 | 884 | blocksize = i_blocksize(inode); |
---|
829 | 885 | blocks_per_page = PAGE_SIZE / blocksize; |
---|
| 886 | + |
---|
| 887 | + mb_debug(sb, "init page %lu\n", page->index); |
---|
830 | 888 | |
---|
831 | 889 | groups_per_page = blocks_per_page >> 1; |
---|
832 | 890 | if (groups_per_page == 0) |
---|
.. | .. |
---|
861 | 919 | bh[i] = NULL; |
---|
862 | 920 | continue; |
---|
863 | 921 | } |
---|
864 | | - bh[i] = ext4_read_block_bitmap_nowait(sb, group); |
---|
| 922 | + bh[i] = ext4_read_block_bitmap_nowait(sb, group, false); |
---|
865 | 923 | if (IS_ERR(bh[i])) { |
---|
866 | 924 | err = PTR_ERR(bh[i]); |
---|
867 | 925 | bh[i] = NULL; |
---|
868 | 926 | goto out; |
---|
869 | 927 | } |
---|
870 | | - mb_debug(1, "read bitmap for group %u\n", group); |
---|
| 928 | + mb_debug(sb, "read bitmap for group %u\n", group); |
---|
871 | 929 | } |
---|
872 | 930 | |
---|
873 | 931 | /* wait for I/O completion */ |
---|
.. | .. |
---|
912 | 970 | if ((first_block + i) & 1) { |
---|
913 | 971 | /* this is block of buddy */ |
---|
914 | 972 | BUG_ON(incore == NULL); |
---|
915 | | - mb_debug(1, "put buddy for group %u in page %lu/%x\n", |
---|
| 973 | + mb_debug(sb, "put buddy for group %u in page %lu/%x\n", |
---|
916 | 974 | group, page->index, i * blocksize); |
---|
917 | 975 | trace_ext4_mb_buddy_bitmap_load(sb, group); |
---|
918 | 976 | grinfo = ext4_get_group_info(sb, group); |
---|
.. | .. |
---|
932 | 990 | } else { |
---|
933 | 991 | /* this is block of bitmap */ |
---|
934 | 992 | BUG_ON(incore != NULL); |
---|
935 | | - mb_debug(1, "put bitmap for group %u in page %lu/%x\n", |
---|
| 993 | + mb_debug(sb, "put bitmap for group %u in page %lu/%x\n", |
---|
936 | 994 | group, page->index, i * blocksize); |
---|
937 | 995 | trace_ext4_mb_bitmap_load(sb, group); |
---|
938 | 996 | |
---|
.. | .. |
---|
1038 | 1096 | int ret = 0; |
---|
1039 | 1097 | |
---|
1040 | 1098 | might_sleep(); |
---|
1041 | | - mb_debug(1, "init group %u\n", group); |
---|
| 1099 | + mb_debug(sb, "init group %u\n", group); |
---|
1042 | 1100 | this_grp = ext4_get_group_info(sb, group); |
---|
1043 | 1101 | /* |
---|
1044 | 1102 | * This ensures that we don't reinit the buddy cache |
---|
.. | .. |
---|
1110 | 1168 | struct inode *inode = sbi->s_buddy_cache; |
---|
1111 | 1169 | |
---|
1112 | 1170 | might_sleep(); |
---|
1113 | | - mb_debug(1, "load group %u\n", group); |
---|
| 1171 | + mb_debug(sb, "load group %u\n", group); |
---|
1114 | 1172 | |
---|
1115 | 1173 | blocks_per_page = PAGE_SIZE / sb->s_blocksize; |
---|
1116 | 1174 | grp = ext4_get_group_info(sb, group); |
---|
.. | .. |
---|
1217 | 1275 | /* Pages marked accessed already */ |
---|
1218 | 1276 | e4b->bd_buddy_page = page; |
---|
1219 | 1277 | e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); |
---|
1220 | | - |
---|
1221 | | - BUG_ON(e4b->bd_bitmap_page == NULL); |
---|
1222 | | - BUG_ON(e4b->bd_buddy_page == NULL); |
---|
1223 | 1278 | |
---|
1224 | 1279 | return 0; |
---|
1225 | 1280 | |
---|
.. | .. |
---|
1336 | 1391 | } |
---|
1337 | 1392 | } |
---|
1338 | 1393 | |
---|
1339 | | -/* |
---|
1340 | | - * _________________________________________________________________ */ |
---|
1341 | | - |
---|
1342 | 1394 | static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side) |
---|
1343 | 1395 | { |
---|
1344 | 1396 | if (mb_test_bit(*bit + side, bitmap)) { |
---|
.. | .. |
---|
1430 | 1482 | mb_check_buddy(e4b); |
---|
1431 | 1483 | mb_free_blocks_double(inode, e4b, first, count); |
---|
1432 | 1484 | |
---|
| 1485 | + this_cpu_inc(discard_pa_seq); |
---|
1433 | 1486 | e4b->bd_info->bb_free += count; |
---|
1434 | 1487 | if (first < e4b->bd_info->bb_first_free) |
---|
1435 | 1488 | e4b->bd_info->bb_first_free = first; |
---|
.. | .. |
---|
1449 | 1502 | |
---|
1450 | 1503 | blocknr = ext4_group_first_block_no(sb, e4b->bd_group); |
---|
1451 | 1504 | blocknr += EXT4_C2B(sbi, block); |
---|
1452 | | - ext4_grp_locked_error(sb, e4b->bd_group, |
---|
1453 | | - inode ? inode->i_ino : 0, |
---|
1454 | | - blocknr, |
---|
1455 | | - "freeing already freed block " |
---|
1456 | | - "(bit %u); block bitmap corrupt.", |
---|
1457 | | - block); |
---|
1458 | | - ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, |
---|
| 1505 | + if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) { |
---|
| 1506 | + ext4_grp_locked_error(sb, e4b->bd_group, |
---|
| 1507 | + inode ? inode->i_ino : 0, |
---|
| 1508 | + blocknr, |
---|
| 1509 | + "freeing already freed block (bit %u); block bitmap corrupt.", |
---|
| 1510 | + block); |
---|
| 1511 | + ext4_mark_group_bitmap_corrupted( |
---|
| 1512 | + sb, e4b->bd_group, |
---|
1459 | 1513 | EXT4_GROUP_INFO_BBITMAP_CORRUPT); |
---|
| 1514 | + } |
---|
1460 | 1515 | mb_regenerate_buddy(e4b); |
---|
1461 | 1516 | goto done; |
---|
1462 | 1517 | } |
---|
.. | .. |
---|
1572 | 1627 | mb_check_buddy(e4b); |
---|
1573 | 1628 | mb_mark_used_double(e4b, start, len); |
---|
1574 | 1629 | |
---|
| 1630 | + this_cpu_inc(discard_pa_seq); |
---|
1575 | 1631 | e4b->bd_info->bb_free -= len; |
---|
1576 | 1632 | if (e4b->bd_info->bb_first_free == start) |
---|
1577 | 1633 | e4b->bd_info->bb_first_free += len; |
---|
.. | .. |
---|
1671 | 1727 | sbi->s_mb_last_start = ac->ac_f_ex.fe_start; |
---|
1672 | 1728 | spin_unlock(&sbi->s_md_lock); |
---|
1673 | 1729 | } |
---|
1674 | | -} |
---|
| 1730 | + /* |
---|
| 1731 | + * As we've just preallocated more space than |
---|
| 1732 | + * user requested originally, we store allocated |
---|
| 1733 | + * space in a special descriptor. |
---|
| 1734 | + */ |
---|
| 1735 | + if (ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) |
---|
| 1736 | + ext4_mb_new_preallocation(ac); |
---|
1675 | 1737 | |
---|
1676 | | -/* |
---|
1677 | | - * regular allocator, for general purposes allocation |
---|
1678 | | - */ |
---|
| 1738 | +} |
---|
1679 | 1739 | |
---|
1680 | 1740 | static void ext4_mb_check_limits(struct ext4_allocation_context *ac, |
---|
1681 | 1741 | struct ext4_buddy *e4b, |
---|
.. | .. |
---|
1919 | 1979 | |
---|
1920 | 1980 | ext4_mb_use_best_found(ac, e4b); |
---|
1921 | 1981 | |
---|
1922 | | - BUG_ON(ac->ac_b_ex.fe_len != ac->ac_g_ex.fe_len); |
---|
| 1982 | + BUG_ON(ac->ac_f_ex.fe_len != ac->ac_g_ex.fe_len); |
---|
1923 | 1983 | |
---|
1924 | 1984 | if (EXT4_SB(sb)->s_mb_stats) |
---|
1925 | 1985 | atomic_inc(&EXT4_SB(sb)->s_bal_2orders); |
---|
.. | .. |
---|
1956 | 2016 | /* |
---|
1957 | 2017 | * IF we have corrupt bitmap, we won't find any |
---|
1958 | 2018 | * free blocks even though group info says we |
---|
1959 | | - * we have free blocks |
---|
| 2019 | + * have free blocks |
---|
1960 | 2020 | */ |
---|
1961 | 2021 | ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, |
---|
1962 | 2022 | "%d free clusters as per " |
---|
.. | .. |
---|
2036 | 2096 | } |
---|
2037 | 2097 | |
---|
2038 | 2098 | /* |
---|
2039 | | - * This is now called BEFORE we load the buddy bitmap. |
---|
| 2099 | + * This is also called BEFORE we load the buddy bitmap. |
---|
2040 | 2100 | * Returns either 1 or 0 indicating that the group is either suitable |
---|
2041 | | - * for the allocation or not. In addition it can also return negative |
---|
2042 | | - * error code when something goes wrong. |
---|
| 2101 | + * for the allocation or not. |
---|
2043 | 2102 | */ |
---|
2044 | | -static int ext4_mb_good_group(struct ext4_allocation_context *ac, |
---|
| 2103 | +static bool ext4_mb_good_group(struct ext4_allocation_context *ac, |
---|
2045 | 2104 | ext4_group_t group, int cr) |
---|
2046 | 2105 | { |
---|
2047 | | - unsigned free, fragments; |
---|
| 2106 | + ext4_grpblk_t free, fragments; |
---|
2048 | 2107 | int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); |
---|
2049 | 2108 | struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); |
---|
2050 | 2109 | |
---|
2051 | 2110 | BUG_ON(cr < 0 || cr >= 4); |
---|
2052 | 2111 | |
---|
| 2112 | + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) |
---|
| 2113 | + return false; |
---|
| 2114 | + |
---|
2053 | 2115 | free = grp->bb_free; |
---|
2054 | 2116 | if (free == 0) |
---|
2055 | | - return 0; |
---|
2056 | | - if (cr <= 2 && free < ac->ac_g_ex.fe_len) |
---|
2057 | | - return 0; |
---|
2058 | | - |
---|
2059 | | - if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) |
---|
2060 | | - return 0; |
---|
2061 | | - |
---|
2062 | | - /* We only do this if the grp has never been initialized */ |
---|
2063 | | - if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { |
---|
2064 | | - int ret = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS); |
---|
2065 | | - if (ret) |
---|
2066 | | - return ret; |
---|
2067 | | - } |
---|
| 2117 | + return false; |
---|
2068 | 2118 | |
---|
2069 | 2119 | fragments = grp->bb_fragments; |
---|
2070 | 2120 | if (fragments == 0) |
---|
2071 | | - return 0; |
---|
| 2121 | + return false; |
---|
2072 | 2122 | |
---|
2073 | 2123 | switch (cr) { |
---|
2074 | 2124 | case 0: |
---|
.. | .. |
---|
2078 | 2128 | if ((ac->ac_flags & EXT4_MB_HINT_DATA) && |
---|
2079 | 2129 | (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) && |
---|
2080 | 2130 | ((group % flex_size) == 0)) |
---|
2081 | | - return 0; |
---|
| 2131 | + return false; |
---|
2082 | 2132 | |
---|
2083 | | - if ((ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) || |
---|
2084 | | - (free / fragments) >= ac->ac_g_ex.fe_len) |
---|
2085 | | - return 1; |
---|
| 2133 | + if (free < ac->ac_g_ex.fe_len) |
---|
| 2134 | + return false; |
---|
| 2135 | + |
---|
| 2136 | + if (ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) |
---|
| 2137 | + return true; |
---|
2086 | 2138 | |
---|
2087 | 2139 | if (grp->bb_largest_free_order < ac->ac_2order) |
---|
2088 | | - return 0; |
---|
| 2140 | + return false; |
---|
2089 | 2141 | |
---|
2090 | | - return 1; |
---|
| 2142 | + return true; |
---|
2091 | 2143 | case 1: |
---|
2092 | 2144 | if ((free / fragments) >= ac->ac_g_ex.fe_len) |
---|
2093 | | - return 1; |
---|
| 2145 | + return true; |
---|
2094 | 2146 | break; |
---|
2095 | 2147 | case 2: |
---|
2096 | 2148 | if (free >= ac->ac_g_ex.fe_len) |
---|
2097 | | - return 1; |
---|
| 2149 | + return true; |
---|
2098 | 2150 | break; |
---|
2099 | 2151 | case 3: |
---|
2100 | | - return 1; |
---|
| 2152 | + return true; |
---|
2101 | 2153 | default: |
---|
2102 | 2154 | BUG(); |
---|
2103 | 2155 | } |
---|
2104 | 2156 | |
---|
2105 | | - return 0; |
---|
| 2157 | + return false; |
---|
| 2158 | +} |
---|
| 2159 | + |
---|
| 2160 | +/* |
---|
| 2161 | + * This could return negative error code if something goes wrong |
---|
| 2162 | + * during ext4_mb_init_group(). This should not be called with |
---|
| 2163 | + * ext4_lock_group() held. |
---|
| 2164 | + */ |
---|
| 2165 | +static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, |
---|
| 2166 | + ext4_group_t group, int cr) |
---|
| 2167 | +{ |
---|
| 2168 | + struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); |
---|
| 2169 | + struct super_block *sb = ac->ac_sb; |
---|
| 2170 | + struct ext4_sb_info *sbi = EXT4_SB(sb); |
---|
| 2171 | + bool should_lock = ac->ac_flags & EXT4_MB_STRICT_CHECK; |
---|
| 2172 | + ext4_grpblk_t free; |
---|
| 2173 | + int ret = 0; |
---|
| 2174 | + |
---|
| 2175 | + if (should_lock) |
---|
| 2176 | + ext4_lock_group(sb, group); |
---|
| 2177 | + free = grp->bb_free; |
---|
| 2178 | + if (free == 0) |
---|
| 2179 | + goto out; |
---|
| 2180 | + if (cr <= 2 && free < ac->ac_g_ex.fe_len) |
---|
| 2181 | + goto out; |
---|
| 2182 | + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) |
---|
| 2183 | + goto out; |
---|
| 2184 | + if (should_lock) |
---|
| 2185 | + ext4_unlock_group(sb, group); |
---|
| 2186 | + |
---|
| 2187 | + /* We only do this if the grp has never been initialized */ |
---|
| 2188 | + if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { |
---|
| 2189 | + struct ext4_group_desc *gdp = |
---|
| 2190 | + ext4_get_group_desc(sb, group, NULL); |
---|
| 2191 | + int ret; |
---|
| 2192 | + |
---|
| 2193 | + /* cr=0/1 is a very optimistic search to find large |
---|
| 2194 | + * good chunks almost for free. If buddy data is not |
---|
| 2195 | + * ready, then this optimization makes no sense. But |
---|
| 2196 | + * we never skip the first block group in a flex_bg, |
---|
| 2197 | + * since this gets used for metadata block allocation, |
---|
| 2198 | + * and we want to make sure we locate metadata blocks |
---|
| 2199 | + * in the first block group in the flex_bg if possible. |
---|
| 2200 | + */ |
---|
| 2201 | + if (cr < 2 && |
---|
| 2202 | + (!sbi->s_log_groups_per_flex || |
---|
| 2203 | + ((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) && |
---|
| 2204 | + !(ext4_has_group_desc_csum(sb) && |
---|
| 2205 | + (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) |
---|
| 2206 | + return 0; |
---|
| 2207 | + ret = ext4_mb_init_group(sb, group, GFP_NOFS); |
---|
| 2208 | + if (ret) |
---|
| 2209 | + return ret; |
---|
| 2210 | + } |
---|
| 2211 | + |
---|
| 2212 | + if (should_lock) |
---|
| 2213 | + ext4_lock_group(sb, group); |
---|
| 2214 | + ret = ext4_mb_good_group(ac, group, cr); |
---|
| 2215 | +out: |
---|
| 2216 | + if (should_lock) |
---|
| 2217 | + ext4_unlock_group(sb, group); |
---|
| 2218 | + return ret; |
---|
| 2219 | +} |
---|
| 2220 | + |
---|
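
This split is the classic optimistic-check pattern: ext4_mb_good_group_nolock() screens groups without taking the group lock (unless EXT4_MB_STRICT_CHECK demands it), and callers revalidate with ext4_mb_good_group() under ext4_lock_group() before committing. A hedged, generic sketch of that shape, with hypothetical struct and helper names:

```c
#include <pthread.h>
#include <stdbool.h>

struct group {
	pthread_mutex_t lock;
	int free;		/* free clusters, updated under lock */
};

static bool group_looks_good(struct group *g, int want, bool strict)
{
	bool ok;

	if (strict)
		pthread_mutex_lock(&g->lock);
	/* cheap screen: may read a stale value when !strict, which is
	 * fine -- the caller re-checks under the lock before committing */
	ok = g->free >= want;
	if (strict)
		pthread_mutex_unlock(&g->lock);
	return ok;
}
```
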
| 2221 | +/* |
---|
| 2222 | + * Start prefetching @nr block bitmaps starting at @group. |
---|
| 2223 | + * Return the next group which needs to be prefetched. |
---|
| 2224 | + */ |
---|
| 2225 | +ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group, |
---|
| 2226 | + unsigned int nr, int *cnt) |
---|
| 2227 | +{ |
---|
| 2228 | + ext4_group_t ngroups = ext4_get_groups_count(sb); |
---|
| 2229 | + struct buffer_head *bh; |
---|
| 2230 | + struct blk_plug plug; |
---|
| 2231 | + |
---|
| 2232 | + blk_start_plug(&plug); |
---|
| 2233 | + while (nr-- > 0) { |
---|
| 2234 | + struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, |
---|
| 2235 | + NULL); |
---|
| 2236 | + struct ext4_group_info *grp = ext4_get_group_info(sb, group); |
---|
| 2237 | + |
---|
| 2238 | + /* |
---|
| 2239 | + * Prefetch block groups with free blocks; but don't |
---|
| 2240 | + * bother if it is marked uninitialized on disk, since |
---|
| 2241 | + * it won't require I/O to read. Also only try to |
---|
| 2242 | + * prefetch once, so we avoid the getblk() call, which can |
---|
| 2243 | + * be expensive. |
---|
| 2244 | + */ |
---|
| 2245 | + if (!EXT4_MB_GRP_TEST_AND_SET_READ(grp) && |
---|
| 2246 | + EXT4_MB_GRP_NEED_INIT(grp) && |
---|
| 2247 | + ext4_free_group_clusters(sb, gdp) > 0 && |
---|
| 2248 | + !(ext4_has_group_desc_csum(sb) && |
---|
| 2249 | + (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) { |
---|
| 2250 | + bh = ext4_read_block_bitmap_nowait(sb, group, true); |
---|
| 2251 | + if (bh && !IS_ERR(bh)) { |
---|
| 2252 | + if (!buffer_uptodate(bh) && cnt) |
---|
| 2253 | + (*cnt)++; |
---|
| 2254 | + brelse(bh); |
---|
| 2255 | + } |
---|
| 2256 | + } |
---|
| 2257 | + if (++group >= ngroups) |
---|
| 2258 | + group = 0; |
---|
| 2259 | + } |
---|
| 2260 | + blk_finish_plug(&plug); |
---|
| 2261 | + return group; |
---|
| 2262 | +} |
---|
| 2263 | + |
---|
| 2264 | +/* |
---|
| 2265 | + * Prefetching reads the block bitmap into the buffer cache; but we |
---|
| 2266 | + * need to make sure that the buddy bitmap in the page cache has been |
---|
| 2267 | + * initialized. Note that ext4_mb_init_group() will block if the I/O |
---|
| 2268 | + * is not yet completed, or indeed if it was not initiated by |
---|
| 2269 | + * ext4_mb_prefetch did not start the I/O. |
---|
| 2270 | + * |
---|
| 2271 | + * TODO: We should actually kick off the buddy bitmap setup in a work |
---|
| 2272 | + * queue when the buffer I/O is completed, so that we don't block |
---|
| 2273 | + * waiting for the block allocation bitmap read to finish when |
---|
| 2274 | + * ext4_mb_prefetch_fini is called from ext4_mb_regular_allocator(). |
---|
| 2275 | + */ |
---|
| 2276 | +void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group, |
---|
| 2277 | + unsigned int nr) |
---|
| 2278 | +{ |
---|
| 2279 | + while (nr-- > 0) { |
---|
| 2280 | + struct ext4_group_desc *gdp; |
---|
| 2281 | + struct ext4_group_info *grp; |
---|
| 2282 | + |
---|
| 2283 | + if (!group) |
---|
| 2284 | + group = ext4_get_groups_count(sb); |
---|
| 2285 | + group--; |
---|
| 2286 | + gdp = ext4_get_group_desc(sb, group, NULL); |
---|
| 2287 | + grp = ext4_get_group_info(sb, group); |
---|
| 2288 | + |
---|
| 2289 | + if (EXT4_MB_GRP_NEED_INIT(grp) && |
---|
| 2290 | + ext4_free_group_clusters(sb, gdp) > 0 && |
---|
| 2291 | + !(ext4_has_group_desc_csum(sb) && |
---|
| 2292 | + (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) { |
---|
| 2293 | + if (ext4_mb_init_group(sb, group, GFP_NOFS)) |
---|
| 2294 | + break; |
---|
| 2295 | + } |
---|
| 2296 | + } |
---|
2106 | 2297 | } |
---|
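
One possible shape for the TODO above — a sketch only, not what this patch implements, with the buddy_init_work wiring entirely hypothetical — is to hand the buddy initialization to a work item once the bitmap I/O completes:

```c
#include <linux/workqueue.h>
#include <linux/slab.h>

struct buddy_init_work {		/* hypothetical */
	struct work_struct work;
	struct super_block *sb;
	ext4_group_t group;
};

static void buddy_init_workfn(struct work_struct *work)
{
	struct buddy_init_work *biw =
		container_of(work, struct buddy_init_work, work);

	/* process context: may block on the bitmap read */
	ext4_mb_init_group(biw->sb, biw->group, GFP_NOFS);
	kfree(biw);
}

static void queue_buddy_init(struct super_block *sb, ext4_group_t group)
{
	struct buddy_init_work *biw = kmalloc(sizeof(*biw), GFP_ATOMIC);

	if (!biw)
		return;		/* best effort: fall back to lazy init */
	biw->sb = sb;
	biw->group = group;
	INIT_WORK(&biw->work, buddy_init_workfn);
	schedule_work(&biw->work);
}
```
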
2107 | 2298 | |
---|
2108 | 2299 | static noinline_for_stack int |
---|
2109 | 2300 | ext4_mb_regular_allocator(struct ext4_allocation_context *ac) |
---|
2110 | 2301 | { |
---|
2111 | | - ext4_group_t ngroups, group, i; |
---|
2112 | | - int cr; |
---|
| 2302 | + ext4_group_t prefetch_grp = 0, ngroups, group, i; |
---|
| 2303 | + int cr = -1; |
---|
2113 | 2304 | int err = 0, first_err = 0; |
---|
| 2305 | + unsigned int nr = 0, prefetch_ios = 0; |
---|
2114 | 2306 | struct ext4_sb_info *sbi; |
---|
2115 | 2307 | struct super_block *sb; |
---|
2116 | 2308 | struct ext4_buddy e4b; |
---|
| 2309 | + int lost; |
---|
2117 | 2310 | |
---|
2118 | 2311 | sb = ac->ac_sb; |
---|
2119 | 2312 | sbi = EXT4_SB(sb); |
---|
.. | .. |
---|
2133 | 2326 | goto out; |
---|
2134 | 2327 | |
---|
2135 | 2328 | /* |
---|
2136 | | - * ac->ac2_order is set only if the fe_len is a power of 2 |
---|
2137 | | - * if ac2_order is set we also set criteria to 0 so that we |
---|
| 2329 | + * ac->ac_2order is set only if the fe_len is a power of 2 |
---|
| 2330 | + * if ac->ac_2order is set we also set criteria to 0 so that we |
---|
2138 | 2331 | * try exact allocation using buddy. |
---|
2139 | 2332 | */ |
---|
2140 | 2333 | i = fls(ac->ac_g_ex.fe_len); |
---|
.. | .. |
---|
2178 | 2371 | * from the goal value specified |
---|
2179 | 2372 | */ |
---|
2180 | 2373 | group = ac->ac_g_ex.fe_group; |
---|
| 2374 | + prefetch_grp = group; |
---|
2181 | 2375 | |
---|
2182 | 2376 | for (i = 0; i < ngroups; group++, i++) { |
---|
2183 | 2377 | int ret = 0; |
---|
.. | .. |
---|
2189 | 2383 | if (group >= ngroups) |
---|
2190 | 2384 | group = 0; |
---|
2191 | 2385 | |
---|
| 2386 | + /* |
---|
| 2387 | + * Batch reads of the block allocation bitmaps |
---|
| 2388 | + * to get multiple READs in flight; limit |
---|
| 2389 | + * prefetching at cr=0/1, otherwise mballoc can |
---|
| 2390 | + * spend a lot of time loading imperfect groups |
---|
| 2391 | + */ |
---|
| 2392 | + if ((prefetch_grp == group) && |
---|
| 2393 | + (cr > 1 || |
---|
| 2394 | + prefetch_ios < sbi->s_mb_prefetch_limit)) { |
---|
| 2395 | + unsigned int curr_ios = prefetch_ios; |
---|
| 2396 | + |
---|
| 2397 | + nr = sbi->s_mb_prefetch; |
---|
| 2398 | + if (ext4_has_feature_flex_bg(sb)) { |
---|
| 2399 | + nr = 1 << sbi->s_log_groups_per_flex; |
---|
| 2400 | + nr -= group & (nr - 1); |
---|
| 2401 | + nr = min(nr, sbi->s_mb_prefetch); |
---|
| 2402 | + } |
---|
| 2403 | + prefetch_grp = ext4_mb_prefetch(sb, group, |
---|
| 2404 | + nr, &prefetch_ios); |
---|
| 2405 | + if (prefetch_ios == curr_ios) |
---|
| 2406 | + nr = 0; |
---|
| 2407 | + } |
---|
| 2408 | + |
---|
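
To illustrate the flex_bg trimming above with made-up numbers: with 16 groups per flex_bg (s_log_groups_per_flex == 4) and the scan positioned at group 35, the batch is cut at the flex_bg boundary:

```c
unsigned int nr = 1u << 4;	/* 16 groups per flex_bg */

nr -= 35 & (nr - 1);		/* 35 & 15 == 3, so nr becomes 13 */
/* groups 35..47 are prefetched together; group 48 opens the next
 * flex_bg (min() against s_mb_prefetch may clamp nr further) */
```
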
2192 | 2409 | /* This now checks without needing the buddy page */ |
---|
2193 | | - ret = ext4_mb_good_group(ac, group, cr); |
---|
| 2410 | + ret = ext4_mb_good_group_nolock(ac, group, cr); |
---|
2194 | 2411 | if (ret <= 0) { |
---|
2195 | 2412 | if (!first_err) |
---|
2196 | 2413 | first_err = ret; |
---|
.. | .. |
---|
2208 | 2425 | * block group |
---|
2209 | 2426 | */ |
---|
2210 | 2427 | ret = ext4_mb_good_group(ac, group, cr); |
---|
2211 | | - if (ret <= 0) { |
---|
| 2428 | + if (ret == 0) { |
---|
2212 | 2429 | ext4_unlock_group(sb, group); |
---|
2213 | 2430 | ext4_mb_unload_buddy(&e4b); |
---|
2214 | | - if (!first_err) |
---|
2215 | | - first_err = ret; |
---|
2216 | 2431 | continue; |
---|
2217 | 2432 | } |
---|
2218 | 2433 | |
---|
.. | .. |
---|
2239 | 2454 | * We've been searching too long. Let's try to allocate |
---|
2240 | 2455 | * the best chunk we've found so far |
---|
2241 | 2456 | */ |
---|
2242 | | - |
---|
2243 | 2457 | ext4_mb_try_best_found(ac, &e4b); |
---|
2244 | 2458 | if (ac->ac_status != AC_STATUS_FOUND) { |
---|
2245 | 2459 | /* |
---|
2246 | 2460 | * Someone more lucky has already allocated it. |
---|
2247 | 2461 | * The only thing we can do is just take first |
---|
2248 | 2462 | * found block(s) |
---|
2249 | | - printk(KERN_DEBUG "EXT4-fs: someone won our chunk\n"); |
---|
2250 | 2463 | */ |
---|
| 2464 | + lost = atomic_inc_return(&sbi->s_mb_lost_chunks); |
---|
| 2465 | + mb_debug(sb, "lost chunk, group: %u, start: %d, len: %d, lost: %d\n", |
---|
| 2466 | + ac->ac_b_ex.fe_group, ac->ac_b_ex.fe_start, |
---|
| 2467 | + ac->ac_b_ex.fe_len, lost); |
---|
| 2468 | + |
---|
2251 | 2469 | ac->ac_b_ex.fe_group = 0; |
---|
2252 | 2470 | ac->ac_b_ex.fe_start = 0; |
---|
2253 | 2471 | ac->ac_b_ex.fe_len = 0; |
---|
2254 | 2472 | ac->ac_status = AC_STATUS_CONTINUE; |
---|
2255 | 2473 | ac->ac_flags |= EXT4_MB_HINT_FIRST; |
---|
2256 | 2474 | cr = 3; |
---|
2257 | | - atomic_inc(&sbi->s_mb_lost_chunks); |
---|
2258 | 2475 | goto repeat; |
---|
2259 | 2476 | } |
---|
2260 | 2477 | } |
---|
2261 | 2478 | out: |
---|
2262 | 2479 | if (!err && ac->ac_status != AC_STATUS_FOUND && first_err) |
---|
2263 | 2480 | err = first_err; |
---|
| 2481 | + |
---|
| 2482 | + mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n", |
---|
| 2483 | + ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status, |
---|
| 2484 | + ac->ac_flags, cr, err); |
---|
| 2485 | + |
---|
| 2486 | + if (nr) |
---|
| 2487 | + ext4_mb_prefetch_fini(sb, prefetch_grp, nr); |
---|
| 2488 | + |
---|
2264 | 2489 | return err; |
---|
2265 | 2490 | } |
---|
2266 | 2491 | |
---|
.. | .. |
---|
2333 | 2558 | for (i = 0; i <= 13; i++) |
---|
2334 | 2559 | seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ? |
---|
2335 | 2560 | sg.info.bb_counters[i] : 0); |
---|
2336 | | - seq_printf(seq, " ]\n"); |
---|
| 2561 | + seq_puts(seq, " ]\n"); |
---|
2337 | 2562 | |
---|
2338 | 2563 | return 0; |
---|
2339 | 2564 | } |
---|
.. | .. |
---|
2453 | 2678 | meta_group_info[i]->bb_free_root = RB_ROOT; |
---|
2454 | 2679 | meta_group_info[i]->bb_largest_free_order = -1; /* uninit */ |
---|
2455 | 2680 | |
---|
2456 | | -#ifdef DOUBLE_CHECK |
---|
2457 | | - { |
---|
2458 | | - struct buffer_head *bh; |
---|
2459 | | - meta_group_info[i]->bb_bitmap = |
---|
2460 | | - kmalloc(sb->s_blocksize, GFP_NOFS); |
---|
2461 | | - BUG_ON(meta_group_info[i]->bb_bitmap == NULL); |
---|
2462 | | - bh = ext4_read_block_bitmap(sb, group); |
---|
2463 | | - BUG_ON(IS_ERR_OR_NULL(bh)); |
---|
2464 | | - memcpy(meta_group_info[i]->bb_bitmap, bh->b_data, |
---|
2465 | | - sb->s_blocksize); |
---|
2466 | | - put_bh(bh); |
---|
2467 | | - } |
---|
2468 | | -#endif |
---|
2469 | | - |
---|
| 2681 | + mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group); |
---|
2470 | 2682 | return 0; |
---|
2471 | 2683 | |
---|
2472 | 2684 | exit_group_info: |
---|
.. | .. |
---|
2510 | 2722 | sbi->s_buddy_cache->i_ino = EXT4_BAD_INO; |
---|
2511 | 2723 | EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; |
---|
2512 | 2724 | for (i = 0; i < ngroups; i++) { |
---|
| 2725 | + cond_resched(); |
---|
2513 | 2726 | desc = ext4_get_group_desc(sb, i, NULL); |
---|
2514 | 2727 | if (desc == NULL) { |
---|
2515 | 2728 | ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i); |
---|
.. | .. |
---|
2518 | 2731 | if (ext4_mb_add_groupinfo(sb, i, desc) != 0) |
---|
2519 | 2732 | goto err_freebuddy; |
---|
2520 | 2733 | } |
---|
| 2734 | + |
---|
| 2735 | + if (ext4_has_feature_flex_bg(sb)) { |
---|
| 2736 | + /* A single flex group is supposed to be read by a single IO. |
---|
| 2737 | + * 1 << s_log_groups_per_flex must fit in s_mb_prefetch, an |
---|
| 2738 | + * unsigned integer, so shifts of 32 or more are rejected. |
---|
| 2739 | + */ |
---|
| 2740 | + if (sbi->s_es->s_log_groups_per_flex >= 32) { |
---|
| 2741 | + ext4_msg(sb, KERN_ERR, "too many log groups per flexible block group"); |
---|
| 2742 | + goto err_freebuddy; |
---|
| 2743 | + } |
---|
| 2744 | + sbi->s_mb_prefetch = min_t(uint, 1 << sbi->s_es->s_log_groups_per_flex, |
---|
| 2745 | + BLK_MAX_SEGMENT_SIZE >> (sb->s_blocksize_bits - 9)); |
---|
| 2746 | + sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */ |
---|
| 2747 | + } else { |
---|
| 2748 | + sbi->s_mb_prefetch = 32; |
---|
| 2749 | + } |
---|
| 2750 | + if (sbi->s_mb_prefetch > ext4_get_groups_count(sb)) |
---|
| 2751 | + sbi->s_mb_prefetch = ext4_get_groups_count(sb); |
---|
| 2752 | + /* How many real IOs to prefetch within a single allocation at cr=0. |
---|
| 2753 | + * Given that cr=0 is a CPU-related optimization we shouldn't try to |
---|
| 2754 | + * load too many groups; at some point we should start to use what |
---|
| 2755 | + * we've got in memory. |
---|
| 2756 | + * With an average random access time of 5ms, it'd take a second to get |
---|
| 2757 | + * 200 groups (* N with flex_bg), so let's make this limit 4. |
---|
| 2758 | + */ |
---|
| 2759 | + sbi->s_mb_prefetch_limit = sbi->s_mb_prefetch * 4; |
---|
| 2760 | + if (sbi->s_mb_prefetch_limit > ext4_get_groups_count(sb)) |
---|
| 2761 | + sbi->s_mb_prefetch_limit = ext4_get_groups_count(sb); |
---|
2521 | 2762 | |
---|
2522 | 2763 | return 0; |
---|
2523 | 2764 | |
---|
.. | .. |
---|
2642 | 2883 | sbi->s_mb_stats = MB_DEFAULT_STATS; |
---|
2643 | 2884 | sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; |
---|
2644 | 2885 | sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; |
---|
| 2886 | + sbi->s_mb_max_inode_prealloc = MB_DEFAULT_MAX_INODE_PREALLOC; |
---|
2645 | 2887 | /* |
---|
2646 | 2888 | * The default group preallocation is 512, which for 4k block |
---|
2647 | 2889 | * sizes translates to 2 megabytes. However for bigalloc file |
---|
.. | .. |
---|
2702 | 2944 | } |
---|
2703 | 2945 | |
---|
2704 | 2946 | /* needs to be called with the ext4 group lock held */ |
---|
2705 | | -static void ext4_mb_cleanup_pa(struct ext4_group_info *grp) |
---|
| 2947 | +static int ext4_mb_cleanup_pa(struct ext4_group_info *grp) |
---|
2706 | 2948 | { |
---|
2707 | 2949 | struct ext4_prealloc_space *pa; |
---|
2708 | 2950 | struct list_head *cur, *tmp; |
---|
.. | .. |
---|
2714 | 2956 | count++; |
---|
2715 | 2957 | kmem_cache_free(ext4_pspace_cachep, pa); |
---|
2716 | 2958 | } |
---|
2717 | | - if (count) |
---|
2718 | | - mb_debug(1, "mballoc: %u PAs left\n", count); |
---|
2719 | | - |
---|
| 2959 | + return count; |
---|
2720 | 2960 | } |
---|
2721 | 2961 | |
---|
2722 | 2962 | int ext4_mb_release(struct super_block *sb) |
---|
.. | .. |
---|
2727 | 2967 | struct ext4_group_info *grinfo, ***group_info; |
---|
2728 | 2968 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
---|
2729 | 2969 | struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); |
---|
| 2970 | + int count; |
---|
2730 | 2971 | |
---|
2731 | 2972 | if (sbi->s_group_info) { |
---|
2732 | 2973 | for (i = 0; i < ngroups; i++) { |
---|
| 2974 | + cond_resched(); |
---|
2733 | 2975 | grinfo = ext4_get_group_info(sb, i); |
---|
2734 | | -#ifdef DOUBLE_CHECK |
---|
2735 | | - kfree(grinfo->bb_bitmap); |
---|
2736 | | -#endif |
---|
| 2976 | + mb_group_bb_bitmap_free(grinfo); |
---|
2737 | 2977 | ext4_lock_group(sb, i); |
---|
2738 | | - ext4_mb_cleanup_pa(grinfo); |
---|
| 2978 | + count = ext4_mb_cleanup_pa(grinfo); |
---|
| 2979 | + if (count) |
---|
| 2980 | + mb_debug(sb, "mballoc: %d PAs left\n", |
---|
| 2981 | + count); |
---|
2739 | 2982 | ext4_unlock_group(sb, i); |
---|
2740 | 2983 | kmem_cache_free(cachep, grinfo); |
---|
2741 | 2984 | } |
---|
.. | .. |
---|
2808 | 3051 | struct ext4_group_info *db; |
---|
2809 | 3052 | int err, count = 0, count2 = 0; |
---|
2810 | 3053 | |
---|
2811 | | - mb_debug(1, "gonna free %u blocks in group %u (0x%p):", |
---|
| 3054 | + mb_debug(sb, "gonna free %u blocks in group %u (0x%p):", |
---|
2812 | 3055 | entry->efd_count, entry->efd_group, entry); |
---|
2813 | 3056 | |
---|
2814 | 3057 | err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b); |
---|
.. | .. |
---|
2848 | 3091 | kmem_cache_free(ext4_free_data_cachep, entry); |
---|
2849 | 3092 | ext4_mb_unload_buddy(&e4b); |
---|
2850 | 3093 | |
---|
2851 | | - mb_debug(1, "freed %u blocks in %u structures\n", count, count2); |
---|
| 3094 | + mb_debug(sb, "freed %d blocks in %d structures\n", count, |
---|
| 3095 | + count2); |
---|
2852 | 3096 | } |
---|
2853 | 3097 | |
---|
2854 | 3098 | /* |
---|
.. | .. |
---|
2908 | 3152 | ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space, |
---|
2909 | 3153 | SLAB_RECLAIM_ACCOUNT); |
---|
2910 | 3154 | if (ext4_pspace_cachep == NULL) |
---|
2911 | | - return -ENOMEM; |
---|
| 3155 | + goto out; |
---|
2912 | 3156 | |
---|
2913 | 3157 | ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context, |
---|
2914 | 3158 | SLAB_RECLAIM_ACCOUNT); |
---|
2915 | | - if (ext4_ac_cachep == NULL) { |
---|
2916 | | - kmem_cache_destroy(ext4_pspace_cachep); |
---|
2917 | | - return -ENOMEM; |
---|
2918 | | - } |
---|
| 3159 | + if (ext4_ac_cachep == NULL) |
---|
| 3160 | + goto out_pa_free; |
---|
2919 | 3161 | |
---|
2920 | 3162 | ext4_free_data_cachep = KMEM_CACHE(ext4_free_data, |
---|
2921 | 3163 | SLAB_RECLAIM_ACCOUNT); |
---|
2922 | | - if (ext4_free_data_cachep == NULL) { |
---|
2923 | | - kmem_cache_destroy(ext4_pspace_cachep); |
---|
2924 | | - kmem_cache_destroy(ext4_ac_cachep); |
---|
2925 | | - return -ENOMEM; |
---|
2926 | | - } |
---|
| 3164 | + if (ext4_free_data_cachep == NULL) |
---|
| 3165 | + goto out_ac_free; |
---|
| 3166 | + |
---|
2927 | 3167 | return 0; |
---|
| 3168 | + |
---|
| 3169 | +out_ac_free: |
---|
| 3170 | + kmem_cache_destroy(ext4_ac_cachep); |
---|
| 3171 | +out_pa_free: |
---|
| 3172 | + kmem_cache_destroy(ext4_pspace_cachep); |
---|
| 3173 | +out: |
---|
| 3174 | + return -ENOMEM; |
---|
2928 | 3175 | } |
---|
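
The rewritten ext4_init_mballoc() replaces the duplicated kmem_cache_destroy() calls with the kernel's standard goto-unwind ladder: each later failure releases earlier allocations in reverse order, and every failure path funnels into a single -ENOMEM return. A generic, self-contained sketch of the idiom:

```c
#include <errno.h>
#include <stdlib.h>

static void *a, *b, *c;

static int setup_three(void)
{
	a = malloc(16);
	if (!a)
		goto out;
	b = malloc(16);
	if (!b)
		goto out_a;		/* free everything before b */
	c = malloc(16);
	if (!c)
		goto out_b;		/* free everything before c */
	return 0;

out_b:
	free(b);
out_a:
	free(a);
out:
	return -ENOMEM;
}
```
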
2929 | 3176 | |
---|
2930 | 3177 | void ext4_exit_mballoc(void) |
---|
.. | .. |
---|
3061 | 3308 | } |
---|
3062 | 3309 | |
---|
3063 | 3310 | /* |
---|
| 3311 | + * Idempotent helper for the ext4 fast commit replay path to set the state of |
---|
| 3312 | + * blocks in bitmaps and update counters. |
---|
| 3313 | + */ |
---|
| 3314 | +void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, |
---|
| 3315 | + int len, int state) |
---|
| 3316 | +{ |
---|
| 3317 | + struct buffer_head *bitmap_bh = NULL; |
---|
| 3318 | + struct ext4_group_desc *gdp; |
---|
| 3319 | + struct buffer_head *gdp_bh; |
---|
| 3320 | + struct ext4_sb_info *sbi = EXT4_SB(sb); |
---|
| 3321 | + ext4_group_t group; |
---|
| 3322 | + ext4_grpblk_t blkoff; |
---|
| 3323 | + int i, err; |
---|
| 3324 | + int already; |
---|
| 3325 | + unsigned int clen, clen_changed, thisgrp_len; |
---|
| 3326 | + |
---|
| 3327 | + while (len > 0) { |
---|
| 3328 | + ext4_get_group_no_and_offset(sb, block, &group, &blkoff); |
---|
| 3329 | + |
---|
| 3330 | + /* |
---|
| 3331 | + * Check to see if we are freeing blocks across a group |
---|
| 3332 | + * boundary. |
---|
| 3333 | + * With flex_bg, (block, len) may span across more than one |
---|
| 3334 | + * group. In that case we need to get the corresponding group |
---|
| 3335 | + * metadata to work with, which is why the surrounding while |
---|
| 3336 | + * loop processes one group per iteration. |
---|
| 3337 | + */ |
---|
| 3338 | + thisgrp_len = min_t(unsigned int, (unsigned int)len, |
---|
| 3339 | + EXT4_BLOCKS_PER_GROUP(sb) - EXT4_C2B(sbi, blkoff)); |
---|
| 3340 | + clen = EXT4_NUM_B2C(sbi, thisgrp_len); |
---|
| 3341 | + |
---|
| 3342 | + bitmap_bh = ext4_read_block_bitmap(sb, group); |
---|
| 3343 | + if (IS_ERR(bitmap_bh)) { |
---|
| 3344 | + err = PTR_ERR(bitmap_bh); |
---|
| 3345 | + bitmap_bh = NULL; |
---|
| 3346 | + break; |
---|
| 3347 | + } |
---|
| 3348 | + |
---|
| 3349 | + err = -EIO; |
---|
| 3350 | + gdp = ext4_get_group_desc(sb, group, &gdp_bh); |
---|
| 3351 | + if (!gdp) |
---|
| 3352 | + break; |
---|
| 3353 | + |
---|
| 3354 | + ext4_lock_group(sb, group); |
---|
| 3355 | + already = 0; |
---|
| 3356 | + for (i = 0; i < clen; i++) |
---|
| 3357 | + if (!mb_test_bit(blkoff + i, bitmap_bh->b_data) == |
---|
| 3358 | + !state) |
---|
| 3359 | + already++; |
---|
| 3360 | + |
---|
| 3361 | + clen_changed = clen - already; |
---|
| 3362 | + if (state) |
---|
| 3363 | + ext4_set_bits(bitmap_bh->b_data, blkoff, clen); |
---|
| 3364 | + else |
---|
| 3365 | + mb_test_and_clear_bits(bitmap_bh->b_data, blkoff, clen); |
---|
| 3366 | + if (ext4_has_group_desc_csum(sb) && |
---|
| 3367 | + (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { |
---|
| 3368 | + gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); |
---|
| 3369 | + ext4_free_group_clusters_set(sb, gdp, |
---|
| 3370 | + ext4_free_clusters_after_init(sb, group, gdp)); |
---|
| 3371 | + } |
---|
| 3372 | + if (state) |
---|
| 3373 | + clen = ext4_free_group_clusters(sb, gdp) - clen_changed; |
---|
| 3374 | + else |
---|
| 3375 | + clen = ext4_free_group_clusters(sb, gdp) + clen_changed; |
---|
| 3376 | + |
---|
| 3377 | + ext4_free_group_clusters_set(sb, gdp, clen); |
---|
| 3378 | + ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh); |
---|
| 3379 | + ext4_group_desc_csum_set(sb, group, gdp); |
---|
| 3380 | + |
---|
| 3381 | + ext4_unlock_group(sb, group); |
---|
| 3382 | + |
---|
| 3383 | + if (sbi->s_log_groups_per_flex) { |
---|
| 3384 | + ext4_group_t flex_group = ext4_flex_group(sbi, group); |
---|
| 3385 | + struct flex_groups *fg = sbi_array_rcu_deref(sbi, |
---|
| 3386 | + s_flex_groups, flex_group); |
---|
| 3387 | + |
---|
| 3388 | + if (state) |
---|
| 3389 | + atomic64_sub(clen_changed, &fg->free_clusters); |
---|
| 3390 | + else |
---|
| 3391 | + atomic64_add(clen_changed, &fg->free_clusters); |
---|
| 3392 | + |
---|
| 3393 | + } |
---|
| 3394 | + |
---|
| 3395 | + err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh); |
---|
| 3396 | + if (err) |
---|
| 3397 | + break; |
---|
| 3398 | + sync_dirty_buffer(bitmap_bh); |
---|
| 3399 | + err = ext4_handle_dirty_metadata(NULL, NULL, gdp_bh); |
---|
| 3400 | + sync_dirty_buffer(gdp_bh); |
---|
| 3401 | + if (err) |
---|
| 3402 | + break; |
---|
| 3403 | + |
---|
| 3404 | + block += thisgrp_len; |
---|
| 3405 | + len -= thisgrp_len; |
---|
| 3406 | + brelse(bitmap_bh); |
---|
| 3407 | + BUG_ON(len < 0); |
---|
| 3408 | + } |
---|
| 3409 | + |
---|
| 3410 | + if (err) |
---|
| 3411 | + brelse(bitmap_bh); |
---|
| 3412 | +} |
---|
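
The key to ext4_mb_mark_bb()'s idempotence is the `already` count: bits that are already in the target state contribute nothing to the free-cluster delta, so replaying the same fast-commit record twice leaves the accounting unchanged. A simplified stand-alone sketch (hypothetical helper over a byte-addressed bitmap):

```c
#include <stdbool.h>

static int mark_range(unsigned char *bm, int start, int len, bool set,
		      int *free_clusters)
{
	int already = 0, delta;

	for (int i = 0; i < len; i++) {
		int byte = (start + i) / 8, bit = (start + i) % 8;
		bool cur = bm[byte] & (1u << bit);

		if (cur == set)
			already++;		/* replay: already correct */
		else if (set)
			bm[byte] |= 1u << bit;
		else
			bm[byte] &= ~(1u << bit);
	}
	delta = len - already;			/* only newly flipped bits */
	*free_clusters += set ? -delta : delta;
	return delta;
}
```
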
| 3413 | + |
---|
| 3414 | +/* |
---|
3064 | 3415 | * here we normalize request for locality group |
---|
3065 | 3416 | * Group request are normalized to s_mb_group_prealloc, which goes to |
---|
3066 | 3417 | * s_strip if we set the same via mount option. |
---|
.. | .. |
---|
3076 | 3427 | |
---|
3077 | 3428 | BUG_ON(lg == NULL); |
---|
3078 | 3429 | ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; |
---|
3079 | | - mb_debug(1, "#%u: goal %u blocks for locality group\n", |
---|
3080 | | - current->pid, ac->ac_g_ex.fe_len); |
---|
| 3430 | + mb_debug(sb, "goal %u blocks for locality group\n", ac->ac_g_ex.fe_len); |
---|
3081 | 3431 | } |
---|
3082 | 3432 | |
---|
3083 | 3433 | /* |
---|
.. | .. |
---|
3169 | 3519 | } |
---|
3170 | 3520 | size = size >> bsbits; |
---|
3171 | 3521 | start = start_off >> bsbits; |
---|
| 3522 | + |
---|
| 3523 | + /* |
---|
| 3524 | + * For tiny groups (smaller than 8MB) the chosen allocation |
---|
| 3525 | + * alignment may be larger than group size. Make sure the |
---|
| 3526 | + * alignment does not move allocation to a different group which |
---|
| 3527 | + * makes mballoc fail assertions later. |
---|
| 3528 | + */ |
---|
| 3529 | + start = max(start, rounddown(ac->ac_o_ex.fe_logical, |
---|
| 3530 | + (ext4_lblk_t)EXT4_BLOCKS_PER_GROUP(ac->ac_sb))); |
---|
3172 | 3531 | |
---|
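
A concrete (hypothetical) instance of the clamp above: with 8192-block groups, a request at logical block 8200 whose over-aligned window would start at 0 must be pulled up to 8192, or the preallocation would land in a different group than the block it serves:

```c
unsigned int bpg = 8192;	/* EXT4_BLOCKS_PER_GROUP() */
unsigned int logical = 8200;	/* ac->ac_o_ex.fe_logical */
unsigned int start = 0;		/* over-aligned normalized start */
unsigned int floor = logical - (logical % bpg);	/* rounddown() == 8192 */

if (start < floor)
	start = floor;		/* goal stays in logical's own group */
```
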
3173 | 3532 | /* don't cover already allocated blocks in selected range */ |
---|
3174 | 3533 | if (ar->pleft && start <= ar->lleft) { |
---|
.. | .. |
---|
3275 | 3634 | ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; |
---|
3276 | 3635 | } |
---|
3277 | 3636 | |
---|
3278 | | - mb_debug(1, "goal: %u(was %u) blocks at %u\n", (unsigned) size, |
---|
3279 | | - (unsigned) orig_size, (unsigned) start); |
---|
| 3637 | + mb_debug(ac->ac_sb, "goal: %lld(was %lld) blocks at %u\n", size, |
---|
| 3638 | + orig_size, start); |
---|
3280 | 3639 | } |
---|
3281 | 3640 | |
---|
3282 | 3641 | static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) |
---|
.. | .. |
---|
3365 | 3724 | BUG_ON(pa->pa_free < len); |
---|
3366 | 3725 | pa->pa_free -= len; |
---|
3367 | 3726 | |
---|
3368 | | - mb_debug(1, "use %llu/%u from inode pa %p\n", start, len, pa); |
---|
| 3727 | + mb_debug(ac->ac_sb, "use %llu/%d from inode pa %p\n", start, len, pa); |
---|
3369 | 3728 | } |
---|
3370 | 3729 | |
---|
3371 | 3730 | /* |
---|
.. | .. |
---|
3389 | 3748 | * in on-disk bitmap -- see ext4_mb_release_context() |
---|
3390 | 3749 | * Other CPUs are prevented from allocating from this pa by lg_mutex |
---|
3391 | 3750 | */ |
---|
3392 | | - mb_debug(1, "use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa); |
---|
| 3751 | + mb_debug(ac->ac_sb, "use %u/%u from group pa %p\n", |
---|
| 3752 | + pa->pa_lstart-len, len, pa); |
---|
3393 | 3753 | } |
---|
3394 | 3754 | |
---|
3395 | 3755 | /* |
---|
.. | .. |
---|
3424 | 3784 | /* |
---|
3425 | 3785 | * search goal blocks in preallocated space |
---|
3426 | 3786 | */ |
---|
3427 | | -static noinline_for_stack int |
---|
| 3787 | +static noinline_for_stack bool |
---|
3428 | 3788 | ext4_mb_use_preallocated(struct ext4_allocation_context *ac) |
---|
3429 | 3789 | { |
---|
3430 | 3790 | struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); |
---|
.. | .. |
---|
3436 | 3796 | |
---|
3437 | 3797 | /* only data can be preallocated */ |
---|
3438 | 3798 | if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) |
---|
3439 | | - return 0; |
---|
| 3799 | + return false; |
---|
3440 | 3800 | |
---|
3441 | 3801 | /* first, try per-file preallocation */ |
---|
3442 | 3802 | rcu_read_lock(); |
---|
.. | .. |
---|
3463 | 3823 | spin_unlock(&pa->pa_lock); |
---|
3464 | 3824 | ac->ac_criteria = 10; |
---|
3465 | 3825 | rcu_read_unlock(); |
---|
3466 | | - return 1; |
---|
| 3826 | + return true; |
---|
3467 | 3827 | } |
---|
3468 | 3828 | spin_unlock(&pa->pa_lock); |
---|
3469 | 3829 | } |
---|
.. | .. |
---|
3471 | 3831 | |
---|
3472 | 3832 | /* can we use group allocation? */ |
---|
3473 | 3833 | if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)) |
---|
3474 | | - return 0; |
---|
| 3834 | + return false; |
---|
3475 | 3835 | |
---|
3476 | 3836 | /* inode may have no locality group for some reason */ |
---|
3477 | 3837 | lg = ac->ac_lg; |
---|
3478 | 3838 | if (lg == NULL) |
---|
3479 | | - return 0; |
---|
| 3839 | + return false; |
---|
3480 | 3840 | order = fls(ac->ac_o_ex.fe_len) - 1; |
---|
3481 | 3841 | if (order > PREALLOC_TB_SIZE - 1) |
---|
3482 | 3842 | /* The max size of hash table is PREALLOC_TB_SIZE */ |
---|
.. | .. |
---|
3505 | 3865 | if (cpa) { |
---|
3506 | 3866 | ext4_mb_use_group_pa(ac, cpa); |
---|
3507 | 3867 | ac->ac_criteria = 20; |
---|
3508 | | - return 1; |
---|
| 3868 | + return true; |
---|
3509 | 3869 | } |
---|
3510 | | - return 0; |
---|
| 3870 | + return false; |
---|
3511 | 3871 | } |
---|
3512 | 3872 | |
---|
3513 | 3873 | /* |
---|
.. | .. |
---|
3572 | 3932 | ext4_set_bits(bitmap, start, len); |
---|
3573 | 3933 | preallocated += len; |
---|
3574 | 3934 | } |
---|
3575 | | - mb_debug(1, "preallocated %u for group %u\n", preallocated, group); |
---|
| 3935 | + mb_debug(sb, "preallocated %d for group %u\n", preallocated, group); |
---|
| 3936 | +} |
---|
| 3937 | + |
---|
| 3938 | +static void ext4_mb_mark_pa_deleted(struct super_block *sb, |
---|
| 3939 | + struct ext4_prealloc_space *pa) |
---|
| 3940 | +{ |
---|
| 3941 | + struct ext4_inode_info *ei; |
---|
| 3942 | + |
---|
| 3943 | + if (pa->pa_deleted) { |
---|
| 3944 | + ext4_warning(sb, "deleted pa, type:%d, pblk:%llu, lblk:%u, len:%d\n", |
---|
| 3945 | + pa->pa_type, pa->pa_pstart, pa->pa_lstart, |
---|
| 3946 | + pa->pa_len); |
---|
| 3947 | + return; |
---|
| 3948 | + } |
---|
| 3949 | + |
---|
| 3950 | + pa->pa_deleted = 1; |
---|
| 3951 | + |
---|
| 3952 | + if (pa->pa_type == MB_INODE_PA) { |
---|
| 3953 | + ei = EXT4_I(pa->pa_inode); |
---|
| 3954 | + atomic_dec(&ei->i_prealloc_active); |
---|
| 3955 | + } |
---|
3576 | 3956 | } |
---|
3577 | 3957 | |
---|
3578 | 3958 | static void ext4_mb_pa_callback(struct rcu_head *head) |
---|
.. | .. |
---|
3607 | 3987 | return; |
---|
3608 | 3988 | } |
---|
3609 | 3989 | |
---|
3610 | | - pa->pa_deleted = 1; |
---|
| 3990 | + ext4_mb_mark_pa_deleted(sb, pa); |
---|
3611 | 3991 | spin_unlock(&pa->pa_lock); |
---|
3612 | 3992 | |
---|
3613 | 3993 | grp_blk = pa->pa_pstart; |
---|
.. | .. |
---|
3648 | 4028 | /* |
---|
3649 | 4029 | * creates new preallocated space for given inode |
---|
3650 | 4030 | */ |
---|
3651 | | -static noinline_for_stack int |
---|
| 4031 | +static noinline_for_stack void |
---|
3652 | 4032 | ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) |
---|
3653 | 4033 | { |
---|
3654 | 4034 | struct super_block *sb = ac->ac_sb; |
---|
.. | .. |
---|
3661 | 4041 | BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); |
---|
3662 | 4042 | BUG_ON(ac->ac_status != AC_STATUS_FOUND); |
---|
3663 | 4043 | BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); |
---|
| 4044 | + BUG_ON(ac->ac_pa == NULL); |
---|
3664 | 4045 | |
---|
3665 | | - pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS); |
---|
3666 | | - if (pa == NULL) |
---|
3667 | | - return -ENOMEM; |
---|
| 4046 | + pa = ac->ac_pa; |
---|
3668 | 4047 | |
---|
3669 | 4048 | if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) { |
---|
3670 | 4049 | int winl; |
---|
.. | .. |
---|
3708 | 4087 | pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); |
---|
3709 | 4088 | pa->pa_len = ac->ac_b_ex.fe_len; |
---|
3710 | 4089 | pa->pa_free = pa->pa_len; |
---|
3711 | | - atomic_set(&pa->pa_count, 1); |
---|
3712 | 4090 | spin_lock_init(&pa->pa_lock); |
---|
3713 | 4091 | INIT_LIST_HEAD(&pa->pa_inode_list); |
---|
3714 | 4092 | INIT_LIST_HEAD(&pa->pa_group_list); |
---|
3715 | 4093 | pa->pa_deleted = 0; |
---|
3716 | 4094 | pa->pa_type = MB_INODE_PA; |
---|
3717 | 4095 | |
---|
3718 | | - mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa, |
---|
3719 | | - pa->pa_pstart, pa->pa_len, pa->pa_lstart); |
---|
| 4096 | + mb_debug(sb, "new inode pa %p: %llu/%d for %u\n", pa, pa->pa_pstart, |
---|
| 4097 | + pa->pa_len, pa->pa_lstart); |
---|
3720 | 4098 | trace_ext4_mb_new_inode_pa(ac, pa); |
---|
3721 | 4099 | |
---|
3722 | 4100 | ext4_mb_use_inode_pa(ac, pa); |
---|
.. | .. |
---|
3728 | 4106 | pa->pa_obj_lock = &ei->i_prealloc_lock; |
---|
3729 | 4107 | pa->pa_inode = ac->ac_inode; |
---|
3730 | 4108 | |
---|
3731 | | - ext4_lock_group(sb, ac->ac_b_ex.fe_group); |
---|
3732 | 4109 | list_add(&pa->pa_group_list, &grp->bb_prealloc_list); |
---|
3733 | | - ext4_unlock_group(sb, ac->ac_b_ex.fe_group); |
---|
3734 | 4110 | |
---|
3735 | 4111 | spin_lock(pa->pa_obj_lock); |
---|
3736 | 4112 | list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list); |
---|
3737 | 4113 | spin_unlock(pa->pa_obj_lock); |
---|
3738 | | - |
---|
3739 | | - return 0; |
---|
| 4114 | + atomic_inc(&ei->i_prealloc_active); |
---|
3740 | 4115 | } |
---|
3741 | 4116 | |
---|
3742 | 4117 | /* |
---|
3743 | 4118 | * creates new preallocated space for locality group inodes belongs to |
---|
3744 | 4119 | */ |
---|
3745 | | -static noinline_for_stack int |
---|
| 4120 | +static noinline_for_stack void |
---|
3746 | 4121 | ext4_mb_new_group_pa(struct ext4_allocation_context *ac) |
---|
3747 | 4122 | { |
---|
3748 | 4123 | struct super_block *sb = ac->ac_sb; |
---|
.. | .. |
---|
3754 | 4129 | BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); |
---|
3755 | 4130 | BUG_ON(ac->ac_status != AC_STATUS_FOUND); |
---|
3756 | 4131 | BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); |
---|
| 4132 | + BUG_ON(ac->ac_pa == NULL); |
---|
3757 | 4133 | |
---|
3758 | | - BUG_ON(ext4_pspace_cachep == NULL); |
---|
3759 | | - pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS); |
---|
3760 | | - if (pa == NULL) |
---|
3761 | | - return -ENOMEM; |
---|
| 4134 | + pa = ac->ac_pa; |
---|
3762 | 4135 | |
---|
3763 | 4136 | /* preallocation can change ac_b_ex, thus we store actually |
---|
3764 | 4137 | * allocated blocks for history */ |
---|
.. | .. |
---|
3768 | 4141 | pa->pa_lstart = pa->pa_pstart; |
---|
3769 | 4142 | pa->pa_len = ac->ac_b_ex.fe_len; |
---|
3770 | 4143 | pa->pa_free = pa->pa_len; |
---|
3771 | | - atomic_set(&pa->pa_count, 1); |
---|
3772 | 4144 | spin_lock_init(&pa->pa_lock); |
---|
3773 | 4145 | INIT_LIST_HEAD(&pa->pa_inode_list); |
---|
3774 | 4146 | INIT_LIST_HEAD(&pa->pa_group_list); |
---|
3775 | 4147 | pa->pa_deleted = 0; |
---|
3776 | 4148 | pa->pa_type = MB_GROUP_PA; |
---|
3777 | 4149 | |
---|
3778 | | - mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa, |
---|
3779 | | - pa->pa_pstart, pa->pa_len, pa->pa_lstart); |
---|
| 4150 | + mb_debug(sb, "new group pa %p: %llu/%d for %u\n", pa, pa->pa_pstart, |
---|
| 4151 | + pa->pa_len, pa->pa_lstart); |
---|
3780 | 4152 | trace_ext4_mb_new_group_pa(ac, pa); |
---|
3781 | 4153 | |
---|
3782 | 4154 | ext4_mb_use_group_pa(ac, pa); |
---|
.. | .. |
---|
3789 | 4161 | pa->pa_obj_lock = &lg->lg_prealloc_lock; |
---|
3790 | 4162 | pa->pa_inode = NULL; |
---|
3791 | 4163 | |
---|
3792 | | - ext4_lock_group(sb, ac->ac_b_ex.fe_group); |
---|
3793 | 4164 | list_add(&pa->pa_group_list, &grp->bb_prealloc_list); |
---|
3794 | | - ext4_unlock_group(sb, ac->ac_b_ex.fe_group); |
---|
3795 | 4165 | |
---|
3796 | 4166 | /* |
---|
3797 | 4167 | * We will later add the new pa to the right bucket |
---|
3798 | 4168 | * after updating the pa_free in ext4_mb_release_context |
---|
3799 | 4169 | */ |
---|
3800 | | - return 0; |
---|
3801 | 4170 | } |
---|
3802 | 4171 | |
---|
3803 | | -static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac) |
---|
| 4172 | +static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac) |
---|
3804 | 4173 | { |
---|
3805 | | - int err; |
---|
3806 | | - |
---|
3807 | 4174 | if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) |
---|
3808 | | - err = ext4_mb_new_group_pa(ac); |
---|
| 4175 | + ext4_mb_new_group_pa(ac); |
---|
3809 | 4176 | else |
---|
3810 | | - err = ext4_mb_new_inode_pa(ac); |
---|
3811 | | - return err; |
---|
| 4177 | + ext4_mb_new_inode_pa(ac); |
---|
3812 | 4178 | } |
---|
3813 | 4179 | |
---|
3814 | 4180 | /* |
---|
.. | .. |
---|
3843 | 4209 | if (bit >= end) |
---|
3844 | 4210 | break; |
---|
3845 | 4211 | next = mb_find_next_bit(bitmap_bh->b_data, end, bit); |
---|
3846 | | - mb_debug(1, " free preallocated %u/%u in group %u\n", |
---|
| 4212 | + mb_debug(sb, "free preallocated %u/%u in group %u\n", |
---|
3847 | 4213 | (unsigned) ext4_group_first_block_no(sb, group) + bit, |
---|
3848 | 4214 | (unsigned) next - bit, (unsigned) group); |
---|
3849 | 4215 | free += next - bit; |
---|
.. | .. |
---|
3857 | 4223 | } |
---|
3858 | 4224 | if (free != pa->pa_free) { |
---|
3859 | 4225 | ext4_msg(e4b->bd_sb, KERN_CRIT, |
---|
3860 | | - "pa %p: logic %lu, phys. %lu, len %lu", |
---|
| 4226 | + "pa %p: logic %lu, phys. %lu, len %d", |
---|
3861 | 4227 | pa, (unsigned long) pa->pa_lstart, |
---|
3862 | 4228 | (unsigned long) pa->pa_pstart, |
---|
3863 | | - (unsigned long) pa->pa_len); |
---|
| 4229 | + pa->pa_len); |
---|
3864 | 4230 | ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u", |
---|
3865 | 4231 | free, pa->pa_free); |
---|
3866 | 4232 | /* |
---|
.. | .. |
---|
3903 | 4269 | */ |
---|
3904 | 4270 | static noinline_for_stack int |
---|
3905 | 4271 | ext4_mb_discard_group_preallocations(struct super_block *sb, |
---|
3906 | | - ext4_group_t group, int needed) |
---|
| 4272 | + ext4_group_t group, int *busy) |
---|
3907 | 4273 | { |
---|
3908 | 4274 | struct ext4_group_info *grp = ext4_get_group_info(sb, group); |
---|
3909 | 4275 | struct buffer_head *bitmap_bh = NULL; |
---|
.. | .. |
---|
3911 | 4277 | struct list_head list; |
---|
3912 | 4278 | struct ext4_buddy e4b; |
---|
3913 | 4279 | int err; |
---|
3914 | | - int busy = 0; |
---|
3915 | 4280 | int free = 0; |
---|
3916 | 4281 | |
---|
3917 | | - mb_debug(1, "discard preallocation for group %u\n", group); |
---|
3918 | | - |
---|
| 4282 | + mb_debug(sb, "discard preallocation for group %u\n", group); |
---|
3919 | 4283 | if (list_empty(&grp->bb_prealloc_list)) |
---|
3920 | | - return 0; |
---|
| 4284 | + goto out_dbg; |
---|
3921 | 4285 | |
---|
3922 | 4286 | bitmap_bh = ext4_read_block_bitmap(sb, group); |
---|
3923 | 4287 | if (IS_ERR(bitmap_bh)) { |
---|
3924 | 4288 | err = PTR_ERR(bitmap_bh); |
---|
3925 | | - ext4_error(sb, "Error %d reading block bitmap for %u", |
---|
3926 | | - err, group); |
---|
3927 | | - return 0; |
---|
| 4289 | + ext4_error_err(sb, -err, |
---|
| 4290 | + "Error %d reading block bitmap for %u", |
---|
| 4291 | + err, group); |
---|
| 4292 | + goto out_dbg; |
---|
3928 | 4293 | } |
---|
3929 | 4294 | |
---|
3930 | 4295 | err = ext4_mb_load_buddy(sb, group, &e4b); |
---|
.. | .. |
---|
3932 | 4297 | ext4_warning(sb, "Error %d loading buddy information for %u", |
---|
3933 | 4298 | err, group); |
---|
3934 | 4299 | put_bh(bitmap_bh); |
---|
3935 | | - return 0; |
---|
| 4300 | + goto out_dbg; |
---|
3936 | 4301 | } |
---|
3937 | 4302 | |
---|
3938 | | - if (needed == 0) |
---|
3939 | | - needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1; |
---|
3940 | | - |
---|
3941 | 4303 | INIT_LIST_HEAD(&list); |
---|
3942 | | -repeat: |
---|
3943 | 4304 | ext4_lock_group(sb, group); |
---|
3944 | 4305 | list_for_each_entry_safe(pa, tmp, |
---|
3945 | 4306 | &grp->bb_prealloc_list, pa_group_list) { |
---|
3946 | 4307 | spin_lock(&pa->pa_lock); |
---|
3947 | 4308 | if (atomic_read(&pa->pa_count)) { |
---|
3948 | 4309 | spin_unlock(&pa->pa_lock); |
---|
3949 | | - busy = 1; |
---|
| 4310 | + *busy = 1; |
---|
3950 | 4311 | continue; |
---|
3951 | 4312 | } |
---|
3952 | 4313 | if (pa->pa_deleted) { |
---|
.. | .. |
---|
3955 | 4316 | } |
---|
3956 | 4317 | |
---|
3957 | 4318 | /* seems this one can be freed ... */ |
---|
3958 | | - pa->pa_deleted = 1; |
---|
| 4319 | + ext4_mb_mark_pa_deleted(sb, pa); |
---|
| 4320 | + |
---|
| 4321 | + if (!free) |
---|
| 4322 | + this_cpu_inc(discard_pa_seq); |
---|
3959 | 4323 | |
---|
3960 | 4324 | /* we can trust pa_free ... */ |
---|
3961 | 4325 | free += pa->pa_free; |
---|
.. | .. |
---|
3964 | 4328 | |
---|
3965 | 4329 | list_del(&pa->pa_group_list); |
---|
3966 | 4330 | list_add(&pa->u.pa_tmp_list, &list); |
---|
3967 | | - } |
---|
3968 | | - |
---|
3969 | | - /* if we still need more blocks and some PAs were used, try again */ |
---|
3970 | | - if (free < needed && busy) { |
---|
3971 | | - busy = 0; |
---|
3972 | | - ext4_unlock_group(sb, group); |
---|
3973 | | - cond_resched(); |
---|
3974 | | - goto repeat; |
---|
3975 | | - } |
---|
3976 | | - |
---|
3977 | | - /* found anything to free? */ |
---|
3978 | | - if (list_empty(&list)) { |
---|
3979 | | - BUG_ON(free != 0); |
---|
3980 | | - goto out; |
---|
3981 | 4331 | } |
---|
3982 | 4332 | |
---|
3983 | 4333 | /* now free all selected PAs */ |
---|
.. | .. |
---|
3997 | 4347 | call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); |
---|
3998 | 4348 | } |
---|
3999 | 4349 | |
---|
4000 | | -out: |
---|
4001 | 4350 | ext4_unlock_group(sb, group); |
---|
4002 | 4351 | ext4_mb_unload_buddy(&e4b); |
---|
4003 | 4352 | put_bh(bitmap_bh); |
---|
| 4353 | +out_dbg: |
---|
| 4354 | + mb_debug(sb, "discarded (%d) blocks preallocated for group %u bb_free (%d)\n", |
---|
| 4355 | + free, group, grp->bb_free); |
---|
4004 | 4356 | return free; |
---|
4005 | 4357 | } |
---|
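The rework above replaces the function-internal repeat: loop with a *busy out-parameter, moving the retry policy up to ext4_mb_discard_preallocations() (shown later in this series of hunks). The underlying pattern — unlink every non-busy PA onto a private list under the group lock, then do the actual freeing unlocked — can be modeled in plain userspace C. Everything below is illustrative only: the struct pa layout is a stand-in, and the mutex stands in for ext4_lock_group().

#include <pthread.h>
#include <stdbool.h>
#include <stdlib.h>
#include <sys/queue.h>

struct pa {
	int refcount;                       /* models atomic_read(&pa->pa_count) */
	TAILQ_ENTRY(pa) link;               /* models pa_group_list */
};
TAILQ_HEAD(pa_list, pa);

static pthread_mutex_t group_lock = PTHREAD_MUTEX_INITIALIZER;

static int discard_group(struct pa_list *group, bool *busy)
{
	struct pa_list doomed = TAILQ_HEAD_INITIALIZER(doomed);
	struct pa *pa, *next;
	int freed = 0;

	pthread_mutex_lock(&group_lock);    /* ext4_lock_group() stand-in */
	for (pa = TAILQ_FIRST(group); pa != NULL; pa = next) {
		next = TAILQ_NEXT(pa, link);
		if (pa->refcount) {
			*busy = true;       /* in use: report it, let the caller retry */
			continue;
		}
		TAILQ_REMOVE(group, pa, link);
		TAILQ_INSERT_TAIL(&doomed, pa, link);
		freed++;
	}
	pthread_mutex_unlock(&group_lock);

	/* the expensive part (bitmap updates, RCU-deferred frees) runs unlocked */
	while ((pa = TAILQ_FIRST(&doomed)) != NULL) {
		TAILQ_REMOVE(&doomed, pa, link);
		free(pa);
	}
	return freed;
}

int main(void)
{
	struct pa_list group = TAILQ_HEAD_INITIALIZER(group);
	struct pa *a = calloc(1, sizeof(*a)), *b = calloc(1, sizeof(*b));
	bool busy = false;
	int freed;

	b->refcount = 1;                    /* b is busy and must survive the pass */
	TAILQ_INSERT_TAIL(&group, a, link);
	TAILQ_INSERT_TAIL(&group, b, link);
	freed = discard_group(&group, &busy);
	free(b);
	return (freed == 1 && busy) ? 0 : 1;
}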
4006 | 4358 | |
---|
.. | .. |
---|
4013 | 4365 | * |
---|
4014 | 4366 | * FIXME!! Make sure it is valid at all the call sites |
---|
4015 | 4367 | */ |
---|
4016 | | -void ext4_discard_preallocations(struct inode *inode) |
---|
| 4368 | +void ext4_discard_preallocations(struct inode *inode, unsigned int needed) |
---|
4017 | 4369 | { |
---|
4018 | 4370 | struct ext4_inode_info *ei = EXT4_I(inode); |
---|
4019 | 4371 | struct super_block *sb = inode->i_sb; |
---|
.. | .. |
---|
4029 | 4381 | return; |
---|
4030 | 4382 | } |
---|
4031 | 4383 | |
---|
4032 | | - mb_debug(1, "discard preallocation for inode %lu\n", inode->i_ino); |
---|
4033 | | - trace_ext4_discard_preallocations(inode); |
---|
| 4384 | + if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) |
---|
| 4385 | + return; |
---|
| 4386 | + |
---|
| 4387 | + mb_debug(sb, "discard preallocation for inode %lu\n", |
---|
| 4388 | + inode->i_ino); |
---|
| 4389 | + trace_ext4_discard_preallocations(inode, |
---|
| 4390 | + atomic_read(&ei->i_prealloc_active), needed); |
---|
4034 | 4391 | |
---|
4035 | 4392 | INIT_LIST_HEAD(&list); |
---|
| 4393 | + |
---|
| 4394 | + if (needed == 0) |
---|
| 4395 | + needed = UINT_MAX; |
---|
4036 | 4396 | |
---|
4037 | 4397 | repeat: |
---|
4038 | 4398 | /* first, collect all pa's in the inode */ |
---|
4039 | 4399 | spin_lock(&ei->i_prealloc_lock); |
---|
4040 | | - while (!list_empty(&ei->i_prealloc_list)) { |
---|
4041 | | - pa = list_entry(ei->i_prealloc_list.next, |
---|
| 4400 | + while (!list_empty(&ei->i_prealloc_list) && needed) { |
---|
| 4401 | + pa = list_entry(ei->i_prealloc_list.prev, |
---|
4042 | 4402 | struct ext4_prealloc_space, pa_inode_list); |
---|
4043 | 4403 | BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock); |
---|
4044 | 4404 | spin_lock(&pa->pa_lock); |
---|
.. | .. |
---|
4055 | 4415 | |
---|
4056 | 4416 | } |
---|
4057 | 4417 | if (pa->pa_deleted == 0) { |
---|
4058 | | - pa->pa_deleted = 1; |
---|
| 4418 | + ext4_mb_mark_pa_deleted(sb, pa); |
---|
4059 | 4419 | spin_unlock(&pa->pa_lock); |
---|
4060 | 4420 | list_del_rcu(&pa->pa_inode_list); |
---|
4061 | 4421 | list_add(&pa->u.pa_tmp_list, &list); |
---|
| 4422 | + needed--; |
---|
4062 | 4423 | continue; |
---|
4063 | 4424 | } |
---|
4064 | 4425 | |
---|
.. | .. |
---|
4090 | 4451 | err = ext4_mb_load_buddy_gfp(sb, group, &e4b, |
---|
4091 | 4452 | GFP_NOFS|__GFP_NOFAIL); |
---|
4092 | 4453 | if (err) { |
---|
4093 | | - ext4_error(sb, "Error %d loading buddy information for %u", |
---|
4094 | | - err, group); |
---|
| 4454 | + ext4_error_err(sb, -err, "Error %d loading buddy information for %u", |
---|
| 4455 | + err, group); |
---|
4095 | 4456 | continue; |
---|
4096 | 4457 | } |
---|
4097 | 4458 | |
---|
4098 | 4459 | bitmap_bh = ext4_read_block_bitmap(sb, group); |
---|
4099 | 4460 | if (IS_ERR(bitmap_bh)) { |
---|
4100 | 4461 | err = PTR_ERR(bitmap_bh); |
---|
4101 | | - ext4_error(sb, "Error %d reading block bitmap for %u", |
---|
4102 | | - err, group); |
---|
| 4462 | + ext4_error_err(sb, -err, "Error %d reading block bitmap for %u", |
---|
| 4463 | + err, group); |
---|
4103 | 4464 | ext4_mb_unload_buddy(&e4b); |
---|
4104 | 4465 | continue; |
---|
4105 | 4466 | } |
---|
.. | .. |
---|
4117 | 4478 | } |
---|
4118 | 4479 | } |
---|
4119 | 4480 | |
---|
| 4481 | +static int ext4_mb_pa_alloc(struct ext4_allocation_context *ac) |
---|
| 4482 | +{ |
---|
| 4483 | + struct ext4_prealloc_space *pa; |
---|
| 4484 | + |
---|
| 4485 | + BUG_ON(ext4_pspace_cachep == NULL); |
---|
| 4486 | + pa = kmem_cache_zalloc(ext4_pspace_cachep, GFP_NOFS); |
---|
| 4487 | + if (!pa) |
---|
| 4488 | + return -ENOMEM; |
---|
| 4489 | + atomic_set(&pa->pa_count, 1); |
---|
| 4490 | + ac->ac_pa = pa; |
---|
| 4491 | + return 0; |
---|
| 4492 | +} |
---|
| 4493 | + |
---|
| 4494 | +static void ext4_mb_pa_free(struct ext4_allocation_context *ac) |
---|
| 4495 | +{ |
---|
| 4496 | + struct ext4_prealloc_space *pa = ac->ac_pa; |
---|
| 4497 | + |
---|
| 4498 | + BUG_ON(!pa); |
---|
| 4499 | + ac->ac_pa = NULL; |
---|
| 4500 | + WARN_ON(!atomic_dec_and_test(&pa->pa_count)); |
---|
| 4501 | + kmem_cache_free(ext4_pspace_cachep, pa); |
---|
| 4502 | +} |
---|
| 4503 | + |
---|
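ext4_mb_pa_alloc() above hands back a descriptor with pa_count already set to 1, and ext4_mb_pa_free() insists, via the WARN_ON(), that this initial reference is still the only one when an unused descriptor is discarded. A minimal userspace model of that invariant; all names here are stand-ins, not the kernel API.

#include <assert.h>
#include <stdatomic.h>
#include <stdlib.h>

struct pa_model {
	atomic_int count;                  /* models pa->pa_count */
};

/* models ext4_mb_pa_alloc(): the creator starts out holding the only reference */
static struct pa_model *pa_model_alloc(void)
{
	struct pa_model *pa = calloc(1, sizeof(*pa));

	if (pa)
		atomic_init(&pa->count, 1);
	return pa;
}

/* models ext4_mb_pa_free(): the count must drop exactly from 1 to 0,
 * mirroring WARN_ON(!atomic_dec_and_test(&pa->pa_count)) */
static void pa_model_free(struct pa_model *pa)
{
	int old = atomic_fetch_sub(&pa->count, 1);

	assert(old == 1);
	free(pa);
}

int main(void)
{
	struct pa_model *pa = pa_model_alloc();

	if (pa)
		pa_model_free(pa);         /* never attached anywhere, so freeing is legal */
	return 0;
}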
4120 | 4504 | #ifdef CONFIG_EXT4_DEBUG |
---|
| 4505 | +static inline void ext4_mb_show_pa(struct super_block *sb) |
---|
| 4506 | +{ |
---|
| 4507 | + ext4_group_t i, ngroups; |
---|
| 4508 | + |
---|
| 4509 | + if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) |
---|
| 4510 | + return; |
---|
| 4511 | + |
---|
| 4512 | + ngroups = ext4_get_groups_count(sb); |
---|
| 4513 | + mb_debug(sb, "groups: "); |
---|
| 4514 | + for (i = 0; i < ngroups; i++) { |
---|
| 4515 | + struct ext4_group_info *grp = ext4_get_group_info(sb, i); |
---|
| 4516 | + struct ext4_prealloc_space *pa; |
---|
| 4517 | + ext4_grpblk_t start; |
---|
| 4518 | + struct list_head *cur; |
---|
| 4519 | + ext4_lock_group(sb, i); |
---|
| 4520 | + list_for_each(cur, &grp->bb_prealloc_list) { |
---|
| 4521 | + pa = list_entry(cur, struct ext4_prealloc_space, |
---|
| 4522 | + pa_group_list); |
---|
| 4523 | + spin_lock(&pa->pa_lock); |
---|
| 4524 | + ext4_get_group_no_and_offset(sb, pa->pa_pstart, |
---|
| 4525 | + NULL, &start); |
---|
| 4526 | + spin_unlock(&pa->pa_lock); |
---|
| 4527 | + mb_debug(sb, "PA:%u:%d:%d\n", i, start, |
---|
| 4528 | + pa->pa_len); |
---|
| 4529 | + } |
---|
| 4530 | + ext4_unlock_group(sb, i); |
---|
| 4531 | + mb_debug(sb, "%u: %d/%d\n", i, grp->bb_free, |
---|
| 4532 | + grp->bb_fragments); |
---|
| 4533 | + } |
---|
| 4534 | +} |
---|
| 4535 | + |
---|
4121 | 4536 | static void ext4_mb_show_ac(struct ext4_allocation_context *ac) |
---|
4122 | 4537 | { |
---|
4123 | 4538 | struct super_block *sb = ac->ac_sb; |
---|
4124 | | - ext4_group_t ngroups, i; |
---|
4125 | 4539 | |
---|
4126 | | - if (!ext4_mballoc_debug || |
---|
4127 | | - (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) |
---|
| 4540 | + if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) |
---|
4128 | 4541 | return; |
---|
4129 | 4542 | |
---|
4130 | | - ext4_msg(ac->ac_sb, KERN_ERR, "Can't allocate:" |
---|
| 4543 | + mb_debug(sb, "Can't allocate:" |
---|
4131 | 4544 | " Allocation context details:"); |
---|
4132 | | - ext4_msg(ac->ac_sb, KERN_ERR, "status %d flags %d", |
---|
| 4545 | + mb_debug(sb, "status %u flags 0x%x", |
---|
4133 | 4546 | ac->ac_status, ac->ac_flags); |
---|
4134 | | - ext4_msg(ac->ac_sb, KERN_ERR, "orig %lu/%lu/%lu@%lu, " |
---|
4135 | | - "goal %lu/%lu/%lu@%lu, " |
---|
| 4547 | + mb_debug(sb, "orig %lu/%lu/%lu@%lu, " |
---|
| 4548 | + "goal %lu/%lu/%lu@%lu, " |
---|
4136 | 4549 | "best %lu/%lu/%lu@%lu cr %d", |
---|
4137 | 4550 | (unsigned long)ac->ac_o_ex.fe_group, |
---|
4138 | 4551 | (unsigned long)ac->ac_o_ex.fe_start, |
---|
.. | .. |
---|
4147 | 4560 | (unsigned long)ac->ac_b_ex.fe_len, |
---|
4148 | 4561 | (unsigned long)ac->ac_b_ex.fe_logical, |
---|
4149 | 4562 | (int)ac->ac_criteria); |
---|
4150 | | - ext4_msg(ac->ac_sb, KERN_ERR, "%d found", ac->ac_found); |
---|
4151 | | - ext4_msg(ac->ac_sb, KERN_ERR, "groups: "); |
---|
4152 | | - ngroups = ext4_get_groups_count(sb); |
---|
4153 | | - for (i = 0; i < ngroups; i++) { |
---|
4154 | | - struct ext4_group_info *grp = ext4_get_group_info(sb, i); |
---|
4155 | | - struct ext4_prealloc_space *pa; |
---|
4156 | | - ext4_grpblk_t start; |
---|
4157 | | - struct list_head *cur; |
---|
4158 | | - ext4_lock_group(sb, i); |
---|
4159 | | - list_for_each(cur, &grp->bb_prealloc_list) { |
---|
4160 | | - pa = list_entry(cur, struct ext4_prealloc_space, |
---|
4161 | | - pa_group_list); |
---|
4162 | | - spin_lock(&pa->pa_lock); |
---|
4163 | | - ext4_get_group_no_and_offset(sb, pa->pa_pstart, |
---|
4164 | | - NULL, &start); |
---|
4165 | | - spin_unlock(&pa->pa_lock); |
---|
4166 | | - printk(KERN_ERR "PA:%u:%d:%u \n", i, |
---|
4167 | | - start, pa->pa_len); |
---|
4168 | | - } |
---|
4169 | | - ext4_unlock_group(sb, i); |
---|
4170 | | - |
---|
4171 | | - if (grp->bb_free == 0) |
---|
4172 | | - continue; |
---|
4173 | | - printk(KERN_ERR "%u: %d/%d \n", |
---|
4174 | | - i, grp->bb_free, grp->bb_fragments); |
---|
4175 | | - } |
---|
4176 | | - printk(KERN_ERR "\n"); |
---|
| 4563 | + mb_debug(sb, "%u found", ac->ac_found); |
---|
| 4564 | + ext4_mb_show_pa(sb); |
---|
4177 | 4565 | } |
---|
4178 | 4566 | #else |
---|
| 4567 | +static inline void ext4_mb_show_pa(struct super_block *sb) |
---|
| 4568 | +{ |
---|
| 4569 | + return; |
---|
| 4570 | +} |
---|
4179 | 4571 | static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac) |
---|
4180 | 4572 | { |
---|
| 4573 | + ext4_mb_show_pa(ac->ac_sb); |
---|
4181 | 4574 | return; |
---|
4182 | 4575 | } |
---|
4183 | 4576 | #endif |
---|
.. | .. |
---|
4205 | 4598 | isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) |
---|
4206 | 4599 | >> bsbits; |
---|
4207 | 4600 | |
---|
4208 | | - if ((size == isize) && |
---|
4209 | | - !ext4_fs_is_busy(sbi) && |
---|
4210 | | - (atomic_read(&ac->ac_inode->i_writecount) == 0)) { |
---|
| 4601 | + if ((size == isize) && !ext4_fs_is_busy(sbi) && |
---|
| 4602 | + !inode_is_open_for_write(ac->ac_inode)) { |
---|
4211 | 4603 | ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC; |
---|
4212 | 4604 | return; |
---|
4213 | 4605 | } |
---|
.. | .. |
---|
4277 | 4669 | ac->ac_g_ex = ac->ac_o_ex; |
---|
4278 | 4670 | ac->ac_flags = ar->flags; |
---|
4279 | 4671 | |
---|
4280 | | - /* we have to define context: we'll we work with a file or |
---|
| 4672 | + /* we have to define context: we'll work with a file or |
---|
4281 | 4673 | * locality group. this is a policy, actually */ |
---|
4282 | 4674 | ext4_mb_group_or_file(ac); |
---|
4283 | 4675 | |
---|
4284 | | - mb_debug(1, "init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, " |
---|
| 4676 | + mb_debug(sb, "init ac: %u blocks @ %u, goal %u, flags 0x%x, 2^%d, " |
---|
4285 | 4677 | "left: %u/%u, right %u/%u to %swritable\n", |
---|
4286 | 4678 | (unsigned) ar->len, (unsigned) ar->logical, |
---|
4287 | 4679 | (unsigned) ar->goal, ac->ac_flags, ac->ac_2order, |
---|
4288 | 4680 | (unsigned) ar->lleft, (unsigned) ar->pleft, |
---|
4289 | 4681 | (unsigned) ar->lright, (unsigned) ar->pright, |
---|
4290 | | - atomic_read(&ar->inode->i_writecount) ? "" : "non-"); |
---|
| 4682 | + inode_is_open_for_write(ar->inode) ? "" : "non-"); |
---|
4291 | 4683 | return 0; |
---|
4292 | 4684 | |
---|
4293 | 4685 | } |
---|
.. | .. |
---|
4302 | 4694 | struct list_head discard_list; |
---|
4303 | 4695 | struct ext4_prealloc_space *pa, *tmp; |
---|
4304 | 4696 | |
---|
4305 | | - mb_debug(1, "discard locality group preallocation\n"); |
---|
| 4697 | + mb_debug(sb, "discard locality group preallocation\n"); |
---|
4306 | 4698 | |
---|
4307 | 4699 | INIT_LIST_HEAD(&discard_list); |
---|
4308 | 4700 | |
---|
4309 | 4701 | spin_lock(&lg->lg_prealloc_lock); |
---|
4310 | 4702 | list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], |
---|
4311 | | - pa_inode_list) { |
---|
| 4703 | + pa_inode_list, |
---|
| 4704 | + lockdep_is_held(&lg->lg_prealloc_lock)) { |
---|
4312 | 4705 | spin_lock(&pa->pa_lock); |
---|
4313 | 4706 | if (atomic_read(&pa->pa_count)) { |
---|
4314 | 4707 | /* |
---|
.. | .. |
---|
4327 | 4720 | BUG_ON(pa->pa_type != MB_GROUP_PA); |
---|
4328 | 4721 | |
---|
4329 | 4722 | /* seems this one can be freed ... */ |
---|
4330 | | - pa->pa_deleted = 1; |
---|
| 4723 | + ext4_mb_mark_pa_deleted(sb, pa); |
---|
4331 | 4724 | spin_unlock(&pa->pa_lock); |
---|
4332 | 4725 | |
---|
4333 | 4726 | list_del_rcu(&pa->pa_inode_list); |
---|
.. | .. |
---|
4353 | 4746 | err = ext4_mb_load_buddy_gfp(sb, group, &e4b, |
---|
4354 | 4747 | GFP_NOFS|__GFP_NOFAIL); |
---|
4355 | 4748 | if (err) { |
---|
4356 | | - ext4_error(sb, "Error %d loading buddy information for %u", |
---|
4357 | | - err, group); |
---|
| 4749 | + ext4_error_err(sb, -err, "Error %d loading buddy information for %u", |
---|
| 4750 | + err, group); |
---|
4358 | 4751 | continue; |
---|
4359 | 4752 | } |
---|
4360 | 4753 | ext4_lock_group(sb, group); |
---|
.. | .. |
---|
4391 | 4784 | /* Add the prealloc space to lg */ |
---|
4392 | 4785 | spin_lock(&lg->lg_prealloc_lock); |
---|
4393 | 4786 | list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order], |
---|
4394 | | - pa_inode_list) { |
---|
| 4787 | + pa_inode_list, |
---|
| 4788 | + lockdep_is_held(&lg->lg_prealloc_lock)) { |
---|
4395 | 4789 | spin_lock(&tmp_pa->pa_lock); |
---|
4396 | 4790 | if (tmp_pa->pa_deleted) { |
---|
4397 | 4791 | spin_unlock(&tmp_pa->pa_lock); |
---|
.. | .. |
---|
4425 | 4819 | } |
---|
4426 | 4820 | |
---|
4427 | 4821 | /* |
---|
| 4822 | + * If the per-inode prealloc list is too long, trim some PAs (see the worked example after this function) |
---|
| 4823 | + */ |
---|
| 4824 | +static void ext4_mb_trim_inode_pa(struct inode *inode) |
---|
| 4825 | +{ |
---|
| 4826 | + struct ext4_inode_info *ei = EXT4_I(inode); |
---|
| 4827 | + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
---|
| 4828 | + int count, delta; |
---|
| 4829 | + |
---|
| 4830 | + count = atomic_read(&ei->i_prealloc_active); |
---|
| 4831 | + delta = (sbi->s_mb_max_inode_prealloc >> 2) + 1; |
---|
| 4832 | + if (count > sbi->s_mb_max_inode_prealloc + delta) { |
---|
| 4833 | + count -= sbi->s_mb_max_inode_prealloc; |
---|
| 4834 | + ext4_discard_preallocations(inode, count); |
---|
| 4835 | + } |
---|
| 4836 | +} |
---|
| 4837 | + |
---|
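The trim heuristic above includes a hysteresis band: with the tunable limit max = s_mb_max_inode_prealloc, trimming only begins once an inode holds more than max + (max >> 2) + 1 active PAs, and then discards back down to max. A runnable sketch of the arithmetic; the value 512 below is a sample limit chosen for the demo, not necessarily the mounted filesystem's setting.

#include <stdio.h>

/* models the threshold check in ext4_mb_trim_inode_pa() */
static int pas_to_trim(int active, int max)
{
	int delta = (max >> 2) + 1;        /* 25% slack plus one */

	if (active > max + delta)
		return active - max;       /* trim back down to the limit */
	return 0;
}

int main(void)
{
	/* with max = 512: delta = 129, so trimming only starts at 642 active PAs */
	printf("%d\n", pas_to_trim(640, 512));   /* 0: still inside the slack band */
	printf("%d\n", pas_to_trim(642, 512));   /* 130: discard the 130 oldest PAs */
	return 0;
}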
| 4838 | +/* |
---|
4428 | 4839 | * release all resource we used in allocation |
---|
4429 | 4840 | */ |
---|
4430 | 4841 | static int ext4_mb_release_context(struct ext4_allocation_context *ac) |
---|
4431 | 4842 | { |
---|
| 4843 | + struct inode *inode = ac->ac_inode; |
---|
| 4844 | + struct ext4_inode_info *ei = EXT4_I(inode); |
---|
4432 | 4845 | struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); |
---|
4433 | 4846 | struct ext4_prealloc_space *pa = ac->ac_pa; |
---|
4434 | 4847 | if (pa) { |
---|
.. | .. |
---|
4440 | 4853 | pa->pa_free -= ac->ac_b_ex.fe_len; |
---|
4441 | 4854 | pa->pa_len -= ac->ac_b_ex.fe_len; |
---|
4442 | 4855 | spin_unlock(&pa->pa_lock); |
---|
| 4856 | + |
---|
| 4857 | + /* |
---|
| 4858 | + * We want to add the pa to the right bucket. |
---|
| 4859 | + * Remove it from the list and while adding |
---|
| 4860 | + * make sure the list to which we are adding |
---|
| 4861 | + * doesn't grow big. |
---|
| 4862 | + */ |
---|
| 4863 | + if (likely(pa->pa_free)) { |
---|
| 4864 | + spin_lock(pa->pa_obj_lock); |
---|
| 4865 | + list_del_rcu(&pa->pa_inode_list); |
---|
| 4866 | + spin_unlock(pa->pa_obj_lock); |
---|
| 4867 | + ext4_mb_add_n_trim(ac); |
---|
| 4868 | + } |
---|
4443 | 4869 | } |
---|
4444 | | - } |
---|
4445 | | - if (pa) { |
---|
4446 | | - /* |
---|
4447 | | - * We want to add the pa to the right bucket. |
---|
4448 | | - * Remove it from the list and while adding |
---|
4449 | | - * make sure the list to which we are adding |
---|
4450 | | - * doesn't grow big. |
---|
4451 | | - */ |
---|
4452 | | - if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) { |
---|
| 4870 | + |
---|
| 4871 | + if (pa->pa_type == MB_INODE_PA) { |
---|
| 4872 | + /* |
---|
| 4873 | + * treat the per-inode prealloc list as an LRU list, then try |
---|
| 4874 | + * to trim the least recently used PA (modeled in the sketch below). |
---|
| 4875 | + */ |
---|
4453 | 4876 | spin_lock(pa->pa_obj_lock); |
---|
4454 | | - list_del_rcu(&pa->pa_inode_list); |
---|
| 4877 | + list_move(&pa->pa_inode_list, &ei->i_prealloc_list); |
---|
4455 | 4878 | spin_unlock(pa->pa_obj_lock); |
---|
4456 | | - ext4_mb_add_n_trim(ac); |
---|
4457 | 4879 | } |
---|
| 4880 | + |
---|
4458 | 4881 | ext4_mb_put_pa(ac, ac->ac_sb, pa); |
---|
4459 | 4882 | } |
---|
4460 | 4883 | if (ac->ac_bitmap_page) |
---|
.. | .. |
---|
4464 | 4887 | if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) |
---|
4465 | 4888 | mutex_unlock(&ac->ac_lg->lg_mutex); |
---|
4466 | 4889 | ext4_mb_collect_stats(ac); |
---|
| 4890 | + ext4_mb_trim_inode_pa(inode); |
---|
4467 | 4891 | return 0; |
---|
4468 | 4892 | } |
---|
4469 | 4893 | |
---|
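Taken together, the list_move() in ext4_mb_release_context() and the tail-first walk (i_prealloc_list.prev) in ext4_discard_preallocations() make the per-inode list behave as an LRU, exactly as the comment above says: a PA that just served an allocation moves to the head, and trimming consumes victims from the tail. A userspace model follows, with the <sys/queue.h> TAILQ standing in for the kernel's list_head; all names are illustrative.

#include <stdio.h>
#include <stdlib.h>
#include <sys/queue.h>

struct pa {
	int id;
	TAILQ_ENTRY(pa) link;
};
TAILQ_HEAD(pa_list, pa);

/* models list_move(&pa->pa_inode_list, &ei->i_prealloc_list) on reuse */
static void mark_used(struct pa_list *lru, struct pa *pa)
{
	TAILQ_REMOVE(lru, pa, link);
	TAILQ_INSERT_HEAD(lru, pa, link);
}

/* models the needed-bounded tail walk in ext4_discard_preallocations() */
static void trim(struct pa_list *lru, unsigned int needed)
{
	while (needed-- && !TAILQ_EMPTY(lru)) {
		struct pa *victim = TAILQ_LAST(lru, pa_list);

		TAILQ_REMOVE(lru, victim, link);
		printf("discarding PA %d\n", victim->id);
		free(victim);
	}
}

int main(void)
{
	struct pa_list lru = TAILQ_HEAD_INITIALIZER(lru);
	struct pa *pa;
	int i;

	for (i = 0; i < 4; i++) {
		pa = malloc(sizeof(*pa));
		pa->id = i;
		TAILQ_INSERT_HEAD(&lru, pa, link);
	}
	mark_used(&lru, TAILQ_LAST(&lru, pa_list)); /* PA 0 becomes most recent */
	trim(&lru, 2);                              /* discards PAs 1 and 2 */
	return 0;
}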
.. | .. |
---|
4471 | 4895 | { |
---|
4472 | 4896 | ext4_group_t i, ngroups = ext4_get_groups_count(sb); |
---|
4473 | 4897 | int ret; |
---|
4474 | | - int freed = 0; |
---|
| 4898 | + int freed = 0, busy = 0; |
---|
| 4899 | + int retry = 0; |
---|
4475 | 4900 | |
---|
4476 | 4901 | trace_ext4_mb_discard_preallocations(sb, needed); |
---|
| 4902 | + |
---|
| 4903 | + if (needed == 0) |
---|
| 4904 | + needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1; |
---|
| 4905 | + repeat: |
---|
4477 | 4906 | for (i = 0; i < ngroups && needed > 0; i++) { |
---|
4478 | | - ret = ext4_mb_discard_group_preallocations(sb, i, needed); |
---|
| 4907 | + ret = ext4_mb_discard_group_preallocations(sb, i, &busy); |
---|
4479 | 4908 | freed += ret; |
---|
4480 | 4909 | needed -= ret; |
---|
| 4910 | + cond_resched(); |
---|
| 4911 | + } |
---|
| 4912 | + |
---|
| 4913 | + if (needed > 0 && busy && ++retry < 3) { |
---|
| 4914 | + busy = 0; |
---|
| 4915 | + goto repeat; |
---|
4481 | 4916 | } |
---|
4482 | 4917 | |
---|
4483 | 4918 | return freed; |
---|
4484 | 4919 | } |
---|
| 4920 | + |
---|
| 4921 | +static bool ext4_mb_discard_preallocations_should_retry(struct super_block *sb, |
---|
| 4922 | + struct ext4_allocation_context *ac, u64 *seq) |
---|
| 4923 | +{ |
---|
| 4924 | + int freed; |
---|
| 4925 | + u64 seq_retry = 0; |
---|
| 4926 | + bool ret = false; |
---|
| 4927 | + |
---|
| 4928 | + freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len); |
---|
| 4929 | + if (freed) { |
---|
| 4930 | + ret = true; |
---|
| 4931 | + goto out_dbg; |
---|
| 4932 | + } |
---|
| 4933 | + seq_retry = ext4_get_discard_pa_seq_sum(); |
---|
| 4934 | + if (!(ac->ac_flags & EXT4_MB_STRICT_CHECK) || seq_retry != *seq) { |
---|
| 4935 | + ac->ac_flags |= EXT4_MB_STRICT_CHECK; |
---|
| 4936 | + *seq = seq_retry; |
---|
| 4937 | + ret = true; |
---|
| 4938 | + } |
---|
| 4939 | + |
---|
| 4940 | +out_dbg: |
---|
| 4941 | + mb_debug(sb, "freed %d, retry ? %s\n", freed, ret ? "yes" : "no"); |
---|
| 4942 | + return ret; |
---|
| 4943 | +} |
---|
| 4944 | + |
---|
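ext4_mb_discard_preallocations_should_retry() encodes a three-way decision: retry immediately if this discard pass actually freed something; otherwise escalate once to strict checking (EXT4_MB_STRICT_CHECK); after that, retry only if the summed discard_pa_seq moved since it was sampled, meaning some other CPU freed or discarded blocks in the meantime. The same decision logic restated as a self-contained model — the struct and names are stand-ins:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct ac_model {
	bool strict_check;   /* models EXT4_MB_STRICT_CHECK in ac->ac_flags */
};

static bool should_retry(struct ac_model *ac, int freed,
			 uint64_t seq_now, uint64_t *seq)
{
	if (freed)
		return true;                 /* we made room: try the allocator again */
	if (!ac->strict_check || seq_now != *seq) {
		ac->strict_check = true;     /* next pass double-checks free blocks */
		*seq = seq_now;
		return true;
	}
	return false;                        /* nothing freed, nothing changed: ENOSPC */
}

int main(void)
{
	struct ac_model ac = { .strict_check = false };
	uint64_t seq = 10;

	printf("%d\n", should_retry(&ac, 0, 10, &seq));  /* 1: escalate to strict */
	printf("%d\n", should_retry(&ac, 0, 10, &seq));  /* 0: give up */
	printf("%d\n", should_retry(&ac, 0, 11, &seq));  /* 1: seq moved, retry */
	return 0;
}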
| 4945 | +static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle, |
---|
| 4946 | + struct ext4_allocation_request *ar, int *errp); |
---|
4485 | 4947 | |
---|
4486 | 4948 | /* |
---|
4487 | 4949 | * Main entry point into mballoc to allocate blocks |
---|
.. | .. |
---|
4491 | 4953 | ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, |
---|
4492 | 4954 | struct ext4_allocation_request *ar, int *errp) |
---|
4493 | 4955 | { |
---|
4494 | | - int freed; |
---|
4495 | 4956 | struct ext4_allocation_context *ac = NULL; |
---|
4496 | 4957 | struct ext4_sb_info *sbi; |
---|
4497 | 4958 | struct super_block *sb; |
---|
4498 | 4959 | ext4_fsblk_t block = 0; |
---|
4499 | 4960 | unsigned int inquota = 0; |
---|
4500 | 4961 | unsigned int reserv_clstrs = 0; |
---|
| 4962 | + int retries = 0; |
---|
| 4963 | + u64 seq; |
---|
4501 | 4964 | |
---|
4502 | 4965 | might_sleep(); |
---|
4503 | 4966 | sb = ar->inode->i_sb; |
---|
4504 | 4967 | sbi = EXT4_SB(sb); |
---|
4505 | 4968 | |
---|
4506 | 4969 | trace_ext4_request_blocks(ar); |
---|
| 4970 | + if (sbi->s_mount_state & EXT4_FC_REPLAY) |
---|
| 4971 | + return ext4_mb_new_blocks_simple(handle, ar, errp); |
---|
4507 | 4972 | |
---|
4508 | 4973 | /* Allow to use superuser reservation for quota file */ |
---|
4509 | 4974 | if (ext4_is_quota_file(ar->inode)) |
---|
.. | .. |
---|
4522 | 4987 | ar->len = ar->len >> 1; |
---|
4523 | 4988 | } |
---|
4524 | 4989 | if (!ar->len) { |
---|
| 4990 | + ext4_mb_show_pa(sb); |
---|
4525 | 4991 | *errp = -ENOSPC; |
---|
4526 | 4992 | return 0; |
---|
4527 | 4993 | } |
---|
.. | .. |
---|
4559 | 5025 | } |
---|
4560 | 5026 | |
---|
4561 | 5027 | ac->ac_op = EXT4_MB_HISTORY_PREALLOC; |
---|
| 5028 | + seq = this_cpu_read(discard_pa_seq); |
---|
4562 | 5029 | if (!ext4_mb_use_preallocated(ac)) { |
---|
4563 | 5030 | ac->ac_op = EXT4_MB_HISTORY_ALLOC; |
---|
4564 | 5031 | ext4_mb_normalize_request(ac, ar); |
---|
| 5032 | + |
---|
| 5033 | + *errp = ext4_mb_pa_alloc(ac); |
---|
| 5034 | + if (*errp) |
---|
| 5035 | + goto errout; |
---|
4565 | 5036 | repeat: |
---|
4566 | 5037 | /* allocate space in core */ |
---|
4567 | 5038 | *errp = ext4_mb_regular_allocator(ac); |
---|
4568 | | - if (*errp) |
---|
4569 | | - goto discard_and_exit; |
---|
4570 | | - |
---|
4571 | | - /* as we've just preallocated more space than |
---|
4572 | | - * user requested originally, we store allocated |
---|
4573 | | - * space in a special descriptor */ |
---|
4574 | | - if (ac->ac_status == AC_STATUS_FOUND && |
---|
4575 | | - ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) |
---|
4576 | | - *errp = ext4_mb_new_preallocation(ac); |
---|
| 5039 | + /* |
---|
| 5040 | + * pa allocated above is added to grp->bb_prealloc_list only |
---|
| 5041 | + * when we were able to allocate some blocks, i.e. when |
---|
| 5042 | + * ac->ac_status == AC_STATUS_FOUND. |
---|
| 5043 | + * An error from above means ac->ac_status != AC_STATUS_FOUND, |
---|
| 5044 | + * so we have to free this pa right here. |
---|
| 5045 | + */ |
---|
4577 | 5046 | if (*errp) { |
---|
4578 | | - discard_and_exit: |
---|
| 5047 | + ext4_mb_pa_free(ac); |
---|
4579 | 5048 | ext4_discard_allocated_blocks(ac); |
---|
4580 | 5049 | goto errout; |
---|
4581 | 5050 | } |
---|
| 5051 | + if (ac->ac_status == AC_STATUS_FOUND && |
---|
| 5052 | + ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len) |
---|
| 5053 | + ext4_mb_pa_free(ac); |
---|
4582 | 5054 | } |
---|
4583 | 5055 | if (likely(ac->ac_status == AC_STATUS_FOUND)) { |
---|
4584 | 5056 | *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs); |
---|
.. | .. |
---|
4590 | 5062 | ar->len = ac->ac_b_ex.fe_len; |
---|
4591 | 5063 | } |
---|
4592 | 5064 | } else { |
---|
4593 | | - freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len); |
---|
4594 | | - if (freed) |
---|
| 5065 | + if (++retries < 3 && |
---|
| 5066 | + ext4_mb_discard_preallocations_should_retry(sb, ac, &seq)) |
---|
4595 | 5067 | goto repeat; |
---|
| 5068 | + /* |
---|
| 5069 | + * If block allocation fails, then the pa allocated above |
---|
| 5070 | + * needs to be freed right here as well. |
---|
| 5071 | + */ |
---|
| 5072 | + ext4_mb_pa_free(ac); |
---|
4596 | 5073 | *errp = -ENOSPC; |
---|
4597 | 5074 | } |
---|
4598 | 5075 | |
---|
.. | .. |
---|
4721 | 5198 | return 0; |
---|
4722 | 5199 | } |
---|
4723 | 5200 | |
---|
| 5201 | +/* |
---|
| 5202 | + * Simple allocator for Ext4 fast commit replay path. It searches for blocks |
---|
| 5203 | + * linearly, starting at the goal block, and skips blocks that are |
---|
| 5204 | + * going to be in use after fast commit replay (see the sketch after this function). |
---|
| 5205 | + */ |
---|
| 5206 | +static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle, |
---|
| 5207 | + struct ext4_allocation_request *ar, int *errp) |
---|
| 5208 | +{ |
---|
| 5209 | + struct buffer_head *bitmap_bh; |
---|
| 5210 | + struct super_block *sb = ar->inode->i_sb; |
---|
| 5211 | + ext4_group_t group; |
---|
| 5212 | + ext4_grpblk_t blkoff; |
---|
| 5213 | + ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); |
---|
| 5214 | + ext4_grpblk_t i = 0; |
---|
| 5215 | + ext4_fsblk_t goal, block; |
---|
| 5216 | + struct ext4_super_block *es = EXT4_SB(sb)->s_es; |
---|
| 5217 | + |
---|
| 5218 | + goal = ar->goal; |
---|
| 5219 | + if (goal < le32_to_cpu(es->s_first_data_block) || |
---|
| 5220 | + goal >= ext4_blocks_count(es)) |
---|
| 5221 | + goal = le32_to_cpu(es->s_first_data_block); |
---|
| 5222 | + |
---|
| 5223 | + ar->len = 0; |
---|
| 5224 | + ext4_get_group_no_and_offset(sb, goal, &group, &blkoff); |
---|
| 5225 | + for (; group < ext4_get_groups_count(sb); group++) { |
---|
| 5226 | + bitmap_bh = ext4_read_block_bitmap(sb, group); |
---|
| 5227 | + if (IS_ERR(bitmap_bh)) { |
---|
| 5228 | + *errp = PTR_ERR(bitmap_bh); |
---|
| 5229 | + pr_warn("Failed to read block bitmap\n"); |
---|
| 5230 | + return 0; |
---|
| 5231 | + } |
---|
| 5232 | + |
---|
| 5233 | + ext4_get_group_no_and_offset(sb, |
---|
| 5234 | + max(ext4_group_first_block_no(sb, group), goal), |
---|
| 5235 | + NULL, &blkoff); |
---|
| 5236 | + while (1) { |
---|
| 5237 | + i = mb_find_next_zero_bit(bitmap_bh->b_data, max, |
---|
| 5238 | + blkoff); |
---|
| 5239 | + if (i >= max) |
---|
| 5240 | + break; |
---|
| 5241 | + if (ext4_fc_replay_check_excluded(sb, |
---|
| 5242 | + ext4_group_first_block_no(sb, group) + i)) { |
---|
| 5243 | + blkoff = i + 1; |
---|
| 5244 | + } else |
---|
| 5245 | + break; |
---|
| 5246 | + } |
---|
| 5247 | + brelse(bitmap_bh); |
---|
| 5248 | + if (i < max) |
---|
| 5249 | + break; |
---|
| 5250 | + } |
---|
| 5251 | + |
---|
| 5252 | + if (group >= ext4_get_groups_count(sb) || i >= max) { |
---|
| 5253 | + *errp = -ENOSPC; |
---|
| 5254 | + return 0; |
---|
| 5255 | + } |
---|
| 5256 | + |
---|
| 5257 | + block = ext4_group_first_block_no(sb, group) + i; |
---|
| 5258 | + ext4_mb_mark_bb(sb, block, 1, 1); |
---|
| 5259 | + ar->len = 1; |
---|
| 5260 | + |
---|
| 5261 | + return block; |
---|
| 5262 | +} |
---|
| 5263 | + |
---|
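ext4_mb_new_blocks_simple() reduces allocation during fast-commit replay to a linear scan, as the comment before it promises: take the first zero bit in the group bitmap at or after the goal, skip anything ext4_fc_replay_check_excluded() reports as occupied after replay, and fall through to the next group once the current one is exhausted. The inner loop, modeled over a toy byte-array bitmap; the exclusion rule below is invented for the demo.

#include <stdbool.h>
#include <stdio.h>

#define NBITS 16

static bool test_bit(const unsigned char *map, int i)
{
	return map[i / 8] & (1u << (i % 8));
}

/* demo stand-in for ext4_fc_replay_check_excluded() */
static bool excluded(int i)
{
	return i == 5;   /* pretend block 5 will be in use after replay */
}

/* models the mb_find_next_zero_bit() loop inside one group */
static int find_free(const unsigned char *map, int start)
{
	int i;

	for (i = start; i < NBITS; i++) {
		if (test_bit(map, i))
			continue;    /* already allocated on disk */
		if (excluded(i))
			continue;    /* reserved for blocks yet to be replayed */
		return i;
	}
	return -1;                   /* group exhausted: caller tries the next one */
}

int main(void)
{
	unsigned char map[2] = { 0x1f, 0x00 };   /* bits 0-4 set, rest free */

	printf("first usable block: %d\n", find_free(map, 0));   /* prints 6 */
	return 0;
}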
| 5264 | +static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block, |
---|
| 5265 | + unsigned long count) |
---|
| 5266 | +{ |
---|
| 5267 | + struct buffer_head *bitmap_bh; |
---|
| 5268 | + struct super_block *sb = inode->i_sb; |
---|
| 5269 | + struct ext4_group_desc *gdp; |
---|
| 5270 | + struct buffer_head *gdp_bh; |
---|
| 5271 | + ext4_group_t group; |
---|
| 5272 | + ext4_grpblk_t blkoff; |
---|
| 5273 | + int already_freed = 0, err, i; |
---|
| 5274 | + |
---|
| 5275 | + ext4_get_group_no_and_offset(sb, block, &group, &blkoff); |
---|
| 5276 | + bitmap_bh = ext4_read_block_bitmap(sb, group); |
---|
| 5277 | + if (IS_ERR(bitmap_bh)) { |
---|
| 5278 | + err = PTR_ERR(bitmap_bh); |
---|
| 5279 | + pr_warn("Failed to read block bitmap\n"); |
---|
| 5280 | + return; |
---|
| 5281 | + } |
---|
| 5282 | + gdp = ext4_get_group_desc(sb, group, &gdp_bh); |
---|
| 5283 | + if (!gdp) |
---|
| 5284 | + return; |
---|
| 5285 | + |
---|
| 5286 | + for (i = 0; i < count; i++) { |
---|
| 5287 | + if (!mb_test_bit(blkoff + i, bitmap_bh->b_data)) |
---|
| 5288 | + already_freed++; |
---|
| 5289 | + } |
---|
| 5290 | + mb_clear_bits(bitmap_bh->b_data, blkoff, count); |
---|
| 5291 | + err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh); |
---|
| 5292 | + if (err) |
---|
| 5293 | + return; |
---|
| 5294 | + ext4_free_group_clusters_set( |
---|
| 5295 | + sb, gdp, ext4_free_group_clusters(sb, gdp) + |
---|
| 5296 | + count - already_freed); |
---|
| 5297 | + ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh); |
---|
| 5298 | + ext4_group_desc_csum_set(sb, group, gdp); |
---|
| 5299 | + ext4_handle_dirty_metadata(NULL, NULL, gdp_bh); |
---|
| 5300 | + sync_dirty_buffer(bitmap_bh); |
---|
| 5301 | + sync_dirty_buffer(gdp_bh); |
---|
| 5302 | + brelse(bitmap_bh); |
---|
| 5303 | +} |
---|
| 5304 | + |
---|
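A subtlety in ext4_free_blocks_simple() above: bits that are already clear in the bitmap must not inflate the group's free-cluster count, which is what the already_freed pass guards against before mb_clear_bits() runs. The accounting in isolation, as a sketch with a toy bitmap in place of the buffer head:

#include <stdio.h>

static int test_and_clear(unsigned char *map, int i)
{
	unsigned char mask = 1u << (i % 8);
	int was_set = !!(map[i / 8] & mask);

	map[i / 8] &= ~mask;
	return was_set;
}

/* returns how much the free-cluster counter may grow: count - already_freed */
static int free_range(unsigned char *map, int start, int count)
{
	int i, already_freed = 0;

	for (i = 0; i < count; i++)
		if (!test_and_clear(map, start + i))
			already_freed++;     /* bit was clear: block was never in use */
	return count - already_freed;
}

int main(void)
{
	unsigned char map[1] = { 0x0b };   /* bits 0, 1 and 3 set; bit 2 clear */

	/* freeing blocks 0-3 only adds 3 to the counter, not 4 */
	printf("newly freed: %d\n", free_range(map, 0, 4));
	return 0;
}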
4724 | 5305 | /** |
---|
4725 | 5306 | * ext4_free_blocks() -- Free given blocks and update quota |
---|
4726 | 5307 | * @handle: handle for this transaction |
---|
4727 | 5308 | * @inode: inode |
---|
4728 | | - * @block: start physical block to free |
---|
4729 | | - * @count: number of blocks to count |
---|
| 5309 | + * @bh: optional buffer of the block to be freed |
---|
| 5310 | + * @block: starting physical block to be freed |
---|
| 5311 | + * @count: number of blocks to be freed |
---|
4730 | 5312 | * @flags: flags used by ext4_free_blocks |
---|
4731 | 5313 | */ |
---|
4732 | 5314 | void ext4_free_blocks(handle_t *handle, struct inode *inode, |
---|
.. | .. |
---|
4746 | 5328 | int err = 0; |
---|
4747 | 5329 | int ret; |
---|
4748 | 5330 | |
---|
| 5331 | + sbi = EXT4_SB(sb); |
---|
| 5332 | + |
---|
| 5333 | + if (sbi->s_mount_state & EXT4_FC_REPLAY) { |
---|
| 5334 | + ext4_free_blocks_simple(inode, block, count); |
---|
| 5335 | + return; |
---|
| 5336 | + } |
---|
| 5337 | + |
---|
4749 | 5338 | might_sleep(); |
---|
4750 | 5339 | if (bh) { |
---|
4751 | 5340 | if (block) |
---|
.. | .. |
---|
4754 | 5343 | block = bh->b_blocknr; |
---|
4755 | 5344 | } |
---|
4756 | 5345 | |
---|
4757 | | - sbi = EXT4_SB(sb); |
---|
4758 | 5346 | if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && |
---|
4759 | 5347 | !ext4_inode_block_valid(inode, block, count)) { |
---|
4760 | 5348 | ext4_error(sb, "Freeing blocks not in datazone - " |
---|
.. | .. |
---|
4946 | 5534 | flex_group)->free_clusters); |
---|
4947 | 5535 | } |
---|
4948 | 5536 | |
---|
4949 | | - if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) |
---|
4950 | | - dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); |
---|
4951 | | - percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters); |
---|
| 5537 | + /* |
---|
| 5538 | + * on a bigalloc file system, defer the s_freeclusters_counter |
---|
| 5539 | + * update to the caller (ext4_remove_space and friends) so they |
---|
| 5540 | + * can determine if a cluster freed here should be rereserved |
---|
| 5541 | + */ |
---|
| 5542 | + if (!(flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)) { |
---|
| 5543 | + if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) |
---|
| 5544 | + dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); |
---|
| 5545 | + percpu_counter_add(&sbi->s_freeclusters_counter, |
---|
| 5546 | + count_clusters); |
---|
| 5547 | + } |
---|
4952 | 5548 | |
---|
4953 | 5549 | ext4_mb_unload_buddy(&e4b); |
---|
4954 | 5550 | |
---|