From 9370bb92b2d16684ee45cf24e879c93c509162da Mon Sep 17 00:00:00 2001 From: hc <hc@nodka.com> Date: Thu, 19 Dec 2024 01:47:39 +0000 Subject: [PATCH] add wifi6 8852be driver --- kernel/fs/ext4/super.c | 1253 +++++++++++++++++++++++++++++++++++++++------------------- 1 files changed, 834 insertions(+), 419 deletions(-) diff --git a/kernel/fs/ext4/super.c b/kernel/fs/ext4/super.c index 3f91397..2c0a592 100644 --- a/kernel/fs/ext4/super.c +++ b/kernel/fs/ext4/super.c @@ -43,7 +43,7 @@ #include <linux/uaccess.h> #include <linux/iversion.h> #include <linux/unicode.h> - +#include <linux/part_stat.h> #include <linux/kthread.h> #include <linux/freezer.h> @@ -93,11 +93,11 @@ * i_mmap_rwsem (inode->i_mmap_rwsem)! * * page fault path: - * mmap_sem -> sb_start_pagefault -> i_mmap_sem (r) -> transaction start -> + * mmap_lock -> sb_start_pagefault -> i_mmap_sem (r) -> transaction start -> * page lock -> i_data_sem (rw) * * buffered write path: - * sb_start_write -> i_mutex -> mmap_sem + * sb_start_write -> i_mutex -> mmap_lock * sb_start_write -> i_mutex -> transaction start -> page lock -> * i_data_sem (rw) * @@ -107,7 +107,7 @@ * i_data_sem (rw) * * direct IO: - * sb_start_write -> i_mutex -> mmap_sem + * sb_start_write -> i_mutex -> mmap_lock * sb_start_write -> i_mutex -> transaction start -> i_data_sem (rw) * * writepages: @@ -141,27 +141,109 @@ MODULE_ALIAS("ext3"); #define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type) + +static inline void __ext4_read_bh(struct buffer_head *bh, int op_flags, + bh_end_io_t *end_io) +{ + /* + * buffer's verified bit is no longer valid after reading from + * disk again due to write out error, clear it to make sure we + * recheck the buffer contents. + */ + clear_buffer_verified(bh); + + bh->b_end_io = end_io ? end_io : end_buffer_read_sync; + get_bh(bh); + submit_bh(REQ_OP_READ, op_flags, bh); +} + +void ext4_read_bh_nowait(struct buffer_head *bh, int op_flags, + bh_end_io_t *end_io) +{ + BUG_ON(!buffer_locked(bh)); + + if (ext4_buffer_uptodate(bh)) { + unlock_buffer(bh); + return; + } + __ext4_read_bh(bh, op_flags, end_io); +} + +int ext4_read_bh(struct buffer_head *bh, int op_flags, bh_end_io_t *end_io) +{ + BUG_ON(!buffer_locked(bh)); + + if (ext4_buffer_uptodate(bh)) { + unlock_buffer(bh); + return 0; + } + + __ext4_read_bh(bh, op_flags, end_io); + + wait_on_buffer(bh); + if (buffer_uptodate(bh)) + return 0; + return -EIO; +} + +int ext4_read_bh_lock(struct buffer_head *bh, int op_flags, bool wait) +{ + lock_buffer(bh); + if (!wait) { + ext4_read_bh_nowait(bh, op_flags, NULL); + return 0; + } + return ext4_read_bh(bh, op_flags, NULL); +} + /* - * This works like sb_bread() except it uses ERR_PTR for error + * This works like __bread_gfp() except it uses ERR_PTR for error * returns. Currently with sb_bread it's impossible to distinguish * between ENOMEM and EIO situations (since both result in a NULL * return. */ -struct buffer_head * -ext4_sb_bread(struct super_block *sb, sector_t block, int op_flags) +static struct buffer_head *__ext4_sb_bread_gfp(struct super_block *sb, + sector_t block, int op_flags, + gfp_t gfp) { - struct buffer_head *bh = sb_getblk(sb, block); + struct buffer_head *bh; + int ret; + bh = sb_getblk_gfp(sb, block, gfp); if (bh == NULL) return ERR_PTR(-ENOMEM); - if (buffer_uptodate(bh)) + if (ext4_buffer_uptodate(bh)) return bh; - ll_rw_block(REQ_OP_READ, REQ_META | op_flags, 1, &bh); - wait_on_buffer(bh); - if (buffer_uptodate(bh)) - return bh; - put_bh(bh); - return ERR_PTR(-EIO); + + ret = ext4_read_bh_lock(bh, REQ_META | op_flags, true); + if (ret) { + put_bh(bh); + return ERR_PTR(ret); + } + return bh; +} + +struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block, + int op_flags) +{ + return __ext4_sb_bread_gfp(sb, block, op_flags, __GFP_MOVABLE); +} + +struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb, + sector_t block) +{ + return __ext4_sb_bread_gfp(sb, block, 0, 0); +} + +void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block) +{ + struct buffer_head *bh = sb_getblk_gfp(sb, block, 0); + + if (likely(bh)) { + if (trylock_buffer(bh)) + ext4_read_bh_nowait(bh, REQ_RAHEAD, NULL); + brelse(bh); + } } static int ext4_verify_csum_type(struct super_block *sb, @@ -202,26 +284,6 @@ return; es->s_checksum = ext4_superblock_csum(sb, es); -} - -void *ext4_kvmalloc(size_t size, gfp_t flags) -{ - void *ret; - - ret = kmalloc(size, flags | __GFP_NOWARN); - if (!ret) - ret = __vmalloc(size, flags, PAGE_KERNEL); - return ret; -} - -void *ext4_kvzalloc(size_t size, gfp_t flags) -{ - void *ret; - - ret = kzalloc(size, flags | __GFP_NOWARN); - if (!ret) - ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL); - return ret; } ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, @@ -355,44 +417,6 @@ #define ext4_get_tstamp(es, tstamp) \ __ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi) -static void __save_error_info(struct super_block *sb, const char *func, - unsigned int line) -{ - struct ext4_super_block *es = EXT4_SB(sb)->s_es; - - EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; - if (bdev_read_only(sb->s_bdev)) - return; - es->s_state |= cpu_to_le16(EXT4_ERROR_FS); - ext4_update_tstamp(es, s_last_error_time); - strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func)); - es->s_last_error_line = cpu_to_le32(line); - if (!es->s_first_error_time) { - es->s_first_error_time = es->s_last_error_time; - es->s_first_error_time_hi = es->s_last_error_time_hi; - strncpy(es->s_first_error_func, func, - sizeof(es->s_first_error_func)); - es->s_first_error_line = cpu_to_le32(line); - es->s_first_error_ino = es->s_last_error_ino; - es->s_first_error_block = es->s_last_error_block; - } - /* - * Start the daily error reporting function if it hasn't been - * started already - */ - if (!es->s_error_count) - mod_timer(&EXT4_SB(sb)->s_err_report, jiffies + 24*60*60*HZ); - le32_add_cpu(&es->s_error_count, 1); -} - -static void save_error_info(struct super_block *sb, const char *func, - unsigned int line) -{ - __save_error_info(sb, func, line); - if (!bdev_read_only(sb->s_bdev)) - ext4_commit_super(sb, 1); -} - /* * The del_gendisk() function uninitializes the disk-specific data * structures, including the bdi structure, without telling anyone @@ -432,10 +456,176 @@ spin_unlock(&sbi->s_md_lock); } +/* + * This writepage callback for write_cache_pages() + * takes care of a few cases after page cleaning. + * + * write_cache_pages() already checks for dirty pages + * and calls clear_page_dirty_for_io(), which we want, + * to write protect the pages. + * + * However, we may have to redirty a page (see below.) + */ +static int ext4_journalled_writepage_callback(struct page *page, + struct writeback_control *wbc, + void *data) +{ + transaction_t *transaction = (transaction_t *) data; + struct buffer_head *bh, *head; + struct journal_head *jh; + + bh = head = page_buffers(page); + do { + /* + * We have to redirty a page in these cases: + * 1) If buffer is dirty, it means the page was dirty because it + * contains a buffer that needs checkpointing. So the dirty bit + * needs to be preserved so that checkpointing writes the buffer + * properly. + * 2) If buffer is not part of the committing transaction + * (we may have just accidentally come across this buffer because + * inode range tracking is not exact) or if the currently running + * transaction already contains this buffer as well, dirty bit + * needs to be preserved so that the buffer gets writeprotected + * properly on running transaction's commit. + */ + jh = bh2jh(bh); + if (buffer_dirty(bh) || + (jh && (jh->b_transaction != transaction || + jh->b_next_transaction))) { + redirty_page_for_writepage(wbc, page); + goto out; + } + } while ((bh = bh->b_this_page) != head); + +out: + return AOP_WRITEPAGE_ACTIVATE; +} + +static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode) +{ + struct address_space *mapping = jinode->i_vfs_inode->i_mapping; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = LONG_MAX, + .range_start = jinode->i_dirty_start, + .range_end = jinode->i_dirty_end, + }; + + return write_cache_pages(mapping, &wbc, + ext4_journalled_writepage_callback, + jinode->i_transaction); +} + +static int ext4_journal_submit_inode_data_buffers(struct jbd2_inode *jinode) +{ + int ret; + + if (ext4_should_journal_data(jinode->i_vfs_inode)) + ret = ext4_journalled_submit_inode_data_buffers(jinode); + else + ret = jbd2_journal_submit_inode_data_buffers(jinode); + + return ret; +} + +static int ext4_journal_finish_inode_data_buffers(struct jbd2_inode *jinode) +{ + int ret = 0; + + if (!ext4_should_journal_data(jinode->i_vfs_inode)) + ret = jbd2_journal_finish_inode_data_buffers(jinode); + + return ret; +} + static bool system_going_down(void) { return system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF || system_state == SYSTEM_RESTART; +} + +struct ext4_err_translation { + int code; + int errno; +}; + +#define EXT4_ERR_TRANSLATE(err) { .code = EXT4_ERR_##err, .errno = err } + +static struct ext4_err_translation err_translation[] = { + EXT4_ERR_TRANSLATE(EIO), + EXT4_ERR_TRANSLATE(ENOMEM), + EXT4_ERR_TRANSLATE(EFSBADCRC), + EXT4_ERR_TRANSLATE(EFSCORRUPTED), + EXT4_ERR_TRANSLATE(ENOSPC), + EXT4_ERR_TRANSLATE(ENOKEY), + EXT4_ERR_TRANSLATE(EROFS), + EXT4_ERR_TRANSLATE(EFBIG), + EXT4_ERR_TRANSLATE(EEXIST), + EXT4_ERR_TRANSLATE(ERANGE), + EXT4_ERR_TRANSLATE(EOVERFLOW), + EXT4_ERR_TRANSLATE(EBUSY), + EXT4_ERR_TRANSLATE(ENOTDIR), + EXT4_ERR_TRANSLATE(ENOTEMPTY), + EXT4_ERR_TRANSLATE(ESHUTDOWN), + EXT4_ERR_TRANSLATE(EFAULT), +}; + +static int ext4_errno_to_code(int errno) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(err_translation); i++) + if (err_translation[i].errno == errno) + return err_translation[i].code; + return EXT4_ERR_UNKNOWN; +} + +static void __save_error_info(struct super_block *sb, int error, + __u32 ino, __u64 block, + const char *func, unsigned int line) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + + EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; + if (bdev_read_only(sb->s_bdev)) + return; + /* We default to EFSCORRUPTED error... */ + if (error == 0) + error = EFSCORRUPTED; + es->s_state |= cpu_to_le16(EXT4_ERROR_FS); + ext4_update_tstamp(es, s_last_error_time); + strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func)); + es->s_last_error_line = cpu_to_le32(line); + es->s_last_error_ino = cpu_to_le32(ino); + es->s_last_error_block = cpu_to_le64(block); + es->s_last_error_errcode = ext4_errno_to_code(error); + if (!es->s_first_error_time) { + es->s_first_error_time = es->s_last_error_time; + es->s_first_error_time_hi = es->s_last_error_time_hi; + strncpy(es->s_first_error_func, func, + sizeof(es->s_first_error_func)); + es->s_first_error_line = cpu_to_le32(line); + es->s_first_error_ino = es->s_last_error_ino; + es->s_first_error_block = es->s_last_error_block; + es->s_first_error_errcode = es->s_last_error_errcode; + } + /* + * Start the daily error reporting function if it hasn't been + * started already + */ + if (!es->s_error_count) + mod_timer(&EXT4_SB(sb)->s_err_report, jiffies + 24*60*60*HZ); + le32_add_cpu(&es->s_error_count, 1); +} + +static void save_error_info(struct super_block *sb, int error, + __u32 ino, __u64 block, + const char *func, unsigned int line) +{ + __save_error_info(sb, error, ino, block, func, line); + if (!bdev_read_only(sb->s_bdev)) + ext4_commit_super(sb, 1); } /* Deal with the reporting of failure conditions on a filesystem such as @@ -463,7 +653,7 @@ if (sb_rdonly(sb) || test_opt(sb, ERRORS_CONT)) return; - EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; + ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED); if (journal) jbd2_journal_abort(journal, -EIO); /* @@ -480,9 +670,6 @@ smp_wmb(); sb->s_flags |= SB_RDONLY; } else if (test_opt(sb, ERRORS_PANIC)) { - if (EXT4_SB(sb)->s_journal && - !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR)) - return; panic("EXT4-fs (device %s): panic forced after error\n", sb->s_id); } @@ -493,7 +680,8 @@ "EXT4-fs error") void __ext4_error(struct super_block *sb, const char *function, - unsigned int line, const char *fmt, ...) + unsigned int line, int error, __u64 block, + const char *fmt, ...) { struct va_format vaf; va_list args; @@ -511,24 +699,21 @@ sb->s_id, function, line, current->comm, &vaf); va_end(args); } - save_error_info(sb, function, line); + save_error_info(sb, error, 0, block, function, line); ext4_handle_error(sb); } void __ext4_error_inode(struct inode *inode, const char *function, - unsigned int line, ext4_fsblk_t block, + unsigned int line, ext4_fsblk_t block, int error, const char *fmt, ...) { va_list args; struct va_format vaf; - struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return; trace_ext4_error(inode->i_sb, function, line); - es->s_last_error_ino = cpu_to_le32(inode->i_ino); - es->s_last_error_block = cpu_to_le64(block); if (ext4_error_ratelimit(inode->i_sb)) { va_start(args, fmt); vaf.fmt = fmt; @@ -545,7 +730,8 @@ current->comm, &vaf); va_end(args); } - save_error_info(inode->i_sb, function, line); + save_error_info(inode->i_sb, error, inode->i_ino, block, + function, line); ext4_handle_error(inode->i_sb); } @@ -555,7 +741,6 @@ { va_list args; struct va_format vaf; - struct ext4_super_block *es; struct inode *inode = file_inode(file); char pathname[80], *path; @@ -563,8 +748,6 @@ return; trace_ext4_error(inode->i_sb, function, line); - es = EXT4_SB(inode->i_sb)->s_es; - es->s_last_error_ino = cpu_to_le32(inode->i_ino); if (ext4_error_ratelimit(inode->i_sb)) { path = file_path(file, pathname, sizeof(pathname)); if (IS_ERR(path)) @@ -586,7 +769,8 @@ current->comm, path, &vaf); va_end(args); } - save_error_info(inode->i_sb, function, line); + save_error_info(inode->i_sb, EFSCORRUPTED, inode->i_ino, block, + function, line); ext4_handle_error(inode->i_sb); } @@ -654,7 +838,7 @@ sb->s_id, function, line, errstr); } - save_error_info(sb, function, line); + save_error_info(sb, -errno, 0, 0, function, line); ext4_handle_error(sb); } @@ -669,7 +853,7 @@ */ void __ext4_abort(struct super_block *sb, const char *function, - unsigned int line, const char *fmt, ...) + unsigned int line, int error, const char *fmt, ...) { struct va_format vaf; va_list args; @@ -677,7 +861,7 @@ if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) return; - save_error_info(sb, function, line); + save_error_info(sb, error, 0, 0, function, line); va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; @@ -686,24 +870,20 @@ va_end(args); if (sb_rdonly(sb) == 0) { + ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED); + if (EXT4_SB(sb)->s_journal) + jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); + ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); - EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; /* * Make sure updated value of ->s_mount_flags will be visible * before ->s_flags update */ smp_wmb(); sb->s_flags |= SB_RDONLY; - if (EXT4_SB(sb)->s_journal) - jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); - save_error_info(sb, function, line); } - if (test_opt(sb, ERRORS_PANIC) && !system_going_down()) { - if (EXT4_SB(sb)->s_journal && - !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR)) - return; + if (test_opt(sb, ERRORS_PANIC) && !system_going_down()) panic("EXT4-fs panic from previous error\n"); - } } void __ext4_msg(struct super_block *sb, @@ -712,6 +892,7 @@ struct va_format vaf; va_list args; + atomic_inc(&EXT4_SB(sb)->s_msg_count); if (!___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state), "EXT4-fs")) return; @@ -722,9 +903,12 @@ va_end(args); } -#define ext4_warning_ratelimit(sb) \ - ___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state), \ - "EXT4-fs warning") +static int ext4_warning_ratelimit(struct super_block *sb) +{ + atomic_inc(&EXT4_SB(sb)->s_warning_count); + return ___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state), + "EXT4-fs warning"); +} void __ext4_warning(struct super_block *sb, const char *function, unsigned int line, const char *fmt, ...) @@ -770,15 +954,12 @@ { struct va_format vaf; va_list args; - struct ext4_super_block *es = EXT4_SB(sb)->s_es; if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) return; trace_ext4_error(sb, function, line); - es->s_last_error_ino = cpu_to_le32(ino); - es->s_last_error_block = cpu_to_le64(block); - __save_error_info(sb, function, line); + __save_error_info(sb, EFSCORRUPTED, ino, block, function, line); if (ext4_error_ratelimit(sb)) { va_start(args, fmt); @@ -830,6 +1011,8 @@ struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); int ret; + if (!grp || !gdp) + return; if (flags & EXT4_GROUP_INFO_BBITMAP_CORRUPT) { ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); @@ -882,7 +1065,6 @@ static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb) { struct block_device *bdev; - char b[BDEVNAME_SIZE]; bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb); if (IS_ERR(bdev)) @@ -890,8 +1072,9 @@ return bdev; fail: - ext4_msg(sb, KERN_ERR, "failed to open journal device %s: %ld", - __bdevname(dev, b), PTR_ERR(bdev)); + ext4_msg(sb, KERN_ERR, + "failed to open journal device unknown-block(%u,%u) %ld", + MAJOR(dev), MINOR(dev), PTR_ERR(bdev)); return NULL; } @@ -906,10 +1089,16 @@ static void ext4_blkdev_remove(struct ext4_sb_info *sbi) { struct block_device *bdev; - bdev = sbi->journal_bdev; + bdev = sbi->s_journal_bdev; if (bdev) { + /* + * Invalidate the journal device's buffers. We don't want them + * floating about in memory - the physical journal device may + * hotswapped, and it breaks the `ro-after' testing code. + */ + invalidate_bdev(bdev); ext4_blkdev_put(bdev); - sbi->journal_bdev = NULL; + sbi->s_journal_bdev = NULL; } } @@ -974,6 +1163,18 @@ int aborted = 0; int i, err; + /* + * Unregister sysfs before destroying jbd2 journal. + * Since we could still access attr_journal_task attribute via sysfs + * path which could have sbi->s_journal->j_task as NULL + * Unregister sysfs before flush sbi->s_error_work. + * Since user may read /proc/fs/ext4/xx/mb_groups during umount, If + * read metadata verify failed then will queue error work. + * flush_stashed_error_work will call start_this_handle may trigger + * BUG_ON. + */ + ext4_unregister_sysfs(sb); + ext4_unregister_li_request(sb); ext4_quota_off_umount(sb); @@ -983,11 +1184,11 @@ aborted = is_journal_aborted(sbi->s_journal); err = jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; - if ((err < 0) && !aborted) - ext4_abort(sb, "Couldn't clean up the journal"); + if ((err < 0) && !aborted) { + ext4_abort(sb, -err, "Couldn't clean up the journal"); + } } - ext4_unregister_sysfs(sb); ext4_es_unregister_shrinker(sbi); del_timer_sync(&sbi->s_err_report); ext4_release_system_zone(sb); @@ -1017,6 +1218,7 @@ percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); percpu_counter_destroy(&sbi->s_dirtyclusters_counter); + percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit); percpu_free_rwsem(&sbi->s_writepages_rwsem); #ifdef CONFIG_QUOTA for (i = 0; i < EXT4_MAXQUOTAS; i++) @@ -1033,26 +1235,19 @@ sync_blockdev(sb->s_bdev); invalidate_bdev(sb->s_bdev); - if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) { - /* - * Invalidate the journal device's buffers. We don't want them - * floating about in memory - the physical journal device may - * hotswapped, and it breaks the `ro-after' testing code. - */ - sync_blockdev(sbi->journal_bdev); - invalidate_bdev(sbi->journal_bdev); + if (sbi->s_journal_bdev && sbi->s_journal_bdev != sb->s_bdev) { + sync_blockdev(sbi->s_journal_bdev); ext4_blkdev_remove(sbi); } - if (sbi->s_ea_inode_cache) { - ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); - sbi->s_ea_inode_cache = NULL; - } - if (sbi->s_ea_block_cache) { - ext4_xattr_destroy_cache(sbi->s_ea_block_cache); - sbi->s_ea_block_cache = NULL; - } - if (sbi->s_mmp_tsk) - kthread_stop(sbi->s_mmp_tsk); + + ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); + sbi->s_ea_inode_cache = NULL; + + ext4_xattr_destroy_cache(sbi->s_ea_block_cache); + sbi->s_ea_block_cache = NULL; + + ext4_stop_mmpd(sbi); + brelse(sbi->s_sbh); sb->s_fs_info = NULL; /* @@ -1065,7 +1260,7 @@ crypto_free_shash(sbi->s_chksum_driver); kfree(sbi->s_blockgroup_lock); fs_put_dax(sbi->s_daxdev); - fscrypt_free_dummy_context(&sbi->s_dummy_enc_ctx); + fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); #ifdef CONFIG_UNICODE utf8_unload(sb->s_encoding); #endif @@ -1086,8 +1281,10 @@ return NULL; inode_set_iversion(&ei->vfs_inode, 1); + ei->i_flags = 0; spin_lock_init(&ei->i_raw_lock); INIT_LIST_HEAD(&ei->i_prealloc_list); + atomic_set(&ei->i_prealloc_active, 0); spin_lock_init(&ei->i_prealloc_lock); ext4_es_init_tree(&ei->i_es_tree); rwlock_init(&ei->i_es_lock); @@ -1096,9 +1293,8 @@ ei->i_es_shk_nr = 0; ei->i_es_shrink_lblk = 0; ei->i_reserved_data_blocks = 0; - ei->i_da_metadata_calc_len = 0; - ei->i_da_metadata_calc_last_lblock = 0; spin_lock_init(&(ei->i_block_reservation_lock)); + ext4_init_pending_tree(&ei->i_pending_tree); #ifdef CONFIG_QUOTA ei->i_reserved_quota = 0; memset(&ei->i_dquot, 0, sizeof(ei->i_dquot)); @@ -1110,6 +1306,8 @@ ei->i_datasync_tid = 0; atomic_set(&ei->i_unwritten, 0); INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); + ext4_fc_init_inode(&ei->vfs_inode); + mutex_init(&ei->i_fc_lock); return &ei->vfs_inode; } @@ -1124,12 +1322,13 @@ return drop; } -static void ext4_i_callback(struct rcu_head *head) +static void ext4_free_in_core_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); - fscrypt_free_inode(inode); - + if (!list_empty(&(EXT4_I(inode)->i_fc_list))) { + pr_warn("%s: inode %ld still in fc list", + __func__, inode->i_ino); + } kmem_cache_free(ext4_inode_cachep, EXT4_I(inode)); } @@ -1144,7 +1343,12 @@ true); dump_stack(); } - call_rcu(&inode->i_rcu, ext4_i_callback); + + if (EXT4_I(inode)->i_reserved_data_blocks) + ext4_msg(inode->i_sb, KERN_ERR, + "Inode %lu (%p): i_reserved_data_blocks (%u) not cleared!", + inode->i_ino, EXT4_I(inode), + EXT4_I(inode)->i_reserved_data_blocks); } static void init_once(void *foo) @@ -1156,6 +1360,7 @@ init_rwsem(&ei->i_data_sem); init_rwsem(&ei->i_mmap_sem); inode_init_once(&ei->vfs_inode); + ext4_fc_init_inode(&ei->vfs_inode); } static int __init init_inodecache(void) @@ -1184,11 +1389,12 @@ void ext4_clear_inode(struct inode *inode) { + ext4_fc_del(inode); invalidate_inode_buffers(inode); clear_inode(inode); - dquot_drop(inode); - ext4_discard_preallocations(inode); + ext4_discard_preallocations(inode, 0); ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); + dquot_drop(inode); if (EXT4_I(inode)->jinode) { jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode), EXT4_I(inode)->jinode); @@ -1258,8 +1464,8 @@ if (!page_has_buffers(page)) return 0; if (journal) - return jbd2_journal_try_to_free_buffers(journal, page, - wait & ~__GFP_DIRECT_RECLAIM); + return jbd2_journal_try_to_free_buffers(journal, page); + return try_to_free_buffers(page); } @@ -1288,6 +1494,9 @@ if (WARN_ON_ONCE(IS_DAX(inode) && i_size_read(inode))) return -EINVAL; + if (ext4_test_inode_flag(inode, EXT4_INODE_DAX)) + return -EOPNOTSUPP; + res = ext4_convert_inline_data(inode); if (res) return res; @@ -1313,7 +1522,7 @@ * Update inode->i_flags - S_ENCRYPTED will be enabled, * S_DAX may be disabled */ - ext4_set_inode_flags(inode); + ext4_set_inode_flags(inode, false); } return res; } @@ -1340,7 +1549,7 @@ * Update inode->i_flags - S_ENCRYPTED will be enabled, * S_DAX may be disabled */ - ext4_set_inode_flags(inode); + ext4_set_inode_flags(inode, false); res = ext4_mark_inode_dirty(handle, inode); if (res) EXT4_ERROR_INODE(inode, "Failed to mark inode dirty"); @@ -1354,10 +1563,9 @@ return res; } -static const union fscrypt_context * -ext4_get_dummy_context(struct super_block *sb) +static const union fscrypt_policy *ext4_get_dummy_policy(struct super_block *sb) { - return EXT4_SB(sb)->s_dummy_enc_ctx.ctx; + return EXT4_SB(sb)->s_dummy_enc_policy.policy; } static bool ext4_has_stable_inodes(struct super_block *sb) @@ -1372,21 +1580,15 @@ *lblk_bits_ret = 8 * sizeof(ext4_lblk_t); } -static bool ext4_inline_crypt_enabled(struct super_block *sb) -{ - return test_opt(sb, INLINECRYPT); -} - static const struct fscrypt_operations ext4_cryptops = { .key_prefix = "ext4:", .get_context = ext4_get_context, .set_context = ext4_set_context, - .get_dummy_context = ext4_get_dummy_context, + .get_dummy_policy = ext4_get_dummy_policy, .empty_dir = ext4_empty_dir, .max_namelen = EXT4_NAME_LEN, .has_stable_inodes = ext4_has_stable_inodes, .get_ino_and_lblk_bits = ext4_get_ino_and_lblk_bits, - .inline_crypt_enabled = ext4_inline_crypt_enabled, }; #endif @@ -1409,7 +1611,6 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id, unsigned int flags); static int ext4_enable_quotas(struct super_block *sb); -static int ext4_get_next_id(struct super_block *sb, struct kqid *qid); static struct dquot **ext4_get_dquots(struct inode *inode) { @@ -1427,7 +1628,7 @@ .destroy_dquot = dquot_destroy, .get_projid = ext4_get_projid, .get_inode_usage = ext4_get_inode_usage, - .get_next_id = ext4_get_next_id, + .get_next_id = dquot_get_next_id, }; static const struct quotactl_ops ext4_qctl_operations = { @@ -1444,6 +1645,7 @@ static const struct super_operations ext4_sops = { .alloc_inode = ext4_alloc_inode, + .free_inode = ext4_free_in_core_inode, .destroy_inode = ext4_destroy_inode, .write_inode = ext4_write_inode, .dirty_inode = ext4_dirty_inode, @@ -1485,7 +1687,8 @@ Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, - Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version, Opt_dax, + Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version, + Opt_dax, Opt_dax_always, Opt_dax_inode, Opt_dax_never, Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_warn_on_error, Opt_nowarn_on_error, Opt_mblk_io_submit, Opt_lazytime, Opt_nolazytime, Opt_debug_want_extra_isize, @@ -1494,6 +1697,10 @@ Opt_dioread_nolock, Opt_dioread_lock, Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache, + Opt_prefetch_block_bitmaps, +#ifdef CONFIG_EXT4_DEBUG + Opt_fc_debug_max_replay, Opt_fc_debug_force +#endif }; static const match_table_t tokens = { @@ -1552,6 +1759,9 @@ {Opt_nobarrier, "nobarrier"}, {Opt_i_version, "i_version"}, {Opt_dax, "dax"}, + {Opt_dax_always, "dax=always"}, + {Opt_dax_inode, "dax=inode"}, + {Opt_dax_never, "dax=never"}, {Opt_stripe, "stripe=%u"}, {Opt_delalloc, "delalloc"}, {Opt_warn_on_error, "warn_on_error"}, @@ -1570,18 +1780,24 @@ {Opt_auto_da_alloc, "auto_da_alloc"}, {Opt_noauto_da_alloc, "noauto_da_alloc"}, {Opt_dioread_nolock, "dioread_nolock"}, + {Opt_dioread_lock, "nodioread_nolock"}, {Opt_dioread_lock, "dioread_lock"}, {Opt_discard, "discard"}, {Opt_nodiscard, "nodiscard"}, {Opt_init_itable, "init_itable=%u"}, {Opt_init_itable, "init_itable"}, {Opt_noinit_itable, "noinit_itable"}, +#ifdef CONFIG_EXT4_DEBUG + {Opt_fc_debug_force, "fc_debug_force"}, + {Opt_fc_debug_max_replay, "fc_debug_max_replay=%u"}, +#endif {Opt_max_dir_size_kb, "max_dir_size_kb=%u"}, {Opt_test_dummy_encryption, "test_dummy_encryption=%s"}, {Opt_test_dummy_encryption, "test_dummy_encryption"}, {Opt_inlinecrypt, "inlinecrypt"}, {Opt_nombcache, "nombcache"}, {Opt_nombcache, "no_mbcache"}, /* for backward compatibility */ + {Opt_prefetch_block_bitmaps, "prefetch_block_bitmaps"}, {Opt_removed, "check=none"}, /* mount option from ext2/3 */ {Opt_removed, "nocheck"}, /* mount option from ext2/3 */ {Opt_removed, "reservation"}, /* mount option from ext2/3 */ @@ -1700,6 +1916,8 @@ #define MOPT_NO_EXT3 0x0200 #define MOPT_EXT4_ONLY (MOPT_NO_EXT2 | MOPT_NO_EXT3) #define MOPT_STRING 0x0400 +#define MOPT_SKIP 0x0800 +#define MOPT_2 0x1000 static const struct mount_opts { int token; @@ -1724,6 +1942,7 @@ MOPT_EXT4_ONLY | MOPT_CLEAR}, {Opt_warn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_SET}, {Opt_nowarn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_CLEAR}, + {Opt_commit, 0, MOPT_NO_EXT2}, {Opt_nojournal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, MOPT_EXT4_ONLY | MOPT_CLEAR}, {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, @@ -1749,7 +1968,13 @@ {Opt_min_batch_time, 0, MOPT_GTE0}, {Opt_inode_readahead_blks, 0, MOPT_GTE0}, {Opt_init_itable, 0, MOPT_GTE0}, - {Opt_dax, EXT4_MOUNT_DAX, MOPT_SET}, + {Opt_dax, EXT4_MOUNT_DAX_ALWAYS, MOPT_SET | MOPT_SKIP}, + {Opt_dax_always, EXT4_MOUNT_DAX_ALWAYS, + MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP}, + {Opt_dax_inode, EXT4_MOUNT2_DAX_INODE, + MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP}, + {Opt_dax_never, EXT4_MOUNT2_DAX_NEVER, + MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP}, {Opt_stripe, 0, MOPT_GTE0}, {Opt_resuid, 0, MOPT_GTE0}, {Opt_resgid, 0, MOPT_GTE0}, @@ -1791,12 +2016,14 @@ {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT}, {Opt_max_dir_size_kb, 0, MOPT_GTE0}, {Opt_test_dummy_encryption, 0, MOPT_STRING}, -#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT - {Opt_inlinecrypt, EXT4_MOUNT_INLINECRYPT, MOPT_SET}, -#else - {Opt_inlinecrypt, EXT4_MOUNT_INLINECRYPT, MOPT_NOSUPPORT}, -#endif {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET}, + {Opt_prefetch_block_bitmaps, EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS, + MOPT_SET}, +#ifdef CONFIG_EXT4_DEBUG + {Opt_fc_debug_force, EXT4_MOUNT2_JOURNAL_FAST_COMMIT, + MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY}, + {Opt_fc_debug_max_replay, 0, MOPT_GTE0}, +#endif {Opt_err, 0, 0} }; @@ -1839,18 +2066,25 @@ struct ext4_sb_info *sbi = EXT4_SB(sb); int err; + if (!ext4_has_feature_encrypt(sb)) { + ext4_msg(sb, KERN_WARNING, + "test_dummy_encryption requires encrypt feature"); + return -1; + } + /* * This mount option is just for testing, and it's not worthwhile to * implement the extra complexity (e.g. RCU protection) that would be * needed to allow it to be set or changed during remount. We do allow * it to be specified during remount, but only if there is no change. */ - if (is_remount && !sbi->s_dummy_enc_ctx.ctx) { + if (is_remount && !sbi->s_dummy_enc_policy.policy) { ext4_msg(sb, KERN_WARNING, "Can't set test_dummy_encryption on remount"); return -1; } - err = fscrypt_set_test_dummy_encryption(sb, arg, &sbi->s_dummy_enc_ctx); + err = fscrypt_set_test_dummy_encryption(sb, arg->from, + &sbi->s_dummy_enc_policy); if (err) { if (err == -EEXIST) ext4_msg(sb, KERN_WARNING, @@ -1865,11 +2099,13 @@ return -1; } ext4_msg(sb, KERN_WARNING, "Test dummy encryption mode enabled"); + return 1; #else ext4_msg(sb, KERN_WARNING, - "Test dummy encryption mount option ignored"); + "test_dummy_encryption option not supported"); + return -1; + #endif - return 1; } static int handle_mount_opt(struct super_block *sb, char *opt, int token, @@ -1903,7 +2139,7 @@ ext4_msg(sb, KERN_WARNING, "Ignoring removed %s option", opt); return 1; case Opt_abort: - sbi->s_mount_flags |= EXT4_MF_FS_ABORTED; + ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED); return 1; case Opt_i_version: sb->s_flags |= SB_I_VERSION; @@ -1913,6 +2149,13 @@ return 1; case Opt_nolazytime: sb->s_flags &= ~SB_LAZYTIME; + return 1; + case Opt_inlinecrypt: +#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT + sb->s_flags |= SB_INLINECRYPT; +#else + ext4_msg(sb, KERN_ERR, "inline encryption not supported"); +#endif return 1; } @@ -1962,6 +2205,13 @@ } else if (token == Opt_commit) { if (arg == 0) arg = JBD2_DEFAULT_MAX_COMMIT_AGE; + else if (arg > INT_MAX / HZ) { + ext4_msg(sb, KERN_ERR, + "Invalid commit interval %d, " + "must be smaller than %d", + arg, INT_MAX / HZ); + return -1; + } sbi->s_commit_interval = HZ * arg; } else if (token == Opt_debug_want_extra_isize) { if ((arg & 1) || @@ -1991,6 +2241,10 @@ sbi->s_li_wait_mult = arg; } else if (token == Opt_max_dir_size_kb) { sbi->s_max_dir_size_kb = arg; +#ifdef CONFIG_EXT4_DEBUG + } else if (token == Opt_fc_debug_max_replay) { + sbi->s_fc_debug_max_replay = arg; +#endif } else if (token == Opt_stripe) { sbi->s_stripe = arg; } else if (token == Opt_resuid) { @@ -2092,23 +2346,56 @@ } sbi->s_jquota_fmt = m->mount_opt; #endif - } else if (token == Opt_dax) { + } else if (token == Opt_dax || token == Opt_dax_always || + token == Opt_dax_inode || token == Opt_dax_never) { #ifdef CONFIG_FS_DAX - if (is_remount && test_opt(sb, DAX)) { - ext4_msg(sb, KERN_ERR, "can't mount with " - "both data=journal and dax"); - return -1; + switch (token) { + case Opt_dax: + case Opt_dax_always: + if (is_remount && + (!(sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) || + (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER))) { + fail_dax_change_remount: + ext4_msg(sb, KERN_ERR, "can't change " + "dax mount option while remounting"); + return -1; + } + if (is_remount && + (test_opt(sb, DATA_FLAGS) == + EXT4_MOUNT_JOURNAL_DATA)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and dax"); + return -1; + } + ext4_msg(sb, KERN_WARNING, + "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); + sbi->s_mount_opt |= EXT4_MOUNT_DAX_ALWAYS; + sbi->s_mount_opt2 &= ~EXT4_MOUNT2_DAX_NEVER; + break; + case Opt_dax_never: + if (is_remount && + (!(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) || + (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS))) + goto fail_dax_change_remount; + sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_NEVER; + sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS; + break; + case Opt_dax_inode: + if (is_remount && + ((sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) || + (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) || + !(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_INODE))) + goto fail_dax_change_remount; + sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS; + sbi->s_mount_opt2 &= ~EXT4_MOUNT2_DAX_NEVER; + /* Strictly for printing options */ + sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_INODE; + break; } - if (is_remount && !(sbi->s_mount_opt & EXT4_MOUNT_DAX)) { - ext4_msg(sb, KERN_ERR, "can't change " - "dax mount option while remounting"); - return -1; - } - ext4_msg(sb, KERN_WARNING, - "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); - sbi->s_mount_opt |= m->mount_opt; #else ext4_msg(sb, KERN_INFO, "dax option not supported"); + sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_NEVER; + sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS; return -1; #endif } else if (token == Opt_data_err_abort) { @@ -2126,10 +2413,17 @@ WARN_ON(1); return -1; } - if (arg != 0) - sbi->s_mount_opt |= m->mount_opt; - else - sbi->s_mount_opt &= ~m->mount_opt; + if (m->flags & MOPT_2) { + if (arg != 0) + sbi->s_mount_opt2 |= m->mount_opt; + else + sbi->s_mount_opt2 &= ~m->mount_opt; + } else { + if (arg != 0) + sbi->s_mount_opt |= m->mount_opt; + else + sbi->s_mount_opt &= ~m->mount_opt; + } } return 1; } @@ -2139,7 +2433,7 @@ unsigned int *journal_ioprio, int is_remount) { - struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_sb_info __maybe_unused *sbi = EXT4_SB(sb); char *p, __maybe_unused *usr_qf_name, __maybe_unused *grp_qf_name; substring_t args[MAX_OPT_ARGS]; int token; @@ -2196,12 +2490,10 @@ if (test_opt(sb, DIOREAD_NOLOCK)) { int blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); - - if (blocksize < PAGE_SIZE) { - ext4_msg(sb, KERN_ERR, "can't mount with " - "dioread_nolock if block size != PAGE_SIZE"); - return 0; - } + if (blocksize < PAGE_SIZE) + ext4_msg(sb, KERN_WARNING, "Warning: mounting with an " + "experimental mount option 'dioread_nolock' " + "for blocksize < PAGE_SIZE"); } return 1; } @@ -2274,7 +2566,7 @@ for (m = ext4_mount_opts; m->token != Opt_err; m++) { int want_set = m->flags & MOPT_SET; if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) || - (m->flags & MOPT_CLEAR_ERR)) + (m->flags & MOPT_CLEAR_ERR) || m->flags & MOPT_SKIP) continue; if (!nodefs && !(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt))) continue; /* skip if same as the default */ @@ -2334,6 +2626,19 @@ fscrypt_show_test_dummy_encryption(seq, sep, sb); + if (sb->s_flags & SB_INLINECRYPT) + SEQ_OPTS_PUTS("inlinecrypt"); + + if (test_opt(sb, DAX_ALWAYS)) { + if (IS_EXT2_SB(sb)) + SEQ_OPTS_PUTS("dax"); + else + SEQ_OPTS_PUTS("dax=always"); + } else if (test_opt2(sb, DAX_NEVER)) { + SEQ_OPTS_PUTS("dax=never"); + } else if (test_opt2(sb, DAX_INODE)) { + SEQ_OPTS_PUTS("dax=inode"); + } ext4_show_quota_options(seq, sb); return 0; } @@ -2528,11 +2833,9 @@ crc = crc16(crc, (__u8 *)gdp, offset); offset += sizeof(gdp->bg_checksum); /* skip checksum */ /* for checksum of struct ext4_group_desc do the rest...*/ - if (ext4_has_feature_64bit(sb) && - offset < le16_to_cpu(sbi->s_es->s_desc_size)) + if (ext4_has_feature_64bit(sb) && offset < sbi->s_desc_size) crc = crc16(crc, (__u8 *)gdp + offset, - le16_to_cpu(sbi->s_es->s_desc_size) - - offset); + sbi->s_desc_size - offset); out: return cpu_to_le16(crc); @@ -2859,13 +3162,9 @@ loff_t res; loff_t upper_limit = MAX_LFS_FILESIZE; - /* small i_blocks in vfs inode? */ - if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { - /* - * CONFIG_LBDAF is not enabled implies the inode - * i_block represent total blocks in 512 bytes - * 32 == size of vfs inode i_blocks * 8 - */ + BUILD_BUG_ON(sizeof(blkcnt_t) < sizeof(u64)); + + if (!has_huge_files) { upper_limit = (1LL << 32) - 1; /* total blocks in file system block size */ @@ -2895,22 +3194,22 @@ */ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files) { - loff_t res = EXT4_NDIR_BLOCKS; + unsigned long long upper_limit, res = EXT4_NDIR_BLOCKS; int meta_blocks; - loff_t upper_limit; - /* This is calculated to be the largest file size for a dense, block + + /* + * This is calculated to be the largest file size for a dense, block * mapped file such that the file's total number of 512-byte sectors, * including data and all indirect blocks, does not exceed (2^48 - 1). * * __u32 i_blocks_lo and _u16 i_blocks_high represent the total * number of 512-byte sectors of the file. */ - - if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { + if (!has_huge_files) { /* - * !has_huge_files or CONFIG_LBDAF not enabled implies that - * the inode i_block field represents total file blocks in - * 2^32 512-byte sectors == size of vfs inode i_blocks * 8 + * !has_huge_files or implies that the inode i_block field + * represents total file blocks in 2^32 512-byte sectors == + * size of vfs inode i_blocks * 8 */ upper_limit = (1LL << 32) - 1; @@ -2948,7 +3247,7 @@ if (res > MAX_LFS_FILESIZE) res = MAX_LFS_FILESIZE; - return res; + return (loff_t)res; } static ext4_fsblk_t descriptor_loc(struct super_block *sb, @@ -3059,18 +3358,6 @@ ~EXT4_FEATURE_RO_COMPAT_SUPP)); return 0; } - /* - * Large file size enabled file system can only be mounted - * read-write on 32-bit systems if kernel is built with CONFIG_LBDAF - */ - if (ext4_has_feature_huge_file(sb)) { - if (sizeof(blkcnt_t) < sizeof(u64)) { - ext4_msg(sb, KERN_ERR, "Filesystem with huge files " - "cannot be mounted RDWR without " - "CONFIG_LBDAF"); - return 0; - } - } if (ext4_has_feature_bigalloc(sb) && !ext4_has_feature_extents(sb)) { ext4_msg(sb, KERN_ERR, "Can't support bigalloc feature without " @@ -3140,15 +3427,34 @@ static int ext4_run_li_request(struct ext4_li_request *elr) { struct ext4_group_desc *gdp = NULL; - ext4_group_t group, ngroups; - struct super_block *sb; + struct super_block *sb = elr->lr_super; + ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; + ext4_group_t group = elr->lr_next_group; + unsigned int prefetch_ios = 0; int ret = 0; u64 start_time; - sb = elr->lr_super; - ngroups = EXT4_SB(sb)->s_groups_count; + if (elr->lr_mode == EXT4_LI_MODE_PREFETCH_BBITMAP) { + elr->lr_next_group = ext4_mb_prefetch(sb, group, + EXT4_SB(sb)->s_mb_prefetch, &prefetch_ios); + if (prefetch_ios) + ext4_mb_prefetch_fini(sb, elr->lr_next_group, + prefetch_ios); + trace_ext4_prefetch_bitmaps(sb, group, elr->lr_next_group, + prefetch_ios); + if (group >= elr->lr_next_group) { + ret = 1; + if (elr->lr_first_not_zeroed != ngroups && + !sb_rdonly(sb) && test_opt(sb, INIT_INODE_TABLE)) { + elr->lr_next_group = elr->lr_first_not_zeroed; + elr->lr_mode = EXT4_LI_MODE_ITABLE; + ret = 0; + } + } + return ret; + } - for (group = elr->lr_next_group; group < ngroups; group++) { + for (; group < ngroups; group++) { gdp = ext4_get_group_desc(sb, group, NULL); if (!gdp) { ret = 1; @@ -3166,9 +3472,10 @@ start_time = ktime_get_real_ns(); ret = ext4_init_inode_table(sb, group, elr->lr_timeout ? 0 : 1); + trace_ext4_lazy_itable_init(sb, group); if (elr->lr_timeout == 0) { elr->lr_timeout = nsecs_to_jiffies((ktime_get_real_ns() - start_time) * - elr->lr_sbi->s_li_wait_mult); + EXT4_SB(elr->lr_super)->s_li_wait_mult); } elr->lr_next_sched = jiffies + elr->lr_timeout; elr->lr_next_group = group + 1; @@ -3182,15 +3489,11 @@ */ static void ext4_remove_li_request(struct ext4_li_request *elr) { - struct ext4_sb_info *sbi; - if (!elr) return; - sbi = elr->lr_sbi; - list_del(&elr->lr_request); - sbi->s_li_request = NULL; + EXT4_SB(elr->lr_super)->s_li_request = NULL; kfree(elr); } @@ -3227,6 +3530,7 @@ unsigned long next_wakeup, cur; BUG_ON(NULL == eli); + set_freezable(); cont_thread: while (true) { @@ -3399,7 +3703,6 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb, ext4_group_t start) { - struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_li_request *elr; elr = kzalloc(sizeof(*elr), GFP_KERNEL); @@ -3407,8 +3710,13 @@ return NULL; elr->lr_super = sb; - elr->lr_sbi = sbi; - elr->lr_next_group = start; + elr->lr_first_not_zeroed = start; + if (test_opt(sb, PREFETCH_BLOCK_BITMAPS)) + elr->lr_mode = EXT4_LI_MODE_PREFETCH_BBITMAP; + else { + elr->lr_mode = EXT4_LI_MODE_ITABLE; + elr->lr_next_group = start; + } /* * Randomize first schedule time of the request to @@ -3438,8 +3746,9 @@ goto out; } - if (first_not_zeroed == ngroups || sb_rdonly(sb) || - !test_opt(sb, INIT_INODE_TABLE)) + if (!test_opt(sb, PREFETCH_BLOCK_BITMAPS) && + (first_not_zeroed == ngroups || sb_rdonly(sb) || + !test_opt(sb, INIT_INODE_TABLE))) goto out; elr = ext4_li_request_new(sb, first_not_zeroed); @@ -3556,9 +3865,11 @@ ext4_fsblk_t first_block, last_block, b; ext4_group_t i, ngroups = ext4_get_groups_count(sb); int s, j, count = 0; + int has_super = ext4_bg_has_super(sb, grp); if (!ext4_has_feature_bigalloc(sb)) - return (ext4_bg_has_super(sb, grp) + ext4_bg_num_gdb(sb, grp) + + return (has_super + ext4_bg_num_gdb(sb, grp) + + (has_super ? le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0) + sbi->s_itb_per_group + 2); first_block = le32_to_cpu(sbi->s_es->s_first_data_block) + @@ -3650,8 +3961,8 @@ * Add the internal journal blocks whether the journal has been * loaded or not */ - if (sbi->s_journal && !sbi->journal_bdev) - overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_maxlen); + if (sbi->s_journal && !sbi->s_journal_bdev) + overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_total_len); else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) { /* j_inum for internal journal is non-zero */ j_inode = ext4_get_journal_inode(sb, j_inum); @@ -3719,7 +4030,7 @@ int blocksize, clustersize; unsigned int db_count; unsigned int i; - int needs_recovery, has_huge_files, has_bigalloc; + int needs_recovery, has_huge_files; __u64 blocks_count; int err = 0; unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; @@ -3764,8 +4075,11 @@ logical_sb_block = sb_block; } - if (!(bh = sb_bread_unmovable(sb, logical_sb_block))) { + bh = ext4_sb_bread_unmovable(sb, logical_sb_block); + if (IS_ERR(bh)) { ext4_msg(sb, KERN_ERR, "unable to read superblock"); + ret = PTR_ERR(bh); + bh = NULL; goto out_fail; } /* @@ -3832,6 +4146,8 @@ #ifdef CONFIG_EXT4_FS_POSIX_ACL set_opt(sb, POSIX_ACL); #endif + if (ext4_has_feature_fast_commit(sb)) + set_opt2(sb, JOURNAL_FAST_COMMIT); /* don't forget to enable journal_csum when metadata_csum is enabled. */ if (ext4_has_metadata_csum(sb)) set_opt(sb, JOURNAL_CHECKSUM); @@ -3877,14 +4193,25 @@ */ sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; - blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); - if (blocksize < EXT4_MIN_BLOCK_SIZE || - blocksize > EXT4_MAX_BLOCK_SIZE) { + if (le32_to_cpu(es->s_log_block_size) > + (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { ext4_msg(sb, KERN_ERR, - "Unsupported filesystem blocksize %d (%d log_block_size)", - blocksize, le32_to_cpu(es->s_log_block_size)); + "Invalid log block size: %u", + le32_to_cpu(es->s_log_block_size)); goto failed_mount; } + if (le32_to_cpu(es->s_log_cluster_size) > + (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { + ext4_msg(sb, KERN_ERR, + "Invalid log cluster size: %u", + le32_to_cpu(es->s_log_cluster_size)); + goto failed_mount; + } + + blocksize = EXT4_MIN_BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); + + if (blocksize == PAGE_SIZE) + set_opt(sb, DIOREAD_NOLOCK); if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) { sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE; @@ -3915,9 +4242,12 @@ if (sbi->s_inode_size >= offsetof(struct ext4_inode, i_atime_extra) + sizeof(((struct ext4_inode *)0)->i_atime_extra)) { sb->s_time_gran = 1; + sb->s_time_max = EXT4_EXTRA_TIMESTAMP_MAX; } else { sb->s_time_gran = NSEC_PER_SEC; + sb->s_time_max = EXT4_NON_EXTRA_TIMESTAMP_MAX; } + sb->s_time_min = EXT4_TIMESTAMP_MIN; } if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) { sbi->s_want_extra_isize = sizeof(struct ext4_inode) - @@ -3997,20 +4327,16 @@ #endif if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { - printk_once(KERN_WARNING "EXT4-fs: Warning: mounting " - "with data=journal disables delayed " - "allocation and O_DIRECT support!\n"); + printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with data=journal disables delayed allocation, dioread_nolock, O_DIRECT and fast_commit support!\n"); + /* can't mount with both data=journal and dioread_nolock. */ + clear_opt(sb, DIOREAD_NOLOCK); + clear_opt2(sb, JOURNAL_FAST_COMMIT); if (test_opt2(sb, EXPLICIT_DELALLOC)) { ext4_msg(sb, KERN_ERR, "can't mount with " "both data=journal and delalloc"); goto failed_mount; } - if (test_opt(sb, DIOREAD_NOLOCK)) { - ext4_msg(sb, KERN_ERR, "can't mount with " - "both data=journal and dioread_nolock"); - goto failed_mount; - } - if (test_opt(sb, DAX)) { + if (test_opt(sb, DAX_ALWAYS)) { ext4_msg(sb, KERN_ERR, "can't mount with " "both data=journal and dax"); goto failed_mount; @@ -4098,21 +4424,6 @@ if (!ext4_feature_set_ok(sb, (sb_rdonly(sb)))) goto failed_mount; - if (le32_to_cpu(es->s_log_block_size) > - (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { - ext4_msg(sb, KERN_ERR, - "Invalid log block size: %u", - le32_to_cpu(es->s_log_block_size)); - goto failed_mount; - } - if (le32_to_cpu(es->s_log_cluster_size) > - (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { - ext4_msg(sb, KERN_ERR, - "Invalid log cluster size: %u", - le32_to_cpu(es->s_log_cluster_size)); - goto failed_mount; - } - if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (blocksize / 4)) { ext4_msg(sb, KERN_ERR, "Number of reserved GDT blocks insanely large: %d", @@ -4120,16 +4431,19 @@ goto failed_mount; } - if (sbi->s_mount_opt & EXT4_MOUNT_DAX) { + if (bdev_dax_supported(sb->s_bdev, blocksize)) + set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags); + + if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) { if (ext4_has_feature_inline_data(sb)) { ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem" " that may contain inline data"); - sbi->s_mount_opt &= ~EXT4_MOUNT_DAX; + goto failed_mount; } - if (!bdev_dax_supported(sb->s_bdev, blocksize)) { + if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) { ext4_msg(sb, KERN_ERR, - "DAX unsupported by block device. Turning off DAX."); - sbi->s_mount_opt &= ~EXT4_MOUNT_DAX; + "DAX unsupported by block device."); + goto failed_mount; } } @@ -4140,20 +4454,28 @@ } if (sb->s_blocksize != blocksize) { + /* + * bh must be released before kill_bdev(), otherwise + * it won't be freed and its page also. kill_bdev() + * is called by sb_set_blocksize(). + */ + brelse(bh); /* Validate the filesystem blocksize */ if (!sb_set_blocksize(sb, blocksize)) { ext4_msg(sb, KERN_ERR, "bad block size %d", blocksize); + bh = NULL; goto failed_mount; } - brelse(bh); logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE; offset = do_div(logical_sb_block, blocksize); - bh = sb_bread_unmovable(sb, logical_sb_block); - if (!bh) { + bh = ext4_sb_bread_unmovable(sb, logical_sb_block); + if (IS_ERR(bh)) { ext4_msg(sb, KERN_ERR, "Can't read superblock on 2nd try"); + ret = PTR_ERR(bh); + bh = NULL; goto failed_mount; } es = (struct ext4_super_block *)(bh->b_data + offset); @@ -4199,7 +4521,7 @@ sbi->s_inodes_per_block; sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb); sbi->s_sbh = bh; - sbi->s_mount_state = le16_to_cpu(es->s_state); + sbi->s_mount_state = le16_to_cpu(es->s_state) & ~EXT4_FC_REPLAY; sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); @@ -4226,8 +4548,7 @@ /* Handle clustersize */ clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size); - has_bigalloc = ext4_has_feature_bigalloc(sb); - if (has_bigalloc) { + if (ext4_has_feature_bigalloc(sb)) { if (clustersize < blocksize) { ext4_msg(sb, KERN_ERR, "cluster size (%d) smaller than " @@ -4283,8 +4604,6 @@ if (err) { ext4_msg(sb, KERN_ERR, "filesystem" " too large to mount safely on this system"); - if (sizeof(sector_t) < 8) - ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); goto failed_mount; } @@ -4368,18 +4687,20 @@ /* Pre-read the descriptors into the buffer cache */ for (i = 0; i < db_count; i++) { block = descriptor_loc(sb, logical_sb_block, i); - sb_breadahead_unmovable(sb, block); + ext4_sb_breadahead_unmovable(sb, block); } for (i = 0; i < db_count; i++) { struct buffer_head *bh; block = descriptor_loc(sb, logical_sb_block, i); - bh = sb_bread_unmovable(sb, block); - if (!bh) { + bh = ext4_sb_bread_unmovable(sb, block); + if (IS_ERR(bh)) { ext4_msg(sb, KERN_ERR, "can't read group descriptor %d", i); db_count = i; + ret = PTR_ERR(bh); + bh = NULL; goto failed_mount2; } rcu_read_lock(); @@ -4427,14 +4748,36 @@ INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ mutex_init(&sbi->s_orphan_lock); + /* Initialize fast commit stuff */ + atomic_set(&sbi->s_fc_subtid, 0); + atomic_set(&sbi->s_fc_ineligible_updates, 0); + INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_MAIN]); + INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_STAGING]); + INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_MAIN]); + INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_STAGING]); + sbi->s_fc_bytes = 0; + ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); + ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING); + spin_lock_init(&sbi->s_fc_lock); + memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats)); + sbi->s_fc_replay_state.fc_regions = NULL; + sbi->s_fc_replay_state.fc_regions_size = 0; + sbi->s_fc_replay_state.fc_regions_used = 0; + sbi->s_fc_replay_state.fc_regions_valid = 0; + sbi->s_fc_replay_state.fc_modified_inodes = NULL; + sbi->s_fc_replay_state.fc_modified_inodes_size = 0; + sbi->s_fc_replay_state.fc_modified_inodes_used = 0; + sb->s_root = NULL; needs_recovery = (es->s_last_orphan != 0 || ext4_has_feature_journal_needs_recovery(sb)); - if (ext4_has_feature_mmp(sb) && !sb_rdonly(sb)) - if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block))) + if (ext4_has_feature_mmp(sb) && !sb_rdonly(sb)) { + err = ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)); + if (err) goto failed_mount3a; + } /* * The first inode we look at is the journal inode. Don't try @@ -4448,34 +4791,36 @@ ext4_has_feature_journal_needs_recovery(sb)) { ext4_msg(sb, KERN_ERR, "required journal recovery " "suppressed and not mounted read-only"); - goto failed_mount_wq; + goto failed_mount3a; } else { /* Nojournal mode, all journal mount options are illegal */ - if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) { - ext4_msg(sb, KERN_ERR, "can't mount with " - "journal_checksum, fs mounted w/o journal"); - goto failed_mount_wq; - } if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { ext4_msg(sb, KERN_ERR, "can't mount with " "journal_async_commit, fs mounted w/o journal"); - goto failed_mount_wq; + goto failed_mount3a; + } + + if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "journal_checksum, fs mounted w/o journal"); + goto failed_mount3a; } if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) { ext4_msg(sb, KERN_ERR, "can't mount with " "commit=%lu, fs mounted w/o journal", sbi->s_commit_interval / HZ); - goto failed_mount_wq; + goto failed_mount3a; } if (EXT4_MOUNT_DATA_FLAGS & (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) { ext4_msg(sb, KERN_ERR, "can't mount with " "data=, fs mounted w/o journal"); - goto failed_mount_wq; + goto failed_mount3a; } sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM; clear_opt(sb, JOURNAL_CHECKSUM); clear_opt(sb, DATA_FLAGS); + clear_opt2(sb, JOURNAL_FAST_COMMIT); sbi->s_journal = NULL; needs_recovery = 0; goto no_journal; @@ -4491,6 +4836,14 @@ if (!set_journal_csum_feature_set(sb)) { ext4_msg(sb, KERN_ERR, "Failed to set journal checksum " "feature set"); + goto failed_mount_wq; + } + + if (test_opt2(sb, JOURNAL_FAST_COMMIT) && + !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, + JBD2_FEATURE_INCOMPAT_FAST_COMMIT)) { + ext4_msg(sb, KERN_ERR, + "Failed to set fast commit journal feature"); goto failed_mount_wq; } @@ -4533,7 +4886,10 @@ set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); - sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; + sbi->s_journal->j_submit_inode_data_buffers = + ext4_journal_submit_inode_data_buffers; + sbi->s_journal->j_finish_inode_data_buffers = + ext4_journal_finish_inode_data_buffers; no_journal: if (!test_opt(sb, NO_MBCACHE)) { @@ -4554,31 +4910,27 @@ } } - if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) && - (blocksize != PAGE_SIZE)) { - ext4_msg(sb, KERN_ERR, - "Unsupported blocksize for fs encryption"); - goto failed_mount_wq; - } - if (ext4_has_feature_verity(sb) && blocksize != PAGE_SIZE) { ext4_msg(sb, KERN_ERR, "Unsupported blocksize for fs-verity"); goto failed_mount_wq; - } - - if (DUMMY_ENCRYPTION_ENABLED(sbi) && !sb_rdonly(sb) && - !ext4_has_feature_encrypt(sb)) { - ext4_set_feature_encrypt(sb); - ext4_commit_super(sb, 1); } /* * Get the # of file system overhead blocks from the * superblock if present. */ - if (es->s_overhead_clusters) - sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters); - else { + sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters); + /* ignore the precalculated value if it is ridiculous */ + if (sbi->s_overhead > ext4_blocks_count(es)) + sbi->s_overhead = 0; + /* + * If the bigalloc feature is not enabled recalculating the + * overhead doesn't take long, so we might as well just redo + * it to make sure we are using the correct value. + */ + if (!ext4_has_feature_bigalloc(sb)) + sbi->s_overhead = 0; + if (sbi->s_overhead == 0) { err = ext4_calculate_overhead(sb); if (err) goto failed_mount_wq; @@ -4638,6 +4990,7 @@ goto failed_mount4a; } } + ext4_fc_replay_cleanup(sb); ext4_ext_init(sb); err = ext4_mb_init(sb); @@ -4646,6 +4999,14 @@ err); goto failed_mount5; } + + /* + * We can only set up the journal commit callback once + * mballoc is initialized + */ + if (sbi->s_journal) + sbi->s_journal->j_commit_callback = + ext4_journal_commit_callback; block = ext4_count_free_clusters(sb); ext4_free_blocks_count_set(sbi->s_es, @@ -4665,6 +5026,9 @@ ext4_count_dirs(sb), GFP_KERNEL); if (!err) err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0, + GFP_KERNEL); + if (!err) + err = percpu_counter_init(&sbi->s_sra_exceeded_retry_limit, 0, GFP_KERNEL); if (!err) err = percpu_init_rwsem(&sbi->s_writepages_rwsem); @@ -4700,6 +5064,14 @@ } #endif /* CONFIG_QUOTA */ + /* + * Save the original bdev mapping's wb_err value which could be + * used to detect the metadata async write error. + */ + spin_lock_init(&sbi->s_bdev_wb_lock); + errseq_check_and_advance(&sb->s_bdev->bd_inode->i_mapping->wb_err, + &sbi->s_bdev_wb_err); + sb->s_bdev->bd_super = sb; EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; ext4_orphan_cleanup(sb, es); EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; @@ -4741,6 +5113,8 @@ ratelimit_state_init(&sbi->s_err_ratelimit_state, 5 * HZ, 10); ratelimit_state_init(&sbi->s_warning_ratelimit_state, 5 * HZ, 10); ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10); + atomic_set(&sbi->s_warning_count, 0); + atomic_set(&sbi->s_msg_count, 0); kfree(orig_data); return 0; @@ -4769,6 +5143,7 @@ percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); percpu_counter_destroy(&sbi->s_dirtyclusters_counter); + percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit); percpu_free_rwsem(&sbi->s_writepages_rwsem); failed_mount5: ext4_ext_release(sb); @@ -4781,14 +5156,12 @@ if (EXT4_SB(sb)->rsv_conversion_wq) destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); failed_mount_wq: - if (sbi->s_ea_inode_cache) { - ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); - sbi->s_ea_inode_cache = NULL; - } - if (sbi->s_ea_block_cache) { - ext4_xattr_destroy_cache(sbi->s_ea_block_cache); - sbi->s_ea_block_cache = NULL; - } + ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); + sbi->s_ea_inode_cache = NULL; + + ext4_xattr_destroy_cache(sbi->s_ea_block_cache); + sbi->s_ea_block_cache = NULL; + if (sbi->s_journal) { jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; @@ -4797,8 +5170,7 @@ ext4_es_unregister_shrinker(sbi); failed_mount3: del_timer_sync(&sbi->s_err_report); - if (sbi->s_mmp_tsk) - kthread_stop(sbi->s_mmp_tsk); + ext4_stop_mmpd(sbi); failed_mount2: rcu_read_lock(); group_desc = rcu_dereference(sbi->s_group_desc); @@ -4816,12 +5188,14 @@ #ifdef CONFIG_QUOTA for (i = 0; i < EXT4_MAXQUOTAS; i++) - kfree(sbi->s_qf_names[i]); + kfree(get_qf_name(sb, sbi, i)); #endif - fscrypt_free_dummy_context(&sbi->s_dummy_enc_ctx); - ext4_blkdev_remove(sbi); + fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); + /* ext4_blkdev_remove() calls kill_bdev(), release bh before it. */ brelse(bh); + ext4_blkdev_remove(sbi); out_fail: + invalidate_bdev(sb->s_bdev); sb->s_fs_info = NULL; kfree(sbi->s_blockgroup_lock); out_free_base: @@ -4843,6 +5217,7 @@ journal->j_commit_interval = sbi->s_commit_interval; journal->j_min_batch_time = sbi->s_min_batch_time; journal->j_max_batch_time = sbi->s_max_batch_time; + ext4_fc_init(sb, journal); write_lock(&journal->j_state_lock); if (test_opt(sb, BARRIER)) @@ -4880,7 +5255,7 @@ jbd_debug(2, "Journal inode found at %p: %lld bytes\n", journal_inode, journal_inode->i_size); - if (!S_ISREG(journal_inode->i_mode)) { + if (!S_ISREG(journal_inode->i_mode) || IS_ENCRYPTED(journal_inode)) { ext4_msg(sb, KERN_ERR, "invalid journal inode"); iput(journal_inode); return NULL; @@ -4985,9 +5360,7 @@ goto out_bdev; } journal->j_private = sb; - ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &journal->j_sb_buffer); - wait_on_buffer(journal->j_sb_buffer); - if (!buffer_uptodate(journal->j_sb_buffer)) { + if (ext4_read_bh_lock(journal->j_sb_buffer, REQ_META | REQ_PRIO, true)) { ext4_msg(sb, KERN_ERR, "I/O error on journal device"); goto out_journal; } @@ -4997,7 +5370,7 @@ be32_to_cpu(journal->j_superblock->s_nr_users)); goto out_journal; } - EXT4_SB(sb)->journal_bdev = bdev; + EXT4_SB(sb)->s_journal_bdev = bdev; ext4_init_journal_params(sb, journal); return journal; @@ -5323,7 +5696,7 @@ needs_barrier = true; if (needs_barrier) { int err; - err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL); + err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL); if (!ret) ret = err; } @@ -5413,11 +5786,11 @@ struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned long old_sb_flags, vfs_flags; struct ext4_mount_options old_opts; - int enable_quota = 0; ext4_group_t g; unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; int err = 0; #ifdef CONFIG_QUOTA + int enable_quota = 0; int i, j; char *to_free[EXT4_MAXQUOTAS]; #endif @@ -5502,8 +5875,8 @@ goto restore_opts; } - if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) - ext4_abort(sb, "Abort forced by user"); + if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) + ext4_abort(sb, EXT4_ERR_ESHUTDOWN, "Abort forced by user"); sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0); @@ -5516,7 +5889,7 @@ } if ((bool)(*flags & SB_RDONLY) != sb_rdonly(sb)) { - if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) { + if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) { err = -EROFS; goto restore_opts; } @@ -5551,8 +5924,6 @@ */ ext4_mark_recovery_complete(sb, es); } - if (sbi->s_mmp_tsk) - kthread_stop(sbi->s_mmp_tsk); } else { /* Make sure we can mount this feature set readwrite */ if (ext4_has_feature_readonly(sb) || @@ -5603,22 +5974,59 @@ if (err) goto restore_opts; } - sbi->s_mount_state = le16_to_cpu(es->s_state); + sbi->s_mount_state = (le16_to_cpu(es->s_state) & + ~EXT4_FC_REPLAY); err = ext4_setup_super(sb, es, 0); if (err) goto restore_opts; sb->s_flags &= ~SB_RDONLY; - if (ext4_has_feature_mmp(sb)) - if (ext4_multi_mount_protect(sb, - le64_to_cpu(es->s_mmp_block))) { - err = -EROFS; + if (ext4_has_feature_mmp(sb)) { + err = ext4_multi_mount_protect(sb, + le64_to_cpu(es->s_mmp_block)); + if (err) goto restore_opts; - } + } +#ifdef CONFIG_QUOTA enable_quota = 1; +#endif } } + + /* + * Handle creation of system zone data early because it can fail. + * Releasing of existing data is done when we are sure remount will + * succeed. + */ + if (test_opt(sb, BLOCK_VALIDITY) && !sbi->s_system_blks) { + err = ext4_setup_system_zone(sb); + if (err) + goto restore_opts; + } + + if (sbi->s_journal == NULL && !(old_sb_flags & SB_RDONLY)) { + err = ext4_commit_super(sb, 1); + if (err) + goto restore_opts; + } + +#ifdef CONFIG_QUOTA + if (enable_quota) { + if (sb_any_quota_suspended(sb)) + dquot_resume(sb, -1); + else if (ext4_has_feature_quota(sb)) { + err = ext4_enable_quotas(sb); + if (err) + goto restore_opts; + } + } + /* Release old quota file names */ + for (i = 0; i < EXT4_MAXQUOTAS; i++) + kfree(old_opts.s_qf_names[i]); +#endif + if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks) + ext4_release_system_zone(sb); /* * Reinitialize lazy itable initialization thread based on @@ -5632,39 +6040,8 @@ ext4_register_li_request(sb, first_not_zeroed); } - /* - * Handle creation of system zone data early because it can fail. - * Releasing of existing data is done when we are sure remount will - * succeed. - */ - if (test_opt(sb, BLOCK_VALIDITY) && !sbi->system_blks) { - err = ext4_setup_system_zone(sb); - if (err) - goto restore_opts; - } - - if (sbi->s_journal == NULL && !(old_sb_flags & SB_RDONLY)) { - err = ext4_commit_super(sb, 1); - if (err) - goto restore_opts; - } - -#ifdef CONFIG_QUOTA - /* Release old quota file names */ - for (i = 0; i < EXT4_MAXQUOTAS; i++) - kfree(old_opts.s_qf_names[i]); - if (enable_quota) { - if (sb_any_quota_suspended(sb)) - dquot_resume(sb, -1); - else if (ext4_has_feature_quota(sb)) { - err = ext4_enable_quotas(sb); - if (err) - goto restore_opts; - } - } -#endif - if (!test_opt(sb, BLOCK_VALIDITY) && sbi->system_blks) - ext4_release_system_zone(sb); + if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb)) + ext4_stop_mmpd(sbi); /* * Some options can be enabled by ext4 and/or by VFS mount flag @@ -5678,6 +6055,13 @@ return 0; restore_opts: + /* + * If there was a failing r/w to ro transition, we may need to + * re-enable quota + */ + if ((sb->s_flags & SB_RDONLY) && !(old_sb_flags & SB_RDONLY) && + sb_any_quota_suspended(sb)) + dquot_resume(sb, -1); sb->s_flags = old_sb_flags; sbi->s_mount_opt = old_opts.s_mount_opt; sbi->s_mount_opt2 = old_opts.s_mount_opt2; @@ -5686,7 +6070,7 @@ sbi->s_commit_interval = old_opts.s_commit_interval; sbi->s_min_batch_time = old_opts.s_min_batch_time; sbi->s_max_batch_time = old_opts.s_max_batch_time; - if (!test_opt(sb, BLOCK_VALIDITY) && sbi->system_blks) + if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks) ext4_release_system_zone(sb); #ifdef CONFIG_QUOTA sbi->s_jquota_fmt = old_opts.s_jquota_fmt; @@ -5698,6 +6082,8 @@ for (i = 0; i < EXT4_MAXQUOTAS; i++) kfree(to_free[i]); #endif + if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb)) + ext4_stop_mmpd(sbi); kfree(orig_data); return err; } @@ -5717,9 +6103,10 @@ return PTR_ERR(dquot); spin_lock(&dquot->dq_dqb_lock); - limit = (dquot->dq_dqb.dqb_bsoftlimit ? - dquot->dq_dqb.dqb_bsoftlimit : - dquot->dq_dqb.dqb_bhardlimit) >> sb->s_blocksize_bits; + limit = min_not_zero(dquot->dq_dqb.dqb_bsoftlimit, + dquot->dq_dqb.dqb_bhardlimit); + limit >>= sb->s_blocksize_bits; + if (limit && buf->f_blocks > limit) { curblock = (dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits; @@ -5729,9 +6116,8 @@ (buf->f_blocks - curblock) : 0; } - limit = dquot->dq_dqb.dqb_isoftlimit ? - dquot->dq_dqb.dqb_isoftlimit : - dquot->dq_dqb.dqb_ihardlimit; + limit = min_not_zero(dquot->dq_dqb.dqb_isoftlimit, + dquot->dq_dqb.dqb_ihardlimit); if (limit && buf->f_files > limit) { buf->f_files = limit; buf->f_ffree = @@ -5774,8 +6160,7 @@ buf->f_namelen = EXT4_NAME_LEN; fsid = le64_to_cpup((void *)es->s_uuid) ^ le64_to_cpup((void *)es->s_uuid + sizeof(u64)); - buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; - buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; + buf->f_fsid = u64_to_fsid(fsid); #ifdef CONFIG_QUOTA if (ext4_test_inode_flag(dentry->d_inode, EXT4_INODE_PROJINHERIT) && @@ -5871,7 +6256,7 @@ handle_t *handle; /* Data block + inode block */ - handle = ext4_journal_start(d_inode(sb->s_root), EXT4_HT_QUOTA, 2); + handle = ext4_journal_start_sb(sb, EXT4_HT_QUOTA, 2); if (IS_ERR(handle)) return PTR_ERR(handle); ret = dquot_commit_info(sb, type); @@ -5975,7 +6360,7 @@ EXT4_I(inode)->i_flags |= EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL; inode_set_flags(inode, S_NOATIME | S_IMMUTABLE, S_NOATIME | S_IMMUTABLE); - ext4_mark_inode_dirty(handle, inode); + err = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); unlock_inode: inode_unlock(inode); @@ -5986,6 +6371,20 @@ lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_NORMAL); return err; +} + +static inline bool ext4_check_quota_inum(int type, unsigned long qf_inum) +{ + switch (type) { + case USRQUOTA: + return qf_inum == EXT4_USR_QUOTA_INO; + case GRPQUOTA: + return qf_inum == EXT4_GRP_QUOTA_INO; + case PRJQUOTA: + return qf_inum >= EXT4_GOOD_OLD_FIRST_INO; + default: + BUG(); + } } static int ext4_quota_enable(struct super_block *sb, int type, int format_id, @@ -6004,16 +6403,23 @@ if (!qf_inums[type]) return -EPERM; + if (!ext4_check_quota_inum(type, qf_inums[type])) { + ext4_error(sb, "Bad quota inum: %lu, type: %d", + qf_inums[type], type); + return -EUCLEAN; + } + qf_inode = ext4_iget(sb, qf_inums[type], EXT4_IGET_SPECIAL); if (IS_ERR(qf_inode)) { - ext4_error(sb, "Bad quota inode # %lu", qf_inums[type]); + ext4_error(sb, "Bad quota inode: %lu, type: %d", + qf_inums[type], type); return PTR_ERR(qf_inode); } /* Don't account quota for quota files to avoid recursion */ qf_inode->i_flags |= S_NOQUOTA; lockdep_set_quota_inode(qf_inode, I_DATA_SEM_QUOTA); - err = dquot_enable(qf_inode, type, format_id, flags); + err = dquot_load_quota_inode(qf_inode, type, format_id, flags); if (err) lockdep_set_quota_inode(qf_inode, I_DATA_SEM_NORMAL); iput(qf_inode); @@ -6045,8 +6451,9 @@ if (err) { ext4_warning(sb, "Failed to enable quota tracking " - "(type=%d, err=%d). Please run " - "e2fsck to fix.", type, err); + "(type=%d, err=%d, ino=%lu). " + "Please run e2fsck to fix.", type, + err, qf_inums[type]); for (type--; type >= 0; type--) { struct inode *inode; @@ -6093,12 +6500,14 @@ * this is not a hard failure and quotas are already disabled. */ handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1); - if (IS_ERR(handle)) + if (IS_ERR(handle)) { + err = PTR_ERR(handle); goto out_unlock; + } EXT4_I(inode)->i_flags &= ~(EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL); inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE); inode->i_mtime = inode->i_ctime = current_time(inode); - ext4_mark_inode_dirty(handle, inode); + err = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); out_unlock: inode_unlock(inode); @@ -6156,7 +6565,7 @@ { struct inode *inode = sb_dqopt(sb)->files[type]; ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); - int err, offset = off & (sb->s_blocksize - 1); + int err = 0, err2 = 0, offset = off & (sb->s_blocksize - 1); int retries = 0; struct buffer_head *bh; handle_t *handle = journal_current_handle(); @@ -6182,7 +6591,7 @@ bh = ext4_bread(handle, inode, blk, EXT4_GET_BLOCKS_CREATE | EXT4_GET_BLOCKS_METADATA_NOFAIL); - } while (IS_ERR(bh) && (PTR_ERR(bh) == -ENOSPC) && + } while (PTR_ERR(bh) == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)); if (IS_ERR(bh)) return PTR_ERR(bh); @@ -6204,21 +6613,11 @@ if (inode->i_size < off + len) { i_size_write(inode, off + len); EXT4_I(inode)->i_disksize = inode->i_size; - ext4_mark_inode_dirty(handle, inode); + err2 = ext4_mark_inode_dirty(handle, inode); + if (unlikely(err2 && !err)) + err = err2; } - return len; -} - -static int ext4_get_next_id(struct super_block *sb, struct kqid *qid) -{ - const struct quota_format_ops *ops; - - if (!sb_has_quota_loaded(sb, qid->type)) - return -ESRCH; - ops = sb_dqopt(sb)->ops[qid->type]; - if (!ops || !ops->get_next_id) - return -ENOSYS; - return dquot_get_next_id(sb, qid); + return err ? err : len; } #endif @@ -6314,6 +6713,10 @@ if (err) return err; + err = ext4_init_pending(); + if (err) + goto out7; + err = ext4_init_post_read_processing(); if (err) goto out6; @@ -6336,6 +6739,11 @@ err = init_inodecache(); if (err) goto out1; + + err = ext4_fc_init_dentry_cache(); + if (err) + goto out05; + register_as_ext3(); register_as_ext2(); err = register_filesystem(&ext4_fs_type); @@ -6346,6 +6754,8 @@ out: unregister_as_ext2(); unregister_as_ext3(); + ext4_fc_destroy_dentry_cache(); +out05: destroy_inodecache(); out1: ext4_exit_mballoc(); @@ -6358,6 +6768,8 @@ out5: ext4_exit_post_read_processing(); out6: + ext4_exit_pending(); +out7: ext4_exit_es(); return err; @@ -6369,6 +6781,7 @@ unregister_as_ext2(); unregister_as_ext3(); unregister_filesystem(&ext4_fs_type); + ext4_fc_destroy_dentry_cache(); destroy_inodecache(); ext4_exit_mballoc(); ext4_exit_sysfs(); @@ -6376,11 +6789,13 @@ ext4_exit_pageio(); ext4_exit_post_read_processing(); ext4_exit_es(); + ext4_exit_pending(); } MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); MODULE_DESCRIPTION("Fourth Extended Filesystem"); MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY); MODULE_SOFTDEP("pre: crc32c"); module_init(ext4_init_fs) module_exit(ext4_exit_fs) -- Gitblit v1.6.2