From 102a0743326a03cd1a1202ceda21e175b7d3575c Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Tue, 20 Feb 2024 01:20:52 +0000
Subject: [PATCH] add new system file

---
 kernel/fs/block_dev.c | 816 ++++++++++++++++++++++++++++------------------------------
 1 files changed, 393 insertions(+), 423 deletions(-)

diff --git a/kernel/fs/block_dev.c b/kernel/fs/block_dev.c
index 182ed1f..f250a98 100644
--- a/kernel/fs/block_dev.c
+++ b/kernel/fs/block_dev.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  linux/fs/block_dev.c
  *
@@ -18,22 +19,21 @@
 #include <linux/module.h>
 #include <linux/blkpg.h>
 #include <linux/magic.h>
-#include <linux/dax.h>
 #include <linux/buffer_head.h>
 #include <linux/swap.h>
 #include <linux/pagevec.h>
 #include <linux/writeback.h>
 #include <linux/mpage.h>
 #include <linux/mount.h>
+#include <linux/pseudo_fs.h>
 #include <linux/uio.h>
 #include <linux/namei.h>
 #include <linux/log2.h>
 #include <linux/cleancache.h>
-#include <linux/dax.h>
-#include <linux/badblocks.h>
 #include <linux/task_io_accounting_ops.h>
 #include <linux/falloc.h>
 #include <linux/uaccess.h>
+#include <linux/suspend.h>
 #include "internal.h"

 struct bdev_inode {
@@ -75,7 +75,7 @@
 }

 /* Kill _all_ buffers and pagecache , dirty or not.. */
-void kill_bdev(struct block_device *bdev)
+static void kill_bdev(struct block_device *bdev)
 {
     struct address_space *mapping = bdev->bd_inode->i_mapping;

@@ -84,8 +84,7 @@
     invalidate_bh_lrus();
     truncate_inode_pages(mapping, 0);
-}
-EXPORT_SYMBOL(kill_bdev);
+}

 /* Invalidate clean unused buffers and pagecache. */
 void invalidate_bdev(struct block_device *bdev)
@@ -104,9 +103,47 @@
 }
 EXPORT_SYMBOL(invalidate_bdev);

+/*
+ * Drop all buffers & page cache for given bdev range. This function bails
+ * with error if bdev has other exclusive owner (such as filesystem).
+ */
+int truncate_bdev_range(struct block_device *bdev, fmode_t mode,
+            loff_t lstart, loff_t lend)
+{
+    struct block_device *claimed_bdev = NULL;
+    int err;
+
+    /*
+     * If we don't hold exclusive handle for the device, upgrade to it
+     * while we discard the buffer cache to avoid discarding buffers
+     * under live filesystem.
+     */
+    if (!(mode & FMODE_EXCL)) {
+        claimed_bdev = bdev->bd_contains;
+        err = bd_prepare_to_claim(bdev, claimed_bdev,
+                      truncate_bdev_range);
+        if (err)
+            goto invalidate;
+    }
+    truncate_inode_pages_range(bdev->bd_inode->i_mapping, lstart, lend);
+    if (claimed_bdev)
+        bd_abort_claiming(bdev, claimed_bdev, truncate_bdev_range);
+    return 0;
+
+invalidate:
+    /*
+     * Someone else has handle exclusively open. Try invalidating instead.
+     * The 'end' argument is inclusive so the rounding is safe.
+     */
+    return invalidate_inode_pages2_range(bdev->bd_inode->i_mapping,
+                         lstart >> PAGE_SHIFT,
+                         lend >> PAGE_SHIFT);
+}
+EXPORT_SYMBOL(truncate_bdev_range);
+
 static void set_init_blocksize(struct block_device *bdev)
 {
-    unsigned bsize = bdev_logical_block_size(bdev);
+    unsigned int bsize = bdev_logical_block_size(bdev);
     loff_t size = i_size_read(bdev->bd_inode);

     while (bsize < PAGE_SIZE) {
@@ -114,7 +151,6 @@
             break;
         bsize <<= 1;
     }
-    bdev->bd_block_size = bsize;
     bdev->bd_inode->i_blkbits = blksize_bits(bsize);
 }

@@ -129,9 +165,8 @@
         return -EINVAL;

     /* Don't change the size if it is same as current */
-    if (bdev->bd_block_size != size) {
+    if (bdev->bd_inode->i_blkbits != blksize_bits(size)) {
         sync_blockdev(bdev);
-        bdev->bd_block_size = size;
         bdev->bd_inode->i_blkbits = blksize_bits(size);
         kill_bdev(bdev);
     }
@@ -151,7 +186,7 @@

     return sb->s_blocksize;
 }
-EXPORT_SYMBOL(sb_set_blocksize);
+EXPORT_SYMBOL_NS(sb_set_blocksize, ANDROID_GKI_VFS_EXPORT_ONLY);

 int sb_min_blocksize(struct super_block *sb, int size)
 {
@@ -161,7 +196,7 @@

     return sb_set_blocksize(sb, size);
 }
-EXPORT_SYMBOL(sb_min_blocksize);
+EXPORT_SYMBOL_NS(sb_min_blocksize, ANDROID_GKI_VFS_EXPORT_ONLY);

 static int
 blkdev_get_block(struct inode *inode, sector_t iblock,
@@ -195,7 +230,7 @@
     struct task_struct *waiter = bio->bi_private;

     WRITE_ONCE(bio->bi_private, NULL);
-    wake_up_process(waiter);
+    blk_wake_io_task(waiter);
 }

 static ssize_t
@@ -204,13 +239,12 @@
 {
     struct file *file = iocb->ki_filp;
     struct block_device *bdev = I_BDEV(bdev_file_inode(file));
-    struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs, *bvec;
+    struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs;
     loff_t pos = iocb->ki_pos;
     bool should_dirty = false;
     struct bio bio;
     ssize_t ret;
     blk_qc_t qc;
-    int i;

     if ((pos | iov_iter_alignment(iter)) &
         (bdev_logical_block_size(bdev) - 1))
@@ -246,6 +280,10 @@
         bio.bi_opf = dio_bio_write_op(iocb);
         task_io_account_write(ret);
     }
+    if (iocb->ki_flags & IOCB_NOWAIT)
+        bio.bi_opf |= REQ_NOWAIT;
+    if (iocb->ki_flags & IOCB_HIPRI)
+        bio_set_polled(&bio, iocb);

     qc = submit_bio(&bio);
     for (;;) {
@@ -253,17 +291,12 @@
         if (!READ_ONCE(bio.bi_private))
             break;
         if (!(iocb->ki_flags & IOCB_HIPRI) ||
-            !blk_poll(bdev_get_queue(bdev), qc))
-            io_schedule();
+            !blk_poll(bdev_get_queue(bdev), qc, true))
+            blk_io_schedule();
     }
     __set_current_state(TASK_RUNNING);

-    bio_for_each_segment_all(bvec, &bio, i) {
-        if (should_dirty && !PageCompound(bvec->bv_page))
-            set_page_dirty_lock(bvec->bv_page);
-        put_page(bvec->bv_page);
-    }
-
+    bio_release_pages(&bio, should_dirty);
     if (unlikely(bio.bi_status))
         ret = blk_status_to_errno(bio.bi_status);

@@ -291,6 +324,14 @@

 static struct bio_set blkdev_dio_pool;

+static int blkdev_iopoll(struct kiocb *kiocb, bool wait)
+{
+    struct block_device *bdev = I_BDEV(kiocb->ki_filp->f_mapping->host);
+    struct request_queue *q = bdev_get_queue(bdev);
+
+    return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait);
+}
+
 static void blkdev_bio_end_io(struct bio *bio)
 {
     struct blkdev_dio *dio = bio->bi_private;
@@ -312,23 +353,20 @@
         }

         dio->iocb->ki_complete(iocb, ret, 0);
-        bio_put(&dio->bio);
+        if (dio->multi_bio)
+            bio_put(&dio->bio);
     } else {
         struct task_struct *waiter = dio->waiter;

         WRITE_ONCE(dio->waiter, NULL);
-        wake_up_process(waiter);
+        blk_wake_io_task(waiter);
     }
     }

     if (should_dirty) {
         bio_check_pages_dirty(bio);
     } else {
-        struct bio_vec *bvec;
-        int i;
-
-        bio_for_each_segment_all(bvec, bio, i)
-            put_page(bvec->bv_page);
+        bio_release_pages(bio, false);
         bio_put(bio);
     }
 }
@@ -342,6 +380,7 @@
     struct blk_plug plug;
     struct blkdev_dio *dio;
     struct bio *bio;
+    bool is_poll = (iocb->ki_flags & IOCB_HIPRI) != 0;
     bool is_read = (iov_iter_rw(iter) == READ), is_sync;
     loff_t pos = iocb->ki_pos;
     blk_qc_t qc = BLK_QC_T_NONE;
@@ -352,20 +391,27 @@
         return -EINVAL;

     bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, &blkdev_dio_pool);
-    bio_get(bio); /* extra ref for the completion handler */

     dio = container_of(bio, struct blkdev_dio, bio);
     dio->is_sync = is_sync = is_sync_kiocb(iocb);
-    if (dio->is_sync)
+    if (dio->is_sync) {
         dio->waiter = current;
-    else
+        bio_get(bio);
+    } else {
         dio->iocb = iocb;
+    }

     dio->size = 0;
     dio->multi_bio = false;
-    dio->should_dirty = is_read && (iter->type == ITER_IOVEC);
+    dio->should_dirty = is_read && iter_is_iovec(iter);

-    blk_start_plug(&plug);
+    /*
+     * Don't plug for HIPRI/polled IO, as those should go straight
+     * to issue
+     */
+    if (!is_poll)
+        blk_start_plug(&plug);
+
     for (;;) {
         bio_set_dev(bio, bdev);
         bio->bi_iter.bi_sector = pos >> 9;
@@ -389,17 +435,36 @@
             bio->bi_opf = dio_bio_write_op(iocb);
             task_io_account_write(bio->bi_iter.bi_size);
         }
+        if (iocb->ki_flags & IOCB_NOWAIT)
+            bio->bi_opf |= REQ_NOWAIT;

         dio->size += bio->bi_iter.bi_size;
         pos += bio->bi_iter.bi_size;

         nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES);
         if (!nr_pages) {
+            bool polled = false;
+
+            if (iocb->ki_flags & IOCB_HIPRI) {
+                bio_set_polled(bio, iocb);
+                polled = true;
+            }
+
             qc = submit_bio(bio);
+
+            if (polled)
+                WRITE_ONCE(iocb->ki_cookie, qc);
             break;
         }

         if (!dio->multi_bio) {
+            /*
+             * AIO needs an extra reference to ensure the dio
+             * structure which is embedded into the first bio
+             * stays around.
+             */
+            if (!is_sync)
+                bio_get(bio);
             dio->multi_bio = true;
             atomic_set(&dio->ref, 2);
         } else {
@@ -409,7 +474,9 @@
         submit_bio(bio);
         bio = bio_alloc(GFP_KERNEL, nr_pages);
     }
-    blk_finish_plug(&plug);
+
+    if (!is_poll)
+        blk_finish_plug(&plug);

     if (!is_sync)
         return -EIOCBQUEUED;
@@ -420,8 +487,8 @@
             break;
         if (!(iocb->ki_flags & IOCB_HIPRI) ||
-            !blk_poll(bdev_get_queue(bdev), qc))
-            io_schedule();
+            !blk_poll(bdev_get_queue(bdev), qc, true))
+            blk_io_schedule();
     }
     __set_current_state(TASK_RUNNING);

@@ -502,55 +569,47 @@
  * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze
  * actually.
  */
-struct super_block *freeze_bdev(struct block_device *bdev)
+int freeze_bdev(struct block_device *bdev)
 {
     struct super_block *sb;
     int error = 0;

     mutex_lock(&bdev->bd_fsfreeze_mutex);
-    if (++bdev->bd_fsfreeze_count > 1) {
-        /*
-         * We don't even need to grab a reference - the first call
-         * to freeze_bdev grab an active reference and only the last
-         * thaw_bdev drops it.
-         */
-        sb = get_super(bdev);
-        if (sb)
-            drop_super(sb);
-        mutex_unlock(&bdev->bd_fsfreeze_mutex);
-        return sb;
-    }
+    if (++bdev->bd_fsfreeze_count > 1)
+        goto done;

     sb = get_active_super(bdev);
     if (!sb)
-        goto out;
+        goto sync;
     if (sb->s_op->freeze_super)
         error = sb->s_op->freeze_super(sb);
     else
         error = freeze_super(sb);
-    if (error) {
-        deactivate_super(sb);
-        bdev->bd_fsfreeze_count--;
-        mutex_unlock(&bdev->bd_fsfreeze_mutex);
-        return ERR_PTR(error);
-    }
     deactivate_super(sb);
- out:
+
+    if (error) {
+        bdev->bd_fsfreeze_count--;
+        goto done;
+    }
+    bdev->bd_fsfreeze_sb = sb;
+
+sync:
     sync_blockdev(bdev);
+done:
     mutex_unlock(&bdev->bd_fsfreeze_mutex);
-    return sb;    /* thaw_bdev releases s->s_umount */
+    return error;
 }
 EXPORT_SYMBOL(freeze_bdev);

 /**
  * thaw_bdev  -- unlock filesystem
  * @bdev:    blockdevice to unlock
- * @sb:    associated superblock
  *
  * Unlocks the filesystem and marks it writeable again after freeze_bdev().
  */
-int thaw_bdev(struct block_device *bdev, struct super_block *sb)
+int thaw_bdev(struct block_device *bdev)
 {
+    struct super_block *sb;
     int error = -EINVAL;

     mutex_lock(&bdev->bd_fsfreeze_mutex);
@@ -561,6 +620,7 @@
     if (--bdev->bd_fsfreeze_count > 0)
         goto out;

+    sb = bdev->bd_fsfreeze_sb;
     if (!sb)
         goto out;

@@ -586,10 +646,9 @@
     return block_read_full_page(page, blkdev_get_block);
 }

-static int blkdev_readpages(struct file *file, struct address_space *mapping,
-            struct list_head *pages, unsigned nr_pages)
+static void blkdev_readahead(struct readahead_control *rac)
 {
-    return mpage_readpages(mapping, pages, nr_pages, blkdev_get_block);
+    mpage_readahead(rac, blkdev_get_block);
 }

 static int blkdev_write_begin(struct file *file, struct address_space *mapping,
@@ -644,7 +703,7 @@
      * i_mutex and doing so causes performance issues with concurrent
      * O_SYNC writers to a block device.
      */
-    error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
+    error = blkdev_issue_flush(bdev, GFP_KERNEL);
     if (error == -EOPNOTSUPP)
         error = 0;

@@ -677,15 +736,14 @@
     if (!ops->rw_page || bdev_get_integrity(bdev))
         return result;

-    result = blk_queue_enter(bdev->bd_queue, 0);
+    result = blk_queue_enter(bdev->bd_disk->queue, 0);
     if (result)
         return result;
     result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
                   REQ_OP_READ);
-    blk_queue_exit(bdev->bd_queue);
+    blk_queue_exit(bdev->bd_disk->queue);
     return result;
 }
-EXPORT_SYMBOL_GPL(bdev_read_page);

 /**
  * bdev_write_page() - Start writing a page to a block device
@@ -714,7 +772,7 @@
     if (!ops->rw_page || bdev_get_integrity(bdev))
         return -EOPNOTSUPP;
-    result = blk_queue_enter(bdev->bd_queue, 0);
+    result = blk_queue_enter(bdev->bd_disk->queue, 0);
     if (result)
         return result;

@@ -727,10 +785,9 @@
         clean_page_buffers(page);
         unlock_page(page);
     }
-    blk_queue_exit(bdev->bd_queue);
+    blk_queue_exit(bdev->bd_disk->queue);
     return result;
 }
-EXPORT_SYMBOL_GPL(bdev_write_page);

 /*
  * pseudo-fs
  */
@@ -747,17 +804,9 @@
     return &ei->vfs_inode;
 }

-static void bdev_i_callback(struct rcu_head *head)
+static void bdev_free_inode(struct inode *inode)
 {
-    struct inode *inode = container_of(head, struct inode, i_rcu);
-    struct bdev_inode *bdi = BDEV_I(inode);
-
-    kmem_cache_free(bdev_cachep, bdi);
-}
-
-static void bdev_destroy_inode(struct inode *inode)
-{
-    call_rcu(&inode->i_rcu, bdev_i_callback);
+    kmem_cache_free(bdev_cachep, BDEV_I(inode));
 }

 static void init_once(void *foo)
@@ -767,7 +816,6 @@

     memset(bdev, 0, sizeof(*bdev));
     mutex_init(&bdev->bd_mutex);
-    INIT_LIST_HEAD(&bdev->bd_list);
 #ifdef CONFIG_SYSFS
     INIT_LIST_HEAD(&bdev->bd_holder_disks);
 #endif
@@ -783,9 +831,6 @@
     truncate_inode_pages_final(&inode->i_data);
     invalidate_inode_buffers(inode); /* is it needed here? */
     clear_inode(inode);
-    spin_lock(&bdev_lock);
-    list_del_init(&bdev->bd_list);
-    spin_unlock(&bdev_lock);
     /* Detach inode from wb early as bdi_put() may free bdi->wb */
     inode_detach_wb(inode);
     if (bdev->bd_bdi != &noop_backing_dev_info) {
@@ -797,24 +842,24 @@
 static const struct super_operations bdev_sops = {
     .statfs = simple_statfs,
     .alloc_inode = bdev_alloc_inode,
-    .destroy_inode = bdev_destroy_inode,
+    .free_inode = bdev_free_inode,
     .drop_inode = generic_delete_inode,
     .evict_inode = bdev_evict_inode,
 };

-static struct dentry *bd_mount(struct file_system_type *fs_type,
-    int flags, const char *dev_name, void *data)
+static int bd_init_fs_context(struct fs_context *fc)
 {
-    struct dentry *dent;
-    dent = mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC);
-    if (!IS_ERR(dent))
-        dent->d_sb->s_iflags |= SB_I_CGROUPWB;
-    return dent;
+    struct pseudo_fs_context *ctx = init_pseudo(fc, BDEVFS_MAGIC);
+    if (!ctx)
+        return -ENOMEM;
+    fc->s_iflags |= SB_I_CGROUPWB;
+    ctx->ops = &bdev_sops;
+    return 0;
 }

 static struct file_system_type bd_type = {
     .name = "bdev",
-    .mount = bd_mount,
+    .init_fs_context = bd_init_fs_context,
     .kill_sb = kill_anon_super,
 };

@@ -860,24 +905,7 @@
     return 0;
 }

-static LIST_HEAD(all_bdevs);
-
-/*
- * If there is a bdev inode for this device, unhash it so that it gets evicted
- * as soon as last inode reference is dropped.
- */
-void bdev_unhash_inode(dev_t dev)
-{
-    struct inode *inode;
-
-    inode = ilookup5(blockdev_superblock, hash(dev), bdev_test, &dev);
-    if (inode) {
-        remove_inode_hash(inode);
-        iput(inode);
-    }
-}
-
-struct block_device *bdget(dev_t dev)
+static struct block_device *bdget(dev_t dev)
 {
     struct block_device *bdev;
     struct inode *inode;
@@ -891,26 +919,20 @@
     bdev = &BDEV_I(inode)->bdev;

     if (inode->i_state & I_NEW) {
+        spin_lock_init(&bdev->bd_size_lock);
         bdev->bd_contains = NULL;
         bdev->bd_super = NULL;
         bdev->bd_inode = inode;
-        bdev->bd_block_size = i_blocksize(inode);
         bdev->bd_part_count = 0;
-        bdev->bd_invalidated = 0;
         inode->i_mode = S_IFBLK;
         inode->i_rdev = dev;
         inode->i_bdev = bdev;
         inode->i_data.a_ops = &def_blk_aops;
         mapping_set_gfp_mask(&inode->i_data, GFP_USER);
-        spin_lock(&bdev_lock);
-        list_add(&bdev->bd_list, &all_bdevs);
-        spin_unlock(&bdev_lock);
         unlock_new_inode(inode);
     }
     return bdev;
 }
-
-EXPORT_SYMBOL(bdget);

 /**
  * bdgrab -- Grab a reference to an already referenced block device
@@ -923,15 +945,21 @@
 }
 EXPORT_SYMBOL(bdgrab);

+struct block_device *bdget_part(struct hd_struct *part)
+{
+    return bdget(part_devt(part));
+}
+
 long nr_blockdev_pages(void)
 {
-    struct block_device *bdev;
+    struct inode *inode;
     long ret = 0;
-    spin_lock(&bdev_lock);
-    list_for_each_entry(bdev, &all_bdevs, bd_list) {
-        ret += bdev->bd_inode->i_mapping->nrpages;
-    }
-    spin_unlock(&bdev_lock);
+
+    spin_lock(&blockdev_superblock->s_inode_list_lock);
+    list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list)
+        ret += inode->i_mapping->nrpages;
+    spin_unlock(&blockdev_superblock->s_inode_list_lock);
+
     return ret;
 }

@@ -1033,30 +1061,28 @@
 }

 /**
- * bd_prepare_to_claim - prepare to claim a block device
+ * bd_prepare_to_claim - claim a block device
  * @bdev: block device of interest
  * @whole: the whole device containing @bdev, may equal @bdev
  * @holder: holder trying to claim @bdev
  *
- * Prepare to claim @bdev.  This function fails if @bdev is already
- * claimed by another holder and waits if another claiming is in
- * progress.  This function doesn't actually claim.  On successful
- * return, the caller has ownership of bd_claiming and bd_holder[s].
- *
- * CONTEXT:
- * spin_lock(&bdev_lock).  Might release bdev_lock, sleep and regrab
- * it multiple times.
+ * Claim @bdev.  This function fails if @bdev is already claimed by another
+ * holder and waits if another claiming is in progress. On successful return,
+ * the caller has ownership of bd_claiming and bd_holder[s].
  *
  * RETURNS:
  * 0 if @bdev can be claimed, -EBUSY otherwise.
  */
-static int bd_prepare_to_claim(struct block_device *bdev,
-                   struct block_device *whole, void *holder)
+int bd_prepare_to_claim(struct block_device *bdev, struct block_device *whole,
+        void *holder)
 {
 retry:
+    spin_lock(&bdev_lock);
     /* if someone else claimed, fail */
-    if (!bd_may_claim(bdev, whole, holder))
+    if (!bd_may_claim(bdev, whole, holder)) {
+        spin_unlock(&bdev_lock);
         return -EBUSY;
+    }

     /* if claiming is already in progress, wait for it to finish */
     if (whole->bd_claiming) {
@@ -1067,13 +1093,15 @@
         spin_unlock(&bdev_lock);
         schedule();
         finish_wait(wq, &wait);
-        spin_lock(&bdev_lock);
         goto retry;
     }

     /* yay, all mine */
+    whole->bd_claiming = holder;
+    spin_unlock(&bdev_lock);
     return 0;
 }
+EXPORT_SYMBOL_GPL(bd_prepare_to_claim); /* only for the loop driver */

 static struct gendisk *bdev_get_gendisk(struct block_device *bdev, int *partno)
 {
@@ -1096,77 +1124,59 @@
     return disk;
 }

-/**
- * bd_start_claiming - start claiming a block device
- * @bdev: block device of interest
- * @holder: holder trying to claim @bdev
- *
- * @bdev is about to be opened exclusively.  Check @bdev can be opened
- * exclusively and mark that an exclusive open is in progress.  Each
- * successful call to this function must be matched with a call to
- * either bd_finish_claiming() or bd_abort_claiming() (which do not
- * fail).
- *
- * This function is used to gain exclusive access to the block device
- * without actually causing other exclusive open attempts to fail. It
- * should be used when the open sequence itself requires exclusive
- * access but may subsequently fail.
- *
- * CONTEXT:
- * Might sleep.
- *
- * RETURNS:
- * Pointer to the block device containing @bdev on success, ERR_PTR()
- * value on failure.
- */
-static struct block_device *bd_start_claiming(struct block_device *bdev,
-                          void *holder)
+static void bd_clear_claiming(struct block_device *whole, void *holder)
 {
-    struct gendisk *disk;
-    struct block_device *whole;
-    int partno, err;
-
-    might_sleep();
-
-    /*
-     * @bdev might not have been initialized properly yet, look up
-     * and grab the outer block device the hard way.
-     */
-    disk = bdev_get_gendisk(bdev, &partno);
-    if (!disk)
-        return ERR_PTR(-ENXIO);
-
-    /*
-     * Normally, @bdev should equal what's returned from bdget_disk()
-     * if partno is 0; however, some drivers (floppy) use multiple
-     * bdev's for the same physical device and @bdev may be one of the
-     * aliases.  Keep @bdev if partno is 0.  This means claimer
-     * tracking is broken for those devices but it has always been that
-     * way.
-     */
-    if (partno)
-        whole = bdget_disk(disk, 0);
-    else
-        whole = bdgrab(bdev);
-
-    put_disk_and_module(disk);
-    if (!whole)
-        return ERR_PTR(-ENOMEM);
-
-    /* prepare to claim, if successful, mark claiming in progress */
-    spin_lock(&bdev_lock);
-
-    err = bd_prepare_to_claim(bdev, whole, holder);
-    if (err == 0) {
-        whole->bd_claiming = holder;
-        spin_unlock(&bdev_lock);
-        return whole;
-    } else {
-        spin_unlock(&bdev_lock);
-        bdput(whole);
-        return ERR_PTR(err);
-    }
+    lockdep_assert_held(&bdev_lock);
+    /* tell others that we're done */
+    BUG_ON(whole->bd_claiming != holder);
+    whole->bd_claiming = NULL;
+    wake_up_bit(&whole->bd_claiming, 0);
 }
+
+/**
+ * bd_finish_claiming - finish claiming of a block device
+ * @bdev: block device of interest
+ * @whole: whole block device
+ * @holder: holder that has claimed @bdev
+ *
+ * Finish exclusive open of a block device. Mark the device as exclusively
+ * open by the holder and wake up all waiters for exclusive open to finish.
+ */
+static void bd_finish_claiming(struct block_device *bdev,
+        struct block_device *whole, void *holder)
+{
+    spin_lock(&bdev_lock);
+    BUG_ON(!bd_may_claim(bdev, whole, holder));
+    /*
+     * Note that for a whole device bd_holders will be incremented twice,
+     * and bd_holder will be set to bd_may_claim before being set to holder
+     */
+    whole->bd_holders++;
+    whole->bd_holder = bd_may_claim;
+    bdev->bd_holders++;
+    bdev->bd_holder = holder;
+    bd_clear_claiming(whole, holder);
+    spin_unlock(&bdev_lock);
+}
+
+/**
+ * bd_abort_claiming - abort claiming of a block device
+ * @bdev: block device of interest
+ * @whole: whole block device
+ * @holder: holder that has claimed @bdev
+ *
+ * Abort claiming of a block device when the exclusive open failed. This can
+ * also be used when exclusive open is not actually desired and we just needed
+ * to block other exclusive openers for a while.
+ */
+void bd_abort_claiming(struct block_device *bdev, struct block_device *whole,
+        void *holder)
+{
+    spin_lock(&bdev_lock);
+    bd_clear_claiming(whole, holder);
+    spin_unlock(&bdev_lock);
+}
+EXPORT_SYMBOL(bd_abort_claiming);

 #ifdef CONFIG_SYSFS
 struct bd_holder_disk {
@@ -1312,26 +1322,6 @@
 #endif

 /**
- * flush_disk - invalidates all buffer-cache entries on a disk
- *
- * @bdev: struct block device to be flushed
- * @kill_dirty: flag to guide handling of dirty inodes
- *
- * Invalidates all buffer-cache entries on a disk. It should be called
- * when a disk has been changed -- either by a media change or online
- * resize.
- */
-static void flush_disk(struct block_device *bdev, bool kill_dirty)
-{
-    if (__invalidate_device(bdev, kill_dirty)) {
-        printk(KERN_WARNING "VFS: busy inodes on changed media or "
-               "resized disk %s\n",
-               bdev->bd_disk ? bdev->bd_disk->disk_name : "");
-    }
-    bdev->bd_invalidated = 1;
-}
-
-/**
  * check_disk_size_change - checks for disk size change and adjusts bdev size.
  * @disk: struct gendisk to check
  * @bdev: struct bdev to adjust.
@@ -1341,11 +1331,12 @@
  * and adjusts it if it differs. When shrinking the bdev size, its all caches
  * are freed.
  */
-void check_disk_size_change(struct gendisk *disk, struct block_device *bdev,
-        bool verbose)
+static void check_disk_size_change(struct gendisk *disk,
+        struct block_device *bdev, bool verbose)
 {
     loff_t disk_size, bdev_size;

+    spin_lock(&bdev->bd_size_lock);
     disk_size = (loff_t)get_capacity(disk) << 9;
     bdev_size = i_size_read(bdev->bd_inode);
     if (disk_size != bdev_size) {
@@ -1355,89 +1346,109 @@
                    disk->disk_name, bdev_size, disk_size);
         }
         i_size_write(bdev->bd_inode, disk_size);
-        if (bdev_size > disk_size)
-            flush_disk(bdev, false);
+    }
+    spin_unlock(&bdev->bd_size_lock);
+
+    if (bdev_size > disk_size) {
+        if (__invalidate_device(bdev, false))
+            pr_warn("VFS: busy inodes on resized disk %s\n",
+                disk->disk_name);
     }
 }

 /**
- * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back
- * @disk: struct gendisk to be revalidated
+ * revalidate_disk_size - checks for disk size change and adjusts bdev size.
+ * @disk: struct gendisk to check
+ * @verbose: if %true log a message about a size change if there is any
  *
- * This routine is a wrapper for lower-level driver's revalidate_disk
- * call-backs.  It is used to do common pre and post operations needed
- * for all revalidate_disk operations.
+ * This routine checks to see if the bdev size does not match the disk size
+ * and adjusts it if it differs. When shrinking the bdev size, its all caches
+ * are freed.
  */
-int revalidate_disk(struct gendisk *disk)
+void revalidate_disk_size(struct gendisk *disk, bool verbose)
 {
     struct block_device *bdev;
-    int ret = 0;

-    if (disk->fops->revalidate_disk)
-        ret = disk->fops->revalidate_disk(disk);
+    /*
+     * Hidden disks don't have associated bdev so there's no point in
+     * revalidating them.
+     */
+    if (disk->flags & GENHD_FL_HIDDEN)
+        return;
+
     bdev = bdget_disk(disk, 0);
-    if (!bdev)
-        return ret;
-
-    mutex_lock(&bdev->bd_mutex);
-    check_disk_size_change(disk, bdev, ret == 0);
-    bdev->bd_invalidated = 0;
-    mutex_unlock(&bdev->bd_mutex);
-    bdput(bdev);
-    return ret;
+    if (bdev) {
+        check_disk_size_change(disk, bdev, verbose);
+        bdput(bdev);
+    }
 }
-EXPORT_SYMBOL(revalidate_disk);
+EXPORT_SYMBOL(revalidate_disk_size);

-/*
- * This routine checks whether a removable media has been changed,
- * and invalidates all buffer-cache-entries in that case. This
- * is a relatively slow routine, so we have to try to minimize using
- * it. Thus it is called only upon a 'mount' or 'open'. This
- * is the best way of combining speed and utility, I think.
- * People changing diskettes in the middle of an operation deserve
- * to lose :-)
- */
-int check_disk_change(struct block_device *bdev)
+void bd_set_nr_sectors(struct block_device *bdev, sector_t sectors)
 {
-    struct gendisk *disk = bdev->bd_disk;
-    const struct block_device_operations *bdops = disk->fops;
-    unsigned int events;
-
-    events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE |
-                   DISK_EVENT_EJECT_REQUEST);
-    if (!(events & DISK_EVENT_MEDIA_CHANGE))
-        return 0;
-
-    flush_disk(bdev, true);
-    if (bdops->revalidate_disk)
-        bdops->revalidate_disk(bdev->bd_disk);
-    return 1;
+    spin_lock(&bdev->bd_size_lock);
+    i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT);
+    spin_unlock(&bdev->bd_size_lock);
 }
-
-EXPORT_SYMBOL(check_disk_change);
-
-void bd_set_size(struct block_device *bdev, loff_t size)
-{
-    inode_lock(bdev->bd_inode);
-    i_size_write(bdev->bd_inode, size);
-    inode_unlock(bdev->bd_inode);
-}
-EXPORT_SYMBOL(bd_set_size);
+EXPORT_SYMBOL(bd_set_nr_sectors);

 static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);

-static void bdev_disk_changed(struct block_device *bdev, bool invalidate)
+int bdev_disk_changed(struct block_device *bdev, bool invalidate)
 {
-    if (disk_part_scan_enabled(bdev->bd_disk)) {
-        if (invalidate)
-            invalidate_partitions(bdev->bd_disk, bdev);
-        else
-            rescan_partitions(bdev->bd_disk, bdev);
+    struct gendisk *disk = bdev->bd_disk;
+    int ret;
+
+    lockdep_assert_held(&bdev->bd_mutex);
+
+    if (!(disk->flags & GENHD_FL_UP))
+        return -ENXIO;
+
+rescan:
+    ret = blk_drop_partitions(bdev);
+    if (ret)
+        return ret;
+
+    clear_bit(GD_NEED_PART_SCAN, &disk->state);
+
+    /*
+     * Historically we only set the capacity to zero for devices that
+     * support partitions (independent of actually having partitions
+     * created). Doing that is rather inconsistent, but changing it broke
+     * legacy udisks polling for legacy ide-cdrom devices. Use the crude
+     * check below to get the sane behavior for most devices while not
+     * breaking userspace for this particular setup.
+     */
+    if (invalidate) {
+        if (disk_part_scan_enabled(disk) ||
+            !(disk->flags & GENHD_FL_REMOVABLE))
+            set_capacity(disk, 0);
     } else {
-        check_disk_size_change(bdev->bd_disk, bdev, !invalidate);
-        bdev->bd_invalidated = 0;
+        if (disk->fops->revalidate_disk)
+            disk->fops->revalidate_disk(disk);
     }
+
+    check_disk_size_change(disk, bdev, !invalidate);
+
+    if (get_capacity(disk)) {
+        ret = blk_add_partitions(disk, bdev);
+        if (ret == -EAGAIN)
+            goto rescan;
+    } else if (invalidate) {
+        /*
+         * Tell userspace that the media / partition table may have
+         * changed.
+         */
+        kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
+    }
+
+    return ret;
 }
+/*
+ * Only exported for loop and dasd for historic reasons. Don't use in new
+ * code!
+ */
+EXPORT_SYMBOL_GPL(bdev_disk_changed);

 /*
  * bd_mutex locking:
@@ -1446,40 +1457,46 @@
  *  mutex_lock_nested(whole->bd_mutex, 1)
  */

-static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
+static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder,
+        int for_part)
 {
+    struct block_device *whole = NULL, *claiming = NULL;
     struct gendisk *disk;
     int ret;
     int partno;
-    int perm = 0;
-    bool first_open = false;
-
-    if (mode & FMODE_READ)
-        perm |= MAY_READ;
-    if (mode & FMODE_WRITE)
-        perm |= MAY_WRITE;
-    /*
-     * hooks: /n/, see "layering violations".
-     */
-    if (!for_part) {
-        ret = devcgroup_inode_permission(bdev->bd_inode, perm);
-        if (ret != 0)
-            return ret;
-    }
+    bool first_open = false, unblock_events = true, need_restart;

  restart:
-
+    need_restart = false;
     ret = -ENXIO;
     disk = bdev_get_gendisk(bdev, &partno);
     if (!disk)
         goto out;
+
+    if (partno) {
+        whole = bdget_disk(disk, 0);
+        if (!whole) {
+            ret = -ENOMEM;
+            goto out_put_disk;
+        }
+    }
+
+    if (!for_part && (mode & FMODE_EXCL)) {
+        WARN_ON_ONCE(!holder);
+        if (whole)
+            claiming = whole;
+        else
+            claiming = bdev;
+        ret = bd_prepare_to_claim(bdev, claiming, holder);
+        if (ret)
+            goto out_put_whole;
+    }

     disk_block_events(disk);
     mutex_lock_nested(&bdev->bd_mutex, for_part);
     if (!bdev->bd_openers) {
         first_open = true;
         bdev->bd_disk = disk;
-        bdev->bd_queue = disk->queue;
         bdev->bd_contains = bdev;
         bdev->bd_partno = partno;
@@ -1492,24 +1509,16 @@
         ret = 0;
         if (disk->fops->open) {
             ret = disk->fops->open(bdev, mode);
-            if (ret == -ERESTARTSYS) {
-                /* Lost a race with 'disk' being
-                 * deleted, try again.
-                 * See md.c
-                 */
-                disk_put_part(bdev->bd_part);
-                bdev->bd_part = NULL;
-                bdev->bd_disk = NULL;
-                bdev->bd_queue = NULL;
-                mutex_unlock(&bdev->bd_mutex);
-                disk_unblock_events(disk);
-                put_disk_and_module(disk);
-                goto restart;
-            }
+            /*
+             * If we lost a race with 'disk' being deleted,
+             * try again. See md.c
+             */
+            if (ret == -ERESTARTSYS)
+                need_restart = true;
         }

         if (!ret) {
-            bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
+            bd_set_nr_sectors(bdev, get_capacity(disk));
             set_init_blocksize(bdev);
         }
@@ -1519,32 +1528,25 @@
          * The latter is necessary to prevent ghost
          * partitions on a removed medium.
          */
-        if (bdev->bd_invalidated &&
+        if (test_bit(GD_NEED_PART_SCAN, &disk->state) &&
             (!ret || ret == -ENOMEDIUM))
             bdev_disk_changed(bdev, ret == -ENOMEDIUM);

         if (ret)
             goto out_clear;
     } else {
-        struct block_device *whole;
-        whole = bdget_disk(disk, 0);
-        ret = -ENOMEM;
-        if (!whole)
-            goto out_clear;
         BUG_ON(for_part);
-        ret = __blkdev_get(whole, mode, 1);
-        if (ret) {
-            bdput(whole);
+        ret = __blkdev_get(whole, mode, NULL, 1);
+        if (ret)
             goto out_clear;
-        }
-        bdev->bd_contains = whole;
+        bdev->bd_contains = bdgrab(whole);
         bdev->bd_part = disk_get_part(disk, partno);
         if (!(disk->flags & GENHD_FL_UP) ||
             !bdev->bd_part || !bdev->bd_part->nr_sects) {
             ret = -ENXIO;
             goto out_clear;
         }
-        bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
+        bd_set_nr_sectors(bdev, bdev->bd_part->nr_sects);
         set_init_blocksize(bdev);
     }

@@ -1556,7 +1558,7 @@
         if (bdev->bd_disk->fops->open)
             ret = bdev->bd_disk->fops->open(bdev, mode);
         /* the same as first opener case, read comment there */
-        if (bdev->bd_invalidated &&
+        if (test_bit(GD_NEED_PART_SCAN, &disk->state) &&
             (!ret || ret == -ENOMEDIUM))
             bdev_disk_changed(bdev, ret == -ENOMEDIUM);
         if (ret)
@@ -1566,27 +1568,52 @@
     bdev->bd_openers++;
     if (for_part)
         bdev->bd_part_count++;
+    if (claiming)
+        bd_finish_claiming(bdev, claiming, holder);
+
+    /*
+     * Block event polling for write claims if requested. Any write holder
+     * makes the write_holder state stick until all are released. This is
+     * good enough and tracking individual writeable reference is too
+     * fragile given the way @mode is used in blkdev_get/put().
+     */
+    if (claiming && (mode & FMODE_WRITE) && !bdev->bd_write_holder &&
+        (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) {
+        bdev->bd_write_holder = true;
+        unblock_events = false;
+    }
     mutex_unlock(&bdev->bd_mutex);
-    disk_unblock_events(disk);
+
+    if (unblock_events)
+        disk_unblock_events(disk);
+
     /* only one opener holds refs to the module and disk */
     if (!first_open)
         put_disk_and_module(disk);
+    if (whole)
+        bdput(whole);
     return 0;

  out_clear:
     disk_put_part(bdev->bd_part);
     bdev->bd_disk = NULL;
     bdev->bd_part = NULL;
-    bdev->bd_queue = NULL;
     if (bdev != bdev->bd_contains)
         __blkdev_put(bdev->bd_contains, mode, 1);
     bdev->bd_contains = NULL;
  out_unlock_bdev:
+    if (claiming)
+        bd_abort_claiming(bdev, claiming, holder);
     mutex_unlock(&bdev->bd_mutex);
     disk_unblock_events(disk);
+ out_put_whole:
+    if (whole)
+        bdput(whole);
+ out_put_disk:
     put_disk_and_module(disk);
+    if (need_restart)
+        goto restart;
  out:
-
     return ret;
 }

@@ -1609,74 +1636,27 @@
  * RETURNS:
  * 0 on success, -errno on failure.
  */
-int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
+static int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
 {
-    struct block_device *whole = NULL;
-    int res;
+    int ret, perm = 0;

-    WARN_ON_ONCE((mode & FMODE_EXCL) && !holder);
+    if (mode & FMODE_READ)
+        perm |= MAY_READ;
+    if (mode & FMODE_WRITE)
+        perm |= MAY_WRITE;
+    ret = devcgroup_inode_permission(bdev->bd_inode, perm);
+    if (ret)
+        goto bdput;

-    if ((mode & FMODE_EXCL) && holder) {
-        whole = bd_start_claiming(bdev, holder);
-        if (IS_ERR(whole)) {
-            bdput(bdev);
-            return PTR_ERR(whole);
-        }
-    }
+    ret = __blkdev_get(bdev, mode, holder, 0);
+    if (ret)
+        goto bdput;
+    return 0;

-    res = __blkdev_get(bdev, mode, 0);
-
-    if (whole) {
-        struct gendisk *disk = whole->bd_disk;
-
-        /* finish claiming */
-        mutex_lock(&bdev->bd_mutex);
-        spin_lock(&bdev_lock);
-
-        if (!res) {
-            BUG_ON(!bd_may_claim(bdev, whole, holder));
-            /*
-             * Note that for a whole device bd_holders
-             * will be incremented twice, and bd_holder
-             * will be set to bd_may_claim before being
-             * set to holder
-             */
-            whole->bd_holders++;
-            whole->bd_holder = bd_may_claim;
-            bdev->bd_holders++;
-            bdev->bd_holder = holder;
-        }
-
-        /* tell others that we're done */
-        BUG_ON(whole->bd_claiming != holder);
-        whole->bd_claiming = NULL;
-        wake_up_bit(&whole->bd_claiming, 0);
-
-        spin_unlock(&bdev_lock);
-
-        /*
-         * Block event polling for write claims if requested.  Any
-         * write holder makes the write_holder state stick until
-         * all are released.  This is good enough and tracking
-         * individual writeable reference is too fragile given the
-         * way @mode is used in blkdev_get/put().
-         */
-        if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder &&
-            (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) {
-            bdev->bd_write_holder = true;
-            disk_block_events(disk);
-        }
-
-        mutex_unlock(&bdev->bd_mutex);
-        bdput(whole);
-    }
-
-    if (res)
-        bdput(bdev);
-
-    return res;
+bdput:
+    bdput(bdev);
+    return ret;
 }
-EXPORT_SYMBOL(blkdev_get);

 /**
  * blkdev_get_by_path - open a block device by name
@@ -1769,7 +1749,7 @@
      */
     filp->f_flags |= O_LARGEFILE;

-    filp->f_mode |= FMODE_NOWAIT;
+    filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;

     if (filp->f_flags & O_NDELAY)
         filp->f_mode |= FMODE_NDELAY;
@@ -1925,7 +1905,7 @@
     if (bdev_read_only(I_BDEV(bd_inode)))
         return -EPERM;

-    if (IS_SWAPFILE(bd_inode))
+    if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode->i_rdev))
         return -ETXTBSY;

     if (!iov_iter_count(from))
@@ -1999,13 +1979,14 @@
 static const struct address_space_operations def_blk_aops = {
     .readpage = blkdev_readpage,
-    .readpages = blkdev_readpages,
+    .readahead = blkdev_readahead,
     .writepage = blkdev_writepage,
     .write_begin = blkdev_write_begin,
     .write_end = blkdev_write_end,
     .writepages = blkdev_writepages,
     .releasepage = blkdev_releasepage,
     .direct_IO = blkdev_direct_IO,
+    .migratepage = buffer_migrate_page_norefs,
     .is_dirty_writeback = buffer_check_dirty_writeback,
 };

@@ -2017,7 +1998,6 @@
                  loff_t len)
 {
     struct block_device *bdev = I_BDEV(bdev_file_inode(file));
-    struct address_space *mapping;
     loff_t end = start + len - 1;
     loff_t isize;
     int error;
@@ -2045,8 +2025,9 @@
         return -EINVAL;

     /* Invalidate the page cache, including dirty pages. */
-    mapping = bdev->bd_inode->i_mapping;
-    truncate_inode_pages_range(mapping, start, end);
+    error = truncate_bdev_range(bdev, file->f_mode, start, end);
+    if (error)
+        return error;

     switch (mode) {
     case FALLOC_FL_ZERO_RANGE:
@@ -2073,7 +2054,7 @@
      * the caller will be given -EBUSY.  The third argument is
      * inclusive, so the rounding here is safe.
      */
-    return invalidate_inode_pages2_range(mapping,
+    return invalidate_inode_pages2_range(bdev->bd_inode->i_mapping,
                          start >> PAGE_SHIFT,
                          end >> PAGE_SHIFT);
 }
@@ -2084,6 +2065,7 @@
     .llseek = block_llseek,
     .read_iter = blkdev_read_iter,
     .write_iter = blkdev_write_iter,
+    .iopoll = blkdev_iopoll,
     .mmap = generic_file_mmap,
     .fsync = blkdev_fsync,
     .unlocked_ioctl = block_ioctl,
@@ -2094,18 +2076,6 @@
     .splice_write = iter_file_splice_write,
     .fallocate = blkdev_fallocate,
 };
-
-int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)
-{
-    int res;
-    mm_segment_t old_fs = get_fs();
-    set_fs(KERNEL_DS);
-    res = blkdev_ioctl(bdev, 0, cmd, arg);
-    set_fs(old_fs);
-    return res;
-}
-
-EXPORT_SYMBOL(ioctl_by_bdev);

 /**
  * lookup_bdev  - lookup a struct block_device by name
--
Gitblit v1.6.2