From ea08eeccae9297f7aabd2ef7f0c2517ac4549acc Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Tue, 20 Feb 2024 01:18:26 +0000
Subject: [PATCH] write in 30M
---
kernel/fs/block_dev.c | 816 ++++++++++++++++++++++++++++------------------------------
1 files changed, 393 insertions(+), 423 deletions(-)
diff --git a/kernel/fs/block_dev.c b/kernel/fs/block_dev.c
index 182ed1f..f250a98 100644
--- a/kernel/fs/block_dev.c
+++ b/kernel/fs/block_dev.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/fs/block_dev.c
*
@@ -18,22 +19,21 @@
#include <linux/module.h>
#include <linux/blkpg.h>
#include <linux/magic.h>
-#include <linux/dax.h>
#include <linux/buffer_head.h>
#include <linux/swap.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>
#include <linux/mpage.h>
#include <linux/mount.h>
+#include <linux/pseudo_fs.h>
#include <linux/uio.h>
#include <linux/namei.h>
#include <linux/log2.h>
#include <linux/cleancache.h>
-#include <linux/dax.h>
-#include <linux/badblocks.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/falloc.h>
#include <linux/uaccess.h>
+#include <linux/suspend.h>
#include "internal.h"
struct bdev_inode {
@@ -75,7 +75,7 @@
}
/* Kill _all_ buffers and pagecache , dirty or not.. */
-void kill_bdev(struct block_device *bdev)
+static void kill_bdev(struct block_device *bdev)
{
struct address_space *mapping = bdev->bd_inode->i_mapping;
@@ -84,8 +84,7 @@
invalidate_bh_lrus();
truncate_inode_pages(mapping, 0);
-}
-EXPORT_SYMBOL(kill_bdev);
+}
/* Invalidate clean unused buffers and pagecache. */
void invalidate_bdev(struct block_device *bdev)
@@ -104,9 +103,47 @@
}
EXPORT_SYMBOL(invalidate_bdev);
+/*
+ * Drop all buffers & page cache for given bdev range. This function bails
+ * with error if bdev has other exclusive owner (such as filesystem).
+ */
+int truncate_bdev_range(struct block_device *bdev, fmode_t mode,
+ loff_t lstart, loff_t lend)
+{
+ struct block_device *claimed_bdev = NULL;
+ int err;
+
+ /*
+ * If we don't hold exclusive handle for the device, upgrade to it
+ * while we discard the buffer cache to avoid discarding buffers
+ * under live filesystem.
+ */
+ if (!(mode & FMODE_EXCL)) {
+ claimed_bdev = bdev->bd_contains;
+ err = bd_prepare_to_claim(bdev, claimed_bdev,
+ truncate_bdev_range);
+ if (err)
+ goto invalidate;
+ }
+ truncate_inode_pages_range(bdev->bd_inode->i_mapping, lstart, lend);
+ if (claimed_bdev)
+ bd_abort_claiming(bdev, claimed_bdev, truncate_bdev_range);
+ return 0;
+
+invalidate:
+ /*
+ * Someone else has handle exclusively open. Try invalidating instead.
+ * The 'end' argument is inclusive so the rounding is safe.
+ */
+ return invalidate_inode_pages2_range(bdev->bd_inode->i_mapping,
+ lstart >> PAGE_SHIFT,
+ lend >> PAGE_SHIFT);
+}
+EXPORT_SYMBOL(truncate_bdev_range);
+
static void set_init_blocksize(struct block_device *bdev)
{
- unsigned bsize = bdev_logical_block_size(bdev);
+ unsigned int bsize = bdev_logical_block_size(bdev);
loff_t size = i_size_read(bdev->bd_inode);
while (bsize < PAGE_SIZE) {
@@ -114,7 +151,6 @@
break;
bsize <<= 1;
}
- bdev->bd_block_size = bsize;
bdev->bd_inode->i_blkbits = blksize_bits(bsize);
}
@@ -129,9 +165,8 @@
return -EINVAL;
/* Don't change the size if it is same as current */
- if (bdev->bd_block_size != size) {
+ if (bdev->bd_inode->i_blkbits != blksize_bits(size)) {
sync_blockdev(bdev);
- bdev->bd_block_size = size;
bdev->bd_inode->i_blkbits = blksize_bits(size);
kill_bdev(bdev);
}
@@ -151,7 +186,7 @@
return sb->s_blocksize;
}
-EXPORT_SYMBOL(sb_set_blocksize);
+EXPORT_SYMBOL_NS(sb_set_blocksize, ANDROID_GKI_VFS_EXPORT_ONLY);
int sb_min_blocksize(struct super_block *sb, int size)
{
@@ -161,7 +196,7 @@
return sb_set_blocksize(sb, size);
}
-EXPORT_SYMBOL(sb_min_blocksize);
+EXPORT_SYMBOL_NS(sb_min_blocksize, ANDROID_GKI_VFS_EXPORT_ONLY);
static int
blkdev_get_block(struct inode *inode, sector_t iblock,
@@ -195,7 +230,7 @@
struct task_struct *waiter = bio->bi_private;
WRITE_ONCE(bio->bi_private, NULL);
- wake_up_process(waiter);
+ blk_wake_io_task(waiter);
}
static ssize_t
@@ -204,13 +239,12 @@
{
struct file *file = iocb->ki_filp;
struct block_device *bdev = I_BDEV(bdev_file_inode(file));
- struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs, *bvec;
+ struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs;
loff_t pos = iocb->ki_pos;
bool should_dirty = false;
struct bio bio;
ssize_t ret;
blk_qc_t qc;
- int i;
if ((pos | iov_iter_alignment(iter)) &
(bdev_logical_block_size(bdev) - 1))
@@ -246,6 +280,10 @@
bio.bi_opf = dio_bio_write_op(iocb);
task_io_account_write(ret);
}
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ bio.bi_opf |= REQ_NOWAIT;
+ if (iocb->ki_flags & IOCB_HIPRI)
+ bio_set_polled(&bio, iocb);
qc = submit_bio(&bio);
for (;;) {
@@ -253,17 +291,12 @@
if (!READ_ONCE(bio.bi_private))
break;
if (!(iocb->ki_flags & IOCB_HIPRI) ||
- !blk_poll(bdev_get_queue(bdev), qc))
- io_schedule();
+ !blk_poll(bdev_get_queue(bdev), qc, true))
+ blk_io_schedule();
}
__set_current_state(TASK_RUNNING);
- bio_for_each_segment_all(bvec, &bio, i) {
- if (should_dirty && !PageCompound(bvec->bv_page))
- set_page_dirty_lock(bvec->bv_page);
- put_page(bvec->bv_page);
- }
-
+ bio_release_pages(&bio, should_dirty);
if (unlikely(bio.bi_status))
ret = blk_status_to_errno(bio.bi_status);
@@ -291,6 +324,14 @@
static struct bio_set blkdev_dio_pool;
+static int blkdev_iopoll(struct kiocb *kiocb, bool wait)
+{
+ struct block_device *bdev = I_BDEV(kiocb->ki_filp->f_mapping->host);
+ struct request_queue *q = bdev_get_queue(bdev);
+
+ return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait);
+}
+
static void blkdev_bio_end_io(struct bio *bio)
{
struct blkdev_dio *dio = bio->bi_private;
@@ -312,23 +353,20 @@
}
dio->iocb->ki_complete(iocb, ret, 0);
- bio_put(&dio->bio);
+ if (dio->multi_bio)
+ bio_put(&dio->bio);
} else {
struct task_struct *waiter = dio->waiter;
WRITE_ONCE(dio->waiter, NULL);
- wake_up_process(waiter);
+ blk_wake_io_task(waiter);
}
}
if (should_dirty) {
bio_check_pages_dirty(bio);
} else {
- struct bio_vec *bvec;
- int i;
-
- bio_for_each_segment_all(bvec, bio, i)
- put_page(bvec->bv_page);
+ bio_release_pages(bio, false);
bio_put(bio);
}
}
@@ -342,6 +380,7 @@
struct blk_plug plug;
struct blkdev_dio *dio;
struct bio *bio;
+ bool is_poll = (iocb->ki_flags & IOCB_HIPRI) != 0;
bool is_read = (iov_iter_rw(iter) == READ), is_sync;
loff_t pos = iocb->ki_pos;
blk_qc_t qc = BLK_QC_T_NONE;
@@ -352,20 +391,27 @@
return -EINVAL;
bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, &blkdev_dio_pool);
- bio_get(bio); /* extra ref for the completion handler */
dio = container_of(bio, struct blkdev_dio, bio);
dio->is_sync = is_sync = is_sync_kiocb(iocb);
- if (dio->is_sync)
+ if (dio->is_sync) {
dio->waiter = current;
- else
+ bio_get(bio);
+ } else {
dio->iocb = iocb;
+ }
dio->size = 0;
dio->multi_bio = false;
- dio->should_dirty = is_read && (iter->type == ITER_IOVEC);
+ dio->should_dirty = is_read && iter_is_iovec(iter);
- blk_start_plug(&plug);
+ /*
+ * Don't plug for HIPRI/polled IO, as those should go straight
+ * to issue
+ */
+ if (!is_poll)
+ blk_start_plug(&plug);
+
for (;;) {
bio_set_dev(bio, bdev);
bio->bi_iter.bi_sector = pos >> 9;
@@ -389,17 +435,36 @@
bio->bi_opf = dio_bio_write_op(iocb);
task_io_account_write(bio->bi_iter.bi_size);
}
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ bio->bi_opf |= REQ_NOWAIT;
dio->size += bio->bi_iter.bi_size;
pos += bio->bi_iter.bi_size;
nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES);
if (!nr_pages) {
+ bool polled = false;
+
+ if (iocb->ki_flags & IOCB_HIPRI) {
+ bio_set_polled(bio, iocb);
+ polled = true;
+ }
+
qc = submit_bio(bio);
+
+ if (polled)
+ WRITE_ONCE(iocb->ki_cookie, qc);
break;
}
if (!dio->multi_bio) {
+ /*
+ * AIO needs an extra reference to ensure the dio
+ * structure which is embedded into the first bio
+ * stays around.
+ */
+ if (!is_sync)
+ bio_get(bio);
dio->multi_bio = true;
atomic_set(&dio->ref, 2);
} else {
@@ -409,7 +474,9 @@
submit_bio(bio);
bio = bio_alloc(GFP_KERNEL, nr_pages);
}
- blk_finish_plug(&plug);
+
+ if (!is_poll)
+ blk_finish_plug(&plug);
if (!is_sync)
return -EIOCBQUEUED;
@@ -420,8 +487,8 @@
break;
if (!(iocb->ki_flags & IOCB_HIPRI) ||
- !blk_poll(bdev_get_queue(bdev), qc))
- io_schedule();
+ !blk_poll(bdev_get_queue(bdev), qc, true))
+ blk_io_schedule();
}
__set_current_state(TASK_RUNNING);
@@ -502,55 +569,47 @@
* count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze
* actually.
*/
-struct super_block *freeze_bdev(struct block_device *bdev)
+int freeze_bdev(struct block_device *bdev)
{
struct super_block *sb;
int error = 0;
mutex_lock(&bdev->bd_fsfreeze_mutex);
- if (++bdev->bd_fsfreeze_count > 1) {
- /*
- * We don't even need to grab a reference - the first call
- * to freeze_bdev grab an active reference and only the last
- * thaw_bdev drops it.
- */
- sb = get_super(bdev);
- if (sb)
- drop_super(sb);
- mutex_unlock(&bdev->bd_fsfreeze_mutex);
- return sb;
- }
+ if (++bdev->bd_fsfreeze_count > 1)
+ goto done;
sb = get_active_super(bdev);
if (!sb)
- goto out;
+ goto sync;
if (sb->s_op->freeze_super)
error = sb->s_op->freeze_super(sb);
else
error = freeze_super(sb);
- if (error) {
- deactivate_super(sb);
- bdev->bd_fsfreeze_count--;
- mutex_unlock(&bdev->bd_fsfreeze_mutex);
- return ERR_PTR(error);
- }
deactivate_super(sb);
- out:
+
+ if (error) {
+ bdev->bd_fsfreeze_count--;
+ goto done;
+ }
+ bdev->bd_fsfreeze_sb = sb;
+
+sync:
sync_blockdev(bdev);
+done:
mutex_unlock(&bdev->bd_fsfreeze_mutex);
- return sb; /* thaw_bdev releases s->s_umount */
+ return error;
}
EXPORT_SYMBOL(freeze_bdev);
/**
* thaw_bdev -- unlock filesystem
* @bdev: blockdevice to unlock
- * @sb: associated superblock
*
* Unlocks the filesystem and marks it writeable again after freeze_bdev().
*/
-int thaw_bdev(struct block_device *bdev, struct super_block *sb)
+int thaw_bdev(struct block_device *bdev)
{
+ struct super_block *sb;
int error = -EINVAL;
mutex_lock(&bdev->bd_fsfreeze_mutex);
@@ -561,6 +620,7 @@
if (--bdev->bd_fsfreeze_count > 0)
goto out;
+ sb = bdev->bd_fsfreeze_sb;
if (!sb)
goto out;
@@ -586,10 +646,9 @@
return block_read_full_page(page, blkdev_get_block);
}
-static int blkdev_readpages(struct file *file, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+static void blkdev_readahead(struct readahead_control *rac)
{
- return mpage_readpages(mapping, pages, nr_pages, blkdev_get_block);
+ mpage_readahead(rac, blkdev_get_block);
}
static int blkdev_write_begin(struct file *file, struct address_space *mapping,
@@ -644,7 +703,7 @@
* i_mutex and doing so causes performance issues with concurrent
* O_SYNC writers to a block device.
*/
- error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
+ error = blkdev_issue_flush(bdev, GFP_KERNEL);
if (error == -EOPNOTSUPP)
error = 0;
@@ -677,15 +736,14 @@
if (!ops->rw_page || bdev_get_integrity(bdev))
return result;
- result = blk_queue_enter(bdev->bd_queue, 0);
+ result = blk_queue_enter(bdev->bd_disk->queue, 0);
if (result)
return result;
result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
REQ_OP_READ);
- blk_queue_exit(bdev->bd_queue);
+ blk_queue_exit(bdev->bd_disk->queue);
return result;
}
-EXPORT_SYMBOL_GPL(bdev_read_page);
/**
* bdev_write_page() - Start writing a page to a block device
@@ -714,7 +772,7 @@
if (!ops->rw_page || bdev_get_integrity(bdev))
return -EOPNOTSUPP;
- result = blk_queue_enter(bdev->bd_queue, 0);
+ result = blk_queue_enter(bdev->bd_disk->queue, 0);
if (result)
return result;
@@ -727,10 +785,9 @@
clean_page_buffers(page);
unlock_page(page);
}
- blk_queue_exit(bdev->bd_queue);
+ blk_queue_exit(bdev->bd_disk->queue);
return result;
}
-EXPORT_SYMBOL_GPL(bdev_write_page);
/*
* pseudo-fs
@@ -747,17 +804,9 @@
return &ei->vfs_inode;
}
-static void bdev_i_callback(struct rcu_head *head)
+static void bdev_free_inode(struct inode *inode)
{
- struct inode *inode = container_of(head, struct inode, i_rcu);
- struct bdev_inode *bdi = BDEV_I(inode);
-
- kmem_cache_free(bdev_cachep, bdi);
-}
-
-static void bdev_destroy_inode(struct inode *inode)
-{
- call_rcu(&inode->i_rcu, bdev_i_callback);
+ kmem_cache_free(bdev_cachep, BDEV_I(inode));
}
static void init_once(void *foo)
@@ -767,7 +816,6 @@
memset(bdev, 0, sizeof(*bdev));
mutex_init(&bdev->bd_mutex);
- INIT_LIST_HEAD(&bdev->bd_list);
#ifdef CONFIG_SYSFS
INIT_LIST_HEAD(&bdev->bd_holder_disks);
#endif
@@ -783,9 +831,6 @@
truncate_inode_pages_final(&inode->i_data);
invalidate_inode_buffers(inode); /* is it needed here? */
clear_inode(inode);
- spin_lock(&bdev_lock);
- list_del_init(&bdev->bd_list);
- spin_unlock(&bdev_lock);
/* Detach inode from wb early as bdi_put() may free bdi->wb */
inode_detach_wb(inode);
if (bdev->bd_bdi != &noop_backing_dev_info) {
@@ -797,24 +842,24 @@
static const struct super_operations bdev_sops = {
.statfs = simple_statfs,
.alloc_inode = bdev_alloc_inode,
- .destroy_inode = bdev_destroy_inode,
+ .free_inode = bdev_free_inode,
.drop_inode = generic_delete_inode,
.evict_inode = bdev_evict_inode,
};
-static struct dentry *bd_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int bd_init_fs_context(struct fs_context *fc)
{
- struct dentry *dent;
- dent = mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC);
- if (!IS_ERR(dent))
- dent->d_sb->s_iflags |= SB_I_CGROUPWB;
- return dent;
+ struct pseudo_fs_context *ctx = init_pseudo(fc, BDEVFS_MAGIC);
+ if (!ctx)
+ return -ENOMEM;
+ fc->s_iflags |= SB_I_CGROUPWB;
+ ctx->ops = &bdev_sops;
+ return 0;
}
static struct file_system_type bd_type = {
.name = "bdev",
- .mount = bd_mount,
+ .init_fs_context = bd_init_fs_context,
.kill_sb = kill_anon_super,
};
@@ -860,24 +905,7 @@
return 0;
}
-static LIST_HEAD(all_bdevs);
-
-/*
- * If there is a bdev inode for this device, unhash it so that it gets evicted
- * as soon as last inode reference is dropped.
- */
-void bdev_unhash_inode(dev_t dev)
-{
- struct inode *inode;
-
- inode = ilookup5(blockdev_superblock, hash(dev), bdev_test, &dev);
- if (inode) {
- remove_inode_hash(inode);
- iput(inode);
- }
-}
-
-struct block_device *bdget(dev_t dev)
+static struct block_device *bdget(dev_t dev)
{
struct block_device *bdev;
struct inode *inode;
@@ -891,26 +919,20 @@
bdev = &BDEV_I(inode)->bdev;
if (inode->i_state & I_NEW) {
+ spin_lock_init(&bdev->bd_size_lock);
bdev->bd_contains = NULL;
bdev->bd_super = NULL;
bdev->bd_inode = inode;
- bdev->bd_block_size = i_blocksize(inode);
bdev->bd_part_count = 0;
- bdev->bd_invalidated = 0;
inode->i_mode = S_IFBLK;
inode->i_rdev = dev;
inode->i_bdev = bdev;
inode->i_data.a_ops = &def_blk_aops;
mapping_set_gfp_mask(&inode->i_data, GFP_USER);
- spin_lock(&bdev_lock);
- list_add(&bdev->bd_list, &all_bdevs);
- spin_unlock(&bdev_lock);
unlock_new_inode(inode);
}
return bdev;
}
-
-EXPORT_SYMBOL(bdget);
/**
* bdgrab -- Grab a reference to an already referenced block device
@@ -923,15 +945,21 @@
}
EXPORT_SYMBOL(bdgrab);
+struct block_device *bdget_part(struct hd_struct *part)
+{
+ return bdget(part_devt(part));
+}
+
long nr_blockdev_pages(void)
{
- struct block_device *bdev;
+ struct inode *inode;
long ret = 0;
- spin_lock(&bdev_lock);
- list_for_each_entry(bdev, &all_bdevs, bd_list) {
- ret += bdev->bd_inode->i_mapping->nrpages;
- }
- spin_unlock(&bdev_lock);
+
+ spin_lock(&blockdev_superblock->s_inode_list_lock);
+ list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list)
+ ret += inode->i_mapping->nrpages;
+ spin_unlock(&blockdev_superblock->s_inode_list_lock);
+
return ret;
}
@@ -1033,30 +1061,28 @@
}
/**
- * bd_prepare_to_claim - prepare to claim a block device
+ * bd_prepare_to_claim - claim a block device
* @bdev: block device of interest
* @whole: the whole device containing @bdev, may equal @bdev
* @holder: holder trying to claim @bdev
*
- * Prepare to claim @bdev. This function fails if @bdev is already
- * claimed by another holder and waits if another claiming is in
- * progress. This function doesn't actually claim. On successful
- * return, the caller has ownership of bd_claiming and bd_holder[s].
- *
- * CONTEXT:
- * spin_lock(&bdev_lock). Might release bdev_lock, sleep and regrab
- * it multiple times.
+ * Claim @bdev. This function fails if @bdev is already claimed by another
+ * holder and waits if another claiming is in progress. return, the caller
+ * has ownership of bd_claiming and bd_holder[s].
*
* RETURNS:
* 0 if @bdev can be claimed, -EBUSY otherwise.
*/
-static int bd_prepare_to_claim(struct block_device *bdev,
- struct block_device *whole, void *holder)
+int bd_prepare_to_claim(struct block_device *bdev, struct block_device *whole,
+ void *holder)
{
retry:
+ spin_lock(&bdev_lock);
/* if someone else claimed, fail */
- if (!bd_may_claim(bdev, whole, holder))
+ if (!bd_may_claim(bdev, whole, holder)) {
+ spin_unlock(&bdev_lock);
return -EBUSY;
+ }
/* if claiming is already in progress, wait for it to finish */
if (whole->bd_claiming) {
@@ -1067,13 +1093,15 @@
spin_unlock(&bdev_lock);
schedule();
finish_wait(wq, &wait);
- spin_lock(&bdev_lock);
goto retry;
}
/* yay, all mine */
+ whole->bd_claiming = holder;
+ spin_unlock(&bdev_lock);
return 0;
}
+EXPORT_SYMBOL_GPL(bd_prepare_to_claim); /* only for the loop driver */
static struct gendisk *bdev_get_gendisk(struct block_device *bdev, int *partno)
{
@@ -1096,77 +1124,59 @@
return disk;
}
-/**
- * bd_start_claiming - start claiming a block device
- * @bdev: block device of interest
- * @holder: holder trying to claim @bdev
- *
- * @bdev is about to be opened exclusively. Check @bdev can be opened
- * exclusively and mark that an exclusive open is in progress. Each
- * successful call to this function must be matched with a call to
- * either bd_finish_claiming() or bd_abort_claiming() (which do not
- * fail).
- *
- * This function is used to gain exclusive access to the block device
- * without actually causing other exclusive open attempts to fail. It
- * should be used when the open sequence itself requires exclusive
- * access but may subsequently fail.
- *
- * CONTEXT:
- * Might sleep.
- *
- * RETURNS:
- * Pointer to the block device containing @bdev on success, ERR_PTR()
- * value on failure.
- */
-static struct block_device *bd_start_claiming(struct block_device *bdev,
- void *holder)
+static void bd_clear_claiming(struct block_device *whole, void *holder)
{
- struct gendisk *disk;
- struct block_device *whole;
- int partno, err;
-
- might_sleep();
-
- /*
- * @bdev might not have been initialized properly yet, look up
- * and grab the outer block device the hard way.
- */
- disk = bdev_get_gendisk(bdev, &partno);
- if (!disk)
- return ERR_PTR(-ENXIO);
-
- /*
- * Normally, @bdev should equal what's returned from bdget_disk()
- * if partno is 0; however, some drivers (floppy) use multiple
- * bdev's for the same physical device and @bdev may be one of the
- * aliases. Keep @bdev if partno is 0. This means claimer
- * tracking is broken for those devices but it has always been that
- * way.
- */
- if (partno)
- whole = bdget_disk(disk, 0);
- else
- whole = bdgrab(bdev);
-
- put_disk_and_module(disk);
- if (!whole)
- return ERR_PTR(-ENOMEM);
-
- /* prepare to claim, if successful, mark claiming in progress */
- spin_lock(&bdev_lock);
-
- err = bd_prepare_to_claim(bdev, whole, holder);
- if (err == 0) {
- whole->bd_claiming = holder;
- spin_unlock(&bdev_lock);
- return whole;
- } else {
- spin_unlock(&bdev_lock);
- bdput(whole);
- return ERR_PTR(err);
- }
+ lockdep_assert_held(&bdev_lock);
+ /* tell others that we're done */
+ BUG_ON(whole->bd_claiming != holder);
+ whole->bd_claiming = NULL;
+ wake_up_bit(&whole->bd_claiming, 0);
}
+
+/**
+ * bd_finish_claiming - finish claiming of a block device
+ * @bdev: block device of interest
+ * @whole: whole block device
+ * @holder: holder that has claimed @bdev
+ *
+ * Finish exclusive open of a block device. Mark the device as exlusively
+ * open by the holder and wake up all waiters for exclusive open to finish.
+ */
+static void bd_finish_claiming(struct block_device *bdev,
+ struct block_device *whole, void *holder)
+{
+ spin_lock(&bdev_lock);
+ BUG_ON(!bd_may_claim(bdev, whole, holder));
+ /*
+ * Note that for a whole device bd_holders will be incremented twice,
+ * and bd_holder will be set to bd_may_claim before being set to holder
+ */
+ whole->bd_holders++;
+ whole->bd_holder = bd_may_claim;
+ bdev->bd_holders++;
+ bdev->bd_holder = holder;
+ bd_clear_claiming(whole, holder);
+ spin_unlock(&bdev_lock);
+}
+
+/**
+ * bd_abort_claiming - abort claiming of a block device
+ * @bdev: block device of interest
+ * @whole: whole block device
+ * @holder: holder that has claimed @bdev
+ *
+ * Abort claiming of a block device when the exclusive open failed. This can be
+ * also used when exclusive open is not actually desired and we just needed
+ * to block other exclusive openers for a while.
+ */
+void bd_abort_claiming(struct block_device *bdev, struct block_device *whole,
+ void *holder)
+{
+ spin_lock(&bdev_lock);
+ bd_clear_claiming(whole, holder);
+ spin_unlock(&bdev_lock);
+}
+EXPORT_SYMBOL(bd_abort_claiming);
#ifdef CONFIG_SYSFS
struct bd_holder_disk {
@@ -1312,26 +1322,6 @@
#endif
/**
- * flush_disk - invalidates all buffer-cache entries on a disk
- *
- * @bdev: struct block device to be flushed
- * @kill_dirty: flag to guide handling of dirty inodes
- *
- * Invalidates all buffer-cache entries on a disk. It should be called
- * when a disk has been changed -- either by a media change or online
- * resize.
- */
-static void flush_disk(struct block_device *bdev, bool kill_dirty)
-{
- if (__invalidate_device(bdev, kill_dirty)) {
- printk(KERN_WARNING "VFS: busy inodes on changed media or "
- "resized disk %s\n",
- bdev->bd_disk ? bdev->bd_disk->disk_name : "");
- }
- bdev->bd_invalidated = 1;
-}
-
-/**
* check_disk_size_change - checks for disk size change and adjusts bdev size.
* @disk: struct gendisk to check
* @bdev: struct bdev to adjust.
@@ -1341,11 +1331,12 @@
* and adjusts it if it differs. When shrinking the bdev size, its all caches
* are freed.
*/
-void check_disk_size_change(struct gendisk *disk, struct block_device *bdev,
- bool verbose)
+static void check_disk_size_change(struct gendisk *disk,
+ struct block_device *bdev, bool verbose)
{
loff_t disk_size, bdev_size;
+ spin_lock(&bdev->bd_size_lock);
disk_size = (loff_t)get_capacity(disk) << 9;
bdev_size = i_size_read(bdev->bd_inode);
if (disk_size != bdev_size) {
@@ -1355,89 +1346,109 @@
disk->disk_name, bdev_size, disk_size);
}
i_size_write(bdev->bd_inode, disk_size);
- if (bdev_size > disk_size)
- flush_disk(bdev, false);
+ }
+ spin_unlock(&bdev->bd_size_lock);
+
+ if (bdev_size > disk_size) {
+ if (__invalidate_device(bdev, false))
+ pr_warn("VFS: busy inodes on resized disk %s\n",
+ disk->disk_name);
}
}
/**
- * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back
- * @disk: struct gendisk to be revalidated
+ * revalidate_disk_size - checks for disk size change and adjusts bdev size.
+ * @disk: struct gendisk to check
+ * @verbose: if %true log a message about a size change if there is any
*
- * This routine is a wrapper for lower-level driver's revalidate_disk
- * call-backs. It is used to do common pre and post operations needed
- * for all revalidate_disk operations.
+ * This routine checks to see if the bdev size does not match the disk size
+ * and adjusts it if it differs. When shrinking the bdev size, its all caches
+ * are freed.
*/
-int revalidate_disk(struct gendisk *disk)
+void revalidate_disk_size(struct gendisk *disk, bool verbose)
{
struct block_device *bdev;
- int ret = 0;
- if (disk->fops->revalidate_disk)
- ret = disk->fops->revalidate_disk(disk);
+ /*
+ * Hidden disks don't have associated bdev so there's no point in
+ * revalidating them.
+ */
+ if (disk->flags & GENHD_FL_HIDDEN)
+ return;
+
bdev = bdget_disk(disk, 0);
- if (!bdev)
- return ret;
-
- mutex_lock(&bdev->bd_mutex);
- check_disk_size_change(disk, bdev, ret == 0);
- bdev->bd_invalidated = 0;
- mutex_unlock(&bdev->bd_mutex);
- bdput(bdev);
- return ret;
+ if (bdev) {
+ check_disk_size_change(disk, bdev, verbose);
+ bdput(bdev);
+ }
}
-EXPORT_SYMBOL(revalidate_disk);
+EXPORT_SYMBOL(revalidate_disk_size);
-/*
- * This routine checks whether a removable media has been changed,
- * and invalidates all buffer-cache-entries in that case. This
- * is a relatively slow routine, so we have to try to minimize using
- * it. Thus it is called only upon a 'mount' or 'open'. This
- * is the best way of combining speed and utility, I think.
- * People changing diskettes in the middle of an operation deserve
- * to lose :-)
- */
-int check_disk_change(struct block_device *bdev)
+void bd_set_nr_sectors(struct block_device *bdev, sector_t sectors)
{
- struct gendisk *disk = bdev->bd_disk;
- const struct block_device_operations *bdops = disk->fops;
- unsigned int events;
-
- events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE |
- DISK_EVENT_EJECT_REQUEST);
- if (!(events & DISK_EVENT_MEDIA_CHANGE))
- return 0;
-
- flush_disk(bdev, true);
- if (bdops->revalidate_disk)
- bdops->revalidate_disk(bdev->bd_disk);
- return 1;
+ spin_lock(&bdev->bd_size_lock);
+ i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT);
+ spin_unlock(&bdev->bd_size_lock);
}
-
-EXPORT_SYMBOL(check_disk_change);
-
-void bd_set_size(struct block_device *bdev, loff_t size)
-{
- inode_lock(bdev->bd_inode);
- i_size_write(bdev->bd_inode, size);
- inode_unlock(bdev->bd_inode);
-}
-EXPORT_SYMBOL(bd_set_size);
+EXPORT_SYMBOL(bd_set_nr_sectors);
static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
-static void bdev_disk_changed(struct block_device *bdev, bool invalidate)
+int bdev_disk_changed(struct block_device *bdev, bool invalidate)
{
- if (disk_part_scan_enabled(bdev->bd_disk)) {
- if (invalidate)
- invalidate_partitions(bdev->bd_disk, bdev);
- else
- rescan_partitions(bdev->bd_disk, bdev);
+ struct gendisk *disk = bdev->bd_disk;
+ int ret;
+
+ lockdep_assert_held(&bdev->bd_mutex);
+
+ if (!(disk->flags & GENHD_FL_UP))
+ return -ENXIO;
+
+rescan:
+ ret = blk_drop_partitions(bdev);
+ if (ret)
+ return ret;
+
+ clear_bit(GD_NEED_PART_SCAN, &disk->state);
+
+ /*
+ * Historically we only set the capacity to zero for devices that
+ * support partitions (independ of actually having partitions created).
+ * Doing that is rather inconsistent, but changing it broke legacy
+ * udisks polling for legacy ide-cdrom devices. Use the crude check
+ * below to get the sane behavior for most device while not breaking
+ * userspace for this particular setup.
+ */
+ if (invalidate) {
+ if (disk_part_scan_enabled(disk) ||
+ !(disk->flags & GENHD_FL_REMOVABLE))
+ set_capacity(disk, 0);
} else {
- check_disk_size_change(bdev->bd_disk, bdev, !invalidate);
- bdev->bd_invalidated = 0;
+ if (disk->fops->revalidate_disk)
+ disk->fops->revalidate_disk(disk);
}
+
+ check_disk_size_change(disk, bdev, !invalidate);
+
+ if (get_capacity(disk)) {
+ ret = blk_add_partitions(disk, bdev);
+ if (ret == -EAGAIN)
+ goto rescan;
+ } else if (invalidate) {
+ /*
+ * Tell userspace that the media / partition table may have
+ * changed.
+ */
+ kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
+ }
+
+ return ret;
}
+/*
+ * Only exported for for loop and dasd for historic reasons. Don't use in new
+ * code!
+ */
+EXPORT_SYMBOL_GPL(bdev_disk_changed);
/*
* bd_mutex locking:
@@ -1446,40 +1457,46 @@
* mutex_lock_nested(whole->bd_mutex, 1)
*/
-static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
+static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder,
+ int for_part)
{
+ struct block_device *whole = NULL, *claiming = NULL;
struct gendisk *disk;
int ret;
int partno;
- int perm = 0;
- bool first_open = false;
-
- if (mode & FMODE_READ)
- perm |= MAY_READ;
- if (mode & FMODE_WRITE)
- perm |= MAY_WRITE;
- /*
- * hooks: /n/, see "layering violations".
- */
- if (!for_part) {
- ret = devcgroup_inode_permission(bdev->bd_inode, perm);
- if (ret != 0)
- return ret;
- }
+ bool first_open = false, unblock_events = true, need_restart;
restart:
-
+ need_restart = false;
ret = -ENXIO;
disk = bdev_get_gendisk(bdev, &partno);
if (!disk)
goto out;
+
+ if (partno) {
+ whole = bdget_disk(disk, 0);
+ if (!whole) {
+ ret = -ENOMEM;
+ goto out_put_disk;
+ }
+ }
+
+ if (!for_part && (mode & FMODE_EXCL)) {
+ WARN_ON_ONCE(!holder);
+ if (whole)
+ claiming = whole;
+ else
+ claiming = bdev;
+ ret = bd_prepare_to_claim(bdev, claiming, holder);
+ if (ret)
+ goto out_put_whole;
+ }
disk_block_events(disk);
mutex_lock_nested(&bdev->bd_mutex, for_part);
if (!bdev->bd_openers) {
first_open = true;
bdev->bd_disk = disk;
- bdev->bd_queue = disk->queue;
bdev->bd_contains = bdev;
bdev->bd_partno = partno;
@@ -1492,24 +1509,16 @@
ret = 0;
if (disk->fops->open) {
ret = disk->fops->open(bdev, mode);
- if (ret == -ERESTARTSYS) {
- /* Lost a race with 'disk' being
- * deleted, try again.
- * See md.c
- */
- disk_put_part(bdev->bd_part);
- bdev->bd_part = NULL;
- bdev->bd_disk = NULL;
- bdev->bd_queue = NULL;
- mutex_unlock(&bdev->bd_mutex);
- disk_unblock_events(disk);
- put_disk_and_module(disk);
- goto restart;
- }
+ /*
+ * If we lost a race with 'disk' being deleted,
+ * try again. See md.c
+ */
+ if (ret == -ERESTARTSYS)
+ need_restart = true;
}
if (!ret) {
- bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
+ bd_set_nr_sectors(bdev, get_capacity(disk));
set_init_blocksize(bdev);
}
@@ -1519,32 +1528,25 @@
* The latter is necessary to prevent ghost
* partitions on a removed medium.
*/
- if (bdev->bd_invalidated &&
+ if (test_bit(GD_NEED_PART_SCAN, &disk->state) &&
(!ret || ret == -ENOMEDIUM))
bdev_disk_changed(bdev, ret == -ENOMEDIUM);
if (ret)
goto out_clear;
} else {
- struct block_device *whole;
- whole = bdget_disk(disk, 0);
- ret = -ENOMEM;
- if (!whole)
- goto out_clear;
BUG_ON(for_part);
- ret = __blkdev_get(whole, mode, 1);
- if (ret) {
- bdput(whole);
+ ret = __blkdev_get(whole, mode, NULL, 1);
+ if (ret)
goto out_clear;
- }
- bdev->bd_contains = whole;
+ bdev->bd_contains = bdgrab(whole);
bdev->bd_part = disk_get_part(disk, partno);
if (!(disk->flags & GENHD_FL_UP) ||
!bdev->bd_part || !bdev->bd_part->nr_sects) {
ret = -ENXIO;
goto out_clear;
}
- bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
+ bd_set_nr_sectors(bdev, bdev->bd_part->nr_sects);
set_init_blocksize(bdev);
}
@@ -1556,7 +1558,7 @@
if (bdev->bd_disk->fops->open)
ret = bdev->bd_disk->fops->open(bdev, mode);
/* the same as first opener case, read comment there */
- if (bdev->bd_invalidated &&
+ if (test_bit(GD_NEED_PART_SCAN, &disk->state) &&
(!ret || ret == -ENOMEDIUM))
bdev_disk_changed(bdev, ret == -ENOMEDIUM);
if (ret)
@@ -1566,27 +1568,52 @@
bdev->bd_openers++;
if (for_part)
bdev->bd_part_count++;
+ if (claiming)
+ bd_finish_claiming(bdev, claiming, holder);
+
+ /*
+ * Block event polling for write claims if requested. Any write holder
+ * makes the write_holder state stick until all are released. This is
+ * good enough and tracking individual writeable reference is too
+ * fragile given the way @mode is used in blkdev_get/put().
+ */
+ if (claiming && (mode & FMODE_WRITE) && !bdev->bd_write_holder &&
+ (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) {
+ bdev->bd_write_holder = true;
+ unblock_events = false;
+ }
mutex_unlock(&bdev->bd_mutex);
- disk_unblock_events(disk);
+
+ if (unblock_events)
+ disk_unblock_events(disk);
+
/* only one opener holds refs to the module and disk */
if (!first_open)
put_disk_and_module(disk);
+ if (whole)
+ bdput(whole);
return 0;
out_clear:
disk_put_part(bdev->bd_part);
bdev->bd_disk = NULL;
bdev->bd_part = NULL;
- bdev->bd_queue = NULL;
if (bdev != bdev->bd_contains)
__blkdev_put(bdev->bd_contains, mode, 1);
bdev->bd_contains = NULL;
out_unlock_bdev:
+ if (claiming)
+ bd_abort_claiming(bdev, claiming, holder);
mutex_unlock(&bdev->bd_mutex);
disk_unblock_events(disk);
+ out_put_whole:
+ if (whole)
+ bdput(whole);
+ out_put_disk:
put_disk_and_module(disk);
+ if (need_restart)
+ goto restart;
out:
-
return ret;
}
@@ -1609,74 +1636,27 @@
* RETURNS:
* 0 on success, -errno on failure.
*/
-int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
+static int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
{
- struct block_device *whole = NULL;
- int res;
+ int ret, perm = 0;
- WARN_ON_ONCE((mode & FMODE_EXCL) && !holder);
+ if (mode & FMODE_READ)
+ perm |= MAY_READ;
+ if (mode & FMODE_WRITE)
+ perm |= MAY_WRITE;
+ ret = devcgroup_inode_permission(bdev->bd_inode, perm);
+ if (ret)
+ goto bdput;
- if ((mode & FMODE_EXCL) && holder) {
- whole = bd_start_claiming(bdev, holder);
- if (IS_ERR(whole)) {
- bdput(bdev);
- return PTR_ERR(whole);
- }
- }
+ ret =__blkdev_get(bdev, mode, holder, 0);
+ if (ret)
+ goto bdput;
+ return 0;
- res = __blkdev_get(bdev, mode, 0);
-
- if (whole) {
- struct gendisk *disk = whole->bd_disk;
-
- /* finish claiming */
- mutex_lock(&bdev->bd_mutex);
- spin_lock(&bdev_lock);
-
- if (!res) {
- BUG_ON(!bd_may_claim(bdev, whole, holder));
- /*
- * Note that for a whole device bd_holders
- * will be incremented twice, and bd_holder
- * will be set to bd_may_claim before being
- * set to holder
- */
- whole->bd_holders++;
- whole->bd_holder = bd_may_claim;
- bdev->bd_holders++;
- bdev->bd_holder = holder;
- }
-
- /* tell others that we're done */
- BUG_ON(whole->bd_claiming != holder);
- whole->bd_claiming = NULL;
- wake_up_bit(&whole->bd_claiming, 0);
-
- spin_unlock(&bdev_lock);
-
- /*
- * Block event polling for write claims if requested. Any
- * write holder makes the write_holder state stick until
- * all are released. This is good enough and tracking
- * individual writeable reference is too fragile given the
- * way @mode is used in blkdev_get/put().
- */
- if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder &&
- (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) {
- bdev->bd_write_holder = true;
- disk_block_events(disk);
- }
-
- mutex_unlock(&bdev->bd_mutex);
- bdput(whole);
- }
-
- if (res)
- bdput(bdev);
-
- return res;
+bdput:
+ bdput(bdev);
+ return ret;
}
-EXPORT_SYMBOL(blkdev_get);
/**
* blkdev_get_by_path - open a block device by name
@@ -1769,7 +1749,7 @@
*/
filp->f_flags |= O_LARGEFILE;
- filp->f_mode |= FMODE_NOWAIT;
+ filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
if (filp->f_flags & O_NDELAY)
filp->f_mode |= FMODE_NDELAY;
@@ -1925,7 +1905,7 @@
if (bdev_read_only(I_BDEV(bd_inode)))
return -EPERM;
- if (IS_SWAPFILE(bd_inode))
+ if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode->i_rdev))
return -ETXTBSY;
if (!iov_iter_count(from))
@@ -1999,13 +1979,14 @@
static const struct address_space_operations def_blk_aops = {
.readpage = blkdev_readpage,
- .readpages = blkdev_readpages,
+ .readahead = blkdev_readahead,
.writepage = blkdev_writepage,
.write_begin = blkdev_write_begin,
.write_end = blkdev_write_end,
.writepages = blkdev_writepages,
.releasepage = blkdev_releasepage,
.direct_IO = blkdev_direct_IO,
+ .migratepage = buffer_migrate_page_norefs,
.is_dirty_writeback = buffer_check_dirty_writeback,
};
@@ -2017,7 +1998,6 @@
loff_t len)
{
struct block_device *bdev = I_BDEV(bdev_file_inode(file));
- struct address_space *mapping;
loff_t end = start + len - 1;
loff_t isize;
int error;
@@ -2045,8 +2025,9 @@
return -EINVAL;
/* Invalidate the page cache, including dirty pages. */
- mapping = bdev->bd_inode->i_mapping;
- truncate_inode_pages_range(mapping, start, end);
+ error = truncate_bdev_range(bdev, file->f_mode, start, end);
+ if (error)
+ return error;
switch (mode) {
case FALLOC_FL_ZERO_RANGE:
@@ -2073,7 +2054,7 @@
* the caller will be given -EBUSY. The third argument is
* inclusive, so the rounding here is safe.
*/
- return invalidate_inode_pages2_range(mapping,
+ return invalidate_inode_pages2_range(bdev->bd_inode->i_mapping,
start >> PAGE_SHIFT,
end >> PAGE_SHIFT);
}
@@ -2084,6 +2065,7 @@
.llseek = block_llseek,
.read_iter = blkdev_read_iter,
.write_iter = blkdev_write_iter,
+ .iopoll = blkdev_iopoll,
.mmap = generic_file_mmap,
.fsync = blkdev_fsync,
.unlocked_ioctl = block_ioctl,
@@ -2094,18 +2076,6 @@
.splice_write = iter_file_splice_write,
.fallocate = blkdev_fallocate,
};
-
-int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)
-{
- int res;
- mm_segment_t old_fs = get_fs();
- set_fs(KERNEL_DS);
- res = blkdev_ioctl(bdev, 0, cmd, arg);
- set_fs(old_fs);
- return res;
-}
-
-EXPORT_SYMBOL(ioctl_by_bdev);
/**
* lookup_bdev - lookup a struct block_device by name
--
Gitblit v1.6.2