2024-02-20 102a0743326a03cd1a1202ceda21e175b7d3575c
kernel/fs/block_dev.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * linux/fs/block_dev.c
  *
@@ -18,22 +19,21 @@
 #include <linux/module.h>
 #include <linux/blkpg.h>
 #include <linux/magic.h>
-#include <linux/dax.h>
 #include <linux/buffer_head.h>
 #include <linux/swap.h>
 #include <linux/pagevec.h>
 #include <linux/writeback.h>
 #include <linux/mpage.h>
 #include <linux/mount.h>
+#include <linux/pseudo_fs.h>
 #include <linux/uio.h>
 #include <linux/namei.h>
 #include <linux/log2.h>
 #include <linux/cleancache.h>
-#include <linux/dax.h>
-#include <linux/badblocks.h>
 #include <linux/task_io_accounting_ops.h>
 #include <linux/falloc.h>
 #include <linux/uaccess.h>
+#include <linux/suspend.h>
 #include "internal.h"
 
 struct bdev_inode {
@@ -75,7 +75,7 @@
 }
 
 /* Kill _all_ buffers and pagecache, dirty or not. */
-void kill_bdev(struct block_device *bdev)
+static void kill_bdev(struct block_device *bdev)
 {
 	struct address_space *mapping = bdev->bd_inode->i_mapping;
 
@@ -84,8 +84,7 @@
 
 	invalidate_bh_lrus();
 	truncate_inode_pages(mapping, 0);
-}
-EXPORT_SYMBOL(kill_bdev);
+}
 
 /* Invalidate clean unused buffers and pagecache. */
 void invalidate_bdev(struct block_device *bdev)
@@ -104,9 +103,47 @@
 }
 EXPORT_SYMBOL(invalidate_bdev);
 
+/*
+ * Drop all buffers & page cache for given bdev range. This function bails
+ * with error if bdev has other exclusive owner (such as filesystem).
+ */
+int truncate_bdev_range(struct block_device *bdev, fmode_t mode,
+			loff_t lstart, loff_t lend)
+{
+	struct block_device *claimed_bdev = NULL;
+	int err;
+
+	/*
+	 * If we don't hold exclusive handle for the device, upgrade to it
+	 * while we discard the buffer cache to avoid discarding buffers
+	 * under live filesystem.
+	 */
+	if (!(mode & FMODE_EXCL)) {
+		claimed_bdev = bdev->bd_contains;
+		err = bd_prepare_to_claim(bdev, claimed_bdev,
+					  truncate_bdev_range);
+		if (err)
+			goto invalidate;
+	}
+	truncate_inode_pages_range(bdev->bd_inode->i_mapping, lstart, lend);
+	if (claimed_bdev)
+		bd_abort_claiming(bdev, claimed_bdev, truncate_bdev_range);
+	return 0;
+
+invalidate:
+	/*
+	 * Someone else has the handle exclusively open. Try invalidating
+	 * instead. The 'end' argument is inclusive so the rounding is safe.
+	 */
+	return invalidate_inode_pages2_range(bdev->bd_inode->i_mapping,
+					     lstart >> PAGE_SHIFT,
+					     lend >> PAGE_SHIFT);
+}
+EXPORT_SYMBOL(truncate_bdev_range);
+
 static void set_init_blocksize(struct block_device *bdev)
 {
-	unsigned bsize = bdev_logical_block_size(bdev);
+	unsigned int bsize = bdev_logical_block_size(bdev);
 	loff_t size = i_size_read(bdev->bd_inode);
 
 	while (bsize < PAGE_SIZE) {
@@ -114,7 +151,6 @@
 			break;
 		bsize <<= 1;
 	}
-	bdev->bd_block_size = bsize;
 	bdev->bd_inode->i_blkbits = blksize_bits(bsize);
 }
 
@@ -129,9 +165,8 @@
 		return -EINVAL;
 
 	/* Don't change the size if it is same as current */
-	if (bdev->bd_block_size != size) {
+	if (bdev->bd_inode->i_blkbits != blksize_bits(size)) {
 		sync_blockdev(bdev);
-		bdev->bd_block_size = size;
 		bdev->bd_inode->i_blkbits = blksize_bits(size);
 		kill_bdev(bdev);
 	}
@@ -151,7 +186,7 @@
 	return sb->s_blocksize;
 }
 
-EXPORT_SYMBOL(sb_set_blocksize);
+EXPORT_SYMBOL_NS(sb_set_blocksize, ANDROID_GKI_VFS_EXPORT_ONLY);
 
 int sb_min_blocksize(struct super_block *sb, int size)
 {
@@ -161,7 +196,7 @@
 	return sb_set_blocksize(sb, size);
 }
 
-EXPORT_SYMBOL(sb_min_blocksize);
+EXPORT_SYMBOL_NS(sb_min_blocksize, ANDROID_GKI_VFS_EXPORT_ONLY);
 
 static int
 blkdev_get_block(struct inode *inode, sector_t iblock,
@@ -195,7 +230,7 @@
 	struct task_struct *waiter = bio->bi_private;
 
 	WRITE_ONCE(bio->bi_private, NULL);
-	wake_up_process(waiter);
+	blk_wake_io_task(waiter);
 }
 
 static ssize_t
@@ -204,13 +239,12 @@
 {
 	struct file *file = iocb->ki_filp;
 	struct block_device *bdev = I_BDEV(bdev_file_inode(file));
-	struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs, *bvec;
+	struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs;
 	loff_t pos = iocb->ki_pos;
 	bool should_dirty = false;
 	struct bio bio;
 	ssize_t ret;
 	blk_qc_t qc;
-	int i;
 
 	if ((pos | iov_iter_alignment(iter)) &
 	    (bdev_logical_block_size(bdev) - 1))
@@ -246,6 +280,10 @@
 		bio.bi_opf = dio_bio_write_op(iocb);
 		task_io_account_write(ret);
 	}
+	if (iocb->ki_flags & IOCB_NOWAIT)
+		bio.bi_opf |= REQ_NOWAIT;
+	if (iocb->ki_flags & IOCB_HIPRI)
+		bio_set_polled(&bio, iocb);
 
 	qc = submit_bio(&bio);
 	for (;;) {
@@ -253,17 +291,12 @@
 		if (!READ_ONCE(bio.bi_private))
 			break;
 		if (!(iocb->ki_flags & IOCB_HIPRI) ||
-		    !blk_poll(bdev_get_queue(bdev), qc))
-			io_schedule();
+		    !blk_poll(bdev_get_queue(bdev), qc, true))
+			blk_io_schedule();
 	}
 	__set_current_state(TASK_RUNNING);
 
-	bio_for_each_segment_all(bvec, &bio, i) {
-		if (should_dirty && !PageCompound(bvec->bv_page))
-			set_page_dirty_lock(bvec->bv_page);
-		put_page(bvec->bv_page);
-	}
-
+	bio_release_pages(&bio, should_dirty);
 	if (unlikely(bio.bi_status))
 		ret = blk_status_to_errno(bio.bi_status);
@@ -291,6 +324,14 @@
 
 static struct bio_set blkdev_dio_pool;
 
+static int blkdev_iopoll(struct kiocb *kiocb, bool wait)
+{
+	struct block_device *bdev = I_BDEV(kiocb->ki_filp->f_mapping->host);
+	struct request_queue *q = bdev_get_queue(bdev);
+
+	return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait);
+}
+
 static void blkdev_bio_end_io(struct bio *bio)
 {
 	struct blkdev_dio *dio = bio->bi_private;
@@ -312,23 +353,20 @@
 			}
 
 			dio->iocb->ki_complete(iocb, ret, 0);
-			bio_put(&dio->bio);
+			if (dio->multi_bio)
+				bio_put(&dio->bio);
 		} else {
 			struct task_struct *waiter = dio->waiter;
 
 			WRITE_ONCE(dio->waiter, NULL);
-			wake_up_process(waiter);
+			blk_wake_io_task(waiter);
 		}
 	}
 
 	if (should_dirty) {
 		bio_check_pages_dirty(bio);
 	} else {
-		struct bio_vec *bvec;
-		int i;
-
-		bio_for_each_segment_all(bvec, bio, i)
-			put_page(bvec->bv_page);
+		bio_release_pages(bio, false);
 		bio_put(bio);
 	}
 }
@@ -342,6 +380,7 @@
 	struct blk_plug plug;
 	struct blkdev_dio *dio;
 	struct bio *bio;
+	bool is_poll = (iocb->ki_flags & IOCB_HIPRI) != 0;
 	bool is_read = (iov_iter_rw(iter) == READ), is_sync;
 	loff_t pos = iocb->ki_pos;
 	blk_qc_t qc = BLK_QC_T_NONE;
@@ -352,20 +391,27 @@
 		return -EINVAL;
 
 	bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, &blkdev_dio_pool);
-	bio_get(bio); /* extra ref for the completion handler */
 
 	dio = container_of(bio, struct blkdev_dio, bio);
 	dio->is_sync = is_sync = is_sync_kiocb(iocb);
-	if (dio->is_sync)
+	if (dio->is_sync) {
 		dio->waiter = current;
-	else
+		bio_get(bio);
+	} else {
 		dio->iocb = iocb;
+	}
 
 	dio->size = 0;
 	dio->multi_bio = false;
-	dio->should_dirty = is_read && (iter->type == ITER_IOVEC);
+	dio->should_dirty = is_read && iter_is_iovec(iter);
 
-	blk_start_plug(&plug);
+	/*
+	 * Don't plug for HIPRI/polled IO, as those should go straight
+	 * to issue
+	 */
+	if (!is_poll)
+		blk_start_plug(&plug);
+
 	for (;;) {
 		bio_set_dev(bio, bdev);
 		bio->bi_iter.bi_sector = pos >> 9;
@@ -389,17 +435,36 @@
 			bio->bi_opf = dio_bio_write_op(iocb);
 			task_io_account_write(bio->bi_iter.bi_size);
 		}
+		if (iocb->ki_flags & IOCB_NOWAIT)
+			bio->bi_opf |= REQ_NOWAIT;
 
 		dio->size += bio->bi_iter.bi_size;
 		pos += bio->bi_iter.bi_size;
 
 		nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES);
 		if (!nr_pages) {
+			bool polled = false;
+
+			if (iocb->ki_flags & IOCB_HIPRI) {
+				bio_set_polled(bio, iocb);
+				polled = true;
+			}
+
 			qc = submit_bio(bio);
+
+			if (polled)
+				WRITE_ONCE(iocb->ki_cookie, qc);
 			break;
 		}
 
 		if (!dio->multi_bio) {
+			/*
+			 * AIO needs an extra reference to ensure the dio
+			 * structure which is embedded into the first bio
+			 * stays around.
+			 */
+			if (!is_sync)
+				bio_get(bio);
 			dio->multi_bio = true;
 			atomic_set(&dio->ref, 2);
 		} else {
@@ -409,7 +474,9 @@
 		submit_bio(bio);
 		bio = bio_alloc(GFP_KERNEL, nr_pages);
 	}
-	blk_finish_plug(&plug);
+
+	if (!is_poll)
+		blk_finish_plug(&plug);
 
 	if (!is_sync)
 		return -EIOCBQUEUED;
@@ -420,8 +487,8 @@
 			break;
 
 		if (!(iocb->ki_flags & IOCB_HIPRI) ||
-		    !blk_poll(bdev_get_queue(bdev), qc))
-			io_schedule();
+		    !blk_poll(bdev_get_queue(bdev), qc, true))
+			blk_io_schedule();
 	}
 	__set_current_state(TASK_RUNNING);
 
@@ -502,55 +569,47 @@
  * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze
  * actually.
  */
-struct super_block *freeze_bdev(struct block_device *bdev)
+int freeze_bdev(struct block_device *bdev)
 {
 	struct super_block *sb;
 	int error = 0;
 
 	mutex_lock(&bdev->bd_fsfreeze_mutex);
-	if (++bdev->bd_fsfreeze_count > 1) {
-		/*
-		 * We don't even need to grab a reference - the first call
-		 * to freeze_bdev grab an active reference and only the last
-		 * thaw_bdev drops it.
-		 */
-		sb = get_super(bdev);
-		if (sb)
-			drop_super(sb);
-		mutex_unlock(&bdev->bd_fsfreeze_mutex);
-		return sb;
-	}
+	if (++bdev->bd_fsfreeze_count > 1)
+		goto done;
 
 	sb = get_active_super(bdev);
 	if (!sb)
-		goto out;
+		goto sync;
 	if (sb->s_op->freeze_super)
 		error = sb->s_op->freeze_super(sb);
 	else
 		error = freeze_super(sb);
-	if (error) {
-		deactivate_super(sb);
-		bdev->bd_fsfreeze_count--;
-		mutex_unlock(&bdev->bd_fsfreeze_mutex);
-		return ERR_PTR(error);
-	}
 	deactivate_super(sb);
- out:
+
+	if (error) {
+		bdev->bd_fsfreeze_count--;
+		goto done;
+	}
+	bdev->bd_fsfreeze_sb = sb;
+
+sync:
 	sync_blockdev(bdev);
+done:
 	mutex_unlock(&bdev->bd_fsfreeze_mutex);
-	return sb;	/* thaw_bdev releases s->s_umount */
+	return error;
 }
 EXPORT_SYMBOL(freeze_bdev);
 
 /**
  * thaw_bdev -- unlock filesystem
  * @bdev: blockdevice to unlock
- * @sb: associated superblock
  *
  * Unlocks the filesystem and marks it writeable again after freeze_bdev().
  */
-int thaw_bdev(struct block_device *bdev, struct super_block *sb)
+int thaw_bdev(struct block_device *bdev)
 {
+	struct super_block *sb;
 	int error = -EINVAL;
 
 	mutex_lock(&bdev->bd_fsfreeze_mutex);
@@ -561,6 +620,7 @@
 	if (--bdev->bd_fsfreeze_count > 0)
 		goto out;
 
+	sb = bdev->bd_fsfreeze_sb;
 	if (!sb)
 		goto out;
 
@@ -586,10 +646,9 @@
 	return block_read_full_page(page, blkdev_get_block);
 }
 
-static int blkdev_readpages(struct file *file, struct address_space *mapping,
-			struct list_head *pages, unsigned nr_pages)
+static void blkdev_readahead(struct readahead_control *rac)
 {
-	return mpage_readpages(mapping, pages, nr_pages, blkdev_get_block);
+	mpage_readahead(rac, blkdev_get_block);
 }
 
 static int blkdev_write_begin(struct file *file, struct address_space *mapping,
@@ -644,7 +703,7 @@
 	 * i_mutex and doing so causes performance issues with concurrent
 	 * O_SYNC writers to a block device.
 	 */
-	error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
+	error = blkdev_issue_flush(bdev, GFP_KERNEL);
 	if (error == -EOPNOTSUPP)
 		error = 0;
 
@@ -677,15 +736,14 @@
 	if (!ops->rw_page || bdev_get_integrity(bdev))
 		return result;
 
-	result = blk_queue_enter(bdev->bd_queue, 0);
+	result = blk_queue_enter(bdev->bd_disk->queue, 0);
 	if (result)
 		return result;
 	result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
 			      REQ_OP_READ);
-	blk_queue_exit(bdev->bd_queue);
+	blk_queue_exit(bdev->bd_disk->queue);
 	return result;
 }
-EXPORT_SYMBOL_GPL(bdev_read_page);
 
 /**
  * bdev_write_page() - Start writing a page to a block device
@@ -714,7 +772,7 @@
 
 	if (!ops->rw_page || bdev_get_integrity(bdev))
 		return -EOPNOTSUPP;
-	result = blk_queue_enter(bdev->bd_queue, 0);
+	result = blk_queue_enter(bdev->bd_disk->queue, 0);
 	if (result)
 		return result;
 
@@ -727,10 +785,9 @@
 		clean_page_buffers(page);
 		unlock_page(page);
 	}
-	blk_queue_exit(bdev->bd_queue);
+	blk_queue_exit(bdev->bd_disk->queue);
 	return result;
 }
-EXPORT_SYMBOL_GPL(bdev_write_page);
 
 /*
  * pseudo-fs
@@ -747,17 +804,9 @@
 	return &ei->vfs_inode;
 }
 
-static void bdev_i_callback(struct rcu_head *head)
+static void bdev_free_inode(struct inode *inode)
 {
-	struct inode *inode = container_of(head, struct inode, i_rcu);
-	struct bdev_inode *bdi = BDEV_I(inode);
-
-	kmem_cache_free(bdev_cachep, bdi);
-}
-
-static void bdev_destroy_inode(struct inode *inode)
-{
-	call_rcu(&inode->i_rcu, bdev_i_callback);
+	kmem_cache_free(bdev_cachep, BDEV_I(inode));
 }
 
 static void init_once(void *foo)
@@ -767,7 +816,6 @@
 
 	memset(bdev, 0, sizeof(*bdev));
 	mutex_init(&bdev->bd_mutex);
-	INIT_LIST_HEAD(&bdev->bd_list);
 #ifdef CONFIG_SYSFS
 	INIT_LIST_HEAD(&bdev->bd_holder_disks);
 #endif
@@ -783,9 +831,6 @@
 	truncate_inode_pages_final(&inode->i_data);
 	invalidate_inode_buffers(inode); /* is it needed here? */
 	clear_inode(inode);
-	spin_lock(&bdev_lock);
-	list_del_init(&bdev->bd_list);
-	spin_unlock(&bdev_lock);
 	/* Detach inode from wb early as bdi_put() may free bdi->wb */
 	inode_detach_wb(inode);
 	if (bdev->bd_bdi != &noop_backing_dev_info) {
@@ -797,24 +842,24 @@
 static const struct super_operations bdev_sops = {
 	.statfs = simple_statfs,
 	.alloc_inode = bdev_alloc_inode,
-	.destroy_inode = bdev_destroy_inode,
+	.free_inode = bdev_free_inode,
 	.drop_inode = generic_delete_inode,
 	.evict_inode = bdev_evict_inode,
 };
 
-static struct dentry *bd_mount(struct file_system_type *fs_type,
-		int flags, const char *dev_name, void *data)
+static int bd_init_fs_context(struct fs_context *fc)
 {
-	struct dentry *dent;
-	dent = mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC);
-	if (!IS_ERR(dent))
-		dent->d_sb->s_iflags |= SB_I_CGROUPWB;
-	return dent;
+	struct pseudo_fs_context *ctx = init_pseudo(fc, BDEVFS_MAGIC);
+	if (!ctx)
+		return -ENOMEM;
+	fc->s_iflags |= SB_I_CGROUPWB;
+	ctx->ops = &bdev_sops;
+	return 0;
 }
 
 static struct file_system_type bd_type = {
 	.name = "bdev",
-	.mount = bd_mount,
+	.init_fs_context = bd_init_fs_context,
 	.kill_sb = kill_anon_super,
 };
 
@@ -860,24 +905,7 @@
 	return 0;
 }
 
-static LIST_HEAD(all_bdevs);
-
-/*
- * If there is a bdev inode for this device, unhash it so that it gets evicted
- * as soon as last inode reference is dropped.
- */
-void bdev_unhash_inode(dev_t dev)
-{
-	struct inode *inode;
-
-	inode = ilookup5(blockdev_superblock, hash(dev), bdev_test, &dev);
-	if (inode) {
-		remove_inode_hash(inode);
-		iput(inode);
-	}
-}
-
-struct block_device *bdget(dev_t dev)
+static struct block_device *bdget(dev_t dev)
 {
 	struct block_device *bdev;
 	struct inode *inode;
@@ -891,26 +919,20 @@
 	bdev = &BDEV_I(inode)->bdev;
 
 	if (inode->i_state & I_NEW) {
+		spin_lock_init(&bdev->bd_size_lock);
 		bdev->bd_contains = NULL;
 		bdev->bd_super = NULL;
 		bdev->bd_inode = inode;
-		bdev->bd_block_size = i_blocksize(inode);
 		bdev->bd_part_count = 0;
-		bdev->bd_invalidated = 0;
 		inode->i_mode = S_IFBLK;
 		inode->i_rdev = dev;
 		inode->i_bdev = bdev;
 		inode->i_data.a_ops = &def_blk_aops;
 		mapping_set_gfp_mask(&inode->i_data, GFP_USER);
-		spin_lock(&bdev_lock);
-		list_add(&bdev->bd_list, &all_bdevs);
-		spin_unlock(&bdev_lock);
 		unlock_new_inode(inode);
 	}
 	return bdev;
 }
-
-EXPORT_SYMBOL(bdget);
 
 /**
  * bdgrab -- Grab a reference to an already referenced block device
@@ -923,15 +945,21 @@
 }
 EXPORT_SYMBOL(bdgrab);
 
+struct block_device *bdget_part(struct hd_struct *part)
+{
+	return bdget(part_devt(part));
+}
+
 long nr_blockdev_pages(void)
 {
-	struct block_device *bdev;
+	struct inode *inode;
 	long ret = 0;
-	spin_lock(&bdev_lock);
-	list_for_each_entry(bdev, &all_bdevs, bd_list) {
-		ret += bdev->bd_inode->i_mapping->nrpages;
-	}
-	spin_unlock(&bdev_lock);
+
+	spin_lock(&blockdev_superblock->s_inode_list_lock);
+	list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list)
+		ret += inode->i_mapping->nrpages;
+	spin_unlock(&blockdev_superblock->s_inode_list_lock);
+
 	return ret;
 }
 
@@ -1033,30 +1061,28 @@
 }
 
 /**
- * bd_prepare_to_claim - prepare to claim a block device
+ * bd_prepare_to_claim - claim a block device
  * @bdev: block device of interest
  * @whole: the whole device containing @bdev, may equal @bdev
  * @holder: holder trying to claim @bdev
  *
- * Prepare to claim @bdev. This function fails if @bdev is already
- * claimed by another holder and waits if another claiming is in
- * progress. This function doesn't actually claim. On successful
- * return, the caller has ownership of bd_claiming and bd_holder[s].
- *
- * CONTEXT:
- * spin_lock(&bdev_lock). Might release bdev_lock, sleep and regrab
- * it multiple times.
+ * Claim @bdev. This function fails if @bdev is already claimed by another
+ * holder and waits if another claiming is in progress. On successful return,
+ * the caller has ownership of bd_claiming and bd_holder[s].
  *
  * RETURNS:
  * 0 if @bdev can be claimed, -EBUSY otherwise.
  */
-static int bd_prepare_to_claim(struct block_device *bdev,
-			       struct block_device *whole, void *holder)
+int bd_prepare_to_claim(struct block_device *bdev, struct block_device *whole,
+		void *holder)
 {
 retry:
+	spin_lock(&bdev_lock);
 	/* if someone else claimed, fail */
-	if (!bd_may_claim(bdev, whole, holder))
+	if (!bd_may_claim(bdev, whole, holder)) {
+		spin_unlock(&bdev_lock);
 		return -EBUSY;
+	}
 
 	/* if claiming is already in progress, wait for it to finish */
 	if (whole->bd_claiming) {
@@ -1067,13 +1093,15 @@
 		spin_unlock(&bdev_lock);
 		schedule();
 		finish_wait(wq, &wait);
-		spin_lock(&bdev_lock);
 		goto retry;
 	}
 
 	/* yay, all mine */
+	whole->bd_claiming = holder;
+	spin_unlock(&bdev_lock);
 	return 0;
 }
+EXPORT_SYMBOL_GPL(bd_prepare_to_claim); /* only for the loop driver */
 
 static struct gendisk *bdev_get_gendisk(struct block_device *bdev, int *partno)
 {
@@ -1096,77 +1124,59 @@
 	return disk;
 }
 
-/**
- * bd_start_claiming - start claiming a block device
- * @bdev: block device of interest
- * @holder: holder trying to claim @bdev
- *
- * @bdev is about to be opened exclusively. Check @bdev can be opened
- * exclusively and mark that an exclusive open is in progress. Each
- * successful call to this function must be matched with a call to
- * either bd_finish_claiming() or bd_abort_claiming() (which do not
- * fail).
- *
- * This function is used to gain exclusive access to the block device
- * without actually causing other exclusive open attempts to fail. It
- * should be used when the open sequence itself requires exclusive
- * access but may subsequently fail.
- *
- * CONTEXT:
- * Might sleep.
- *
- * RETURNS:
- * Pointer to the block device containing @bdev on success, ERR_PTR()
- * value on failure.
- */
-static struct block_device *bd_start_claiming(struct block_device *bdev,
-					      void *holder)
+static void bd_clear_claiming(struct block_device *whole, void *holder)
 {
-	struct gendisk *disk;
-	struct block_device *whole;
-	int partno, err;
-
-	might_sleep();
-
-	/*
-	 * @bdev might not have been initialized properly yet, look up
-	 * and grab the outer block device the hard way.
-	 */
-	disk = bdev_get_gendisk(bdev, &partno);
-	if (!disk)
-		return ERR_PTR(-ENXIO);
-
-	/*
-	 * Normally, @bdev should equal what's returned from bdget_disk()
-	 * if partno is 0; however, some drivers (floppy) use multiple
-	 * bdev's for the same physical device and @bdev may be one of the
-	 * aliases. Keep @bdev if partno is 0. This means claimer
-	 * tracking is broken for those devices but it has always been that
-	 * way.
-	 */
-	if (partno)
-		whole = bdget_disk(disk, 0);
-	else
-		whole = bdgrab(bdev);
-
-	put_disk_and_module(disk);
-	if (!whole)
-		return ERR_PTR(-ENOMEM);
-
-	/* prepare to claim, if successful, mark claiming in progress */
-	spin_lock(&bdev_lock);
-
-	err = bd_prepare_to_claim(bdev, whole, holder);
-	if (err == 0) {
-		whole->bd_claiming = holder;
-		spin_unlock(&bdev_lock);
-		return whole;
-	} else {
-		spin_unlock(&bdev_lock);
-		bdput(whole);
-		return ERR_PTR(err);
-	}
+	lockdep_assert_held(&bdev_lock);
+	/* tell others that we're done */
+	BUG_ON(whole->bd_claiming != holder);
+	whole->bd_claiming = NULL;
+	wake_up_bit(&whole->bd_claiming, 0);
 }
+
+/**
+ * bd_finish_claiming - finish claiming of a block device
+ * @bdev: block device of interest
+ * @whole: whole block device
+ * @holder: holder that has claimed @bdev
+ *
+ * Finish exclusive open of a block device. Mark the device as exclusively
+ * open by the holder and wake up all waiters for exclusive open to finish.
+ */
+static void bd_finish_claiming(struct block_device *bdev,
+		struct block_device *whole, void *holder)
+{
+	spin_lock(&bdev_lock);
+	BUG_ON(!bd_may_claim(bdev, whole, holder));
+	/*
+	 * Note that for a whole device bd_holders will be incremented twice,
+	 * and bd_holder will be set to bd_may_claim before being set to holder
+	 */
+	whole->bd_holders++;
+	whole->bd_holder = bd_may_claim;
+	bdev->bd_holders++;
+	bdev->bd_holder = holder;
+	bd_clear_claiming(whole, holder);
+	spin_unlock(&bdev_lock);
+}
+
+/**
+ * bd_abort_claiming - abort claiming of a block device
+ * @bdev: block device of interest
+ * @whole: whole block device
+ * @holder: holder that has claimed @bdev
+ *
+ * Abort claiming of a block device when the exclusive open failed. This can
+ * also be used when exclusive open is not actually desired and we just needed
+ * to block other exclusive openers for a while.
+ */
+void bd_abort_claiming(struct block_device *bdev, struct block_device *whole,
+		void *holder)
+{
+	spin_lock(&bdev_lock);
+	bd_clear_claiming(whole, holder);
+	spin_unlock(&bdev_lock);
+}
+EXPORT_SYMBOL(bd_abort_claiming);
 
 #ifdef CONFIG_SYSFS
 struct bd_holder_disk {
@@ -1312,26 +1322,6 @@
 #endif
 
 /**
- * flush_disk - invalidates all buffer-cache entries on a disk
- *
- * @bdev: struct block device to be flushed
- * @kill_dirty: flag to guide handling of dirty inodes
- *
- * Invalidates all buffer-cache entries on a disk. It should be called
- * when a disk has been changed -- either by a media change or online
- * resize.
- */
-static void flush_disk(struct block_device *bdev, bool kill_dirty)
-{
-	if (__invalidate_device(bdev, kill_dirty)) {
-		printk(KERN_WARNING "VFS: busy inodes on changed media or "
-		       "resized disk %s\n",
-		       bdev->bd_disk ? bdev->bd_disk->disk_name : "");
-	}
-	bdev->bd_invalidated = 1;
-}
-
-/**
  * check_disk_size_change - checks for disk size change and adjusts bdev size.
  * @disk: struct gendisk to check
  * @bdev: struct bdev to adjust.
@@ -1341,11 +1331,12 @@
  * and adjusts it if it differs. When shrinking the bdev size, all its caches
  * are freed.
  */
-void check_disk_size_change(struct gendisk *disk, struct block_device *bdev,
-		bool verbose)
+static void check_disk_size_change(struct gendisk *disk,
+		struct block_device *bdev, bool verbose)
 {
 	loff_t disk_size, bdev_size;
 
+	spin_lock(&bdev->bd_size_lock);
 	disk_size = (loff_t)get_capacity(disk) << 9;
 	bdev_size = i_size_read(bdev->bd_inode);
 	if (disk_size != bdev_size) {
@@ -1355,89 +1346,109 @@
 			disk->disk_name, bdev_size, disk_size);
 		}
 		i_size_write(bdev->bd_inode, disk_size);
-		if (bdev_size > disk_size)
-			flush_disk(bdev, false);
+	}
+	spin_unlock(&bdev->bd_size_lock);
+
+	if (bdev_size > disk_size) {
+		if (__invalidate_device(bdev, false))
+			pr_warn("VFS: busy inodes on resized disk %s\n",
+				disk->disk_name);
 	}
 }
 
 /**
- * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back
- * @disk: struct gendisk to be revalidated
+ * revalidate_disk_size - checks for disk size change and adjusts bdev size.
+ * @disk: struct gendisk to check
+ * @verbose: if %true log a message about a size change if there is any
  *
- * This routine is a wrapper for lower-level driver's revalidate_disk
- * call-backs. It is used to do common pre and post operations needed
- * for all revalidate_disk operations.
+ * This routine checks to see if the bdev size does not match the disk size
+ * and adjusts it if it differs. When shrinking the bdev size, all its caches
+ * are freed.
  */
-int revalidate_disk(struct gendisk *disk)
+void revalidate_disk_size(struct gendisk *disk, bool verbose)
 {
 	struct block_device *bdev;
-	int ret = 0;
 
-	if (disk->fops->revalidate_disk)
-		ret = disk->fops->revalidate_disk(disk);
+	/*
+	 * Hidden disks don't have associated bdev so there's no point in
+	 * revalidating them.
+	 */
+	if (disk->flags & GENHD_FL_HIDDEN)
+		return;
+
 	bdev = bdget_disk(disk, 0);
-	if (!bdev)
-		return ret;
-
-	mutex_lock(&bdev->bd_mutex);
-	check_disk_size_change(disk, bdev, ret == 0);
-	bdev->bd_invalidated = 0;
-	mutex_unlock(&bdev->bd_mutex);
-	bdput(bdev);
-	return ret;
+	if (bdev) {
+		check_disk_size_change(disk, bdev, verbose);
+		bdput(bdev);
+	}
 }
-EXPORT_SYMBOL(revalidate_disk);
+EXPORT_SYMBOL(revalidate_disk_size);
 
-/*
- * This routine checks whether a removable media has been changed,
- * and invalidates all buffer-cache-entries in that case. This
- * is a relatively slow routine, so we have to try to minimize using
- * it. Thus it is called only upon a 'mount' or 'open'. This
- * is the best way of combining speed and utility, I think.
- * People changing diskettes in the middle of an operation deserve
- * to lose :-)
- */
-int check_disk_change(struct block_device *bdev)
+void bd_set_nr_sectors(struct block_device *bdev, sector_t sectors)
 {
-	struct gendisk *disk = bdev->bd_disk;
-	const struct block_device_operations *bdops = disk->fops;
-	unsigned int events;
-
-	events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE |
-				   DISK_EVENT_EJECT_REQUEST);
-	if (!(events & DISK_EVENT_MEDIA_CHANGE))
-		return 0;
-
-	flush_disk(bdev, true);
-	if (bdops->revalidate_disk)
-		bdops->revalidate_disk(bdev->bd_disk);
-	return 1;
+	spin_lock(&bdev->bd_size_lock);
+	i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT);
+	spin_unlock(&bdev->bd_size_lock);
 }
-
-EXPORT_SYMBOL(check_disk_change);
-
-void bd_set_size(struct block_device *bdev, loff_t size)
-{
-	inode_lock(bdev->bd_inode);
-	i_size_write(bdev->bd_inode, size);
-	inode_unlock(bdev->bd_inode);
-}
-EXPORT_SYMBOL(bd_set_size);
+EXPORT_SYMBOL(bd_set_nr_sectors);
 
 static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
 
-static void bdev_disk_changed(struct block_device *bdev, bool invalidate)
+int bdev_disk_changed(struct block_device *bdev, bool invalidate)
 {
-	if (disk_part_scan_enabled(bdev->bd_disk)) {
-		if (invalidate)
-			invalidate_partitions(bdev->bd_disk, bdev);
-		else
-			rescan_partitions(bdev->bd_disk, bdev);
+	struct gendisk *disk = bdev->bd_disk;
+	int ret;
+
+	lockdep_assert_held(&bdev->bd_mutex);
+
+	if (!(disk->flags & GENHD_FL_UP))
+		return -ENXIO;
+
+rescan:
+	ret = blk_drop_partitions(bdev);
+	if (ret)
+		return ret;
+
+	clear_bit(GD_NEED_PART_SCAN, &disk->state);
+
+	/*
+	 * Historically we only set the capacity to zero for devices that
+	 * support partitions (independent of actually having partitions
+	 * created). Doing that is rather inconsistent, but changing it broke
+	 * legacy udisks polling for legacy ide-cdrom devices. Use the crude
+	 * check below to get the sane behavior for most devices while not
+	 * breaking userspace for this particular setup.
+	 */
+	if (invalidate) {
+		if (disk_part_scan_enabled(disk) ||
+		    !(disk->flags & GENHD_FL_REMOVABLE))
+			set_capacity(disk, 0);
 	} else {
-		check_disk_size_change(bdev->bd_disk, bdev, !invalidate);
-		bdev->bd_invalidated = 0;
+		if (disk->fops->revalidate_disk)
+			disk->fops->revalidate_disk(disk);
 	}
+
+	check_disk_size_change(disk, bdev, !invalidate);
+
+	if (get_capacity(disk)) {
+		ret = blk_add_partitions(disk, bdev);
+		if (ret == -EAGAIN)
+			goto rescan;
+	} else if (invalidate) {
+		/*
+		 * Tell userspace that the media / partition table may have
+		 * changed.
+		 */
+		kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
+	}
+
+	return ret;
 }
+/*
+ * Only exported for loop and dasd for historic reasons. Don't use in new
+ * code!
+ */
+EXPORT_SYMBOL_GPL(bdev_disk_changed);
 
 /*
  * bd_mutex locking:
@@ -1446,40 +1457,46 @@
  *  mutex_lock_nested(whole->bd_mutex, 1)
  */
 
-static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
+static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder,
+		int for_part)
 {
+	struct block_device *whole = NULL, *claiming = NULL;
 	struct gendisk *disk;
 	int ret;
 	int partno;
-	int perm = 0;
-	bool first_open = false;
-
-	if (mode & FMODE_READ)
-		perm |= MAY_READ;
-	if (mode & FMODE_WRITE)
-		perm |= MAY_WRITE;
-	/*
-	 * hooks: /n/, see "layering violations".
-	 */
-	if (!for_part) {
-		ret = devcgroup_inode_permission(bdev->bd_inode, perm);
-		if (ret != 0)
-			return ret;
-	}
+	bool first_open = false, unblock_events = true, need_restart;
 
  restart:
-
+	need_restart = false;
 	ret = -ENXIO;
 	disk = bdev_get_gendisk(bdev, &partno);
 	if (!disk)
 		goto out;
+
+	if (partno) {
+		whole = bdget_disk(disk, 0);
+		if (!whole) {
+			ret = -ENOMEM;
+			goto out_put_disk;
+		}
+	}
+
+	if (!for_part && (mode & FMODE_EXCL)) {
+		WARN_ON_ONCE(!holder);
+		if (whole)
+			claiming = whole;
+		else
+			claiming = bdev;
+		ret = bd_prepare_to_claim(bdev, claiming, holder);
+		if (ret)
+			goto out_put_whole;
+	}
 
 	disk_block_events(disk);
 	mutex_lock_nested(&bdev->bd_mutex, for_part);
 	if (!bdev->bd_openers) {
 		first_open = true;
 		bdev->bd_disk = disk;
-		bdev->bd_queue = disk->queue;
 		bdev->bd_contains = bdev;
 		bdev->bd_partno = partno;
 
14921509 ret = 0;
14931510 if (disk->fops->open) {
14941511 ret = disk->fops->open(bdev, mode);
1495
- if (ret == -ERESTARTSYS) {
1496
- /* Lost a race with 'disk' being
1497
- * deleted, try again.
1498
- * See md.c
1499
- */
1500
- disk_put_part(bdev->bd_part);
1501
- bdev->bd_part = NULL;
1502
- bdev->bd_disk = NULL;
1503
- bdev->bd_queue = NULL;
1504
- mutex_unlock(&bdev->bd_mutex);
1505
- disk_unblock_events(disk);
1506
- put_disk_and_module(disk);
1507
- goto restart;
1508
- }
1512
+ /*
1513
+ * If we lost a race with 'disk' being deleted,
1514
+ * try again. See md.c
1515
+ */
1516
+ if (ret == -ERESTARTSYS)
1517
+ need_restart = true;
15091518 }
15101519
15111520 if (!ret) {
1512
- bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
1521
+ bd_set_nr_sectors(bdev, get_capacity(disk));
15131522 set_init_blocksize(bdev);
15141523 }
15151524
@@ -1519,32 +1528,25 @@
 		 * The latter is necessary to prevent ghost
 		 * partitions on a removed medium.
 		 */
-		if (bdev->bd_invalidated &&
+		if (test_bit(GD_NEED_PART_SCAN, &disk->state) &&
 		    (!ret || ret == -ENOMEDIUM))
 			bdev_disk_changed(bdev, ret == -ENOMEDIUM);
 
 		if (ret)
 			goto out_clear;
 	} else {
-		struct block_device *whole;
-		whole = bdget_disk(disk, 0);
-		ret = -ENOMEM;
-		if (!whole)
-			goto out_clear;
 		BUG_ON(for_part);
-		ret = __blkdev_get(whole, mode, 1);
-		if (ret) {
-			bdput(whole);
+		ret = __blkdev_get(whole, mode, NULL, 1);
+		if (ret)
 			goto out_clear;
-		}
-		bdev->bd_contains = whole;
+		bdev->bd_contains = bdgrab(whole);
 		bdev->bd_part = disk_get_part(disk, partno);
 		if (!(disk->flags & GENHD_FL_UP) ||
 		    !bdev->bd_part || !bdev->bd_part->nr_sects) {
 			ret = -ENXIO;
 			goto out_clear;
 		}
-		bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
+		bd_set_nr_sectors(bdev, bdev->bd_part->nr_sects);
 		set_init_blocksize(bdev);
 	}
 
@@ -1556,7 +1558,7 @@
 		if (bdev->bd_disk->fops->open)
 			ret = bdev->bd_disk->fops->open(bdev, mode);
 		/* the same as first opener case, read comment there */
-		if (bdev->bd_invalidated &&
+		if (test_bit(GD_NEED_PART_SCAN, &disk->state) &&
 		    (!ret || ret == -ENOMEDIUM))
 			bdev_disk_changed(bdev, ret == -ENOMEDIUM);
 		if (ret)
@@ -1566,27 +1568,52 @@
 	bdev->bd_openers++;
 	if (for_part)
 		bdev->bd_part_count++;
+	if (claiming)
+		bd_finish_claiming(bdev, claiming, holder);
+
+	/*
+	 * Block event polling for write claims if requested. Any write holder
+	 * makes the write_holder state stick until all are released. This is
+	 * good enough and tracking individual writeable reference is too
+	 * fragile given the way @mode is used in blkdev_get/put().
+	 */
+	if (claiming && (mode & FMODE_WRITE) && !bdev->bd_write_holder &&
+	    (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) {
+		bdev->bd_write_holder = true;
+		unblock_events = false;
+	}
 	mutex_unlock(&bdev->bd_mutex);
-	disk_unblock_events(disk);
+
+	if (unblock_events)
+		disk_unblock_events(disk);
+
 	/* only one opener holds refs to the module and disk */
 	if (!first_open)
 		put_disk_and_module(disk);
+	if (whole)
+		bdput(whole);
 	return 0;
 
  out_clear:
 	disk_put_part(bdev->bd_part);
 	bdev->bd_disk = NULL;
 	bdev->bd_part = NULL;
-	bdev->bd_queue = NULL;
 	if (bdev != bdev->bd_contains)
 		__blkdev_put(bdev->bd_contains, mode, 1);
 	bdev->bd_contains = NULL;
  out_unlock_bdev:
+	if (claiming)
+		bd_abort_claiming(bdev, claiming, holder);
 	mutex_unlock(&bdev->bd_mutex);
 	disk_unblock_events(disk);
+ out_put_whole:
+	if (whole)
+		bdput(whole);
+ out_put_disk:
 	put_disk_and_module(disk);
+	if (need_restart)
+		goto restart;
 out:
-
 	return ret;
 }
 
@@ -1609,74 +1636,27 @@
  * RETURNS:
  * 0 on success, -errno on failure.
  */
-int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
+static int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
 {
-	struct block_device *whole = NULL;
-	int res;
+	int ret, perm = 0;
 
-	WARN_ON_ONCE((mode & FMODE_EXCL) && !holder);
+	if (mode & FMODE_READ)
+		perm |= MAY_READ;
+	if (mode & FMODE_WRITE)
+		perm |= MAY_WRITE;
+	ret = devcgroup_inode_permission(bdev->bd_inode, perm);
+	if (ret)
+		goto bdput;
 
-	if ((mode & FMODE_EXCL) && holder) {
-		whole = bd_start_claiming(bdev, holder);
-		if (IS_ERR(whole)) {
-			bdput(bdev);
-			return PTR_ERR(whole);
-		}
-	}
+	ret = __blkdev_get(bdev, mode, holder, 0);
+	if (ret)
+		goto bdput;
+	return 0;
 
-	res = __blkdev_get(bdev, mode, 0);
-
-	if (whole) {
-		struct gendisk *disk = whole->bd_disk;
-
-		/* finish claiming */
-		mutex_lock(&bdev->bd_mutex);
-		spin_lock(&bdev_lock);
-
-		if (!res) {
-			BUG_ON(!bd_may_claim(bdev, whole, holder));
-			/*
-			 * Note that for a whole device bd_holders
-			 * will be incremented twice, and bd_holder
-			 * will be set to bd_may_claim before being
-			 * set to holder
-			 */
-			whole->bd_holders++;
-			whole->bd_holder = bd_may_claim;
-			bdev->bd_holders++;
-			bdev->bd_holder = holder;
-		}
-
-		/* tell others that we're done */
-		BUG_ON(whole->bd_claiming != holder);
-		whole->bd_claiming = NULL;
-		wake_up_bit(&whole->bd_claiming, 0);
-
-		spin_unlock(&bdev_lock);
-
-		/*
-		 * Block event polling for write claims if requested. Any
-		 * write holder makes the write_holder state stick until
-		 * all are released. This is good enough and tracking
-		 * individual writeable reference is too fragile given the
-		 * way @mode is used in blkdev_get/put().
-		 */
-		if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder &&
-		    (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) {
-			bdev->bd_write_holder = true;
-			disk_block_events(disk);
-		}
-
-		mutex_unlock(&bdev->bd_mutex);
-		bdput(whole);
-	}
-
-	if (res)
-		bdput(bdev);
-
-	return res;
+bdput:
+	bdput(bdev);
+	return ret;
 }
-EXPORT_SYMBOL(blkdev_get);
 
 /**
  * blkdev_get_by_path - open a block device by name
@@ -1769,7 +1749,7 @@
 	 */
 	filp->f_flags |= O_LARGEFILE;
 
-	filp->f_mode |= FMODE_NOWAIT;
+	filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
 
 	if (filp->f_flags & O_NDELAY)
 		filp->f_mode |= FMODE_NDELAY;
@@ -1925,7 +1905,7 @@
 	if (bdev_read_only(I_BDEV(bd_inode)))
 		return -EPERM;
 
-	if (IS_SWAPFILE(bd_inode))
+	if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode->i_rdev))
 		return -ETXTBSY;
 
 	if (!iov_iter_count(from))
@@ -1999,13 +1979,14 @@
 
 static const struct address_space_operations def_blk_aops = {
 	.readpage	= blkdev_readpage,
-	.readpages	= blkdev_readpages,
+	.readahead	= blkdev_readahead,
 	.writepage	= blkdev_writepage,
 	.write_begin	= blkdev_write_begin,
 	.write_end	= blkdev_write_end,
 	.writepages	= blkdev_writepages,
 	.releasepage	= blkdev_releasepage,
 	.direct_IO	= blkdev_direct_IO,
+	.migratepage	= buffer_migrate_page_norefs,
 	.is_dirty_writeback = buffer_check_dirty_writeback,
 };
 
@@ -2017,7 +1998,6 @@
 		loff_t len)
 {
 	struct block_device *bdev = I_BDEV(bdev_file_inode(file));
-	struct address_space *mapping;
 	loff_t end = start + len - 1;
 	loff_t isize;
 	int error;
@@ -2045,8 +2025,9 @@
 		return -EINVAL;
 
 	/* Invalidate the page cache, including dirty pages. */
-	mapping = bdev->bd_inode->i_mapping;
-	truncate_inode_pages_range(mapping, start, end);
+	error = truncate_bdev_range(bdev, file->f_mode, start, end);
+	if (error)
+		return error;
 
 	switch (mode) {
 	case FALLOC_FL_ZERO_RANGE:
@@ -2073,7 +2054,7 @@
 	 * the caller will be given -EBUSY. The third argument is
 	 * inclusive, so the rounding here is safe.
 	 */
-	return invalidate_inode_pages2_range(mapping,
+	return invalidate_inode_pages2_range(bdev->bd_inode->i_mapping,
 					     start >> PAGE_SHIFT,
 					     end >> PAGE_SHIFT);
 }
@@ -2084,6 +2065,7 @@
 	.llseek		= block_llseek,
 	.read_iter	= blkdev_read_iter,
 	.write_iter	= blkdev_write_iter,
+	.iopoll		= blkdev_iopoll,
 	.mmap		= generic_file_mmap,
 	.fsync		= blkdev_fsync,
 	.unlocked_ioctl	= block_ioctl,
@@ -2094,18 +2076,6 @@
 	.splice_write	= iter_file_splice_write,
 	.fallocate	= blkdev_fallocate,
 };
-
-int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)
-{
-	int res;
-	mm_segment_t old_fs = get_fs();
-	set_fs(KERNEL_DS);
-	res = blkdev_ioctl(bdev, 0, cmd, arg);
-	set_fs(old_fs);
-	return res;
-}
-
-EXPORT_SYMBOL(ioctl_by_bdev);
 
 /**
  * lookup_bdev - lookup a struct block_device by name