2024-05-13 9d77db3c730780c8ef5ccd4b66403ff5675cfe4e
kernel/block/genhd.c
....@@ -1,8 +1,10 @@
1
+// SPDX-License-Identifier: GPL-2.0
12 /*
23 * gendisk handling
34 */
45
56 #include <linux/module.h>
7
+#include <linux/ctype.h>
68 #include <linux/fs.h>
79 #include <linux/genhd.h>
810 #include <linux/kdev_t.h>
....@@ -25,7 +27,7 @@
2527 #include "blk.h"
2628
2729 static DEFINE_MUTEX(block_class_lock);
28
-struct kobject *block_depr;
30
+static struct kobject *block_depr;
2931
3032 /* for extended dynamic devt allocation, currently only one major is used */
3133 #define NR_EXT_DEVT (1 << MINORBITS)
....@@ -36,8 +38,6 @@
3638 static DEFINE_SPINLOCK(ext_devt_lock);
3739 static DEFINE_IDR(ext_devt_idr);
3840
39
-static const struct device_type disk_type;
40
-
4141 static void disk_check_events(struct disk_events *ev,
4242 unsigned int *clearing_ptr);
4343 static void disk_alloc_events(struct gendisk *disk);
....@@ -45,53 +45,102 @@
4545 static void disk_del_events(struct gendisk *disk);
4646 static void disk_release_events(struct gendisk *disk);
4747
48
-void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, int rw)
48
+/*
49
+ * Set the disk capacity and, if the size is neither currently zero
50
+ * nor being set to zero, notify user space of the change.
51
+ */
52
+bool set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size,
53
+ bool update_bdev)
4954 {
50
- if (q->mq_ops)
51
- return;
55
+ sector_t capacity = get_capacity(disk);
5256
53
- atomic_inc(&part->in_flight[rw]);
54
- if (part->partno)
55
- atomic_inc(&part_to_disk(part)->part0.in_flight[rw]);
56
-}
57
+ set_capacity(disk, size);
58
+ if (update_bdev)
59
+ revalidate_disk_size(disk, true);
5760
58
-void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, int rw)
59
-{
60
- if (q->mq_ops)
61
- return;
61
+ if (capacity != size && capacity != 0 && size != 0) {
62
+ char *envp[] = { "RESIZE=1", NULL };
6263
63
- atomic_dec(&part->in_flight[rw]);
64
- if (part->partno)
65
- atomic_dec(&part_to_disk(part)->part0.in_flight[rw]);
66
-}
67
-
68
-void part_in_flight(struct request_queue *q, struct hd_struct *part,
69
- unsigned int inflight[2])
70
-{
71
- if (q->mq_ops) {
72
- blk_mq_in_flight(q, part, inflight);
73
- return;
64
+ kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp);
65
+ return true;
7466 }
7567
76
- inflight[0] = atomic_read(&part->in_flight[0]) +
77
- atomic_read(&part->in_flight[1]);
78
- if (part->partno) {
79
- part = &part_to_disk(part)->part0;
80
- inflight[1] = atomic_read(&part->in_flight[0]) +
81
- atomic_read(&part->in_flight[1]);
68
+ return false;
69
+}
70
+
71
+EXPORT_SYMBOL_GPL(set_capacity_revalidate_and_notify);
72
+
73
+/*
74
+ * Format the device name of the indicated disk into the supplied buffer and
75
+ * return a pointer to that same buffer for convenience.
76
+ */
77
+char *disk_name(struct gendisk *hd, int partno, char *buf)
78
+{
79
+ if (!partno)
80
+ snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name);
81
+ else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1]))
82
+ snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, partno);
83
+ else
84
+ snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, partno);
85
+
86
+ return buf;
87
+}
88
+
89
+const char *bdevname(struct block_device *bdev, char *buf)
90
+{
91
+ return disk_name(bdev->bd_disk, bdev->bd_partno, buf);
92
+}
93
+EXPORT_SYMBOL(bdevname);
94
+
95
+static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat)
96
+{
97
+ int cpu;
98
+
99
+ memset(stat, 0, sizeof(struct disk_stats));
100
+ for_each_possible_cpu(cpu) {
101
+ struct disk_stats *ptr = per_cpu_ptr(part->dkstats, cpu);
102
+ int group;
103
+
104
+ for (group = 0; group < NR_STAT_GROUPS; group++) {
105
+ stat->nsecs[group] += ptr->nsecs[group];
106
+ stat->sectors[group] += ptr->sectors[group];
107
+ stat->ios[group] += ptr->ios[group];
108
+ stat->merges[group] += ptr->merges[group];
109
+ }
110
+
111
+ stat->io_ticks += ptr->io_ticks;
82112 }
83113 }
84114
85
-void part_in_flight_rw(struct request_queue *q, struct hd_struct *part,
86
- unsigned int inflight[2])
115
+static unsigned int part_in_flight(struct hd_struct *part)
87116 {
88
- if (q->mq_ops) {
89
- blk_mq_in_flight_rw(q, part, inflight);
90
- return;
91
- }
117
+ unsigned int inflight = 0;
118
+ int cpu;
92119
93
- inflight[0] = atomic_read(&part->in_flight[0]);
94
- inflight[1] = atomic_read(&part->in_flight[1]);
120
+ for_each_possible_cpu(cpu) {
121
+ inflight += part_stat_local_read_cpu(part, in_flight[0], cpu) +
122
+ part_stat_local_read_cpu(part, in_flight[1], cpu);
123
+ }
124
+ if ((int)inflight < 0)
125
+ inflight = 0;
126
+
127
+ return inflight;
128
+}
129
+
130
+static void part_in_flight_rw(struct hd_struct *part, unsigned int inflight[2])
131
+{
132
+ int cpu;
133
+
134
+ inflight[0] = 0;
135
+ inflight[1] = 0;
136
+ for_each_possible_cpu(cpu) {
137
+ inflight[0] += part_stat_local_read_cpu(part, in_flight[0], cpu);
138
+ inflight[1] += part_stat_local_read_cpu(part, in_flight[1], cpu);
139
+ }
140
+ if ((int)inflight[0] < 0)
141
+ inflight[0] = 0;
142
+ if ((int)inflight[1] < 0)
143
+ inflight[1] = 0;
95144 }
96145
97146 struct hd_struct *__disk_get_part(struct gendisk *disk, int partno)
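
For orientation, a minimal sketch of how a driver's resize path might call the helper added above; the mydrv_* name is hypothetical, and only set_capacity_revalidate_and_notify() itself, with its update_bdev argument, comes from this change:

#include <linux/genhd.h>

/* Hypothetical resize handler: everything except the helper call is
 * illustrative. */
static void mydrv_resize(struct gendisk *disk, sector_t new_sectors)
{
	/*
	 * Update the gendisk capacity (and, with update_bdev == true, the
	 * cached block_device size); a RESIZE=1 uevent is emitted only when
	 * the capacity changed between two non-zero values.
	 */
	if (set_capacity_revalidate_and_notify(disk, new_sectors, true))
		pr_info("%s: resized to %llu sectors\n", disk->disk_name,
			(unsigned long long)new_sectors);
}
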
....@@ -129,7 +178,6 @@
129178
130179 return part;
131180 }
132
-EXPORT_SYMBOL_GPL(disk_get_part);
133181
134182 /**
135183 * disk_part_iter_init - initialize partition iterator
....@@ -260,11 +308,13 @@
260308 * primarily used for stats accounting.
261309 *
262310 * CONTEXT:
263
- * RCU read locked. The returned partition pointer is valid only
264
- * while preemption is disabled.
311
+ * RCU read locked. The returned partition pointer is always valid
312
+ * because its refcount is grabbed, except for part0, whose lifetime
313
+ * is the same as the disk's.
265314 *
266315 * RETURNS:
267316 * Found partition on success, part0 is returned if no partition matches
317
+ * or the matched partition is being deleted.
268318 */
269319 struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector)
270320 {
....@@ -272,23 +322,70 @@
272322 struct hd_struct *part;
273323 int i;
274324
325
+ rcu_read_lock();
275326 ptbl = rcu_dereference(disk->part_tbl);
276327
277328 part = rcu_dereference(ptbl->last_lookup);
278
- if (part && sector_in_part(part, sector))
279
- return part;
329
+ if (part && sector_in_part(part, sector) && hd_struct_try_get(part))
330
+ goto out_unlock;
280331
281332 for (i = 1; i < ptbl->len; i++) {
282333 part = rcu_dereference(ptbl->part[i]);
283334
284335 if (part && sector_in_part(part, sector)) {
336
+ /*
337
+ * only a live partition can be cached for lookup,
338
+ * so a use-after-free on a cached partition that is
339
+ * being deleted is avoided
340
+ */
341
+ if (!hd_struct_try_get(part))
342
+ break;
285343 rcu_assign_pointer(ptbl->last_lookup, part);
286
- return part;
344
+ goto out_unlock;
287345 }
288346 }
289
- return &disk->part0;
347
+
348
+ part = &disk->part0;
349
+out_unlock:
350
+ rcu_read_unlock();
351
+ return part;
290352 }
291
-EXPORT_SYMBOL_GPL(disk_map_sector_rcu);
353
+
354
+/**
355
+ * disk_has_partitions - check whether a disk has any valid partition
356
+ * @disk: gendisk of interest
357
+ *
358
+ * Walk through the partition table and check whether a valid partition exists.
359
+ *
360
+ * CONTEXT:
361
+ * Don't care.
362
+ *
363
+ * RETURNS:
364
+ * True if the gendisk has at least one valid non-zero size partition.
365
+ * Otherwise false.
366
+ */
367
+bool disk_has_partitions(struct gendisk *disk)
368
+{
369
+ struct disk_part_tbl *ptbl;
370
+ int i;
371
+ bool ret = false;
372
+
373
+ rcu_read_lock();
374
+ ptbl = rcu_dereference(disk->part_tbl);
375
+
376
+ /* Iterate partitions skipping the whole device at index 0 */
377
+ for (i = 1; i < ptbl->len; i++) {
378
+ if (rcu_dereference(ptbl->part[i])) {
379
+ ret = true;
380
+ break;
381
+ }
382
+ }
383
+
384
+ rcu_read_unlock();
385
+
386
+ return ret;
387
+}
388
+EXPORT_SYMBOL_GPL(disk_has_partitions);
292389
293390 /*
294391 * Can be deleted altogether. Later.
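
A hedged sketch of a possible caller of the disk_has_partitions() helper added above, for instance to refuse a destructive reconfiguration while partitions are still present; the surrounding function and error code are illustrative rather than taken from this patch:

#include <linux/genhd.h>

/* Illustrative only. */
static int mydrv_reconfigure(struct gendisk *disk)
{
	/* True when any slot past part0 in the partition table is populated. */
	if (disk_has_partitions(disk))
		return -EBUSY;

	/* ... safe to proceed, no partitions are registered ... */
	return 0;
}
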
....@@ -355,8 +452,8 @@
355452 }
356453
357454 if (index == 0) {
358
- printk("register_blkdev: failed to get major for %s\n",
359
- name);
455
+ printk("%s: failed to get major for %s\n",
456
+ __func__, name);
360457 ret = -EBUSY;
361458 goto out;
362459 }
....@@ -365,8 +462,8 @@
365462 }
366463
367464 if (major >= BLKDEV_MAJOR_MAX) {
368
- pr_err("register_blkdev: major requested (%u) is greater than the maximum (%u) for %s\n",
369
- major, BLKDEV_MAJOR_MAX-1, name);
465
+ pr_err("%s: major requested (%u) is greater than the maximum (%u) for %s\n",
466
+ __func__, major, BLKDEV_MAJOR_MAX-1, name);
370467
371468 ret = -EINVAL;
372469 goto out;
....@@ -521,8 +618,8 @@
521618 }
522619 }
523620
524
-/**
525
- * We invalidate devt by assigning NULL pointer for devt in idr.
621
+/*
622
+ * We invalidate devt by assigning NULL pointer for devt in idr.
526623 */
527624 void blk_invalidate_devt(dev_t devt)
528625 {
....@@ -582,10 +679,23 @@
582679 return 0;
583680 }
584681
585
-static void register_disk(struct device *parent, struct gendisk *disk)
682
+static void disk_scan_partitions(struct gendisk *disk)
683
+{
684
+ struct block_device *bdev;
685
+
686
+ if (!get_capacity(disk) || !disk_part_scan_enabled(disk))
687
+ return;
688
+
689
+ set_bit(GD_NEED_PART_SCAN, &disk->state);
690
+ bdev = blkdev_get_by_dev(disk_devt(disk), FMODE_READ, NULL);
691
+ if (!IS_ERR(bdev))
692
+ blkdev_put(bdev, FMODE_READ);
693
+}
694
+
695
+static void register_disk(struct device *parent, struct gendisk *disk,
696
+ const struct attribute_group **groups)
586697 {
587698 struct device *ddev = disk_to_dev(disk);
588
- struct block_device *bdev;
589699 struct disk_part_iter piter;
590700 struct hd_struct *part;
591701 int err;
....@@ -597,6 +707,10 @@
597707 /* delay uevents, until we scanned partition table */
598708 dev_set_uevent_suppress(ddev, 1);
599709
710
+ if (groups) {
711
+ WARN_ON(ddev->groups);
712
+ ddev->groups = groups;
713
+ }
600714 if (device_add(ddev))
601715 return;
602716 if (!sysfs_deprecated) {
....@@ -621,25 +735,8 @@
621735 if (disk->flags & GENHD_FL_HIDDEN)
622736 return;
623737
624
- /* No minors to use for partitions */
625
- if (!disk_part_scan_enabled(disk))
626
- goto exit;
738
+ disk_scan_partitions(disk);
627739
628
- /* No such device (e.g., media were just removed) */
629
- if (!get_capacity(disk))
630
- goto exit;
631
-
632
- bdev = bdget_disk(disk, 0);
633
- if (!bdev)
634
- goto exit;
635
-
636
- bdev->bd_invalidated = 1;
637
- err = blkdev_get(bdev, FMODE_READ, NULL);
638
- if (err < 0)
639
- goto exit;
640
- blkdev_put(bdev, FMODE_READ);
641
-
642
-exit:
643740 /* announce disk after possible partitions are created */
644741 dev_set_uevent_suppress(ddev, 0);
645742 kobject_uevent(&ddev->kobj, KOBJ_ADD);
....@@ -662,6 +759,7 @@
662759 * __device_add_disk - add disk information to kernel list
663760 * @parent: parent device for the disk
664761 * @disk: per-device partitioning information
762
+ * @groups: Additional per-device sysfs groups
665763 * @register_queue: register the queue if set to true
666764 *
667765 * This function registers the partitioning information in @disk
....@@ -670,10 +768,20 @@
670768 * FIXME: error handling
671769 */
672770 static void __device_add_disk(struct device *parent, struct gendisk *disk,
771
+ const struct attribute_group **groups,
673772 bool register_queue)
674773 {
675774 dev_t devt;
676775 int retval;
776
+
777
+ /*
778
+ * The disk queue should now be all set with enough information about
779
+ * the device for the elevator code to pick an adequate default
780
+ * elevator if one is needed, that is, for devices requesting queue
781
+ * registration.
782
+ */
783
+ if (register_queue)
784
+ elevator_init_mq(disk->queue);
677785
678786 /* minors == 0 indicates to use ext devt from part0 and should
679787 * be accompanied with EXT_DEVT flag. Make sure all
....@@ -703,17 +811,19 @@
703811 disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO;
704812 disk->flags |= GENHD_FL_NO_PART_SCAN;
705813 } else {
814
+ struct backing_dev_info *bdi = disk->queue->backing_dev_info;
815
+ struct device *dev = disk_to_dev(disk);
706816 int ret;
707817
708818 /* Register BDI before referencing it from bdev */
709
- disk_to_dev(disk)->devt = devt;
710
- ret = bdi_register_owner(disk->queue->backing_dev_info,
711
- disk_to_dev(disk));
819
+ dev->devt = devt;
820
+ ret = bdi_register(bdi, "%u:%u", MAJOR(devt), MINOR(devt));
712821 WARN_ON(ret);
822
+ bdi_set_owner(bdi, dev);
713823 blk_register_region(disk_devt(disk), disk->minors, NULL,
714824 exact_match, exact_lock, disk);
715825 }
716
- register_disk(parent, disk);
826
+ register_disk(parent, disk, groups);
717827 if (register_queue)
718828 blk_register_queue(disk);
719829
....@@ -727,22 +837,64 @@
727837 blk_integrity_add(disk);
728838 }
729839
730
-void device_add_disk(struct device *parent, struct gendisk *disk)
840
+void device_add_disk(struct device *parent, struct gendisk *disk,
841
+ const struct attribute_group **groups)
842
+
731843 {
732
- __device_add_disk(parent, disk, true);
844
+ __device_add_disk(parent, disk, groups, true);
733845 }
734846 EXPORT_SYMBOL(device_add_disk);
735847
736848 void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk)
737849 {
738
- __device_add_disk(parent, disk, false);
850
+ __device_add_disk(parent, disk, NULL, false);
739851 }
740852 EXPORT_SYMBOL(device_add_disk_no_queue_reg);
741853
854
+static void invalidate_partition(struct gendisk *disk, int partno)
855
+{
856
+ struct block_device *bdev;
857
+
858
+ bdev = bdget_disk(disk, partno);
859
+ if (!bdev)
860
+ return;
861
+
862
+ fsync_bdev(bdev);
863
+ __invalidate_device(bdev, true);
864
+
865
+ /*
866
+ * Unhash the bdev inode for this device so that it gets evicted as soon
867
+ * as the last inode reference is dropped.
868
+ */
869
+ remove_inode_hash(bdev->bd_inode);
870
+ bdput(bdev);
871
+}
872
+
873
+/**
874
+ * del_gendisk - remove the gendisk
875
+ * @disk: the struct gendisk to remove
876
+ *
877
+ * Removes the gendisk and all its associated resources. This deletes the
878
+ * partitions associated with the gendisk, and unregisters the associated
879
+ * request_queue.
880
+ *
881
+ * This is the counterpart to the respective __device_add_disk() call.
882
+ *
883
+ * The final removal of the struct gendisk happens when its refcount reaches 0
884
+ * with put_disk(), which should be called after del_gendisk() if
885
+ * __device_add_disk() was used.
886
+ *
887
+ * Drivers exist which depend on the release of the gendisk being synchronous;
888
+ * it should not be deferred.
889
+ *
890
+ * Context: can sleep
891
+ */
742892 void del_gendisk(struct gendisk *disk)
743893 {
744894 struct disk_part_iter piter;
745895 struct hd_struct *part;
896
+
897
+ might_sleep();
746898
747899 blk_integrity_del(disk);
748900 disk_del_events(disk);
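
Because device_add_disk() now accepts an attribute-group array and del_gendisk()/put_disk() are documented above as the teardown pair, a driver's registration and removal paths look roughly like the following sketch; every mydrv_* name is hypothetical:

#include <linux/blkdev.h>
#include <linux/device.h>
#include <linux/genhd.h>

static ssize_t mydrv_feature_show(struct device *dev,
				  struct device_attribute *attr, char *buf)
{
	return sprintf(buf, "1\n");		/* illustrative attribute */
}
static DEVICE_ATTR_RO(mydrv_feature);

static struct attribute *mydrv_disk_attrs[] = {
	&dev_attr_mydrv_feature.attr,
	NULL,
};

static const struct attribute_group mydrv_disk_attr_group = {
	.attrs = mydrv_disk_attrs,
};

static const struct attribute_group *mydrv_disk_attr_groups[] = {
	&mydrv_disk_attr_group,
	NULL,
};

static void mydrv_register(struct device *parent, struct gendisk *disk)
{
	/* The sysfs groups are created together with the disk device. */
	device_add_disk(parent, disk, mydrv_disk_attr_groups);
}

static void mydrv_unregister(struct gendisk *disk)
{
	del_gendisk(disk);	/* unregister partitions, events and queue */
	put_disk(disk);		/* drop the final gendisk reference */
}
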
....@@ -757,13 +909,11 @@
757909 DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
758910 while ((part = disk_part_iter_next(&piter))) {
759911 invalidate_partition(disk, part->partno);
760
- bdev_unhash_inode(part_devt(part));
761
- delete_partition(disk, part->partno);
912
+ delete_partition(part);
762913 }
763914 disk_part_iter_exit(&piter);
764915
765916 invalidate_partition(disk, 0);
766
- bdev_unhash_inode(disk_devt(disk));
767917 set_capacity(disk, 0);
768918 disk->flags &= ~GENHD_FL_UP;
769919 up_write(&disk->lookup_sem);
....@@ -836,10 +986,14 @@
836986 *
837987 * This function gets the structure containing partitioning
838988 * information for the given device @devt.
989
+ *
990
+ * Context: can sleep
839991 */
840992 struct gendisk *get_gendisk(dev_t devt, int *partno)
841993 {
842994 struct gendisk *disk = NULL;
995
+
996
+ might_sleep();
843997
844998 if (MAJOR(devt) != BLOCK_EXT_MAJOR) {
845999 struct kobject *kobj;
....@@ -877,7 +1031,6 @@
8771031 }
8781032 return disk;
8791033 }
880
-EXPORT_SYMBOL(get_gendisk);
8811034
8821035 /**
8831036 * bdget_disk - do bdget() by gendisk and partition number
....@@ -899,7 +1052,7 @@
8991052
9001053 part = disk_get_part(disk, partno);
9011054 if (part)
902
- bdev = bdget(part_devt(part));
1055
+ bdev = bdget_part(part);
9031056 disk_put_part(part);
9041057
9051058 return bdev;
....@@ -1123,6 +1276,74 @@
11231276 return sprintf(buf, "%d\n", get_disk_ro(disk) ? 1 : 0);
11241277 }
11251278
1279
+ssize_t part_size_show(struct device *dev,
1280
+ struct device_attribute *attr, char *buf)
1281
+{
1282
+ struct hd_struct *p = dev_to_part(dev);
1283
+
1284
+ return sprintf(buf, "%llu\n",
1285
+ (unsigned long long)part_nr_sects_read(p));
1286
+}
1287
+
1288
+ssize_t part_stat_show(struct device *dev,
1289
+ struct device_attribute *attr, char *buf)
1290
+{
1291
+ struct hd_struct *p = dev_to_part(dev);
1292
+ struct request_queue *q = part_to_disk(p)->queue;
1293
+ struct disk_stats stat;
1294
+ unsigned int inflight;
1295
+
1296
+ part_stat_read_all(p, &stat);
1297
+ if (queue_is_mq(q))
1298
+ inflight = blk_mq_in_flight(q, p);
1299
+ else
1300
+ inflight = part_in_flight(p);
1301
+
1302
+ return sprintf(buf,
1303
+ "%8lu %8lu %8llu %8u "
1304
+ "%8lu %8lu %8llu %8u "
1305
+ "%8u %8u %8u "
1306
+ "%8lu %8lu %8llu %8u "
1307
+ "%8lu %8u"
1308
+ "\n",
1309
+ stat.ios[STAT_READ],
1310
+ stat.merges[STAT_READ],
1311
+ (unsigned long long)stat.sectors[STAT_READ],
1312
+ (unsigned int)div_u64(stat.nsecs[STAT_READ], NSEC_PER_MSEC),
1313
+ stat.ios[STAT_WRITE],
1314
+ stat.merges[STAT_WRITE],
1315
+ (unsigned long long)stat.sectors[STAT_WRITE],
1316
+ (unsigned int)div_u64(stat.nsecs[STAT_WRITE], NSEC_PER_MSEC),
1317
+ inflight,
1318
+ jiffies_to_msecs(stat.io_ticks),
1319
+ (unsigned int)div_u64(stat.nsecs[STAT_READ] +
1320
+ stat.nsecs[STAT_WRITE] +
1321
+ stat.nsecs[STAT_DISCARD] +
1322
+ stat.nsecs[STAT_FLUSH],
1323
+ NSEC_PER_MSEC),
1324
+ stat.ios[STAT_DISCARD],
1325
+ stat.merges[STAT_DISCARD],
1326
+ (unsigned long long)stat.sectors[STAT_DISCARD],
1327
+ (unsigned int)div_u64(stat.nsecs[STAT_DISCARD], NSEC_PER_MSEC),
1328
+ stat.ios[STAT_FLUSH],
1329
+ (unsigned int)div_u64(stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC));
1330
+}
1331
+
1332
+ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr,
1333
+ char *buf)
1334
+{
1335
+ struct hd_struct *p = dev_to_part(dev);
1336
+ struct request_queue *q = part_to_disk(p)->queue;
1337
+ unsigned int inflight[2];
1338
+
1339
+ if (queue_is_mq(q))
1340
+ blk_mq_in_flight_rw(q, p, inflight);
1341
+ else
1342
+ part_in_flight_rw(p, inflight);
1343
+
1344
+ return sprintf(buf, "%8u %8u\n", inflight[0], inflight[1]);
1345
+}
1346
+
11261347 static ssize_t disk_capability_show(struct device *dev,
11271348 struct device_attribute *attr, char *buf)
11281349 {
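
For orientation, the per-CPU counters that part_stat_show() and part_inflight_show() sum up above are filled in elsewhere in the block core; a simplified, hedged sketch of that update side follows. The part_stat_* helpers come from the block stats header (linux/genhd.h or linux/part_stat.h depending on kernel version), the function itself is illustrative, and the real accounting lives in block/blk-core.c:

/* Simplified illustration of how the counters summed above are updated
 * when a read completes. */
static void mydrv_account_read_done(struct hd_struct *part,
				    unsigned int sectors, u64 duration_ns)
{
	part_stat_lock();
	part_stat_inc(part, ios[STAT_READ]);
	part_stat_add(part, sectors[STAT_READ], sectors);
	part_stat_add(part, nsecs[STAT_READ], duration_ns);
	part_stat_unlock();
}
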
....@@ -1161,10 +1382,33 @@
11611382 static DEVICE_ATTR(stat, 0444, part_stat_show, NULL);
11621383 static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL);
11631384 static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store);
1385
+
11641386 #ifdef CONFIG_FAIL_MAKE_REQUEST
1387
+ssize_t part_fail_show(struct device *dev,
1388
+ struct device_attribute *attr, char *buf)
1389
+{
1390
+ struct hd_struct *p = dev_to_part(dev);
1391
+
1392
+ return sprintf(buf, "%d\n", p->make_it_fail);
1393
+}
1394
+
1395
+ssize_t part_fail_store(struct device *dev,
1396
+ struct device_attribute *attr,
1397
+ const char *buf, size_t count)
1398
+{
1399
+ struct hd_struct *p = dev_to_part(dev);
1400
+ int i;
1401
+
1402
+ if (count > 0 && sscanf(buf, "%d", &i) > 0)
1403
+ p->make_it_fail = (i == 0) ? 0 : 1;
1404
+
1405
+ return count;
1406
+}
1407
+
11651408 static struct device_attribute dev_attr_fail =
11661409 __ATTR(make-it-fail, 0644, part_fail_show, part_fail_store);
1167
-#endif
1410
+#endif /* CONFIG_FAIL_MAKE_REQUEST */
1411
+
11681412 #ifdef CONFIG_FAIL_IO_TIMEOUT
11691413 static struct device_attribute dev_attr_fail_timeout =
11701414 __ATTR(io-timeout-fail, 0644, part_timeout_show, part_timeout_store);
....@@ -1259,7 +1503,6 @@
12591503 struct disk_part_tbl *new_ptbl;
12601504 int len = old_ptbl ? old_ptbl->len : 0;
12611505 int i, target;
1262
- size_t size;
12631506
12641507 /*
12651508 * check for int overflow, since we can get here from blkpg_ioctl()
....@@ -1276,8 +1519,8 @@
12761519 if (target <= len)
12771520 return 0;
12781521
1279
- size = sizeof(*new_ptbl) + target * sizeof(new_ptbl->part[0]);
1280
- new_ptbl = kzalloc_node(size, GFP_KERNEL, disk->node_id);
1522
+ new_ptbl = kzalloc_node(struct_size(new_ptbl, part, target), GFP_KERNEL,
1523
+ disk->node_id);
12811524 if (!new_ptbl)
12821525 return -ENOMEM;
12831526
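
The kzalloc_node() call above replaces the open-coded size computation with struct_size(); a small self-contained sketch of the same pattern on an illustrative structure with a trailing flexible array member:

#include <linux/overflow.h>
#include <linux/slab.h>

/* Illustrative structure mirroring disk_part_tbl's flexible array layout. */
struct example_tbl {
	int len;
	void *slot[];
};

static struct example_tbl *example_alloc(int target, int node)
{
	struct example_tbl *tbl;

	/*
	 * struct_size(tbl, slot, target) evaluates to
	 * sizeof(*tbl) + target * sizeof(tbl->slot[0]), with the
	 * multiplication and addition checked for overflow (the result
	 * saturates to SIZE_MAX so the allocation fails instead of
	 * silently wrapping).
	 */
	tbl = kzalloc_node(struct_size(tbl, slot, target), GFP_KERNEL, node);
	if (tbl)
		tbl->len = target;
	return tbl;
}
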
....@@ -1290,9 +1533,30 @@
12901533 return 0;
12911534 }
12921535
1536
+/**
1537
+ * disk_release - releases all allocated resources of the gendisk
1538
+ * @dev: the device representing this disk
1539
+ *
1540
+ * This function releases all allocated resources of the gendisk.
1541
+ *
1542
+ * The struct gendisk refcount is incremented with get_gendisk() or
1543
+ * get_disk_and_module(), and its refcount is decremented with
1544
+ * put_disk_and_module() or put_disk(). Once the refcount reaches 0 this
1545
+ * function is called.
1546
+ *
1547
+ * Drivers which used __device_add_disk() have a gendisk with a request_queue
1548
+ * assigned. Since the request_queue sits on top of the gendisk for these
1549
+ * drivers we also call blk_put_queue() for them, and we expect the
1550
+ * request_queue refcount to reach 0 at this point, and so the request_queue
1551
+ * will also be freed prior to the disk.
1552
+ *
1553
+ * Context: can sleep
1554
+ */
12931555 static void disk_release(struct device *dev)
12941556 {
12951557 struct gendisk *disk = dev_to_disk(dev);
1558
+
1559
+ might_sleep();
12961560
12971561 blk_free_devt(dev->devt);
12981562 disk_release_events(disk);
....@@ -1312,12 +1576,12 @@
13121576 {
13131577 struct gendisk *disk = dev_to_disk(dev);
13141578
1315
- if (disk->devnode)
1316
- return disk->devnode(disk, mode);
1579
+ if (disk->fops->devnode)
1580
+ return disk->fops->devnode(disk, mode);
13171581 return NULL;
13181582 }
13191583
1320
-static const struct device_type disk_type = {
1584
+const struct device_type disk_type = {
13211585 .name = "disk",
13221586 .groups = disk_attr_groups,
13231587 .release = disk_release,
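
disk_devnode() now resolves the callback through disk->fops instead of a field on the gendisk; a hedged sketch of how a driver would supply it in its block_device_operations — the mydrv names and the /dev/mydrv/ layout are hypothetical:

#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/slab.h>

/* Place the device node under /dev/mydrv/ with restrictive permissions. */
static char *mydrv_devnode(struct gendisk *disk, umode_t *mode)
{
	if (mode)
		*mode = 0600;
	return kasprintf(GFP_KERNEL, "mydrv/%s", disk->disk_name);
}

static const struct block_device_operations mydrv_fops = {
	.owner		= THIS_MODULE,
	.devnode	= mydrv_devnode,
};
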
....@@ -1338,8 +1602,8 @@
13381602 struct disk_part_iter piter;
13391603 struct hd_struct *hd;
13401604 char buf[BDEVNAME_SIZE];
1341
- unsigned int inflight[2];
1342
- int cpu;
1605
+ unsigned int inflight;
1606
+ struct disk_stats stat;
13431607
13441608 /*
13451609 if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next)
....@@ -1351,32 +1615,46 @@
13511615
13521616 disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0);
13531617 while ((hd = disk_part_iter_next(&piter))) {
1354
- cpu = part_stat_lock();
1355
- part_round_stats(gp->queue, cpu, hd);
1356
- part_stat_unlock();
1357
- part_in_flight(gp->queue, hd, inflight);
1618
+ part_stat_read_all(hd, &stat);
1619
+ if (queue_is_mq(gp->queue))
1620
+ inflight = blk_mq_in_flight(gp->queue, hd);
1621
+ else
1622
+ inflight = part_in_flight(hd);
1623
+
13581624 seq_printf(seqf, "%4d %7d %s "
13591625 "%lu %lu %lu %u "
13601626 "%lu %lu %lu %u "
13611627 "%u %u %u "
1362
- "%lu %lu %lu %u\n",
1628
+ "%lu %lu %lu %u "
1629
+ "%lu %u"
1630
+ "\n",
13631631 MAJOR(part_devt(hd)), MINOR(part_devt(hd)),
13641632 disk_name(gp, hd->partno, buf),
1365
- part_stat_read(hd, ios[STAT_READ]),
1366
- part_stat_read(hd, merges[STAT_READ]),
1367
- part_stat_read(hd, sectors[STAT_READ]),
1368
- (unsigned int)part_stat_read_msecs(hd, STAT_READ),
1369
- part_stat_read(hd, ios[STAT_WRITE]),
1370
- part_stat_read(hd, merges[STAT_WRITE]),
1371
- part_stat_read(hd, sectors[STAT_WRITE]),
1372
- (unsigned int)part_stat_read_msecs(hd, STAT_WRITE),
1373
- inflight[0],
1374
- jiffies_to_msecs(part_stat_read(hd, io_ticks)),
1375
- jiffies_to_msecs(part_stat_read(hd, time_in_queue)),
1376
- part_stat_read(hd, ios[STAT_DISCARD]),
1377
- part_stat_read(hd, merges[STAT_DISCARD]),
1378
- part_stat_read(hd, sectors[STAT_DISCARD]),
1379
- (unsigned int)part_stat_read_msecs(hd, STAT_DISCARD)
1633
+ stat.ios[STAT_READ],
1634
+ stat.merges[STAT_READ],
1635
+ stat.sectors[STAT_READ],
1636
+ (unsigned int)div_u64(stat.nsecs[STAT_READ],
1637
+ NSEC_PER_MSEC),
1638
+ stat.ios[STAT_WRITE],
1639
+ stat.merges[STAT_WRITE],
1640
+ stat.sectors[STAT_WRITE],
1641
+ (unsigned int)div_u64(stat.nsecs[STAT_WRITE],
1642
+ NSEC_PER_MSEC),
1643
+ inflight,
1644
+ jiffies_to_msecs(stat.io_ticks),
1645
+ (unsigned int)div_u64(stat.nsecs[STAT_READ] +
1646
+ stat.nsecs[STAT_WRITE] +
1647
+ stat.nsecs[STAT_DISCARD] +
1648
+ stat.nsecs[STAT_FLUSH],
1649
+ NSEC_PER_MSEC),
1650
+ stat.ios[STAT_DISCARD],
1651
+ stat.merges[STAT_DISCARD],
1652
+ stat.sectors[STAT_DISCARD],
1653
+ (unsigned int)div_u64(stat.nsecs[STAT_DISCARD],
1654
+ NSEC_PER_MSEC),
1655
+ stat.ios[STAT_FLUSH],
1656
+ (unsigned int)div_u64(stat.nsecs[STAT_FLUSH],
1657
+ NSEC_PER_MSEC)
13801658 );
13811659 }
13821660 disk_part_iter_exit(&piter);
....@@ -1433,7 +1711,6 @@
14331711 class_dev_iter_exit(&iter);
14341712 return devt;
14351713 }
1436
-EXPORT_SYMBOL(blk_lookup_devt);
14371714
14381715 struct gendisk *__alloc_disk_node(int minors, int node_id)
14391716 {
....@@ -1448,47 +1725,60 @@
14481725 }
14491726
14501727 disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id);
1451
- if (disk) {
1452
- if (!init_part_stats(&disk->part0)) {
1453
- kfree(disk);
1454
- return NULL;
1455
- }
1456
- init_rwsem(&disk->lookup_sem);
1457
- disk->node_id = node_id;
1458
- if (disk_expand_part_tbl(disk, 0)) {
1459
- free_part_stats(&disk->part0);
1460
- kfree(disk);
1461
- return NULL;
1462
- }
1463
- ptbl = rcu_dereference_protected(disk->part_tbl, 1);
1464
- rcu_assign_pointer(ptbl->part[0], &disk->part0);
1728
+ if (!disk)
1729
+ return NULL;
14651730
1466
- /*
1467
- * set_capacity() and get_capacity() currently don't use
1468
- * seqcounter to read/update the part0->nr_sects. Still init
1469
- * the counter as we can read the sectors in IO submission
1470
- * patch using seqence counters.
1471
- *
1472
- * TODO: Ideally set_capacity() and get_capacity() should be
1473
- * converted to make use of bd_mutex and sequence counters.
1474
- */
1475
- seqcount_init(&disk->part0.nr_sects_seq);
1476
- if (hd_ref_init(&disk->part0)) {
1477
- hd_free_part(&disk->part0);
1478
- kfree(disk);
1479
- return NULL;
1480
- }
1731
+ disk->part0.dkstats = alloc_percpu(struct disk_stats);
1732
+ if (!disk->part0.dkstats)
1733
+ goto out_free_disk;
14811734
1482
- disk->minors = minors;
1483
- rand_initialize_disk(disk);
1484
- disk_to_dev(disk)->class = &block_class;
1485
- disk_to_dev(disk)->type = &disk_type;
1486
- device_initialize(disk_to_dev(disk));
1735
+ init_rwsem(&disk->lookup_sem);
1736
+ disk->node_id = node_id;
1737
+ if (disk_expand_part_tbl(disk, 0)) {
1738
+ free_percpu(disk->part0.dkstats);
1739
+ goto out_free_disk;
14871740 }
1741
+
1742
+ ptbl = rcu_dereference_protected(disk->part_tbl, 1);
1743
+ rcu_assign_pointer(ptbl->part[0], &disk->part0);
1744
+
1745
+ /*
1746
+ * set_capacity() and get_capacity() currently don't use
1747
+ * seqcounter to read/update the part0->nr_sects. Still init
1748
+ * the counter as we can read the sectors in the IO submission
1749
+ * path using sequence counters.
1750
+ *
1751
+ * TODO: Ideally set_capacity() and get_capacity() should be
1752
+ * converted to make use of bd_mutex and sequence counters.
1753
+ */
1754
+ hd_sects_seq_init(&disk->part0);
1755
+ if (hd_ref_init(&disk->part0))
1756
+ goto out_free_part0;
1757
+
1758
+ disk->minors = minors;
1759
+ rand_initialize_disk(disk);
1760
+ disk_to_dev(disk)->class = &block_class;
1761
+ disk_to_dev(disk)->type = &disk_type;
1762
+ device_initialize(disk_to_dev(disk));
14881763 return disk;
1764
+
1765
+out_free_part0:
1766
+ hd_free_part(&disk->part0);
1767
+out_free_disk:
1768
+ kfree(disk);
1769
+ return NULL;
14891770 }
14901771 EXPORT_SYMBOL(__alloc_disk_node);
14911772
1773
+/**
1774
+ * get_disk_and_module - increments the gendisk and gendisk fops module refcount
1775
+ * @disk: the struct gendisk to increment the refcount for
1776
+ *
1777
+ * This increments the refcount for the struct gendisk, and the gendisk's
1778
+ * fops module owner.
1779
+ *
1780
+ * Context: Any context.
1781
+ */
14921782 struct kobject *get_disk_and_module(struct gendisk *disk)
14931783 {
14941784 struct module *owner;
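
To pair with the refcount documentation added in this hunk, a rough sketch of the allocation side in a hypothetical driver, including the error path where the only reference is dropped with put_disk() before the disk was ever added; mydrv_hw_init(), mydrv_fops and the field values are all illustrative:

#include <linux/blkdev.h>
#include <linux/module.h>

static const struct block_device_operations mydrv_fops = {
	.owner = THIS_MODULE,
};

/* Hypothetical hardware setup, stubbed for the sketch. */
static int mydrv_hw_init(struct gendisk *disk)
{
	return 0;
}

static int mydrv_create_disk(struct request_queue *q, int major, int idx)
{
	struct gendisk *disk;

	disk = alloc_disk(1);			/* one minor, no partitions */
	if (!disk)
		return -ENOMEM;

	disk->major = major;
	disk->first_minor = idx;
	disk->fops = &mydrv_fops;
	disk->queue = q;
	snprintf(disk->disk_name, sizeof(disk->disk_name), "mydrv%d", idx);

	if (mydrv_hw_init(disk) < 0) {
		put_disk(disk);	/* drops the only reference, disk_release() frees it */
		return -EIO;
	}

	device_add_disk(NULL, disk, NULL);
	return 0;
}
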
....@@ -1509,6 +1799,16 @@
15091799 }
15101800 EXPORT_SYMBOL(get_disk_and_module);
15111801
1802
+/**
1803
+ * put_disk - decrements the gendisk refcount
1804
+ * @disk: the struct gendisk to decrement the refcount for
1805
+ *
1806
+ * This decrements the refcount for the struct gendisk. When this reaches 0
1807
+ * we'll have disk_release() called.
1808
+ *
1809
+ * Context: Any context, but the last reference must not be dropped from
1810
+ * atomic context.
1811
+ */
15121812 void put_disk(struct gendisk *disk)
15131813 {
15141814 if (disk)
....@@ -1516,9 +1816,15 @@
15161816 }
15171817 EXPORT_SYMBOL(put_disk);
15181818
1519
-/*
1819
+/**
1820
+ * put_disk_and_module - decrements the module and gendisk refcount
1821
+ * @disk: the struct gendisk to decrement the refcount for
1822
+ *
15201823 * This is a counterpart of get_disk_and_module() and thus also of
15211824 * get_gendisk().
1825
+ *
1826
+ * Context: Any context, but the last reference must not be dropped from
1827
+ * atomic context.
15221828 */
15231829 void put_disk_and_module(struct gendisk *disk)
15241830 {
....@@ -1575,20 +1881,6 @@
15751881
15761882 EXPORT_SYMBOL(bdev_read_only);
15771883
1578
-int invalidate_partition(struct gendisk *disk, int partno)
1579
-{
1580
- int res = 0;
1581
- struct block_device *bdev = bdget_disk(disk, partno);
1582
- if (bdev) {
1583
- fsync_bdev(bdev);
1584
- res = __invalidate_device(bdev, true);
1585
- bdput(bdev);
1586
- }
1587
- return res;
1588
-}
1589
-
1590
-EXPORT_SYMBOL(invalidate_partition);
1591
-
15921884 /*
15931885 * Disk events - monitor disk events like media change and eject request.
15941886 */
....@@ -1630,12 +1922,11 @@
16301922
16311923 /*
16321924 * If device-specific poll interval is set, always use it. If
1633
- * the default is being used, poll iff there are events which
1634
- * can't be monitored asynchronously.
1925
+ * the default is being used, poll if the POLL flag is set.
16351926 */
16361927 if (ev->poll_msecs >= 0)
16371928 intv_msecs = ev->poll_msecs;
1638
- else if (disk->events & ~disk->async_events)
1929
+ else if (disk->event_flags & DISK_EVENT_FLAG_POLL)
16391930 intv_msecs = disk_events_dfl_poll_msecs;
16401931
16411932 return msecs_to_jiffies(intv_msecs);
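
With the split between disk->events and the new disk->event_flags used above, a driver has to opt in to polling and uevent delivery explicitly; a hedged sketch in the style of a removable-media driver's probe path (the function is illustrative, the event and flag constants are the ones referenced in this file):

#include <linux/genhd.h>

static void mydrv_setup_events(struct gendisk *disk)
{
	/* Events this driver can report from its ->check_events() hook. */
	disk->events = DISK_EVENT_MEDIA_CHANGE | DISK_EVENT_EJECT_REQUEST;

	/*
	 * Poll for them at the default interval (the hardware gives no
	 * asynchronous notification) and forward them to user space as
	 * uevents.
	 */
	disk->event_flags = DISK_EVENT_FLAG_POLL | DISK_EVENT_FLAG_UEVENT;
}
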
....@@ -1760,20 +2051,14 @@
17602051 * CONTEXT:
17612052 * Might sleep.
17622053 */
1763
-unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask)
2054
+static unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask)
17642055 {
1765
- const struct block_device_operations *bdops = disk->fops;
17662056 struct disk_events *ev = disk->ev;
17672057 unsigned int pending;
17682058 unsigned int clearing = mask;
17692059
1770
- if (!ev) {
1771
- /* for drivers still using the old ->media_changed method */
1772
- if ((mask & DISK_EVENT_MEDIA_CHANGE) &&
1773
- bdops->media_changed && bdops->media_changed(disk))
1774
- return DISK_EVENT_MEDIA_CHANGE;
2060
+ if (!ev)
17752061 return 0;
1776
- }
17772062
17782063 disk_block_events(disk);
17792064
....@@ -1803,6 +2088,33 @@
18032088
18042089 return pending;
18052090 }
2091
+
2092
+/**
2093
+ * bdev_check_media_change - check if a removable media has been changed
2094
+ * @bdev: block device to check
2095
+ *
2096
+ * Check whether removable media has been changed, and attempt to free all
2097
+ * dentries and inodes and invalidate all block device page cache entries in
2098
+ * that case.
2099
+ *
2100
+ * Returns %true if the block device changed, or %false if not.
2101
+ */
2102
+bool bdev_check_media_change(struct block_device *bdev)
2103
+{
2104
+ unsigned int events;
2105
+
2106
+ events = disk_clear_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE |
2107
+ DISK_EVENT_EJECT_REQUEST);
2108
+ if (!(events & DISK_EVENT_MEDIA_CHANGE))
2109
+ return false;
2110
+
2111
+ if (__invalidate_device(bdev, true))
2112
+ pr_warn("VFS: busy inodes on changed media %s\n",
2113
+ bdev->bd_disk->disk_name);
2114
+ set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state);
2115
+ return true;
2116
+}
2117
+EXPORT_SYMBOL(bdev_check_media_change);
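
A hedged sketch of the intended caller of bdev_check_media_change(): a removable-media driver's ->open() method. mydrv_read_capacity() stands in for a hypothetical hardware query; only the helper itself and the GD_NEED_PART_SCAN behaviour come from this change:

#include <linux/blkdev.h>

/* Hypothetical hardware query, stubbed for the sketch. */
static sector_t mydrv_read_capacity(struct gendisk *disk)
{
	return 0;
}

static int mydrv_open(struct block_device *bdev, fmode_t mode)
{
	struct gendisk *disk = bdev->bd_disk;

	/*
	 * On a media change, bdev_check_media_change() has already
	 * invalidated the page cache and set GD_NEED_PART_SCAN, so the
	 * block core rescans the partition table when the device is
	 * opened; the driver only has to refresh the capacity.
	 */
	if (bdev_check_media_change(bdev))
		set_capacity_revalidate_and_notify(disk,
						   mydrv_read_capacity(disk),
						   true);
	return 0;
}
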
18062118
18072119 /*
18082120 * Separate this part out so that a different pointer for clearing_ptr can be
....@@ -1845,11 +2157,13 @@
18452157
18462158 /*
18472159 * Tell userland about new events. Only the events listed in
1848
- * @disk->events are reported. Unlisted events are processed the
1849
- * same internally but never get reported to userland.
2160
+ * @disk->events are reported, and only if DISK_EVENT_FLAG_UEVENT
2161
+ * is set. Otherwise, events are processed internally but never
2162
+ * get reported to userland.
18502163 */
18512164 for (i = 0; i < ARRAY_SIZE(disk_uevents); i++)
1852
- if (events & disk->events & (1 << i))
2165
+ if ((events & disk->events & (1 << i)) &&
2166
+ (disk->event_flags & DISK_EVENT_FLAG_UEVENT))
18532167 envp[nr_events++] = disk_uevents[i];
18542168
18552169 if (nr_events)
....@@ -1862,6 +2176,7 @@
18622176 *
18632177 * events : list of all supported events
18642178 * events_async : list of events which can be detected w/o polling
2179
+ * (always empty, only for backwards compatibility)
18652180 * events_poll_msecs : polling interval, 0: disable, -1: system default
18662181 */
18672182 static ssize_t __disk_events_show(unsigned int events, char *buf)
....@@ -1886,15 +2201,16 @@
18862201 {
18872202 struct gendisk *disk = dev_to_disk(dev);
18882203
2204
+ if (!(disk->event_flags & DISK_EVENT_FLAG_UEVENT))
2205
+ return 0;
2206
+
18892207 return __disk_events_show(disk->events, buf);
18902208 }
18912209
18922210 static ssize_t disk_events_async_show(struct device *dev,
18932211 struct device_attribute *attr, char *buf)
18942212 {
1895
- struct gendisk *disk = dev_to_disk(dev);
1896
-
1897
- return __disk_events_show(disk->async_events, buf);
2213
+ return 0;
18982214 }
18992215
19002216 static ssize_t disk_events_poll_msecs_show(struct device *dev,
....@@ -1902,6 +2218,9 @@
19022218 char *buf)
19032219 {
19042220 struct gendisk *disk = dev_to_disk(dev);
2221
+
2222
+ if (!disk->ev)
2223
+ return sprintf(buf, "-1\n");
19052224
19062225 return sprintf(buf, "%ld\n", disk->ev->poll_msecs);
19072226 }
....@@ -1918,6 +2237,9 @@
19182237
19192238 if (intv < 0 && intv != -1)
19202239 return -EINVAL;
2240
+
2241
+ if (!disk->ev)
2242
+ return -ENODEV;
19212243
19222244 disk_block_events(disk);
19232245 disk->ev->poll_msecs = intv;
....@@ -1943,7 +2265,7 @@
19432265 * The default polling interval can be specified by the kernel
19442266 * parameter block.events_dfl_poll_msecs which defaults to 0
19452267 * (disable). This can also be modified runtime by writing to
1946
- * /sys/module/block/events_dfl_poll_msecs.
2268
+ * /sys/module/block/parameters/events_dfl_poll_msecs.
19472269 */
19482270 static int disk_events_set_dfl_poll_msecs(const char *val,
19492271 const struct kernel_param *kp)
....@@ -1983,7 +2305,7 @@
19832305 {
19842306 struct disk_events *ev;
19852307
1986
- if (!disk->fops->check_events)
2308
+ if (!disk->fops->check_events || !disk->events)
19872309 return;
19882310
19892311 ev = kzalloc(sizeof(*ev), GFP_KERNEL);
....@@ -2005,13 +2327,13 @@
20052327
20062328 static void disk_add_events(struct gendisk *disk)
20072329 {
2008
- if (!disk->ev)
2009
- return;
2010
-
20112330 /* FIXME: error handling */
20122331 if (sysfs_create_files(&disk_to_dev(disk)->kobj, disk_events_attrs) < 0)
20132332 pr_warn("%s: failed to create sysfs files for events\n",
20142333 disk->disk_name);
2334
+
2335
+ if (!disk->ev)
2336
+ return;
20152337
20162338 mutex_lock(&disk_events_mutex);
20172339 list_add_tail(&disk->ev->node, &disk_events);
....@@ -2026,14 +2348,13 @@
20262348
20272349 static void disk_del_events(struct gendisk *disk)
20282350 {
2029
- if (!disk->ev)
2030
- return;
2351
+ if (disk->ev) {
2352
+ disk_block_events(disk);
20312353
2032
- disk_block_events(disk);
2033
-
2034
- mutex_lock(&disk_events_mutex);
2035
- list_del_init(&disk->ev->node);
2036
- mutex_unlock(&disk_events_mutex);
2354
+ mutex_lock(&disk_events_mutex);
2355
+ list_del_init(&disk->ev->node);
2356
+ mutex_unlock(&disk_events_mutex);
2357
+ }
20372358
20382359 sysfs_remove_files(&disk_to_dev(disk)->kobj, disk_events_attrs);
20392360 }