From 102a0743326a03cd1a1202ceda21e175b7d3575c Mon Sep 17 00:00:00 2001 From: hc <hc@nodka.com> Date: Tue, 20 Feb 2024 01:20:52 +0000 Subject: [PATCH] add new system file --- kernel/block/genhd.c | 707 ++++++++++++++++++++++++++++++++++++++++++---------------- 1 files changed, 514 insertions(+), 193 deletions(-) diff --git a/kernel/block/genhd.c b/kernel/block/genhd.c index fe1c236..796baf7 100644 --- a/kernel/block/genhd.c +++ b/kernel/block/genhd.c @@ -1,8 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 /* * gendisk handling */ #include <linux/module.h> +#include <linux/ctype.h> #include <linux/fs.h> #include <linux/genhd.h> #include <linux/kdev_t.h> @@ -25,7 +27,7 @@ #include "blk.h" static DEFINE_MUTEX(block_class_lock); -struct kobject *block_depr; +static struct kobject *block_depr; /* for extended dynamic devt allocation, currently only one major is used */ #define NR_EXT_DEVT (1 << MINORBITS) @@ -36,8 +38,6 @@ static DEFINE_SPINLOCK(ext_devt_lock); static DEFINE_IDR(ext_devt_idr); -static const struct device_type disk_type; - static void disk_check_events(struct disk_events *ev, unsigned int *clearing_ptr); static void disk_alloc_events(struct gendisk *disk); @@ -45,53 +45,102 @@ static void disk_del_events(struct gendisk *disk); static void disk_release_events(struct gendisk *disk); -void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, int rw) +/* + * Set disk capacity and notify if the size is not currently + * zero and will not be set to zero + */ +bool set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size, + bool update_bdev) { - if (q->mq_ops) - return; + sector_t capacity = get_capacity(disk); - atomic_inc(&part->in_flight[rw]); - if (part->partno) - atomic_inc(&part_to_disk(part)->part0.in_flight[rw]); -} + set_capacity(disk, size); + if (update_bdev) + revalidate_disk_size(disk, true); -void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, int rw) -{ - if (q->mq_ops) - return; + if (capacity != size && capacity != 0 && size != 0) { + char *envp[] = { "RESIZE=1", NULL }; - atomic_dec(&part->in_flight[rw]); - if (part->partno) - atomic_dec(&part_to_disk(part)->part0.in_flight[rw]); -} - -void part_in_flight(struct request_queue *q, struct hd_struct *part, - unsigned int inflight[2]) -{ - if (q->mq_ops) { - blk_mq_in_flight(q, part, inflight); - return; + kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); + return true; } - inflight[0] = atomic_read(&part->in_flight[0]) + - atomic_read(&part->in_flight[1]); - if (part->partno) { - part = &part_to_disk(part)->part0; - inflight[1] = atomic_read(&part->in_flight[0]) + - atomic_read(&part->in_flight[1]); + return false; +} + +EXPORT_SYMBOL_GPL(set_capacity_revalidate_and_notify); + +/* + * Format the device name of the indicated disk into the supplied buffer and + * return a pointer to that same buffer for convenience. + */ +char *disk_name(struct gendisk *hd, int partno, char *buf) +{ + if (!partno) + snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name); + else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1])) + snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, partno); + else + snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, partno); + + return buf; +} + +const char *bdevname(struct block_device *bdev, char *buf) +{ + return disk_name(bdev->bd_disk, bdev->bd_partno, buf); +} +EXPORT_SYMBOL(bdevname); + +static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat) +{ + int cpu; + + memset(stat, 0, sizeof(struct disk_stats)); + for_each_possible_cpu(cpu) { + struct disk_stats *ptr = per_cpu_ptr(part->dkstats, cpu); + int group; + + for (group = 0; group < NR_STAT_GROUPS; group++) { + stat->nsecs[group] += ptr->nsecs[group]; + stat->sectors[group] += ptr->sectors[group]; + stat->ios[group] += ptr->ios[group]; + stat->merges[group] += ptr->merges[group]; + } + + stat->io_ticks += ptr->io_ticks; } } -void part_in_flight_rw(struct request_queue *q, struct hd_struct *part, - unsigned int inflight[2]) +static unsigned int part_in_flight(struct hd_struct *part) { - if (q->mq_ops) { - blk_mq_in_flight_rw(q, part, inflight); - return; - } + unsigned int inflight = 0; + int cpu; - inflight[0] = atomic_read(&part->in_flight[0]); - inflight[1] = atomic_read(&part->in_flight[1]); + for_each_possible_cpu(cpu) { + inflight += part_stat_local_read_cpu(part, in_flight[0], cpu) + + part_stat_local_read_cpu(part, in_flight[1], cpu); + } + if ((int)inflight < 0) + inflight = 0; + + return inflight; +} + +static void part_in_flight_rw(struct hd_struct *part, unsigned int inflight[2]) +{ + int cpu; + + inflight[0] = 0; + inflight[1] = 0; + for_each_possible_cpu(cpu) { + inflight[0] += part_stat_local_read_cpu(part, in_flight[0], cpu); + inflight[1] += part_stat_local_read_cpu(part, in_flight[1], cpu); + } + if ((int)inflight[0] < 0) + inflight[0] = 0; + if ((int)inflight[1] < 0) + inflight[1] = 0; } struct hd_struct *__disk_get_part(struct gendisk *disk, int partno) @@ -129,7 +178,6 @@ return part; } -EXPORT_SYMBOL_GPL(disk_get_part); /** * disk_part_iter_init - initialize partition iterator @@ -260,11 +308,13 @@ * primarily used for stats accounting. * * CONTEXT: - * RCU read locked. The returned partition pointer is valid only - * while preemption is disabled. + * RCU read locked. The returned partition pointer is always valid + * because its refcount is grabbed except for part0, which lifetime + * is same with the disk. * * RETURNS: * Found partition on success, part0 is returned if no partition matches + * or the matched partition is being deleted. */ struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector) { @@ -272,23 +322,70 @@ struct hd_struct *part; int i; + rcu_read_lock(); ptbl = rcu_dereference(disk->part_tbl); part = rcu_dereference(ptbl->last_lookup); - if (part && sector_in_part(part, sector)) - return part; + if (part && sector_in_part(part, sector) && hd_struct_try_get(part)) + goto out_unlock; for (i = 1; i < ptbl->len; i++) { part = rcu_dereference(ptbl->part[i]); if (part && sector_in_part(part, sector)) { + /* + * only live partition can be cached for lookup, + * so use-after-free on cached & deleting partition + * can be avoided + */ + if (!hd_struct_try_get(part)) + break; rcu_assign_pointer(ptbl->last_lookup, part); - return part; + goto out_unlock; } } - return &disk->part0; + + part = &disk->part0; +out_unlock: + rcu_read_unlock(); + return part; } -EXPORT_SYMBOL_GPL(disk_map_sector_rcu); + +/** + * disk_has_partitions + * @disk: gendisk of interest + * + * Walk through the partition table and check if valid partition exists. + * + * CONTEXT: + * Don't care. + * + * RETURNS: + * True if the gendisk has at least one valid non-zero size partition. + * Otherwise false. + */ +bool disk_has_partitions(struct gendisk *disk) +{ + struct disk_part_tbl *ptbl; + int i; + bool ret = false; + + rcu_read_lock(); + ptbl = rcu_dereference(disk->part_tbl); + + /* Iterate partitions skipping the whole device at index 0 */ + for (i = 1; i < ptbl->len; i++) { + if (rcu_dereference(ptbl->part[i])) { + ret = true; + break; + } + } + + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL_GPL(disk_has_partitions); /* * Can be deleted altogether. Later. @@ -355,8 +452,8 @@ } if (index == 0) { - printk("register_blkdev: failed to get major for %s\n", - name); + printk("%s: failed to get major for %s\n", + __func__, name); ret = -EBUSY; goto out; } @@ -365,8 +462,8 @@ } if (major >= BLKDEV_MAJOR_MAX) { - pr_err("register_blkdev: major requested (%u) is greater than the maximum (%u) for %s\n", - major, BLKDEV_MAJOR_MAX-1, name); + pr_err("%s: major requested (%u) is greater than the maximum (%u) for %s\n", + __func__, major, BLKDEV_MAJOR_MAX-1, name); ret = -EINVAL; goto out; @@ -521,8 +618,8 @@ } } -/** - * We invalidate devt by assigning NULL pointer for devt in idr. +/* + * We invalidate devt by assigning NULL pointer for devt in idr. */ void blk_invalidate_devt(dev_t devt) { @@ -582,10 +679,23 @@ return 0; } -static void register_disk(struct device *parent, struct gendisk *disk) +static void disk_scan_partitions(struct gendisk *disk) +{ + struct block_device *bdev; + + if (!get_capacity(disk) || !disk_part_scan_enabled(disk)) + return; + + set_bit(GD_NEED_PART_SCAN, &disk->state); + bdev = blkdev_get_by_dev(disk_devt(disk), FMODE_READ, NULL); + if (!IS_ERR(bdev)) + blkdev_put(bdev, FMODE_READ); +} + +static void register_disk(struct device *parent, struct gendisk *disk, + const struct attribute_group **groups) { struct device *ddev = disk_to_dev(disk); - struct block_device *bdev; struct disk_part_iter piter; struct hd_struct *part; int err; @@ -597,6 +707,10 @@ /* delay uevents, until we scanned partition table */ dev_set_uevent_suppress(ddev, 1); + if (groups) { + WARN_ON(ddev->groups); + ddev->groups = groups; + } if (device_add(ddev)) return; if (!sysfs_deprecated) { @@ -621,25 +735,8 @@ if (disk->flags & GENHD_FL_HIDDEN) return; - /* No minors to use for partitions */ - if (!disk_part_scan_enabled(disk)) - goto exit; + disk_scan_partitions(disk); - /* No such device (e.g., media were just removed) */ - if (!get_capacity(disk)) - goto exit; - - bdev = bdget_disk(disk, 0); - if (!bdev) - goto exit; - - bdev->bd_invalidated = 1; - err = blkdev_get(bdev, FMODE_READ, NULL); - if (err < 0) - goto exit; - blkdev_put(bdev, FMODE_READ); - -exit: /* announce disk after possible partitions are created */ dev_set_uevent_suppress(ddev, 0); kobject_uevent(&ddev->kobj, KOBJ_ADD); @@ -662,6 +759,7 @@ * __device_add_disk - add disk information to kernel list * @parent: parent device for the disk * @disk: per-device partitioning information + * @groups: Additional per-device sysfs groups * @register_queue: register the queue if set to true * * This function registers the partitioning information in @disk @@ -670,10 +768,20 @@ * FIXME: error handling */ static void __device_add_disk(struct device *parent, struct gendisk *disk, + const struct attribute_group **groups, bool register_queue) { dev_t devt; int retval; + + /* + * The disk queue should now be all set with enough information about + * the device for the elevator code to pick an adequate default + * elevator if one is needed, that is, for devices requesting queue + * registration. + */ + if (register_queue) + elevator_init_mq(disk->queue); /* minors == 0 indicates to use ext devt from part0 and should * be accompanied with EXT_DEVT flag. Make sure all @@ -703,17 +811,19 @@ disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO; disk->flags |= GENHD_FL_NO_PART_SCAN; } else { + struct backing_dev_info *bdi = disk->queue->backing_dev_info; + struct device *dev = disk_to_dev(disk); int ret; /* Register BDI before referencing it from bdev */ - disk_to_dev(disk)->devt = devt; - ret = bdi_register_owner(disk->queue->backing_dev_info, - disk_to_dev(disk)); + dev->devt = devt; + ret = bdi_register(bdi, "%u:%u", MAJOR(devt), MINOR(devt)); WARN_ON(ret); + bdi_set_owner(bdi, dev); blk_register_region(disk_devt(disk), disk->minors, NULL, exact_match, exact_lock, disk); } - register_disk(parent, disk); + register_disk(parent, disk, groups); if (register_queue) blk_register_queue(disk); @@ -727,22 +837,64 @@ blk_integrity_add(disk); } -void device_add_disk(struct device *parent, struct gendisk *disk) +void device_add_disk(struct device *parent, struct gendisk *disk, + const struct attribute_group **groups) + { - __device_add_disk(parent, disk, true); + __device_add_disk(parent, disk, groups, true); } EXPORT_SYMBOL(device_add_disk); void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk) { - __device_add_disk(parent, disk, false); + __device_add_disk(parent, disk, NULL, false); } EXPORT_SYMBOL(device_add_disk_no_queue_reg); +static void invalidate_partition(struct gendisk *disk, int partno) +{ + struct block_device *bdev; + + bdev = bdget_disk(disk, partno); + if (!bdev) + return; + + fsync_bdev(bdev); + __invalidate_device(bdev, true); + + /* + * Unhash the bdev inode for this device so that it gets evicted as soon + * as last inode reference is dropped. + */ + remove_inode_hash(bdev->bd_inode); + bdput(bdev); +} + +/** + * del_gendisk - remove the gendisk + * @disk: the struct gendisk to remove + * + * Removes the gendisk and all its associated resources. This deletes the + * partitions associated with the gendisk, and unregisters the associated + * request_queue. + * + * This is the counter to the respective __device_add_disk() call. + * + * The final removal of the struct gendisk happens when its refcount reaches 0 + * with put_disk(), which should be called after del_gendisk(), if + * __device_add_disk() was used. + * + * Drivers exist which depend on the release of the gendisk to be synchronous, + * it should not be deferred. + * + * Context: can sleep + */ void del_gendisk(struct gendisk *disk) { struct disk_part_iter piter; struct hd_struct *part; + + might_sleep(); blk_integrity_del(disk); disk_del_events(disk); @@ -757,13 +909,11 @@ DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); while ((part = disk_part_iter_next(&piter))) { invalidate_partition(disk, part->partno); - bdev_unhash_inode(part_devt(part)); - delete_partition(disk, part->partno); + delete_partition(part); } disk_part_iter_exit(&piter); invalidate_partition(disk, 0); - bdev_unhash_inode(disk_devt(disk)); set_capacity(disk, 0); disk->flags &= ~GENHD_FL_UP; up_write(&disk->lookup_sem); @@ -836,10 +986,14 @@ * * This function gets the structure containing partitioning * information for the given device @devt. + * + * Context: can sleep */ struct gendisk *get_gendisk(dev_t devt, int *partno) { struct gendisk *disk = NULL; + + might_sleep(); if (MAJOR(devt) != BLOCK_EXT_MAJOR) { struct kobject *kobj; @@ -877,7 +1031,6 @@ } return disk; } -EXPORT_SYMBOL(get_gendisk); /** * bdget_disk - do bdget() by gendisk and partition number @@ -899,7 +1052,7 @@ part = disk_get_part(disk, partno); if (part) - bdev = bdget(part_devt(part)); + bdev = bdget_part(part); disk_put_part(part); return bdev; @@ -1123,6 +1276,74 @@ return sprintf(buf, "%d\n", get_disk_ro(disk) ? 1 : 0); } +ssize_t part_size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + + return sprintf(buf, "%llu\n", + (unsigned long long)part_nr_sects_read(p)); +} + +ssize_t part_stat_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + struct request_queue *q = part_to_disk(p)->queue; + struct disk_stats stat; + unsigned int inflight; + + part_stat_read_all(p, &stat); + if (queue_is_mq(q)) + inflight = blk_mq_in_flight(q, p); + else + inflight = part_in_flight(p); + + return sprintf(buf, + "%8lu %8lu %8llu %8u " + "%8lu %8lu %8llu %8u " + "%8u %8u %8u " + "%8lu %8lu %8llu %8u " + "%8lu %8u" + "\n", + stat.ios[STAT_READ], + stat.merges[STAT_READ], + (unsigned long long)stat.sectors[STAT_READ], + (unsigned int)div_u64(stat.nsecs[STAT_READ], NSEC_PER_MSEC), + stat.ios[STAT_WRITE], + stat.merges[STAT_WRITE], + (unsigned long long)stat.sectors[STAT_WRITE], + (unsigned int)div_u64(stat.nsecs[STAT_WRITE], NSEC_PER_MSEC), + inflight, + jiffies_to_msecs(stat.io_ticks), + (unsigned int)div_u64(stat.nsecs[STAT_READ] + + stat.nsecs[STAT_WRITE] + + stat.nsecs[STAT_DISCARD] + + stat.nsecs[STAT_FLUSH], + NSEC_PER_MSEC), + stat.ios[STAT_DISCARD], + stat.merges[STAT_DISCARD], + (unsigned long long)stat.sectors[STAT_DISCARD], + (unsigned int)div_u64(stat.nsecs[STAT_DISCARD], NSEC_PER_MSEC), + stat.ios[STAT_FLUSH], + (unsigned int)div_u64(stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC)); +} + +ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + struct request_queue *q = part_to_disk(p)->queue; + unsigned int inflight[2]; + + if (queue_is_mq(q)) + blk_mq_in_flight_rw(q, p, inflight); + else + part_in_flight_rw(p, inflight); + + return sprintf(buf, "%8u %8u\n", inflight[0], inflight[1]); +} + static ssize_t disk_capability_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -1161,10 +1382,33 @@ static DEVICE_ATTR(stat, 0444, part_stat_show, NULL); static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL); static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store); + #ifdef CONFIG_FAIL_MAKE_REQUEST +ssize_t part_fail_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + + return sprintf(buf, "%d\n", p->make_it_fail); +} + +ssize_t part_fail_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct hd_struct *p = dev_to_part(dev); + int i; + + if (count > 0 && sscanf(buf, "%d", &i) > 0) + p->make_it_fail = (i == 0) ? 0 : 1; + + return count; +} + static struct device_attribute dev_attr_fail = __ATTR(make-it-fail, 0644, part_fail_show, part_fail_store); -#endif +#endif /* CONFIG_FAIL_MAKE_REQUEST */ + #ifdef CONFIG_FAIL_IO_TIMEOUT static struct device_attribute dev_attr_fail_timeout = __ATTR(io-timeout-fail, 0644, part_timeout_show, part_timeout_store); @@ -1259,7 +1503,6 @@ struct disk_part_tbl *new_ptbl; int len = old_ptbl ? old_ptbl->len : 0; int i, target; - size_t size; /* * check for int overflow, since we can get here from blkpg_ioctl() @@ -1276,8 +1519,8 @@ if (target <= len) return 0; - size = sizeof(*new_ptbl) + target * sizeof(new_ptbl->part[0]); - new_ptbl = kzalloc_node(size, GFP_KERNEL, disk->node_id); + new_ptbl = kzalloc_node(struct_size(new_ptbl, part, target), GFP_KERNEL, + disk->node_id); if (!new_ptbl) return -ENOMEM; @@ -1290,9 +1533,30 @@ return 0; } +/** + * disk_release - releases all allocated resources of the gendisk + * @dev: the device representing this disk + * + * This function releases all allocated resources of the gendisk. + * + * The struct gendisk refcount is incremented with get_gendisk() or + * get_disk_and_module(), and its refcount is decremented with + * put_disk_and_module() or put_disk(). Once the refcount reaches 0 this + * function is called. + * + * Drivers which used __device_add_disk() have a gendisk with a request_queue + * assigned. Since the request_queue sits on top of the gendisk for these + * drivers we also call blk_put_queue() for them, and we expect the + * request_queue refcount to reach 0 at this point, and so the request_queue + * will also be freed prior to the disk. + * + * Context: can sleep + */ static void disk_release(struct device *dev) { struct gendisk *disk = dev_to_disk(dev); + + might_sleep(); blk_free_devt(dev->devt); disk_release_events(disk); @@ -1312,12 +1576,12 @@ { struct gendisk *disk = dev_to_disk(dev); - if (disk->devnode) - return disk->devnode(disk, mode); + if (disk->fops->devnode) + return disk->fops->devnode(disk, mode); return NULL; } -static const struct device_type disk_type = { +const struct device_type disk_type = { .name = "disk", .groups = disk_attr_groups, .release = disk_release, @@ -1338,8 +1602,8 @@ struct disk_part_iter piter; struct hd_struct *hd; char buf[BDEVNAME_SIZE]; - unsigned int inflight[2]; - int cpu; + unsigned int inflight; + struct disk_stats stat; /* if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next) @@ -1351,32 +1615,46 @@ disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0); while ((hd = disk_part_iter_next(&piter))) { - cpu = part_stat_lock(); - part_round_stats(gp->queue, cpu, hd); - part_stat_unlock(); - part_in_flight(gp->queue, hd, inflight); + part_stat_read_all(hd, &stat); + if (queue_is_mq(gp->queue)) + inflight = blk_mq_in_flight(gp->queue, hd); + else + inflight = part_in_flight(hd); + seq_printf(seqf, "%4d %7d %s " "%lu %lu %lu %u " "%lu %lu %lu %u " "%u %u %u " - "%lu %lu %lu %u\n", + "%lu %lu %lu %u " + "%lu %u" + "\n", MAJOR(part_devt(hd)), MINOR(part_devt(hd)), disk_name(gp, hd->partno, buf), - part_stat_read(hd, ios[STAT_READ]), - part_stat_read(hd, merges[STAT_READ]), - part_stat_read(hd, sectors[STAT_READ]), - (unsigned int)part_stat_read_msecs(hd, STAT_READ), - part_stat_read(hd, ios[STAT_WRITE]), - part_stat_read(hd, merges[STAT_WRITE]), - part_stat_read(hd, sectors[STAT_WRITE]), - (unsigned int)part_stat_read_msecs(hd, STAT_WRITE), - inflight[0], - jiffies_to_msecs(part_stat_read(hd, io_ticks)), - jiffies_to_msecs(part_stat_read(hd, time_in_queue)), - part_stat_read(hd, ios[STAT_DISCARD]), - part_stat_read(hd, merges[STAT_DISCARD]), - part_stat_read(hd, sectors[STAT_DISCARD]), - (unsigned int)part_stat_read_msecs(hd, STAT_DISCARD) + stat.ios[STAT_READ], + stat.merges[STAT_READ], + stat.sectors[STAT_READ], + (unsigned int)div_u64(stat.nsecs[STAT_READ], + NSEC_PER_MSEC), + stat.ios[STAT_WRITE], + stat.merges[STAT_WRITE], + stat.sectors[STAT_WRITE], + (unsigned int)div_u64(stat.nsecs[STAT_WRITE], + NSEC_PER_MSEC), + inflight, + jiffies_to_msecs(stat.io_ticks), + (unsigned int)div_u64(stat.nsecs[STAT_READ] + + stat.nsecs[STAT_WRITE] + + stat.nsecs[STAT_DISCARD] + + stat.nsecs[STAT_FLUSH], + NSEC_PER_MSEC), + stat.ios[STAT_DISCARD], + stat.merges[STAT_DISCARD], + stat.sectors[STAT_DISCARD], + (unsigned int)div_u64(stat.nsecs[STAT_DISCARD], + NSEC_PER_MSEC), + stat.ios[STAT_FLUSH], + (unsigned int)div_u64(stat.nsecs[STAT_FLUSH], + NSEC_PER_MSEC) ); } disk_part_iter_exit(&piter); @@ -1433,7 +1711,6 @@ class_dev_iter_exit(&iter); return devt; } -EXPORT_SYMBOL(blk_lookup_devt); struct gendisk *__alloc_disk_node(int minors, int node_id) { @@ -1448,47 +1725,60 @@ } disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id); - if (disk) { - if (!init_part_stats(&disk->part0)) { - kfree(disk); - return NULL; - } - init_rwsem(&disk->lookup_sem); - disk->node_id = node_id; - if (disk_expand_part_tbl(disk, 0)) { - free_part_stats(&disk->part0); - kfree(disk); - return NULL; - } - ptbl = rcu_dereference_protected(disk->part_tbl, 1); - rcu_assign_pointer(ptbl->part[0], &disk->part0); + if (!disk) + return NULL; - /* - * set_capacity() and get_capacity() currently don't use - * seqcounter to read/update the part0->nr_sects. Still init - * the counter as we can read the sectors in IO submission - * patch using seqence counters. - * - * TODO: Ideally set_capacity() and get_capacity() should be - * converted to make use of bd_mutex and sequence counters. - */ - seqcount_init(&disk->part0.nr_sects_seq); - if (hd_ref_init(&disk->part0)) { - hd_free_part(&disk->part0); - kfree(disk); - return NULL; - } + disk->part0.dkstats = alloc_percpu(struct disk_stats); + if (!disk->part0.dkstats) + goto out_free_disk; - disk->minors = minors; - rand_initialize_disk(disk); - disk_to_dev(disk)->class = &block_class; - disk_to_dev(disk)->type = &disk_type; - device_initialize(disk_to_dev(disk)); + init_rwsem(&disk->lookup_sem); + disk->node_id = node_id; + if (disk_expand_part_tbl(disk, 0)) { + free_percpu(disk->part0.dkstats); + goto out_free_disk; } + + ptbl = rcu_dereference_protected(disk->part_tbl, 1); + rcu_assign_pointer(ptbl->part[0], &disk->part0); + + /* + * set_capacity() and get_capacity() currently don't use + * seqcounter to read/update the part0->nr_sects. Still init + * the counter as we can read the sectors in IO submission + * patch using seqence counters. + * + * TODO: Ideally set_capacity() and get_capacity() should be + * converted to make use of bd_mutex and sequence counters. + */ + hd_sects_seq_init(&disk->part0); + if (hd_ref_init(&disk->part0)) + goto out_free_part0; + + disk->minors = minors; + rand_initialize_disk(disk); + disk_to_dev(disk)->class = &block_class; + disk_to_dev(disk)->type = &disk_type; + device_initialize(disk_to_dev(disk)); return disk; + +out_free_part0: + hd_free_part(&disk->part0); +out_free_disk: + kfree(disk); + return NULL; } EXPORT_SYMBOL(__alloc_disk_node); +/** + * get_disk_and_module - increments the gendisk and gendisk fops module refcount + * @disk: the struct gendisk to increment the refcount for + * + * This increments the refcount for the struct gendisk, and the gendisk's + * fops module owner. + * + * Context: Any context. + */ struct kobject *get_disk_and_module(struct gendisk *disk) { struct module *owner; @@ -1509,6 +1799,16 @@ } EXPORT_SYMBOL(get_disk_and_module); +/** + * put_disk - decrements the gendisk refcount + * @disk: the struct gendisk to decrement the refcount for + * + * This decrements the refcount for the struct gendisk. When this reaches 0 + * we'll have disk_release() called. + * + * Context: Any context, but the last reference must not be dropped from + * atomic context. + */ void put_disk(struct gendisk *disk) { if (disk) @@ -1516,9 +1816,15 @@ } EXPORT_SYMBOL(put_disk); -/* +/** + * put_disk_and_module - decrements the module and gendisk refcount + * @disk: the struct gendisk to decrement the refcount for + * * This is a counterpart of get_disk_and_module() and thus also of * get_gendisk(). + * + * Context: Any context, but the last reference must not be dropped from + * atomic context. */ void put_disk_and_module(struct gendisk *disk) { @@ -1575,20 +1881,6 @@ EXPORT_SYMBOL(bdev_read_only); -int invalidate_partition(struct gendisk *disk, int partno) -{ - int res = 0; - struct block_device *bdev = bdget_disk(disk, partno); - if (bdev) { - fsync_bdev(bdev); - res = __invalidate_device(bdev, true); - bdput(bdev); - } - return res; -} - -EXPORT_SYMBOL(invalidate_partition); - /* * Disk events - monitor disk events like media change and eject request. */ @@ -1630,12 +1922,11 @@ /* * If device-specific poll interval is set, always use it. If - * the default is being used, poll iff there are events which - * can't be monitored asynchronously. + * the default is being used, poll if the POLL flag is set. */ if (ev->poll_msecs >= 0) intv_msecs = ev->poll_msecs; - else if (disk->events & ~disk->async_events) + else if (disk->event_flags & DISK_EVENT_FLAG_POLL) intv_msecs = disk_events_dfl_poll_msecs; return msecs_to_jiffies(intv_msecs); @@ -1760,20 +2051,14 @@ * CONTEXT: * Might sleep. */ -unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask) +static unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask) { - const struct block_device_operations *bdops = disk->fops; struct disk_events *ev = disk->ev; unsigned int pending; unsigned int clearing = mask; - if (!ev) { - /* for drivers still using the old ->media_changed method */ - if ((mask & DISK_EVENT_MEDIA_CHANGE) && - bdops->media_changed && bdops->media_changed(disk)) - return DISK_EVENT_MEDIA_CHANGE; + if (!ev) return 0; - } disk_block_events(disk); @@ -1803,6 +2088,33 @@ return pending; } + +/** + * bdev_check_media_change - check if a removable media has been changed + * @bdev: block device to check + * + * Check whether a removable media has been changed, and attempt to free all + * dentries and inodes and invalidates all block device page cache entries in + * that case. + * + * Returns %true if the block device changed, or %false if not. + */ +bool bdev_check_media_change(struct block_device *bdev) +{ + unsigned int events; + + events = disk_clear_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE | + DISK_EVENT_EJECT_REQUEST); + if (!(events & DISK_EVENT_MEDIA_CHANGE)) + return false; + + if (__invalidate_device(bdev, true)) + pr_warn("VFS: busy inodes on changed media %s\n", + bdev->bd_disk->disk_name); + set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state); + return true; +} +EXPORT_SYMBOL(bdev_check_media_change); /* * Separate this part out so that a different pointer for clearing_ptr can be @@ -1845,11 +2157,13 @@ /* * Tell userland about new events. Only the events listed in - * @disk->events are reported. Unlisted events are processed the - * same internally but never get reported to userland. + * @disk->events are reported, and only if DISK_EVENT_FLAG_UEVENT + * is set. Otherwise, events are processed internally but never + * get reported to userland. */ for (i = 0; i < ARRAY_SIZE(disk_uevents); i++) - if (events & disk->events & (1 << i)) + if ((events & disk->events & (1 << i)) && + (disk->event_flags & DISK_EVENT_FLAG_UEVENT)) envp[nr_events++] = disk_uevents[i]; if (nr_events) @@ -1862,6 +2176,7 @@ * * events : list of all supported events * events_async : list of events which can be detected w/o polling + * (always empty, only for backwards compatibility) * events_poll_msecs : polling interval, 0: disable, -1: system default */ static ssize_t __disk_events_show(unsigned int events, char *buf) @@ -1886,15 +2201,16 @@ { struct gendisk *disk = dev_to_disk(dev); + if (!(disk->event_flags & DISK_EVENT_FLAG_UEVENT)) + return 0; + return __disk_events_show(disk->events, buf); } static ssize_t disk_events_async_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct gendisk *disk = dev_to_disk(dev); - - return __disk_events_show(disk->async_events, buf); + return 0; } static ssize_t disk_events_poll_msecs_show(struct device *dev, @@ -1902,6 +2218,9 @@ char *buf) { struct gendisk *disk = dev_to_disk(dev); + + if (!disk->ev) + return sprintf(buf, "-1\n"); return sprintf(buf, "%ld\n", disk->ev->poll_msecs); } @@ -1918,6 +2237,9 @@ if (intv < 0 && intv != -1) return -EINVAL; + + if (!disk->ev) + return -ENODEV; disk_block_events(disk); disk->ev->poll_msecs = intv; @@ -1943,7 +2265,7 @@ * The default polling interval can be specified by the kernel * parameter block.events_dfl_poll_msecs which defaults to 0 * (disable). This can also be modified runtime by writing to - * /sys/module/block/events_dfl_poll_msecs. + * /sys/module/block/parameters/events_dfl_poll_msecs. */ static int disk_events_set_dfl_poll_msecs(const char *val, const struct kernel_param *kp) @@ -1983,7 +2305,7 @@ { struct disk_events *ev; - if (!disk->fops->check_events) + if (!disk->fops->check_events || !disk->events) return; ev = kzalloc(sizeof(*ev), GFP_KERNEL); @@ -2005,13 +2327,13 @@ static void disk_add_events(struct gendisk *disk) { - if (!disk->ev) - return; - /* FIXME: error handling */ if (sysfs_create_files(&disk_to_dev(disk)->kobj, disk_events_attrs) < 0) pr_warn("%s: failed to create sysfs files for events\n", disk->disk_name); + + if (!disk->ev) + return; mutex_lock(&disk_events_mutex); list_add_tail(&disk->ev->node, &disk_events); @@ -2026,14 +2348,13 @@ static void disk_del_events(struct gendisk *disk) { - if (!disk->ev) - return; + if (disk->ev) { + disk_block_events(disk); - disk_block_events(disk); - - mutex_lock(&disk_events_mutex); - list_del_init(&disk->ev->node); - mutex_unlock(&disk_events_mutex); + mutex_lock(&disk_events_mutex); + list_del_init(&disk->ev->node); + mutex_unlock(&disk_events_mutex); + } sysfs_remove_files(&disk_to_dev(disk)->kobj, disk_events_attrs); } -- Gitblit v1.6.2