From 23fa18eaa71266feff7ba8d83022d9e1cc83c65a Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Fri, 10 May 2024 07:42:03 +0000
Subject: [PATCH] disable pwm7

---
 kernel/drivers/md/dm.c | 1162 ++++++++++++++++++++++++----------------------------
 1 files changed, 481 insertions(+), 681 deletions(-)

diff --git a/kernel/drivers/md/dm.c b/kernel/drivers/md/dm.c
index 3b6b2b8..cb58c40 100644
--- a/kernel/drivers/md/dm.c
+++ b/kernel/drivers/md/dm.c
@@ -26,6 +26,7 @@
 #include <linux/wait.h>
 #include <linux/pr.h>
 #include <linux/refcount.h>
+#include <linux/part_stat.h>
 #include <linux/blk-crypto.h>
 #include <linux/keyslot-manager.h>
 
@@ -148,6 +149,16 @@
 #define DM_NUMA_NODE NUMA_NO_NODE
 static int dm_numa_node = DM_NUMA_NODE;
 
+#define DEFAULT_SWAP_BIOS	(8 * 1048576 / PAGE_SIZE)
+static int swap_bios = DEFAULT_SWAP_BIOS;
+static int get_swap_bios(void)
+{
+	int latch = READ_ONCE(swap_bios);
+	if (unlikely(latch <= 0))
+		latch = DEFAULT_SWAP_BIOS;
+	return latch;
+}
+
 /*
  * For mempools pre-allocation at the table loading time.
  */
@@ -161,9 +172,6 @@
 	refcount_t count;
 	struct dm_dev dm_dev;
 };
-
-static struct kmem_cache *_rq_tio_cache;
-static struct kmem_cache *_rq_cache;
 
 /*
  * Bio-based DM's mempools' reserved IOs set by the user.
@@ -226,20 +234,11 @@
 
 static int __init local_init(void)
 {
-	int r = -ENOMEM;
-
-	_rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
-	if (!_rq_tio_cache)
-		return r;
-
-	_rq_cache = kmem_cache_create("dm_old_clone_request", sizeof(struct request),
-				      __alignof__(struct request), 0, NULL);
-	if (!_rq_cache)
-		goto out_free_rq_tio_cache;
+	int r;
 
 	r = dm_uevent_init();
 	if (r)
-		goto out_free_rq_cache;
+		return r;
 
 	deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
 	if (!deferred_remove_workqueue) {
@@ -261,21 +260,14 @@
 	destroy_workqueue(deferred_remove_workqueue);
 out_uevent_exit:
 	dm_uevent_exit();
-out_free_rq_cache:
-	kmem_cache_destroy(_rq_cache);
-out_free_rq_tio_cache:
-	kmem_cache_destroy(_rq_tio_cache);
 
 	return r;
 }
 
 static void local_exit(void)
 {
-	flush_scheduled_work();
 	destroy_workqueue(deferred_remove_workqueue);
-
-	kmem_cache_destroy(_rq_cache);
-	kmem_cache_destroy(_rq_tio_cache);
 	unregister_blkdev(_major, _name);
 	dm_uevent_exit();
 
@@ -440,27 +432,90 @@
 	dm_deferred_remove();
 }
 
-sector_t dm_get_size(struct mapped_device *md)
-{
-	return get_capacity(md->disk);
-}
-
-struct request_queue *dm_get_md_queue(struct mapped_device *md)
-{
-	return md->queue;
-}
-
-struct dm_stats *dm_get_stats(struct mapped_device *md)
-{
-	return &md->stats;
-}
-
 static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 {
 	struct mapped_device *md = bdev->bd_disk->private_data;
 
 	return dm_get_geometry(md, geo);
 }
+
+#ifdef CONFIG_BLK_DEV_ZONED
+int dm_report_zones_cb(struct blk_zone *zone, unsigned int idx, void *data)
+{
+	struct dm_report_zones_args *args = data;
+	sector_t sector_diff = args->tgt->begin - args->start;
+
+	/*
+	 * Ignore zones beyond the target range.
+	 */
+	if (zone->start >= args->start + args->tgt->len)
+		return 0;
+
+	/*
+	 * Remap the start sector and write pointer position of the zone
+	 * to match its position in the target range.
+	 */
+	zone->start += sector_diff;
+	if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) {
+		if (zone->cond == BLK_ZONE_COND_FULL)
+			zone->wp = zone->start + zone->len;
+		else if (zone->cond == BLK_ZONE_COND_EMPTY)
+			zone->wp = zone->start;
+		else
+			zone->wp += sector_diff;
+	}
+
+	args->next_sector = zone->start + zone->len;
+	return args->orig_cb(zone, args->zone_idx++, args->orig_data);
+}
+EXPORT_SYMBOL_GPL(dm_report_zones_cb);
+
+static int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
+			       unsigned int nr_zones, report_zones_cb cb, void *data)
+{
+	struct mapped_device *md = disk->private_data;
+	struct dm_table *map;
+	int srcu_idx, ret;
+	struct dm_report_zones_args args = {
+		.next_sector = sector,
+		.orig_data = data,
+		.orig_cb = cb,
+	};
+
+	if (dm_suspended_md(md))
+		return -EAGAIN;
+
+	map = dm_get_live_table(md, &srcu_idx);
+	if (!map) {
+		ret = -EIO;
+		goto out;
+	}
+
+	do {
+		struct dm_target *tgt;
+
+		tgt = dm_table_find_target(map, args.next_sector);
+		if (WARN_ON_ONCE(!tgt->type->report_zones)) {
+			ret = -EIO;
+			goto out;
+		}
+
+		args.tgt = tgt;
+		ret = tgt->type->report_zones(tgt, &args,
+					      nr_zones - args.zone_idx);
+		if (ret < 0)
+			goto out;
+	} while (args.zone_idx < nr_zones &&
+		 args.next_sector < get_capacity(disk));
+
+	ret = args.zone_idx;
+out:
+	dm_put_live_table(md, srcu_idx);
+	return ret;
+}
+#else
+#define dm_blk_report_zones	NULL
+#endif /* CONFIG_BLK_DEV_ZONED */
 
 static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx,
 			    struct block_device **bdev)
@@ -531,7 +586,45 @@
 	return r;
 }
 
-static void start_io_acct(struct dm_io *io);
+u64 dm_start_time_ns_from_clone(struct bio *bio)
+{
+	struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
+	struct dm_io *io = tio->io;
+
+	return jiffies_to_nsecs(io->start_time);
+}
+EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone);
+
+static void start_io_acct(struct dm_io *io)
+{
+	struct mapped_device *md = io->md;
+	struct bio *bio = io->orig_bio;
+
+	io->start_time = bio_start_io_acct(bio);
+	if (unlikely(dm_stats_used(&md->stats)))
+		dm_stats_account_io(&md->stats, bio_data_dir(bio),
+				    bio->bi_iter.bi_sector, bio_sectors(bio),
+				    false, 0, &io->stats_aux);
+}
+
+static void end_io_acct(struct mapped_device *md, struct bio *bio,
+			unsigned long start_time, struct dm_stats_aux *stats_aux)
+{
+	unsigned long duration = jiffies - start_time;
+
+	if (unlikely(dm_stats_used(&md->stats)))
+		dm_stats_account_io(&md->stats, bio_data_dir(bio),
+				    bio->bi_iter.bi_sector, bio_sectors(bio),
+				    true, duration, stats_aux);
+
+	smp_wmb();
+
+	bio_end_io_acct(bio, start_time);
+
+	/* nudge anyone waiting on suspend queue */
+	if (unlikely(wq_has_sleeper(&md->wait)))
+		wake_up(&md->wait);
+}
 
 static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
 {
@@ -595,61 +688,6 @@
 	if (tio->inside_dm_io)
 		return;
 	bio_put(&tio->clone);
-}
-
-int md_in_flight(struct mapped_device *md)
-{
-	return atomic_read(&md->pending[READ]) +
-	       atomic_read(&md->pending[WRITE]);
-}
-
-static void start_io_acct(struct dm_io *io)
-{
-	struct mapped_device *md = io->md;
-	struct bio *bio = io->orig_bio;
-	int rw = bio_data_dir(bio);
-
-	io->start_time = jiffies;
-
-	generic_start_io_acct(md->queue, bio_op(bio), bio_sectors(bio),
-			      &dm_disk(md)->part0);
-
-	atomic_set(&dm_disk(md)->part0.in_flight[rw],
-		   atomic_inc_return(&md->pending[rw]));
-
-	if (unlikely(dm_stats_used(&md->stats)))
-		dm_stats_account_io(&md->stats, bio_data_dir(bio),
-				    bio->bi_iter.bi_sector, bio_sectors(bio),
-				    false, 0, &io->stats_aux);
-}
-
-static void end_io_acct(struct dm_io *io)
-{
-	struct mapped_device *md = io->md;
-	struct bio *bio = io->orig_bio;
-	unsigned long duration = jiffies - io->start_time;
-	int pending;
-	int rw = bio_data_dir(bio);
-
-	generic_end_io_acct(md->queue, bio_op(bio), &dm_disk(md)->part0,
-			    io->start_time);
-
-	if (unlikely(dm_stats_used(&md->stats)))
-		dm_stats_account_io(&md->stats, bio_data_dir(bio),
-				    bio->bi_iter.bi_sector, bio_sectors(bio),
-				    true, duration, &io->stats_aux);
-
-	/*
-	 * After this is decremented the bio must not be touched if it is
-	 * a flush.
-	 */
-	pending = atomic_dec_return(&md->pending[rw]);
-	atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
-	pending += atomic_read(&md->pending[rw^0x1]);
-
-	/* nudge anyone waiting on suspend queue */
-	if (!pending)
-		wake_up(&md->wait);
 }
 
 /*
@@ -748,7 +786,8 @@
 }
 
 static struct table_device *find_table_device(struct list_head *l, dev_t dev,
-					      fmode_t mode) {
+					      fmode_t mode)
+{
 	struct table_device *td;
 
 	list_for_each_entry(td, l, list)
@@ -759,7 +798,8 @@
 }
 
 int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
-			struct dm_dev **result) {
+			struct dm_dev **result)
+{
 	int r;
 	struct table_device *td;
 
@@ -864,6 +904,8 @@
 	blk_status_t io_error;
 	struct bio *bio;
 	struct mapped_device *md = io->md;
+	unsigned long start_time = 0;
+	struct dm_stats_aux stats_aux;
 
 	/* Push-back supersedes any I/O errors */
 	if (unlikely(error)) {
@@ -890,8 +932,10 @@
 
 	io_error = io->status;
 	bio = io->orig_bio;
-	end_io_acct(io);
+	start_time = io->start_time;
+	stats_aux = io->stats_aux;
 	free_io(md, io);
+	end_io_acct(md, bio, start_time, &stats_aux);
 
 	if (io_error == BLK_STS_DM_REQUEUE)
 		return;
@@ -937,6 +981,11 @@
 	limits->max_write_zeroes_sectors = 0;
 }
 
+static bool swap_bios_limit(struct dm_target *ti, struct bio *bio)
+{
+	return unlikely((bio->bi_opf & REQ_SWAP) != 0) && unlikely(ti->limit_swap_bios);
+}
+
 static void clone_endio(struct bio *bio)
 {
 	blk_status_t error = bio->bi_status;
@@ -944,8 +993,9 @@
 	struct dm_io *io = tio->io;
 	struct mapped_device *md = tio->io->md;
 	dm_endio_fn endio = tio->ti->type->end_io;
+	struct bio *orig_bio = io->orig_bio;
 
-	if (unlikely(error == BLK_STS_TARGET) && md->type != DM_TYPE_NVME_BIO_BASED) {
+	if (unlikely(error == BLK_STS_TARGET)) {
 		if (bio_op(bio) == REQ_OP_DISCARD &&
 		    !bio->bi_disk->queue->limits.max_discard_sectors)
 			disable_discard(md);
@@ -957,12 +1007,24 @@
 			disable_write_zeroes(md);
 	}
 
+	/*
+	 * For zone-append bios get offset in zone of the written
+	 * sector and add that to the original bio sector pos.
+	 */
+	if (bio_op(orig_bio) == REQ_OP_ZONE_APPEND) {
+		sector_t written_sector = bio->bi_iter.bi_sector;
+		struct request_queue *q = orig_bio->bi_disk->queue;
+		u64 mask = (u64)blk_queue_zone_sectors(q) - 1;
+
+		orig_bio->bi_iter.bi_sector += written_sector & mask;
+	}
+
 	if (endio) {
 		int r = endio(tio->ti, bio, &error);
 		switch (r) {
 		case DM_ENDIO_REQUEUE:
 			error = BLK_STS_DM_REQUEUE;
-			/*FALLTHRU*/
+			fallthrough;
 		case DM_ENDIO_DONE:
 			break;
 		case DM_ENDIO_INCOMPLETE:
@@ -974,6 +1036,11 @@
 		}
 	}
 
+	if (unlikely(swap_bios_limit(tio->ti, bio))) {
+		struct mapped_device *md = io->md;
+		up(&md->swap_bios_semaphore);
+	}
+
 	free_tio(tio);
 	dec_pending(io, error);
 }
@@ -982,29 +1049,28 @@
  * Return maximum size of I/O possible at the supplied sector up to the current
  * target boundary.
  */
-static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti)
+static inline sector_t max_io_len_target_boundary(struct dm_target *ti,
+						  sector_t target_offset)
 {
-	sector_t target_offset = dm_target_offset(ti, sector);
-
 	return ti->len - target_offset;
 }
 
-static sector_t max_io_len(sector_t sector, struct dm_target *ti)
+static sector_t max_io_len(struct dm_target *ti, sector_t sector)
 {
-	sector_t len = max_io_len_target_boundary(sector, ti);
-	sector_t offset, max_len;
+	sector_t target_offset = dm_target_offset(ti, sector);
+	sector_t len = max_io_len_target_boundary(ti, target_offset);
+	sector_t max_len;
 
 	/*
-	 * Does the target need to split even further?
+	 * Does the target need to split IO even further?
+	 * - varied (per target) IO splitting is a tenet of DM; this
+	 *   explains why stacked chunk_sectors based splitting via
+	 *   blk_max_size_offset() isn't possible here. So pass in
+	 *   ti->max_io_len to override stacked chunk_sectors.
 	 */
 	if (ti->max_io_len) {
-		offset = dm_target_offset(ti, sector);
-		if (unlikely(ti->max_io_len & (ti->max_io_len - 1)))
-			max_len = sector_div(offset, ti->max_io_len);
-		else
-			max_len = offset & (ti->max_io_len - 1);
-		max_len = ti->max_io_len - max_len;
-
+		max_len = blk_max_size_offset(ti->table->md->queue,
+					      target_offset, ti->max_io_len);
 		if (len > max_len)
 			len = max_len;
 	}
@@ -1039,7 +1105,7 @@
 		return NULL;
 
 	ti = dm_table_find_target(map, sector);
-	if (!dm_target_is_valid(ti))
+	if (!ti)
 		return NULL;
 
 	return ti;
@@ -1060,13 +1126,33 @@
 		goto out;
 	if (!ti->type->direct_access)
 		goto out;
-	len = max_io_len(sector, ti) / PAGE_SECTORS;
+	len = max_io_len(ti, sector) / PAGE_SECTORS;
 	if (len < 1)
 		goto out;
 	nr_pages = min(len, nr_pages);
 	ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn);
 
 out:
+	dm_put_live_table(md, srcu_idx);
+
+	return ret;
+}
+
+static bool dm_dax_supported(struct dax_device *dax_dev, struct block_device *bdev,
+			     int blocksize, sector_t start, sector_t len)
+{
+	struct mapped_device *md = dax_get_private(dax_dev);
+	struct dm_table *map;
+	bool ret = false;
+	int srcu_idx;
+
+	map = dm_get_live_table(md, &srcu_idx);
+	if (!map)
+		goto out;
+
+	ret = dm_table_supports_dax(map, device_not_dax_capable, &blocksize);
+
+out:
 	dm_put_live_table(md, srcu_idx);
 
 	return ret;
}
@@ -1120,9 +1206,37 @@
 	return ret;
 }
 
+static int dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
+				  size_t nr_pages)
+{
+	struct mapped_device *md = dax_get_private(dax_dev);
+	sector_t sector = pgoff * PAGE_SECTORS;
+	struct dm_target *ti;
+	int ret = -EIO;
+	int srcu_idx;
+
+	ti = dm_dax_get_live_target(md, sector, &srcu_idx);
+
+	if (!ti)
+		goto out;
+	if (WARN_ON(!ti->type->dax_zero_page_range)) {
+		/*
+		 * ->zero_page_range() is mandatory dax operation. If we are
+		 * here, something is wrong.
+		 */
+		goto out;
+	}
+	ret = ti->type->dax_zero_page_range(ti, pgoff, nr_pages);
+ out:
+	dm_put_live_table(md, srcu_idx);
+
+	return ret;
+}
+
 /*
  * A target may call dm_accept_partial_bio only from the map routine. It is
- * allowed for all bio types except REQ_PREFLUSH and REQ_OP_ZONE_RESET.
+ * allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_* zone management
+ * operations and REQ_OP_ZONE_APPEND (zone append writes).
  *
  * dm_accept_partial_bio informs the dm that the target only wants to process
  * additional n_sectors sectors of the bio and the rest of the data should be
@@ -1152,105 +1266,33 @@
 {
 	struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
 	unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT;
+
+	BUG_ON(bio->bi_opf & REQ_PREFLUSH);
+	BUG_ON(op_is_zone_mgmt(bio_op(bio)));
+	BUG_ON(bio_op(bio) == REQ_OP_ZONE_APPEND);
 	BUG_ON(bi_size > *tio->len_ptr);
 	BUG_ON(n_sectors > bi_size);
+
 	*tio->len_ptr -= bi_size - n_sectors;
 	bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;
 }
 EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
 
-/*
- * The zone descriptors obtained with a zone report indicate zone positions
- * within the target backing device, regardless of that device is a partition
- * and regardless of the target mapping start sector on the device or partition.
- * The zone descriptors start sector and write pointer position must be adjusted
- * to match their relative position within the dm device.
- * A target may call dm_remap_zone_report() after completion of a
- * REQ_OP_ZONE_REPORT bio to remap the zone descriptors obtained from the
- * backing device.
- */
-void dm_remap_zone_report(struct dm_target *ti, struct bio *bio, sector_t start)
+static noinline void __set_swap_bios_limit(struct mapped_device *md, int latch)
 {
-#ifdef CONFIG_BLK_DEV_ZONED
-	struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
-	struct bio *report_bio = tio->io->orig_bio;
-	struct blk_zone_report_hdr *hdr = NULL;
-	struct blk_zone *zone;
-	unsigned int nr_rep = 0;
-	unsigned int ofst;
-	sector_t part_offset;
-	struct bio_vec bvec;
-	struct bvec_iter iter;
-	void *addr;
-
-	if (bio->bi_status)
-		return;
-
-	/*
-	 * bio sector was incremented by the request size on completion. Taking
-	 * into account the original request sector, the target start offset on
-	 * the backing device and the target mapping offset (ti->begin), the
-	 * start sector of the backing device. The partition offset is always 0
-	 * if the target uses a whole device.
-	 */
-	part_offset = bio->bi_iter.bi_sector + ti->begin - (start + bio_end_sector(report_bio));
-
-	/*
-	 * Remap the start sector of the reported zones. For sequential zones,
-	 * also remap the write pointer position.
-	 */
-	bio_for_each_segment(bvec, report_bio, iter) {
-		addr = kmap_atomic(bvec.bv_page);
-
-		/* Remember the report header in the first page */
-		if (!hdr) {
-			hdr = addr;
-			ofst = sizeof(struct blk_zone_report_hdr);
-		} else
-			ofst = 0;
-
-		/* Set zones start sector */
-		while (hdr->nr_zones && ofst < bvec.bv_len) {
-			zone = addr + ofst;
-			zone->start -= part_offset;
-			if (zone->start >= start + ti->len) {
-				hdr->nr_zones = 0;
-				break;
-			}
-			zone->start = zone->start + ti->begin - start;
-			if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) {
-				if (zone->cond == BLK_ZONE_COND_FULL)
-					zone->wp = zone->start + zone->len;
-				else if (zone->cond == BLK_ZONE_COND_EMPTY)
-					zone->wp = zone->start;
-				else
-					zone->wp = zone->wp + ti->begin - start - part_offset;
-			}
-			ofst += sizeof(struct blk_zone);
-			hdr->nr_zones--;
-			nr_rep++;
-		}
-
-		if (addr != hdr)
-			kunmap_atomic(addr);
-
-		if (!hdr->nr_zones)
-			break;
+	mutex_lock(&md->swap_bios_lock);
+	while (latch < md->swap_bios) {
+		cond_resched();
+		down(&md->swap_bios_semaphore);
+		md->swap_bios--;
 	}
-
-	if (hdr) {
-		hdr->nr_zones = nr_rep;
-		kunmap_atomic(hdr);
+	while (latch > md->swap_bios) {
+		cond_resched();
+		up(&md->swap_bios_semaphore);
+		md->swap_bios++;
 	}
-
-	bio_advance(report_bio, report_bio->bi_iter.bi_size);
-
-#else /* !CONFIG_BLK_DEV_ZONED */
-	bio->bi_status = BLK_STS_NOTSUPP;
-#endif
+	mutex_unlock(&md->swap_bios_lock);
 }
-EXPORT_SYMBOL_GPL(dm_remap_zone_report);
 
 static blk_qc_t __map_bio(struct dm_target_io *tio)
 {
@@ -1258,7 +1300,6 @@
 	sector_t sector;
 	struct bio *clone = &tio->clone;
 	struct dm_io *io = tio->io;
-	struct mapped_device *md = io->md;
 	struct dm_target *ti = tio->ti;
 	blk_qc_t ret = BLK_QC_T_NONE;
 
@@ -1272,6 +1313,14 @@
 	atomic_inc(&io->io_count);
 	sector = clone->bi_iter.bi_sector;
 
+	if (unlikely(swap_bios_limit(ti, clone))) {
+		struct mapped_device *md = io->md;
+		int latch = get_swap_bios();
+		if (unlikely(latch != md->swap_bios))
+			__set_swap_bios_limit(md, latch);
+		down(&md->swap_bios_semaphore);
+	}
+
 	r = ti->type->map(ti, clone);
 	switch (r) {
 	case DM_MAPIO_SUBMITTED:
@@ -1280,16 +1329,21 @@
 		/* the bio has been remapped so dispatch it */
 		trace_block_bio_remap(clone->bi_disk->queue, clone,
 				      bio_dev(io->orig_bio), sector);
-		if (md->type == DM_TYPE_NVME_BIO_BASED)
-			ret = direct_make_request(clone);
-		else
-			ret = generic_make_request(clone);
+		ret = submit_bio_noacct(clone);
 		break;
 	case DM_MAPIO_KILL:
+		if (unlikely(swap_bios_limit(ti, clone))) {
+			struct mapped_device *md = io->md;
+			up(&md->swap_bios_semaphore);
+		}
 		free_tio(tio);
 		dec_pending(io, BLK_STS_IOERR);
 		break;
 	case DM_MAPIO_REQUEUE:
+		if (unlikely(swap_bios_limit(ti, clone))) {
+			struct mapped_device *md = io->md;
+			up(&md->swap_bios_semaphore);
+		}
 		free_tio(tio);
 		dec_pending(io, BLK_STS_DM_REQUEUE);
 		break;
@@ -1314,13 +1368,15 @@
 		     sector_t sector, unsigned len)
 {
 	struct bio *clone = &tio->clone;
+	int r;
 
 	__bio_clone_fast(clone, bio);
 
-	bio_crypt_clone(clone, bio, GFP_NOIO);
+	r = bio_crypt_clone(clone, bio, GFP_NOIO);
+	if (r < 0)
+		return r;
 
-	if (unlikely(bio_integrity(bio) != NULL)) {
-		int r;
+	if (bio_integrity(bio)) {
 		if (unlikely(!dm_target_has_integrity(tio->ti->type) &&
 			     !dm_target_passes_integrity(tio->ti->type))) {
 			DMWARN("%s: the target %s doesn't support integrity data.",
@@ -1334,11 +1390,10 @@
 			return r;
 	}
 
-	if (bio_op(bio) != REQ_OP_ZONE_REPORT)
-		bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
+	bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
 	clone->bi_iter.bi_size = to_bytes(len);
 
-	if (unlikely(bio_integrity(bio) != NULL))
+	if (bio_integrity(bio))
 		bio_integrity_trim(clone);
 
 	return 0;
 }
@@ -1417,11 +1472,32 @@
 {
 	unsigned target_nr = 0;
 	struct dm_target *ti;
+	struct bio flush_bio;
+
+	/*
+	 * Use an on-stack bio for this, it's safe since we don't
+	 * need to reference it after submit. It's just used as
+	 * the basis for the clone(s).
+	 */
+	bio_init(&flush_bio, NULL, 0);
+	flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
+	ci->bio = &flush_bio;
+	ci->sector_count = 0;
+
+	/*
+	 * Empty flush uses a statically initialized bio, as the base for
+	 * cloning. However, blkg association requires that a bdev is
+	 * associated with a gendisk, which doesn't happen until the bdev is
+	 * opened. So, blkg association is done at issue time of the flush
+	 * rather than when the device is created in alloc_dev().
+	 */
+	bio_set_dev(ci->bio, ci->io->md->bdev);
 
 	BUG_ON(bio_has_data(ci->bio));
 	while ((ti = dm_table_get_target(ci->map, target_nr++)))
 		__send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
 
+	bio_uninit(ci->bio);
 	return 0;
 }
 
@@ -1444,41 +1520,10 @@
 	return 0;
 }
 
-typedef unsigned (*get_num_bios_fn)(struct dm_target *ti);
-
-static unsigned get_num_discard_bios(struct dm_target *ti)
-{
-	return ti->num_discard_bios;
-}
-
-static unsigned get_num_secure_erase_bios(struct dm_target *ti)
-{
-	return ti->num_secure_erase_bios;
-}
-
-static unsigned get_num_write_same_bios(struct dm_target *ti)
-{
-	return ti->num_write_same_bios;
-}
-
-static unsigned get_num_write_zeroes_bios(struct dm_target *ti)
-{
-	return ti->num_write_zeroes_bios;
-}
-
-typedef bool (*is_split_required_fn)(struct dm_target *ti);
-
-static bool is_split_required_for_discard(struct dm_target *ti)
-{
-	return ti->split_discard_bios;
-}
-
 static int __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti,
-				       get_num_bios_fn get_num_bios,
-				       is_split_required_fn is_split_required)
+				       unsigned num_bios)
 {
 	unsigned len;
-	unsigned num_bios;
 
 	/*
 	 * Even though the device advertised support for this type of
 	 * request, that does not mean every target supports it, and
 	 * reconfiguration might also have changed that since the
 	 * check was performed.
 	 */
-	num_bios = get_num_bios ? get_num_bios(ti) : 0;
 	if (!num_bios)
 		return -EOPNOTSUPP;
 
-	if (is_split_required && !is_split_required(ti))
-		len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
-	else
-		len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti));
+	len = min_t(sector_t, ci->sector_count,
+		    max_io_len_target_boundary(ti, dm_target_offset(ti, ci->sector)));
 
 	__send_duplicate_bios(ci, ti, num_bios, &len);
 
@@ -1503,43 +1545,46 @@
 	return 0;
 }
 
-static int __send_discard(struct clone_info *ci, struct dm_target *ti)
+static bool is_abnormal_io(struct bio *bio)
 {
-	return __send_changing_extent_only(ci, ti, get_num_discard_bios,
-					   is_split_required_for_discard);
-}
+	bool r = false;
 
-static int __send_secure_erase(struct clone_info *ci, struct dm_target *ti)
-{
-	return __send_changing_extent_only(ci, ti, get_num_secure_erase_bios, NULL);
-}
+	switch (bio_op(bio)) {
+	case REQ_OP_DISCARD:
+	case REQ_OP_SECURE_ERASE:
+	case REQ_OP_WRITE_SAME:
+	case REQ_OP_WRITE_ZEROES:
+		r = true;
+		break;
+	}
 
-static int __send_write_same(struct clone_info *ci, struct dm_target *ti)
-{
-	return __send_changing_extent_only(ci, ti, get_num_write_same_bios, NULL);
-}
-
-static int __send_write_zeroes(struct clone_info *ci, struct dm_target *ti)
-{
-	return __send_changing_extent_only(ci, ti, get_num_write_zeroes_bios, NULL);
+	return r;
 }
 
 static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti,
 				  int *result)
 {
 	struct bio *bio = ci->bio;
+	unsigned num_bios = 0;
 
-	if (bio_op(bio) == REQ_OP_DISCARD)
-		*result = __send_discard(ci, ti);
-	else if (bio_op(bio) == REQ_OP_SECURE_ERASE)
-		*result = __send_secure_erase(ci, ti);
-	else if (bio_op(bio) == REQ_OP_WRITE_SAME)
-		*result = __send_write_same(ci, ti);
-	else if (bio_op(bio) == REQ_OP_WRITE_ZEROES)
-		*result = __send_write_zeroes(ci, ti);
-	else
+	switch (bio_op(bio)) {
+	case REQ_OP_DISCARD:
+		num_bios = ti->num_discard_bios;
+		break;
+	case REQ_OP_SECURE_ERASE:
+		num_bios = ti->num_secure_erase_bios;
+		break;
+	case REQ_OP_WRITE_SAME:
+		num_bios = ti->num_write_same_bios;
+		break;
+	case REQ_OP_WRITE_ZEROES:
+		num_bios = ti->num_write_zeroes_bios;
+		break;
+	default:
 		return false;
+	}
 
+	*result = __send_changing_extent_only(ci, ti, num_bios);
 	return true;
 }
 
@@ -1548,23 +1593,18 @@
  */
 static int __split_and_process_non_flush(struct clone_info *ci)
 {
-	struct bio *bio = ci->bio;
 	struct dm_target *ti;
 	unsigned len;
 	int r;
 
 	ti = dm_table_find_target(ci->map, ci->sector);
-	if (!dm_target_is_valid(ti))
+	if (!ti)
 		return -EIO;
 
-	if (unlikely(__process_abnormal_io(ci, ti, &r)))
+	if (__process_abnormal_io(ci, ti, &r))
 		return r;
 
-	if (bio_op(bio) == REQ_OP_ZONE_REPORT)
-		len = ci->sector_count;
-	else
-		len = min_t(sector_t, max_io_len(ci->sector, ti),
-			    ci->sector_count);
+	len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count);
 
 	r = __clone_and_map_data_bio(ci, ti, ci->sector, &len);
 	if (r < 0)
@@ -1584,6 +1624,9 @@
 	ci->sector = bio->bi_iter.bi_sector;
 }
 
+#define __dm_part_stat_sub(part, field, subnd) \
+	(part_stat_get(part, field) -= (subnd))
+
 /*
  * Entry point to split a bio into clones and submit them to the targets.
  */
@@ -1594,21 +1637,12 @@
 	blk_qc_t ret = BLK_QC_T_NONE;
 	int error = 0;
 
-	if (unlikely(!map)) {
-		bio_io_error(bio);
-		return ret;
-	}
-
-	blk_queue_split(md->queue, &bio);
-
 	init_clone_info(&ci, md, map, bio);
 
 	if (bio->bi_opf & REQ_PREFLUSH) {
-		ci.bio = &ci.io->md->flush_bio;
-		ci.sector_count = 0;
 		error = __send_empty_flush(&ci);
 		/* dec_pending submits any data associated with flush */
-	} else if (bio_op(bio) == REQ_OP_ZONE_RESET) {
+	} else if (op_is_zone_mgmt(bio_op(bio))) {
 		ci.bio = bio;
 		ci.sector_count = 0;
 		error = __split_and_process_non_flush(&ci);
@@ -1619,21 +1653,32 @@
 			error = __split_and_process_non_flush(&ci);
 			if (current->bio_list && ci.sector_count && !error) {
 				/*
-				 * Remainder must be passed to generic_make_request()
+				 * Remainder must be passed to submit_bio_noacct()
 				 * so that it gets handled *after* bios already submitted
 				 * have been completely processed.
 				 * We take a clone of the original to store in
 				 * ci.io->orig_bio to be used by end_io_acct() and
 				 * for dec_pending to use for completion handling.
-				 * As this path is not used for REQ_OP_ZONE_REPORT,
-				 * the usage of io->orig_bio in dm_remap_zone_report()
-				 * won't be affected by this reassignment.
 				 */
 				struct bio *b = bio_split(bio, bio_sectors(bio) - ci.sector_count,
 							  GFP_NOIO, &md->queue->bio_split);
 				ci.io->orig_bio = b;
+
+				/*
+				 * Adjust IO stats for each split, otherwise upon queue
+				 * reentry there will be redundant IO accounting.
+				 * NOTE: this is a stop-gap fix, a proper fix involves
+				 * significant refactoring of DM core's bio splitting
+				 * (by eliminating DM's splitting and just using bio_split)
+				 */
+				part_stat_lock();
+				__dm_part_stat_sub(&dm_disk(md)->part0,
+						   sectors[op_stat_group(bio_op(bio))], ci.sector_count);
+				part_stat_unlock();
+
 				bio_chain(b, bio);
-				ret = generic_make_request(bio);
+				trace_block_split(md->queue, b, bio->bi_iter.bi_sector);
+				ret = submit_bio_noacct(bio);
 				break;
 			}
 		}
@@ -1644,121 +1689,38 @@
 	return ret;
 }
 
-/*
- * Optimized variant of __split_and_process_bio that leverages the
- * fact that targets that use it do _not_ have a need to split bios.
- */
-static blk_qc_t __process_bio(struct mapped_device *md,
-			      struct dm_table *map, struct bio *bio)
+static blk_qc_t dm_submit_bio(struct bio *bio)
 {
-	struct clone_info ci;
-	blk_qc_t ret = BLK_QC_T_NONE;
-	int error = 0;
-
-	if (unlikely(!map)) {
-		bio_io_error(bio);
-		return ret;
-	}
-
-	init_clone_info(&ci, md, map, bio);
-
-	if (bio->bi_opf & REQ_PREFLUSH) {
-		ci.bio = &ci.io->md->flush_bio;
-		ci.sector_count = 0;
-		error = __send_empty_flush(&ci);
-		/* dec_pending submits any data associated with flush */
-	} else {
-		struct dm_target *ti = md->immutable_target;
-		struct dm_target_io *tio;
-
-		/*
-		 * Defend against IO still getting in during teardown
-		 * - as was seen for a time with nvme-fcloop
-		 */
-		if (unlikely(WARN_ON_ONCE(!ti || !dm_target_is_valid(ti)))) {
-			error = -EIO;
-			goto out;
-		}
-
-		ci.bio = bio;
-		ci.sector_count = bio_sectors(bio);
-		if (unlikely(__process_abnormal_io(&ci, ti, &error)))
-			goto out;
-
-		tio = alloc_tio(&ci, ti, 0, GFP_NOIO);
-		ret = __clone_and_map_simple_bio(&ci, tio, NULL);
-	}
-out:
-	/* drop the extra reference count */
-	dec_pending(ci.io, errno_to_blk_status(error));
-	return ret;
-}
-
-typedef blk_qc_t (process_bio_fn)(struct mapped_device *, struct dm_table *, struct bio *);
-
-static blk_qc_t __dm_make_request(struct request_queue *q, struct bio *bio,
-				  process_bio_fn process_bio)
-{
-	struct mapped_device *md = q->queuedata;
+	struct mapped_device *md = bio->bi_disk->private_data;
 	blk_qc_t ret = BLK_QC_T_NONE;
 	int srcu_idx;
 	struct dm_table *map;
 
 	map = dm_get_live_table(md, &srcu_idx);
-	/* if we're suspended, we have to queue this io for later */
-	if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
-		dm_put_live_table(md, srcu_idx);
 
-		if (!(bio->bi_opf & REQ_RAHEAD))
-			queue_io(md, bio);
-		else
+	/* If suspended, or map not yet available, queue this IO for later */
+	if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) ||
+	    unlikely(!map)) {
+		if (bio->bi_opf & REQ_NOWAIT)
+			bio_wouldblock_error(bio);
+		else if (bio->bi_opf & REQ_RAHEAD)
 			bio_io_error(bio);
-		return ret;
+		else
+			queue_io(md, bio);
+		goto out;
 	}
 
-	ret = process_bio(md, map, bio);
+	/*
+	 * Use blk_queue_split() for abnormal IO (e.g. discard, writesame, etc)
+	 * otherwise associated queue_limits won't be imposed.
+	 */
+	if (is_abnormal_io(bio))
+		blk_queue_split(&bio);
 
+	ret = __split_and_process_bio(md, map, bio);
+out:
 	dm_put_live_table(md, srcu_idx);
 	return ret;
-}
-
-/*
- * The request function that remaps the bio to one target and
- * splits off any remainder.
- */
-static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio)
-{
-	return __dm_make_request(q, bio, __split_and_process_bio);
-}
-
-static blk_qc_t dm_make_request_nvme(struct request_queue *q, struct bio *bio)
-{
-	return __dm_make_request(q, bio, __process_bio);
-}
-
-static int dm_any_congested(void *congested_data, int bdi_bits)
-{
-	int r = bdi_bits;
-	struct mapped_device *md = congested_data;
-	struct dm_table *map;
-
-	if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
-		if (dm_request_based(md)) {
-			/*
-			 * With request-based DM we only need to check the
-			 * top-level queue for congestion.
-			 */
-			r = md->queue->backing_dev_info->wb.state & bdi_bits;
-		} else {
-			map = dm_get_live_table_fast(md);
-			if (map)
-				r = dm_table_any_congested(map, bdi_bits);
-			dm_put_live_table_fast(md);
-		}
-	}
-
-	return r;
 }
 
 /*-----------------------------------------------------------------
@@ -1811,29 +1773,28 @@
 }
 
 static const struct block_device_operations dm_blk_dops;
+static const struct block_device_operations dm_rq_blk_dops;
 static const struct dax_operations dm_dax_ops;
 
 static void dm_wq_work(struct work_struct *work);
 
-static void dm_init_normal_md_queue(struct mapped_device *md)
+#ifdef CONFIG_BLK_INLINE_ENCRYPTION
+static void dm_queue_destroy_keyslot_manager(struct request_queue *q)
 {
-	md->use_blk_mq = false;
-
-	/*
-	 * Initialize aspects of queue that aren't relevant for blk-mq
-	 */
-	md->queue->backing_dev_info->congested_data = md;
-	md->queue->backing_dev_info->congested_fn = dm_any_congested;
+	dm_destroy_keyslot_manager(q->ksm);
 }
 
-static void dm_destroy_inline_encryption(struct request_queue *q);
+#else /* CONFIG_BLK_INLINE_ENCRYPTION */
+
+static inline void dm_queue_destroy_keyslot_manager(struct request_queue *q)
+{
+}
+
+#endif /* !CONFIG_BLK_INLINE_ENCRYPTION */
 
 static void cleanup_mapped_device(struct mapped_device *md)
 {
 	if (md->wq)
 		destroy_workqueue(md->wq);
-	if (md->kworker_task)
-		kthread_stop(md->kworker_task);
 
 	bioset_exit(&md->bs);
 	bioset_exit(&md->io_bs);
@@ -1852,7 +1813,7 @@
 	}
 
 	if (md->queue) {
-		dm_destroy_inline_encryption(md->queue);
+		dm_queue_destroy_keyslot_manager(md->queue);
 		blk_cleanup_queue(md->queue);
 	}
 
@@ -1866,6 +1827,7 @@
 	mutex_destroy(&md->suspend_lock);
 	mutex_destroy(&md->type_lock);
 	mutex_destroy(&md->table_devices_lock);
+	mutex_destroy(&md->swap_bios_lock);
 
 	dm_mq_cleanup_mapped_device(md);
 }
@@ -1876,7 +1838,6 @@
 static struct mapped_device *alloc_dev(int minor)
 {
 	int r, numa_node_id = dm_get_numa_node();
-	struct dax_device *dax_dev = NULL;
 	struct mapped_device *md;
 	void *old_md;
 
@@ -1902,7 +1863,6 @@
 		goto bad_io_barrier;
 
 	md->numa_node_id = numa_node_id;
-	md->use_blk_mq = dm_use_blk_mq_default();
 	md->init_tio_pdu = false;
 	md->type = DM_TYPE_NONE;
 	mutex_init(&md->suspend_lock);
@@ -1917,28 +1877,27 @@
 	INIT_LIST_HEAD(&md->table_devices);
 	spin_lock_init(&md->uevent_lock);
 
-	md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id, NULL);
+	/*
+	 * default to bio-based until DM table is loaded and md->type
+	 * established. If request-based table is loaded: blk-mq will
+	 * override accordingly.
+	 */
+	md->queue = blk_alloc_queue(numa_node_id);
 	if (!md->queue)
 		goto bad;
-	md->queue->queuedata = md;
-	/*
-	 * default to bio-based required ->make_request_fn until DM
-	 * table is loaded and md->type established. If request-based
-	 * table is loaded: blk-mq will override accordingly.
-	 */
-	blk_queue_make_request(md->queue, dm_make_request);
 
 	md->disk = alloc_disk_node(1, md->numa_node_id);
 	if (!md->disk)
 		goto bad;
 
-	atomic_set(&md->pending[0], 0);
-	atomic_set(&md->pending[1], 0);
 	init_waitqueue_head(&md->wait);
 	INIT_WORK(&md->work, dm_wq_work);
 	init_waitqueue_head(&md->eventq);
 	init_completion(&md->kobj_holder.completion);
-	md->kworker_task = NULL;
+
+	md->swap_bios = get_swap_bios();
+	sema_init(&md->swap_bios_semaphore, md->swap_bios);
+	mutex_init(&md->swap_bios_lock);
 
 	md->disk->major = _major;
 	md->disk->first_minor = minor;
@@ -1948,11 +1907,13 @@
 	sprintf(md->disk->disk_name, "dm-%d", minor);
 
 	if (IS_ENABLED(CONFIG_DAX_DRIVER)) {
-		dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops);
-		if (!dax_dev)
+		md->dax_dev = alloc_dax(md, md->disk->disk_name,
+					&dm_dax_ops, 0);
+		if (IS_ERR(md->dax_dev)) {
+			md->dax_dev = NULL;
 			goto bad;
+		}
 	}
-	md->dax_dev = dax_dev;
 
 	add_disk_no_queue_reg(md->disk);
 	format_dev_t(md->name, MKDEV(_major, minor));
@@ -1965,11 +1926,9 @@
 	if (!md->bdev)
 		goto bad;
 
-	bio_init(&md->flush_bio, NULL, 0);
-	bio_set_dev(&md->flush_bio, md->bdev);
-	md->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
-
-	dm_stats_init(&md->stats);
+	r = dm_stats_init(&md->stats);
+	if (r < 0)
+		goto bad;
 
 	/* Populate the mapping, nobody knows we exist yet */
 	spin_lock(&_minor_lock);
@@ -2072,18 +2031,6 @@
 }
 
 /*
- * Protected by md->suspend_lock obtained by dm_swap_table().
- */
-static void __set_size(struct mapped_device *md, sector_t size)
-{
-	lockdep_assert_held(&md->suspend_lock);
-
-	set_capacity(md->disk, size);
-
-	i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
-}
-
-/*
  * Returns old map, which caller must destroy.
  */
static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
@@ -2105,7 +2052,8 @@
 	if (size != dm_get_size(md))
 		memset(&md->geometry, 0, sizeof(md->geometry));
 
-	__set_size(md, size);
+	set_capacity(md->disk, size);
+	bd_set_nr_sectors(md->bdev, size);
 
 	dm_table_event_callback(t, event_callback, md);
 
@@ -2119,12 +2067,10 @@
 	if (request_based)
 		dm_stop_queue(q);
 
-	if (request_based || md->type == DM_TYPE_NVME_BIO_BASED) {
+	if (request_based) {
 		/*
-		 * Leverage the fact that request-based DM targets and
-		 * NVMe bio based targets are immutable singletons
-		 * - used to optimize both dm_request_fn and dm_mq_queue_rq;
-		 * and __process_bio.
+		 * Leverage the fact that request-based DM targets are
+		 * immutable singletons - used to optimize dm_mq_queue_rq.
		 */
		md->immutable_target = dm_table_get_immutable_target(t);
	}
@@ -2227,166 +2173,6 @@
 }
 EXPORT_SYMBOL_GPL(dm_get_queue_limits);
 
-#ifdef CONFIG_BLK_INLINE_ENCRYPTION
-struct dm_keyslot_evict_args {
-	const struct blk_crypto_key *key;
-	int err;
-};
-
-static int dm_keyslot_evict_callback(struct dm_target *ti, struct dm_dev *dev,
-				     sector_t start, sector_t len, void *data)
-{
-	struct dm_keyslot_evict_args *args = data;
-	int err;
-
-	err = blk_crypto_evict_key(dev->bdev->bd_queue, args->key);
-	if (!args->err)
-		args->err = err;
-	/* Always try to evict the key from all devices. */
-	return 0;
-}
-
-/*
- * When an inline encryption key is evicted from a device-mapper device, evict
- * it from all the underlying devices.
- */
-static int dm_keyslot_evict(struct keyslot_manager *ksm,
-			    const struct blk_crypto_key *key, unsigned int slot)
-{
-	struct mapped_device *md = keyslot_manager_private(ksm);
-	struct dm_keyslot_evict_args args = { key };
-	struct dm_table *t;
-	int srcu_idx;
-	int i;
-	struct dm_target *ti;
-
-	t = dm_get_live_table(md, &srcu_idx);
-	if (!t)
-		return 0;
-	for (i = 0; i < dm_table_get_num_targets(t); i++) {
-		ti = dm_table_get_target(t, i);
-		if (!ti->type->iterate_devices)
-			continue;
-		ti->type->iterate_devices(ti, dm_keyslot_evict_callback, &args);
-	}
-	dm_put_live_table(md, srcu_idx);
-	return args.err;
-}
-
-struct dm_derive_raw_secret_args {
-	const u8 *wrapped_key;
-	unsigned int wrapped_key_size;
-	u8 *secret;
-	unsigned int secret_size;
-	int err;
-};
-
-static int dm_derive_raw_secret_callback(struct dm_target *ti,
-					 struct dm_dev *dev, sector_t start,
-					 sector_t len, void *data)
-{
-	struct dm_derive_raw_secret_args *args = data;
-	struct request_queue *q = dev->bdev->bd_queue;
-
-	if (!args->err)
-		return 0;
-
-	if (!q->ksm) {
-		args->err = -EOPNOTSUPP;
-		return 0;
-	}
-
-	args->err = keyslot_manager_derive_raw_secret(q->ksm, args->wrapped_key,
-						      args->wrapped_key_size,
-						      args->secret,
-						      args->secret_size);
-	/* Try another device in case this fails. */
-	return 0;
-}
-
-/*
- * Retrieve the raw_secret from the underlying device. Given that
- * only only one raw_secret can exist for a particular wrappedkey,
- * retrieve it only from the first device that supports derive_raw_secret()
- */
-static int dm_derive_raw_secret(struct keyslot_manager *ksm,
-				const u8 *wrapped_key,
-				unsigned int wrapped_key_size,
-				u8 *secret, unsigned int secret_size)
-{
-	struct mapped_device *md = keyslot_manager_private(ksm);
-	struct dm_derive_raw_secret_args args = {
-		.wrapped_key = wrapped_key,
-		.wrapped_key_size = wrapped_key_size,
-		.secret = secret,
-		.secret_size = secret_size,
-		.err = -EOPNOTSUPP,
-	};
-	struct dm_table *t;
-	int srcu_idx;
-	int i;
-	struct dm_target *ti;
-
-	t = dm_get_live_table(md, &srcu_idx);
-	if (!t)
-		return -EOPNOTSUPP;
-	for (i = 0; i < dm_table_get_num_targets(t); i++) {
-		ti = dm_table_get_target(t, i);
-		if (!ti->type->iterate_devices)
-			continue;
-		ti->type->iterate_devices(ti, dm_derive_raw_secret_callback,
-					  &args);
-		if (!args.err)
-			break;
-	}
-	dm_put_live_table(md, srcu_idx);
-	return args.err;
-}
-
-static struct keyslot_mgmt_ll_ops dm_ksm_ll_ops = {
-	.keyslot_evict = dm_keyslot_evict,
-	.derive_raw_secret = dm_derive_raw_secret,
-};
-
-static int dm_init_inline_encryption(struct mapped_device *md)
-{
-	unsigned int features;
-	unsigned int mode_masks[BLK_ENCRYPTION_MODE_MAX];
-
-	/*
-	 * Initially declare support for all crypto settings. Anything
-	 * unsupported by a child device will be removed later when calculating
-	 * the device restrictions.
-	 */
-	features = BLK_CRYPTO_FEATURE_STANDARD_KEYS |
-		   BLK_CRYPTO_FEATURE_WRAPPED_KEYS;
-	memset(mode_masks, 0xFF, sizeof(mode_masks));
-
-	md->queue->ksm = keyslot_manager_create_passthrough(NULL,
-							    &dm_ksm_ll_ops,
-							    features,
-							    mode_masks, md);
-	if (!md->queue->ksm)
-		return -ENOMEM;
-	return 0;
-}
-
-static void dm_destroy_inline_encryption(struct request_queue *q)
-{
-	keyslot_manager_destroy(q->ksm);
-	q->ksm = NULL;
-}
-#else /* CONFIG_BLK_INLINE_ENCRYPTION */
-static inline int dm_init_inline_encryption(struct mapped_device *md)
-{
-	return 0;
-}
-
-static inline void dm_destroy_inline_encryption(struct request_queue *q)
-{
-}
-#endif /* !CONFIG_BLK_INLINE_ENCRYPTION */
-
 /*
  * Setup the DM device's queue based on md's type
 */
@@ -2398,27 +2184,15 @@
 
 	switch (type) {
 	case DM_TYPE_REQUEST_BASED:
-		dm_init_normal_md_queue(md);
-		r = dm_old_init_request_queue(md, t);
-		if (r) {
-			DMERR("Cannot initialize queue for request-based mapped device");
-			return r;
-		}
-		break;
-	case DM_TYPE_MQ_REQUEST_BASED:
+		md->disk->fops = &dm_rq_blk_dops;
 		r = dm_mq_init_request_queue(md, t);
 		if (r) {
-			DMERR("Cannot initialize queue for request-based dm-mq mapped device");
+			DMERR("Cannot initialize queue for request-based dm mapped device");
 			return r;
 		}
 		break;
 	case DM_TYPE_BIO_BASED:
 	case DM_TYPE_DAX_BIO_BASED:
-		dm_init_normal_md_queue(md);
-		break;
-	case DM_TYPE_NVME_BIO_BASED:
-		dm_init_normal_md_queue(md);
-		blk_queue_make_request(md->queue, dm_make_request_nvme);
 		break;
 	case DM_TYPE_NONE:
 		WARN_ON_ONCE(true);
@@ -2430,13 +2204,6 @@
 		DMERR("Cannot calculate initial queue limits");
 		return r;
 	}
-
-	r = dm_init_inline_encryption(md);
-	if (r) {
-		DMERR("Cannot initialize inline encryption");
-		return r;
-	}
-
 	dm_table_set_restrictions(t, md->queue, &limits);
 	blk_register_queue(md->disk);
 
@@ -2516,9 +2283,6 @@
 
 	blk_set_queue_dying(md->queue);
 
-	if (dm_request_based(md) && md->kworker_task)
-		kthread_flush_worker(&md->kworker);
-
 	/*
 	 * Take suspend_lock so that presuspend and postsuspend methods
 	 * do not race with internal suspend.
@@ -2569,15 +2333,29 @@
 }
 EXPORT_SYMBOL_GPL(dm_put);
 
-static int dm_wait_for_completion(struct mapped_device *md, long task_state)
+static bool md_in_flight_bios(struct mapped_device *md)
+{
+	int cpu;
+	struct hd_struct *part = &dm_disk(md)->part0;
+	long sum = 0;
+
+	for_each_possible_cpu(cpu) {
+		sum += part_stat_local_read_cpu(part, in_flight[0], cpu);
+		sum += part_stat_local_read_cpu(part, in_flight[1], cpu);
+	}
+
+	return sum != 0;
+}
+
+static int dm_wait_for_bios_completion(struct mapped_device *md, long task_state)
 {
 	int r = 0;
 	DEFINE_WAIT(wait);
 
-	while (1) {
+	while (true) {
 		prepare_to_wait(&md->wait, &wait, task_state);
 
-		if (!md_in_flight(md))
+		if (!md_in_flight_bios(md))
 			break;
 
 		if (signal_pending_state(task_state, current)) {
@@ -2589,6 +2367,30 @@
 	}
 	finish_wait(&md->wait, &wait);
 
+	smp_rmb();
+
+	return r;
+}
+
+static int dm_wait_for_completion(struct mapped_device *md, long task_state)
+{
+	int r = 0;
+
+	if (!queue_is_mq(md->queue))
+		return dm_wait_for_bios_completion(md, task_state);
+
+	while (true) {
+		if (!blk_mq_queue_inflight(md->queue))
+			break;
+
+		if (signal_pending_state(task_state, current)) {
+			r = -EINTR;
+			break;
+		}
+
+		msleep(5);
+	}
+
 	return r;
 }
 
@@ -2597,29 +2399,20 @@
 */
 static void dm_wq_work(struct work_struct *work)
 {
-	struct mapped_device *md = container_of(work, struct mapped_device,
-						work);
-	struct bio *c;
-	int srcu_idx;
-	struct dm_table *map;
-
-	map = dm_get_live_table(md, &srcu_idx);
+	struct mapped_device *md = container_of(work, struct mapped_device, work);
+	struct bio *bio;
 
 	while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
 		spin_lock_irq(&md->deferred_lock);
-		c = bio_list_pop(&md->deferred);
+		bio = bio_list_pop(&md->deferred);
 		spin_unlock_irq(&md->deferred_lock);
 
-		if (!c)
+		if (!bio)
 			break;
 
-		if (dm_request_based(md))
-			generic_make_request(c);
-		else
-			__split_and_process_bio(md, map, c);
+		submit_bio_noacct(bio);
+		cond_resched();
 	}
-
-	dm_put_live_table(md, srcu_idx);
 }
 
 static void dm_queue_flush(struct mapped_device *md)
@@ -2681,27 +2474,19 @@
 {
 	int r;
 
-	WARN_ON(md->frozen_sb);
+	WARN_ON(test_bit(DMF_FROZEN, &md->flags));
 
-	md->frozen_sb = freeze_bdev(md->bdev);
-	if (IS_ERR(md->frozen_sb)) {
-		r = PTR_ERR(md->frozen_sb);
-		md->frozen_sb = NULL;
-		return r;
-	}
-
-	set_bit(DMF_FROZEN, &md->flags);
-
-	return 0;
+	r = freeze_bdev(md->bdev);
+	if (!r)
+		set_bit(DMF_FROZEN, &md->flags);
+	return r;
 }
 
 static void unlock_fs(struct mapped_device *md)
 {
 	if (!test_bit(DMF_FROZEN, &md->flags))
 		return;
-
-	thaw_bdev(md->bdev, md->frozen_sb);
-	md->frozen_sb = NULL;
+	thaw_bdev(md->bdev);
 	clear_bit(DMF_FROZEN, &md->flags);
 }
 
@@ -2731,7 +2516,7 @@
 	if (noflush)
 		set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
 	else
-		pr_debug("%s: suspending with flush\n", dm_device_name(md));
+		DMDEBUG("%s: suspending with flush", dm_device_name(md));
 
 	/*
 	 * This gets reverted if there's an error later and the targets
@@ -2756,13 +2541,12 @@
 	/*
 	 * Here we must make sure that no processes are submitting requests
 	 * to target drivers i.e. no one may be executing
-	 * __split_and_process_bio. This is called from dm_request and
-	 * dm_wq_work.
+	 * __split_and_process_bio from dm_submit_bio.
 	 *
-	 * To get all processes out of __split_and_process_bio in dm_request,
+	 * To get all processes out of __split_and_process_bio in dm_submit_bio,
 	 * we take the write lock. To prevent any process from reentering
-	 * __split_and_process_bio from dm_request and quiesce the thread
-	 * (dm_wq_work), we set BMF_BLOCK_IO_FOR_SUSPEND and call
+	 * __split_and_process_bio from dm_submit_bio and quiesce the thread
+	 * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call
 	 * flush_workqueue(md->wq).
 	 */
 	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
@@ -2773,11 +2557,8 @@
 	 * Stop md->queue before flushing md->wq in case request-based
 	 * dm defers requests to md->wq from md->queue.
 	 */
-	if (dm_request_based(md)) {
+	if (dm_request_based(md))
 		dm_stop_queue(md->queue);
-		if (md->kworker_task)
-			kthread_flush_worker(&md->kworker);
-	}
 
 	flush_workqueue(md->wq);
 
@@ -3133,19 +2914,19 @@
 
 int dm_suspended(struct dm_target *ti)
 {
-	return dm_suspended_md(dm_table_get_md(ti->table));
+	return dm_suspended_md(ti->table->md);
 }
 EXPORT_SYMBOL_GPL(dm_suspended);
 
 int dm_post_suspending(struct dm_target *ti)
 {
-	return dm_post_suspending_md(dm_table_get_md(ti->table));
+	return dm_post_suspending_md(ti->table->md);
 }
 EXPORT_SYMBOL_GPL(dm_post_suspending);
 
 int dm_noflush_suspending(struct dm_target *ti)
 {
-	return __noflush_suspending(dm_table_get_md(ti->table));
+	return __noflush_suspending(ti->table->md);
 }
 EXPORT_SYMBOL_GPL(dm_noflush_suspending);
 
@@ -3164,7 +2945,6 @@
 	switch (type) {
 	case DM_TYPE_BIO_BASED:
 	case DM_TYPE_DAX_BIO_BASED:
-	case DM_TYPE_NVME_BIO_BASED:
 		pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size);
 		front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
 		io_front_pad = roundup(front_pad, __alignof__(struct dm_io)) + offsetof(struct dm_io, tio);
@@ -3175,7 +2955,6 @@
 			goto out;
 		break;
 	case DM_TYPE_REQUEST_BASED:
-	case DM_TYPE_MQ_REQUEST_BASED:
 		pool_size = max(dm_get_reserved_rq_based_ios(), min_pool_size);
 		front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
 		/* per_io_data_size is used for blk-mq pdu at queue allocation */
@@ -3233,6 +3012,11 @@
 	if (dm_table_get_num_targets(table) != 1)
 		goto out;
 	ti = dm_table_get_target(table, 0);
+
+	if (dm_suspended_md(md)) {
+		ret = -EAGAIN;
+		goto out;
+	}
 
 	ret = -EINVAL;
 	if (!ti->type->iterate_devices)
@@ -3373,6 +3157,17 @@
 };
 
 static const struct block_device_operations dm_blk_dops = {
+	.submit_bio = dm_submit_bio,
+	.open = dm_blk_open,
+	.release = dm_blk_close,
+	.ioctl = dm_blk_ioctl,
+	.getgeo = dm_blk_getgeo,
+	.report_zones = dm_blk_report_zones,
+	.pr_ops = &dm_pr_ops,
+	.owner = THIS_MODULE
+};
+
+static const struct block_device_operations dm_rq_blk_dops = {
 	.open = dm_blk_open,
 	.release = dm_blk_close,
 	.ioctl = dm_blk_ioctl,
@@ -3383,8 +3178,10 @@
 
 static const struct dax_operations dm_dax_ops = {
 	.direct_access = dm_dax_direct_access,
+	.dax_supported = dm_dax_supported,
 	.copy_from_iter = dm_dax_copy_from_iter,
 	.copy_to_iter = dm_dax_copy_to_iter,
+	.zero_page_range = dm_dax_zero_page_range,
 };
 
 /*
@@ -3402,6 +3199,9 @@
 module_param(dm_numa_node, int, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations");
 
+module_param(swap_bios, int, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(swap_bios, "Maximum allowed inflight swap IOs");
+
 MODULE_DESCRIPTION(DM_NAME " driver");
 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
 MODULE_LICENSE("GPL");
-- 
Gitblit v1.6.2