From 102a0743326a03cd1a1202ceda21e175b7d3575c Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Tue, 20 Feb 2024 01:20:52 +0000
Subject: [PATCH] add new system file

---
 kernel/block/blk-mq.c | 2329 ++++++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 1573 insertions(+), 756 deletions(-)

diff --git a/kernel/block/blk-mq.c b/kernel/block/blk-mq.c
index ae70b48..21544b1 100644
--- a/kernel/block/blk-mq.c
+++ b/kernel/block/blk-mq.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * Block multiqueue core code
  *
@@ -25,30 +26,36 @@
 #include <linux/delay.h>
 #include <linux/crash_dump.h>
 #include <linux/prefetch.h>
+#include <linux/blk-crypto.h>
 
 #include <trace/events/block.h>
 
 #include <linux/blk-mq.h>
+#include <linux/t10-pi.h>
 #include "blk.h"
 #include "blk-mq.h"
 #include "blk-mq-debugfs.h"
 #include "blk-mq-tag.h"
+#include "blk-pm.h"
 #include "blk-stat.h"
 #include "blk-mq-sched.h"
 #include "blk-rq-qos.h"
 
-static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie);
+#include <trace/hooks/block.h>
+
+static DEFINE_PER_CPU(struct list_head, blk_cpu_done);
+
 static void blk_mq_poll_stats_start(struct request_queue *q);
 static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
 
 static int blk_mq_poll_stats_bkt(const struct request *rq)
 {
-	int ddir, bytes, bucket;
+	int ddir, sectors, bucket;
 
 	ddir = rq_data_dir(rq);
-	bytes = blk_rq_bytes(rq);
+	sectors = blk_rq_stats_sectors(rq);
 
-	bucket = ddir + 2*(ilog2(bytes) - 9);
+	bucket = ddir + 2 * ilog2(sectors);
 
 	if (bucket < 0)
 		return -1;
@@ -59,7 +66,8 @@
 }
 
 /*
- * Check if any of the ctx's have pending work in this hardware queue
+ * Check if any of the ctx, dispatch list or elevator
+ * have pending work in this hardware queue.
  */
 static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
 {
@@ -74,75 +82,67 @@
 static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
				     struct blk_mq_ctx *ctx)
 {
-	if (!sbitmap_test_bit(&hctx->ctx_map, ctx->index_hw))
-		sbitmap_set_bit(&hctx->ctx_map, ctx->index_hw);
+	const int bit = ctx->index_hw[hctx->type];
+
+	if (!sbitmap_test_bit(&hctx->ctx_map, bit))
+		sbitmap_set_bit(&hctx->ctx_map, bit);
 }
 
 static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
				      struct blk_mq_ctx *ctx)
 {
-	sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw);
+	const int bit = ctx->index_hw[hctx->type];
+
+	sbitmap_clear_bit(&hctx->ctx_map, bit);
 }
 
 struct mq_inflight {
	struct hd_struct *part;
-	unsigned int *inflight;
+	unsigned int inflight[2];
 };
 
-static void blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
+static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
				  struct request *rq, void *priv,
				  bool reserved)
 {
	struct mq_inflight *mi = priv;
 
-	/*
-	 * index[0] counts the specific partition that was asked for. index[1]
-	 * counts the ones that are active on the whole device, so increment
-	 * that if mi->part is indeed a partition, and not a whole device.
- */ - if (rq->part == mi->part) - mi->inflight[0]++; - if (mi->part->partno) - mi->inflight[1]++; -} - -void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part, - unsigned int inflight[2]) -{ - struct mq_inflight mi = { .part = part, .inflight = inflight, }; - - inflight[0] = inflight[1] = 0; - blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi); -} - -static void blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx, - struct request *rq, void *priv, - bool reserved) -{ - struct mq_inflight *mi = priv; - - if (rq->part == mi->part) + if ((!mi->part->partno || rq->part == mi->part) && + blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) mi->inflight[rq_data_dir(rq)]++; + + return true; +} + +unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part) +{ + struct mq_inflight mi = { .part = part }; + + blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi); + + return mi.inflight[0] + mi.inflight[1]; } void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, unsigned int inflight[2]) { - struct mq_inflight mi = { .part = part, .inflight = inflight, }; + struct mq_inflight mi = { .part = part }; - inflight[0] = inflight[1] = 0; - blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight_rw, &mi); + blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi); + inflight[0] = mi.inflight[0]; + inflight[1] = mi.inflight[1]; } void blk_freeze_queue_start(struct request_queue *q) { - int freeze_depth; - - freeze_depth = atomic_inc_return(&q->mq_freeze_depth); - if (freeze_depth == 1) { + mutex_lock(&q->mq_freeze_lock); + if (++q->mq_freeze_depth == 1) { percpu_ref_kill(&q->q_usage_counter); - if (q->mq_ops) + mutex_unlock(&q->mq_freeze_lock); + if (queue_is_mq(q)) blk_mq_run_hw_queues(q, false); + } else { + mutex_unlock(&q->mq_freeze_lock); } } EXPORT_SYMBOL_GPL(blk_freeze_queue_start); @@ -176,8 +176,6 @@ * exported to drivers as the only user for unfreeze is blk_mq. */ blk_freeze_queue_start(q); - if (!q->mq_ops) - blk_drain_queue(q); blk_mq_freeze_queue_wait(q); } @@ -193,14 +191,14 @@ void blk_mq_unfreeze_queue(struct request_queue *q) { - int freeze_depth; - - freeze_depth = atomic_dec_return(&q->mq_freeze_depth); - WARN_ON_ONCE(freeze_depth < 0); - if (!freeze_depth) { - percpu_ref_reinit(&q->q_usage_counter); + mutex_lock(&q->mq_freeze_lock); + q->mq_freeze_depth--; + WARN_ON_ONCE(q->mq_freeze_depth < 0); + if (!q->mq_freeze_depth) { + percpu_ref_resurrect(&q->q_usage_counter); wake_up_all(&q->mq_freeze_wq); } + mutex_unlock(&q->mq_freeze_lock); } EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue); @@ -268,40 +266,37 @@ blk_mq_tag_wakeup_all(hctx->tags, true); } -bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) +/* + * Only need start/end time stamping if we have iostat or + * blk stats enabled, or using an IO scheduler. 
+ */ +static inline bool blk_mq_need_time_stamp(struct request *rq) { - return blk_mq_has_free_tags(hctx->tags); + return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS)) || rq->q->elevator; } -EXPORT_SYMBOL(blk_mq_can_queue); static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, - unsigned int tag, unsigned int op) + unsigned int tag, u64 alloc_time_ns) { struct blk_mq_tags *tags = blk_mq_tags_from_data(data); struct request *rq = tags->static_rqs[tag]; - req_flags_t rq_flags = 0; - if (data->flags & BLK_MQ_REQ_INTERNAL) { - rq->tag = -1; + if (data->q->elevator) { + rq->tag = BLK_MQ_NO_TAG; rq->internal_tag = tag; } else { - if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) { - rq_flags = RQF_MQ_INFLIGHT; - atomic_inc(&data->hctx->nr_active); - } rq->tag = tag; - rq->internal_tag = -1; - data->hctx->tags->rqs[rq->tag] = rq; + rq->internal_tag = BLK_MQ_NO_TAG; } /* csd/requeue_work/fifo_time is initialized before use */ rq->q = data->q; rq->mq_ctx = data->ctx; - rq->rq_flags = rq_flags; - rq->cpu = -1; - rq->cmd_flags = op; - if (data->flags & BLK_MQ_REQ_PREEMPT) - rq->rq_flags |= RQF_PREEMPT; + rq->mq_hctx = data->hctx; + rq->rq_flags = 0; + rq->cmd_flags = data->cmd_flags; + if (data->flags & BLK_MQ_REQ_PM) + rq->rq_flags |= RQF_PM; if (blk_queue_io_stat(data->q)) rq->rq_flags |= RQF_IO_STAT; INIT_LIST_HEAD(&rq->queuelist); @@ -309,97 +304,110 @@ RB_CLEAR_NODE(&rq->rb_node); rq->rq_disk = NULL; rq->part = NULL; - rq->start_time_ns = ktime_get_ns(); +#ifdef CONFIG_BLK_RQ_ALLOC_TIME + rq->alloc_time_ns = alloc_time_ns; +#endif + if (blk_mq_need_time_stamp(rq)) + rq->start_time_ns = ktime_get_ns(); + else + rq->start_time_ns = 0; rq->io_start_time_ns = 0; + rq->stats_sectors = 0; rq->nr_phys_segments = 0; #if defined(CONFIG_BLK_DEV_INTEGRITY) rq->nr_integrity_segments = 0; #endif - rq->special = NULL; + blk_crypto_rq_set_defaults(rq); /* tag was already set */ - rq->extra_len = 0; - rq->__deadline = 0; + WRITE_ONCE(rq->deadline, 0); - INIT_LIST_HEAD(&rq->timeout_list); rq->timeout = 0; rq->end_io = NULL; rq->end_io_data = NULL; - rq->next_rq = NULL; -#ifdef CONFIG_BLK_CGROUP - rq->rl = NULL; -#endif - - data->ctx->rq_dispatched[op_is_sync(op)]++; + data->ctx->rq_dispatched[op_is_sync(data->cmd_flags)]++; refcount_set(&rq->ref, 1); + + if (!op_is_flush(data->cmd_flags)) { + struct elevator_queue *e = data->q->elevator; + + rq->elv.icq = NULL; + if (e && e->type->ops.prepare_request) { + if (e->type->icq_cache) + blk_mq_sched_assign_ioc(rq); + + e->type->ops.prepare_request(rq); + rq->rq_flags |= RQF_ELVPRIV; + } + } + + data->hctx->queued++; + trace_android_vh_blk_rq_ctx_init(rq, tags, data, alloc_time_ns); return rq; } -static struct request *blk_mq_get_request(struct request_queue *q, - struct bio *bio, unsigned int op, - struct blk_mq_alloc_data *data) +static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data) { + struct request_queue *q = data->q; struct elevator_queue *e = q->elevator; - struct request *rq; + u64 alloc_time_ns = 0; unsigned int tag; - bool put_ctx_on_error = false; - blk_queue_enter_live(q); - data->q = q; - if (likely(!data->ctx)) { - data->ctx = blk_mq_get_ctx(q); - put_ctx_on_error = true; - } - if (likely(!data->hctx)) - data->hctx = blk_mq_map_queue(q, data->ctx->cpu); - if (op & REQ_NOWAIT) + /* alloc_time includes depth and tag waits */ + if (blk_queue_rq_alloc_time(q)) + alloc_time_ns = ktime_get_ns(); + + if (data->cmd_flags & REQ_NOWAIT) data->flags |= BLK_MQ_REQ_NOWAIT; if (e) { - data->flags |= BLK_MQ_REQ_INTERNAL; - /* * Flush 
requests are special and go directly to the * dispatch list. Don't include reserved tags in the * limiting, as it isn't useful. */ - if (!op_is_flush(op) && e->type->ops.mq.limit_depth && + if (!op_is_flush(data->cmd_flags) && + e->type->ops.limit_depth && !(data->flags & BLK_MQ_REQ_RESERVED)) - e->type->ops.mq.limit_depth(op, data); - } else { + e->type->ops.limit_depth(data->cmd_flags, data); + } + +retry: + data->ctx = blk_mq_get_ctx(q); + data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx); + if (!e) blk_mq_tag_busy(data->hctx); - } + /* + * Waiting allocations only fail because of an inactive hctx. In that + * case just retry the hctx assignment and tag allocation as CPU hotplug + * should have migrated us to an online CPU by now. + */ tag = blk_mq_get_tag(data); - if (tag == BLK_MQ_TAG_FAIL) { - if (put_ctx_on_error) { - blk_mq_put_ctx(data->ctx); - data->ctx = NULL; - } - blk_queue_exit(q); - return NULL; - } + if (tag == BLK_MQ_NO_TAG) { + if (data->flags & BLK_MQ_REQ_NOWAIT) + return NULL; - rq = blk_mq_rq_ctx_init(data, tag, op); - if (!op_is_flush(op)) { - rq->elv.icq = NULL; - if (e && e->type->ops.mq.prepare_request) { - if (e->type->icq_cache && rq_ioc(bio)) - blk_mq_sched_assign_ioc(rq, bio); - - e->type->ops.mq.prepare_request(rq, bio); - rq->rq_flags |= RQF_ELVPRIV; - } + /* + * Give up the CPU and sleep for a random short time to ensure + * that thread using a realtime scheduling class are migrated + * off the CPU, and thus off the hctx that is going away. + */ + msleep(3); + goto retry; } - data->hctx->queued++; - return rq; + return blk_mq_rq_ctx_init(data, tag, alloc_time_ns); } struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, blk_mq_req_flags_t flags) { - struct blk_mq_alloc_data alloc_data = { .flags = flags }; + struct blk_mq_alloc_data data = { + .q = q, + .flags = flags, + .cmd_flags = op, + }; struct request *rq; int ret; @@ -407,28 +415,35 @@ if (ret) return ERR_PTR(ret); - rq = blk_mq_get_request(q, NULL, op, &alloc_data); - blk_queue_exit(q); - + rq = __blk_mq_alloc_request(&data); if (!rq) - return ERR_PTR(-EWOULDBLOCK); - - blk_mq_put_ctx(alloc_data.ctx); - + goto out_queue_exit; rq->__data_len = 0; rq->__sector = (sector_t) -1; rq->bio = rq->biotail = NULL; return rq; +out_queue_exit: + blk_queue_exit(q); + return ERR_PTR(-EWOULDBLOCK); } EXPORT_SYMBOL(blk_mq_alloc_request); struct request *blk_mq_alloc_request_hctx(struct request_queue *q, unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx) { - struct blk_mq_alloc_data alloc_data = { .flags = flags }; - struct request *rq; + struct blk_mq_alloc_data data = { + .q = q, + .flags = flags, + .cmd_flags = op, + }; + u64 alloc_time_ns = 0; unsigned int cpu; + unsigned int tag; int ret; + + /* alloc_time includes depth and tag waits */ + if (blk_queue_rq_alloc_time(q)) + alloc_time_ns = ktime_get_ns(); /* * If the tag allocator sleeps we could get an allocation for a @@ -436,7 +451,8 @@ * allocator for this for the rare use case of a command tied to * a specific queue. */ - if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT))) + if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)) || + WARN_ON_ONCE(!(flags & BLK_MQ_REQ_RESERVED))) return ERR_PTR(-EINVAL); if (hctx_idx >= q->nr_hw_queues) @@ -450,21 +466,27 @@ * Check if the hardware context is actually mapped to anything. * If not tell the caller that it should skip this queue. 
*/ - alloc_data.hctx = q->queue_hw_ctx[hctx_idx]; - if (!blk_mq_hw_queue_mapped(alloc_data.hctx)) { - blk_queue_exit(q); - return ERR_PTR(-EXDEV); - } - cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask); - alloc_data.ctx = __blk_mq_get_ctx(q, cpu); + ret = -EXDEV; + data.hctx = q->queue_hw_ctx[hctx_idx]; + if (!blk_mq_hw_queue_mapped(data.hctx)) + goto out_queue_exit; + cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask); + if (cpu >= nr_cpu_ids) + goto out_queue_exit; + data.ctx = __blk_mq_get_ctx(q, cpu); - rq = blk_mq_get_request(q, NULL, op, &alloc_data); + if (!q->elevator) + blk_mq_tag_busy(data.hctx); + + ret = -EWOULDBLOCK; + tag = blk_mq_get_tag(&data); + if (tag == BLK_MQ_NO_TAG) + goto out_queue_exit; + return blk_mq_rq_ctx_init(&data, tag, alloc_time_ns); + +out_queue_exit: blk_queue_exit(q); - - if (!rq) - return ERR_PTR(-EWOULDBLOCK); - - return rq; + return ERR_PTR(ret); } EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx); @@ -472,13 +494,16 @@ { struct request_queue *q = rq->q; struct blk_mq_ctx *ctx = rq->mq_ctx; - struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; const int sched_tag = rq->internal_tag; - if (rq->tag != -1) - blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag); - if (sched_tag != -1) - blk_mq_put_tag(hctx, hctx->sched_tags, ctx, sched_tag); + blk_crypto_free_request(rq); + blk_pm_mark_last_busy(rq); + rq->mq_hctx = NULL; + if (rq->tag != BLK_MQ_NO_TAG) + blk_mq_put_tag(hctx->tags, ctx, rq->tag); + if (sched_tag != BLK_MQ_NO_TAG) + blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag); blk_mq_sched_restart(hctx); blk_queue_exit(q); } @@ -488,11 +513,11 @@ struct request_queue *q = rq->q; struct elevator_queue *e = q->elevator; struct blk_mq_ctx *ctx = rq->mq_ctx; - struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; if (rq->rq_flags & RQF_ELVPRIV) { - if (e && e->type->ops.mq.finish_request) - e->type->ops.mq.finish_request(rq); + if (e && e->type->ops.finish_request) + e->type->ops.finish_request(rq); if (rq->elv.icq) { put_io_context(rq->elv.icq->ioc); rq->elv.icq = NULL; @@ -501,15 +526,12 @@ ctx->rq_completed[rq_is_sync(rq)]++; if (rq->rq_flags & RQF_MQ_INFLIGHT) - atomic_dec(&hctx->nr_active); + __blk_mq_dec_active_requests(hctx); if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq))) laptop_io_completion(q->backing_dev_info); rq_qos_done(q, rq); - - if (blk_rq_rl(rq)) - blk_put_rl(blk_rq_rl(rq)); WRITE_ONCE(rq->state, MQ_RQ_IDLE); if (refcount_dec_and_test(&rq->ref)) @@ -519,12 +541,17 @@ inline void __blk_mq_end_request(struct request *rq, blk_status_t error) { - u64 now = ktime_get_ns(); + u64 now = 0; + + if (blk_mq_need_time_stamp(rq)) + now = ktime_get_ns(); if (rq->rq_flags & RQF_STATS) { blk_mq_poll_stats_start(rq->q); blk_stat_add(rq, now); } + + blk_mq_sched_completed_request(rq, now); blk_account_io_done(rq, now); @@ -532,8 +559,6 @@ rq_qos_done(rq->q, rq); rq->end_io(rq, error); } else { - if (unlikely(blk_bidi_rq(rq))) - blk_mq_free_request(rq->next_rq); blk_mq_free_request(rq); } } @@ -547,43 +572,139 @@ } EXPORT_SYMBOL(blk_mq_end_request); +/* + * Softirq action handler - move entries to local list and loop over them + * while passing them to the queue registered handler. 
+ */ +static __latent_entropy void blk_done_softirq(struct softirq_action *h) +{ + struct list_head *cpu_list, local_list; + + local_irq_disable(); + cpu_list = this_cpu_ptr(&blk_cpu_done); + list_replace_init(cpu_list, &local_list); + local_irq_enable(); + + while (!list_empty(&local_list)) { + struct request *rq; + + rq = list_entry(local_list.next, struct request, ipi_list); + list_del_init(&rq->ipi_list); + rq->q->mq_ops->complete(rq); + } +} + +static void blk_mq_trigger_softirq(struct request *rq) +{ + struct list_head *list; + unsigned long flags; + + local_irq_save(flags); + list = this_cpu_ptr(&blk_cpu_done); + list_add_tail(&rq->ipi_list, list); + + /* + * If the list only contains our just added request, signal a raise of + * the softirq. If there are already entries there, someone already + * raised the irq but it hasn't run yet. + */ + if (list->next == &rq->ipi_list) + raise_softirq_irqoff(BLOCK_SOFTIRQ); + local_irq_restore(flags); +} + +static int blk_softirq_cpu_dead(unsigned int cpu) +{ + /* + * If a CPU goes away, splice its entries to the current CPU + * and trigger a run of the softirq + */ + local_irq_disable(); + list_splice_init(&per_cpu(blk_cpu_done, cpu), + this_cpu_ptr(&blk_cpu_done)); + raise_softirq_irqoff(BLOCK_SOFTIRQ); + local_irq_enable(); + + return 0; +} + + static void __blk_mq_complete_request_remote(void *data) { struct request *rq = data; - rq->q->softirq_done_fn(rq); + /* + * For most of single queue controllers, there is only one irq vector + * for handling I/O completion, and the only irq's affinity is set + * to all possible CPUs. On most of ARCHs, this affinity means the irq + * is handled on one specific CPU. + * + * So complete I/O requests in softirq context in case of single queue + * devices to avoid degrading I/O performance due to irqsoff latency. + */ + if (rq->q->nr_hw_queues == 1) + blk_mq_trigger_softirq(rq); + else + rq->q->mq_ops->complete(rq); } -static void __blk_mq_complete_request(struct request *rq) +static inline bool blk_mq_complete_need_ipi(struct request *rq) { - struct blk_mq_ctx *ctx = rq->mq_ctx; - bool shared = false; - int cpu; + int cpu = raw_smp_processor_id(); - if (!blk_mq_mark_complete(rq)) - return; - if (rq->internal_tag != -1) - blk_mq_sched_completed_request(rq); + if (!IS_ENABLED(CONFIG_SMP) || + !test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) + return false; - if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) { - rq->q->softirq_done_fn(rq); - return; - } + /* same CPU or cache domain? Complete locally */ + if (cpu == rq->mq_ctx->cpu || + (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) && + cpus_share_cache(cpu, rq->mq_ctx->cpu))) + return false; - cpu = get_cpu(); - if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags)) - shared = cpus_share_cache(cpu, ctx->cpu); + /* don't try to IPI to an offline CPU */ + return cpu_online(rq->mq_ctx->cpu); +} - if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) { +bool blk_mq_complete_request_remote(struct request *rq) +{ + WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); + + /* + * For a polled request, always complete locallly, it's pointless + * to redirect the completion. 
+ */ + if (rq->cmd_flags & REQ_HIPRI) + return false; + + if (blk_mq_complete_need_ipi(rq)) { rq->csd.func = __blk_mq_complete_request_remote; rq->csd.info = rq; rq->csd.flags = 0; - smp_call_function_single_async(ctx->cpu, &rq->csd); + smp_call_function_single_async(rq->mq_ctx->cpu, &rq->csd); } else { - rq->q->softirq_done_fn(rq); + if (rq->q->nr_hw_queues > 1) + return false; + blk_mq_trigger_softirq(rq); } - put_cpu(); + + return true; } +EXPORT_SYMBOL_GPL(blk_mq_complete_request_remote); + +/** + * blk_mq_complete_request - end I/O on a request + * @rq: the request being processed + * + * Description: + * Complete a request by scheduling the ->complete_rq operation. + **/ +void blk_mq_complete_request(struct request *rq) +{ + if (!blk_mq_complete_request_remote(rq)) + rq->q->mq_ops->complete(rq); +} +EXPORT_SYMBOL(blk_mq_complete_request); static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx) __releases(hctx->srcu) @@ -606,40 +727,22 @@ } /** - * blk_mq_complete_request - end I/O on a request - * @rq: the request being processed + * blk_mq_start_request - Start processing a request + * @rq: Pointer to request to be started * - * Description: - * Ends all I/O on a request. It does not handle partial completions. - * The actual completion happens out-of-order, through a IPI handler. - **/ -void blk_mq_complete_request(struct request *rq) -{ - if (unlikely(blk_should_fake_timeout(rq->q))) - return; - __blk_mq_complete_request(rq); -} -EXPORT_SYMBOL(blk_mq_complete_request); - -int blk_mq_request_started(struct request *rq) -{ - return blk_mq_rq_state(rq) != MQ_RQ_IDLE; -} -EXPORT_SYMBOL_GPL(blk_mq_request_started); - + * Function used by device drivers to notify the block layer that a request + * is going to be processed now, so blk layer can do proper initializations + * such as starting the timeout timer. + */ void blk_mq_start_request(struct request *rq) { struct request_queue *q = rq->q; - - blk_mq_sched_started_request(rq); trace_block_rq_issue(q, rq); if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { rq->io_start_time_ns = ktime_get_ns(); -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW - rq->throtl_size = blk_rq_sectors(rq); -#endif + rq->stats_sectors = blk_rq_sectors(rq); rq->rq_flags |= RQF_STATS; rq_qos_issue(q, rq); } @@ -649,14 +752,10 @@ blk_add_timer(rq); WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT); - if (q->dma_drain_size && blk_rq_bytes(rq)) { - /* - * Make sure space for the drain appears. We know we can do - * this because max_hw_segments has been adjusted to be one - * fewer than the device can handle. - */ - rq->nr_phys_segments++; - } +#ifdef CONFIG_BLK_DEV_INTEGRITY + if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE) + q->integrity.profile->prepare_fn(rq); +#endif } EXPORT_SYMBOL(blk_mq_start_request); @@ -672,8 +771,6 @@ if (blk_mq_request_started(rq)) { WRITE_ONCE(rq->state, MQ_RQ_IDLE); rq->rq_flags &= ~RQF_TIMED_OUT; - if (q->dma_drain_size && blk_rq_bytes(rq)) - rq->nr_phys_segments--; } } @@ -684,7 +781,6 @@ /* this request will be re-inserted to io scheduler queue */ blk_mq_sched_requeue_request(rq); - BUG_ON(blk_queued_rq(rq)); blk_mq_add_to_requeue_list(rq, true, kick_requeue_list); } EXPORT_SYMBOL(blk_mq_requeue_request); @@ -712,7 +808,7 @@ * merge. 
*/ if (rq->rq_flags & RQF_DONTPREP) - blk_mq_request_bypass_insert(rq, false); + blk_mq_request_bypass_insert(rq, false, false); else blk_mq_sched_insert_request(rq, true, false, false); } @@ -750,7 +846,6 @@ if (kick_requeue_list) blk_mq_kick_requeue_list(q); } -EXPORT_SYMBOL(blk_mq_add_to_requeue_list); void blk_mq_kick_requeue_list(struct request_queue *q) { @@ -777,6 +872,32 @@ } EXPORT_SYMBOL(blk_mq_tag_to_rq); +static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq, + void *priv, bool reserved) +{ + /* + * If we find a request that isn't idle and the queue matches, + * we know the queue is busy. Return false to stop the iteration. + */ + if (blk_mq_request_started(rq) && rq->q == hctx->queue) { + bool *busy = priv; + + *busy = true; + return false; + } + + return true; +} + +bool blk_mq_queue_inflight(struct request_queue *q) +{ + bool busy = false; + + blk_mq_queue_tag_busy_iter(q, blk_mq_rq_inflight, &busy); + return busy; +} +EXPORT_SYMBOL_GPL(blk_mq_queue_inflight); + static void blk_mq_rq_timed_out(struct request *req, bool reserved) { req->rq_flags |= RQF_TIMED_OUT; @@ -801,7 +922,7 @@ if (rq->rq_flags & RQF_TIMED_OUT) return false; - deadline = blk_rq_deadline(rq); + deadline = READ_ONCE(rq->deadline); if (time_after_eq(jiffies, deadline)) return true; @@ -812,43 +933,29 @@ return false; } -static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, +void blk_mq_put_rq_ref(struct request *rq) +{ + if (is_flush_rq(rq)) + rq->end_io(rq, 0); + else if (refcount_dec_and_test(&rq->ref)) + __blk_mq_free_request(rq); +} + +static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, struct request *rq, void *priv, bool reserved) { unsigned long *next = priv; /* - * Just do a quick check if it is expired before locking the request in - * so we're not unnecessarilly synchronizing across CPUs. - */ - if (!blk_mq_req_expired(rq, next)) - return; - - /* - * We have reason to believe the request may be expired. Take a - * reference on the request to lock this request lifetime into its - * currently allocated context to prevent it from being reallocated in - * the event the completion by-passes this timeout handler. - * - * If the reference was already released, then the driver beat the - * timeout handler to posting a natural completion. - */ - if (!refcount_inc_not_zero(&rq->ref)) - return; - - /* - * The request is now locked and cannot be reallocated underneath the - * timeout handler's processing. Re-verify this exact request is truly - * expired; if it is not expired, then the request was completed and - * reallocated as a new request. + * blk_mq_queue_tag_busy_iter() has locked the request, so it cannot + * be reallocated underneath the timeout handler's processing, then + * the expire check is reliable. If the request is not expired, then + * it was completed and reallocated as a new request after returning + * from blk_mq_check_expired(). 
*/ if (blk_mq_req_expired(rq, next)) blk_mq_rq_timed_out(rq, reserved); - - if (is_flush_rq(rq, hctx)) - rq->end_io(rq, 0); - else if (refcount_dec_and_test(&rq->ref)) - __blk_mq_free_request(rq); + return true; } static void blk_mq_timeout_work(struct work_struct *work) @@ -905,9 +1012,10 @@ struct flush_busy_ctx_data *flush_data = data; struct blk_mq_hw_ctx *hctx = flush_data->hctx; struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; + enum hctx_type type = hctx->type; spin_lock(&ctx->lock); - list_splice_tail_init(&ctx->rq_list, flush_data->list); + list_splice_tail_init(&ctx->rq_lists[type], flush_data->list); sbitmap_clear_bit(sb, bitnr); spin_unlock(&ctx->lock); return true; @@ -939,12 +1047,13 @@ struct dispatch_rq_data *dispatch_data = data; struct blk_mq_hw_ctx *hctx = dispatch_data->hctx; struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; + enum hctx_type type = hctx->type; spin_lock(&ctx->lock); - if (!list_empty(&ctx->rq_list)) { - dispatch_data->rq = list_entry_rq(ctx->rq_list.next); + if (!list_empty(&ctx->rq_lists[type])) { + dispatch_data->rq = list_entry_rq(ctx->rq_lists[type].next); list_del_init(&dispatch_data->rq->queuelist); - if (list_empty(&ctx->rq_list)) + if (list_empty(&ctx->rq_lists[type])) sbitmap_clear_bit(sb, bitnr); } spin_unlock(&ctx->lock); @@ -955,7 +1064,7 @@ struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *start) { - unsigned off = start ? start->index_hw : 0; + unsigned off = start ? start->index_hw[hctx->type] : 0; struct dispatch_rq_data data = { .hctx = hctx, .rq = NULL, @@ -975,33 +1084,44 @@ return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1); } -bool blk_mq_get_driver_tag(struct request *rq) +static bool __blk_mq_get_driver_tag(struct request *rq) { - struct blk_mq_alloc_data data = { - .q = rq->q, - .hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), - .flags = BLK_MQ_REQ_NOWAIT, - }; - bool shared; + struct sbitmap_queue *bt = rq->mq_hctx->tags->bitmap_tags; + unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags; + int tag; - if (rq->tag != -1) - goto done; + blk_mq_tag_busy(rq->mq_hctx); - if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag)) - data.flags |= BLK_MQ_REQ_RESERVED; - - shared = blk_mq_tag_busy(data.hctx); - rq->tag = blk_mq_get_tag(&data); - if (rq->tag >= 0) { - if (shared) { - rq->rq_flags |= RQF_MQ_INFLIGHT; - atomic_inc(&data.hctx->nr_active); - } - data.hctx->tags->rqs[rq->tag] = rq; + if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) { + bt = rq->mq_hctx->tags->breserved_tags; + tag_offset = 0; + } else { + if (!hctx_may_queue(rq->mq_hctx, bt)) + return false; } -done: - return rq->tag != -1; + tag = __sbitmap_queue_get(bt); + if (tag == BLK_MQ_NO_TAG) + return false; + + rq->tag = tag + tag_offset; + return true; +} + +static bool blk_mq_get_driver_tag(struct request *rq) +{ + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; + + if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_get_driver_tag(rq)) + return false; + + if ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) && + !(rq->rq_flags & RQF_MQ_INFLIGHT)) { + rq->rq_flags |= RQF_MQ_INFLIGHT; + __blk_mq_inc_active_requests(hctx); + } + hctx->tags->rqs[rq->tag] = rq; + return true; } static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, @@ -1012,7 +1132,13 @@ hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait); spin_lock(&hctx->dispatch_wait_lock); - list_del_init(&wait->entry); + if (!list_empty(&wait->entry)) { + struct sbitmap_queue *sbq; + + list_del_init(&wait->entry); + sbq = 
hctx->tags->bitmap_tags; + atomic_dec(&sbq->ws_active); + } spin_unlock(&hctx->dispatch_wait_lock); blk_mq_run_hw_queue(hctx, true); @@ -1028,13 +1154,13 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, struct request *rq) { + struct sbitmap_queue *sbq = hctx->tags->bitmap_tags; struct wait_queue_head *wq; wait_queue_entry_t *wait; bool ret; - if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) { - if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) - set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); + if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) { + blk_mq_sched_mark_restart_hctx(hctx); /* * It's possible that a tag was freed in the window between the @@ -1051,7 +1177,7 @@ if (!list_empty_careful(&wait->entry)) return false; - wq = &bt_wait_ptr(&hctx->tags->bitmap_tags, hctx)->wait; + wq = &bt_wait_ptr(sbq, hctx)->wait; spin_lock_irq(&wq->lock); spin_lock(&hctx->dispatch_wait_lock); @@ -1061,6 +1187,7 @@ return false; } + atomic_inc(&sbq->ws_active); wait->flags &= ~WQ_FLAG_EXCLUSIVE; __add_wait_queue(wq, wait); @@ -1081,6 +1208,7 @@ * someone else gets the wakeup. */ list_del_init(&wait->entry); + atomic_dec(&sbq->ws_active); spin_unlock(&hctx->dispatch_wait_lock); spin_unlock_irq(&wq->lock); @@ -1099,9 +1227,6 @@ static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy) { unsigned int ewma; - - if (hctx->queue->elevator) - return; ewma = hctx->dispatch_busy; @@ -1135,22 +1260,83 @@ __blk_mq_requeue_request(rq); } +static void blk_mq_handle_zone_resource(struct request *rq, + struct list_head *zone_list) +{ + /* + * If we end up here it is because we cannot dispatch a request to a + * specific zone due to LLD level zone-write locking or other zone + * related resource not being available. In this case, set the request + * aside in zone_list for retrying it later. + */ + list_add(&rq->queuelist, zone_list); + __blk_mq_requeue_request(rq); +} + +enum prep_dispatch { + PREP_DISPATCH_OK, + PREP_DISPATCH_NO_TAG, + PREP_DISPATCH_NO_BUDGET, +}; + +static enum prep_dispatch blk_mq_prep_dispatch_rq(struct request *rq, + bool need_budget) +{ + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; + + if (need_budget && !blk_mq_get_dispatch_budget(rq->q)) { + blk_mq_put_driver_tag(rq); + return PREP_DISPATCH_NO_BUDGET; + } + + if (!blk_mq_get_driver_tag(rq)) { + /* + * The initial allocation attempt failed, so we need to + * rerun the hardware queue when a tag is freed. The + * waitqueue takes care of that. If the queue is run + * before we add this entry back on the dispatch list, + * we'll re-run it below. + */ + if (!blk_mq_mark_tag_wait(hctx, rq)) { + /* + * All budgets not got from this function will be put + * together during handling partial dispatch + */ + if (need_budget) + blk_mq_put_dispatch_budget(rq->q); + return PREP_DISPATCH_NO_TAG; + } + } + + return PREP_DISPATCH_OK; +} + +/* release all allocated budgets before calling to blk_mq_dispatch_rq_list */ +static void blk_mq_release_budgets(struct request_queue *q, + unsigned int nr_budgets) +{ + int i; + + for (i = 0; i < nr_budgets; i++) + blk_mq_put_dispatch_budget(q); +} + /* * Returns true if we did some work AND can potentially do more. 
*/ -bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, - bool got_budget) +bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, + unsigned int nr_budgets) { - struct blk_mq_hw_ctx *hctx; + enum prep_dispatch prep; + struct request_queue *q = hctx->queue; struct request *rq, *nxt; - bool no_tag = false; int errors, queued; blk_status_t ret = BLK_STS_OK; + LIST_HEAD(zone_list); + bool needs_resource = false; if (list_empty(list)) return false; - - WARN_ON(!list_is_singular(list) && got_budget); /* * Now process all the entries, sending them to the driver. @@ -1161,29 +1347,10 @@ rq = list_first_entry(list, struct request, queuelist); - hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu); - if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) + WARN_ON_ONCE(hctx != rq->mq_hctx); + prep = blk_mq_prep_dispatch_rq(rq, !nr_budgets); + if (prep != PREP_DISPATCH_OK) break; - - if (!blk_mq_get_driver_tag(rq)) { - /* - * The initial allocation attempt failed, so we need to - * rerun the hardware queue when a tag is freed. The - * waitqueue takes care of that. If the queue is run - * before we add this entry back on the dispatch list, - * we'll re-run it below. - */ - if (!blk_mq_mark_tag_wait(hctx, rq)) { - blk_mq_put_dispatch_budget(hctx); - /* - * For non-shared tags, the RESTART check - * will suffice. - */ - if (hctx->flags & BLK_MQ_F_TAG_SHARED) - no_tag = true; - break; - } - } list_del_init(&rq->queuelist); @@ -1200,32 +1367,63 @@ bd.last = !blk_mq_get_driver_tag(nxt); } + /* + * once the request is queued to lld, no need to cover the + * budget any more + */ + if (nr_budgets) + nr_budgets--; ret = q->mq_ops->queue_rq(hctx, &bd); - if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) { - blk_mq_handle_dev_resource(rq, list); + switch (ret) { + case BLK_STS_OK: + queued++; break; - } - - if (unlikely(ret != BLK_STS_OK)) { + case BLK_STS_RESOURCE: + needs_resource = true; + fallthrough; + case BLK_STS_DEV_RESOURCE: + blk_mq_handle_dev_resource(rq, list); + goto out; + case BLK_STS_ZONE_RESOURCE: + /* + * Move the request to zone_list and keep going through + * the dispatch list to find more requests the drive can + * accept. + */ + blk_mq_handle_zone_resource(rq, &zone_list); + needs_resource = true; + break; + default: errors++; blk_mq_end_request(rq, BLK_STS_IOERR); - continue; } - - queued++; } while (!list_empty(list)); +out: + if (!list_empty(&zone_list)) + list_splice_tail_init(&zone_list, list); hctx->dispatched[queued_to_index(queued)]++; + /* If we didn't flush the entire list, we could have told the driver + * there was more coming, but that turned out to be a lie. + */ + if ((!list_empty(list) || errors || needs_resource || + ret == BLK_STS_DEV_RESOURCE) && q->mq_ops->commit_rqs && queued) + q->mq_ops->commit_rqs(hctx); /* * Any items that need requeuing? Stuff them into hctx->dispatch, * that is where we will continue on next queue run. */ if (!list_empty(list)) { bool needs_restart; + /* For non-shared tags, the RESTART check will suffice */ + bool no_tag = prep == PREP_DISPATCH_NO_TAG && + (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED); + + blk_mq_release_budgets(q, nr_budgets); spin_lock(&hctx->lock); - list_splice_init(list, &hctx->dispatch); + list_splice_tail_init(list, &hctx->dispatch); spin_unlock(&hctx->lock); /* @@ -1259,13 +1457,17 @@ * * If driver returns BLK_STS_RESOURCE and SCHED_RESTART * bit is set, run queue after a delay to avoid IO stalls - * that could otherwise occur if the queue is idle. 
+ * that could otherwise occur if the queue is idle. We'll do + * similar if we couldn't get budget or couldn't lock a zone + * and SCHED_RESTART is set. */ needs_restart = blk_mq_sched_needs_restart(hctx); + if (prep == PREP_DISPATCH_NO_BUDGET) + needs_resource = true; if (!needs_restart || (no_tag && list_empty_careful(&hctx->dispatch_wait.entry))) blk_mq_run_hw_queue(hctx, true); - else if (needs_restart && (ret == BLK_STS_RESOURCE)) + else if (needs_restart && needs_resource) blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY); blk_mq_update_dispatch_busy(hctx, true); @@ -1273,16 +1475,15 @@ } else blk_mq_update_dispatch_busy(hctx, false); - /* - * If the host/device is unable to accept more work, inform the - * caller of that. - */ - if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) - return false; - return (queued + errors) != 0; } +/** + * __blk_mq_run_hw_queue - Run a hardware queue. + * @hctx: Pointer to the hardware queue to run. + * + * Send pending requests to the hardware. + */ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) { int srcu_idx; @@ -1380,6 +1581,15 @@ return next_cpu; } +/** + * __blk_mq_delay_run_hw_queue - Run (or schedule to run) a hardware queue. + * @hctx: Pointer to the hardware queue to run. + * @async: If we want to run the queue asynchronously. + * @msecs: Microseconds of delay to wait before running the queue. + * + * If !@async, try to run the queue now. Else, run the queue asynchronously and + * with a delay of @msecs. + */ static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async, unsigned long msecs) { @@ -1401,13 +1611,29 @@ msecs_to_jiffies(msecs)); } +/** + * blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously. + * @hctx: Pointer to the hardware queue to run. + * @msecs: Microseconds of delay to wait before running the queue. + * + * Run a hardware queue asynchronously with a delay of @msecs. + */ void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) { __blk_mq_delay_run_hw_queue(hctx, true, msecs); } EXPORT_SYMBOL(blk_mq_delay_run_hw_queue); -bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) +/** + * blk_mq_run_hw_queue - Start to run a hardware queue. + * @hctx: Pointer to the hardware queue to run. + * @async: If we want to run the queue asynchronously. + * + * Check if the request queue is not in a quiesced state and if there are + * pending requests to be sent. If this is true, run the queue to send requests + * to hardware. + */ +void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) { int srcu_idx; bool need_run; @@ -1425,28 +1651,101 @@ blk_mq_hctx_has_pending(hctx); hctx_unlock(hctx, srcu_idx); - if (need_run) { + if (need_run) __blk_mq_delay_run_hw_queue(hctx, async, 0); - return true; - } - - return false; } EXPORT_SYMBOL(blk_mq_run_hw_queue); +/* + * Is the request queue handled by an IO scheduler that does not respect + * hardware queues when dispatching? + */ +static bool blk_mq_has_sqsched(struct request_queue *q) +{ + struct elevator_queue *e = q->elevator; + + if (e && e->type->ops.dispatch_request && + !(e->type->elevator_features & ELEVATOR_F_MQ_AWARE)) + return true; + return false; +} + +/* + * Return prefered queue to dispatch from (if any) for non-mq aware IO + * scheduler. 
+ */ +static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q) +{ + struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); + /* + * If the IO scheduler does not respect hardware queues when + * dispatching, we just don't bother with multiple HW queues and + * dispatch from hctx for the current CPU since running multiple queues + * just causes lock contention inside the scheduler and pointless cache + * bouncing. + */ + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, 0, ctx); + + if (!blk_mq_hctx_stopped(hctx)) + return hctx; + return NULL; +} + +/** + * blk_mq_run_hw_queues - Run all hardware queues in a request queue. + * @q: Pointer to the request queue to run. + * @async: If we want to run the queue asynchronously. + */ void blk_mq_run_hw_queues(struct request_queue *q, bool async) { - struct blk_mq_hw_ctx *hctx; + struct blk_mq_hw_ctx *hctx, *sq_hctx; int i; + sq_hctx = NULL; + if (blk_mq_has_sqsched(q)) + sq_hctx = blk_mq_get_sq_hctx(q); queue_for_each_hw_ctx(q, hctx, i) { if (blk_mq_hctx_stopped(hctx)) continue; - - blk_mq_run_hw_queue(hctx, async); + /* + * Dispatch from this hctx either if there's no hctx preferred + * by IO scheduler or if it has requests that bypass the + * scheduler. + */ + if (!sq_hctx || sq_hctx == hctx || + !list_empty_careful(&hctx->dispatch)) + blk_mq_run_hw_queue(hctx, async); } } EXPORT_SYMBOL(blk_mq_run_hw_queues); + +/** + * blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously. + * @q: Pointer to the request queue to run. + * @msecs: Microseconds of delay to wait before running the queues. + */ +void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs) +{ + struct blk_mq_hw_ctx *hctx, *sq_hctx; + int i; + + sq_hctx = NULL; + if (blk_mq_has_sqsched(q)) + sq_hctx = blk_mq_get_sq_hctx(q); + queue_for_each_hw_ctx(q, hctx, i) { + if (blk_mq_hctx_stopped(hctx)) + continue; + /* + * Dispatch from this hctx either if there's no hctx preferred + * by IO scheduler or if it has requests that bypass the + * scheduler. + */ + if (!sq_hctx || sq_hctx == hctx || + !list_empty_careful(&hctx->dispatch)) + blk_mq_delay_run_hw_queue(hctx, msecs); + } +} +EXPORT_SYMBOL(blk_mq_delay_run_hw_queues); /** * blk_mq_queue_stopped() - check whether one or more hctxs have been stopped @@ -1551,7 +1850,7 @@ /* * If we are stopped, don't run the queue. */ - if (test_bit(BLK_MQ_S_STOPPED, &hctx->state)) + if (blk_mq_hctx_stopped(hctx)) return; __blk_mq_run_hw_queue(hctx); @@ -1562,15 +1861,16 @@ bool at_head) { struct blk_mq_ctx *ctx = rq->mq_ctx; + enum hctx_type type = hctx->type; lockdep_assert_held(&ctx->lock); trace_block_rq_insert(hctx->queue, rq); if (at_head) - list_add(&rq->queuelist, &ctx->rq_list); + list_add(&rq->queuelist, &ctx->rq_lists[type]); else - list_add_tail(&rq->queuelist, &ctx->rq_list); + list_add_tail(&rq->queuelist, &ctx->rq_lists[type]); } void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, @@ -1584,17 +1884,25 @@ blk_mq_hctx_mark_pending(hctx, ctx); } -/* +/** + * blk_mq_request_bypass_insert - Insert a request at dispatch list. + * @rq: Pointer to request to be inserted. + * @at_head: true if the request should be inserted at the head of the list. + * @run_queue: If we should run the hardware queue after inserting the request. + * * Should only be used carefully, when the caller knows we want to * bypass a potential IO scheduler on the target device. 
*/ -void blk_mq_request_bypass_insert(struct request *rq, bool run_queue) +void blk_mq_request_bypass_insert(struct request *rq, bool at_head, + bool run_queue) { - struct blk_mq_ctx *ctx = rq->mq_ctx; - struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu); + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; spin_lock(&hctx->lock); - list_add_tail(&rq->queuelist, &hctx->dispatch); + if (at_head) + list_add(&rq->queuelist, &hctx->dispatch); + else + list_add_tail(&rq->queuelist, &hctx->dispatch); spin_unlock(&hctx->lock); if (run_queue) @@ -1606,6 +1914,7 @@ { struct request *rq; + enum hctx_type type = hctx->type; /* * preemption doesn't flush plug list, so it's possible ctx->cpu is @@ -1617,95 +1926,87 @@ } spin_lock(&ctx->lock); - list_splice_tail_init(list, &ctx->rq_list); + list_splice_tail_init(list, &ctx->rq_lists[type]); blk_mq_hctx_mark_pending(hctx, ctx); spin_unlock(&ctx->lock); } -static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b) +static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b) { struct request *rqa = container_of(a, struct request, queuelist); struct request *rqb = container_of(b, struct request, queuelist); - return !(rqa->mq_ctx < rqb->mq_ctx || - (rqa->mq_ctx == rqb->mq_ctx && - blk_rq_pos(rqa) < blk_rq_pos(rqb))); + if (rqa->mq_ctx != rqb->mq_ctx) + return rqa->mq_ctx > rqb->mq_ctx; + if (rqa->mq_hctx != rqb->mq_hctx) + return rqa->mq_hctx > rqb->mq_hctx; + + return blk_rq_pos(rqa) > blk_rq_pos(rqb); } void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) { - struct blk_mq_ctx *this_ctx; - struct request_queue *this_q; - struct request *rq; LIST_HEAD(list); - LIST_HEAD(ctx_list); - unsigned int depth; + if (list_empty(&plug->mq_list)) + return; list_splice_init(&plug->mq_list, &list); - list_sort(NULL, &list, plug_ctx_cmp); + if (plug->rq_count > 2 && plug->multiple_queues) + list_sort(NULL, &list, plug_rq_cmp); - this_q = NULL; - this_ctx = NULL; - depth = 0; + plug->rq_count = 0; - while (!list_empty(&list)) { - rq = list_entry_rq(list.next); - list_del_init(&rq->queuelist); - BUG_ON(!rq->q); - if (rq->mq_ctx != this_ctx) { - if (this_ctx) { - trace_block_unplug(this_q, depth, !from_schedule); - blk_mq_sched_insert_requests(this_q, this_ctx, - &ctx_list, - from_schedule); - } + do { + struct list_head rq_list; + struct request *rq, *head_rq = list_entry_rq(list.next); + struct list_head *pos = &head_rq->queuelist; /* skip first */ + struct blk_mq_hw_ctx *this_hctx = head_rq->mq_hctx; + struct blk_mq_ctx *this_ctx = head_rq->mq_ctx; + unsigned int depth = 1; - this_ctx = rq->mq_ctx; - this_q = rq->q; - depth = 0; + list_for_each_continue(pos, &list) { + rq = list_entry_rq(pos); + BUG_ON(!rq->q); + if (rq->mq_hctx != this_hctx || rq->mq_ctx != this_ctx) + break; + depth++; } - depth++; - list_add_tail(&rq->queuelist, &ctx_list); - } - - /* - * If 'this_ctx' is set, we know we have entries to complete - * on 'ctx_list'. Do those. 
- */ - if (this_ctx) { - trace_block_unplug(this_q, depth, !from_schedule); - blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list, + list_cut_before(&rq_list, &list, pos); + trace_block_unplug(head_rq->q, depth, !from_schedule); + blk_mq_sched_insert_requests(this_hctx, this_ctx, &rq_list, from_schedule); - } + } while(!list_empty(&list)); } -static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) +static void blk_mq_bio_to_request(struct request *rq, struct bio *bio, + unsigned int nr_segs) { - blk_init_request_from_bio(rq, bio); + int err; - blk_rq_set_rl(rq, blk_get_rl(rq->q, bio)); + if (bio->bi_opf & REQ_RAHEAD) + rq->cmd_flags |= REQ_FAILFAST_MASK; - blk_account_io_start(rq, true); -} + rq->__sector = bio->bi_iter.bi_sector; + rq->write_hint = bio->bi_write_hint; + blk_rq_bio_prep(rq, bio, nr_segs); -static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq) -{ - if (rq->tag != -1) - return blk_tag_to_qc_t(rq->tag, hctx->queue_num, false); + /* This can't fail, since GFP_NOIO includes __GFP_DIRECT_RECLAIM. */ + err = blk_crypto_rq_bio_prep(rq, bio, GFP_NOIO); + WARN_ON_ONCE(err); - return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true); + blk_account_io_start(rq); } static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, struct request *rq, - blk_qc_t *cookie) + blk_qc_t *cookie, bool last) { struct request_queue *q = rq->q; struct blk_mq_queue_data bd = { .rq = rq, - .last = true, + .last = last, }; blk_qc_t new_cookie; blk_status_t ret; @@ -1740,7 +2041,7 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, struct request *rq, blk_qc_t *cookie, - bool bypass_insert) + bool bypass_insert, bool last) { struct request_queue *q = rq->q; bool run_queue = true; @@ -1761,23 +2062,35 @@ if (q->elevator && !bypass_insert) goto insert; - if (!blk_mq_get_dispatch_budget(hctx)) + if (!blk_mq_get_dispatch_budget(q)) goto insert; if (!blk_mq_get_driver_tag(rq)) { - blk_mq_put_dispatch_budget(hctx); + blk_mq_put_dispatch_budget(q); goto insert; } - return __blk_mq_issue_directly(hctx, rq, cookie); + return __blk_mq_issue_directly(hctx, rq, cookie, last); insert: if (bypass_insert) return BLK_STS_RESOURCE; - blk_mq_request_bypass_insert(rq, run_queue); + blk_mq_sched_insert_request(rq, false, run_queue, false); + return BLK_STS_OK; } +/** + * blk_mq_try_issue_directly - Try to send a request directly to device driver. + * @hctx: Pointer of the associated hardware queue. + * @rq: Pointer to request to be sent. + * @cookie: Request queue cookie. + * + * If the device has enough resources to accept a new request now, send the + * request directly to device driver. Else, insert at hctx->dispatch queue, so + * we can try send it another time in the future. Requests inserted at this + * queue have higher priority. 
+ */ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, struct request *rq, blk_qc_t *cookie) { @@ -1788,25 +2101,24 @@ hctx_lock(hctx, &srcu_idx); - ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false); + ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false, true); if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) - blk_mq_request_bypass_insert(rq, true); + blk_mq_request_bypass_insert(rq, false, true); else if (ret != BLK_STS_OK) blk_mq_end_request(rq, ret); hctx_unlock(hctx, srcu_idx); } -blk_status_t blk_mq_request_issue_directly(struct request *rq) +blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last) { blk_status_t ret; int srcu_idx; blk_qc_t unused_cookie; - struct blk_mq_ctx *ctx = rq->mq_ctx; - struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu); + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; hctx_lock(hctx, &srcu_idx); - ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true); + ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true, last); hctx_unlock(hctx, srcu_idx); return ret; @@ -1815,104 +2127,169 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, struct list_head *list) { + int queued = 0; + int errors = 0; + while (!list_empty(list)) { blk_status_t ret; struct request *rq = list_first_entry(list, struct request, queuelist); list_del_init(&rq->queuelist); - ret = blk_mq_request_issue_directly(rq); + ret = blk_mq_request_issue_directly(rq, list_empty(list)); if (ret != BLK_STS_OK) { + errors++; if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) { - blk_mq_request_bypass_insert(rq, + blk_mq_request_bypass_insert(rq, false, list_empty(list)); break; } blk_mq_end_request(rq, ret); - } + } else + queued++; + } + + /* + * If we didn't flush the entire list, we could have told + * the driver there was more coming, but that turned out to + * be a lie. + */ + if ((!list_empty(list) || errors) && + hctx->queue->mq_ops->commit_rqs && queued) + hctx->queue->mq_ops->commit_rqs(hctx); +} + +static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) +{ + list_add_tail(&rq->queuelist, &plug->mq_list); + plug->rq_count++; + if (!plug->multiple_queues && !list_is_singular(&plug->mq_list)) { + struct request *tmp; + + tmp = list_first_entry(&plug->mq_list, struct request, + queuelist); + if (tmp->q != rq->q) + plug->multiple_queues = true; } } -static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) +/* + * Allow 2x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple + * queues. This is important for md arrays to benefit from merging + * requests. + */ +static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug) { + if (plug->multiple_queues) + return BLK_MAX_REQUEST_COUNT * 2; + return BLK_MAX_REQUEST_COUNT; +} + +/** + * blk_mq_submit_bio - Create and send a request to block device. + * @bio: Bio pointer. + * + * Builds up a request structure from @q and @bio and send to the device. The + * request may not be queued directly to hardware if: + * * This request can be merged with another one + * * We want to place request at plug queue for possible future merging + * * There is an IO scheduler active at this queue + * + * It will not queue the request if there is an error with the bio, or at the + * request creation. + * + * Returns: Request queue cookie. 
+ */ +blk_qc_t blk_mq_submit_bio(struct bio *bio) +{ + struct request_queue *q = bio->bi_disk->queue; const int is_sync = op_is_sync(bio->bi_opf); const int is_flush_fua = op_is_flush(bio->bi_opf); - struct blk_mq_alloc_data data = { .flags = 0 }; + struct blk_mq_alloc_data data = { + .q = q, + }; struct request *rq; - unsigned int request_count = 0; struct blk_plug *plug; struct request *same_queue_rq = NULL; + unsigned int nr_segs; blk_qc_t cookie; + blk_status_t ret; blk_queue_bounce(q, &bio); - - blk_queue_split(q, &bio); + __blk_queue_split(&bio, &nr_segs); if (!bio_integrity_prep(bio)) - return BLK_QC_T_NONE; + goto queue_exit; if (!is_flush_fua && !blk_queue_nomerges(q) && - blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq)) - return BLK_QC_T_NONE; + blk_attempt_plug_merge(q, bio, nr_segs, &same_queue_rq)) + goto queue_exit; - if (blk_mq_sched_bio_merge(q, bio)) - return BLK_QC_T_NONE; + if (blk_mq_sched_bio_merge(q, bio, nr_segs)) + goto queue_exit; - rq_qos_throttle(q, bio, NULL); + rq_qos_throttle(q, bio); - trace_block_getrq(q, bio, bio->bi_opf); - - rq = blk_mq_get_request(q, bio, bio->bi_opf, &data); + data.cmd_flags = bio->bi_opf; + rq = __blk_mq_alloc_request(&data); if (unlikely(!rq)) { rq_qos_cleanup(q, bio); if (bio->bi_opf & REQ_NOWAIT) bio_wouldblock_error(bio); - return BLK_QC_T_NONE; + goto queue_exit; } + + trace_block_getrq(q, bio, bio->bi_opf); rq_qos_track(q, rq, bio); cookie = request_to_qc_t(data.hctx, rq); - plug = current->plug; - if (unlikely(is_flush_fua)) { - blk_mq_put_ctx(data.ctx); - blk_mq_bio_to_request(rq, bio); + blk_mq_bio_to_request(rq, bio, nr_segs); - /* bypass scheduler for flush rq */ + ret = blk_crypto_rq_get_keyslot(rq); + if (ret != BLK_STS_OK) { + bio->bi_status = ret; + bio_endio(bio); + blk_mq_free_request(rq); + return BLK_QC_T_NONE; + } + + plug = blk_mq_plug(q, bio); + if (unlikely(is_flush_fua)) { + /* Bypass scheduler for flush requests */ blk_insert_flush(rq); blk_mq_run_hw_queue(data.hctx, true); - } else if (plug && q->nr_hw_queues == 1) { - struct request *last = NULL; - - blk_mq_put_ctx(data.ctx); - blk_mq_bio_to_request(rq, bio); - + } else if (plug && (q->nr_hw_queues == 1 || + blk_mq_is_sbitmap_shared(rq->mq_hctx->flags) || + q->mq_ops->commit_rqs || !blk_queue_nonrot(q))) { /* - * @request_count may become stale because of schedule - * out, so check the list again. + * Use plugging if we have a ->commit_rqs() hook as well, as + * we know the driver uses bd->last in a smart fashion. + * + * Use normal plugging if this disk is slow HDD, as sequential + * IO may benefit a lot from plug merging. */ - if (list_empty(&plug->mq_list)) - request_count = 0; - else if (blk_queue_nomerges(q)) - request_count = blk_plug_queued_count(q); + unsigned int request_count = plug->rq_count; + struct request *last = NULL; if (!request_count) trace_block_plug(q); else last = list_entry_rq(plug->mq_list.prev); - if (request_count >= BLK_MAX_REQUEST_COUNT || (last && + if (request_count >= blk_plug_max_rq_count(plug) || (last && blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) { blk_flush_plug_list(plug, false); trace_block_plug(q); } - list_add_tail(&rq->queuelist, &plug->mq_list); + blk_add_rq_to_plug(plug, rq); + } else if (q->elevator) { + /* Insert the request at the IO scheduler queue */ + blk_mq_sched_insert_request(rq, false, true, true); } else if (plug && !blk_queue_nomerges(q)) { - blk_mq_bio_to_request(rq, bio); - /* * We do limited plugging. If the bio can be merged, do that. 
* Otherwise the existing request in the plug list will be @@ -1922,30 +2299,74 @@ */ if (list_empty(&plug->mq_list)) same_queue_rq = NULL; - if (same_queue_rq) + if (same_queue_rq) { list_del_init(&same_queue_rq->queuelist); - list_add_tail(&rq->queuelist, &plug->mq_list); - - blk_mq_put_ctx(data.ctx); + plug->rq_count--; + } + blk_add_rq_to_plug(plug, rq); + trace_block_plug(q); if (same_queue_rq) { - data.hctx = blk_mq_map_queue(q, - same_queue_rq->mq_ctx->cpu); + data.hctx = same_queue_rq->mq_hctx; + trace_block_unplug(q, 1, true); blk_mq_try_issue_directly(data.hctx, same_queue_rq, &cookie); } - } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator && - !data.hctx->dispatch_busy)) { - blk_mq_put_ctx(data.ctx); - blk_mq_bio_to_request(rq, bio); + } else if ((q->nr_hw_queues > 1 && is_sync) || + !data.hctx->dispatch_busy) { + /* + * There is no scheduler and we can try to send directly + * to the hardware. + */ blk_mq_try_issue_directly(data.hctx, rq, &cookie); } else { - blk_mq_put_ctx(data.ctx); - blk_mq_bio_to_request(rq, bio); + /* Default case. */ blk_mq_sched_insert_request(rq, false, true, true); } return cookie; +queue_exit: + blk_queue_exit(q); + return BLK_QC_T_NONE; +} + +static size_t order_to_size(unsigned int order) +{ + return (size_t)PAGE_SIZE << order; +} + +/* called before freeing request pool in @tags */ +static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set, + struct blk_mq_tags *tags, unsigned int hctx_idx) +{ + struct blk_mq_tags *drv_tags = set->tags[hctx_idx]; + struct page *page; + unsigned long flags; + + list_for_each_entry(page, &tags->page_list, lru) { + unsigned long start = (unsigned long)page_address(page); + unsigned long end = start + order_to_size(page->private); + int i; + + for (i = 0; i < set->queue_depth; i++) { + struct request *rq = drv_tags->rqs[i]; + unsigned long rq_addr = (unsigned long)rq; + + if (rq_addr >= start && rq_addr < end) { + WARN_ON_ONCE(refcount_read(&rq->ref) != 0); + cmpxchg(&drv_tags->rqs[i], rq, NULL); + } + } + } + + /* + * Wait until all pending iteration is done. + * + * Request reference is cleared and it is guaranteed to be observed + * after the ->lock is released. + */ + spin_lock_irqsave(&drv_tags->lock, flags); + spin_unlock_irqrestore(&drv_tags->lock, flags); } void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, @@ -1966,42 +2387,44 @@ } } + blk_mq_clear_rq_mapping(set, tags, hctx_idx); + while (!list_empty(&tags->page_list)) { page = list_first_entry(&tags->page_list, struct page, lru); list_del_init(&page->lru); /* * Remove kmemleak object previously allocated in - * blk_mq_init_rq_map(). + * blk_mq_alloc_rqs(). 
*/ kmemleak_free(page_address(page)); __free_pages(page, page->private); } } -void blk_mq_free_rq_map(struct blk_mq_tags *tags) +void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags) { kfree(tags->rqs); tags->rqs = NULL; kfree(tags->static_rqs); tags->static_rqs = NULL; - blk_mq_free_tags(tags); + blk_mq_free_tags(tags, flags); } struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, unsigned int hctx_idx, unsigned int nr_tags, - unsigned int reserved_tags) + unsigned int reserved_tags, + unsigned int flags) { struct blk_mq_tags *tags; int node; - node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx); + node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx); if (node == NUMA_NO_NODE) node = set->numa_node; - tags = blk_mq_init_tags(nr_tags, reserved_tags, node, - BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags)); + tags = blk_mq_init_tags(nr_tags, reserved_tags, node, flags); if (!tags) return NULL; @@ -2009,7 +2432,7 @@ GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node); if (!tags->rqs) { - blk_mq_free_tags(tags); + blk_mq_free_tags(tags, flags); return NULL; } @@ -2018,16 +2441,11 @@ node); if (!tags->static_rqs) { kfree(tags->rqs); - blk_mq_free_tags(tags); + blk_mq_free_tags(tags, flags); return NULL; } return tags; -} - -static size_t order_to_size(unsigned int order) -{ - return (size_t)PAGE_SIZE << order; } static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq, @@ -2052,7 +2470,7 @@ size_t rq_size, left; int node; - node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx); + node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx); if (node == NUMA_NO_NODE) node = set->numa_node; @@ -2064,6 +2482,7 @@ */ rq_size = round_up(sizeof(struct request) + set->cmd_size, cache_line_size()); + trace_android_vh_blk_alloc_rqs(&rq_size, set, tags); left = rq_size * depth; for (i = 0; i < depth; ) { @@ -2122,6 +2541,86 @@ return -ENOMEM; } +struct rq_iter_data { + struct blk_mq_hw_ctx *hctx; + bool has_rq; +}; + +static bool blk_mq_has_request(struct request *rq, void *data, bool reserved) +{ + struct rq_iter_data *iter_data = data; + + if (rq->mq_hctx != iter_data->hctx) + return true; + iter_data->has_rq = true; + return false; +} + +static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx) +{ + struct blk_mq_tags *tags = hctx->sched_tags ? + hctx->sched_tags : hctx->tags; + struct rq_iter_data data = { + .hctx = hctx, + }; + + blk_mq_all_tag_iter(tags, blk_mq_has_request, &data); + return data.has_rq; +} + +static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu, + struct blk_mq_hw_ctx *hctx) +{ + if (cpumask_next_and(-1, hctx->cpumask, cpu_online_mask) != cpu) + return false; + if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids) + return false; + return true; +} + +static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node) +{ + struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node, + struct blk_mq_hw_ctx, cpuhp_online); + + if (!cpumask_test_cpu(cpu, hctx->cpumask) || + !blk_mq_last_cpu_in_hctx(cpu, hctx)) + return 0; + + /* + * Prevent new request from being allocated on the current hctx. + * + * The smp_mb__after_atomic() Pairs with the implied barrier in + * test_and_set_bit_lock in sbitmap_get(). Ensures the inactive flag is + * seen once we return from the tag allocator. + */ + set_bit(BLK_MQ_S_INACTIVE, &hctx->state); + smp_mb__after_atomic(); + + /* + * Try to grab a reference to the queue and wait for any outstanding + * requests. 
If we could not grab a reference the queue has been + * frozen and there are no requests. + */ + if (percpu_ref_tryget(&hctx->queue->q_usage_counter)) { + while (blk_mq_hctx_has_requests(hctx)) + msleep(5); + percpu_ref_put(&hctx->queue->q_usage_counter); + } + + return 0; +} + +static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node) +{ + struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node, + struct blk_mq_hw_ctx, cpuhp_online); + + if (cpumask_test_cpu(cpu, hctx->cpumask)) + clear_bit(BLK_MQ_S_INACTIVE, &hctx->state); + return 0; +} + /* * 'cpu' is going away. splice any existing rq_list entries from this * software queue to the hw queue dispatch list, and ensure that it @@ -2132,13 +2631,18 @@ struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; LIST_HEAD(tmp); + enum hctx_type type; hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead); + if (!cpumask_test_cpu(cpu, hctx->cpumask)) + return 0; + ctx = __blk_mq_get_ctx(hctx->queue, cpu); + type = hctx->type; spin_lock(&ctx->lock); - if (!list_empty(&ctx->rq_list)) { - list_splice_init(&ctx->rq_list, &tmp); + if (!list_empty(&ctx->rq_lists[type])) { + list_splice_init(&ctx->rq_lists[type], &tmp); blk_mq_hctx_clear_pending(hctx, ctx); } spin_unlock(&ctx->lock); @@ -2156,8 +2660,40 @@ static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx) { + if (!(hctx->flags & BLK_MQ_F_STACKING)) + cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE, + &hctx->cpuhp_online); cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead); +} + +/* + * Before freeing hw queue, clearing the flush request reference in + * tags->rqs[] for avoiding potential UAF. + */ +static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags, + unsigned int queue_depth, struct request *flush_rq) +{ + int i; + unsigned long flags; + + /* The hw queue may not be mapped yet */ + if (!tags) + return; + + WARN_ON_ONCE(refcount_read(&flush_rq->ref) != 0); + + for (i = 0; i < queue_depth; i++) + cmpxchg(&tags->rqs[i], flush_rq, NULL); + + /* + * Wait until all pending iteration is done. + * + * Request reference is cleared and it is guaranteed to be observed + * after the ->lock is released. 
+ */ + spin_lock_irqsave(&tags->lock, flags); + spin_unlock_irqrestore(&tags->lock, flags); } /* hctx->ctxs will be freed in queue's release handler */ @@ -2165,18 +2701,24 @@ struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) { - blk_mq_debugfs_unregister_hctx(hctx); + struct request *flush_rq = hctx->fq->flush_rq; if (blk_mq_hw_queue_mapped(hctx)) blk_mq_tag_idle(hctx); + blk_mq_clear_flush_rq_mapping(set->tags[hctx_idx], + set->queue_depth, flush_rq); if (set->ops->exit_request) - set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx); + set->ops->exit_request(set, flush_rq, hctx_idx); if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); blk_mq_remove_cpuhp(hctx); + + spin_lock(&q->unused_hctx_lock); + list_add(&hctx->hctx_list, &q->unused_hctx_list); + spin_unlock(&q->unused_hctx_lock); } static void blk_mq_exit_hw_queues(struct request_queue *q, @@ -2188,112 +2730,160 @@ queue_for_each_hw_ctx(q, hctx, i) { if (i == nr_queue) break; + blk_mq_debugfs_unregister_hctx(hctx); blk_mq_exit_hctx(q, set, hctx, i); } +} + +static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set) +{ + int hw_ctx_size = sizeof(struct blk_mq_hw_ctx); + + BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu), + __alignof__(struct blk_mq_hw_ctx)) != + sizeof(struct blk_mq_hw_ctx)); + + if (tag_set->flags & BLK_MQ_F_BLOCKING) + hw_ctx_size += sizeof(struct srcu_struct); + + return hw_ctx_size; } static int blk_mq_init_hctx(struct request_queue *q, struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx, unsigned hctx_idx) { - int node; + hctx->queue_num = hctx_idx; - node = hctx->numa_node; + if (!(hctx->flags & BLK_MQ_F_STACKING)) + cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE, + &hctx->cpuhp_online); + cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead); + + hctx->tags = set->tags[hctx_idx]; + + if (set->ops->init_hctx && + set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) + goto unregister_cpu_notifier; + + if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, + hctx->numa_node)) + goto exit_hctx; + return 0; + + exit_hctx: + if (set->ops->exit_hctx) + set->ops->exit_hctx(hctx, hctx_idx); + unregister_cpu_notifier: + blk_mq_remove_cpuhp(hctx); + return -1; +} + +static struct blk_mq_hw_ctx * +blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set, + int node) +{ + struct blk_mq_hw_ctx *hctx; + gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY; + + hctx = kzalloc_node(blk_mq_hw_ctx_size(set), gfp, node); + if (!hctx) + goto fail_alloc_hctx; + + if (!zalloc_cpumask_var_node(&hctx->cpumask, gfp, node)) + goto free_hctx; + + atomic_set(&hctx->nr_active, 0); if (node == NUMA_NO_NODE) - node = hctx->numa_node = set->numa_node; + node = set->numa_node; + hctx->numa_node = node; INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn); spin_lock_init(&hctx->lock); INIT_LIST_HEAD(&hctx->dispatch); hctx->queue = q; - hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED; + hctx->flags = set->flags & ~BLK_MQ_F_TAG_QUEUE_SHARED; - cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead); - - hctx->tags = set->tags[hctx_idx]; + INIT_LIST_HEAD(&hctx->hctx_list); /* * Allocate space for all possible cpus to avoid allocation at * runtime */ hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *), - GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node); + gfp, node); if (!hctx->ctxs) - goto unregister_cpu_notifier; + goto free_cpumask; if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8), - GFP_NOIO | __GFP_NOWARN | 
__GFP_NORETRY, node)) + gfp, node)) goto free_ctxs; - hctx->nr_ctx = 0; spin_lock_init(&hctx->dispatch_wait_lock); init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); INIT_LIST_HEAD(&hctx->dispatch_wait.entry); - if (set->ops->init_hctx && - set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) - goto free_bitmap; - - hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size, - GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY); + hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp); if (!hctx->fq) - goto exit_hctx; - - if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, node)) - goto free_fq; + goto free_bitmap; if (hctx->flags & BLK_MQ_F_BLOCKING) init_srcu_struct(hctx->srcu); + blk_mq_hctx_kobj_init(hctx); - blk_mq_debugfs_register_hctx(q, hctx); + return hctx; - return 0; - - free_fq: - blk_free_flush_queue(hctx->fq); - exit_hctx: - if (set->ops->exit_hctx) - set->ops->exit_hctx(hctx, hctx_idx); free_bitmap: sbitmap_free(&hctx->ctx_map); free_ctxs: kfree(hctx->ctxs); - unregister_cpu_notifier: - blk_mq_remove_cpuhp(hctx); - return -1; + free_cpumask: + free_cpumask_var(hctx->cpumask); + free_hctx: + kfree(hctx); + fail_alloc_hctx: + return NULL; } static void blk_mq_init_cpu_queues(struct request_queue *q, unsigned int nr_hw_queues) { - unsigned int i; + struct blk_mq_tag_set *set = q->tag_set; + unsigned int i, j; for_each_possible_cpu(i) { struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i); struct blk_mq_hw_ctx *hctx; + int k; __ctx->cpu = i; spin_lock_init(&__ctx->lock); - INIT_LIST_HEAD(&__ctx->rq_list); + for (k = HCTX_TYPE_DEFAULT; k < HCTX_MAX_TYPES; k++) + INIT_LIST_HEAD(&__ctx->rq_lists[k]); + __ctx->queue = q; /* * Set local node, IFF we have more than one hw queue. If * not, we remain on the home node of the device */ - hctx = blk_mq_map_queue(q, i); - if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) - hctx->numa_node = local_memory_node(cpu_to_node(i)); + for (j = 0; j < set->nr_maps; j++) { + hctx = blk_mq_map_queue_type(q, j, i); + if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) + hctx->numa_node = cpu_to_node(i); + } } } -static bool __blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, int hctx_idx) +static bool __blk_mq_alloc_map_and_request(struct blk_mq_tag_set *set, + int hctx_idx) { + unsigned int flags = set->flags; int ret = 0; set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx, - set->queue_depth, set->reserved_tags); + set->queue_depth, set->reserved_tags, flags); if (!set->tags[hctx_idx]) return false; @@ -2302,7 +2892,7 @@ if (!ret) return true; - blk_mq_free_rq_map(set->tags[hctx_idx]); + blk_mq_free_rq_map(set->tags[hctx_idx], flags); set->tags[hctx_idx] = NULL; return false; } @@ -2310,16 +2900,18 @@ static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set, unsigned int hctx_idx) { - if (set->tags[hctx_idx]) { + unsigned int flags = set->flags; + + if (set->tags && set->tags[hctx_idx]) { blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx); - blk_mq_free_rq_map(set->tags[hctx_idx]); + blk_mq_free_rq_map(set->tags[hctx_idx], flags); set->tags[hctx_idx] = NULL; } } static void blk_mq_map_swqueue(struct request_queue *q) { - unsigned int i, hctx_idx; + unsigned int i, j, hctx_idx; struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; struct blk_mq_tag_set *set = q->tag_set; @@ -2336,25 +2928,52 @@ * If the cpu isn't present, the cpu is mapped to first hctx. 
*/ for_each_possible_cpu(i) { - hctx_idx = q->mq_map[i]; - /* unmapped hw queue can be remapped after CPU topo changed */ - if (!set->tags[hctx_idx] && - !__blk_mq_alloc_rq_map(set, hctx_idx)) { - /* - * If tags initialization fail for some hctx, - * that hctx won't be brought online. In this - * case, remap the current ctx to hctx[0] which - * is guaranteed to always have tags allocated - */ - q->mq_map[i] = 0; - } ctx = per_cpu_ptr(q->queue_ctx, i); - hctx = blk_mq_map_queue(q, i); + for (j = 0; j < set->nr_maps; j++) { + if (!set->map[j].nr_queues) { + ctx->hctxs[j] = blk_mq_map_queue_type(q, + HCTX_TYPE_DEFAULT, i); + continue; + } + hctx_idx = set->map[j].mq_map[i]; + /* unmapped hw queue can be remapped after CPU topo changed */ + if (!set->tags[hctx_idx] && + !__blk_mq_alloc_map_and_request(set, hctx_idx)) { + /* + * If tags initialization fail for some hctx, + * that hctx won't be brought online. In this + * case, remap the current ctx to hctx[0] which + * is guaranteed to always have tags allocated + */ + set->map[j].mq_map[i] = 0; + } - cpumask_set_cpu(i, hctx->cpumask); - ctx->index_hw = hctx->nr_ctx; - hctx->ctxs[hctx->nr_ctx++] = ctx; + hctx = blk_mq_map_queue_type(q, j, i); + ctx->hctxs[j] = hctx; + /* + * If the CPU is already set in the mask, then we've + * mapped this one already. This can happen if + * devices share queues across queue maps. + */ + if (cpumask_test_cpu(i, hctx->cpumask)) + continue; + + cpumask_set_cpu(i, hctx->cpumask); + hctx->type = j; + ctx->index_hw[hctx->type] = hctx->nr_ctx; + hctx->ctxs[hctx->nr_ctx++] = ctx; + + /* + * If the nr_ctx type overflows, we have exceeded the + * amount of sw queues we can support. + */ + BUG_ON(!hctx->nr_ctx); + } + + for (; j < HCTX_MAX_TYPES; j++) + ctx->hctxs[j] = blk_mq_map_queue_type(q, + HCTX_TYPE_DEFAULT, i); } queue_for_each_hw_ctx(q, hctx, i) { @@ -2403,14 +3022,14 @@ queue_for_each_hw_ctx(q, hctx, i) { if (shared) - hctx->flags |= BLK_MQ_F_TAG_SHARED; + hctx->flags |= BLK_MQ_F_TAG_QUEUE_SHARED; else - hctx->flags &= ~BLK_MQ_F_TAG_SHARED; + hctx->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED; } } -static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set, - bool shared) +static void blk_mq_update_tag_set_shared(struct blk_mq_tag_set *set, + bool shared) { struct request_queue *q; @@ -2428,12 +3047,12 @@ struct blk_mq_tag_set *set = q->tag_set; mutex_lock(&set->tag_list_lock); - list_del_rcu(&q->tag_set_list); + list_del(&q->tag_set_list); if (list_is_singular(&set->tag_list)) { /* just transitioned to unshared */ - set->flags &= ~BLK_MQ_F_TAG_SHARED; + set->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED; /* update existing queue */ - blk_mq_update_tag_set_depth(set, false); + blk_mq_update_tag_set_shared(set, false); } mutex_unlock(&set->tag_list_lock); INIT_LIST_HEAD(&q->tag_set_list); @@ -2442,24 +3061,50 @@ static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, struct request_queue *q) { - q->tag_set = set; - mutex_lock(&set->tag_list_lock); /* * Check to see if we're transitioning to shared (from 1 to 2 queues). 
*/ if (!list_empty(&set->tag_list) && - !(set->flags & BLK_MQ_F_TAG_SHARED)) { - set->flags |= BLK_MQ_F_TAG_SHARED; + !(set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) { + set->flags |= BLK_MQ_F_TAG_QUEUE_SHARED; /* update existing queue */ - blk_mq_update_tag_set_depth(set, true); + blk_mq_update_tag_set_shared(set, true); } - if (set->flags & BLK_MQ_F_TAG_SHARED) + if (set->flags & BLK_MQ_F_TAG_QUEUE_SHARED) queue_set_hctx_shared(q, true); - list_add_tail_rcu(&q->tag_set_list, &set->tag_list); + list_add_tail(&q->tag_set_list, &set->tag_list); mutex_unlock(&set->tag_list_lock); +} + +/* All allocations will be freed in release handler of q->mq_kobj */ +static int blk_mq_alloc_ctxs(struct request_queue *q) +{ + struct blk_mq_ctxs *ctxs; + int cpu; + + ctxs = kzalloc(sizeof(*ctxs), GFP_KERNEL); + if (!ctxs) + return -ENOMEM; + + ctxs->queue_ctx = alloc_percpu(struct blk_mq_ctx); + if (!ctxs->queue_ctx) + goto fail; + + for_each_possible_cpu(cpu) { + struct blk_mq_ctx *ctx = per_cpu_ptr(ctxs->queue_ctx, cpu); + ctx->ctxs = ctxs; + } + + q->mq_kobj = &ctxs->kobj; + q->queue_ctx = ctxs->queue_ctx; + + return 0; + fail: + kfree(ctxs); + return -ENOMEM; } /* @@ -2470,17 +3115,17 @@ */ void blk_mq_release(struct request_queue *q) { - struct blk_mq_hw_ctx *hctx; - unsigned int i; + struct blk_mq_hw_ctx *hctx, *next; + int i; - /* hctx kobj stays in hctx */ - queue_for_each_hw_ctx(q, hctx, i) { - if (!hctx) - continue; + queue_for_each_hw_ctx(q, hctx, i) + WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list)); + + /* all hctx are in .unused_hctx_list now */ + list_for_each_entry_safe(hctx, next, &q->unused_hctx_list, hctx_list) { + list_del_init(&hctx->hctx_list); kobject_put(&hctx->kobj); } - - q->mq_map = NULL; kfree(q->queue_hw_ctx); @@ -2489,102 +3134,184 @@ * both share lifetime with request queue. */ blk_mq_sysfs_deinit(q); - - free_percpu(q->queue_ctx); } -struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) +struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, + void *queuedata) { struct request_queue *uninit_q, *q; - uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node, NULL); + uninit_q = blk_alloc_queue(set->numa_node); if (!uninit_q) return ERR_PTR(-ENOMEM); + uninit_q->queuedata = queuedata; - q = blk_mq_init_allocated_queue(set, uninit_q); + /* + * Initialize the queue without an elevator. device_add_disk() will do + * the initialization. + */ + q = blk_mq_init_allocated_queue(set, uninit_q, false); if (IS_ERR(q)) blk_cleanup_queue(uninit_q); return q; } +EXPORT_SYMBOL_GPL(blk_mq_init_queue_data); + +struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) +{ + return blk_mq_init_queue_data(set, NULL); +} EXPORT_SYMBOL(blk_mq_init_queue); -static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set) +/* + * Helper for setting up a queue with mq ops, given queue depth, and + * the passed in mq ops flags. 
+ */ +struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set, + const struct blk_mq_ops *ops, + unsigned int queue_depth, + unsigned int set_flags) { - int hw_ctx_size = sizeof(struct blk_mq_hw_ctx); + struct request_queue *q; + int ret; - BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu), - __alignof__(struct blk_mq_hw_ctx)) != - sizeof(struct blk_mq_hw_ctx)); + memset(set, 0, sizeof(*set)); + set->ops = ops; + set->nr_hw_queues = 1; + set->nr_maps = 1; + set->queue_depth = queue_depth; + set->numa_node = NUMA_NO_NODE; + set->flags = set_flags; - if (tag_set->flags & BLK_MQ_F_BLOCKING) - hw_ctx_size += sizeof(struct srcu_struct); + ret = blk_mq_alloc_tag_set(set); + if (ret) + return ERR_PTR(ret); - return hw_ctx_size; + q = blk_mq_init_queue(set); + if (IS_ERR(q)) { + blk_mq_free_tag_set(set); + return q; + } + + return q; +} +EXPORT_SYMBOL(blk_mq_init_sq_queue); + +static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx( + struct blk_mq_tag_set *set, struct request_queue *q, + int hctx_idx, int node) +{ + struct blk_mq_hw_ctx *hctx = NULL, *tmp; + + /* reuse dead hctx first */ + spin_lock(&q->unused_hctx_lock); + list_for_each_entry(tmp, &q->unused_hctx_list, hctx_list) { + if (tmp->numa_node == node) { + hctx = tmp; + break; + } + } + if (hctx) + list_del_init(&hctx->hctx_list); + spin_unlock(&q->unused_hctx_lock); + + if (!hctx) + hctx = blk_mq_alloc_hctx(q, set, node); + if (!hctx) + goto fail; + + if (blk_mq_init_hctx(q, set, hctx, hctx_idx)) + goto free_hctx; + + return hctx; + + free_hctx: + kobject_put(&hctx->kobj); + fail: + return NULL; } static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, struct request_queue *q) { - int i, j; + int i, j, end; struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx; - blk_mq_sysfs_unregister(q); + if (q->nr_hw_queues < set->nr_hw_queues) { + struct blk_mq_hw_ctx **new_hctxs; + + new_hctxs = kcalloc_node(set->nr_hw_queues, + sizeof(*new_hctxs), GFP_KERNEL, + set->numa_node); + if (!new_hctxs) + return; + if (hctxs) + memcpy(new_hctxs, hctxs, q->nr_hw_queues * + sizeof(*hctxs)); + q->queue_hw_ctx = new_hctxs; + kfree(hctxs); + hctxs = new_hctxs; + } /* protect against switching io scheduler */ mutex_lock(&q->sysfs_lock); for (i = 0; i < set->nr_hw_queues; i++) { int node; + struct blk_mq_hw_ctx *hctx; - if (hctxs[i]) + node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], i); + /* + * If the hw queue has been mapped to another numa node, + * we need to realloc the hctx. If allocation fails, fallback + * to use the previous one. 
+ */ + if (hctxs[i] && (hctxs[i]->numa_node == node)) continue; - node = blk_mq_hw_queue_to_node(q->mq_map, i); - hctxs[i] = kzalloc_node(blk_mq_hw_ctx_size(set), - GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, - node); - if (!hctxs[i]) - break; - - if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask, - GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, - node)) { - kfree(hctxs[i]); - hctxs[i] = NULL; - break; + hctx = blk_mq_alloc_and_init_hctx(set, q, i, node); + if (hctx) { + if (hctxs[i]) + blk_mq_exit_hctx(q, set, hctxs[i], i); + hctxs[i] = hctx; + } else { + if (hctxs[i]) + pr_warn("Allocate new hctx on node %d fails,\ + fallback to previous one on node %d\n", + node, hctxs[i]->numa_node); + else + break; } - - atomic_set(&hctxs[i]->nr_active, 0); - hctxs[i]->numa_node = node; - hctxs[i]->queue_num = i; - - if (blk_mq_init_hctx(q, set, hctxs[i], i)) { - free_cpumask_var(hctxs[i]->cpumask); - kfree(hctxs[i]); - hctxs[i] = NULL; - break; - } - blk_mq_hctx_kobj_init(hctxs[i]); } - for (j = i; j < q->nr_hw_queues; j++) { + /* + * Increasing nr_hw_queues fails. Free the newly allocated + * hctxs and keep the previous q->nr_hw_queues. + */ + if (i != set->nr_hw_queues) { + j = q->nr_hw_queues; + end = i; + } else { + j = i; + end = q->nr_hw_queues; + q->nr_hw_queues = set->nr_hw_queues; + } + + for (; j < end; j++) { struct blk_mq_hw_ctx *hctx = hctxs[j]; if (hctx) { if (hctx->tags) blk_mq_free_map_and_requests(set, j); blk_mq_exit_hctx(q, set, hctx, j); - kobject_put(&hctx->kobj); hctxs[j] = NULL; - } } - q->nr_hw_queues = i; mutex_unlock(&q->sysfs_lock); - blk_mq_sysfs_register(q); } struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, - struct request_queue *q) + struct request_queue *q, + bool elevator_init) { /* mark the queue as mq asap */ q->mq_ops = set->ops; @@ -2595,19 +3322,14 @@ if (!q->poll_cb) goto err_exit; - q->queue_ctx = alloc_percpu(struct blk_mq_ctx); - if (!q->queue_ctx) - goto err_exit; + if (blk_mq_alloc_ctxs(q)) + goto err_poll; /* init q->mq_kobj and sw queues' kobjects */ blk_mq_sysfs_init(q); - q->queue_hw_ctx = kcalloc_node(nr_cpu_ids, sizeof(*(q->queue_hw_ctx)), - GFP_KERNEL, set->numa_node); - if (!q->queue_hw_ctx) - goto err_percpu; - - q->mq_map = set->mq_map; + INIT_LIST_HEAD(&q->unused_hctx_list); + spin_lock_init(&q->unused_hctx_lock); blk_mq_realloc_hw_ctxs(set, q); if (!q->nr_hw_queues) @@ -2616,12 +3338,12 @@ INIT_WORK(&q->timeout_work, blk_mq_timeout_work); blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ); - q->nr_queues = nr_cpu_ids; + q->tag_set = set; q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; - - if (!(set->flags & BLK_MQ_F_SG_MERGE)) - queue_flag_set_unlocked(QUEUE_FLAG_NO_SG_MERGE, q); + if (set->nr_maps > HCTX_TYPE_POLL && + set->map[HCTX_TYPE_POLL].nr_queues) + blk_queue_flag_set(QUEUE_FLAG_POLL, q); q->sg_reserved_size = INT_MAX; @@ -2629,41 +3351,29 @@ INIT_LIST_HEAD(&q->requeue_list); spin_lock_init(&q->requeue_lock); - blk_queue_make_request(q, blk_mq_make_request); - if (q->mq_ops->poll) - q->poll_fn = blk_mq_poll; - - /* - * Do this after blk_queue_make_request() overrides it... 
- */ q->nr_requests = set->queue_depth; /* * Default to classic polling */ - q->poll_nsec = -1; - - if (set->ops->complete) - blk_queue_softirq_done(q, set->ops->complete); + q->poll_nsec = BLK_MQ_POLL_CLASSIC; blk_mq_init_cpu_queues(q, set->nr_hw_queues); blk_mq_add_queue_tag_set(set, q); blk_mq_map_swqueue(q); - if (!(set->flags & BLK_MQ_F_NO_SCHED)) { - int ret; - - ret = elevator_init_mq(q); - if (ret) - return ERR_PTR(ret); - } + if (elevator_init) + elevator_init_mq(q); return q; err_hctxs: kfree(q->queue_hw_ctx); -err_percpu: - free_percpu(q->queue_ctx); + q->nr_hw_queues = 0; + blk_mq_sysfs_deinit(q); +err_poll: + blk_stat_free_callback(q->poll_cb); + q->poll_cb = NULL; err_exit: q->mq_ops = NULL; return ERR_PTR(-ENOMEM); @@ -2681,38 +3391,21 @@ blk_mq_del_queue_tag_set(q); } -/* Basically redo blk_mq_init_queue with queue frozen */ -static void blk_mq_queue_reinit(struct request_queue *q) -{ - WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth)); - - blk_mq_debugfs_unregister_hctxs(q); - blk_mq_sysfs_unregister(q); - - /* - * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe - * we should change hctx numa_node according to the new topology (this - * involves freeing and re-allocating memory, worth doing?) - */ - blk_mq_map_swqueue(q); - - blk_mq_sysfs_register(q); - blk_mq_debugfs_register_hctxs(q); -} - static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) { int i; - for (i = 0; i < set->nr_hw_queues; i++) - if (!__blk_mq_alloc_rq_map(set, i)) + for (i = 0; i < set->nr_hw_queues; i++) { + if (!__blk_mq_alloc_map_and_request(set, i)) goto out_unwind; + cond_resched(); + } return 0; out_unwind: while (--i >= 0) - blk_mq_free_rq_map(set->tags[i]); + blk_mq_free_map_and_requests(set, i); return -ENOMEM; } @@ -2722,7 +3415,7 @@ * may reduce the depth asked for, if memory is tight. set->queue_depth * will be updated to reflect the allocated depth. */ -static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) +static int blk_mq_alloc_map_and_requests(struct blk_mq_tag_set *set) { unsigned int depth; int err; @@ -2754,7 +3447,17 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) { - if (set->ops->map_queues) { + /* + * blk_mq_map_queues() and multiple .map_queues() implementations + * expect that set->map[HCTX_TYPE_DEFAULT].nr_queues is set to the + * number of hardware queues. + */ + if (set->nr_maps == 1) + set->map[HCTX_TYPE_DEFAULT].nr_queues = set->nr_hw_queues; + + if (set->ops->map_queues && !is_kdump_kernel()) { + int i; + /* * transport .map_queues is usually done in the following * way: @@ -2762,18 +3465,44 @@ * for (queue = 0; queue < set->nr_hw_queues; queue++) { * mask = get_cpu_mask(queue) * for_each_cpu(cpu, mask) - * set->mq_map[cpu] = queue; + * set->map[x].mq_map[cpu] = queue; * } * * When we need to remap, the table has to be cleared for * killing stale mapping since one CPU may not be mapped * to any hw queue. 
*/ - blk_mq_clear_mq_map(set); + for (i = 0; i < set->nr_maps; i++) + blk_mq_clear_mq_map(&set->map[i]); return set->ops->map_queues(set); - } else - return blk_mq_map_queues(set); + } else { + BUG_ON(set->nr_maps > 1); + return blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); + } +} + +static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set, + int cur_nr_hw_queues, int new_nr_hw_queues) +{ + struct blk_mq_tags **new_tags; + + if (cur_nr_hw_queues >= new_nr_hw_queues) + return 0; + + new_tags = kcalloc_node(new_nr_hw_queues, sizeof(struct blk_mq_tags *), + GFP_KERNEL, set->numa_node); + if (!new_tags) + return -ENOMEM; + + if (set->tags) + memcpy(new_tags, set->tags, cur_nr_hw_queues * + sizeof(*set->tags)); + kfree(set->tags); + set->tags = new_tags; + set->nr_hw_queues = new_nr_hw_queues; + + return 0; } /* @@ -2784,7 +3513,7 @@ */ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) { - int ret; + int i, ret; BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS); @@ -2807,6 +3536,11 @@ set->queue_depth = BLK_MQ_MAX_DEPTH; } + if (!set->nr_maps) + set->nr_maps = 1; + else if (set->nr_maps > HCTX_MAX_TYPES) + return -EINVAL; + /* * If a crashdump is active, then we are potentially in a very * memory constrained environment. Limit us to 1 queue and @@ -2814,42 +3548,59 @@ */ if (is_kdump_kernel()) { set->nr_hw_queues = 1; + set->nr_maps = 1; set->queue_depth = min(64U, set->queue_depth); } /* - * There is no use for more h/w queues than cpus. + * There is no use for more h/w queues than cpus if we just have + * a single map */ - if (set->nr_hw_queues > nr_cpu_ids) + if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids) set->nr_hw_queues = nr_cpu_ids; - set->tags = kcalloc_node(nr_cpu_ids, sizeof(struct blk_mq_tags *), - GFP_KERNEL, set->numa_node); - if (!set->tags) + if (blk_mq_realloc_tag_set_tags(set, 0, set->nr_hw_queues) < 0) return -ENOMEM; ret = -ENOMEM; - set->mq_map = kcalloc_node(nr_cpu_ids, sizeof(*set->mq_map), - GFP_KERNEL, set->numa_node); - if (!set->mq_map) - goto out_free_tags; + for (i = 0; i < set->nr_maps; i++) { + set->map[i].mq_map = kcalloc_node(nr_cpu_ids, + sizeof(set->map[i].mq_map[0]), + GFP_KERNEL, set->numa_node); + if (!set->map[i].mq_map) + goto out_free_mq_map; + set->map[i].nr_queues = is_kdump_kernel() ? 
1 : set->nr_hw_queues; + } ret = blk_mq_update_queue_map(set); if (ret) goto out_free_mq_map; - ret = blk_mq_alloc_rq_maps(set); + ret = blk_mq_alloc_map_and_requests(set); if (ret) goto out_free_mq_map; + + if (blk_mq_is_sbitmap_shared(set->flags)) { + atomic_set(&set->active_queues_shared_sbitmap, 0); + + if (blk_mq_init_shared_sbitmap(set, set->flags)) { + ret = -ENOMEM; + goto out_free_mq_rq_maps; + } + } mutex_init(&set->tag_list_lock); INIT_LIST_HEAD(&set->tag_list); return 0; +out_free_mq_rq_maps: + for (i = 0; i < set->nr_hw_queues; i++) + blk_mq_free_map_and_requests(set, i); out_free_mq_map: - kfree(set->mq_map); - set->mq_map = NULL; -out_free_tags: + for (i = 0; i < set->nr_maps; i++) { + kfree(set->map[i].mq_map); + set->map[i].mq_map = NULL; + } kfree(set->tags); set->tags = NULL; return ret; @@ -2858,13 +3609,18 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) { - int i; + int i, j; - for (i = 0; i < nr_cpu_ids; i++) + for (i = 0; i < set->nr_hw_queues; i++) blk_mq_free_map_and_requests(set, i); - kfree(set->mq_map); - set->mq_map = NULL; + if (blk_mq_is_sbitmap_shared(set->flags)) + blk_mq_exit_shared_sbitmap(set); + + for (j = 0; j < set->nr_maps; j++) { + kfree(set->map[j].mq_map); + set->map[j].mq_map = NULL; + } kfree(set->tags); set->tags = NULL; @@ -2880,6 +3636,9 @@ if (!set) return -EINVAL; + if (q->nr_requests == nr) + return 0; + blk_mq_freeze_queue(q); blk_mq_quiesce_queue(q); @@ -2894,14 +3653,16 @@ if (!hctx->sched_tags) { ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr, false); + if (!ret && blk_mq_is_sbitmap_shared(set->flags)) + blk_mq_tag_resize_shared_sbitmap(set, nr); } else { ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags, nr, true); } if (ret) break; - if (q->elevator && q->elevator->type->ops.mq.depth_updated) - q->elevator->type->ops.mq.depth_updated(hctx); + if (q->elevator && q->elevator->type->ops.depth_updated) + q->elevator->type->ops.depth_updated(hctx); } if (!ret) @@ -2988,20 +3749,19 @@ { struct request_queue *q; LIST_HEAD(head); + int prev_nr_hw_queues; lockdep_assert_held(&set->tag_list_lock); - if (nr_hw_queues > nr_cpu_ids) + if (set->nr_maps == 1 && nr_hw_queues > nr_cpu_ids) nr_hw_queues = nr_cpu_ids; - if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues) + if (nr_hw_queues < 1) + return; + if (set->nr_maps == 1 && nr_hw_queues == set->nr_hw_queues) return; list_for_each_entry(q, &set->tag_list, tag_set_list) blk_mq_freeze_queue(q); - /* - * Sync with blk_mq_queue_tag_busy_iter. - */ - synchronize_rcu(); /* * Switch IO scheduler to 'none', cleaning up the data associated * with the previous scheduler. 
We will switch back once we are done @@ -3011,11 +3771,35 @@ if (!blk_mq_elv_switch_none(&head, q)) goto switch_back; + list_for_each_entry(q, &set->tag_list, tag_set_list) { + blk_mq_debugfs_unregister_hctxs(q); + blk_mq_sysfs_unregister(q); + } + + prev_nr_hw_queues = set->nr_hw_queues; + if (blk_mq_realloc_tag_set_tags(set, set->nr_hw_queues, nr_hw_queues) < + 0) + goto reregister; + set->nr_hw_queues = nr_hw_queues; +fallback: blk_mq_update_queue_map(set); list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_realloc_hw_ctxs(set, q); - blk_mq_queue_reinit(q); + if (q->nr_hw_queues != set->nr_hw_queues) { + pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n", + nr_hw_queues, prev_nr_hw_queues); + set->nr_hw_queues = prev_nr_hw_queues; + blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); + goto fallback; + } + blk_mq_map_swqueue(q); + } + +reregister: + list_for_each_entry(q, &set->tag_list, tag_set_list) { + blk_mq_sysfs_register(q); + blk_mq_debugfs_register_hctxs(q); } switch_back: @@ -3069,7 +3853,6 @@ } static unsigned long blk_mq_poll_nsecs(struct request_queue *q, - struct blk_mq_hw_ctx *hctx, struct request *rq) { unsigned long ret = 0; @@ -3102,7 +3885,6 @@ } static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, - struct blk_mq_hw_ctx *hctx, struct request *rq) { struct hrtimer_sleeper hs; @@ -3114,18 +3896,15 @@ return false; /* - * poll_nsec can be: + * If we get here, hybrid polling is enabled. Hence poll_nsec can be: * - * -1: don't ever hybrid sleep * 0: use half of prev avg * >0: use this specific value */ - if (q->poll_nsec == -1) - return false; - else if (q->poll_nsec > 0) + if (q->poll_nsec > 0) nsecs = q->poll_nsec; else - nsecs = blk_mq_poll_nsecs(q, hctx, rq); + nsecs = blk_mq_poll_nsecs(q, rq); if (!nsecs) return false; @@ -3139,15 +3918,14 @@ kt = nsecs; mode = HRTIMER_MODE_REL; - hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, mode); + hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, mode); hrtimer_set_expires(&hs.timer, kt); - hrtimer_init_sleeper(&hs, current); do { if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE) break; set_current_state(TASK_UNINTERRUPTIBLE); - hrtimer_start_expires(&hs.timer, mode); + hrtimer_sleeper_start_expires(&hs, mode); if (hs.task) io_schedule(); hrtimer_cancel(&hs.timer); @@ -3159,59 +3937,14 @@ return true; } -static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq) +static bool blk_mq_poll_hybrid(struct request_queue *q, + struct blk_mq_hw_ctx *hctx, blk_qc_t cookie) { - struct request_queue *q = hctx->queue; - long state; - - /* - * If we sleep, have the caller restart the poll loop to reset - * the state. Like for the other success return cases, the - * caller is responsible for checking if the IO completed. If - * the IO isn't complete, we'll get called again and will go - * straight to the busy poll loop. 
- */ - if (blk_mq_poll_hybrid_sleep(q, hctx, rq)) - return true; - - hctx->poll_considered++; - - state = current->state; - while (!need_resched()) { - int ret; - - hctx->poll_invoked++; - - ret = q->mq_ops->poll(hctx, rq->tag); - if (ret > 0) { - hctx->poll_success++; - set_current_state(TASK_RUNNING); - return true; - } - - if (signal_pending_state(state, current)) - set_current_state(TASK_RUNNING); - - if (current->state == TASK_RUNNING) - return true; - if (ret < 0) - break; - cpu_relax(); - } - - __set_current_state(TASK_RUNNING); - return false; -} - -static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie) -{ - struct blk_mq_hw_ctx *hctx; struct request *rq; - if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) + if (q->poll_nsec == BLK_MQ_POLL_CLASSIC) return false; - hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)]; if (!blk_qc_t_is_internal(cookie)) rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie)); else { @@ -3226,13 +3959,97 @@ return false; } - return __blk_mq_poll(hctx, rq); + return blk_mq_poll_hybrid_sleep(q, rq); } + +/** + * blk_poll - poll for IO completions + * @q: the queue + * @cookie: cookie passed back at IO submission time + * @spin: whether to spin for completions + * + * Description: + * Poll for completions on the passed in queue. Returns number of + * completed entries found. If @spin is true, then blk_poll will continue + * looping until at least one completion is found, unless the task is + * otherwise marked running (or we need to reschedule). + */ +int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin) +{ + struct blk_mq_hw_ctx *hctx; + long state; + + if (!blk_qc_t_valid(cookie) || + !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) + return 0; + + if (current->plug) + blk_flush_plug_list(current->plug, false); + + hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)]; + + /* + * If we sleep, have the caller restart the poll loop to reset + * the state. Like for the other success return cases, the + * caller is responsible for checking if the IO completed. If + * the IO isn't complete, we'll get called again and will go + * straight to the busy poll loop. + */ + if (blk_mq_poll_hybrid(q, hctx, cookie)) + return 1; + + hctx->poll_considered++; + + state = current->state; + do { + int ret; + + hctx->poll_invoked++; + + ret = q->mq_ops->poll(hctx); + if (ret > 0) { + hctx->poll_success++; + __set_current_state(TASK_RUNNING); + return ret; + } + + if (signal_pending_state(state, current)) + __set_current_state(TASK_RUNNING); + + if (current->state == TASK_RUNNING) + return 1; + if (ret < 0 || !spin) + break; + cpu_relax(); + } while (!need_resched()); + + __set_current_state(TASK_RUNNING); + return 0; +} +EXPORT_SYMBOL_GPL(blk_poll); + +unsigned int blk_mq_rq_cpu(struct request *rq) +{ + return rq->mq_ctx->cpu; +} +EXPORT_SYMBOL(blk_mq_rq_cpu); static int __init blk_mq_init(void) { + int i; + + for_each_possible_cpu(i) + INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i)); + open_softirq(BLOCK_SOFTIRQ, blk_done_softirq); + + cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD, + "block/softirq:dead", NULL, + blk_softirq_cpu_dead); cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL, blk_mq_hctx_notify_dead); + cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online", + blk_mq_hctx_notify_online, + blk_mq_hctx_notify_offline); return 0; } subsys_initcall(blk_mq_init); -- Gitblit v1.6.2
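
A minimal, stand-alone user-space sketch of the hybrid-polling delay selection that blk_mq_poll_hybrid_sleep()/blk_mq_poll_nsecs() implement in the patch above, assuming the q->poll_nsec convention introduced here (BLK_MQ_POLL_CLASSIC, 0, or a fixed positive value); every identifier ending in _sim is invented for this illustration and does not exist in the kernel:

/*
 * Not kernel code: simulates how the hybrid-poll sleep time is picked.
 * A negative "classic" value means never hybrid sleep, 0 means use half
 * of the observed mean completion time for the request's stats bucket,
 * and any positive value is a fixed, user-configured delay in nanoseconds.
 */
#include <stdio.h>
#include <stdint.h>

#define BLK_MQ_POLL_CLASSIC_SIM (-1)

struct queue_sim {
	int64_t poll_nsec;       /* -1 (classic), 0 (adaptive), or fixed nsecs */
	uint64_t bucket_mean_ns; /* mean completion time tracked per stats bucket */
};

static uint64_t hybrid_poll_nsecs_sim(const struct queue_sim *q)
{
	if (q->poll_nsec == BLK_MQ_POLL_CLASSIC_SIM)
		return 0;                      /* classic polling: spin, never sleep */
	if (q->poll_nsec > 0)
		return (uint64_t)q->poll_nsec; /* fixed delay chosen by the user */
	return q->bucket_mean_ns / 2;          /* adaptive: half of the mean latency */
}

int main(void)
{
	struct queue_sim q = { .poll_nsec = 0, .bucket_mean_ns = 8000 };

	printf("adaptive sleep: %llu ns\n",
	       (unsigned long long)hybrid_poll_nsecs_sim(&q));

	q.poll_nsec = 2000;
	printf("fixed sleep:    %llu ns\n",
	       (unsigned long long)hybrid_poll_nsecs_sim(&q));

	q.poll_nsec = BLK_MQ_POLL_CLASSIC_SIM;
	printf("classic mode:   %llu ns (no hybrid sleep)\n",
	       (unsigned long long)hybrid_poll_nsecs_sim(&q));
	return 0;
}

Halving the measured mean keeps the sleep comfortably short of the expected completion time, so the busy-poll loop that follows the sleep still observes the completion instead of overshooting it.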