2024-02-20 102a0743326a03cd1a1202ceda21e175b7d3575c
kernel/block/blk-mq.c
@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0
12 /*
23 * Block multiqueue core code
34 *
@@ -25,30 +26,36 @@
2526 #include <linux/delay.h>
2627 #include <linux/crash_dump.h>
2728 #include <linux/prefetch.h>
29
+#include <linux/blk-crypto.h>
2830
2931 #include <trace/events/block.h>
3032
3133 #include <linux/blk-mq.h>
34
+#include <linux/t10-pi.h>
3235 #include "blk.h"
3336 #include "blk-mq.h"
3437 #include "blk-mq-debugfs.h"
3538 #include "blk-mq-tag.h"
39
+#include "blk-pm.h"
3640 #include "blk-stat.h"
3741 #include "blk-mq-sched.h"
3842 #include "blk-rq-qos.h"
3943
40
-static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie);
44
+#include <trace/hooks/block.h>
45
+
46
+static DEFINE_PER_CPU(struct list_head, blk_cpu_done);
47
+
4148 static void blk_mq_poll_stats_start(struct request_queue *q);
4249 static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
4350
4451 static int blk_mq_poll_stats_bkt(const struct request *rq)
4552 {
46
- int ddir, bytes, bucket;
53
+ int ddir, sectors, bucket;
4754
4855 ddir = rq_data_dir(rq);
49
- bytes = blk_rq_bytes(rq);
56
+ sectors = blk_rq_stats_sectors(rq);
5057
51
- bucket = ddir + 2*(ilog2(bytes) - 9);
58
+ bucket = ddir + 2 * ilog2(sectors);
5259
5360 if (bucket < 0)
5461 return -1;
@@ -59,7 +66,8 @@
5966 }
6067
6168 /*
62
- * Check if any of the ctx's have pending work in this hardware queue
69
+ * Check if any of the ctx, dispatch list or elevator
70
+ * have pending work in this hardware queue.
6371 */
6472 static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
6573 {
@@ -74,75 +82,67 @@
7482 static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
7583 struct blk_mq_ctx *ctx)
7684 {
77
- if (!sbitmap_test_bit(&hctx->ctx_map, ctx->index_hw))
78
- sbitmap_set_bit(&hctx->ctx_map, ctx->index_hw);
85
+ const int bit = ctx->index_hw[hctx->type];
86
+
87
+ if (!sbitmap_test_bit(&hctx->ctx_map, bit))
88
+ sbitmap_set_bit(&hctx->ctx_map, bit);
7989 }
8090
8191 static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
8292 struct blk_mq_ctx *ctx)
8393 {
84
- sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw);
94
+ const int bit = ctx->index_hw[hctx->type];
95
+
96
+ sbitmap_clear_bit(&hctx->ctx_map, bit);
8597 }
8698
8799 struct mq_inflight {
88100 struct hd_struct *part;
89
- unsigned int *inflight;
101
+ unsigned int inflight[2];
90102 };
91103
92
-static void blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
104
+static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
93105 struct request *rq, void *priv,
94106 bool reserved)
95107 {
96108 struct mq_inflight *mi = priv;
97109
98
- /*
99
- * index[0] counts the specific partition that was asked for. index[1]
100
- * counts the ones that are active on the whole device, so increment
101
- * that if mi->part is indeed a partition, and not a whole device.
102
- */
103
- if (rq->part == mi->part)
104
- mi->inflight[0]++;
105
- if (mi->part->partno)
106
- mi->inflight[1]++;
107
-}
108
-
109
-void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part,
110
- unsigned int inflight[2])
111
-{
112
- struct mq_inflight mi = { .part = part, .inflight = inflight, };
113
-
114
- inflight[0] = inflight[1] = 0;
115
- blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
116
-}
117
-
118
-static void blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx,
119
- struct request *rq, void *priv,
120
- bool reserved)
121
-{
122
- struct mq_inflight *mi = priv;
123
-
124
- if (rq->part == mi->part)
110
+ if ((!mi->part->partno || rq->part == mi->part) &&
111
+ blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT)
125112 mi->inflight[rq_data_dir(rq)]++;
113
+
114
+ return true;
115
+}
116
+
117
+unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part)
118
+{
119
+ struct mq_inflight mi = { .part = part };
120
+
121
+ blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
122
+
123
+ return mi.inflight[0] + mi.inflight[1];
126124 }
127125
128126 void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part,
129127 unsigned int inflight[2])
130128 {
131
- struct mq_inflight mi = { .part = part, .inflight = inflight, };
129
+ struct mq_inflight mi = { .part = part };
132130
133
- inflight[0] = inflight[1] = 0;
134
- blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight_rw, &mi);
131
+ blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
132
+ inflight[0] = mi.inflight[0];
133
+ inflight[1] = mi.inflight[1];
135134 }
136135
137136 void blk_freeze_queue_start(struct request_queue *q)
138137 {
139
- int freeze_depth;
140
-
141
- freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
142
- if (freeze_depth == 1) {
138
+ mutex_lock(&q->mq_freeze_lock);
139
+ if (++q->mq_freeze_depth == 1) {
143140 percpu_ref_kill(&q->q_usage_counter);
144
- if (q->mq_ops)
141
+ mutex_unlock(&q->mq_freeze_lock);
142
+ if (queue_is_mq(q))
145143 blk_mq_run_hw_queues(q, false);
144
+ } else {
145
+ mutex_unlock(&q->mq_freeze_lock);
146146 }
147147 }
148148 EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
@@ -176,8 +176,6 @@
176176 * exported to drivers as the only user for unfreeze is blk_mq.
177177 */
178178 blk_freeze_queue_start(q);
179
- if (!q->mq_ops)
180
- blk_drain_queue(q);
181179 blk_mq_freeze_queue_wait(q);
182180 }
183181
@@ -193,14 +191,14 @@
193191
194192 void blk_mq_unfreeze_queue(struct request_queue *q)
195193 {
196
- int freeze_depth;
197
-
198
- freeze_depth = atomic_dec_return(&q->mq_freeze_depth);
199
- WARN_ON_ONCE(freeze_depth < 0);
200
- if (!freeze_depth) {
201
- percpu_ref_reinit(&q->q_usage_counter);
194
+ mutex_lock(&q->mq_freeze_lock);
195
+ q->mq_freeze_depth--;
196
+ WARN_ON_ONCE(q->mq_freeze_depth < 0);
197
+ if (!q->mq_freeze_depth) {
198
+ percpu_ref_resurrect(&q->q_usage_counter);
202199 wake_up_all(&q->mq_freeze_wq);
203200 }
201
+ mutex_unlock(&q->mq_freeze_lock);
204202 }
205203 EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
206204
@@ -268,40 +266,37 @@
268266 blk_mq_tag_wakeup_all(hctx->tags, true);
269267 }
270268
271
-bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
269
+/*
270
+ * Only need start/end time stamping if we have iostat or
271
+ * blk stats enabled, or using an IO scheduler.
272
+ */
273
+static inline bool blk_mq_need_time_stamp(struct request *rq)
272274 {
273
- return blk_mq_has_free_tags(hctx->tags);
275
+ return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS)) || rq->q->elevator;
274276 }
275
-EXPORT_SYMBOL(blk_mq_can_queue);
276277
277278 static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
278
- unsigned int tag, unsigned int op)
279
+ unsigned int tag, u64 alloc_time_ns)
279280 {
280281 struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
281282 struct request *rq = tags->static_rqs[tag];
282
- req_flags_t rq_flags = 0;
283283
284
- if (data->flags & BLK_MQ_REQ_INTERNAL) {
285
- rq->tag = -1;
284
+ if (data->q->elevator) {
285
+ rq->tag = BLK_MQ_NO_TAG;
286286 rq->internal_tag = tag;
287287 } else {
288
- if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) {
289
- rq_flags = RQF_MQ_INFLIGHT;
290
- atomic_inc(&data->hctx->nr_active);
291
- }
292288 rq->tag = tag;
293
- rq->internal_tag = -1;
294
- data->hctx->tags->rqs[rq->tag] = rq;
289
+ rq->internal_tag = BLK_MQ_NO_TAG;
295290 }
296291
297292 /* csd/requeue_work/fifo_time is initialized before use */
298293 rq->q = data->q;
299294 rq->mq_ctx = data->ctx;
300
- rq->rq_flags = rq_flags;
301
- rq->cpu = -1;
302
- rq->cmd_flags = op;
303
- if (data->flags & BLK_MQ_REQ_PREEMPT)
304
- rq->rq_flags |= RQF_PREEMPT;
295
+ rq->mq_hctx = data->hctx;
296
+ rq->rq_flags = 0;
297
+ rq->cmd_flags = data->cmd_flags;
298
+ if (data->flags & BLK_MQ_REQ_PM)
299
+ rq->rq_flags |= RQF_PM;
305300 if (blk_queue_io_stat(data->q))
306301 rq->rq_flags |= RQF_IO_STAT;
307302 INIT_LIST_HEAD(&rq->queuelist);
@@ -309,100 +304,110 @@
309304 RB_CLEAR_NODE(&rq->rb_node);
310305 rq->rq_disk = NULL;
311306 rq->part = NULL;
312
- rq->start_time_ns = ktime_get_ns();
307
+#ifdef CONFIG_BLK_RQ_ALLOC_TIME
308
+ rq->alloc_time_ns = alloc_time_ns;
309
+#endif
310
+ if (blk_mq_need_time_stamp(rq))
311
+ rq->start_time_ns = ktime_get_ns();
312
+ else
313
+ rq->start_time_ns = 0;
313314 rq->io_start_time_ns = 0;
315
+ rq->stats_sectors = 0;
314316 rq->nr_phys_segments = 0;
315317 #if defined(CONFIG_BLK_DEV_INTEGRITY)
316318 rq->nr_integrity_segments = 0;
317319 #endif
318
- rq->special = NULL;
320
+ blk_crypto_rq_set_defaults(rq);
319321 /* tag was already set */
320
- rq->extra_len = 0;
321
- rq->__deadline = 0;
322
+ WRITE_ONCE(rq->deadline, 0);
322323
323
-#ifdef CONFIG_PREEMPT_RT_FULL
324
- INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work);
325
-#endif
326
- INIT_LIST_HEAD(&rq->timeout_list);
327324 rq->timeout = 0;
328325
329326 rq->end_io = NULL;
330327 rq->end_io_data = NULL;
331
- rq->next_rq = NULL;
332328
333
-#ifdef CONFIG_BLK_CGROUP
334
- rq->rl = NULL;
335
-#endif
336
-
337
- data->ctx->rq_dispatched[op_is_sync(op)]++;
329
+ data->ctx->rq_dispatched[op_is_sync(data->cmd_flags)]++;
338330 refcount_set(&rq->ref, 1);
331
+
332
+ if (!op_is_flush(data->cmd_flags)) {
333
+ struct elevator_queue *e = data->q->elevator;
334
+
335
+ rq->elv.icq = NULL;
336
+ if (e && e->type->ops.prepare_request) {
337
+ if (e->type->icq_cache)
338
+ blk_mq_sched_assign_ioc(rq);
339
+
340
+ e->type->ops.prepare_request(rq);
341
+ rq->rq_flags |= RQF_ELVPRIV;
342
+ }
343
+ }
344
+
345
+ data->hctx->queued++;
346
+ trace_android_vh_blk_rq_ctx_init(rq, tags, data, alloc_time_ns);
339347 return rq;
340348 }
341349
342
-static struct request *blk_mq_get_request(struct request_queue *q,
343
- struct bio *bio, unsigned int op,
344
- struct blk_mq_alloc_data *data)
350
+static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data)
345351 {
352
+ struct request_queue *q = data->q;
346353 struct elevator_queue *e = q->elevator;
347
- struct request *rq;
354
+ u64 alloc_time_ns = 0;
348355 unsigned int tag;
349
- bool put_ctx_on_error = false;
350356
351
- blk_queue_enter_live(q);
352
- data->q = q;
353
- if (likely(!data->ctx)) {
354
- data->ctx = blk_mq_get_ctx(q);
355
- put_ctx_on_error = true;
356
- }
357
- if (likely(!data->hctx))
358
- data->hctx = blk_mq_map_queue(q, data->ctx->cpu);
359
- if (op & REQ_NOWAIT)
357
+ /* alloc_time includes depth and tag waits */
358
+ if (blk_queue_rq_alloc_time(q))
359
+ alloc_time_ns = ktime_get_ns();
360
+
361
+ if (data->cmd_flags & REQ_NOWAIT)
360362 data->flags |= BLK_MQ_REQ_NOWAIT;
361363
362364 if (e) {
363
- data->flags |= BLK_MQ_REQ_INTERNAL;
364
-
365365 /*
366366 * Flush requests are special and go directly to the
367367 * dispatch list. Don't include reserved tags in the
368368 * limiting, as it isn't useful.
369369 */
370
- if (!op_is_flush(op) && e->type->ops.mq.limit_depth &&
370
+ if (!op_is_flush(data->cmd_flags) &&
371
+ e->type->ops.limit_depth &&
371372 !(data->flags & BLK_MQ_REQ_RESERVED))
372
- e->type->ops.mq.limit_depth(op, data);
373
- } else {
373
+ e->type->ops.limit_depth(data->cmd_flags, data);
374
+ }
375
+
376
+retry:
377
+ data->ctx = blk_mq_get_ctx(q);
378
+ data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx);
379
+ if (!e)
374380 blk_mq_tag_busy(data->hctx);
375
- }
376381
382
+ /*
383
+ * Waiting allocations only fail because of an inactive hctx. In that
384
+ * case just retry the hctx assignment and tag allocation as CPU hotplug
385
+ * should have migrated us to an online CPU by now.
386
+ */
377387 tag = blk_mq_get_tag(data);
378
- if (tag == BLK_MQ_TAG_FAIL) {
379
- if (put_ctx_on_error) {
380
- blk_mq_put_ctx(data->ctx);
381
- data->ctx = NULL;
382
- }
383
- blk_queue_exit(q);
384
- return NULL;
385
- }
388
+ if (tag == BLK_MQ_NO_TAG) {
389
+ if (data->flags & BLK_MQ_REQ_NOWAIT)
390
+ return NULL;
386391
387
- rq = blk_mq_rq_ctx_init(data, tag, op);
388
- if (!op_is_flush(op)) {
389
- rq->elv.icq = NULL;
390
- if (e && e->type->ops.mq.prepare_request) {
391
- if (e->type->icq_cache && rq_ioc(bio))
392
- blk_mq_sched_assign_ioc(rq, bio);
393
-
394
- e->type->ops.mq.prepare_request(rq, bio);
395
- rq->rq_flags |= RQF_ELVPRIV;
396
- }
392
+ /*
393
+ * Give up the CPU and sleep for a random short time to ensure
394
+ * that thread using a realtime scheduling class are migrated
395
+ * off the CPU, and thus off the hctx that is going away.
396
+ */
397
+ msleep(3);
398
+ goto retry;
397399 }
398
- data->hctx->queued++;
399
- return rq;
400
+ return blk_mq_rq_ctx_init(data, tag, alloc_time_ns);
400401 }
401402
402403 struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
403404 blk_mq_req_flags_t flags)
404405 {
405
- struct blk_mq_alloc_data alloc_data = { .flags = flags };
406
+ struct blk_mq_alloc_data data = {
407
+ .q = q,
408
+ .flags = flags,
409
+ .cmd_flags = op,
410
+ };
406411 struct request *rq;
407412 int ret;
408413
@@ -410,28 +415,35 @@
410415 if (ret)
411416 return ERR_PTR(ret);
412417
413
- rq = blk_mq_get_request(q, NULL, op, &alloc_data);
414
- blk_queue_exit(q);
415
-
418
+ rq = __blk_mq_alloc_request(&data);
416419 if (!rq)
417
- return ERR_PTR(-EWOULDBLOCK);
418
-
419
- blk_mq_put_ctx(alloc_data.ctx);
420
-
420
+ goto out_queue_exit;
421421 rq->__data_len = 0;
422422 rq->__sector = (sector_t) -1;
423423 rq->bio = rq->biotail = NULL;
424424 return rq;
425
+out_queue_exit:
426
+ blk_queue_exit(q);
427
+ return ERR_PTR(-EWOULDBLOCK);
425428 }
426429 EXPORT_SYMBOL(blk_mq_alloc_request);
427430
428431 struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
429432 unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx)
430433 {
431
- struct blk_mq_alloc_data alloc_data = { .flags = flags };
432
- struct request *rq;
434
+ struct blk_mq_alloc_data data = {
435
+ .q = q,
436
+ .flags = flags,
437
+ .cmd_flags = op,
438
+ };
439
+ u64 alloc_time_ns = 0;
433440 unsigned int cpu;
441
+ unsigned int tag;
434442 int ret;
443
+
444
+ /* alloc_time includes depth and tag waits */
445
+ if (blk_queue_rq_alloc_time(q))
446
+ alloc_time_ns = ktime_get_ns();
435447
436448 /*
437449 * If the tag allocator sleeps we could get an allocation for a
@@ -439,7 +451,8 @@
439451 * allocator for this for the rare use case of a command tied to
440452 * a specific queue.
441453 */
442
- if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)))
454
+ if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)) ||
455
+ WARN_ON_ONCE(!(flags & BLK_MQ_REQ_RESERVED)))
443456 return ERR_PTR(-EINVAL);
444457
445458 if (hctx_idx >= q->nr_hw_queues)
@@ -453,21 +466,27 @@
453466 * Check if the hardware context is actually mapped to anything.
454467 * If not tell the caller that it should skip this queue.
455468 */
456
- alloc_data.hctx = q->queue_hw_ctx[hctx_idx];
457
- if (!blk_mq_hw_queue_mapped(alloc_data.hctx)) {
458
- blk_queue_exit(q);
459
- return ERR_PTR(-EXDEV);
460
- }
461
- cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask);
462
- alloc_data.ctx = __blk_mq_get_ctx(q, cpu);
469
+ ret = -EXDEV;
470
+ data.hctx = q->queue_hw_ctx[hctx_idx];
471
+ if (!blk_mq_hw_queue_mapped(data.hctx))
472
+ goto out_queue_exit;
473
+ cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask);
474
+ if (cpu >= nr_cpu_ids)
475
+ goto out_queue_exit;
476
+ data.ctx = __blk_mq_get_ctx(q, cpu);
463477
464
- rq = blk_mq_get_request(q, NULL, op, &alloc_data);
478
+ if (!q->elevator)
479
+ blk_mq_tag_busy(data.hctx);
480
+
481
+ ret = -EWOULDBLOCK;
482
+ tag = blk_mq_get_tag(&data);
483
+ if (tag == BLK_MQ_NO_TAG)
484
+ goto out_queue_exit;
485
+ return blk_mq_rq_ctx_init(&data, tag, alloc_time_ns);
486
+
487
+out_queue_exit:
465488 blk_queue_exit(q);
466
-
467
- if (!rq)
468
- return ERR_PTR(-EWOULDBLOCK);
469
-
470
- return rq;
489
+ return ERR_PTR(ret);
471490 }
472491 EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
473492
@@ -475,13 +494,16 @@
475494 {
476495 struct request_queue *q = rq->q;
477496 struct blk_mq_ctx *ctx = rq->mq_ctx;
478
- struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
497
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
479498 const int sched_tag = rq->internal_tag;
480499
481
- if (rq->tag != -1)
482
- blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
483
- if (sched_tag != -1)
484
- blk_mq_put_tag(hctx, hctx->sched_tags, ctx, sched_tag);
500
+ blk_crypto_free_request(rq);
501
+ blk_pm_mark_last_busy(rq);
502
+ rq->mq_hctx = NULL;
503
+ if (rq->tag != BLK_MQ_NO_TAG)
504
+ blk_mq_put_tag(hctx->tags, ctx, rq->tag);
505
+ if (sched_tag != BLK_MQ_NO_TAG)
506
+ blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag);
485507 blk_mq_sched_restart(hctx);
486508 blk_queue_exit(q);
487509 }
@@ -491,11 +513,11 @@
491513 struct request_queue *q = rq->q;
492514 struct elevator_queue *e = q->elevator;
493515 struct blk_mq_ctx *ctx = rq->mq_ctx;
494
- struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
516
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
495517
496518 if (rq->rq_flags & RQF_ELVPRIV) {
497
- if (e && e->type->ops.mq.finish_request)
498
- e->type->ops.mq.finish_request(rq);
519
+ if (e && e->type->ops.finish_request)
520
+ e->type->ops.finish_request(rq);
499521 if (rq->elv.icq) {
500522 put_io_context(rq->elv.icq->ioc);
501523 rq->elv.icq = NULL;
@@ -504,15 +526,12 @@
504526
505527 ctx->rq_completed[rq_is_sync(rq)]++;
506528 if (rq->rq_flags & RQF_MQ_INFLIGHT)
507
- atomic_dec(&hctx->nr_active);
529
+ __blk_mq_dec_active_requests(hctx);
508530
509531 if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
510532 laptop_io_completion(q->backing_dev_info);
511533
512534 rq_qos_done(q, rq);
513
-
514
- if (blk_rq_rl(rq))
515
- blk_put_rl(blk_rq_rl(rq));
516535
517536 WRITE_ONCE(rq->state, MQ_RQ_IDLE);
518537 if (refcount_dec_and_test(&rq->ref))
@@ -522,12 +541,17 @@
522541
523542 inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
524543 {
525
- u64 now = ktime_get_ns();
544
+ u64 now = 0;
545
+
546
+ if (blk_mq_need_time_stamp(rq))
547
+ now = ktime_get_ns();
526548
527549 if (rq->rq_flags & RQF_STATS) {
528550 blk_mq_poll_stats_start(rq->q);
529551 blk_stat_add(rq, now);
530552 }
553
+
554
+ blk_mq_sched_completed_request(rq, now);
531555
532556 blk_account_io_done(rq, now);
533557
@@ -535,8 +559,6 @@
535559 rq_qos_done(rq->q, rq);
536560 rq->end_io(rq, error);
537561 } else {
538
- if (unlikely(blk_bidi_rq(rq)))
539
- blk_mq_free_request(rq->next_rq);
540562 blk_mq_free_request(rq);
541563 }
542564 }
@@ -550,63 +572,139 @@
550572 }
551573 EXPORT_SYMBOL(blk_mq_end_request);
552574
553
-#ifdef CONFIG_PREEMPT_RT_FULL
554
-
555
-void __blk_mq_complete_request_remote_work(struct work_struct *work)
575
+/*
576
+ * Softirq action handler - move entries to local list and loop over them
577
+ * while passing them to the queue registered handler.
578
+ */
579
+static __latent_entropy void blk_done_softirq(struct softirq_action *h)
556580 {
557
- struct request *rq = container_of(work, struct request, work);
581
+ struct list_head *cpu_list, local_list;
558582
559
- rq->q->softirq_done_fn(rq);
583
+ local_irq_disable();
584
+ cpu_list = this_cpu_ptr(&blk_cpu_done);
585
+ list_replace_init(cpu_list, &local_list);
586
+ local_irq_enable();
587
+
588
+ while (!list_empty(&local_list)) {
589
+ struct request *rq;
590
+
591
+ rq = list_entry(local_list.next, struct request, ipi_list);
592
+ list_del_init(&rq->ipi_list);
593
+ rq->q->mq_ops->complete(rq);
594
+ }
560595 }
561596
562
-#else
597
+static void blk_mq_trigger_softirq(struct request *rq)
598
+{
599
+ struct list_head *list;
600
+ unsigned long flags;
601
+
602
+ local_irq_save(flags);
603
+ list = this_cpu_ptr(&blk_cpu_done);
604
+ list_add_tail(&rq->ipi_list, list);
605
+
606
+ /*
607
+ * If the list only contains our just added request, signal a raise of
608
+ * the softirq. If there are already entries there, someone already
609
+ * raised the irq but it hasn't run yet.
610
+ */
611
+ if (list->next == &rq->ipi_list)
612
+ raise_softirq_irqoff(BLOCK_SOFTIRQ);
613
+ local_irq_restore(flags);
614
+}
615
+
616
+static int blk_softirq_cpu_dead(unsigned int cpu)
617
+{
618
+ /*
619
+ * If a CPU goes away, splice its entries to the current CPU
620
+ * and trigger a run of the softirq
621
+ */
622
+ local_irq_disable();
623
+ list_splice_init(&per_cpu(blk_cpu_done, cpu),
624
+ this_cpu_ptr(&blk_cpu_done));
625
+ raise_softirq_irqoff(BLOCK_SOFTIRQ);
626
+ local_irq_enable();
627
+
628
+ return 0;
629
+}
630
+
563631
564632 static void __blk_mq_complete_request_remote(void *data)
565633 {
566634 struct request *rq = data;
567635
568
- rq->q->softirq_done_fn(rq);
636
+ /*
637
+ * For most of single queue controllers, there is only one irq vector
638
+ * for handling I/O completion, and the only irq's affinity is set
639
+ * to all possible CPUs. On most of ARCHs, this affinity means the irq
640
+ * is handled on one specific CPU.
641
+ *
642
+ * So complete I/O requests in softirq context in case of single queue
643
+ * devices to avoid degrading I/O performance due to irqsoff latency.
644
+ */
645
+ if (rq->q->nr_hw_queues == 1)
646
+ blk_mq_trigger_softirq(rq);
647
+ else
648
+ rq->q->mq_ops->complete(rq);
569649 }
570
-#endif
571650
572
-static void __blk_mq_complete_request(struct request *rq)
651
+static inline bool blk_mq_complete_need_ipi(struct request *rq)
573652 {
574
- struct blk_mq_ctx *ctx = rq->mq_ctx;
575
- bool shared = false;
576
- int cpu;
653
+ int cpu = raw_smp_processor_id();
577654
578
- if (!blk_mq_mark_complete(rq))
579
- return;
580
- if (rq->internal_tag != -1)
581
- blk_mq_sched_completed_request(rq);
655
+ if (!IS_ENABLED(CONFIG_SMP) ||
656
+ !test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags))
657
+ return false;
582658
583
- if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) {
584
- rq->q->softirq_done_fn(rq);
585
- return;
586
- }
659
+ /* same CPU or cache domain? Complete locally */
660
+ if (cpu == rq->mq_ctx->cpu ||
661
+ (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) &&
662
+ cpus_share_cache(cpu, rq->mq_ctx->cpu)))
663
+ return false;
587664
588
- cpu = get_cpu_light();
589
- if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
590
- shared = cpus_share_cache(cpu, ctx->cpu);
665
+ /* don't try to IPI to an offline CPU */
666
+ return cpu_online(rq->mq_ctx->cpu);
667
+}
591668
592
- if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
593
-#ifdef CONFIG_PREEMPT_RT_FULL
594
- /*
595
- * We could force QUEUE_FLAG_SAME_FORCE then we would not get in
596
- * here. But we could try to invoke it one the CPU like this.
597
- */
598
- schedule_work_on(ctx->cpu, &rq->work);
599
-#else
669
+bool blk_mq_complete_request_remote(struct request *rq)
670
+{
671
+ WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
672
+
673
+ /*
674
+ * For a polled request, always complete locallly, it's pointless
675
+ * to redirect the completion.
676
+ */
677
+ if (rq->cmd_flags & REQ_HIPRI)
678
+ return false;
679
+
680
+ if (blk_mq_complete_need_ipi(rq)) {
600681 rq->csd.func = __blk_mq_complete_request_remote;
601682 rq->csd.info = rq;
602683 rq->csd.flags = 0;
603
- smp_call_function_single_async(ctx->cpu, &rq->csd);
604
-#endif
684
+ smp_call_function_single_async(rq->mq_ctx->cpu, &rq->csd);
605685 } else {
606
- rq->q->softirq_done_fn(rq);
686
+ if (rq->q->nr_hw_queues > 1)
687
+ return false;
688
+ blk_mq_trigger_softirq(rq);
607689 }
608
- put_cpu_light();
690
+
691
+ return true;
609692 }
693
+EXPORT_SYMBOL_GPL(blk_mq_complete_request_remote);
694
+
695
+/**
696
+ * blk_mq_complete_request - end I/O on a request
697
+ * @rq: the request being processed
698
+ *
699
+ * Description:
700
+ * Complete a request by scheduling the ->complete_rq operation.
701
+ **/
702
+void blk_mq_complete_request(struct request *rq)
703
+{
704
+ if (!blk_mq_complete_request_remote(rq))
705
+ rq->q->mq_ops->complete(rq);
706
+}
707
+EXPORT_SYMBOL(blk_mq_complete_request);
610708
611709 static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
612710 __releases(hctx->srcu)
@@ -629,40 +727,22 @@
629727 }
630728
631729 /**
632
- * blk_mq_complete_request - end I/O on a request
633
- * @rq: the request being processed
730
+ * blk_mq_start_request - Start processing a request
731
+ * @rq: Pointer to request to be started
634732 *
635
- * Description:
636
- * Ends all I/O on a request. It does not handle partial completions.
637
- * The actual completion happens out-of-order, through a IPI handler.
638
- **/
639
-void blk_mq_complete_request(struct request *rq)
640
-{
641
- if (unlikely(blk_should_fake_timeout(rq->q)))
642
- return;
643
- __blk_mq_complete_request(rq);
644
-}
645
-EXPORT_SYMBOL(blk_mq_complete_request);
646
-
647
-int blk_mq_request_started(struct request *rq)
648
-{
649
- return blk_mq_rq_state(rq) != MQ_RQ_IDLE;
650
-}
651
-EXPORT_SYMBOL_GPL(blk_mq_request_started);
652
-
733
+ * Function used by device drivers to notify the block layer that a request
734
+ * is going to be processed now, so blk layer can do proper initializations
735
+ * such as starting the timeout timer.
736
+ */
653737 void blk_mq_start_request(struct request *rq)
654738 {
655739 struct request_queue *q = rq->q;
656
-
657
- blk_mq_sched_started_request(rq);
658740
659741 trace_block_rq_issue(q, rq);
660742
661743 if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
662744 rq->io_start_time_ns = ktime_get_ns();
663
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
664
- rq->throtl_size = blk_rq_sectors(rq);
665
-#endif
745
+ rq->stats_sectors = blk_rq_sectors(rq);
666746 rq->rq_flags |= RQF_STATS;
667747 rq_qos_issue(q, rq);
668748 }
@@ -672,14 +752,10 @@
672752 blk_add_timer(rq);
673753 WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT);
674754
675
- if (q->dma_drain_size && blk_rq_bytes(rq)) {
676
- /*
677
- * Make sure space for the drain appears. We know we can do
678
- * this because max_hw_segments has been adjusted to be one
679
- * fewer than the device can handle.
680
- */
681
- rq->nr_phys_segments++;
682
- }
755
+#ifdef CONFIG_BLK_DEV_INTEGRITY
756
+ if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE)
757
+ q->integrity.profile->prepare_fn(rq);
758
+#endif
683759 }
684760 EXPORT_SYMBOL(blk_mq_start_request);
685761
@@ -695,8 +771,6 @@
695771 if (blk_mq_request_started(rq)) {
696772 WRITE_ONCE(rq->state, MQ_RQ_IDLE);
697773 rq->rq_flags &= ~RQF_TIMED_OUT;
698
- if (q->dma_drain_size && blk_rq_bytes(rq))
699
- rq->nr_phys_segments--;
700774 }
701775 }
702776
@@ -707,7 +781,6 @@
707781 /* this request will be re-inserted to io scheduler queue */
708782 blk_mq_sched_requeue_request(rq);
709783
710
- BUG_ON(blk_queued_rq(rq));
711784 blk_mq_add_to_requeue_list(rq, true, kick_requeue_list);
712785 }
713786 EXPORT_SYMBOL(blk_mq_requeue_request);
@@ -735,7 +808,7 @@
735808 * merge.
736809 */
737810 if (rq->rq_flags & RQF_DONTPREP)
738
- blk_mq_request_bypass_insert(rq, false);
811
+ blk_mq_request_bypass_insert(rq, false, false);
739812 else
740813 blk_mq_sched_insert_request(rq, true, false, false);
741814 }
@@ -773,7 +846,6 @@
773846 if (kick_requeue_list)
774847 blk_mq_kick_requeue_list(q);
775848 }
776
-EXPORT_SYMBOL(blk_mq_add_to_requeue_list);
777849
778850 void blk_mq_kick_requeue_list(struct request_queue *q)
779851 {
@@ -800,6 +872,32 @@
800872 }
801873 EXPORT_SYMBOL(blk_mq_tag_to_rq);
802874
875
+static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq,
876
+ void *priv, bool reserved)
877
+{
878
+ /*
879
+ * If we find a request that isn't idle and the queue matches,
880
+ * we know the queue is busy. Return false to stop the iteration.
881
+ */
882
+ if (blk_mq_request_started(rq) && rq->q == hctx->queue) {
883
+ bool *busy = priv;
884
+
885
+ *busy = true;
886
+ return false;
887
+ }
888
+
889
+ return true;
890
+}
891
+
892
+bool blk_mq_queue_inflight(struct request_queue *q)
893
+{
894
+ bool busy = false;
895
+
896
+ blk_mq_queue_tag_busy_iter(q, blk_mq_rq_inflight, &busy);
897
+ return busy;
898
+}
899
+EXPORT_SYMBOL_GPL(blk_mq_queue_inflight);
900
+
803901 static void blk_mq_rq_timed_out(struct request *req, bool reserved)
804902 {
805903 req->rq_flags |= RQF_TIMED_OUT;
@@ -824,7 +922,7 @@
824922 if (rq->rq_flags & RQF_TIMED_OUT)
825923 return false;
826924
827
- deadline = blk_rq_deadline(rq);
925
+ deadline = READ_ONCE(rq->deadline);
828926 if (time_after_eq(jiffies, deadline))
829927 return true;
830928
@@ -835,43 +933,29 @@
835933 return false;
836934 }
837935
838
-static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
936
+void blk_mq_put_rq_ref(struct request *rq)
937
+{
938
+ if (is_flush_rq(rq))
939
+ rq->end_io(rq, 0);
940
+ else if (refcount_dec_and_test(&rq->ref))
941
+ __blk_mq_free_request(rq);
942
+}
943
+
944
+static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
839945 struct request *rq, void *priv, bool reserved)
840946 {
841947 unsigned long *next = priv;
842948
843949 /*
844
- * Just do a quick check if it is expired before locking the request in
845
- * so we're not unnecessarilly synchronizing across CPUs.
846
- */
847
- if (!blk_mq_req_expired(rq, next))
848
- return;
849
-
850
- /*
851
- * We have reason to believe the request may be expired. Take a
852
- * reference on the request to lock this request lifetime into its
853
- * currently allocated context to prevent it from being reallocated in
854
- * the event the completion by-passes this timeout handler.
855
- *
856
- * If the reference was already released, then the driver beat the
857
- * timeout handler to posting a natural completion.
858
- */
859
- if (!refcount_inc_not_zero(&rq->ref))
860
- return;
861
-
862
- /*
863
- * The request is now locked and cannot be reallocated underneath the
864
- * timeout handler's processing. Re-verify this exact request is truly
865
- * expired; if it is not expired, then the request was completed and
866
- * reallocated as a new request.
950
+ * blk_mq_queue_tag_busy_iter() has locked the request, so it cannot
951
+ * be reallocated underneath the timeout handler's processing, then
952
+ * the expire check is reliable. If the request is not expired, then
953
+ * it was completed and reallocated as a new request after returning
954
+ * from blk_mq_check_expired().
867955 */
868956 if (blk_mq_req_expired(rq, next))
869957 blk_mq_rq_timed_out(rq, reserved);
870
-
871
- if (is_flush_rq(rq, hctx))
872
- rq->end_io(rq, 0);
873
- else if (refcount_dec_and_test(&rq->ref))
874
- __blk_mq_free_request(rq);
958
+ return true;
875959 }
876960
877961 static void blk_mq_timeout_work(struct work_struct *work)
@@ -928,9 +1012,10 @@
9281012 struct flush_busy_ctx_data *flush_data = data;
9291013 struct blk_mq_hw_ctx *hctx = flush_data->hctx;
9301014 struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
1015
+ enum hctx_type type = hctx->type;
9311016
9321017 spin_lock(&ctx->lock);
933
- list_splice_tail_init(&ctx->rq_list, flush_data->list);
1018
+ list_splice_tail_init(&ctx->rq_lists[type], flush_data->list);
9341019 sbitmap_clear_bit(sb, bitnr);
9351020 spin_unlock(&ctx->lock);
9361021 return true;
@@ -962,12 +1047,13 @@
9621047 struct dispatch_rq_data *dispatch_data = data;
9631048 struct blk_mq_hw_ctx *hctx = dispatch_data->hctx;
9641049 struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
1050
+ enum hctx_type type = hctx->type;
9651051
9661052 spin_lock(&ctx->lock);
967
- if (!list_empty(&ctx->rq_list)) {
968
- dispatch_data->rq = list_entry_rq(ctx->rq_list.next);
1053
+ if (!list_empty(&ctx->rq_lists[type])) {
1054
+ dispatch_data->rq = list_entry_rq(ctx->rq_lists[type].next);
9691055 list_del_init(&dispatch_data->rq->queuelist);
970
- if (list_empty(&ctx->rq_list))
1056
+ if (list_empty(&ctx->rq_lists[type]))
9711057 sbitmap_clear_bit(sb, bitnr);
9721058 }
9731059 spin_unlock(&ctx->lock);
@@ -978,7 +1064,7 @@
9781064 struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
9791065 struct blk_mq_ctx *start)
9801066 {
981
- unsigned off = start ? start->index_hw : 0;
1067
+ unsigned off = start ? start->index_hw[hctx->type] : 0;
9821068 struct dispatch_rq_data data = {
9831069 .hctx = hctx,
9841070 .rq = NULL,
@@ -998,33 +1084,44 @@
9981084 return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
9991085 }
10001086
1001
-bool blk_mq_get_driver_tag(struct request *rq)
1087
+static bool __blk_mq_get_driver_tag(struct request *rq)
10021088 {
1003
- struct blk_mq_alloc_data data = {
1004
- .q = rq->q,
1005
- .hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu),
1006
- .flags = BLK_MQ_REQ_NOWAIT,
1007
- };
1008
- bool shared;
1089
+ struct sbitmap_queue *bt = rq->mq_hctx->tags->bitmap_tags;
1090
+ unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
1091
+ int tag;
10091092
1010
- if (rq->tag != -1)
1011
- goto done;
1093
+ blk_mq_tag_busy(rq->mq_hctx);
10121094
1013
- if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag))
1014
- data.flags |= BLK_MQ_REQ_RESERVED;
1015
-
1016
- shared = blk_mq_tag_busy(data.hctx);
1017
- rq->tag = blk_mq_get_tag(&data);
1018
- if (rq->tag >= 0) {
1019
- if (shared) {
1020
- rq->rq_flags |= RQF_MQ_INFLIGHT;
1021
- atomic_inc(&data.hctx->nr_active);
1022
- }
1023
- data.hctx->tags->rqs[rq->tag] = rq;
1095
+ if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) {
1096
+ bt = rq->mq_hctx->tags->breserved_tags;
1097
+ tag_offset = 0;
1098
+ } else {
1099
+ if (!hctx_may_queue(rq->mq_hctx, bt))
1100
+ return false;
10241101 }
10251102
1026
-done:
1027
- return rq->tag != -1;
1103
+ tag = __sbitmap_queue_get(bt);
1104
+ if (tag == BLK_MQ_NO_TAG)
1105
+ return false;
1106
+
1107
+ rq->tag = tag + tag_offset;
1108
+ return true;
1109
+}
1110
+
1111
+static bool blk_mq_get_driver_tag(struct request *rq)
1112
+{
1113
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
1114
+
1115
+ if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_get_driver_tag(rq))
1116
+ return false;
1117
+
1118
+ if ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) &&
1119
+ !(rq->rq_flags & RQF_MQ_INFLIGHT)) {
1120
+ rq->rq_flags |= RQF_MQ_INFLIGHT;
1121
+ __blk_mq_inc_active_requests(hctx);
1122
+ }
1123
+ hctx->tags->rqs[rq->tag] = rq;
1124
+ return true;
10281125 }
10291126
10301127 static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
@@ -1035,7 +1132,13 @@
10351132 hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
10361133
10371134 spin_lock(&hctx->dispatch_wait_lock);
1038
- list_del_init(&wait->entry);
1135
+ if (!list_empty(&wait->entry)) {
1136
+ struct sbitmap_queue *sbq;
1137
+
1138
+ list_del_init(&wait->entry);
1139
+ sbq = hctx->tags->bitmap_tags;
1140
+ atomic_dec(&sbq->ws_active);
1141
+ }
10391142 spin_unlock(&hctx->dispatch_wait_lock);
10401143
10411144 blk_mq_run_hw_queue(hctx, true);
@@ -1051,13 +1154,13 @@
10511154 static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
10521155 struct request *rq)
10531156 {
1157
+ struct sbitmap_queue *sbq = hctx->tags->bitmap_tags;
10541158 struct wait_queue_head *wq;
10551159 wait_queue_entry_t *wait;
10561160 bool ret;
10571161
1058
- if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) {
1059
- if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
1060
- set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
1162
+ if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
1163
+ blk_mq_sched_mark_restart_hctx(hctx);
10611164
10621165 /*
10631166 * It's possible that a tag was freed in the window between the
@@ -1074,7 +1177,7 @@
10741177 if (!list_empty_careful(&wait->entry))
10751178 return false;
10761179
1077
- wq = &bt_wait_ptr(&hctx->tags->bitmap_tags, hctx)->wait;
1180
+ wq = &bt_wait_ptr(sbq, hctx)->wait;
10781181
10791182 spin_lock_irq(&wq->lock);
10801183 spin_lock(&hctx->dispatch_wait_lock);
@@ -1084,6 +1187,7 @@
10841187 return false;
10851188 }
10861189
1190
+ atomic_inc(&sbq->ws_active);
10871191 wait->flags &= ~WQ_FLAG_EXCLUSIVE;
10881192 __add_wait_queue(wq, wait);
10891193
@@ -1104,6 +1208,7 @@
11041208 * someone else gets the wakeup.
11051209 */
11061210 list_del_init(&wait->entry);
1211
+ atomic_dec(&sbq->ws_active);
11071212 spin_unlock(&hctx->dispatch_wait_lock);
11081213 spin_unlock_irq(&wq->lock);
11091214
@@ -1122,9 +1227,6 @@
11221227 static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
11231228 {
11241229 unsigned int ewma;
1125
-
1126
- if (hctx->queue->elevator)
1127
- return;
11281230
11291231 ewma = hctx->dispatch_busy;
11301232
@@ -1158,22 +1260,83 @@
11581260 __blk_mq_requeue_request(rq);
11591261 }
11601262
1263
+static void blk_mq_handle_zone_resource(struct request *rq,
1264
+ struct list_head *zone_list)
1265
+{
1266
+ /*
1267
+ * If we end up here it is because we cannot dispatch a request to a
1268
+ * specific zone due to LLD level zone-write locking or other zone
1269
+ * related resource not being available. In this case, set the request
1270
+ * aside in zone_list for retrying it later.
1271
+ */
1272
+ list_add(&rq->queuelist, zone_list);
1273
+ __blk_mq_requeue_request(rq);
1274
+}
1275
+
1276
+enum prep_dispatch {
1277
+ PREP_DISPATCH_OK,
1278
+ PREP_DISPATCH_NO_TAG,
1279
+ PREP_DISPATCH_NO_BUDGET,
1280
+};
1281
+
1282
+static enum prep_dispatch blk_mq_prep_dispatch_rq(struct request *rq,
1283
+ bool need_budget)
1284
+{
1285
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
1286
+
1287
+ if (need_budget && !blk_mq_get_dispatch_budget(rq->q)) {
1288
+ blk_mq_put_driver_tag(rq);
1289
+ return PREP_DISPATCH_NO_BUDGET;
1290
+ }
1291
+
1292
+ if (!blk_mq_get_driver_tag(rq)) {
1293
+ /*
1294
+ * The initial allocation attempt failed, so we need to
1295
+ * rerun the hardware queue when a tag is freed. The
1296
+ * waitqueue takes care of that. If the queue is run
1297
+ * before we add this entry back on the dispatch list,
1298
+ * we'll re-run it below.
1299
+ */
1300
+ if (!blk_mq_mark_tag_wait(hctx, rq)) {
1301
+ /*
1302
+ * All budgets not got from this function will be put
1303
+ * together during handling partial dispatch
1304
+ */
1305
+ if (need_budget)
1306
+ blk_mq_put_dispatch_budget(rq->q);
1307
+ return PREP_DISPATCH_NO_TAG;
1308
+ }
1309
+ }
1310
+
1311
+ return PREP_DISPATCH_OK;
1312
+}
1313
+
1314
+/* release all allocated budgets before calling to blk_mq_dispatch_rq_list */
1315
+static void blk_mq_release_budgets(struct request_queue *q,
1316
+ unsigned int nr_budgets)
1317
+{
1318
+ int i;
1319
+
1320
+ for (i = 0; i < nr_budgets; i++)
1321
+ blk_mq_put_dispatch_budget(q);
1322
+}
1323
+
11611324 /*
11621325 * Returns true if we did some work AND can potentially do more.
11631326 */
1164
-bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
1165
- bool got_budget)
1327
+bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
1328
+ unsigned int nr_budgets)
11661329 {
1167
- struct blk_mq_hw_ctx *hctx;
1330
+ enum prep_dispatch prep;
1331
+ struct request_queue *q = hctx->queue;
11681332 struct request *rq, *nxt;
1169
- bool no_tag = false;
11701333 int errors, queued;
11711334 blk_status_t ret = BLK_STS_OK;
1335
+ LIST_HEAD(zone_list);
1336
+ bool needs_resource = false;
11721337
11731338 if (list_empty(list))
11741339 return false;
1175
-
1176
- WARN_ON(!list_is_singular(list) && got_budget);
11771340
11781341 /*
11791342 * Now process all the entries, sending them to the driver.
@@ -1184,29 +1347,10 @@
11841347
11851348 rq = list_first_entry(list, struct request, queuelist);
11861349
1187
- hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu);
1188
- if (!got_budget && !blk_mq_get_dispatch_budget(hctx))
1350
+ WARN_ON_ONCE(hctx != rq->mq_hctx);
1351
+ prep = blk_mq_prep_dispatch_rq(rq, !nr_budgets);
1352
+ if (prep != PREP_DISPATCH_OK)
11891353 break;
1190
-
1191
- if (!blk_mq_get_driver_tag(rq)) {
1192
- /*
1193
- * The initial allocation attempt failed, so we need to
1194
- * rerun the hardware queue when a tag is freed. The
1195
- * waitqueue takes care of that. If the queue is run
1196
- * before we add this entry back on the dispatch list,
1197
- * we'll re-run it below.
1198
- */
1199
- if (!blk_mq_mark_tag_wait(hctx, rq)) {
1200
- blk_mq_put_dispatch_budget(hctx);
1201
- /*
1202
- * For non-shared tags, the RESTART check
1203
- * will suffice.
1204
- */
1205
- if (hctx->flags & BLK_MQ_F_TAG_SHARED)
1206
- no_tag = true;
1207
- break;
1208
- }
1209
- }
12101354
12111355 list_del_init(&rq->queuelist);
12121356
@@ -1223,32 +1367,63 @@
12231367 bd.last = !blk_mq_get_driver_tag(nxt);
12241368 }
12251369
1370
+ /*
1371
+ * once the request is queued to lld, no need to cover the
1372
+ * budget any more
1373
+ */
1374
+ if (nr_budgets)
1375
+ nr_budgets--;
12261376 ret = q->mq_ops->queue_rq(hctx, &bd);
1227
- if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {
1228
- blk_mq_handle_dev_resource(rq, list);
1377
+ switch (ret) {
1378
+ case BLK_STS_OK:
1379
+ queued++;
12291380 break;
1230
- }
1231
-
1232
- if (unlikely(ret != BLK_STS_OK)) {
1381
+ case BLK_STS_RESOURCE:
1382
+ needs_resource = true;
1383
+ fallthrough;
1384
+ case BLK_STS_DEV_RESOURCE:
1385
+ blk_mq_handle_dev_resource(rq, list);
1386
+ goto out;
1387
+ case BLK_STS_ZONE_RESOURCE:
1388
+ /*
1389
+ * Move the request to zone_list and keep going through
1390
+ * the dispatch list to find more requests the drive can
1391
+ * accept.
1392
+ */
1393
+ blk_mq_handle_zone_resource(rq, &zone_list);
1394
+ needs_resource = true;
1395
+ break;
1396
+ default:
12331397 errors++;
12341398 blk_mq_end_request(rq, BLK_STS_IOERR);
1235
- continue;
12361399 }
1237
-
1238
- queued++;
12391400 } while (!list_empty(list));
1401
+out:
1402
+ if (!list_empty(&zone_list))
1403
+ list_splice_tail_init(&zone_list, list);
12401404
12411405 hctx->dispatched[queued_to_index(queued)]++;
12421406
1407
+ /* If we didn't flush the entire list, we could have told the driver
1408
+ * there was more coming, but that turned out to be a lie.
1409
+ */
1410
+ if ((!list_empty(list) || errors || needs_resource ||
1411
+ ret == BLK_STS_DEV_RESOURCE) && q->mq_ops->commit_rqs && queued)
1412
+ q->mq_ops->commit_rqs(hctx);
12431413 /*
12441414 * Any items that need requeuing? Stuff them into hctx->dispatch,
12451415 * that is where we will continue on next queue run.
12461416 */
12471417 if (!list_empty(list)) {
12481418 bool needs_restart;
1419
+ /* For non-shared tags, the RESTART check will suffice */
1420
+ bool no_tag = prep == PREP_DISPATCH_NO_TAG &&
1421
+ (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED);
1422
+
1423
+ blk_mq_release_budgets(q, nr_budgets);
12491424
12501425 spin_lock(&hctx->lock);
1251
- list_splice_init(list, &hctx->dispatch);
1426
+ list_splice_tail_init(list, &hctx->dispatch);
12521427 spin_unlock(&hctx->lock);
12531428
12541429 /*
@@ -1282,13 +1457,17 @@
12821457 *
12831458 * If driver returns BLK_STS_RESOURCE and SCHED_RESTART
12841459 * bit is set, run queue after a delay to avoid IO stalls
1285
- * that could otherwise occur if the queue is idle.
1460
+ * that could otherwise occur if the queue is idle. We'll do
1461
+ * similar if we couldn't get budget or couldn't lock a zone
1462
+ * and SCHED_RESTART is set.
12861463 */
12871464 needs_restart = blk_mq_sched_needs_restart(hctx);
1465
+ if (prep == PREP_DISPATCH_NO_BUDGET)
1466
+ needs_resource = true;
12881467 if (!needs_restart ||
12891468 (no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
12901469 blk_mq_run_hw_queue(hctx, true);
1291
- else if (needs_restart && (ret == BLK_STS_RESOURCE))
1470
+ else if (needs_restart && needs_resource)
12921471 blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);
12931472
12941473 blk_mq_update_dispatch_busy(hctx, true);
@@ -1296,16 +1475,15 @@
12961475 } else
12971476 blk_mq_update_dispatch_busy(hctx, false);
12981477
1299
- /*
1300
- * If the host/device is unable to accept more work, inform the
1301
- * caller of that.
1302
- */
1303
- if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
1304
- return false;
1305
-
13061478 return (queued + errors) != 0;
13071479 }
13081480
1481
+/**
1482
+ * __blk_mq_run_hw_queue - Run a hardware queue.
1483
+ * @hctx: Pointer to the hardware queue to run.
1484
+ *
1485
+ * Send pending requests to the hardware.
1486
+ */
13091487 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
13101488 {
13111489 int srcu_idx;
@@ -1403,6 +1581,15 @@
14031581 return next_cpu;
14041582 }
14051583
1584
+/**
1585
+ * __blk_mq_delay_run_hw_queue - Run (or schedule to run) a hardware queue.
1586
+ * @hctx: Pointer to the hardware queue to run.
1587
+ * @async: If we want to run the queue asynchronously.
1588
+ * @msecs: Microseconds of delay to wait before running the queue.
1589
+ *
1590
+ * If !@async, try to run the queue now. Else, run the queue asynchronously and
1591
+ * with a delay of @msecs.
1592
+ */
14061593 static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
14071594 unsigned long msecs)
14081595 {
@@ -1410,27 +1597,43 @@
14101597 return;
14111598
14121599 if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
1413
- int cpu = get_cpu_light();
1600
+ int cpu = get_cpu();
14141601 if (cpumask_test_cpu(cpu, hctx->cpumask)) {
14151602 __blk_mq_run_hw_queue(hctx);
1416
- put_cpu_light();
1603
+ put_cpu();
14171604 return;
14181605 }
14191606
1420
- put_cpu_light();
1607
+ put_cpu();
14211608 }
14221609
14231610 kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work,
14241611 msecs_to_jiffies(msecs));
14251612 }
14261613
1614
+/**
1615
+ * blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously.
1616
+ * @hctx: Pointer to the hardware queue to run.
1617
+ * @msecs: Microseconds of delay to wait before running the queue.
1618
+ *
1619
+ * Run a hardware queue asynchronously with a delay of @msecs.
1620
+ */
14271621 void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
14281622 {
14291623 __blk_mq_delay_run_hw_queue(hctx, true, msecs);
14301624 }
14311625 EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
14321626
1433
-bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
1627
+/**
1628
+ * blk_mq_run_hw_queue - Start to run a hardware queue.
1629
+ * @hctx: Pointer to the hardware queue to run.
1630
+ * @async: If we want to run the queue asynchronously.
1631
+ *
1632
+ * Check if the request queue is not in a quiesced state and if there are
1633
+ * pending requests to be sent. If this is true, run the queue to send requests
1634
+ * to hardware.
1635
+ */
1636
+void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
14341637 {
14351638 int srcu_idx;
14361639 bool need_run;
@@ -1448,28 +1651,101 @@
14481651 blk_mq_hctx_has_pending(hctx);
14491652 hctx_unlock(hctx, srcu_idx);
14501653
1451
- if (need_run) {
1654
+ if (need_run)
14521655 __blk_mq_delay_run_hw_queue(hctx, async, 0);
1453
- return true;
1454
- }
1455
-
1456
- return false;
14571656 }
14581657 EXPORT_SYMBOL(blk_mq_run_hw_queue);
14591658
1659
+/*
1660
+ * Is the request queue handled by an IO scheduler that does not respect
1661
+ * hardware queues when dispatching?
1662
+ */
1663
+static bool blk_mq_has_sqsched(struct request_queue *q)
1664
+{
1665
+ struct elevator_queue *e = q->elevator;
1666
+
1667
+ if (e && e->type->ops.dispatch_request &&
1668
+ !(e->type->elevator_features & ELEVATOR_F_MQ_AWARE))
1669
+ return true;
1670
+ return false;
1671
+}
1672
+
1673
+/*
1674
+ * Return prefered queue to dispatch from (if any) for non-mq aware IO
1675
+ * scheduler.
1676
+ */
1677
+static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q)
1678
+{
1679
+ struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
1680
+ /*
1681
+ * If the IO scheduler does not respect hardware queues when
1682
+ * dispatching, we just don't bother with multiple HW queues and
1683
+ * dispatch from hctx for the current CPU since running multiple queues
1684
+ * just causes lock contention inside the scheduler and pointless cache
1685
+ * bouncing.
1686
+ */
1687
+ struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, 0, ctx);
1688
+
1689
+ if (!blk_mq_hctx_stopped(hctx))
1690
+ return hctx;
1691
+ return NULL;
1692
+}
1693
+
1694
+/**
1695
+ * blk_mq_run_hw_queues - Run all hardware queues in a request queue.
1696
+ * @q: Pointer to the request queue to run.
1697
+ * @async: If we want to run the queue asynchronously.
1698
+ */
14601699 void blk_mq_run_hw_queues(struct request_queue *q, bool async)
14611700 {
1462
- struct blk_mq_hw_ctx *hctx;
1701
+ struct blk_mq_hw_ctx *hctx, *sq_hctx;
14631702 int i;
14641703
1704
+ sq_hctx = NULL;
1705
+ if (blk_mq_has_sqsched(q))
1706
+ sq_hctx = blk_mq_get_sq_hctx(q);
14651707 queue_for_each_hw_ctx(q, hctx, i) {
14661708 if (blk_mq_hctx_stopped(hctx))
14671709 continue;
1468
-
1469
- blk_mq_run_hw_queue(hctx, async);
1710
+ /*
1711
+ * Dispatch from this hctx either if there's no hctx preferred
1712
+ * by IO scheduler or if it has requests that bypass the
1713
+ * scheduler.
1714
+ */
1715
+ if (!sq_hctx || sq_hctx == hctx ||
1716
+ !list_empty_careful(&hctx->dispatch))
1717
+ blk_mq_run_hw_queue(hctx, async);
14701718 }
14711719 }
14721720 EXPORT_SYMBOL(blk_mq_run_hw_queues);
1721
+
1722
+/**
1723
+ * blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously.
1724
+ * @q: Pointer to the request queue to run.
1725
+ * @msecs: Microseconds of delay to wait before running the queues.
1726
+ */
1727
+void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
1728
+{
1729
+ struct blk_mq_hw_ctx *hctx, *sq_hctx;
1730
+ int i;
1731
+
1732
+ sq_hctx = NULL;
1733
+ if (blk_mq_has_sqsched(q))
1734
+ sq_hctx = blk_mq_get_sq_hctx(q);
1735
+ queue_for_each_hw_ctx(q, hctx, i) {
1736
+ if (blk_mq_hctx_stopped(hctx))
1737
+ continue;
1738
+ /*
1739
+ * Dispatch from this hctx either if there's no hctx preferred
1740
+ * by IO scheduler or if it has requests that bypass the
1741
+ * scheduler.
1742
+ */
1743
+ if (!sq_hctx || sq_hctx == hctx ||
1744
+ !list_empty_careful(&hctx->dispatch))
1745
+ blk_mq_delay_run_hw_queue(hctx, msecs);
1746
+ }
1747
+}
1748
+EXPORT_SYMBOL(blk_mq_delay_run_hw_queues);
14731749
14741750 /**
14751751 * blk_mq_queue_stopped() - check whether one or more hctxs have been stopped
@@ -1574,7 +1850,7 @@
15741850 /*
15751851 * If we are stopped, don't run the queue.
15761852 */
1577
- if (test_bit(BLK_MQ_S_STOPPED, &hctx->state))
1853
+ if (blk_mq_hctx_stopped(hctx))
15781854 return;
15791855
15801856 __blk_mq_run_hw_queue(hctx);
@@ -1585,15 +1861,16 @@
15851861 bool at_head)
15861862 {
15871863 struct blk_mq_ctx *ctx = rq->mq_ctx;
1864
+ enum hctx_type type = hctx->type;
15881865
15891866 lockdep_assert_held(&ctx->lock);
15901867
15911868 trace_block_rq_insert(hctx->queue, rq);
15921869
15931870 if (at_head)
1594
- list_add(&rq->queuelist, &ctx->rq_list);
1871
+ list_add(&rq->queuelist, &ctx->rq_lists[type]);
15951872 else
1596
- list_add_tail(&rq->queuelist, &ctx->rq_list);
1873
+ list_add_tail(&rq->queuelist, &ctx->rq_lists[type]);
15971874 }
15981875
15991876 void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
@@ -1607,17 +1884,25 @@
16071884 blk_mq_hctx_mark_pending(hctx, ctx);
16081885 }
16091886
1610
-/*
1887
+/**
1888
+ * blk_mq_request_bypass_insert - Insert a request at dispatch list.
1889
+ * @rq: Pointer to request to be inserted.
1890
+ * @at_head: true if the request should be inserted at the head of the list.
1891
+ * @run_queue: If we should run the hardware queue after inserting the request.
1892
+ *
16111893 * Should only be used carefully, when the caller knows we want to
16121894 * bypass a potential IO scheduler on the target device.
16131895 */
1614
-void blk_mq_request_bypass_insert(struct request *rq, bool run_queue)
1896
+void blk_mq_request_bypass_insert(struct request *rq, bool at_head,
1897
+ bool run_queue)
16151898 {
1616
- struct blk_mq_ctx *ctx = rq->mq_ctx;
1617
- struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
1899
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
16181900
16191901 spin_lock(&hctx->lock);
1620
- list_add_tail(&rq->queuelist, &hctx->dispatch);
1902
+ if (at_head)
1903
+ list_add(&rq->queuelist, &hctx->dispatch);
1904
+ else
1905
+ list_add_tail(&rq->queuelist, &hctx->dispatch);
16211906 spin_unlock(&hctx->lock);
16221907
16231908 if (run_queue)
@@ -1629,6 +1914,7 @@
16291914
16301915 {
16311916 struct request *rq;
1917
+ enum hctx_type type = hctx->type;
16321918
16331919 /*
16341920 * preemption doesn't flush plug list, so it's possible ctx->cpu is
@@ -1640,95 +1926,87 @@
16401926 }
16411927
16421928 spin_lock(&ctx->lock);
1643
- list_splice_tail_init(list, &ctx->rq_list);
1929
+ list_splice_tail_init(list, &ctx->rq_lists[type]);
16441930 blk_mq_hctx_mark_pending(hctx, ctx);
16451931 spin_unlock(&ctx->lock);
16461932 }
16471933
1648
-static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
1934
+static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
16491935 {
16501936 struct request *rqa = container_of(a, struct request, queuelist);
16511937 struct request *rqb = container_of(b, struct request, queuelist);
16521938
1653
- return !(rqa->mq_ctx < rqb->mq_ctx ||
1654
- (rqa->mq_ctx == rqb->mq_ctx &&
1655
- blk_rq_pos(rqa) < blk_rq_pos(rqb)));
1939
+ if (rqa->mq_ctx != rqb->mq_ctx)
1940
+ return rqa->mq_ctx > rqb->mq_ctx;
1941
+ if (rqa->mq_hctx != rqb->mq_hctx)
1942
+ return rqa->mq_hctx > rqb->mq_hctx;
1943
+
1944
+ return blk_rq_pos(rqa) > blk_rq_pos(rqb);
16561945 }
16571946
16581947 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
16591948 {
1660
- struct blk_mq_ctx *this_ctx;
1661
- struct request_queue *this_q;
1662
- struct request *rq;
16631949 LIST_HEAD(list);
1664
- LIST_HEAD(ctx_list);
1665
- unsigned int depth;
16661950
1951
+ if (list_empty(&plug->mq_list))
1952
+ return;
16671953 list_splice_init(&plug->mq_list, &list);
16681954
1669
- list_sort(NULL, &list, plug_ctx_cmp);
1955
+ if (plug->rq_count > 2 && plug->multiple_queues)
1956
+ list_sort(NULL, &list, plug_rq_cmp);
16701957
1671
- this_q = NULL;
1672
- this_ctx = NULL;
1673
- depth = 0;
1958
+ plug->rq_count = 0;
16741959
1675
- while (!list_empty(&list)) {
1676
- rq = list_entry_rq(list.next);
1677
- list_del_init(&rq->queuelist);
1678
- BUG_ON(!rq->q);
1679
- if (rq->mq_ctx != this_ctx) {
1680
- if (this_ctx) {
1681
- trace_block_unplug(this_q, depth, !from_schedule);
1682
- blk_mq_sched_insert_requests(this_q, this_ctx,
1683
- &ctx_list,
1684
- from_schedule);
1685
- }
1960
+ do {
1961
+ struct list_head rq_list;
1962
+ struct request *rq, *head_rq = list_entry_rq(list.next);
1963
+ struct list_head *pos = &head_rq->queuelist; /* skip first */
1964
+ struct blk_mq_hw_ctx *this_hctx = head_rq->mq_hctx;
1965
+ struct blk_mq_ctx *this_ctx = head_rq->mq_ctx;
1966
+ unsigned int depth = 1;
16861967
1687
- this_ctx = rq->mq_ctx;
1688
- this_q = rq->q;
1689
- depth = 0;
1968
+ list_for_each_continue(pos, &list) {
1969
+ rq = list_entry_rq(pos);
1970
+ BUG_ON(!rq->q);
1971
+ if (rq->mq_hctx != this_hctx || rq->mq_ctx != this_ctx)
1972
+ break;
1973
+ depth++;
16901974 }
16911975
1692
- depth++;
1693
- list_add_tail(&rq->queuelist, &ctx_list);
1694
- }
1695
-
1696
- /*
1697
- * If 'this_ctx' is set, we know we have entries to complete
1698
- * on 'ctx_list'. Do those.
1699
- */
1700
- if (this_ctx) {
1701
- trace_block_unplug(this_q, depth, !from_schedule);
1702
- blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list,
1976
+ list_cut_before(&rq_list, &list, pos);
1977
+ trace_block_unplug(head_rq->q, depth, !from_schedule);
1978
+ blk_mq_sched_insert_requests(this_hctx, this_ctx, &rq_list,
17031979 from_schedule);
1704
- }
1980
+ } while(!list_empty(&list));
17051981 }
17061982
1707
-static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
1983
+static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
1984
+ unsigned int nr_segs)
17081985 {
1709
- blk_init_request_from_bio(rq, bio);
1986
+ int err;
17101987
1711
- blk_rq_set_rl(rq, blk_get_rl(rq->q, bio));
1988
+ if (bio->bi_opf & REQ_RAHEAD)
1989
+ rq->cmd_flags |= REQ_FAILFAST_MASK;
17121990
1713
- blk_account_io_start(rq, true);
1714
-}
1991
+ rq->__sector = bio->bi_iter.bi_sector;
1992
+ rq->write_hint = bio->bi_write_hint;
1993
+ blk_rq_bio_prep(rq, bio, nr_segs);
17151994
1716
-static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq)
1717
-{
1718
- if (rq->tag != -1)
1719
- return blk_tag_to_qc_t(rq->tag, hctx->queue_num, false);
1995
+ /* This can't fail, since GFP_NOIO includes __GFP_DIRECT_RECLAIM. */
1996
+ err = blk_crypto_rq_bio_prep(rq, bio, GFP_NOIO);
1997
+ WARN_ON_ONCE(err);
17201998
1721
- return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true);
1999
+ blk_account_io_start(rq);
17222000 }
17232001
17242002 static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
17252003 struct request *rq,
1726
- blk_qc_t *cookie)
2004
+ blk_qc_t *cookie, bool last)
17272005 {
17282006 struct request_queue *q = rq->q;
17292007 struct blk_mq_queue_data bd = {
17302008 .rq = rq,
1731
- .last = true,
2009
+ .last = last,
17322010 };
17332011 blk_qc_t new_cookie;
17342012 blk_status_t ret;
....@@ -1763,7 +2041,7 @@
17632041 static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
17642042 struct request *rq,
17652043 blk_qc_t *cookie,
1766
- bool bypass_insert)
2044
+ bool bypass_insert, bool last)
17672045 {
17682046 struct request_queue *q = rq->q;
17692047 bool run_queue = true;
....@@ -1784,23 +2062,35 @@
17842062 if (q->elevator && !bypass_insert)
17852063 goto insert;
17862064
1787
- if (!blk_mq_get_dispatch_budget(hctx))
2065
+ if (!blk_mq_get_dispatch_budget(q))
17882066 goto insert;
17892067
17902068 if (!blk_mq_get_driver_tag(rq)) {
1791
- blk_mq_put_dispatch_budget(hctx);
2069
+ blk_mq_put_dispatch_budget(q);
17922070 goto insert;
17932071 }
17942072
1795
- return __blk_mq_issue_directly(hctx, rq, cookie);
2073
+ return __blk_mq_issue_directly(hctx, rq, cookie, last);
17962074 insert:
17972075 if (bypass_insert)
17982076 return BLK_STS_RESOURCE;
17992077
1800
- blk_mq_request_bypass_insert(rq, run_queue);
2078
+ blk_mq_sched_insert_request(rq, false, run_queue, false);
2079
+
18012080 return BLK_STS_OK;
18022081 }
18032082
2083
+/**
2084
+ * blk_mq_try_issue_directly - Try to send a request directly to the device driver.
2085
+ * @hctx: Pointer to the associated hardware queue.
2086
+ * @rq: Pointer to the request to be sent.
2087
+ * @cookie: Request queue cookie.
2088
+ *
2089
+ * If the device has enough resources to accept a new request now, send the
2090
+ * request directly to the device driver. Else, insert it into the
2091
+ * hctx->dispatch queue, so we can try to send it again in the future.
2092
+ * Requests inserted into this queue have higher priority.
2093
+ */
18042094 static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
18052095 struct request *rq, blk_qc_t *cookie)
18062096 {
....@@ -1811,25 +2101,24 @@
18112101
18122102 hctx_lock(hctx, &srcu_idx);
18132103
1814
- ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false);
2104
+ ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false, true);
18152105 if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
1816
- blk_mq_request_bypass_insert(rq, true);
2106
+ blk_mq_request_bypass_insert(rq, false, true);
18172107 else if (ret != BLK_STS_OK)
18182108 blk_mq_end_request(rq, ret);
18192109
18202110 hctx_unlock(hctx, srcu_idx);
18212111 }
18222112
1823
-blk_status_t blk_mq_request_issue_directly(struct request *rq)
2113
+blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
18242114 {
18252115 blk_status_t ret;
18262116 int srcu_idx;
18272117 blk_qc_t unused_cookie;
1828
- struct blk_mq_ctx *ctx = rq->mq_ctx;
1829
- struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
2118
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
18302119
18312120 hctx_lock(hctx, &srcu_idx);
1832
- ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true);
2121
+ ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true, last);
18332122 hctx_unlock(hctx, srcu_idx);
18342123
18352124 return ret;
....@@ -1838,104 +2127,169 @@
18382127 void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
18392128 struct list_head *list)
18402129 {
2130
+ int queued = 0;
2131
+ int errors = 0;
2132
+
18412133 while (!list_empty(list)) {
18422134 blk_status_t ret;
18432135 struct request *rq = list_first_entry(list, struct request,
18442136 queuelist);
18452137
18462138 list_del_init(&rq->queuelist);
1847
- ret = blk_mq_request_issue_directly(rq);
2139
+ ret = blk_mq_request_issue_directly(rq, list_empty(list));
18482140 if (ret != BLK_STS_OK) {
2141
+ errors++;
18492142 if (ret == BLK_STS_RESOURCE ||
18502143 ret == BLK_STS_DEV_RESOURCE) {
1851
- blk_mq_request_bypass_insert(rq,
2144
+ blk_mq_request_bypass_insert(rq, false,
18522145 list_empty(list));
18532146 break;
18542147 }
18552148 blk_mq_end_request(rq, ret);
1856
- }
2149
+ } else
2150
+ queued++;
2151
+ }
2152
+
2153
+ /*
2154
+ * If we didn't flush the entire list, we could have told
2155
+ * the driver there was more coming, but that turned out to
2156
+ * be a lie.
2157
+ */
2158
+ if ((!list_empty(list) || errors) &&
2159
+ hctx->queue->mq_ops->commit_rqs && queued)
2160
+ hctx->queue->mq_ops->commit_rqs(hctx);
2161
+}
2162
+
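The comment above describes the bd->last contract: blk_mq_request_issue_directly() tells the driver whether more requests follow, and ->commit_rqs() exists so a driver that defers its doorbell until bd->last is set still gets kicked when the block layer stops issuing early. A hedged driver-side sketch, not from this patch (struct demo_dev and the demo_* helpers are hypothetical):

	struct demo_dev;					/* hypothetical driver state */
	static void demo_post_descriptor(struct demo_dev *dev, struct request *rq);
	static void demo_ring_doorbell(struct demo_dev *dev);

	static blk_status_t demo_queue_rq(struct blk_mq_hw_ctx *hctx,
					  const struct blk_mq_queue_data *bd)
	{
		struct demo_dev *dev = hctx->driver_data;

		demo_post_descriptor(dev, bd->rq);	/* queue it in the ring */
		if (bd->last)
			demo_ring_doorbell(dev);	/* kick hardware on the last one */
		return BLK_STS_OK;
	}

	static void demo_commit_rqs(struct blk_mq_hw_ctx *hctx)
	{
		/* Runs when bd->last turned out to be a lie: flush what was queued. */
		demo_ring_doorbell(hctx->driver_data);
	}

This is why the code above only calls ->commit_rqs() when something was actually queued and the list was cut short by an error or resource shortage.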
2163
+static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
2164
+{
2165
+ list_add_tail(&rq->queuelist, &plug->mq_list);
2166
+ plug->rq_count++;
2167
+ if (!plug->multiple_queues && !list_is_singular(&plug->mq_list)) {
2168
+ struct request *tmp;
2169
+
2170
+ tmp = list_first_entry(&plug->mq_list, struct request,
2171
+ queuelist);
2172
+ if (tmp->q != rq->q)
2173
+ plug->multiple_queues = true;
18572174 }
18582175 }
18592176
1860
-static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
2177
+/*
2178
+ * Allow 2x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple
2179
+ * queues. This is important for md arrays to benefit from merging
2180
+ * requests.
2181
+ */
2182
+static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug)
18612183 {
2184
+ if (plug->multiple_queues)
2185
+ return BLK_MAX_REQUEST_COUNT * 2;
2186
+ return BLK_MAX_REQUEST_COUNT;
2187
+}
2188
+
2189
+/**
2190
+ * blk_mq_submit_bio - Create and send a request to the block device.
2191
+ * @bio: Bio pointer.
2192
+ *
2193
+ * Builds up a request structure from the request queue and @bio and sends it
2194
+ * to the device. The request may not be queued directly to hardware if:
2195
+ * * This request can be merged with another one
2196
+ * * We want to place the request on the plug queue for possible future merging
2197
+ * * There is an IO scheduler active on this queue
2198
+ *
2199
+ * It will not queue the request if there is an error with the bio, or at
2200
+ * request creation.
2201
+ *
2202
+ * Returns: Request queue cookie.
2203
+ */
2204
+blk_qc_t blk_mq_submit_bio(struct bio *bio)
2205
+{
2206
+ struct request_queue *q = bio->bi_disk->queue;
18622207 const int is_sync = op_is_sync(bio->bi_opf);
18632208 const int is_flush_fua = op_is_flush(bio->bi_opf);
1864
- struct blk_mq_alloc_data data = { .flags = 0 };
2209
+ struct blk_mq_alloc_data data = {
2210
+ .q = q,
2211
+ };
18652212 struct request *rq;
1866
- unsigned int request_count = 0;
18672213 struct blk_plug *plug;
18682214 struct request *same_queue_rq = NULL;
2215
+ unsigned int nr_segs;
18692216 blk_qc_t cookie;
2217
+ blk_status_t ret;
18702218
18712219 blk_queue_bounce(q, &bio);
1872
-
1873
- blk_queue_split(q, &bio);
2220
+ __blk_queue_split(&bio, &nr_segs);
18742221
18752222 if (!bio_integrity_prep(bio))
1876
- return BLK_QC_T_NONE;
2223
+ goto queue_exit;
18772224
18782225 if (!is_flush_fua && !blk_queue_nomerges(q) &&
1879
- blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
1880
- return BLK_QC_T_NONE;
2226
+ blk_attempt_plug_merge(q, bio, nr_segs, &same_queue_rq))
2227
+ goto queue_exit;
18812228
1882
- if (blk_mq_sched_bio_merge(q, bio))
1883
- return BLK_QC_T_NONE;
2229
+ if (blk_mq_sched_bio_merge(q, bio, nr_segs))
2230
+ goto queue_exit;
18842231
1885
- rq_qos_throttle(q, bio, NULL);
2232
+ rq_qos_throttle(q, bio);
18862233
1887
- trace_block_getrq(q, bio, bio->bi_opf);
1888
-
1889
- rq = blk_mq_get_request(q, bio, bio->bi_opf, &data);
2234
+ data.cmd_flags = bio->bi_opf;
2235
+ rq = __blk_mq_alloc_request(&data);
18902236 if (unlikely(!rq)) {
18912237 rq_qos_cleanup(q, bio);
18922238 if (bio->bi_opf & REQ_NOWAIT)
18932239 bio_wouldblock_error(bio);
1894
- return BLK_QC_T_NONE;
2240
+ goto queue_exit;
18952241 }
2242
+
2243
+ trace_block_getrq(q, bio, bio->bi_opf);
18962244
18972245 rq_qos_track(q, rq, bio);
18982246
18992247 cookie = request_to_qc_t(data.hctx, rq);
19002248
1901
- plug = current->plug;
1902
- if (unlikely(is_flush_fua)) {
1903
- blk_mq_put_ctx(data.ctx);
1904
- blk_mq_bio_to_request(rq, bio);
2249
+ blk_mq_bio_to_request(rq, bio, nr_segs);
19052250
1906
- /* bypass scheduler for flush rq */
2251
+ ret = blk_crypto_rq_get_keyslot(rq);
2252
+ if (ret != BLK_STS_OK) {
2253
+ bio->bi_status = ret;
2254
+ bio_endio(bio);
2255
+ blk_mq_free_request(rq);
2256
+ return BLK_QC_T_NONE;
2257
+ }
2258
+
2259
+ plug = blk_mq_plug(q, bio);
2260
+ if (unlikely(is_flush_fua)) {
2261
+ /* Bypass scheduler for flush requests */
19072262 blk_insert_flush(rq);
19082263 blk_mq_run_hw_queue(data.hctx, true);
1909
- } else if (plug && q->nr_hw_queues == 1) {
1910
- struct request *last = NULL;
1911
-
1912
- blk_mq_put_ctx(data.ctx);
1913
- blk_mq_bio_to_request(rq, bio);
1914
-
2264
+ } else if (plug && (q->nr_hw_queues == 1 ||
2265
+ blk_mq_is_sbitmap_shared(rq->mq_hctx->flags) ||
2266
+ q->mq_ops->commit_rqs || !blk_queue_nonrot(q))) {
19152267 /*
1916
- * @request_count may become stale because of schedule
1917
- * out, so check the list again.
2268
+ * Use plugging if we have a ->commit_rqs() hook as well, as
2269
+ * we know the driver uses bd->last in a smart fashion.
2270
+ *
2271
+ * Use normal plugging if this disk is a slow HDD, as sequential
2272
+ * IO may benefit a lot from plug merging.
19182273 */
1919
- if (list_empty(&plug->mq_list))
1920
- request_count = 0;
1921
- else if (blk_queue_nomerges(q))
1922
- request_count = blk_plug_queued_count(q);
2274
+ unsigned int request_count = plug->rq_count;
2275
+ struct request *last = NULL;
19232276
19242277 if (!request_count)
19252278 trace_block_plug(q);
19262279 else
19272280 last = list_entry_rq(plug->mq_list.prev);
19282281
1929
- if (request_count >= BLK_MAX_REQUEST_COUNT || (last &&
2282
+ if (request_count >= blk_plug_max_rq_count(plug) || (last &&
19302283 blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
19312284 blk_flush_plug_list(plug, false);
19322285 trace_block_plug(q);
19332286 }
19342287
1935
- list_add_tail(&rq->queuelist, &plug->mq_list);
2288
+ blk_add_rq_to_plug(plug, rq);
2289
+ } else if (q->elevator) {
2290
+ /* Insert the request at the IO scheduler queue */
2291
+ blk_mq_sched_insert_request(rq, false, true, true);
19362292 } else if (plug && !blk_queue_nomerges(q)) {
1937
- blk_mq_bio_to_request(rq, bio);
1938
-
19392293 /*
19402294 * We do limited plugging. If the bio can be merged, do that.
19412295 * Otherwise the existing request in the plug list will be
....@@ -1945,30 +2299,74 @@
19452299 */
19462300 if (list_empty(&plug->mq_list))
19472301 same_queue_rq = NULL;
1948
- if (same_queue_rq)
2302
+ if (same_queue_rq) {
19492303 list_del_init(&same_queue_rq->queuelist);
1950
- list_add_tail(&rq->queuelist, &plug->mq_list);
1951
-
1952
- blk_mq_put_ctx(data.ctx);
2304
+ plug->rq_count--;
2305
+ }
2306
+ blk_add_rq_to_plug(plug, rq);
2307
+ trace_block_plug(q);
19532308
19542309 if (same_queue_rq) {
1955
- data.hctx = blk_mq_map_queue(q,
1956
- same_queue_rq->mq_ctx->cpu);
2310
+ data.hctx = same_queue_rq->mq_hctx;
2311
+ trace_block_unplug(q, 1, true);
19572312 blk_mq_try_issue_directly(data.hctx, same_queue_rq,
19582313 &cookie);
19592314 }
1960
- } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator &&
1961
- !data.hctx->dispatch_busy)) {
1962
- blk_mq_put_ctx(data.ctx);
1963
- blk_mq_bio_to_request(rq, bio);
2315
+ } else if ((q->nr_hw_queues > 1 && is_sync) ||
2316
+ !data.hctx->dispatch_busy) {
2317
+ /*
2318
+ * There is no scheduler and we can try to send directly
2319
+ * to the hardware.
2320
+ */
19642321 blk_mq_try_issue_directly(data.hctx, rq, &cookie);
19652322 } else {
1966
- blk_mq_put_ctx(data.ctx);
1967
- blk_mq_bio_to_request(rq, bio);
2323
+ /* Default case. */
19682324 blk_mq_sched_insert_request(rq, false, true, true);
19692325 }
19702326
19712327 return cookie;
2328
+queue_exit:
2329
+ blk_queue_exit(q);
2330
+ return BLK_QC_T_NONE;
2331
+}
2332
+
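For context on the plug branch above: with a plug installed, requests built by this function accumulate on plug->mq_list (bounded by blk_plug_max_rq_count()) and only reach the hardware queues when the plug is flushed. A hedged submitter-side sketch, not part of the diff (bio_a and bio_b stand for bios the caller prepared elsewhere):

	static void demo_submit_pair(struct bio *bio_a, struct bio *bio_b)
	{
		struct blk_plug plug;

		blk_start_plug(&plug);
		submit_bio(bio_a);	/* lands on plug->mq_list via blk_mq_submit_bio() */
		submit_bio(bio_b);	/* may merge with bio_a or join the same batch */
		blk_finish_plug(&plug);	/* drains the plug via blk_flush_plug_list() */
	}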
2333
+static size_t order_to_size(unsigned int order)
2334
+{
2335
+ return (size_t)PAGE_SIZE << order;
2336
+}
2337
+
2338
+/* called before freeing request pool in @tags */
2339
+static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set,
2340
+ struct blk_mq_tags *tags, unsigned int hctx_idx)
2341
+{
2342
+ struct blk_mq_tags *drv_tags = set->tags[hctx_idx];
2343
+ struct page *page;
2344
+ unsigned long flags;
2345
+
2346
+ list_for_each_entry(page, &tags->page_list, lru) {
2347
+ unsigned long start = (unsigned long)page_address(page);
2348
+ unsigned long end = start + order_to_size(page->private);
2349
+ int i;
2350
+
2351
+ for (i = 0; i < set->queue_depth; i++) {
2352
+ struct request *rq = drv_tags->rqs[i];
2353
+ unsigned long rq_addr = (unsigned long)rq;
2354
+
2355
+ if (rq_addr >= start && rq_addr < end) {
2356
+ WARN_ON_ONCE(refcount_read(&rq->ref) != 0);
2357
+ cmpxchg(&drv_tags->rqs[i], rq, NULL);
2358
+ }
2359
+ }
2360
+ }
2361
+
2362
+ /*
2363
+ * Wait until all pending iteration is done.
2364
+ *
2365
+ * Request reference is cleared and it is guaranteed to be observed
2366
+ * after the ->lock is released.
2367
+ */
2368
+ spin_lock_irqsave(&drv_tags->lock, flags);
2369
+ spin_unlock_irqrestore(&drv_tags->lock, flags);
19722370 }
19732371
19742372 void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
....@@ -1989,42 +2387,44 @@
19892387 }
19902388 }
19912389
2390
+ blk_mq_clear_rq_mapping(set, tags, hctx_idx);
2391
+
19922392 while (!list_empty(&tags->page_list)) {
19932393 page = list_first_entry(&tags->page_list, struct page, lru);
19942394 list_del_init(&page->lru);
19952395 /*
19962396 * Remove kmemleak object previously allocated in
1997
- * blk_mq_init_rq_map().
2397
+ * blk_mq_alloc_rqs().
19982398 */
19992399 kmemleak_free(page_address(page));
20002400 __free_pages(page, page->private);
20012401 }
20022402 }
20032403
2004
-void blk_mq_free_rq_map(struct blk_mq_tags *tags)
2404
+void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags)
20052405 {
20062406 kfree(tags->rqs);
20072407 tags->rqs = NULL;
20082408 kfree(tags->static_rqs);
20092409 tags->static_rqs = NULL;
20102410
2011
- blk_mq_free_tags(tags);
2411
+ blk_mq_free_tags(tags, flags);
20122412 }
20132413
20142414 struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
20152415 unsigned int hctx_idx,
20162416 unsigned int nr_tags,
2017
- unsigned int reserved_tags)
2417
+ unsigned int reserved_tags,
2418
+ unsigned int flags)
20182419 {
20192420 struct blk_mq_tags *tags;
20202421 int node;
20212422
2022
- node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx);
2423
+ node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
20232424 if (node == NUMA_NO_NODE)
20242425 node = set->numa_node;
20252426
2026
- tags = blk_mq_init_tags(nr_tags, reserved_tags, node,
2027
- BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
2427
+ tags = blk_mq_init_tags(nr_tags, reserved_tags, node, flags);
20282428 if (!tags)
20292429 return NULL;
20302430
....@@ -2032,7 +2432,7 @@
20322432 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
20332433 node);
20342434 if (!tags->rqs) {
2035
- blk_mq_free_tags(tags);
2435
+ blk_mq_free_tags(tags, flags);
20362436 return NULL;
20372437 }
20382438
....@@ -2041,16 +2441,11 @@
20412441 node);
20422442 if (!tags->static_rqs) {
20432443 kfree(tags->rqs);
2044
- blk_mq_free_tags(tags);
2444
+ blk_mq_free_tags(tags, flags);
20452445 return NULL;
20462446 }
20472447
20482448 return tags;
2049
-}
2050
-
2051
-static size_t order_to_size(unsigned int order)
2052
-{
2053
- return (size_t)PAGE_SIZE << order;
20542449 }
20552450
20562451 static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
....@@ -2075,7 +2470,7 @@
20752470 size_t rq_size, left;
20762471 int node;
20772472
2078
- node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx);
2473
+ node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
20792474 if (node == NUMA_NO_NODE)
20802475 node = set->numa_node;
20812476
....@@ -2087,6 +2482,7 @@
20872482 */
20882483 rq_size = round_up(sizeof(struct request) + set->cmd_size,
20892484 cache_line_size());
2485
+ trace_android_vh_blk_alloc_rqs(&rq_size, set, tags);
20902486 left = rq_size * depth;
20912487
20922488 for (i = 0; i < depth; ) {
....@@ -2145,6 +2541,86 @@
21452541 return -ENOMEM;
21462542 }
21472543
2544
+struct rq_iter_data {
2545
+ struct blk_mq_hw_ctx *hctx;
2546
+ bool has_rq;
2547
+};
2548
+
2549
+static bool blk_mq_has_request(struct request *rq, void *data, bool reserved)
2550
+{
2551
+ struct rq_iter_data *iter_data = data;
2552
+
2553
+ if (rq->mq_hctx != iter_data->hctx)
2554
+ return true;
2555
+ iter_data->has_rq = true;
2556
+ return false;
2557
+}
2558
+
2559
+static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx)
2560
+{
2561
+ struct blk_mq_tags *tags = hctx->sched_tags ?
2562
+ hctx->sched_tags : hctx->tags;
2563
+ struct rq_iter_data data = {
2564
+ .hctx = hctx,
2565
+ };
2566
+
2567
+ blk_mq_all_tag_iter(tags, blk_mq_has_request, &data);
2568
+ return data.has_rq;
2569
+}
2570
+
2571
+static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu,
2572
+ struct blk_mq_hw_ctx *hctx)
2573
+{
2574
+ if (cpumask_next_and(-1, hctx->cpumask, cpu_online_mask) != cpu)
2575
+ return false;
2576
+ if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids)
2577
+ return false;
2578
+ return true;
2579
+}
2580
+
2581
+static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
2582
+{
2583
+ struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
2584
+ struct blk_mq_hw_ctx, cpuhp_online);
2585
+
2586
+ if (!cpumask_test_cpu(cpu, hctx->cpumask) ||
2587
+ !blk_mq_last_cpu_in_hctx(cpu, hctx))
2588
+ return 0;
2589
+
2590
+ /*
2591
+ * Prevent new requests from being allocated on the current hctx.
2592
+ *
2593
+ * The smp_mb__after_atomic() pairs with the implied barrier in
2594
+ * test_and_set_bit_lock() in sbitmap_get(). This ensures the inactive
2595
+ * flag is seen once we return from the tag allocator.
2596
+ */
2597
+ set_bit(BLK_MQ_S_INACTIVE, &hctx->state);
2598
+ smp_mb__after_atomic();
2599
+
2600
+ /*
2601
+ * Try to grab a reference to the queue and wait for any outstanding
2602
+ * requests. If we could not grab a reference the queue has been
2603
+ * frozen and there are no requests.
2604
+ */
2605
+ if (percpu_ref_tryget(&hctx->queue->q_usage_counter)) {
2606
+ while (blk_mq_hctx_has_requests(hctx))
2607
+ msleep(5);
2608
+ percpu_ref_put(&hctx->queue->q_usage_counter);
2609
+ }
2610
+
2611
+ return 0;
2612
+}
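The set_bit()/smp_mb__after_atomic() above only works because the tag allocator re-checks the flag after winning a tag (the implied barrier in test_and_set_bit_lock() orders the two). A hedged paraphrase of that allocator-side tail, shown here for illustration rather than copied verbatim:

	/* At the end of tag allocation, after the bit was won: */
	if (unlikely(test_bit(BLK_MQ_S_INACTIVE, &data->hctx->state))) {
		/* hctx is going away: give the tag back and retry elsewhere */
		blk_mq_put_tag(tags, data->ctx, tag + tag_offset);
		return BLK_MQ_NO_TAG;
	}
	return tag + tag_offset;

Together with the reference/wait loop above, this guarantees that once the CPU goes offline no new request can be bound to the now-inactive hctx.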
2613
+
2614
+static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
2615
+{
2616
+ struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
2617
+ struct blk_mq_hw_ctx, cpuhp_online);
2618
+
2619
+ if (cpumask_test_cpu(cpu, hctx->cpumask))
2620
+ clear_bit(BLK_MQ_S_INACTIVE, &hctx->state);
2621
+ return 0;
2622
+}
2623
+
21482624 /*
21492625 * 'cpu' is going away. splice any existing rq_list entries from this
21502626 * software queue to the hw queue dispatch list, and ensure that it
....@@ -2155,13 +2631,18 @@
21552631 struct blk_mq_hw_ctx *hctx;
21562632 struct blk_mq_ctx *ctx;
21572633 LIST_HEAD(tmp);
2634
+ enum hctx_type type;
21582635
21592636 hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
2637
+ if (!cpumask_test_cpu(cpu, hctx->cpumask))
2638
+ return 0;
2639
+
21602640 ctx = __blk_mq_get_ctx(hctx->queue, cpu);
2641
+ type = hctx->type;
21612642
21622643 spin_lock(&ctx->lock);
2163
- if (!list_empty(&ctx->rq_list)) {
2164
- list_splice_init(&ctx->rq_list, &tmp);
2644
+ if (!list_empty(&ctx->rq_lists[type])) {
2645
+ list_splice_init(&ctx->rq_lists[type], &tmp);
21652646 blk_mq_hctx_clear_pending(hctx, ctx);
21662647 }
21672648 spin_unlock(&ctx->lock);
....@@ -2179,8 +2660,40 @@
21792660
21802661 static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
21812662 {
2663
+ if (!(hctx->flags & BLK_MQ_F_STACKING))
2664
+ cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
2665
+ &hctx->cpuhp_online);
21822666 cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD,
21832667 &hctx->cpuhp_dead);
2668
+}
2669
+
2670
+/*
2671
+ * Before freeing the hw queue, clear the flush request reference in
2672
+ * tags->rqs[] to avoid a potential use-after-free.
2673
+ */
2674
+static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags,
2675
+ unsigned int queue_depth, struct request *flush_rq)
2676
+{
2677
+ int i;
2678
+ unsigned long flags;
2679
+
2680
+ /* The hw queue may not be mapped yet */
2681
+ if (!tags)
2682
+ return;
2683
+
2684
+ WARN_ON_ONCE(refcount_read(&flush_rq->ref) != 0);
2685
+
2686
+ for (i = 0; i < queue_depth; i++)
2687
+ cmpxchg(&tags->rqs[i], flush_rq, NULL);
2688
+
2689
+ /*
2690
+ * Wait until all pending iterations are done.
2691
+ *
2692
+ * The request references are cleared and this is guaranteed to be
2693
+ * observed after the ->lock is released.
2694
+ */
2695
+ spin_lock_irqsave(&tags->lock, flags);
2696
+ spin_unlock_irqrestore(&tags->lock, flags);
21842697 }
21852698
21862699 /* hctx->ctxs will be freed in queue's release handler */
....@@ -2188,18 +2701,24 @@
21882701 struct blk_mq_tag_set *set,
21892702 struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
21902703 {
2191
- blk_mq_debugfs_unregister_hctx(hctx);
2704
+ struct request *flush_rq = hctx->fq->flush_rq;
21922705
21932706 if (blk_mq_hw_queue_mapped(hctx))
21942707 blk_mq_tag_idle(hctx);
21952708
2709
+ blk_mq_clear_flush_rq_mapping(set->tags[hctx_idx],
2710
+ set->queue_depth, flush_rq);
21962711 if (set->ops->exit_request)
2197
- set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx);
2712
+ set->ops->exit_request(set, flush_rq, hctx_idx);
21982713
21992714 if (set->ops->exit_hctx)
22002715 set->ops->exit_hctx(hctx, hctx_idx);
22012716
22022717 blk_mq_remove_cpuhp(hctx);
2718
+
2719
+ spin_lock(&q->unused_hctx_lock);
2720
+ list_add(&hctx->hctx_list, &q->unused_hctx_list);
2721
+ spin_unlock(&q->unused_hctx_lock);
22032722 }
22042723
22052724 static void blk_mq_exit_hw_queues(struct request_queue *q,
....@@ -2211,112 +2730,160 @@
22112730 queue_for_each_hw_ctx(q, hctx, i) {
22122731 if (i == nr_queue)
22132732 break;
2733
+ blk_mq_debugfs_unregister_hctx(hctx);
22142734 blk_mq_exit_hctx(q, set, hctx, i);
22152735 }
2736
+}
2737
+
2738
+static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
2739
+{
2740
+ int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
2741
+
2742
+ BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu),
2743
+ __alignof__(struct blk_mq_hw_ctx)) !=
2744
+ sizeof(struct blk_mq_hw_ctx));
2745
+
2746
+ if (tag_set->flags & BLK_MQ_F_BLOCKING)
2747
+ hw_ctx_size += sizeof(struct srcu_struct);
2748
+
2749
+ return hw_ctx_size;
22162750 }
22172751
22182752 static int blk_mq_init_hctx(struct request_queue *q,
22192753 struct blk_mq_tag_set *set,
22202754 struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
22212755 {
2222
- int node;
2756
+ hctx->queue_num = hctx_idx;
22232757
2224
- node = hctx->numa_node;
2758
+ if (!(hctx->flags & BLK_MQ_F_STACKING))
2759
+ cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
2760
+ &hctx->cpuhp_online);
2761
+ cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
2762
+
2763
+ hctx->tags = set->tags[hctx_idx];
2764
+
2765
+ if (set->ops->init_hctx &&
2766
+ set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
2767
+ goto unregister_cpu_notifier;
2768
+
2769
+ if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx,
2770
+ hctx->numa_node))
2771
+ goto exit_hctx;
2772
+ return 0;
2773
+
2774
+ exit_hctx:
2775
+ if (set->ops->exit_hctx)
2776
+ set->ops->exit_hctx(hctx, hctx_idx);
2777
+ unregister_cpu_notifier:
2778
+ blk_mq_remove_cpuhp(hctx);
2779
+ return -1;
2780
+}
2781
+
2782
+static struct blk_mq_hw_ctx *
2783
+blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
2784
+ int node)
2785
+{
2786
+ struct blk_mq_hw_ctx *hctx;
2787
+ gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;
2788
+
2789
+ hctx = kzalloc_node(blk_mq_hw_ctx_size(set), gfp, node);
2790
+ if (!hctx)
2791
+ goto fail_alloc_hctx;
2792
+
2793
+ if (!zalloc_cpumask_var_node(&hctx->cpumask, gfp, node))
2794
+ goto free_hctx;
2795
+
2796
+ atomic_set(&hctx->nr_active, 0);
22252797 if (node == NUMA_NO_NODE)
2226
- node = hctx->numa_node = set->numa_node;
2798
+ node = set->numa_node;
2799
+ hctx->numa_node = node;
22272800
22282801 INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
22292802 spin_lock_init(&hctx->lock);
22302803 INIT_LIST_HEAD(&hctx->dispatch);
22312804 hctx->queue = q;
2232
- hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED;
2805
+ hctx->flags = set->flags & ~BLK_MQ_F_TAG_QUEUE_SHARED;
22332806
2234
- cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
2235
-
2236
- hctx->tags = set->tags[hctx_idx];
2807
+ INIT_LIST_HEAD(&hctx->hctx_list);
22372808
22382809 /*
22392810 * Allocate space for all possible cpus to avoid allocation at
22402811 * runtime
22412812 */
22422813 hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *),
2243
- GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node);
2814
+ gfp, node);
22442815 if (!hctx->ctxs)
2245
- goto unregister_cpu_notifier;
2816
+ goto free_cpumask;
22462817
22472818 if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8),
2248
- GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node))
2819
+ gfp, node))
22492820 goto free_ctxs;
2250
-
22512821 hctx->nr_ctx = 0;
22522822
22532823 spin_lock_init(&hctx->dispatch_wait_lock);
22542824 init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
22552825 INIT_LIST_HEAD(&hctx->dispatch_wait.entry);
22562826
2257
- if (set->ops->init_hctx &&
2258
- set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
2259
- goto free_bitmap;
2260
-
2261
- hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size,
2262
- GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY);
2827
+ hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp);
22632828 if (!hctx->fq)
2264
- goto exit_hctx;
2265
-
2266
- if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, node))
2267
- goto free_fq;
2829
+ goto free_bitmap;
22682830
22692831 if (hctx->flags & BLK_MQ_F_BLOCKING)
22702832 init_srcu_struct(hctx->srcu);
2833
+ blk_mq_hctx_kobj_init(hctx);
22712834
2272
- blk_mq_debugfs_register_hctx(q, hctx);
2835
+ return hctx;
22732836
2274
- return 0;
2275
-
2276
- free_fq:
2277
- blk_free_flush_queue(hctx->fq);
2278
- exit_hctx:
2279
- if (set->ops->exit_hctx)
2280
- set->ops->exit_hctx(hctx, hctx_idx);
22812837 free_bitmap:
22822838 sbitmap_free(&hctx->ctx_map);
22832839 free_ctxs:
22842840 kfree(hctx->ctxs);
2285
- unregister_cpu_notifier:
2286
- blk_mq_remove_cpuhp(hctx);
2287
- return -1;
2841
+ free_cpumask:
2842
+ free_cpumask_var(hctx->cpumask);
2843
+ free_hctx:
2844
+ kfree(hctx);
2845
+ fail_alloc_hctx:
2846
+ return NULL;
22882847 }
22892848
22902849 static void blk_mq_init_cpu_queues(struct request_queue *q,
22912850 unsigned int nr_hw_queues)
22922851 {
2293
- unsigned int i;
2852
+ struct blk_mq_tag_set *set = q->tag_set;
2853
+ unsigned int i, j;
22942854
22952855 for_each_possible_cpu(i) {
22962856 struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
22972857 struct blk_mq_hw_ctx *hctx;
2858
+ int k;
22982859
22992860 __ctx->cpu = i;
23002861 spin_lock_init(&__ctx->lock);
2301
- INIT_LIST_HEAD(&__ctx->rq_list);
2862
+ for (k = HCTX_TYPE_DEFAULT; k < HCTX_MAX_TYPES; k++)
2863
+ INIT_LIST_HEAD(&__ctx->rq_lists[k]);
2864
+
23022865 __ctx->queue = q;
23032866
23042867 /*
23052868 * Set local node, IFF we have more than one hw queue. If
23062869 * not, we remain on the home node of the device
23072870 */
2308
- hctx = blk_mq_map_queue(q, i);
2309
- if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
2310
- hctx->numa_node = local_memory_node(cpu_to_node(i));
2871
+ for (j = 0; j < set->nr_maps; j++) {
2872
+ hctx = blk_mq_map_queue_type(q, j, i);
2873
+ if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
2874
+ hctx->numa_node = cpu_to_node(i);
2875
+ }
23112876 }
23122877 }
23132878
2314
-static bool __blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, int hctx_idx)
2879
+static bool __blk_mq_alloc_map_and_request(struct blk_mq_tag_set *set,
2880
+ int hctx_idx)
23152881 {
2882
+ unsigned int flags = set->flags;
23162883 int ret = 0;
23172884
23182885 set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx,
2319
- set->queue_depth, set->reserved_tags);
2886
+ set->queue_depth, set->reserved_tags, flags);
23202887 if (!set->tags[hctx_idx])
23212888 return false;
23222889
....@@ -2325,7 +2892,7 @@
23252892 if (!ret)
23262893 return true;
23272894
2328
- blk_mq_free_rq_map(set->tags[hctx_idx]);
2895
+ blk_mq_free_rq_map(set->tags[hctx_idx], flags);
23292896 set->tags[hctx_idx] = NULL;
23302897 return false;
23312898 }
....@@ -2333,16 +2900,18 @@
23332900 static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
23342901 unsigned int hctx_idx)
23352902 {
2336
- if (set->tags[hctx_idx]) {
2903
+ unsigned int flags = set->flags;
2904
+
2905
+ if (set->tags && set->tags[hctx_idx]) {
23372906 blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
2338
- blk_mq_free_rq_map(set->tags[hctx_idx]);
2907
+ blk_mq_free_rq_map(set->tags[hctx_idx], flags);
23392908 set->tags[hctx_idx] = NULL;
23402909 }
23412910 }
23422911
23432912 static void blk_mq_map_swqueue(struct request_queue *q)
23442913 {
2345
- unsigned int i, hctx_idx;
2914
+ unsigned int i, j, hctx_idx;
23462915 struct blk_mq_hw_ctx *hctx;
23472916 struct blk_mq_ctx *ctx;
23482917 struct blk_mq_tag_set *set = q->tag_set;
....@@ -2359,25 +2928,52 @@
23592928 * If the cpu isn't present, the cpu is mapped to first hctx.
23602929 */
23612930 for_each_possible_cpu(i) {
2362
- hctx_idx = q->mq_map[i];
2363
- /* unmapped hw queue can be remapped after CPU topo changed */
2364
- if (!set->tags[hctx_idx] &&
2365
- !__blk_mq_alloc_rq_map(set, hctx_idx)) {
2366
- /*
2367
- * If tags initialization fail for some hctx,
2368
- * that hctx won't be brought online. In this
2369
- * case, remap the current ctx to hctx[0] which
2370
- * is guaranteed to always have tags allocated
2371
- */
2372
- q->mq_map[i] = 0;
2373
- }
23742931
23752932 ctx = per_cpu_ptr(q->queue_ctx, i);
2376
- hctx = blk_mq_map_queue(q, i);
2933
+ for (j = 0; j < set->nr_maps; j++) {
2934
+ if (!set->map[j].nr_queues) {
2935
+ ctx->hctxs[j] = blk_mq_map_queue_type(q,
2936
+ HCTX_TYPE_DEFAULT, i);
2937
+ continue;
2938
+ }
2939
+ hctx_idx = set->map[j].mq_map[i];
2940
+ /* unmapped hw queue can be remapped after CPU topo changed */
2941
+ if (!set->tags[hctx_idx] &&
2942
+ !__blk_mq_alloc_map_and_request(set, hctx_idx)) {
2943
+ /*
2944
+ * If tags initialization fail for some hctx,
2945
+ * that hctx won't be brought online. In this
2946
+ * case, remap the current ctx to hctx[0] which
2947
+ * is guaranteed to always have tags allocated
2948
+ */
2949
+ set->map[j].mq_map[i] = 0;
2950
+ }
23772951
2378
- cpumask_set_cpu(i, hctx->cpumask);
2379
- ctx->index_hw = hctx->nr_ctx;
2380
- hctx->ctxs[hctx->nr_ctx++] = ctx;
2952
+ hctx = blk_mq_map_queue_type(q, j, i);
2953
+ ctx->hctxs[j] = hctx;
2954
+ /*
2955
+ * If the CPU is already set in the mask, then we've
2956
+ * mapped this one already. This can happen if
2957
+ * devices share queues across queue maps.
2958
+ */
2959
+ if (cpumask_test_cpu(i, hctx->cpumask))
2960
+ continue;
2961
+
2962
+ cpumask_set_cpu(i, hctx->cpumask);
2963
+ hctx->type = j;
2964
+ ctx->index_hw[hctx->type] = hctx->nr_ctx;
2965
+ hctx->ctxs[hctx->nr_ctx++] = ctx;
2966
+
2967
+ /*
2968
+ * If the nr_ctx type overflows, we have exceeded the
2969
+ * amount of sw queues we can support.
2970
+ */
2971
+ BUG_ON(!hctx->nr_ctx);
2972
+ }
2973
+
2974
+ for (; j < HCTX_MAX_TYPES; j++)
2975
+ ctx->hctxs[j] = blk_mq_map_queue_type(q,
2976
+ HCTX_TYPE_DEFAULT, i);
23812977 }
23822978
23832979 queue_for_each_hw_ctx(q, hctx, i) {
....@@ -2426,14 +3022,14 @@
24263022
24273023 queue_for_each_hw_ctx(q, hctx, i) {
24283024 if (shared)
2429
- hctx->flags |= BLK_MQ_F_TAG_SHARED;
3025
+ hctx->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
24303026 else
2431
- hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
3027
+ hctx->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
24323028 }
24333029 }
24343030
2435
-static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set,
2436
- bool shared)
3031
+static void blk_mq_update_tag_set_shared(struct blk_mq_tag_set *set,
3032
+ bool shared)
24373033 {
24383034 struct request_queue *q;
24393035
....@@ -2451,12 +3047,12 @@
24513047 struct blk_mq_tag_set *set = q->tag_set;
24523048
24533049 mutex_lock(&set->tag_list_lock);
2454
- list_del_rcu(&q->tag_set_list);
3050
+ list_del(&q->tag_set_list);
24553051 if (list_is_singular(&set->tag_list)) {
24563052 /* just transitioned to unshared */
2457
- set->flags &= ~BLK_MQ_F_TAG_SHARED;
3053
+ set->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
24583054 /* update existing queue */
2459
- blk_mq_update_tag_set_depth(set, false);
3055
+ blk_mq_update_tag_set_shared(set, false);
24603056 }
24613057 mutex_unlock(&set->tag_list_lock);
24623058 INIT_LIST_HEAD(&q->tag_set_list);
....@@ -2465,24 +3061,50 @@
24653061 static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
24663062 struct request_queue *q)
24673063 {
2468
- q->tag_set = set;
2469
-
24703064 mutex_lock(&set->tag_list_lock);
24713065
24723066 /*
24733067 * Check to see if we're transitioning to shared (from 1 to 2 queues).
24743068 */
24753069 if (!list_empty(&set->tag_list) &&
2476
- !(set->flags & BLK_MQ_F_TAG_SHARED)) {
2477
- set->flags |= BLK_MQ_F_TAG_SHARED;
3070
+ !(set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
3071
+ set->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
24783072 /* update existing queue */
2479
- blk_mq_update_tag_set_depth(set, true);
3073
+ blk_mq_update_tag_set_shared(set, true);
24803074 }
2481
- if (set->flags & BLK_MQ_F_TAG_SHARED)
3075
+ if (set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
24823076 queue_set_hctx_shared(q, true);
2483
- list_add_tail_rcu(&q->tag_set_list, &set->tag_list);
3077
+ list_add_tail(&q->tag_set_list, &set->tag_list);
24843078
24853079 mutex_unlock(&set->tag_list_lock);
3080
+}
3081
+
3082
+/* All allocations will be freed in release handler of q->mq_kobj */
3083
+static int blk_mq_alloc_ctxs(struct request_queue *q)
3084
+{
3085
+ struct blk_mq_ctxs *ctxs;
3086
+ int cpu;
3087
+
3088
+ ctxs = kzalloc(sizeof(*ctxs), GFP_KERNEL);
3089
+ if (!ctxs)
3090
+ return -ENOMEM;
3091
+
3092
+ ctxs->queue_ctx = alloc_percpu(struct blk_mq_ctx);
3093
+ if (!ctxs->queue_ctx)
3094
+ goto fail;
3095
+
3096
+ for_each_possible_cpu(cpu) {
3097
+ struct blk_mq_ctx *ctx = per_cpu_ptr(ctxs->queue_ctx, cpu);
3098
+ ctx->ctxs = ctxs;
3099
+ }
3100
+
3101
+ q->mq_kobj = &ctxs->kobj;
3102
+ q->queue_ctx = ctxs->queue_ctx;
3103
+
3104
+ return 0;
3105
+ fail:
3106
+ kfree(ctxs);
3107
+ return -ENOMEM;
24863108 }
24873109
24883110 /*
....@@ -2493,17 +3115,17 @@
24933115 */
24943116 void blk_mq_release(struct request_queue *q)
24953117 {
2496
- struct blk_mq_hw_ctx *hctx;
2497
- unsigned int i;
3118
+ struct blk_mq_hw_ctx *hctx, *next;
3119
+ int i;
24983120
2499
- /* hctx kobj stays in hctx */
2500
- queue_for_each_hw_ctx(q, hctx, i) {
2501
- if (!hctx)
2502
- continue;
3121
+ queue_for_each_hw_ctx(q, hctx, i)
3122
+ WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list));
3123
+
3124
+ /* all hctx are in .unused_hctx_list now */
3125
+ list_for_each_entry_safe(hctx, next, &q->unused_hctx_list, hctx_list) {
3126
+ list_del_init(&hctx->hctx_list);
25033127 kobject_put(&hctx->kobj);
25043128 }
2505
-
2506
- q->mq_map = NULL;
25073129
25083130 kfree(q->queue_hw_ctx);
25093131
....@@ -2512,102 +3134,184 @@
25123134 * both share lifetime with request queue.
25133135 */
25143136 blk_mq_sysfs_deinit(q);
2515
-
2516
- free_percpu(q->queue_ctx);
25173137 }
25183138
2519
-struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
3139
+struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
3140
+ void *queuedata)
25203141 {
25213142 struct request_queue *uninit_q, *q;
25223143
2523
- uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node, NULL);
3144
+ uninit_q = blk_alloc_queue(set->numa_node);
25243145 if (!uninit_q)
25253146 return ERR_PTR(-ENOMEM);
3147
+ uninit_q->queuedata = queuedata;
25263148
2527
- q = blk_mq_init_allocated_queue(set, uninit_q);
3149
+ /*
3150
+ * Initialize the queue without an elevator. device_add_disk() will do
3151
+ * the initialization.
3152
+ */
3153
+ q = blk_mq_init_allocated_queue(set, uninit_q, false);
25283154 if (IS_ERR(q))
25293155 blk_cleanup_queue(uninit_q);
25303156
25313157 return q;
25323158 }
3159
+EXPORT_SYMBOL_GPL(blk_mq_init_queue_data);
3160
+
3161
+struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
3162
+{
3163
+ return blk_mq_init_queue_data(set, NULL);
3164
+}
25333165 EXPORT_SYMBOL(blk_mq_init_queue);
25343166
2535
-static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
3167
+/*
3168
+ * Helper for setting up a queue with mq ops, given queue depth, and
3169
+ * the passed in mq ops flags.
3170
+ */
3171
+struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set,
3172
+ const struct blk_mq_ops *ops,
3173
+ unsigned int queue_depth,
3174
+ unsigned int set_flags)
25363175 {
2537
- int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
3176
+ struct request_queue *q;
3177
+ int ret;
25383178
2539
- BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu),
2540
- __alignof__(struct blk_mq_hw_ctx)) !=
2541
- sizeof(struct blk_mq_hw_ctx));
3179
+ memset(set, 0, sizeof(*set));
3180
+ set->ops = ops;
3181
+ set->nr_hw_queues = 1;
3182
+ set->nr_maps = 1;
3183
+ set->queue_depth = queue_depth;
3184
+ set->numa_node = NUMA_NO_NODE;
3185
+ set->flags = set_flags;
25423186
2543
- if (tag_set->flags & BLK_MQ_F_BLOCKING)
2544
- hw_ctx_size += sizeof(struct srcu_struct);
3187
+ ret = blk_mq_alloc_tag_set(set);
3188
+ if (ret)
3189
+ return ERR_PTR(ret);
25453190
2546
- return hw_ctx_size;
3191
+ q = blk_mq_init_queue(set);
3192
+ if (IS_ERR(q)) {
3193
+ blk_mq_free_tag_set(set);
3194
+ return q;
3195
+ }
3196
+
3197
+ return q;
3198
+}
3199
+EXPORT_SYMBOL(blk_mq_init_sq_queue);
3200
+
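blk_mq_init_sq_queue() above bundles tag-set initialization and queue creation for simple single-queue users. A hedged usage sketch, not from this patch (demo_mq_ops stands for the driver's blk_mq_ops, e.g. one providing .queue_rq; the depth and flags are illustrative):

	static const struct blk_mq_ops demo_mq_ops;	/* hypothetical, .queue_rq etc. */
	static struct blk_mq_tag_set demo_tag_set;	/* must outlive the queue */

	static struct request_queue *demo_create_queue(void)
	{
		struct request_queue *q;

		q = blk_mq_init_sq_queue(&demo_tag_set, &demo_mq_ops, 128,
					 BLK_MQ_F_SHOULD_MERGE);
		if (IS_ERR(q))
			return NULL;	/* the helper already freed the tag set */
		return q;
	}

On teardown the caller still pairs blk_cleanup_queue() with blk_mq_free_tag_set(), since only the failure path above cleans up the set for you.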
3201
+static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx(
3202
+ struct blk_mq_tag_set *set, struct request_queue *q,
3203
+ int hctx_idx, int node)
3204
+{
3205
+ struct blk_mq_hw_ctx *hctx = NULL, *tmp;
3206
+
3207
+ /* reuse dead hctx first */
3208
+ spin_lock(&q->unused_hctx_lock);
3209
+ list_for_each_entry(tmp, &q->unused_hctx_list, hctx_list) {
3210
+ if (tmp->numa_node == node) {
3211
+ hctx = tmp;
3212
+ break;
3213
+ }
3214
+ }
3215
+ if (hctx)
3216
+ list_del_init(&hctx->hctx_list);
3217
+ spin_unlock(&q->unused_hctx_lock);
3218
+
3219
+ if (!hctx)
3220
+ hctx = blk_mq_alloc_hctx(q, set, node);
3221
+ if (!hctx)
3222
+ goto fail;
3223
+
3224
+ if (blk_mq_init_hctx(q, set, hctx, hctx_idx))
3225
+ goto free_hctx;
3226
+
3227
+ return hctx;
3228
+
3229
+ free_hctx:
3230
+ kobject_put(&hctx->kobj);
3231
+ fail:
3232
+ return NULL;
25473233 }
25483234
25493235 static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
25503236 struct request_queue *q)
25513237 {
2552
- int i, j;
3238
+ int i, j, end;
25533239 struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;
25543240
2555
- blk_mq_sysfs_unregister(q);
3241
+ if (q->nr_hw_queues < set->nr_hw_queues) {
3242
+ struct blk_mq_hw_ctx **new_hctxs;
3243
+
3244
+ new_hctxs = kcalloc_node(set->nr_hw_queues,
3245
+ sizeof(*new_hctxs), GFP_KERNEL,
3246
+ set->numa_node);
3247
+ if (!new_hctxs)
3248
+ return;
3249
+ if (hctxs)
3250
+ memcpy(new_hctxs, hctxs, q->nr_hw_queues *
3251
+ sizeof(*hctxs));
3252
+ q->queue_hw_ctx = new_hctxs;
3253
+ kfree(hctxs);
3254
+ hctxs = new_hctxs;
3255
+ }
25563256
25573257 /* protect against switching io scheduler */
25583258 mutex_lock(&q->sysfs_lock);
25593259 for (i = 0; i < set->nr_hw_queues; i++) {
25603260 int node;
3261
+ struct blk_mq_hw_ctx *hctx;
25613262
2562
- if (hctxs[i])
3263
+ node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], i);
3264
+ /*
3265
+ * If the hw queue has been mapped to another numa node,
3266
+ * we need to realloc the hctx. If allocation fails, fallback
3267
+ * to use the previous one.
3268
+ */
3269
+ if (hctxs[i] && (hctxs[i]->numa_node == node))
25633270 continue;
25643271
2565
- node = blk_mq_hw_queue_to_node(q->mq_map, i);
2566
- hctxs[i] = kzalloc_node(blk_mq_hw_ctx_size(set),
2567
- GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
2568
- node);
2569
- if (!hctxs[i])
2570
- break;
2571
-
2572
- if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask,
2573
- GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
2574
- node)) {
2575
- kfree(hctxs[i]);
2576
- hctxs[i] = NULL;
2577
- break;
3272
+ hctx = blk_mq_alloc_and_init_hctx(set, q, i, node);
3273
+ if (hctx) {
3274
+ if (hctxs[i])
3275
+ blk_mq_exit_hctx(q, set, hctxs[i], i);
3276
+ hctxs[i] = hctx;
3277
+ } else {
3278
+ if (hctxs[i])
3279
+ pr_warn("Allocate new hctx on node %d fails,\
3280
+ fallback to previous one on node %d\n",
3281
+ node, hctxs[i]->numa_node);
3282
+ else
3283
+ break;
25783284 }
2579
-
2580
- atomic_set(&hctxs[i]->nr_active, 0);
2581
- hctxs[i]->numa_node = node;
2582
- hctxs[i]->queue_num = i;
2583
-
2584
- if (blk_mq_init_hctx(q, set, hctxs[i], i)) {
2585
- free_cpumask_var(hctxs[i]->cpumask);
2586
- kfree(hctxs[i]);
2587
- hctxs[i] = NULL;
2588
- break;
2589
- }
2590
- blk_mq_hctx_kobj_init(hctxs[i]);
25913285 }
2592
- for (j = i; j < q->nr_hw_queues; j++) {
3286
+ /*
3287
+ * Increasing nr_hw_queues fails. Free the newly allocated
3288
+ * hctxs and keep the previous q->nr_hw_queues.
3289
+ */
3290
+ if (i != set->nr_hw_queues) {
3291
+ j = q->nr_hw_queues;
3292
+ end = i;
3293
+ } else {
3294
+ j = i;
3295
+ end = q->nr_hw_queues;
3296
+ q->nr_hw_queues = set->nr_hw_queues;
3297
+ }
3298
+
3299
+ for (; j < end; j++) {
25933300 struct blk_mq_hw_ctx *hctx = hctxs[j];
25943301
25953302 if (hctx) {
25963303 if (hctx->tags)
25973304 blk_mq_free_map_and_requests(set, j);
25983305 blk_mq_exit_hctx(q, set, hctx, j);
2599
- kobject_put(&hctx->kobj);
26003306 hctxs[j] = NULL;
2601
-
26023307 }
26033308 }
2604
- q->nr_hw_queues = i;
26053309 mutex_unlock(&q->sysfs_lock);
2606
- blk_mq_sysfs_register(q);
26073310 }
26083311
26093312 struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
2610
- struct request_queue *q)
3313
+ struct request_queue *q,
3314
+ bool elevator_init)
26113315 {
26123316 /* mark the queue as mq asap */
26133317 q->mq_ops = set->ops;
....@@ -2618,19 +3322,14 @@
26183322 if (!q->poll_cb)
26193323 goto err_exit;
26203324
2621
- q->queue_ctx = alloc_percpu(struct blk_mq_ctx);
2622
- if (!q->queue_ctx)
2623
- goto err_exit;
3325
+ if (blk_mq_alloc_ctxs(q))
3326
+ goto err_poll;
26243327
26253328 /* init q->mq_kobj and sw queues' kobjects */
26263329 blk_mq_sysfs_init(q);
26273330
2628
- q->queue_hw_ctx = kcalloc_node(nr_cpu_ids, sizeof(*(q->queue_hw_ctx)),
2629
- GFP_KERNEL, set->numa_node);
2630
- if (!q->queue_hw_ctx)
2631
- goto err_percpu;
2632
-
2633
- q->mq_map = set->mq_map;
3331
+ INIT_LIST_HEAD(&q->unused_hctx_list);
3332
+ spin_lock_init(&q->unused_hctx_lock);
26343333
26353334 blk_mq_realloc_hw_ctxs(set, q);
26363335 if (!q->nr_hw_queues)
....@@ -2639,12 +3338,12 @@
26393338 INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
26403339 blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
26413340
2642
- q->nr_queues = nr_cpu_ids;
3341
+ q->tag_set = set;
26433342
26443343 q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
2645
-
2646
- if (!(set->flags & BLK_MQ_F_SG_MERGE))
2647
- queue_flag_set_unlocked(QUEUE_FLAG_NO_SG_MERGE, q);
3344
+ if (set->nr_maps > HCTX_TYPE_POLL &&
3345
+ set->map[HCTX_TYPE_POLL].nr_queues)
3346
+ blk_queue_flag_set(QUEUE_FLAG_POLL, q);
26483347
26493348 q->sg_reserved_size = INT_MAX;
26503349
....@@ -2652,41 +3351,29 @@
26523351 INIT_LIST_HEAD(&q->requeue_list);
26533352 spin_lock_init(&q->requeue_lock);
26543353
2655
- blk_queue_make_request(q, blk_mq_make_request);
2656
- if (q->mq_ops->poll)
2657
- q->poll_fn = blk_mq_poll;
2658
-
2659
- /*
2660
- * Do this after blk_queue_make_request() overrides it...
2661
- */
26623354 q->nr_requests = set->queue_depth;
26633355
26643356 /*
26653357 * Default to classic polling
26663358 */
2667
- q->poll_nsec = -1;
2668
-
2669
- if (set->ops->complete)
2670
- blk_queue_softirq_done(q, set->ops->complete);
3359
+ q->poll_nsec = BLK_MQ_POLL_CLASSIC;
26713360
26723361 blk_mq_init_cpu_queues(q, set->nr_hw_queues);
26733362 blk_mq_add_queue_tag_set(set, q);
26743363 blk_mq_map_swqueue(q);
26753364
2676
- if (!(set->flags & BLK_MQ_F_NO_SCHED)) {
2677
- int ret;
2678
-
2679
- ret = elevator_init_mq(q);
2680
- if (ret)
2681
- return ERR_PTR(ret);
2682
- }
3365
+ if (elevator_init)
3366
+ elevator_init_mq(q);
26833367
26843368 return q;
26853369
26863370 err_hctxs:
26873371 kfree(q->queue_hw_ctx);
2688
-err_percpu:
2689
- free_percpu(q->queue_ctx);
3372
+ q->nr_hw_queues = 0;
3373
+ blk_mq_sysfs_deinit(q);
3374
+err_poll:
3375
+ blk_stat_free_callback(q->poll_cb);
3376
+ q->poll_cb = NULL;
26903377 err_exit:
26913378 q->mq_ops = NULL;
26923379 return ERR_PTR(-ENOMEM);
....@@ -2704,38 +3391,21 @@
27043391 blk_mq_del_queue_tag_set(q);
27053392 }
27063393
2707
-/* Basically redo blk_mq_init_queue with queue frozen */
2708
-static void blk_mq_queue_reinit(struct request_queue *q)
2709
-{
2710
- WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));
2711
-
2712
- blk_mq_debugfs_unregister_hctxs(q);
2713
- blk_mq_sysfs_unregister(q);
2714
-
2715
- /*
2716
- * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
2717
- * we should change hctx numa_node according to the new topology (this
2718
- * involves freeing and re-allocating memory, worth doing?)
2719
- */
2720
- blk_mq_map_swqueue(q);
2721
-
2722
- blk_mq_sysfs_register(q);
2723
- blk_mq_debugfs_register_hctxs(q);
2724
-}
2725
-
27263394 static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
27273395 {
27283396 int i;
27293397
2730
- for (i = 0; i < set->nr_hw_queues; i++)
2731
- if (!__blk_mq_alloc_rq_map(set, i))
3398
+ for (i = 0; i < set->nr_hw_queues; i++) {
3399
+ if (!__blk_mq_alloc_map_and_request(set, i))
27323400 goto out_unwind;
3401
+ cond_resched();
3402
+ }
27333403
27343404 return 0;
27353405
27363406 out_unwind:
27373407 while (--i >= 0)
2738
- blk_mq_free_rq_map(set->tags[i]);
3408
+ blk_mq_free_map_and_requests(set, i);
27393409
27403410 return -ENOMEM;
27413411 }
....@@ -2745,7 +3415,7 @@
27453415 * may reduce the depth asked for, if memory is tight. set->queue_depth
27463416 * will be updated to reflect the allocated depth.
27473417 */
2748
-static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
3418
+static int blk_mq_alloc_map_and_requests(struct blk_mq_tag_set *set)
27493419 {
27503420 unsigned int depth;
27513421 int err;
....@@ -2777,7 +3447,17 @@
27773447
27783448 static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
27793449 {
2780
- if (set->ops->map_queues) {
3450
+ /*
3451
+ * blk_mq_map_queues() and multiple .map_queues() implementations
3452
+ * expect that set->map[HCTX_TYPE_DEFAULT].nr_queues is set to the
3453
+ * number of hardware queues.
3454
+ */
3455
+ if (set->nr_maps == 1)
3456
+ set->map[HCTX_TYPE_DEFAULT].nr_queues = set->nr_hw_queues;
3457
+
3458
+ if (set->ops->map_queues && !is_kdump_kernel()) {
3459
+ int i;
3460
+
27813461 /*
27823462 * transport .map_queues is usually done in the following
27833463 * way:
....@@ -2785,18 +3465,44 @@
27853465 * for (queue = 0; queue < set->nr_hw_queues; queue++) {
27863466 * mask = get_cpu_mask(queue)
27873467 * for_each_cpu(cpu, mask)
2788
- * set->mq_map[cpu] = queue;
3468
+ * set->map[x].mq_map[cpu] = queue;
27893469 * }
27903470 *
27913471 * When we need to remap, the table has to be cleared for
27923472 * killing stale mapping since one CPU may not be mapped
27933473 * to any hw queue.
27943474 */
2795
- blk_mq_clear_mq_map(set);
3475
+ for (i = 0; i < set->nr_maps; i++)
3476
+ blk_mq_clear_mq_map(&set->map[i]);
27963477
27973478 return set->ops->map_queues(set);
2798
- } else
2799
- return blk_mq_map_queues(set);
3479
+ } else {
3480
+ BUG_ON(set->nr_maps > 1);
3481
+ return blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
3482
+ }
3483
+}
3484
+
3485
+static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set,
3486
+ int cur_nr_hw_queues, int new_nr_hw_queues)
3487
+{
3488
+ struct blk_mq_tags **new_tags;
3489
+
3490
+ if (cur_nr_hw_queues >= new_nr_hw_queues)
3491
+ return 0;
3492
+
3493
+ new_tags = kcalloc_node(new_nr_hw_queues, sizeof(struct blk_mq_tags *),
3494
+ GFP_KERNEL, set->numa_node);
3495
+ if (!new_tags)
3496
+ return -ENOMEM;
3497
+
3498
+ if (set->tags)
3499
+ memcpy(new_tags, set->tags, cur_nr_hw_queues *
3500
+ sizeof(*set->tags));
3501
+ kfree(set->tags);
3502
+ set->tags = new_tags;
3503
+ set->nr_hw_queues = new_nr_hw_queues;
3504
+
3505
+ return 0;
28003506 }
28013507
28023508 /*
....@@ -2807,7 +3513,7 @@
28073513 */
28083514 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
28093515 {
2810
- int ret;
3516
+ int i, ret;
28113517
28123518 BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);
28133519
....@@ -2830,6 +3536,11 @@
28303536 set->queue_depth = BLK_MQ_MAX_DEPTH;
28313537 }
28323538
3539
+ if (!set->nr_maps)
3540
+ set->nr_maps = 1;
3541
+ else if (set->nr_maps > HCTX_MAX_TYPES)
3542
+ return -EINVAL;
3543
+
28333544 /*
28343545 * If a crashdump is active, then we are potentially in a very
28353546 * memory constrained environment. Limit us to 1 queue and
....@@ -2837,42 +3548,59 @@
28373548 */
28383549 if (is_kdump_kernel()) {
28393550 set->nr_hw_queues = 1;
3551
+ set->nr_maps = 1;
28403552 set->queue_depth = min(64U, set->queue_depth);
28413553 }
28423554 /*
2843
- * There is no use for more h/w queues than cpus.
3555
+ * There is no use for more h/w queues than cpus if we just have
3556
+ * a single map.
28443557 */
2845
- if (set->nr_hw_queues > nr_cpu_ids)
3558
+ if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids)
28463559 set->nr_hw_queues = nr_cpu_ids;
28473560
2848
- set->tags = kcalloc_node(nr_cpu_ids, sizeof(struct blk_mq_tags *),
2849
- GFP_KERNEL, set->numa_node);
2850
- if (!set->tags)
3561
+ if (blk_mq_realloc_tag_set_tags(set, 0, set->nr_hw_queues) < 0)
28513562 return -ENOMEM;
28523563
28533564 ret = -ENOMEM;
2854
- set->mq_map = kcalloc_node(nr_cpu_ids, sizeof(*set->mq_map),
2855
- GFP_KERNEL, set->numa_node);
2856
- if (!set->mq_map)
2857
- goto out_free_tags;
3565
+ for (i = 0; i < set->nr_maps; i++) {
3566
+ set->map[i].mq_map = kcalloc_node(nr_cpu_ids,
3567
+ sizeof(set->map[i].mq_map[0]),
3568
+ GFP_KERNEL, set->numa_node);
3569
+ if (!set->map[i].mq_map)
3570
+ goto out_free_mq_map;
3571
+ set->map[i].nr_queues = is_kdump_kernel() ? 1 : set->nr_hw_queues;
3572
+ }
28583573
28593574 ret = blk_mq_update_queue_map(set);
28603575 if (ret)
28613576 goto out_free_mq_map;
28623577
2863
- ret = blk_mq_alloc_rq_maps(set);
3578
+ ret = blk_mq_alloc_map_and_requests(set);
28643579 if (ret)
28653580 goto out_free_mq_map;
3581
+
3582
+ if (blk_mq_is_sbitmap_shared(set->flags)) {
3583
+ atomic_set(&set->active_queues_shared_sbitmap, 0);
3584
+
3585
+ if (blk_mq_init_shared_sbitmap(set, set->flags)) {
3586
+ ret = -ENOMEM;
3587
+ goto out_free_mq_rq_maps;
3588
+ }
3589
+ }
28663590
28673591 mutex_init(&set->tag_list_lock);
28683592 INIT_LIST_HEAD(&set->tag_list);
28693593
28703594 return 0;
28713595
3596
+out_free_mq_rq_maps:
3597
+ for (i = 0; i < set->nr_hw_queues; i++)
3598
+ blk_mq_free_map_and_requests(set, i);
28723599 out_free_mq_map:
2873
- kfree(set->mq_map);
2874
- set->mq_map = NULL;
2875
-out_free_tags:
3600
+ for (i = 0; i < set->nr_maps; i++) {
3601
+ kfree(set->map[i].mq_map);
3602
+ set->map[i].mq_map = NULL;
3603
+ }
28763604 kfree(set->tags);
28773605 set->tags = NULL;
28783606 return ret;
....@@ -2881,13 +3609,18 @@
28813609
28823610 void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
28833611 {
2884
- int i;
3612
+ int i, j;
28853613
2886
- for (i = 0; i < nr_cpu_ids; i++)
3614
+ for (i = 0; i < set->nr_hw_queues; i++)
28873615 blk_mq_free_map_and_requests(set, i);
28883616
2889
- kfree(set->mq_map);
2890
- set->mq_map = NULL;
3617
+ if (blk_mq_is_sbitmap_shared(set->flags))
3618
+ blk_mq_exit_shared_sbitmap(set);
3619
+
3620
+ for (j = 0; j < set->nr_maps; j++) {
3621
+ kfree(set->map[j].mq_map);
3622
+ set->map[j].mq_map = NULL;
3623
+ }
28913624
28923625 kfree(set->tags);
28933626 set->tags = NULL;
....@@ -2903,6 +3636,9 @@
29033636 if (!set)
29043637 return -EINVAL;
29053638
3639
+ if (q->nr_requests == nr)
3640
+ return 0;
3641
+
29063642 blk_mq_freeze_queue(q);
29073643 blk_mq_quiesce_queue(q);
29083644
....@@ -2917,14 +3653,16 @@
29173653 if (!hctx->sched_tags) {
29183654 ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr,
29193655 false);
3656
+ if (!ret && blk_mq_is_sbitmap_shared(set->flags))
3657
+ blk_mq_tag_resize_shared_sbitmap(set, nr);
29203658 } else {
29213659 ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
29223660 nr, true);
29233661 }
29243662 if (ret)
29253663 break;
2926
- if (q->elevator && q->elevator->type->ops.mq.depth_updated)
2927
- q->elevator->type->ops.mq.depth_updated(hctx);
3664
+ if (q->elevator && q->elevator->type->ops.depth_updated)
3665
+ q->elevator->type->ops.depth_updated(hctx);
29283666 }
29293667
29303668 if (!ret)
....@@ -3011,20 +3749,19 @@
30113749 {
30123750 struct request_queue *q;
30133751 LIST_HEAD(head);
3752
+ int prev_nr_hw_queues;
30143753
30153754 lockdep_assert_held(&set->tag_list_lock);
30163755
3017
- if (nr_hw_queues > nr_cpu_ids)
3756
+ if (set->nr_maps == 1 && nr_hw_queues > nr_cpu_ids)
30183757 nr_hw_queues = nr_cpu_ids;
3019
- if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues)
3758
+ if (nr_hw_queues < 1)
3759
+ return;
3760
+ if (set->nr_maps == 1 && nr_hw_queues == set->nr_hw_queues)
30203761 return;
30213762
30223763 list_for_each_entry(q, &set->tag_list, tag_set_list)
30233764 blk_mq_freeze_queue(q);
3024
- /*
3025
- * Sync with blk_mq_queue_tag_busy_iter.
3026
- */
3027
- synchronize_rcu();
30283765 /*
30293766 * Switch IO scheduler to 'none', cleaning up the data associated
30303767 * with the previous scheduler. We will switch back once we are done
....@@ -3034,11 +3771,35 @@
30343771 if (!blk_mq_elv_switch_none(&head, q))
30353772 goto switch_back;
30363773
3774
+ list_for_each_entry(q, &set->tag_list, tag_set_list) {
3775
+ blk_mq_debugfs_unregister_hctxs(q);
3776
+ blk_mq_sysfs_unregister(q);
3777
+ }
3778
+
3779
+ prev_nr_hw_queues = set->nr_hw_queues;
3780
+ if (blk_mq_realloc_tag_set_tags(set, set->nr_hw_queues, nr_hw_queues) <
3781
+ 0)
3782
+ goto reregister;
3783
+
30373784 set->nr_hw_queues = nr_hw_queues;
3785
+fallback:
30383786 blk_mq_update_queue_map(set);
30393787 list_for_each_entry(q, &set->tag_list, tag_set_list) {
30403788 blk_mq_realloc_hw_ctxs(set, q);
3041
- blk_mq_queue_reinit(q);
3789
+ if (q->nr_hw_queues != set->nr_hw_queues) {
3790
+ pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n",
3791
+ nr_hw_queues, prev_nr_hw_queues);
3792
+ set->nr_hw_queues = prev_nr_hw_queues;
3793
+ blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
3794
+ goto fallback;
3795
+ }
3796
+ blk_mq_map_swqueue(q);
3797
+ }
3798
+
3799
+reregister:
3800
+ list_for_each_entry(q, &set->tag_list, tag_set_list) {
3801
+ blk_mq_sysfs_register(q);
3802
+ blk_mq_debugfs_register_hctxs(q);
30423803 }
30433804
30443805 switch_back:
....@@ -3092,7 +3853,6 @@
30923853 }
30933854
30943855 static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
3095
- struct blk_mq_hw_ctx *hctx,
30963856 struct request *rq)
30973857 {
30983858 unsigned long ret = 0;
....@@ -3125,7 +3885,6 @@
31253885 }
31263886
31273887 static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
3128
- struct blk_mq_hw_ctx *hctx,
31293888 struct request *rq)
31303889 {
31313890 struct hrtimer_sleeper hs;
....@@ -3137,18 +3896,15 @@
31373896 return false;
31383897
31393898 /*
3140
- * poll_nsec can be:
3899
+ * If we get here, hybrid polling is enabled. Hence poll_nsec can be:
31413900 *
3142
- * -1: don't ever hybrid sleep
31433901 * 0: use half of prev avg
31443902 * >0: use this specific value
31453903 */
3146
- if (q->poll_nsec == -1)
3147
- return false;
3148
- else if (q->poll_nsec > 0)
3904
+ if (q->poll_nsec > 0)
31493905 nsecs = q->poll_nsec;
31503906 else
3151
- nsecs = blk_mq_poll_nsecs(q, hctx, rq);
3907
+ nsecs = blk_mq_poll_nsecs(q, rq);
31523908
31533909 if (!nsecs)
31543910 return false;
....@@ -3162,14 +3918,14 @@
31623918 kt = nsecs;
31633919
31643920 mode = HRTIMER_MODE_REL;
3165
- hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, mode, current);
3921
+ hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, mode);
31663922 hrtimer_set_expires(&hs.timer, kt);
31673923
31683924 do {
31693925 if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE)
31703926 break;
31713927 set_current_state(TASK_UNINTERRUPTIBLE);
3172
- hrtimer_start_expires(&hs.timer, mode);
3928
+ hrtimer_sleeper_start_expires(&hs, mode);
31733929 if (hs.task)
31743930 io_schedule();
31753931 hrtimer_cancel(&hs.timer);
....@@ -3181,59 +3937,14 @@
31813937 return true;
31823938 }
31833939
3184
-static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq)
3940
+static bool blk_mq_poll_hybrid(struct request_queue *q,
3941
+ struct blk_mq_hw_ctx *hctx, blk_qc_t cookie)
31853942 {
3186
- struct request_queue *q = hctx->queue;
3187
- long state;
3188
-
3189
- /*
3190
- * If we sleep, have the caller restart the poll loop to reset
3191
- * the state. Like for the other success return cases, the
3192
- * caller is responsible for checking if the IO completed. If
3193
- * the IO isn't complete, we'll get called again and will go
3194
- * straight to the busy poll loop.
3195
- */
3196
- if (blk_mq_poll_hybrid_sleep(q, hctx, rq))
3197
- return true;
3198
-
3199
- hctx->poll_considered++;
3200
-
3201
- state = current->state;
3202
- while (!need_resched()) {
3203
- int ret;
3204
-
3205
- hctx->poll_invoked++;
3206
-
3207
- ret = q->mq_ops->poll(hctx, rq->tag);
3208
- if (ret > 0) {
3209
- hctx->poll_success++;
3210
- set_current_state(TASK_RUNNING);
3211
- return true;
3212
- }
3213
-
3214
- if (signal_pending_state(state, current))
3215
- set_current_state(TASK_RUNNING);
3216
-
3217
- if (current->state == TASK_RUNNING)
3218
- return true;
3219
- if (ret < 0)
3220
- break;
3221
- cpu_relax();
3222
- }
3223
-
3224
- __set_current_state(TASK_RUNNING);
3225
- return false;
3226
-}
3227
-
3228
-static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
3229
-{
3230
- struct blk_mq_hw_ctx *hctx;
32313943 struct request *rq;
32323944
3233
- if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
3945
+ if (q->poll_nsec == BLK_MQ_POLL_CLASSIC)
32343946 return false;
32353947
3236
- hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
32373948 if (!blk_qc_t_is_internal(cookie))
32383949 rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
32393950 else {
....@@ -3248,13 +3959,97 @@
32483959 return false;
32493960 }
32503961
3251
- return __blk_mq_poll(hctx, rq);
3962
+ return blk_mq_poll_hybrid_sleep(q, rq);
32523963 }
3964
+
3965
+/**
3966
+ * blk_poll - poll for IO completions
3967
+ * @q: the queue
3968
+ * @cookie: cookie passed back at IO submission time
3969
+ * @spin: whether to spin for completions
3970
+ *
3971
+ * Description:
3972
+ * Poll for completions on the passed in queue. Returns number of
3973
+ * completed entries found. If @spin is true, then blk_poll will continue
3974
+ * looping until at least one completion is found, unless the task is
3975
+ * otherwise marked running (or we need to reschedule).
3976
+ */
3977
+int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
3978
+{
3979
+ struct blk_mq_hw_ctx *hctx;
3980
+ long state;
3981
+
3982
+ if (!blk_qc_t_valid(cookie) ||
3983
+ !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
3984
+ return 0;
3985
+
3986
+ if (current->plug)
3987
+ blk_flush_plug_list(current->plug, false);
3988
+
3989
+ hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
3990
+
3991
+ /*
3992
+ * If we sleep, have the caller restart the poll loop to reset
3993
+ * the state. Like for the other success return cases, the
3994
+ * caller is responsible for checking if the IO completed. If
3995
+ * the IO isn't complete, we'll get called again and will go
3996
+ * straight to the busy poll loop.
3997
+ */
3998
+ if (blk_mq_poll_hybrid(q, hctx, cookie))
3999
+ return 1;
4000
+
4001
+ hctx->poll_considered++;
4002
+
4003
+ state = current->state;
4004
+ do {
4005
+ int ret;
4006
+
4007
+ hctx->poll_invoked++;
4008
+
4009
+ ret = q->mq_ops->poll(hctx);
4010
+ if (ret > 0) {
4011
+ hctx->poll_success++;
4012
+ __set_current_state(TASK_RUNNING);
4013
+ return ret;
4014
+ }
4015
+
4016
+ if (signal_pending_state(state, current))
4017
+ __set_current_state(TASK_RUNNING);
4018
+
4019
+ if (current->state == TASK_RUNNING)
4020
+ return 1;
4021
+ if (ret < 0 || !spin)
4022
+ break;
4023
+ cpu_relax();
4024
+ } while (!need_resched());
4025
+
4026
+ __set_current_state(TASK_RUNNING);
4027
+ return 0;
4028
+}
4029
+EXPORT_SYMBOL_GPL(blk_poll);
4030
+
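A caller-side sketch of the contract spelled out in the blk_poll() kernel-doc above, modelled on how synchronous polled direct I/O typically drives it; the done flag is hypothetical and stands in for whatever the caller's bio end_io handler sets.

/* Sketch: wait for a polled bio, alternating blk_poll() with real sleeps. */
static void poll_wait_for_bio(struct request_queue *q, blk_qc_t cookie,
			      bool *done)
{
	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (READ_ONCE(*done))
			break;
		/* blk_poll() made no progress: give up the CPU until the
		 * end_io handler wakes us instead. */
		if (blk_poll(q, cookie, true) <= 0)
			io_schedule();
	}
	__set_current_state(TASK_RUNNING);
}

Returning the number of reaped completions, rather than a bool, is what lets spinning callers tell "found work" apart from "spun without progress".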
4031
+unsigned int blk_mq_rq_cpu(struct request *rq)
4032
+{
4033
+ return rq->mq_ctx->cpu;
4034
+}
4035
+EXPORT_SYMBOL(blk_mq_rq_cpu);
32544037 static int __init blk_mq_init(void)
32554038 {
4039
+ int i;
4040
+
4041
+ for_each_possible_cpu(i)
4042
+ INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
4043
+ open_softirq(BLOCK_SOFTIRQ, blk_done_softirq);
4044
+
4045
+ cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD,
4046
+ "block/softirq:dead", NULL,
4047
+ blk_softirq_cpu_dead);
32564048 cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
32574049 blk_mq_hctx_notify_dead);
4050
+ cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online",
4051
+ blk_mq_hctx_notify_online,
4052
+ blk_mq_hctx_notify_offline);
32584053 return 0;
32594054 }
32604055 subsys_initcall(blk_mq_init);
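One note on the two *_multi hotplug states registered above: they are per-instance, so each consumer embeds an hlist_node, attaches it to the state, and the dead/online callbacks recover the owning object from that node (the per-CPU blk_cpu_done lists initialised here feed the BLOCK_SOFTIRQ completion handler). A generic sketch of that instance pairing; the structure, field and callback names are illustrative.

#include <linux/cpuhotplug.h>
#include <linux/list.h>

/* Illustrative per-hardware-context object with its hotplug hook. */
struct my_hw_ctx {
	struct hlist_node cpuhp_dead;
	/* ... per-context resources ... */
};

/* Callback shape expected by cpuhp_setup_state_multi(): called once per
 * registered instance when a CPU goes away. */
static int my_hw_ctx_notify_dead(unsigned int cpu, struct hlist_node *node)
{
	struct my_hw_ctx *ctx = hlist_entry_safe(node, struct my_hw_ctx,
						 cpuhp_dead);

	/* Drain or requeue whatever was parked on the now-dead CPU. */
	(void)ctx;
	return 0;
}

/*
 * Each instance is attached when its context is created, e.g.:
 *
 *	cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &ctx->cpuhp_dead);
 *
 * and detached again with cpuhp_state_remove_instance_nocalls() on teardown.
 */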