2023-12-11 d2ccde1c8e90d38cee87a1b0309ad2827f3fd30d
kernel/block/blk-mq.c
@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0
12 /*
23 * Block multiqueue core code
34 *
@@ -25,30 +26,36 @@
2526 #include <linux/delay.h>
2627 #include <linux/crash_dump.h>
2728 #include <linux/prefetch.h>
29
+#include <linux/blk-crypto.h>
2830
2931 #include <trace/events/block.h>
3032
3133 #include <linux/blk-mq.h>
34
+#include <linux/t10-pi.h>
3235 #include "blk.h"
3336 #include "blk-mq.h"
3437 #include "blk-mq-debugfs.h"
3538 #include "blk-mq-tag.h"
39
+#include "blk-pm.h"
3640 #include "blk-stat.h"
3741 #include "blk-mq-sched.h"
3842 #include "blk-rq-qos.h"
3943
40
-static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie);
44
+#include <trace/hooks/block.h>
45
+
46
+static DEFINE_PER_CPU(struct llist_head, blk_cpu_done);
47
+
4148 static void blk_mq_poll_stats_start(struct request_queue *q);
4249 static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
4350
4451 static int blk_mq_poll_stats_bkt(const struct request *rq)
4552 {
46
- int ddir, bytes, bucket;
53
+ int ddir, sectors, bucket;
4754
4855 ddir = rq_data_dir(rq);
49
- bytes = blk_rq_bytes(rq);
56
+ sectors = blk_rq_stats_sectors(rq);
5057
51
- bucket = ddir + 2*(ilog2(bytes) - 9);
58
+ bucket = ddir + 2 * ilog2(sectors);
5259
5360 if (bucket < 0)
5461 return -1;
@@ -59,7 +66,8 @@
5966 }
6067
6168 /*
62
- * Check if any of the ctx's have pending work in this hardware queue
69
+ * Check if any of the ctx, dispatch list or elevator
70
+ * have pending work in this hardware queue.
6371 */
6472 static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
6573 {
@@ -74,75 +82,67 @@
7482 static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
7583 struct blk_mq_ctx *ctx)
7684 {
77
- if (!sbitmap_test_bit(&hctx->ctx_map, ctx->index_hw))
78
- sbitmap_set_bit(&hctx->ctx_map, ctx->index_hw);
85
+ const int bit = ctx->index_hw[hctx->type];
86
+
87
+ if (!sbitmap_test_bit(&hctx->ctx_map, bit))
88
+ sbitmap_set_bit(&hctx->ctx_map, bit);
7989 }
8090
8191 static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
8292 struct blk_mq_ctx *ctx)
8393 {
84
- sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw);
94
+ const int bit = ctx->index_hw[hctx->type];
95
+
96
+ sbitmap_clear_bit(&hctx->ctx_map, bit);
8597 }
8698
8799 struct mq_inflight {
88100 struct hd_struct *part;
89
- unsigned int *inflight;
101
+ unsigned int inflight[2];
90102 };
91103
92
-static void blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
104
+static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
93105 struct request *rq, void *priv,
94106 bool reserved)
95107 {
96108 struct mq_inflight *mi = priv;
97109
98
- /*
99
- * index[0] counts the specific partition that was asked for. index[1]
100
- * counts the ones that are active on the whole device, so increment
101
- * that if mi->part is indeed a partition, and not a whole device.
102
- */
103
- if (rq->part == mi->part)
104
- mi->inflight[0]++;
105
- if (mi->part->partno)
106
- mi->inflight[1]++;
107
-}
108
-
109
-void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part,
110
- unsigned int inflight[2])
111
-{
112
- struct mq_inflight mi = { .part = part, .inflight = inflight, };
113
-
114
- inflight[0] = inflight[1] = 0;
115
- blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
116
-}
117
-
118
-static void blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx,
119
- struct request *rq, void *priv,
120
- bool reserved)
121
-{
122
- struct mq_inflight *mi = priv;
123
-
124
- if (rq->part == mi->part)
110
+ if ((!mi->part->partno || rq->part == mi->part) &&
111
+ blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT)
125112 mi->inflight[rq_data_dir(rq)]++;
113
+
114
+ return true;
115
+}
116
+
117
+unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part)
118
+{
119
+ struct mq_inflight mi = { .part = part };
120
+
121
+ blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
122
+
123
+ return mi.inflight[0] + mi.inflight[1];
126124 }
127125
128126 void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part,
129127 unsigned int inflight[2])
130128 {
131
- struct mq_inflight mi = { .part = part, .inflight = inflight, };
129
+ struct mq_inflight mi = { .part = part };
132130
133
- inflight[0] = inflight[1] = 0;
134
- blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight_rw, &mi);
131
+ blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
132
+ inflight[0] = mi.inflight[0];
133
+ inflight[1] = mi.inflight[1];
135134 }
136135
137136 void blk_freeze_queue_start(struct request_queue *q)
138137 {
139
- int freeze_depth;
140
-
141
- freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
142
- if (freeze_depth == 1) {
138
+ mutex_lock(&q->mq_freeze_lock);
139
+ if (++q->mq_freeze_depth == 1) {
143140 percpu_ref_kill(&q->q_usage_counter);
144
- if (q->mq_ops)
141
+ mutex_unlock(&q->mq_freeze_lock);
142
+ if (queue_is_mq(q))
145143 blk_mq_run_hw_queues(q, false);
144
+ } else {
145
+ mutex_unlock(&q->mq_freeze_lock);
146146 }
147147 }
148148 EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
@@ -176,8 +176,6 @@
176176 * exported to drivers as the only user for unfreeze is blk_mq.
177177 */
178178 blk_freeze_queue_start(q);
179
- if (!q->mq_ops)
180
- blk_drain_queue(q);
181179 blk_mq_freeze_queue_wait(q);
182180 }
183181
@@ -193,14 +191,14 @@
193191
194192 void blk_mq_unfreeze_queue(struct request_queue *q)
195193 {
196
- int freeze_depth;
197
-
198
- freeze_depth = atomic_dec_return(&q->mq_freeze_depth);
199
- WARN_ON_ONCE(freeze_depth < 0);
200
- if (!freeze_depth) {
201
- percpu_ref_reinit(&q->q_usage_counter);
194
+ mutex_lock(&q->mq_freeze_lock);
195
+ q->mq_freeze_depth--;
196
+ WARN_ON_ONCE(q->mq_freeze_depth < 0);
197
+ if (!q->mq_freeze_depth) {
198
+ percpu_ref_resurrect(&q->q_usage_counter);
202199 wake_up_all(&q->mq_freeze_wq);
203200 }
201
+ mutex_unlock(&q->mq_freeze_lock);
204202 }
205203 EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
206204
@@ -268,40 +266,37 @@
268266 blk_mq_tag_wakeup_all(hctx->tags, true);
269267 }
270268
271
-bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
269
+/*
270
+ * Only need start/end time stamping if we have iostat or
271
+ * blk stats enabled, or using an IO scheduler.
272
+ */
273
+static inline bool blk_mq_need_time_stamp(struct request *rq)
272274 {
273
- return blk_mq_has_free_tags(hctx->tags);
275
+ return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS)) || rq->q->elevator;
274276 }
275
-EXPORT_SYMBOL(blk_mq_can_queue);
276277
277278 static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
278
- unsigned int tag, unsigned int op)
279
+ unsigned int tag, u64 alloc_time_ns)
279280 {
280281 struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
281282 struct request *rq = tags->static_rqs[tag];
282
- req_flags_t rq_flags = 0;
283283
284
- if (data->flags & BLK_MQ_REQ_INTERNAL) {
285
- rq->tag = -1;
284
+ if (data->q->elevator) {
285
+ rq->tag = BLK_MQ_NO_TAG;
286286 rq->internal_tag = tag;
287287 } else {
288
- if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) {
289
- rq_flags = RQF_MQ_INFLIGHT;
290
- atomic_inc(&data->hctx->nr_active);
291
- }
292288 rq->tag = tag;
293
- rq->internal_tag = -1;
294
- data->hctx->tags->rqs[rq->tag] = rq;
289
+ rq->internal_tag = BLK_MQ_NO_TAG;
295290 }
296291
297292 /* csd/requeue_work/fifo_time is initialized before use */
298293 rq->q = data->q;
299294 rq->mq_ctx = data->ctx;
300
- rq->rq_flags = rq_flags;
301
- rq->cpu = -1;
302
- rq->cmd_flags = op;
303
- if (data->flags & BLK_MQ_REQ_PREEMPT)
304
- rq->rq_flags |= RQF_PREEMPT;
295
+ rq->mq_hctx = data->hctx;
296
+ rq->rq_flags = 0;
297
+ rq->cmd_flags = data->cmd_flags;
298
+ if (data->flags & BLK_MQ_REQ_PM)
299
+ rq->rq_flags |= RQF_PM;
305300 if (blk_queue_io_stat(data->q))
306301 rq->rq_flags |= RQF_IO_STAT;
307302 INIT_LIST_HEAD(&rq->queuelist);
@@ -309,100 +304,110 @@
309304 RB_CLEAR_NODE(&rq->rb_node);
310305 rq->rq_disk = NULL;
311306 rq->part = NULL;
312
- rq->start_time_ns = ktime_get_ns();
307
+#ifdef CONFIG_BLK_RQ_ALLOC_TIME
308
+ rq->alloc_time_ns = alloc_time_ns;
309
+#endif
310
+ if (blk_mq_need_time_stamp(rq))
311
+ rq->start_time_ns = ktime_get_ns();
312
+ else
313
+ rq->start_time_ns = 0;
313314 rq->io_start_time_ns = 0;
315
+ rq->stats_sectors = 0;
314316 rq->nr_phys_segments = 0;
315317 #if defined(CONFIG_BLK_DEV_INTEGRITY)
316318 rq->nr_integrity_segments = 0;
317319 #endif
318
- rq->special = NULL;
320
+ blk_crypto_rq_set_defaults(rq);
319321 /* tag was already set */
320
- rq->extra_len = 0;
321
- rq->__deadline = 0;
322
+ WRITE_ONCE(rq->deadline, 0);
322323
323
-#ifdef CONFIG_PREEMPT_RT_FULL
324
- INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work);
325
-#endif
326
- INIT_LIST_HEAD(&rq->timeout_list);
327324 rq->timeout = 0;
328325
329326 rq->end_io = NULL;
330327 rq->end_io_data = NULL;
331
- rq->next_rq = NULL;
332328
333
-#ifdef CONFIG_BLK_CGROUP
334
- rq->rl = NULL;
335
-#endif
336
-
337
- data->ctx->rq_dispatched[op_is_sync(op)]++;
329
+ data->ctx->rq_dispatched[op_is_sync(data->cmd_flags)]++;
338330 refcount_set(&rq->ref, 1);
331
+
332
+ if (!op_is_flush(data->cmd_flags)) {
333
+ struct elevator_queue *e = data->q->elevator;
334
+
335
+ rq->elv.icq = NULL;
336
+ if (e && e->type->ops.prepare_request) {
337
+ if (e->type->icq_cache)
338
+ blk_mq_sched_assign_ioc(rq);
339
+
340
+ e->type->ops.prepare_request(rq);
341
+ rq->rq_flags |= RQF_ELVPRIV;
342
+ }
343
+ }
344
+
345
+ data->hctx->queued++;
346
+ trace_android_vh_blk_rq_ctx_init(rq, tags, data, alloc_time_ns);
339347 return rq;
340348 }
341349
342
-static struct request *blk_mq_get_request(struct request_queue *q,
343
- struct bio *bio, unsigned int op,
344
- struct blk_mq_alloc_data *data)
350
+static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data)
345351 {
352
+ struct request_queue *q = data->q;
346353 struct elevator_queue *e = q->elevator;
347
- struct request *rq;
354
+ u64 alloc_time_ns = 0;
348355 unsigned int tag;
349
- bool put_ctx_on_error = false;
350356
351
- blk_queue_enter_live(q);
352
- data->q = q;
353
- if (likely(!data->ctx)) {
354
- data->ctx = blk_mq_get_ctx(q);
355
- put_ctx_on_error = true;
356
- }
357
- if (likely(!data->hctx))
358
- data->hctx = blk_mq_map_queue(q, data->ctx->cpu);
359
- if (op & REQ_NOWAIT)
357
+ /* alloc_time includes depth and tag waits */
358
+ if (blk_queue_rq_alloc_time(q))
359
+ alloc_time_ns = ktime_get_ns();
360
+
361
+ if (data->cmd_flags & REQ_NOWAIT)
360362 data->flags |= BLK_MQ_REQ_NOWAIT;
361363
362364 if (e) {
363
- data->flags |= BLK_MQ_REQ_INTERNAL;
364
-
365365 /*
366366 * Flush requests are special and go directly to the
367367 * dispatch list. Don't include reserved tags in the
368368 * limiting, as it isn't useful.
369369 */
370
- if (!op_is_flush(op) && e->type->ops.mq.limit_depth &&
370
+ if (!op_is_flush(data->cmd_flags) &&
371
+ e->type->ops.limit_depth &&
371372 !(data->flags & BLK_MQ_REQ_RESERVED))
372
- e->type->ops.mq.limit_depth(op, data);
373
- } else {
373
+ e->type->ops.limit_depth(data->cmd_flags, data);
374
+ }
375
+
376
+retry:
377
+ data->ctx = blk_mq_get_ctx(q);
378
+ data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx);
379
+ if (!e)
374380 blk_mq_tag_busy(data->hctx);
375
- }
376381
382
+ /*
383
+ * Waiting allocations only fail because of an inactive hctx. In that
384
+ * case just retry the hctx assignment and tag allocation as CPU hotplug
385
+ * should have migrated us to an online CPU by now.
386
+ */
377387 tag = blk_mq_get_tag(data);
378
- if (tag == BLK_MQ_TAG_FAIL) {
379
- if (put_ctx_on_error) {
380
- blk_mq_put_ctx(data->ctx);
381
- data->ctx = NULL;
382
- }
383
- blk_queue_exit(q);
384
- return NULL;
385
- }
388
+ if (tag == BLK_MQ_NO_TAG) {
389
+ if (data->flags & BLK_MQ_REQ_NOWAIT)
390
+ return NULL;
386391
387
- rq = blk_mq_rq_ctx_init(data, tag, op);
388
- if (!op_is_flush(op)) {
389
- rq->elv.icq = NULL;
390
- if (e && e->type->ops.mq.prepare_request) {
391
- if (e->type->icq_cache && rq_ioc(bio))
392
- blk_mq_sched_assign_ioc(rq, bio);
393
-
394
- e->type->ops.mq.prepare_request(rq, bio);
395
- rq->rq_flags |= RQF_ELVPRIV;
396
- }
392
+ /*
393
+ * Give up the CPU and sleep for a random short time to ensure
394
+ * that thread using a realtime scheduling class are migrated
395
+ * off the CPU, and thus off the hctx that is going away.
396
+ */
397
+ msleep(3);
398
+ goto retry;
397399 }
398
- data->hctx->queued++;
399
- return rq;
400
+ return blk_mq_rq_ctx_init(data, tag, alloc_time_ns);
400401 }
401402
402403 struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
403404 blk_mq_req_flags_t flags)
404405 {
405
- struct blk_mq_alloc_data alloc_data = { .flags = flags };
406
+ struct blk_mq_alloc_data data = {
407
+ .q = q,
408
+ .flags = flags,
409
+ .cmd_flags = op,
410
+ };
406411 struct request *rq;
407412 int ret;
408413
@@ -410,28 +415,35 @@
410415 if (ret)
411416 return ERR_PTR(ret);
412417
413
- rq = blk_mq_get_request(q, NULL, op, &alloc_data);
414
- blk_queue_exit(q);
415
-
418
+ rq = __blk_mq_alloc_request(&data);
416419 if (!rq)
417
- return ERR_PTR(-EWOULDBLOCK);
418
-
419
- blk_mq_put_ctx(alloc_data.ctx);
420
-
420
+ goto out_queue_exit;
421421 rq->__data_len = 0;
422422 rq->__sector = (sector_t) -1;
423423 rq->bio = rq->biotail = NULL;
424424 return rq;
425
+out_queue_exit:
426
+ blk_queue_exit(q);
427
+ return ERR_PTR(-EWOULDBLOCK);
425428 }
426429 EXPORT_SYMBOL(blk_mq_alloc_request);
427430
428431 struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
429432 unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx)
430433 {
431
- struct blk_mq_alloc_data alloc_data = { .flags = flags };
432
- struct request *rq;
434
+ struct blk_mq_alloc_data data = {
435
+ .q = q,
436
+ .flags = flags,
437
+ .cmd_flags = op,
438
+ };
439
+ u64 alloc_time_ns = 0;
433440 unsigned int cpu;
441
+ unsigned int tag;
434442 int ret;
443
+
444
+ /* alloc_time includes depth and tag waits */
445
+ if (blk_queue_rq_alloc_time(q))
446
+ alloc_time_ns = ktime_get_ns();
435447
436448 /*
437449 * If the tag allocator sleeps we could get an allocation for a
@@ -439,7 +451,7 @@
439451 * allocator for this for the rare use case of a command tied to
440452 * a specific queue.
441453 */
442
- if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)))
454
+ if (WARN_ON_ONCE(!(flags & (BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED))))
443455 return ERR_PTR(-EINVAL);
444456
445457 if (hctx_idx >= q->nr_hw_queues)
@@ -453,21 +465,27 @@
453465 * Check if the hardware context is actually mapped to anything.
454466 * If not tell the caller that it should skip this queue.
455467 */
456
- alloc_data.hctx = q->queue_hw_ctx[hctx_idx];
457
- if (!blk_mq_hw_queue_mapped(alloc_data.hctx)) {
458
- blk_queue_exit(q);
459
- return ERR_PTR(-EXDEV);
460
- }
461
- cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask);
462
- alloc_data.ctx = __blk_mq_get_ctx(q, cpu);
468
+ ret = -EXDEV;
469
+ data.hctx = q->queue_hw_ctx[hctx_idx];
470
+ if (!blk_mq_hw_queue_mapped(data.hctx))
471
+ goto out_queue_exit;
472
+ cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask);
473
+ if (cpu >= nr_cpu_ids)
474
+ goto out_queue_exit;
475
+ data.ctx = __blk_mq_get_ctx(q, cpu);
463476
464
- rq = blk_mq_get_request(q, NULL, op, &alloc_data);
477
+ if (!q->elevator)
478
+ blk_mq_tag_busy(data.hctx);
479
+
480
+ ret = -EWOULDBLOCK;
481
+ tag = blk_mq_get_tag(&data);
482
+ if (tag == BLK_MQ_NO_TAG)
483
+ goto out_queue_exit;
484
+ return blk_mq_rq_ctx_init(&data, tag, alloc_time_ns);
485
+
486
+out_queue_exit:
465487 blk_queue_exit(q);
466
-
467
- if (!rq)
468
- return ERR_PTR(-EWOULDBLOCK);
469
-
470
- return rq;
488
+ return ERR_PTR(ret);
471489 }
472490 EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
473491
@@ -475,13 +493,16 @@
475493 {
476494 struct request_queue *q = rq->q;
477495 struct blk_mq_ctx *ctx = rq->mq_ctx;
478
- struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
496
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
479497 const int sched_tag = rq->internal_tag;
480498
481
- if (rq->tag != -1)
482
- blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
483
- if (sched_tag != -1)
484
- blk_mq_put_tag(hctx, hctx->sched_tags, ctx, sched_tag);
499
+ blk_crypto_free_request(rq);
500
+ blk_pm_mark_last_busy(rq);
501
+ rq->mq_hctx = NULL;
502
+ if (rq->tag != BLK_MQ_NO_TAG)
503
+ blk_mq_put_tag(hctx->tags, ctx, rq->tag);
504
+ if (sched_tag != BLK_MQ_NO_TAG)
505
+ blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag);
485506 blk_mq_sched_restart(hctx);
486507 blk_queue_exit(q);
487508 }
@@ -491,11 +512,11 @@
491512 struct request_queue *q = rq->q;
492513 struct elevator_queue *e = q->elevator;
493514 struct blk_mq_ctx *ctx = rq->mq_ctx;
494
- struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
515
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
495516
496517 if (rq->rq_flags & RQF_ELVPRIV) {
497
- if (e && e->type->ops.mq.finish_request)
498
- e->type->ops.mq.finish_request(rq);
518
+ if (e && e->type->ops.finish_request)
519
+ e->type->ops.finish_request(rq);
499520 if (rq->elv.icq) {
500521 put_io_context(rq->elv.icq->ioc);
501522 rq->elv.icq = NULL;
@@ -504,15 +525,12 @@
504525
505526 ctx->rq_completed[rq_is_sync(rq)]++;
506527 if (rq->rq_flags & RQF_MQ_INFLIGHT)
507
- atomic_dec(&hctx->nr_active);
528
+ __blk_mq_dec_active_requests(hctx);
508529
509530 if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
510531 laptop_io_completion(q->backing_dev_info);
511532
512533 rq_qos_done(q, rq);
513
-
514
- if (blk_rq_rl(rq))
515
- blk_put_rl(blk_rq_rl(rq));
516534
517535 WRITE_ONCE(rq->state, MQ_RQ_IDLE);
518536 if (refcount_dec_and_test(&rq->ref))
@@ -522,12 +540,17 @@
522540
523541 inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
524542 {
525
- u64 now = ktime_get_ns();
543
+ u64 now = 0;
544
+
545
+ if (blk_mq_need_time_stamp(rq))
546
+ now = ktime_get_ns();
526547
527548 if (rq->rq_flags & RQF_STATS) {
528549 blk_mq_poll_stats_start(rq->q);
529550 blk_stat_add(rq, now);
530551 }
552
+
553
+ blk_mq_sched_completed_request(rq, now);
531554
532555 blk_account_io_done(rq, now);
533556
@@ -535,8 +558,6 @@
535558 rq_qos_done(rq->q, rq);
536559 rq->end_io(rq, error);
537560 } else {
538
- if (unlikely(blk_bidi_rq(rq)))
539
- blk_mq_free_request(rq->next_rq);
540561 blk_mq_free_request(rq);
541562 }
542563 }
@@ -550,63 +571,120 @@
550571 }
551572 EXPORT_SYMBOL(blk_mq_end_request);
552573
553
-#ifdef CONFIG_PREEMPT_RT_FULL
554
-
555
-void __blk_mq_complete_request_remote_work(struct work_struct *work)
574
+static void blk_complete_reqs(struct llist_head *list)
556575 {
557
- struct request *rq = container_of(work, struct request, work);
576
+ struct llist_node *entry = llist_reverse_order(llist_del_all(list));
577
+ struct request *rq, *next;
558578
559
- rq->q->softirq_done_fn(rq);
579
+ llist_for_each_entry_safe(rq, next, entry, ipi_list)
580
+ rq->q->mq_ops->complete(rq);
560581 }
561582
562
-#else
583
+static __latent_entropy void blk_done_softirq(struct softirq_action *h)
584
+{
585
+ blk_complete_reqs(this_cpu_ptr(&blk_cpu_done));
586
+}
587
+
588
+static int blk_softirq_cpu_dead(unsigned int cpu)
589
+{
590
+ blk_complete_reqs(&per_cpu(blk_cpu_done, cpu));
591
+ return 0;
592
+}
563593
564594 static void __blk_mq_complete_request_remote(void *data)
565595 {
566
- struct request *rq = data;
567
-
568
- rq->q->softirq_done_fn(rq);
596
+ __raise_softirq_irqoff(BLOCK_SOFTIRQ);
569597 }
570
-#endif
571598
572
-static void __blk_mq_complete_request(struct request *rq)
599
+static inline bool blk_mq_complete_need_ipi(struct request *rq)
573600 {
574
- struct blk_mq_ctx *ctx = rq->mq_ctx;
575
- bool shared = false;
576
- int cpu;
601
+ int cpu = raw_smp_processor_id();
577602
578
- if (!blk_mq_mark_complete(rq))
579
- return;
580
- if (rq->internal_tag != -1)
581
- blk_mq_sched_completed_request(rq);
603
+ if (!IS_ENABLED(CONFIG_SMP) ||
604
+ !test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags))
605
+ return false;
606
+ /*
607
+ * With force threaded interrupts enabled, raising softirq from an SMP
608
+ * function call will always result in waking the ksoftirqd thread.
609
+ * This is probably worse than completing the request on a different
610
+ * cache domain.
611
+ */
612
+ if (force_irqthreads)
613
+ return false;
582614
583
- if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) {
584
- rq->q->softirq_done_fn(rq);
585
- return;
586
- }
615
+ /* same CPU or cache domain? Complete locally */
616
+ if (cpu == rq->mq_ctx->cpu ||
617
+ (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) &&
618
+ cpus_share_cache(cpu, rq->mq_ctx->cpu)))
619
+ return false;
587620
588
- cpu = get_cpu_light();
589
- if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
590
- shared = cpus_share_cache(cpu, ctx->cpu);
621
+ /* don't try to IPI to an offline CPU */
622
+ return cpu_online(rq->mq_ctx->cpu);
623
+}
591624
592
- if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
593
-#ifdef CONFIG_PREEMPT_RT_FULL
594
- /*
595
- * We could force QUEUE_FLAG_SAME_FORCE then we would not get in
596
- * here. But we could try to invoke it one the CPU like this.
597
- */
598
- schedule_work_on(ctx->cpu, &rq->work);
599
-#else
625
+static void blk_mq_complete_send_ipi(struct request *rq)
626
+{
627
+ struct llist_head *list;
628
+ unsigned int cpu;
629
+
630
+ cpu = rq->mq_ctx->cpu;
631
+ list = &per_cpu(blk_cpu_done, cpu);
632
+ if (llist_add(&rq->ipi_list, list)) {
600633 rq->csd.func = __blk_mq_complete_request_remote;
601634 rq->csd.info = rq;
602635 rq->csd.flags = 0;
603
- smp_call_function_single_async(ctx->cpu, &rq->csd);
604
-#endif
605
- } else {
606
- rq->q->softirq_done_fn(rq);
636
+ smp_call_function_single_async(cpu, &rq->csd);
607637 }
608
- put_cpu_light();
609638 }
639
+
640
+static void blk_mq_raise_softirq(struct request *rq)
641
+{
642
+ struct llist_head *list;
643
+
644
+ preempt_disable();
645
+ list = this_cpu_ptr(&blk_cpu_done);
646
+ if (llist_add(&rq->ipi_list, list))
647
+ raise_softirq(BLOCK_SOFTIRQ);
648
+ preempt_enable();
649
+}
650
+
651
+bool blk_mq_complete_request_remote(struct request *rq)
652
+{
653
+ WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
654
+
655
+ /*
656
+ * For a polled request, always complete locallly, it's pointless
657
+ * to redirect the completion.
658
+ */
659
+ if (rq->cmd_flags & REQ_HIPRI)
660
+ return false;
661
+
662
+ if (blk_mq_complete_need_ipi(rq)) {
663
+ blk_mq_complete_send_ipi(rq);
664
+ return true;
665
+ }
666
+
667
+ if (rq->q->nr_hw_queues == 1) {
668
+ blk_mq_raise_softirq(rq);
669
+ return true;
670
+ }
671
+ return false;
672
+}
673
+EXPORT_SYMBOL_GPL(blk_mq_complete_request_remote);
674
+
675
+/**
676
+ * blk_mq_complete_request - end I/O on a request
677
+ * @rq: the request being processed
678
+ *
679
+ * Description:
680
+ * Complete a request by scheduling the ->complete_rq operation.
681
+ **/
682
+void blk_mq_complete_request(struct request *rq)
683
+{
684
+ if (!blk_mq_complete_request_remote(rq))
685
+ rq->q->mq_ops->complete(rq);
686
+}
687
+EXPORT_SYMBOL(blk_mq_complete_request);
610688
611689 static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
612690 __releases(hctx->srcu)
@@ -629,40 +707,22 @@
629707 }
630708
631709 /**
632
- * blk_mq_complete_request - end I/O on a request
633
- * @rq: the request being processed
710
+ * blk_mq_start_request - Start processing a request
711
+ * @rq: Pointer to request to be started
634712 *
635
- * Description:
636
- * Ends all I/O on a request. It does not handle partial completions.
637
- * The actual completion happens out-of-order, through a IPI handler.
638
- **/
639
-void blk_mq_complete_request(struct request *rq)
640
-{
641
- if (unlikely(blk_should_fake_timeout(rq->q)))
642
- return;
643
- __blk_mq_complete_request(rq);
644
-}
645
-EXPORT_SYMBOL(blk_mq_complete_request);
646
-
647
-int blk_mq_request_started(struct request *rq)
648
-{
649
- return blk_mq_rq_state(rq) != MQ_RQ_IDLE;
650
-}
651
-EXPORT_SYMBOL_GPL(blk_mq_request_started);
652
-
713
+ * Function used by device drivers to notify the block layer that a request
714
+ * is going to be processed now, so blk layer can do proper initializations
715
+ * such as starting the timeout timer.
716
+ */
653717 void blk_mq_start_request(struct request *rq)
654718 {
655719 struct request_queue *q = rq->q;
656
-
657
- blk_mq_sched_started_request(rq);
658720
659721 trace_block_rq_issue(q, rq);
660722
661723 if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
662724 rq->io_start_time_ns = ktime_get_ns();
663
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
664
- rq->throtl_size = blk_rq_sectors(rq);
665
-#endif
725
+ rq->stats_sectors = blk_rq_sectors(rq);
666726 rq->rq_flags |= RQF_STATS;
667727 rq_qos_issue(q, rq);
668728 }
@@ -672,14 +732,10 @@
672732 blk_add_timer(rq);
673733 WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT);
674734
675
- if (q->dma_drain_size && blk_rq_bytes(rq)) {
676
- /*
677
- * Make sure space for the drain appears. We know we can do
678
- * this because max_hw_segments has been adjusted to be one
679
- * fewer than the device can handle.
680
- */
681
- rq->nr_phys_segments++;
682
- }
735
+#ifdef CONFIG_BLK_DEV_INTEGRITY
736
+ if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE)
737
+ q->integrity.profile->prepare_fn(rq);
738
+#endif
683739 }
684740 EXPORT_SYMBOL(blk_mq_start_request);
685741
@@ -695,8 +751,6 @@
695751 if (blk_mq_request_started(rq)) {
696752 WRITE_ONCE(rq->state, MQ_RQ_IDLE);
697753 rq->rq_flags &= ~RQF_TIMED_OUT;
698
- if (q->dma_drain_size && blk_rq_bytes(rq))
699
- rq->nr_phys_segments--;
700754 }
701755 }
702756
@@ -707,7 +761,6 @@
707761 /* this request will be re-inserted to io scheduler queue */
708762 blk_mq_sched_requeue_request(rq);
709763
710
- BUG_ON(blk_queued_rq(rq));
711764 blk_mq_add_to_requeue_list(rq, true, kick_requeue_list);
712765 }
713766 EXPORT_SYMBOL(blk_mq_requeue_request);
@@ -735,7 +788,7 @@
735788 * merge.
736789 */
737790 if (rq->rq_flags & RQF_DONTPREP)
738
- blk_mq_request_bypass_insert(rq, false);
791
+ blk_mq_request_bypass_insert(rq, false, false);
739792 else
740793 blk_mq_sched_insert_request(rq, true, false, false);
741794 }
@@ -773,7 +826,6 @@
773826 if (kick_requeue_list)
774827 blk_mq_kick_requeue_list(q);
775828 }
776
-EXPORT_SYMBOL(blk_mq_add_to_requeue_list);
777829
778830 void blk_mq_kick_requeue_list(struct request_queue *q)
779831 {
@@ -800,6 +852,32 @@
800852 }
801853 EXPORT_SYMBOL(blk_mq_tag_to_rq);
802854
855
+static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq,
856
+ void *priv, bool reserved)
857
+{
858
+ /*
859
+ * If we find a request that isn't idle and the queue matches,
860
+ * we know the queue is busy. Return false to stop the iteration.
861
+ */
862
+ if (blk_mq_request_started(rq) && rq->q == hctx->queue) {
863
+ bool *busy = priv;
864
+
865
+ *busy = true;
866
+ return false;
867
+ }
868
+
869
+ return true;
870
+}
871
+
872
+bool blk_mq_queue_inflight(struct request_queue *q)
873
+{
874
+ bool busy = false;
875
+
876
+ blk_mq_queue_tag_busy_iter(q, blk_mq_rq_inflight, &busy);
877
+ return busy;
878
+}
879
+EXPORT_SYMBOL_GPL(blk_mq_queue_inflight);
880
+
803881 static void blk_mq_rq_timed_out(struct request *req, bool reserved)
804882 {
805883 req->rq_flags |= RQF_TIMED_OUT;
@@ -824,7 +902,7 @@
824902 if (rq->rq_flags & RQF_TIMED_OUT)
825903 return false;
826904
827
- deadline = blk_rq_deadline(rq);
905
+ deadline = READ_ONCE(rq->deadline);
828906 if (time_after_eq(jiffies, deadline))
829907 return true;
830908
@@ -835,43 +913,29 @@
835913 return false;
836914 }
837915
838
-static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
916
+void blk_mq_put_rq_ref(struct request *rq)
917
+{
918
+ if (is_flush_rq(rq))
919
+ rq->end_io(rq, 0);
920
+ else if (refcount_dec_and_test(&rq->ref))
921
+ __blk_mq_free_request(rq);
922
+}
923
+
924
+static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
839925 struct request *rq, void *priv, bool reserved)
840926 {
841927 unsigned long *next = priv;
842928
843929 /*
844
- * Just do a quick check if it is expired before locking the request in
845
- * so we're not unnecessarilly synchronizing across CPUs.
846
- */
847
- if (!blk_mq_req_expired(rq, next))
848
- return;
849
-
850
- /*
851
- * We have reason to believe the request may be expired. Take a
852
- * reference on the request to lock this request lifetime into its
853
- * currently allocated context to prevent it from being reallocated in
854
- * the event the completion by-passes this timeout handler.
855
- *
856
- * If the reference was already released, then the driver beat the
857
- * timeout handler to posting a natural completion.
858
- */
859
- if (!refcount_inc_not_zero(&rq->ref))
860
- return;
861
-
862
- /*
863
- * The request is now locked and cannot be reallocated underneath the
864
- * timeout handler's processing. Re-verify this exact request is truly
865
- * expired; if it is not expired, then the request was completed and
866
- * reallocated as a new request.
930
+ * blk_mq_queue_tag_busy_iter() has locked the request, so it cannot
931
+ * be reallocated underneath the timeout handler's processing, then
932
+ * the expire check is reliable. If the request is not expired, then
933
+ * it was completed and reallocated as a new request after returning
934
+ * from blk_mq_check_expired().
867935 */
868936 if (blk_mq_req_expired(rq, next))
869937 blk_mq_rq_timed_out(rq, reserved);
870
-
871
- if (is_flush_rq(rq, hctx))
872
- rq->end_io(rq, 0);
873
- else if (refcount_dec_and_test(&rq->ref))
874
- __blk_mq_free_request(rq);
938
+ return true;
875939 }
876940
877941 static void blk_mq_timeout_work(struct work_struct *work)
@@ -928,9 +992,10 @@
928992 struct flush_busy_ctx_data *flush_data = data;
929993 struct blk_mq_hw_ctx *hctx = flush_data->hctx;
930994 struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
995
+ enum hctx_type type = hctx->type;
931996
932997 spin_lock(&ctx->lock);
933
- list_splice_tail_init(&ctx->rq_list, flush_data->list);
998
+ list_splice_tail_init(&ctx->rq_lists[type], flush_data->list);
934999 sbitmap_clear_bit(sb, bitnr);
9351000 spin_unlock(&ctx->lock);
9361001 return true;
@@ -962,12 +1027,13 @@
9621027 struct dispatch_rq_data *dispatch_data = data;
9631028 struct blk_mq_hw_ctx *hctx = dispatch_data->hctx;
9641029 struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
1030
+ enum hctx_type type = hctx->type;
9651031
9661032 spin_lock(&ctx->lock);
967
- if (!list_empty(&ctx->rq_list)) {
968
- dispatch_data->rq = list_entry_rq(ctx->rq_list.next);
1033
+ if (!list_empty(&ctx->rq_lists[type])) {
1034
+ dispatch_data->rq = list_entry_rq(ctx->rq_lists[type].next);
9691035 list_del_init(&dispatch_data->rq->queuelist);
970
- if (list_empty(&ctx->rq_list))
1036
+ if (list_empty(&ctx->rq_lists[type]))
9711037 sbitmap_clear_bit(sb, bitnr);
9721038 }
9731039 spin_unlock(&ctx->lock);
@@ -978,7 +1044,7 @@
9781044 struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
9791045 struct blk_mq_ctx *start)
9801046 {
981
- unsigned off = start ? start->index_hw : 0;
1047
+ unsigned off = start ? start->index_hw[hctx->type] : 0;
9821048 struct dispatch_rq_data data = {
9831049 .hctx = hctx,
9841050 .rq = NULL,
@@ -998,33 +1064,44 @@
9981064 return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
9991065 }
10001066
1001
-bool blk_mq_get_driver_tag(struct request *rq)
1067
+static bool __blk_mq_get_driver_tag(struct request *rq)
10021068 {
1003
- struct blk_mq_alloc_data data = {
1004
- .q = rq->q,
1005
- .hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu),
1006
- .flags = BLK_MQ_REQ_NOWAIT,
1007
- };
1008
- bool shared;
1069
+ struct sbitmap_queue *bt = rq->mq_hctx->tags->bitmap_tags;
1070
+ unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
1071
+ int tag;
10091072
1010
- if (rq->tag != -1)
1011
- goto done;
1073
+ blk_mq_tag_busy(rq->mq_hctx);
10121074
1013
- if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag))
1014
- data.flags |= BLK_MQ_REQ_RESERVED;
1015
-
1016
- shared = blk_mq_tag_busy(data.hctx);
1017
- rq->tag = blk_mq_get_tag(&data);
1018
- if (rq->tag >= 0) {
1019
- if (shared) {
1020
- rq->rq_flags |= RQF_MQ_INFLIGHT;
1021
- atomic_inc(&data.hctx->nr_active);
1022
- }
1023
- data.hctx->tags->rqs[rq->tag] = rq;
1075
+ if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) {
1076
+ bt = rq->mq_hctx->tags->breserved_tags;
1077
+ tag_offset = 0;
1078
+ } else {
1079
+ if (!hctx_may_queue(rq->mq_hctx, bt))
1080
+ return false;
10241081 }
10251082
1026
-done:
1027
- return rq->tag != -1;
1083
+ tag = __sbitmap_queue_get(bt);
1084
+ if (tag == BLK_MQ_NO_TAG)
1085
+ return false;
1086
+
1087
+ rq->tag = tag + tag_offset;
1088
+ return true;
1089
+}
1090
+
1091
+static bool blk_mq_get_driver_tag(struct request *rq)
1092
+{
1093
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
1094
+
1095
+ if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_get_driver_tag(rq))
1096
+ return false;
1097
+
1098
+ if ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) &&
1099
+ !(rq->rq_flags & RQF_MQ_INFLIGHT)) {
1100
+ rq->rq_flags |= RQF_MQ_INFLIGHT;
1101
+ __blk_mq_inc_active_requests(hctx);
1102
+ }
1103
+ hctx->tags->rqs[rq->tag] = rq;
1104
+ return true;
10281105 }
10291106
10301107 static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
@@ -1035,7 +1112,13 @@
10351112 hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
10361113
10371114 spin_lock(&hctx->dispatch_wait_lock);
1038
- list_del_init(&wait->entry);
1115
+ if (!list_empty(&wait->entry)) {
1116
+ struct sbitmap_queue *sbq;
1117
+
1118
+ list_del_init(&wait->entry);
1119
+ sbq = hctx->tags->bitmap_tags;
1120
+ atomic_dec(&sbq->ws_active);
1121
+ }
10391122 spin_unlock(&hctx->dispatch_wait_lock);
10401123
10411124 blk_mq_run_hw_queue(hctx, true);
@@ -1051,13 +1134,13 @@
10511134 static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
10521135 struct request *rq)
10531136 {
1137
+ struct sbitmap_queue *sbq = hctx->tags->bitmap_tags;
10541138 struct wait_queue_head *wq;
10551139 wait_queue_entry_t *wait;
10561140 bool ret;
10571141
1058
- if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) {
1059
- if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
1060
- set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
1142
+ if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
1143
+ blk_mq_sched_mark_restart_hctx(hctx);
10611144
10621145 /*
10631146 * It's possible that a tag was freed in the window between the
@@ -1074,7 +1157,7 @@
10741157 if (!list_empty_careful(&wait->entry))
10751158 return false;
10761159
1077
- wq = &bt_wait_ptr(&hctx->tags->bitmap_tags, hctx)->wait;
1160
+ wq = &bt_wait_ptr(sbq, hctx)->wait;
10781161
10791162 spin_lock_irq(&wq->lock);
10801163 spin_lock(&hctx->dispatch_wait_lock);
@@ -1084,6 +1167,7 @@
10841167 return false;
10851168 }
10861169
1170
+ atomic_inc(&sbq->ws_active);
10871171 wait->flags &= ~WQ_FLAG_EXCLUSIVE;
10881172 __add_wait_queue(wq, wait);
10891173
@@ -1104,6 +1188,7 @@
11041188 * someone else gets the wakeup.
11051189 */
11061190 list_del_init(&wait->entry);
1191
+ atomic_dec(&sbq->ws_active);
11071192 spin_unlock(&hctx->dispatch_wait_lock);
11081193 spin_unlock_irq(&wq->lock);
11091194
@@ -1122,9 +1207,6 @@
11221207 static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
11231208 {
11241209 unsigned int ewma;
1125
-
1126
- if (hctx->queue->elevator)
1127
- return;
11281210
11291211 ewma = hctx->dispatch_busy;
11301212
@@ -1158,22 +1240,83 @@
11581240 __blk_mq_requeue_request(rq);
11591241 }
11601242
1243
+static void blk_mq_handle_zone_resource(struct request *rq,
1244
+ struct list_head *zone_list)
1245
+{
1246
+ /*
1247
+ * If we end up here it is because we cannot dispatch a request to a
1248
+ * specific zone due to LLD level zone-write locking or other zone
1249
+ * related resource not being available. In this case, set the request
1250
+ * aside in zone_list for retrying it later.
1251
+ */
1252
+ list_add(&rq->queuelist, zone_list);
1253
+ __blk_mq_requeue_request(rq);
1254
+}
1255
+
1256
+enum prep_dispatch {
1257
+ PREP_DISPATCH_OK,
1258
+ PREP_DISPATCH_NO_TAG,
1259
+ PREP_DISPATCH_NO_BUDGET,
1260
+};
1261
+
1262
+static enum prep_dispatch blk_mq_prep_dispatch_rq(struct request *rq,
1263
+ bool need_budget)
1264
+{
1265
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
1266
+
1267
+ if (need_budget && !blk_mq_get_dispatch_budget(rq->q)) {
1268
+ blk_mq_put_driver_tag(rq);
1269
+ return PREP_DISPATCH_NO_BUDGET;
1270
+ }
1271
+
1272
+ if (!blk_mq_get_driver_tag(rq)) {
1273
+ /*
1274
+ * The initial allocation attempt failed, so we need to
1275
+ * rerun the hardware queue when a tag is freed. The
1276
+ * waitqueue takes care of that. If the queue is run
1277
+ * before we add this entry back on the dispatch list,
1278
+ * we'll re-run it below.
1279
+ */
1280
+ if (!blk_mq_mark_tag_wait(hctx, rq)) {
1281
+ /*
1282
+ * All budgets not got from this function will be put
1283
+ * together during handling partial dispatch
1284
+ */
1285
+ if (need_budget)
1286
+ blk_mq_put_dispatch_budget(rq->q);
1287
+ return PREP_DISPATCH_NO_TAG;
1288
+ }
1289
+ }
1290
+
1291
+ return PREP_DISPATCH_OK;
1292
+}
1293
+
1294
+/* release all allocated budgets before calling to blk_mq_dispatch_rq_list */
1295
+static void blk_mq_release_budgets(struct request_queue *q,
1296
+ unsigned int nr_budgets)
1297
+{
1298
+ int i;
1299
+
1300
+ for (i = 0; i < nr_budgets; i++)
1301
+ blk_mq_put_dispatch_budget(q);
1302
+}
1303
+
11611304 /*
11621305 * Returns true if we did some work AND can potentially do more.
11631306 */
1164
-bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
1165
- bool got_budget)
1307
+bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
1308
+ unsigned int nr_budgets)
11661309 {
1167
- struct blk_mq_hw_ctx *hctx;
1310
+ enum prep_dispatch prep;
1311
+ struct request_queue *q = hctx->queue;
11681312 struct request *rq, *nxt;
1169
- bool no_tag = false;
11701313 int errors, queued;
11711314 blk_status_t ret = BLK_STS_OK;
1315
+ LIST_HEAD(zone_list);
1316
+ bool needs_resource = false;
11721317
11731318 if (list_empty(list))
11741319 return false;
1175
-
1176
- WARN_ON(!list_is_singular(list) && got_budget);
11771320
11781321 /*
11791322 * Now process all the entries, sending them to the driver.
@@ -1184,29 +1327,10 @@
11841327
11851328 rq = list_first_entry(list, struct request, queuelist);
11861329
1187
- hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu);
1188
- if (!got_budget && !blk_mq_get_dispatch_budget(hctx))
1330
+ WARN_ON_ONCE(hctx != rq->mq_hctx);
1331
+ prep = blk_mq_prep_dispatch_rq(rq, !nr_budgets);
1332
+ if (prep != PREP_DISPATCH_OK)
11891333 break;
1190
-
1191
- if (!blk_mq_get_driver_tag(rq)) {
1192
- /*
1193
- * The initial allocation attempt failed, so we need to
1194
- * rerun the hardware queue when a tag is freed. The
1195
- * waitqueue takes care of that. If the queue is run
1196
- * before we add this entry back on the dispatch list,
1197
- * we'll re-run it below.
1198
- */
1199
- if (!blk_mq_mark_tag_wait(hctx, rq)) {
1200
- blk_mq_put_dispatch_budget(hctx);
1201
- /*
1202
- * For non-shared tags, the RESTART check
1203
- * will suffice.
1204
- */
1205
- if (hctx->flags & BLK_MQ_F_TAG_SHARED)
1206
- no_tag = true;
1207
- break;
1208
- }
1209
- }
12101334
12111335 list_del_init(&rq->queuelist);
12121336
@@ -1223,32 +1347,63 @@
12231347 bd.last = !blk_mq_get_driver_tag(nxt);
12241348 }
12251349
1350
+ /*
1351
+ * once the request is queued to lld, no need to cover the
1352
+ * budget any more
1353
+ */
1354
+ if (nr_budgets)
1355
+ nr_budgets--;
12261356 ret = q->mq_ops->queue_rq(hctx, &bd);
1227
- if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {
1228
- blk_mq_handle_dev_resource(rq, list);
1357
+ switch (ret) {
1358
+ case BLK_STS_OK:
1359
+ queued++;
12291360 break;
1230
- }
1231
-
1232
- if (unlikely(ret != BLK_STS_OK)) {
1361
+ case BLK_STS_RESOURCE:
1362
+ needs_resource = true;
1363
+ fallthrough;
1364
+ case BLK_STS_DEV_RESOURCE:
1365
+ blk_mq_handle_dev_resource(rq, list);
1366
+ goto out;
1367
+ case BLK_STS_ZONE_RESOURCE:
1368
+ /*
1369
+ * Move the request to zone_list and keep going through
1370
+ * the dispatch list to find more requests the drive can
1371
+ * accept.
1372
+ */
1373
+ blk_mq_handle_zone_resource(rq, &zone_list);
1374
+ needs_resource = true;
1375
+ break;
1376
+ default:
12331377 errors++;
12341378 blk_mq_end_request(rq, BLK_STS_IOERR);
1235
- continue;
12361379 }
1237
-
1238
- queued++;
12391380 } while (!list_empty(list));
1381
+out:
1382
+ if (!list_empty(&zone_list))
1383
+ list_splice_tail_init(&zone_list, list);
12401384
12411385 hctx->dispatched[queued_to_index(queued)]++;
12421386
1387
+ /* If we didn't flush the entire list, we could have told the driver
1388
+ * there was more coming, but that turned out to be a lie.
1389
+ */
1390
+ if ((!list_empty(list) || errors || needs_resource ||
1391
+ ret == BLK_STS_DEV_RESOURCE) && q->mq_ops->commit_rqs && queued)
1392
+ q->mq_ops->commit_rqs(hctx);
12431393 /*
12441394 * Any items that need requeuing? Stuff them into hctx->dispatch,
12451395 * that is where we will continue on next queue run.
12461396 */
12471397 if (!list_empty(list)) {
12481398 bool needs_restart;
1399
+ /* For non-shared tags, the RESTART check will suffice */
1400
+ bool no_tag = prep == PREP_DISPATCH_NO_TAG &&
1401
+ (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED);
1402
+
1403
+ blk_mq_release_budgets(q, nr_budgets);
12491404
12501405 spin_lock(&hctx->lock);
1251
- list_splice_init(list, &hctx->dispatch);
1406
+ list_splice_tail_init(list, &hctx->dispatch);
12521407 spin_unlock(&hctx->lock);
12531408
12541409 /*
@@ -1282,13 +1437,17 @@
12821437 *
12831438 * If driver returns BLK_STS_RESOURCE and SCHED_RESTART
12841439 * bit is set, run queue after a delay to avoid IO stalls
1285
- * that could otherwise occur if the queue is idle.
1440
+ * that could otherwise occur if the queue is idle. We'll do
1441
+ * similar if we couldn't get budget or couldn't lock a zone
1442
+ * and SCHED_RESTART is set.
12861443 */
12871444 needs_restart = blk_mq_sched_needs_restart(hctx);
1445
+ if (prep == PREP_DISPATCH_NO_BUDGET)
1446
+ needs_resource = true;
12881447 if (!needs_restart ||
12891448 (no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
12901449 blk_mq_run_hw_queue(hctx, true);
1291
- else if (needs_restart && (ret == BLK_STS_RESOURCE))
1450
+ else if (needs_restart && needs_resource)
12921451 blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);
12931452
12941453 blk_mq_update_dispatch_busy(hctx, true);
@@ -1296,16 +1455,15 @@
12961455 } else
12971456 blk_mq_update_dispatch_busy(hctx, false);
12981457
1299
- /*
1300
- * If the host/device is unable to accept more work, inform the
1301
- * caller of that.
1302
- */
1303
- if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
1304
- return false;
1305
-
13061458 return (queued + errors) != 0;
13071459 }
13081460
1461
+/**
1462
+ * __blk_mq_run_hw_queue - Run a hardware queue.
1463
+ * @hctx: Pointer to the hardware queue to run.
1464
+ *
1465
+ * Send pending requests to the hardware.
1466
+ */
13091467 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
13101468 {
13111469 int srcu_idx;
@@ -1403,6 +1561,15 @@
14031561 return next_cpu;
14041562 }
14051563
1564
+/**
1565
+ * __blk_mq_delay_run_hw_queue - Run (or schedule to run) a hardware queue.
1566
+ * @hctx: Pointer to the hardware queue to run.
1567
+ * @async: If we want to run the queue asynchronously.
1568
+ * @msecs: Microseconds of delay to wait before running the queue.
1569
+ *
1570
+ * If !@async, try to run the queue now. Else, run the queue asynchronously and
1571
+ * with a delay of @msecs.
1572
+ */
14061573 static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
14071574 unsigned long msecs)
14081575 {
@@ -1424,13 +1591,29 @@
14241591 msecs_to_jiffies(msecs));
14251592 }
14261593
1594
+/**
1595
+ * blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously.
1596
+ * @hctx: Pointer to the hardware queue to run.
1597
+ * @msecs: Microseconds of delay to wait before running the queue.
1598
+ *
1599
+ * Run a hardware queue asynchronously with a delay of @msecs.
1600
+ */
14271601 void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
14281602 {
14291603 __blk_mq_delay_run_hw_queue(hctx, true, msecs);
14301604 }
14311605 EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
14321606
1433
-bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
1607
+/**
1608
+ * blk_mq_run_hw_queue - Start to run a hardware queue.
1609
+ * @hctx: Pointer to the hardware queue to run.
1610
+ * @async: If we want to run the queue asynchronously.
1611
+ *
1612
+ * Check if the request queue is not in a quiesced state and if there are
1613
+ * pending requests to be sent. If this is true, run the queue to send requests
1614
+ * to hardware.
1615
+ */
1616
+void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
14341617 {
14351618 int srcu_idx;
14361619 bool need_run;
@@ -1448,28 +1631,101 @@
14481631 blk_mq_hctx_has_pending(hctx);
14491632 hctx_unlock(hctx, srcu_idx);
14501633
1451
- if (need_run) {
1634
+ if (need_run)
14521635 __blk_mq_delay_run_hw_queue(hctx, async, 0);
1453
- return true;
1454
- }
1455
-
1456
- return false;
14571636 }
14581637 EXPORT_SYMBOL(blk_mq_run_hw_queue);
14591638
1639
+/*
1640
+ * Is the request queue handled by an IO scheduler that does not respect
1641
+ * hardware queues when dispatching?
1642
+ */
1643
+static bool blk_mq_has_sqsched(struct request_queue *q)
1644
+{
1645
+ struct elevator_queue *e = q->elevator;
1646
+
1647
+ if (e && e->type->ops.dispatch_request &&
1648
+ !(e->type->elevator_features & ELEVATOR_F_MQ_AWARE))
1649
+ return true;
1650
+ return false;
1651
+}
1652
+
1653
+/*
1654
+ * Return prefered queue to dispatch from (if any) for non-mq aware IO
1655
+ * scheduler.
1656
+ */
1657
+static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q)
1658
+{
1659
+ struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
1660
+ /*
1661
+ * If the IO scheduler does not respect hardware queues when
1662
+ * dispatching, we just don't bother with multiple HW queues and
1663
+ * dispatch from hctx for the current CPU since running multiple queues
1664
+ * just causes lock contention inside the scheduler and pointless cache
1665
+ * bouncing.
1666
+ */
1667
+ struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, 0, ctx);
1668
+
1669
+ if (!blk_mq_hctx_stopped(hctx))
1670
+ return hctx;
1671
+ return NULL;
1672
+}
1673
+
1674
+/**
1675
+ * blk_mq_run_hw_queues - Run all hardware queues in a request queue.
1676
+ * @q: Pointer to the request queue to run.
1677
+ * @async: If we want to run the queue asynchronously.
1678
+ */
14601679 void blk_mq_run_hw_queues(struct request_queue *q, bool async)
14611680 {
1462
- struct blk_mq_hw_ctx *hctx;
1681
+ struct blk_mq_hw_ctx *hctx, *sq_hctx;
14631682 int i;
14641683
1684
+ sq_hctx = NULL;
1685
+ if (blk_mq_has_sqsched(q))
1686
+ sq_hctx = blk_mq_get_sq_hctx(q);
14651687 queue_for_each_hw_ctx(q, hctx, i) {
14661688 if (blk_mq_hctx_stopped(hctx))
14671689 continue;
1468
-
1469
- blk_mq_run_hw_queue(hctx, async);
1690
+ /*
1691
+ * Dispatch from this hctx either if there's no hctx preferred
1692
+ * by IO scheduler or if it has requests that bypass the
1693
+ * scheduler.
1694
+ */
1695
+ if (!sq_hctx || sq_hctx == hctx ||
1696
+ !list_empty_careful(&hctx->dispatch))
1697
+ blk_mq_run_hw_queue(hctx, async);
14701698 }
14711699 }
14721700 EXPORT_SYMBOL(blk_mq_run_hw_queues);
1701
+
1702
+/**
1703
+ * blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously.
1704
+ * @q: Pointer to the request queue to run.
1705
+ * @msecs: Microseconds of delay to wait before running the queues.
1706
+ */
1707
+void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
1708
+{
1709
+ struct blk_mq_hw_ctx *hctx, *sq_hctx;
1710
+ int i;
1711
+
1712
+ sq_hctx = NULL;
1713
+ if (blk_mq_has_sqsched(q))
1714
+ sq_hctx = blk_mq_get_sq_hctx(q);
1715
+ queue_for_each_hw_ctx(q, hctx, i) {
1716
+ if (blk_mq_hctx_stopped(hctx))
1717
+ continue;
1718
+ /*
1719
+ * Dispatch from this hctx either if there's no hctx preferred
1720
+ * by IO scheduler or if it has requests that bypass the
1721
+ * scheduler.
1722
+ */
1723
+ if (!sq_hctx || sq_hctx == hctx ||
1724
+ !list_empty_careful(&hctx->dispatch))
1725
+ blk_mq_delay_run_hw_queue(hctx, msecs);
1726
+ }
1727
+}
1728
+EXPORT_SYMBOL(blk_mq_delay_run_hw_queues);
14731729
14741730 /**
14751731 * blk_mq_queue_stopped() - check whether one or more hctxs have been stopped
@@ -1574,7 +1830,7 @@
15741830 /*
15751831 * If we are stopped, don't run the queue.
15761832 */
1577
- if (test_bit(BLK_MQ_S_STOPPED, &hctx->state))
1833
+ if (blk_mq_hctx_stopped(hctx))
15781834 return;
15791835
15801836 __blk_mq_run_hw_queue(hctx);
@@ -1585,15 +1841,16 @@
15851841 bool at_head)
15861842 {
15871843 struct blk_mq_ctx *ctx = rq->mq_ctx;
1844
+ enum hctx_type type = hctx->type;
15881845
15891846 lockdep_assert_held(&ctx->lock);
15901847
15911848 trace_block_rq_insert(hctx->queue, rq);
15921849
15931850 if (at_head)
1594
- list_add(&rq->queuelist, &ctx->rq_list);
1851
+ list_add(&rq->queuelist, &ctx->rq_lists[type]);
15951852 else
1596
- list_add_tail(&rq->queuelist, &ctx->rq_list);
1853
+ list_add_tail(&rq->queuelist, &ctx->rq_lists[type]);
15971854 }
15981855
15991856 void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
@@ -1607,17 +1864,25 @@
16071864 blk_mq_hctx_mark_pending(hctx, ctx);
16081865 }
16091866
1610
-/*
1867
+/**
1868
+ * blk_mq_request_bypass_insert - Insert a request at dispatch list.
1869
+ * @rq: Pointer to request to be inserted.
1870
+ * @at_head: true if the request should be inserted at the head of the list.
1871
+ * @run_queue: If we should run the hardware queue after inserting the request.
1872
+ *
16111873 * Should only be used carefully, when the caller knows we want to
16121874 * bypass a potential IO scheduler on the target device.
16131875 */
1614
-void blk_mq_request_bypass_insert(struct request *rq, bool run_queue)
1876
+void blk_mq_request_bypass_insert(struct request *rq, bool at_head,
1877
+ bool run_queue)
16151878 {
1616
- struct blk_mq_ctx *ctx = rq->mq_ctx;
1617
- struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
1879
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
16181880
16191881 spin_lock(&hctx->lock);
1620
- list_add_tail(&rq->queuelist, &hctx->dispatch);
1882
+ if (at_head)
1883
+ list_add(&rq->queuelist, &hctx->dispatch);
1884
+ else
1885
+ list_add_tail(&rq->queuelist, &hctx->dispatch);
16211886 spin_unlock(&hctx->lock);
16221887
16231888 if (run_queue)
@@ -1629,6 +1894,7 @@
16291894
16301895 {
16311896 struct request *rq;
1897
+ enum hctx_type type = hctx->type;
16321898
16331899 /*
16341900 * preemption doesn't flush plug list, so it's possible ctx->cpu is
....@@ -1640,95 +1906,87 @@
16401906 }
16411907
16421908 spin_lock(&ctx->lock);
1643
- list_splice_tail_init(list, &ctx->rq_list);
1909
+ list_splice_tail_init(list, &ctx->rq_lists[type]);
16441910 blk_mq_hctx_mark_pending(hctx, ctx);
16451911 spin_unlock(&ctx->lock);
16461912 }
16471913
1648
-static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
1914
+static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
16491915 {
16501916 struct request *rqa = container_of(a, struct request, queuelist);
16511917 struct request *rqb = container_of(b, struct request, queuelist);
16521918
1653
- return !(rqa->mq_ctx < rqb->mq_ctx ||
1654
- (rqa->mq_ctx == rqb->mq_ctx &&
1655
- blk_rq_pos(rqa) < blk_rq_pos(rqb)));
1919
+ if (rqa->mq_ctx != rqb->mq_ctx)
1920
+ return rqa->mq_ctx > rqb->mq_ctx;
1921
+ if (rqa->mq_hctx != rqb->mq_hctx)
1922
+ return rqa->mq_hctx > rqb->mq_hctx;
1923
+
1924
+ return blk_rq_pos(rqa) > blk_rq_pos(rqb);
16561925 }
16571926
16581927 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
16591928 {
1660
- struct blk_mq_ctx *this_ctx;
1661
- struct request_queue *this_q;
1662
- struct request *rq;
16631929 LIST_HEAD(list);
1664
- LIST_HEAD(ctx_list);
1665
- unsigned int depth;
16661930
1931
+ if (list_empty(&plug->mq_list))
1932
+ return;
16671933 list_splice_init(&plug->mq_list, &list);
16681934
1669
- list_sort(NULL, &list, plug_ctx_cmp);
1935
+ if (plug->rq_count > 2 && plug->multiple_queues)
1936
+ list_sort(NULL, &list, plug_rq_cmp);
16701937
1671
- this_q = NULL;
1672
- this_ctx = NULL;
1673
- depth = 0;
1938
+ plug->rq_count = 0;
16741939
1675
- while (!list_empty(&list)) {
1676
- rq = list_entry_rq(list.next);
1677
- list_del_init(&rq->queuelist);
1678
- BUG_ON(!rq->q);
1679
- if (rq->mq_ctx != this_ctx) {
1680
- if (this_ctx) {
1681
- trace_block_unplug(this_q, depth, !from_schedule);
1682
- blk_mq_sched_insert_requests(this_q, this_ctx,
1683
- &ctx_list,
1684
- from_schedule);
1685
- }
1940
+ do {
1941
+ struct list_head rq_list;
1942
+ struct request *rq, *head_rq = list_entry_rq(list.next);
1943
+ struct list_head *pos = &head_rq->queuelist; /* skip first */
1944
+ struct blk_mq_hw_ctx *this_hctx = head_rq->mq_hctx;
1945
+ struct blk_mq_ctx *this_ctx = head_rq->mq_ctx;
1946
+ unsigned int depth = 1;
16861947
1687
- this_ctx = rq->mq_ctx;
1688
- this_q = rq->q;
1689
- depth = 0;
1948
+ list_for_each_continue(pos, &list) {
1949
+ rq = list_entry_rq(pos);
1950
+ BUG_ON(!rq->q);
1951
+ if (rq->mq_hctx != this_hctx || rq->mq_ctx != this_ctx)
1952
+ break;
1953
+ depth++;
16901954 }
16911955
1692
- depth++;
1693
- list_add_tail(&rq->queuelist, &ctx_list);
1694
- }
1695
-
1696
- /*
1697
- * If 'this_ctx' is set, we know we have entries to complete
1698
- * on 'ctx_list'. Do those.
1699
- */
1700
- if (this_ctx) {
1701
- trace_block_unplug(this_q, depth, !from_schedule);
1702
- blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list,
1956
+ list_cut_before(&rq_list, &list, pos);
1957
+ trace_block_unplug(head_rq->q, depth, !from_schedule);
1958
+ blk_mq_sched_insert_requests(this_hctx, this_ctx, &rq_list,
17031959 from_schedule);
1704
- }
1960
+ } while(!list_empty(&list));
17051961 }
17061962
1707
-static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
1963
+static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
1964
+ unsigned int nr_segs)
17081965 {
1709
- blk_init_request_from_bio(rq, bio);
1966
+ int err;
17101967
1711
- blk_rq_set_rl(rq, blk_get_rl(rq->q, bio));
1968
+ if (bio->bi_opf & REQ_RAHEAD)
1969
+ rq->cmd_flags |= REQ_FAILFAST_MASK;
17121970
1713
- blk_account_io_start(rq, true);
1714
-}
1971
+ rq->__sector = bio->bi_iter.bi_sector;
1972
+ rq->write_hint = bio->bi_write_hint;
1973
+ blk_rq_bio_prep(rq, bio, nr_segs);
17151974
1716
-static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq)
1717
-{
1718
- if (rq->tag != -1)
1719
- return blk_tag_to_qc_t(rq->tag, hctx->queue_num, false);
1975
+ /* This can't fail, since GFP_NOIO includes __GFP_DIRECT_RECLAIM. */
1976
+ err = blk_crypto_rq_bio_prep(rq, bio, GFP_NOIO);
1977
+ WARN_ON_ONCE(err);
17201978
1721
- return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true);
1979
+ blk_account_io_start(rq);
17221980 }
17231981
17241982 static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
17251983 struct request *rq,
1726
- blk_qc_t *cookie)
1984
+ blk_qc_t *cookie, bool last)
17271985 {
17281986 struct request_queue *q = rq->q;
17291987 struct blk_mq_queue_data bd = {
17301988 .rq = rq,
1731
- .last = true,
1989
+ .last = last,
17321990 };
17331991 blk_qc_t new_cookie;
17341992 blk_status_t ret;
@@ -1763,7 +2021,7 @@
17632021 static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
17642022 struct request *rq,
17652023 blk_qc_t *cookie,
1766
- bool bypass_insert)
2024
+ bool bypass_insert, bool last)
17672025 {
17682026 struct request_queue *q = rq->q;
17692027 bool run_queue = true;
@@ -1784,23 +2042,35 @@
17842042 if (q->elevator && !bypass_insert)
17852043 goto insert;
17862044
1787
- if (!blk_mq_get_dispatch_budget(hctx))
2045
+ if (!blk_mq_get_dispatch_budget(q))
17882046 goto insert;
17892047
17902048 if (!blk_mq_get_driver_tag(rq)) {
1791
- blk_mq_put_dispatch_budget(hctx);
2049
+ blk_mq_put_dispatch_budget(q);
17922050 goto insert;
17932051 }
17942052
1795
- return __blk_mq_issue_directly(hctx, rq, cookie);
2053
+ return __blk_mq_issue_directly(hctx, rq, cookie, last);
17962054 insert:
17972055 if (bypass_insert)
17982056 return BLK_STS_RESOURCE;
17992057
1800
- blk_mq_request_bypass_insert(rq, run_queue);
2058
+ blk_mq_sched_insert_request(rq, false, run_queue, false);
2059
+
18012060 return BLK_STS_OK;
18022061 }
18032062
2063
+/**
2064
+ * blk_mq_try_issue_directly - Try to send a request directly to device driver.
2065
+ * @hctx: Pointer to the associated hardware queue.
2066
+ * @rq: Pointer to request to be sent.
2067
+ * @cookie: Request queue cookie.
2068
+ *
2069
+ * If the device has enough resources to accept a new request now, send the
2070
+ * request directly to the device driver. Otherwise, insert it into the
2071
+ * hctx->dispatch queue, so we can try to send it again in the future. Requests
2072
+ * inserted into this queue have higher priority.
2073
+ */
18042074 static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
18052075 struct request *rq, blk_qc_t *cookie)
18062076 {
....@@ -1811,25 +2081,24 @@
18112081
18122082 hctx_lock(hctx, &srcu_idx);
18132083
1814
- ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false);
2084
+ ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false, true);
18152085 if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
1816
- blk_mq_request_bypass_insert(rq, true);
2086
+ blk_mq_request_bypass_insert(rq, false, true);
18172087 else if (ret != BLK_STS_OK)
18182088 blk_mq_end_request(rq, ret);
18192089
18202090 hctx_unlock(hctx, srcu_idx);
18212091 }
18222092
1823
-blk_status_t blk_mq_request_issue_directly(struct request *rq)
2093
+blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
18242094 {
18252095 blk_status_t ret;
18262096 int srcu_idx;
18272097 blk_qc_t unused_cookie;
1828
- struct blk_mq_ctx *ctx = rq->mq_ctx;
1829
- struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
2098
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
18302099
18312100 hctx_lock(hctx, &srcu_idx);
1832
- ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true);
2101
+ ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true, last);
18332102 hctx_unlock(hctx, srcu_idx);
18342103
18352104 return ret;
....@@ -1838,104 +2107,169 @@
18382107 void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
18392108 struct list_head *list)
18402109 {
2110
+ int queued = 0;
2111
+ int errors = 0;
2112
+
18412113 while (!list_empty(list)) {
18422114 blk_status_t ret;
18432115 struct request *rq = list_first_entry(list, struct request,
18442116 queuelist);
18452117
18462118 list_del_init(&rq->queuelist);
1847
- ret = blk_mq_request_issue_directly(rq);
2119
+ ret = blk_mq_request_issue_directly(rq, list_empty(list));
18482120 if (ret != BLK_STS_OK) {
2121
+ errors++;
18492122 if (ret == BLK_STS_RESOURCE ||
18502123 ret == BLK_STS_DEV_RESOURCE) {
1851
- blk_mq_request_bypass_insert(rq,
2124
+ blk_mq_request_bypass_insert(rq, false,
18522125 list_empty(list));
18532126 break;
18542127 }
18552128 blk_mq_end_request(rq, ret);
1856
- }
2129
+ } else
2130
+ queued++;
2131
+ }
2132
+
2133
+ /*
2134
+ * If we didn't flush the entire list, we could have told
2135
+ * the driver there was more coming, but that turned out to
2136
+ * be a lie.
2137
+ */
2138
+ if ((!list_empty(list) || errors) &&
2139
+ hctx->queue->mq_ops->commit_rqs && queued)
2140
+ hctx->queue->mq_ops->commit_rqs(hctx);
2141
+}
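/*
 * Illustrative sketch (not from this patch): how a driver might pair
 * ->queue_rq() with the ->commit_rqs() hook that the code above falls back
 * to when an earlier bd->last == false turned out to be wrong.  "foo", its
 * doorbell register and ring layout are made-up placeholders.
 */
#include <linux/blk-mq.h>
#include <linux/io.h>

struct foo_queue {
	void __iomem *doorbell;		/* hypothetical submission doorbell */
	u16 tail;			/* next free slot in the sq ring */
};

/* Queue one request; only notify the device when bd->last marks the end
 * of the batch. */
static blk_status_t foo_queue_rq(struct blk_mq_hw_ctx *hctx,
				 const struct blk_mq_queue_data *bd)
{
	struct foo_queue *fq = hctx->driver_data;

	blk_mq_start_request(bd->rq);
	/* ... copy bd->rq into the submission ring, advance fq->tail ... */
	if (bd->last)
		writel(fq->tail, fq->doorbell);
	return BLK_STS_OK;
}

/* Catch-up notification used by blk_mq_try_issue_list_directly() above when
 * "more requests are coming" turned out to be a lie. */
static void foo_commit_rqs(struct blk_mq_hw_ctx *hctx)
{
	struct foo_queue *fq = hctx->driver_data;

	writel(fq->tail, fq->doorbell);
}

static const struct blk_mq_ops foo_mq_ops = {
	.queue_rq	= foo_queue_rq,
	.commit_rqs	= foo_commit_rqs,
};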
2142
+
2143
+static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
2144
+{
2145
+ list_add_tail(&rq->queuelist, &plug->mq_list);
2146
+ plug->rq_count++;
2147
+ if (!plug->multiple_queues && !list_is_singular(&plug->mq_list)) {
2148
+ struct request *tmp;
2149
+
2150
+ tmp = list_first_entry(&plug->mq_list, struct request,
2151
+ queuelist);
2152
+ if (tmp->q != rq->q)
2153
+ plug->multiple_queues = true;
18572154 }
18582155 }
18592156
1860
-static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
2157
+/*
2158
+ * Allow 2x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple
2159
+ * queues. This is important for md arrays to benefit from merging
2160
+ * requests.
2161
+ */
2162
+static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug)
18612163 {
2164
+ if (plug->multiple_queues)
2165
+ return BLK_MAX_REQUEST_COUNT * 2;
2166
+ return BLK_MAX_REQUEST_COUNT;
2167
+}
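/*
 * Submitter-side sketch (illustrative, not from this patch): the plug
 * handling above, and the plug branches in blk_mq_submit_bio() below, only
 * trigger when the caller wrapped its submissions in blk_start_plug() /
 * blk_finish_plug().  demo_submit_batch() and its arguments are placeholders.
 */
#include <linux/blkdev.h>

static void demo_submit_batch(struct bio **bios, int nr)
{
	struct blk_plug plug;
	int i;

	blk_start_plug(&plug);		/* requests collect on current->plug */
	for (i = 0; i < nr; i++)
		submit_bio(bios[i]);
	blk_finish_plug(&plug);		/* flushed via blk_flush_plug_list() */
}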
2168
+
2169
+/**
2170
+ * blk_mq_submit_bio - Create and send a request to block device.
2171
+ * @bio: Bio pointer.
2172
+ *
2173
+ * Builds up a request structure from @q and @bio and sends it to the device. The
2174
+ * request may not be queued directly to hardware if:
2175
+ * * This request can be merged with another one
2176
+ * * We want to place request at plug queue for possible future merging
2177
+ * * There is an IO scheduler active at this queue
2178
+ *
2179
+ * It will not queue the request if there is an error with the bio or during
2180
+ * request creation.
2181
+ *
2182
+ * Returns: Request queue cookie.
2183
+ */
2184
+blk_qc_t blk_mq_submit_bio(struct bio *bio)
2185
+{
2186
+ struct request_queue *q = bio->bi_disk->queue;
18622187 const int is_sync = op_is_sync(bio->bi_opf);
18632188 const int is_flush_fua = op_is_flush(bio->bi_opf);
1864
- struct blk_mq_alloc_data data = { .flags = 0 };
2189
+ struct blk_mq_alloc_data data = {
2190
+ .q = q,
2191
+ };
18652192 struct request *rq;
1866
- unsigned int request_count = 0;
18672193 struct blk_plug *plug;
18682194 struct request *same_queue_rq = NULL;
2195
+ unsigned int nr_segs;
18692196 blk_qc_t cookie;
2197
+ blk_status_t ret;
18702198
18712199 blk_queue_bounce(q, &bio);
1872
-
1873
- blk_queue_split(q, &bio);
2200
+ __blk_queue_split(&bio, &nr_segs);
18742201
18752202 if (!bio_integrity_prep(bio))
1876
- return BLK_QC_T_NONE;
2203
+ goto queue_exit;
18772204
18782205 if (!is_flush_fua && !blk_queue_nomerges(q) &&
1879
- blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
1880
- return BLK_QC_T_NONE;
2206
+ blk_attempt_plug_merge(q, bio, nr_segs, &same_queue_rq))
2207
+ goto queue_exit;
18812208
1882
- if (blk_mq_sched_bio_merge(q, bio))
1883
- return BLK_QC_T_NONE;
2209
+ if (blk_mq_sched_bio_merge(q, bio, nr_segs))
2210
+ goto queue_exit;
18842211
1885
- rq_qos_throttle(q, bio, NULL);
2212
+ rq_qos_throttle(q, bio);
18862213
1887
- trace_block_getrq(q, bio, bio->bi_opf);
1888
-
1889
- rq = blk_mq_get_request(q, bio, bio->bi_opf, &data);
2214
+ data.cmd_flags = bio->bi_opf;
2215
+ rq = __blk_mq_alloc_request(&data);
18902216 if (unlikely(!rq)) {
18912217 rq_qos_cleanup(q, bio);
18922218 if (bio->bi_opf & REQ_NOWAIT)
18932219 bio_wouldblock_error(bio);
1894
- return BLK_QC_T_NONE;
2220
+ goto queue_exit;
18952221 }
2222
+
2223
+ trace_block_getrq(q, bio, bio->bi_opf);
18962224
18972225 rq_qos_track(q, rq, bio);
18982226
18992227 cookie = request_to_qc_t(data.hctx, rq);
19002228
1901
- plug = current->plug;
1902
- if (unlikely(is_flush_fua)) {
1903
- blk_mq_put_ctx(data.ctx);
1904
- blk_mq_bio_to_request(rq, bio);
2229
+ blk_mq_bio_to_request(rq, bio, nr_segs);
19052230
1906
- /* bypass scheduler for flush rq */
2231
+ ret = blk_crypto_init_request(rq);
2232
+ if (ret != BLK_STS_OK) {
2233
+ bio->bi_status = ret;
2234
+ bio_endio(bio);
2235
+ blk_mq_free_request(rq);
2236
+ return BLK_QC_T_NONE;
2237
+ }
2238
+
2239
+ plug = blk_mq_plug(q, bio);
2240
+ if (unlikely(is_flush_fua)) {
2241
+ /* Bypass scheduler for flush requests */
19072242 blk_insert_flush(rq);
19082243 blk_mq_run_hw_queue(data.hctx, true);
1909
- } else if (plug && q->nr_hw_queues == 1) {
1910
- struct request *last = NULL;
1911
-
1912
- blk_mq_put_ctx(data.ctx);
1913
- blk_mq_bio_to_request(rq, bio);
1914
-
2244
+ } else if (plug && (q->nr_hw_queues == 1 ||
2245
+ blk_mq_is_sbitmap_shared(rq->mq_hctx->flags) ||
2246
+ q->mq_ops->commit_rqs || !blk_queue_nonrot(q))) {
19152247 /*
1916
- * @request_count may become stale because of schedule
1917
- * out, so check the list again.
2248
+ * Use plugging if we have a ->commit_rqs() hook as well, as
2249
+ * we know the driver uses bd->last in a smart fashion.
2250
+ *
2251
+ * Use normal plugging if this disk is a slow HDD, as sequential
2252
+ * IO may benefit a lot from plug merging.
19182253 */
1919
- if (list_empty(&plug->mq_list))
1920
- request_count = 0;
1921
- else if (blk_queue_nomerges(q))
1922
- request_count = blk_plug_queued_count(q);
2254
+ unsigned int request_count = plug->rq_count;
2255
+ struct request *last = NULL;
19232256
19242257 if (!request_count)
19252258 trace_block_plug(q);
19262259 else
19272260 last = list_entry_rq(plug->mq_list.prev);
19282261
1929
- if (request_count >= BLK_MAX_REQUEST_COUNT || (last &&
2262
+ if (request_count >= blk_plug_max_rq_count(plug) || (last &&
19302263 blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
19312264 blk_flush_plug_list(plug, false);
19322265 trace_block_plug(q);
19332266 }
19342267
1935
- list_add_tail(&rq->queuelist, &plug->mq_list);
2268
+ blk_add_rq_to_plug(plug, rq);
2269
+ } else if (q->elevator) {
2270
+ /* Insert the request at the IO scheduler queue */
2271
+ blk_mq_sched_insert_request(rq, false, true, true);
19362272 } else if (plug && !blk_queue_nomerges(q)) {
1937
- blk_mq_bio_to_request(rq, bio);
1938
-
19392273 /*
19402274 * We do limited plugging. If the bio can be merged, do that.
19412275 * Otherwise the existing request in the plug list will be
....@@ -1945,30 +2279,74 @@
19452279 */
19462280 if (list_empty(&plug->mq_list))
19472281 same_queue_rq = NULL;
1948
- if (same_queue_rq)
2282
+ if (same_queue_rq) {
19492283 list_del_init(&same_queue_rq->queuelist);
1950
- list_add_tail(&rq->queuelist, &plug->mq_list);
1951
-
1952
- blk_mq_put_ctx(data.ctx);
2284
+ plug->rq_count--;
2285
+ }
2286
+ blk_add_rq_to_plug(plug, rq);
2287
+ trace_block_plug(q);
19532288
19542289 if (same_queue_rq) {
1955
- data.hctx = blk_mq_map_queue(q,
1956
- same_queue_rq->mq_ctx->cpu);
2290
+ data.hctx = same_queue_rq->mq_hctx;
2291
+ trace_block_unplug(q, 1, true);
19572292 blk_mq_try_issue_directly(data.hctx, same_queue_rq,
19582293 &cookie);
19592294 }
1960
- } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator &&
1961
- !data.hctx->dispatch_busy)) {
1962
- blk_mq_put_ctx(data.ctx);
1963
- blk_mq_bio_to_request(rq, bio);
2295
+ } else if ((q->nr_hw_queues > 1 && is_sync) ||
2296
+ !data.hctx->dispatch_busy) {
2297
+ /*
2298
+ * There is no scheduler and we can try to send directly
2299
+ * to the hardware.
2300
+ */
19642301 blk_mq_try_issue_directly(data.hctx, rq, &cookie);
19652302 } else {
1966
- blk_mq_put_ctx(data.ctx);
1967
- blk_mq_bio_to_request(rq, bio);
2303
+ /* Default case. */
19682304 blk_mq_sched_insert_request(rq, false, true, true);
19692305 }
19702306
19712307 return cookie;
2308
+queue_exit:
2309
+ blk_queue_exit(q);
2310
+ return BLK_QC_T_NONE;
2311
+}
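/*
 * Context sketch (an illustration, not part of this patch): blk_mq_submit_bio()
 * above is normally reached through submit_bio() rather than called directly.
 * demo_end_io(), the page and the block_device are placeholders.
 */
#include <linux/bio.h>
#include <linux/blkdev.h>

static void demo_end_io(struct bio *bio)
{
	/* Runs once the request built by blk_mq_submit_bio() completes. */
	pr_info("demo read done: %d\n", blk_status_to_errno(bio->bi_status));
	bio_put(bio);
}

/* Read one page at sector 0 of @bdev; the bio flows through submit_bio()
 * into blk_mq_submit_bio(). */
static void demo_submit_one(struct block_device *bdev, struct page *page)
{
	struct bio *bio = bio_alloc(GFP_KERNEL, 1);

	bio_set_dev(bio, bdev);
	bio->bi_opf = REQ_OP_READ;
	bio->bi_iter.bi_sector = 0;
	bio_add_page(bio, page, PAGE_SIZE, 0);
	bio->bi_end_io = demo_end_io;
	submit_bio(bio);
}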
2312
+
2313
+static size_t order_to_size(unsigned int order)
2314
+{
2315
+ return (size_t)PAGE_SIZE << order;
2316
+}
2317
+
2318
+/* called before freeing request pool in @tags */
2319
+static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set,
2320
+ struct blk_mq_tags *tags, unsigned int hctx_idx)
2321
+{
2322
+ struct blk_mq_tags *drv_tags = set->tags[hctx_idx];
2323
+ struct page *page;
2324
+ unsigned long flags;
2325
+
2326
+ list_for_each_entry(page, &tags->page_list, lru) {
2327
+ unsigned long start = (unsigned long)page_address(page);
2328
+ unsigned long end = start + order_to_size(page->private);
2329
+ int i;
2330
+
2331
+ for (i = 0; i < set->queue_depth; i++) {
2332
+ struct request *rq = drv_tags->rqs[i];
2333
+ unsigned long rq_addr = (unsigned long)rq;
2334
+
2335
+ if (rq_addr >= start && rq_addr < end) {
2336
+ WARN_ON_ONCE(refcount_read(&rq->ref) != 0);
2337
+ cmpxchg(&drv_tags->rqs[i], rq, NULL);
2338
+ }
2339
+ }
2340
+ }
2341
+
2342
+ /*
2343
+ * Wait until all pending iteration is done.
2344
+ *
2345
+ * Request reference is cleared and it is guaranteed to be observed
2346
+ * after the ->lock is released.
2347
+ */
2348
+ spin_lock_irqsave(&drv_tags->lock, flags);
2349
+ spin_unlock_irqrestore(&drv_tags->lock, flags);
19722350 }
19732351
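/*
 * Reader-side sketch of the drain idiom used in blk_mq_clear_rq_mapping()
 * above (a hypothetical demo_* helper, not the real tag iterator): lookups of
 * tags->rqs[] happen under tags->lock, so the empty lock/unlock pair cannot
 * return while such a lookup is still dereferencing a request that is about
 * to be freed.
 */
static struct request *demo_find_and_get_req(struct blk_mq_tags *tags,
					     unsigned int bitnr)
{
	struct request *rq;
	unsigned long flags;

	spin_lock_irqsave(&tags->lock, flags);
	rq = tags->rqs[bitnr];
	if (rq && !refcount_inc_not_zero(&rq->ref))
		rq = NULL;
	spin_unlock_irqrestore(&tags->lock, flags);
	return rq;
}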
19742352 void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
....@@ -1989,42 +2367,44 @@
19892367 }
19902368 }
19912369
2370
+ blk_mq_clear_rq_mapping(set, tags, hctx_idx);
2371
+
19922372 while (!list_empty(&tags->page_list)) {
19932373 page = list_first_entry(&tags->page_list, struct page, lru);
19942374 list_del_init(&page->lru);
19952375 /*
19962376 * Remove kmemleak object previously allocated in
1997
- * blk_mq_init_rq_map().
2377
+ * blk_mq_alloc_rqs().
19982378 */
19992379 kmemleak_free(page_address(page));
20002380 __free_pages(page, page->private);
20012381 }
20022382 }
20032383
2004
-void blk_mq_free_rq_map(struct blk_mq_tags *tags)
2384
+void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags)
20052385 {
20062386 kfree(tags->rqs);
20072387 tags->rqs = NULL;
20082388 kfree(tags->static_rqs);
20092389 tags->static_rqs = NULL;
20102390
2011
- blk_mq_free_tags(tags);
2391
+ blk_mq_free_tags(tags, flags);
20122392 }
20132393
20142394 struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
20152395 unsigned int hctx_idx,
20162396 unsigned int nr_tags,
2017
- unsigned int reserved_tags)
2397
+ unsigned int reserved_tags,
2398
+ unsigned int flags)
20182399 {
20192400 struct blk_mq_tags *tags;
20202401 int node;
20212402
2022
- node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx);
2403
+ node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
20232404 if (node == NUMA_NO_NODE)
20242405 node = set->numa_node;
20252406
2026
- tags = blk_mq_init_tags(nr_tags, reserved_tags, node,
2027
- BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
2407
+ tags = blk_mq_init_tags(nr_tags, reserved_tags, node, flags);
20282408 if (!tags)
20292409 return NULL;
20302410
....@@ -2032,7 +2412,7 @@
20322412 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
20332413 node);
20342414 if (!tags->rqs) {
2035
- blk_mq_free_tags(tags);
2415
+ blk_mq_free_tags(tags, flags);
20362416 return NULL;
20372417 }
20382418
....@@ -2041,16 +2421,11 @@
20412421 node);
20422422 if (!tags->static_rqs) {
20432423 kfree(tags->rqs);
2044
- blk_mq_free_tags(tags);
2424
+ blk_mq_free_tags(tags, flags);
20452425 return NULL;
20462426 }
20472427
20482428 return tags;
2049
-}
2050
-
2051
-static size_t order_to_size(unsigned int order)
2052
-{
2053
- return (size_t)PAGE_SIZE << order;
20542429 }
20552430
20562431 static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
....@@ -2075,7 +2450,7 @@
20752450 size_t rq_size, left;
20762451 int node;
20772452
2078
- node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx);
2453
+ node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
20792454 if (node == NUMA_NO_NODE)
20802455 node = set->numa_node;
20812456
....@@ -2087,6 +2462,7 @@
20872462 */
20882463 rq_size = round_up(sizeof(struct request) + set->cmd_size,
20892464 cache_line_size());
2465
+ trace_android_vh_blk_alloc_rqs(&rq_size, set, tags);
20902466 left = rq_size * depth;
20912467
20922468 for (i = 0; i < depth; ) {
....@@ -2145,6 +2521,86 @@
21452521 return -ENOMEM;
21462522 }
21472523
2524
+struct rq_iter_data {
2525
+ struct blk_mq_hw_ctx *hctx;
2526
+ bool has_rq;
2527
+};
2528
+
2529
+static bool blk_mq_has_request(struct request *rq, void *data, bool reserved)
2530
+{
2531
+ struct rq_iter_data *iter_data = data;
2532
+
2533
+ if (rq->mq_hctx != iter_data->hctx)
2534
+ return true;
2535
+ iter_data->has_rq = true;
2536
+ return false;
2537
+}
2538
+
2539
+static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx)
2540
+{
2541
+ struct blk_mq_tags *tags = hctx->sched_tags ?
2542
+ hctx->sched_tags : hctx->tags;
2543
+ struct rq_iter_data data = {
2544
+ .hctx = hctx,
2545
+ };
2546
+
2547
+ blk_mq_all_tag_iter(tags, blk_mq_has_request, &data);
2548
+ return data.has_rq;
2549
+}
2550
+
2551
+static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu,
2552
+ struct blk_mq_hw_ctx *hctx)
2553
+{
2554
+ if (cpumask_next_and(-1, hctx->cpumask, cpu_online_mask) != cpu)
2555
+ return false;
2556
+ if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids)
2557
+ return false;
2558
+ return true;
2559
+}
2560
+
2561
+static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
2562
+{
2563
+ struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
2564
+ struct blk_mq_hw_ctx, cpuhp_online);
2565
+
2566
+ if (!cpumask_test_cpu(cpu, hctx->cpumask) ||
2567
+ !blk_mq_last_cpu_in_hctx(cpu, hctx))
2568
+ return 0;
2569
+
2570
+ /*
2571
+ * Prevent new request from being allocated on the current hctx.
2572
+ *
2573
+ * The smp_mb__after_atomic() pairs with the implied barrier in
2574
+ * test_and_set_bit_lock() in sbitmap_get(). It ensures the inactive flag is
2575
+ * seen once we return from the tag allocator.
2576
+ */
2577
+ set_bit(BLK_MQ_S_INACTIVE, &hctx->state);
2578
+ smp_mb__after_atomic();
2579
+
2580
+ /*
2581
+ * Try to grab a reference to the queue and wait for any outstanding
2582
+ * requests. If we could not grab a reference the queue has been
2583
+ * frozen and there are no requests.
2584
+ */
2585
+ if (percpu_ref_tryget(&hctx->queue->q_usage_counter)) {
2586
+ while (blk_mq_hctx_has_requests(hctx))
2587
+ msleep(5);
2588
+ percpu_ref_put(&hctx->queue->q_usage_counter);
2589
+ }
2590
+
2591
+ return 0;
2592
+}
2593
+
2594
+static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
2595
+{
2596
+ struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
2597
+ struct blk_mq_hw_ctx, cpuhp_online);
2598
+
2599
+ if (cpumask_test_cpu(cpu, hctx->cpumask))
2600
+ clear_bit(BLK_MQ_S_INACTIVE, &hctx->state);
2601
+ return 0;
2602
+}
2603
+
21482604 /*
21492605 * 'cpu' is going away. splice any existing rq_list entries from this
21502606 * software queue to the hw queue dispatch list, and ensure that it
....@@ -2155,13 +2611,18 @@
21552611 struct blk_mq_hw_ctx *hctx;
21562612 struct blk_mq_ctx *ctx;
21572613 LIST_HEAD(tmp);
2614
+ enum hctx_type type;
21582615
21592616 hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
2617
+ if (!cpumask_test_cpu(cpu, hctx->cpumask))
2618
+ return 0;
2619
+
21602620 ctx = __blk_mq_get_ctx(hctx->queue, cpu);
2621
+ type = hctx->type;
21612622
21622623 spin_lock(&ctx->lock);
2163
- if (!list_empty(&ctx->rq_list)) {
2164
- list_splice_init(&ctx->rq_list, &tmp);
2624
+ if (!list_empty(&ctx->rq_lists[type])) {
2625
+ list_splice_init(&ctx->rq_lists[type], &tmp);
21652626 blk_mq_hctx_clear_pending(hctx, ctx);
21662627 }
21672628 spin_unlock(&ctx->lock);
....@@ -2179,8 +2640,40 @@
21792640
21802641 static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
21812642 {
2643
+ if (!(hctx->flags & BLK_MQ_F_STACKING))
2644
+ cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
2645
+ &hctx->cpuhp_online);
21822646 cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD,
21832647 &hctx->cpuhp_dead);
2648
+}
2649
+
2650
+/*
2651
+ * Before freeing the hw queue, clear the flush request reference in
2652
+ * tags->rqs[] to avoid a potential use-after-free.
2653
+ */
2654
+static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags,
2655
+ unsigned int queue_depth, struct request *flush_rq)
2656
+{
2657
+ int i;
2658
+ unsigned long flags;
2659
+
2660
+ /* The hw queue may not be mapped yet */
2661
+ if (!tags)
2662
+ return;
2663
+
2664
+ WARN_ON_ONCE(refcount_read(&flush_rq->ref) != 0);
2665
+
2666
+ for (i = 0; i < queue_depth; i++)
2667
+ cmpxchg(&tags->rqs[i], flush_rq, NULL);
2668
+
2669
+ /*
2670
+ * Wait until all pending iteration is done.
2671
+ *
2672
+ * Request reference is cleared and it is guaranteed to be observed
2673
+ * after the ->lock is released.
2674
+ */
2675
+ spin_lock_irqsave(&tags->lock, flags);
2676
+ spin_unlock_irqrestore(&tags->lock, flags);
21842677 }
21852678
21862679 /* hctx->ctxs will be freed in queue's release handler */
....@@ -2188,18 +2681,24 @@
21882681 struct blk_mq_tag_set *set,
21892682 struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
21902683 {
2191
- blk_mq_debugfs_unregister_hctx(hctx);
2684
+ struct request *flush_rq = hctx->fq->flush_rq;
21922685
21932686 if (blk_mq_hw_queue_mapped(hctx))
21942687 blk_mq_tag_idle(hctx);
21952688
2689
+ blk_mq_clear_flush_rq_mapping(set->tags[hctx_idx],
2690
+ set->queue_depth, flush_rq);
21962691 if (set->ops->exit_request)
2197
- set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx);
2692
+ set->ops->exit_request(set, flush_rq, hctx_idx);
21982693
21992694 if (set->ops->exit_hctx)
22002695 set->ops->exit_hctx(hctx, hctx_idx);
22012696
22022697 blk_mq_remove_cpuhp(hctx);
2698
+
2699
+ spin_lock(&q->unused_hctx_lock);
2700
+ list_add(&hctx->hctx_list, &q->unused_hctx_list);
2701
+ spin_unlock(&q->unused_hctx_lock);
22032702 }
22042703
22052704 static void blk_mq_exit_hw_queues(struct request_queue *q,
....@@ -2211,112 +2710,160 @@
22112710 queue_for_each_hw_ctx(q, hctx, i) {
22122711 if (i == nr_queue)
22132712 break;
2713
+ blk_mq_debugfs_unregister_hctx(hctx);
22142714 blk_mq_exit_hctx(q, set, hctx, i);
22152715 }
2716
+}
2717
+
2718
+static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
2719
+{
2720
+ int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
2721
+
2722
+ BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu),
2723
+ __alignof__(struct blk_mq_hw_ctx)) !=
2724
+ sizeof(struct blk_mq_hw_ctx));
2725
+
2726
+ if (tag_set->flags & BLK_MQ_F_BLOCKING)
2727
+ hw_ctx_size += sizeof(struct srcu_struct);
2728
+
2729
+ return hw_ctx_size;
22162730 }
22172731
22182732 static int blk_mq_init_hctx(struct request_queue *q,
22192733 struct blk_mq_tag_set *set,
22202734 struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
22212735 {
2222
- int node;
2736
+ hctx->queue_num = hctx_idx;
22232737
2224
- node = hctx->numa_node;
2738
+ if (!(hctx->flags & BLK_MQ_F_STACKING))
2739
+ cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
2740
+ &hctx->cpuhp_online);
2741
+ cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
2742
+
2743
+ hctx->tags = set->tags[hctx_idx];
2744
+
2745
+ if (set->ops->init_hctx &&
2746
+ set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
2747
+ goto unregister_cpu_notifier;
2748
+
2749
+ if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx,
2750
+ hctx->numa_node))
2751
+ goto exit_hctx;
2752
+ return 0;
2753
+
2754
+ exit_hctx:
2755
+ if (set->ops->exit_hctx)
2756
+ set->ops->exit_hctx(hctx, hctx_idx);
2757
+ unregister_cpu_notifier:
2758
+ blk_mq_remove_cpuhp(hctx);
2759
+ return -1;
2760
+}
2761
+
2762
+static struct blk_mq_hw_ctx *
2763
+blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
2764
+ int node)
2765
+{
2766
+ struct blk_mq_hw_ctx *hctx;
2767
+ gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;
2768
+
2769
+ hctx = kzalloc_node(blk_mq_hw_ctx_size(set), gfp, node);
2770
+ if (!hctx)
2771
+ goto fail_alloc_hctx;
2772
+
2773
+ if (!zalloc_cpumask_var_node(&hctx->cpumask, gfp, node))
2774
+ goto free_hctx;
2775
+
2776
+ atomic_set(&hctx->nr_active, 0);
22252777 if (node == NUMA_NO_NODE)
2226
- node = hctx->numa_node = set->numa_node;
2778
+ node = set->numa_node;
2779
+ hctx->numa_node = node;
22272780
22282781 INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
22292782 spin_lock_init(&hctx->lock);
22302783 INIT_LIST_HEAD(&hctx->dispatch);
22312784 hctx->queue = q;
2232
- hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED;
2785
+ hctx->flags = set->flags & ~BLK_MQ_F_TAG_QUEUE_SHARED;
22332786
2234
- cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
2235
-
2236
- hctx->tags = set->tags[hctx_idx];
2787
+ INIT_LIST_HEAD(&hctx->hctx_list);
22372788
22382789 /*
22392790 * Allocate space for all possible cpus to avoid allocation at
22402791 * runtime
22412792 */
22422793 hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *),
2243
- GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node);
2794
+ gfp, node);
22442795 if (!hctx->ctxs)
2245
- goto unregister_cpu_notifier;
2796
+ goto free_cpumask;
22462797
22472798 if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8),
2248
- GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node))
2799
+ gfp, node))
22492800 goto free_ctxs;
2250
-
22512801 hctx->nr_ctx = 0;
22522802
22532803 spin_lock_init(&hctx->dispatch_wait_lock);
22542804 init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
22552805 INIT_LIST_HEAD(&hctx->dispatch_wait.entry);
22562806
2257
- if (set->ops->init_hctx &&
2258
- set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
2259
- goto free_bitmap;
2260
-
2261
- hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size,
2262
- GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY);
2807
+ hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp);
22632808 if (!hctx->fq)
2264
- goto exit_hctx;
2265
-
2266
- if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, node))
2267
- goto free_fq;
2809
+ goto free_bitmap;
22682810
22692811 if (hctx->flags & BLK_MQ_F_BLOCKING)
22702812 init_srcu_struct(hctx->srcu);
2813
+ blk_mq_hctx_kobj_init(hctx);
22712814
2272
- blk_mq_debugfs_register_hctx(q, hctx);
2815
+ return hctx;
22732816
2274
- return 0;
2275
-
2276
- free_fq:
2277
- blk_free_flush_queue(hctx->fq);
2278
- exit_hctx:
2279
- if (set->ops->exit_hctx)
2280
- set->ops->exit_hctx(hctx, hctx_idx);
22812817 free_bitmap:
22822818 sbitmap_free(&hctx->ctx_map);
22832819 free_ctxs:
22842820 kfree(hctx->ctxs);
2285
- unregister_cpu_notifier:
2286
- blk_mq_remove_cpuhp(hctx);
2287
- return -1;
2821
+ free_cpumask:
2822
+ free_cpumask_var(hctx->cpumask);
2823
+ free_hctx:
2824
+ kfree(hctx);
2825
+ fail_alloc_hctx:
2826
+ return NULL;
22882827 }
22892828
22902829 static void blk_mq_init_cpu_queues(struct request_queue *q,
22912830 unsigned int nr_hw_queues)
22922831 {
2293
- unsigned int i;
2832
+ struct blk_mq_tag_set *set = q->tag_set;
2833
+ unsigned int i, j;
22942834
22952835 for_each_possible_cpu(i) {
22962836 struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
22972837 struct blk_mq_hw_ctx *hctx;
2838
+ int k;
22982839
22992840 __ctx->cpu = i;
23002841 spin_lock_init(&__ctx->lock);
2301
- INIT_LIST_HEAD(&__ctx->rq_list);
2842
+ for (k = HCTX_TYPE_DEFAULT; k < HCTX_MAX_TYPES; k++)
2843
+ INIT_LIST_HEAD(&__ctx->rq_lists[k]);
2844
+
23022845 __ctx->queue = q;
23032846
23042847 /*
23052848 * Set local node, IFF we have more than one hw queue. If
23062849 * not, we remain on the home node of the device
23072850 */
2308
- hctx = blk_mq_map_queue(q, i);
2309
- if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
2310
- hctx->numa_node = local_memory_node(cpu_to_node(i));
2851
+ for (j = 0; j < set->nr_maps; j++) {
2852
+ hctx = blk_mq_map_queue_type(q, j, i);
2853
+ if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
2854
+ hctx->numa_node = cpu_to_node(i);
2855
+ }
23112856 }
23122857 }
23132858
2314
-static bool __blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, int hctx_idx)
2859
+static bool __blk_mq_alloc_map_and_request(struct blk_mq_tag_set *set,
2860
+ int hctx_idx)
23152861 {
2862
+ unsigned int flags = set->flags;
23162863 int ret = 0;
23172864
23182865 set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx,
2319
- set->queue_depth, set->reserved_tags);
2866
+ set->queue_depth, set->reserved_tags, flags);
23202867 if (!set->tags[hctx_idx])
23212868 return false;
23222869
....@@ -2325,7 +2872,7 @@
23252872 if (!ret)
23262873 return true;
23272874
2328
- blk_mq_free_rq_map(set->tags[hctx_idx]);
2875
+ blk_mq_free_rq_map(set->tags[hctx_idx], flags);
23292876 set->tags[hctx_idx] = NULL;
23302877 return false;
23312878 }
....@@ -2333,16 +2880,18 @@
23332880 static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
23342881 unsigned int hctx_idx)
23352882 {
2336
- if (set->tags[hctx_idx]) {
2883
+ unsigned int flags = set->flags;
2884
+
2885
+ if (set->tags && set->tags[hctx_idx]) {
23372886 blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
2338
- blk_mq_free_rq_map(set->tags[hctx_idx]);
2887
+ blk_mq_free_rq_map(set->tags[hctx_idx], flags);
23392888 set->tags[hctx_idx] = NULL;
23402889 }
23412890 }
23422891
23432892 static void blk_mq_map_swqueue(struct request_queue *q)
23442893 {
2345
- unsigned int i, hctx_idx;
2894
+ unsigned int i, j, hctx_idx;
23462895 struct blk_mq_hw_ctx *hctx;
23472896 struct blk_mq_ctx *ctx;
23482897 struct blk_mq_tag_set *set = q->tag_set;
....@@ -2359,25 +2908,52 @@
23592908 * If the cpu isn't present, the cpu is mapped to first hctx.
23602909 */
23612910 for_each_possible_cpu(i) {
2362
- hctx_idx = q->mq_map[i];
2363
- /* unmapped hw queue can be remapped after CPU topo changed */
2364
- if (!set->tags[hctx_idx] &&
2365
- !__blk_mq_alloc_rq_map(set, hctx_idx)) {
2366
- /*
2367
- * If tags initialization fail for some hctx,
2368
- * that hctx won't be brought online. In this
2369
- * case, remap the current ctx to hctx[0] which
2370
- * is guaranteed to always have tags allocated
2371
- */
2372
- q->mq_map[i] = 0;
2373
- }
23742911
23752912 ctx = per_cpu_ptr(q->queue_ctx, i);
2376
- hctx = blk_mq_map_queue(q, i);
2913
+ for (j = 0; j < set->nr_maps; j++) {
2914
+ if (!set->map[j].nr_queues) {
2915
+ ctx->hctxs[j] = blk_mq_map_queue_type(q,
2916
+ HCTX_TYPE_DEFAULT, i);
2917
+ continue;
2918
+ }
2919
+ hctx_idx = set->map[j].mq_map[i];
2920
+ /* unmapped hw queue can be remapped after CPU topo changed */
2921
+ if (!set->tags[hctx_idx] &&
2922
+ !__blk_mq_alloc_map_and_request(set, hctx_idx)) {
2923
+ /*
2924
+ * If tags initialization fails for some hctx,
2925
+ * that hctx won't be brought online. In this
2926
+ * case, remap the current ctx to hctx[0] which
2927
+ * is guaranteed to always have tags allocated
2928
+ */
2929
+ set->map[j].mq_map[i] = 0;
2930
+ }
23772931
2378
- cpumask_set_cpu(i, hctx->cpumask);
2379
- ctx->index_hw = hctx->nr_ctx;
2380
- hctx->ctxs[hctx->nr_ctx++] = ctx;
2932
+ hctx = blk_mq_map_queue_type(q, j, i);
2933
+ ctx->hctxs[j] = hctx;
2934
+ /*
2935
+ * If the CPU is already set in the mask, then we've
2936
+ * mapped this one already. This can happen if
2937
+ * devices share queues across queue maps.
2938
+ */
2939
+ if (cpumask_test_cpu(i, hctx->cpumask))
2940
+ continue;
2941
+
2942
+ cpumask_set_cpu(i, hctx->cpumask);
2943
+ hctx->type = j;
2944
+ ctx->index_hw[hctx->type] = hctx->nr_ctx;
2945
+ hctx->ctxs[hctx->nr_ctx++] = ctx;
2946
+
2947
+ /*
2948
+ * If the nr_ctx type overflows, we have exceeded the
2949
+ * amount of sw queues we can support.
2950
+ */
2951
+ BUG_ON(!hctx->nr_ctx);
2952
+ }
2953
+
2954
+ for (; j < HCTX_MAX_TYPES; j++)
2955
+ ctx->hctxs[j] = blk_mq_map_queue_type(q,
2956
+ HCTX_TYPE_DEFAULT, i);
23812957 }
23822958
23832959 queue_for_each_hw_ctx(q, hctx, i) {
....@@ -2426,14 +3002,14 @@
24263002
24273003 queue_for_each_hw_ctx(q, hctx, i) {
24283004 if (shared)
2429
- hctx->flags |= BLK_MQ_F_TAG_SHARED;
3005
+ hctx->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
24303006 else
2431
- hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
3007
+ hctx->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
24323008 }
24333009 }
24343010
2435
-static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set,
2436
- bool shared)
3011
+static void blk_mq_update_tag_set_shared(struct blk_mq_tag_set *set,
3012
+ bool shared)
24373013 {
24383014 struct request_queue *q;
24393015
....@@ -2451,12 +3027,12 @@
24513027 struct blk_mq_tag_set *set = q->tag_set;
24523028
24533029 mutex_lock(&set->tag_list_lock);
2454
- list_del_rcu(&q->tag_set_list);
3030
+ list_del(&q->tag_set_list);
24553031 if (list_is_singular(&set->tag_list)) {
24563032 /* just transitioned to unshared */
2457
- set->flags &= ~BLK_MQ_F_TAG_SHARED;
3033
+ set->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
24583034 /* update existing queue */
2459
- blk_mq_update_tag_set_depth(set, false);
3035
+ blk_mq_update_tag_set_shared(set, false);
24603036 }
24613037 mutex_unlock(&set->tag_list_lock);
24623038 INIT_LIST_HEAD(&q->tag_set_list);
....@@ -2465,24 +3041,50 @@
24653041 static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
24663042 struct request_queue *q)
24673043 {
2468
- q->tag_set = set;
2469
-
24703044 mutex_lock(&set->tag_list_lock);
24713045
24723046 /*
24733047 * Check to see if we're transitioning to shared (from 1 to 2 queues).
24743048 */
24753049 if (!list_empty(&set->tag_list) &&
2476
- !(set->flags & BLK_MQ_F_TAG_SHARED)) {
2477
- set->flags |= BLK_MQ_F_TAG_SHARED;
3050
+ !(set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
3051
+ set->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
24783052 /* update existing queue */
2479
- blk_mq_update_tag_set_depth(set, true);
3053
+ blk_mq_update_tag_set_shared(set, true);
24803054 }
2481
- if (set->flags & BLK_MQ_F_TAG_SHARED)
3055
+ if (set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
24823056 queue_set_hctx_shared(q, true);
2483
- list_add_tail_rcu(&q->tag_set_list, &set->tag_list);
3057
+ list_add_tail(&q->tag_set_list, &set->tag_list);
24843058
24853059 mutex_unlock(&set->tag_list_lock);
3060
+}
3061
+
3062
+/* All allocations will be freed in release handler of q->mq_kobj */
3063
+static int blk_mq_alloc_ctxs(struct request_queue *q)
3064
+{
3065
+ struct blk_mq_ctxs *ctxs;
3066
+ int cpu;
3067
+
3068
+ ctxs = kzalloc(sizeof(*ctxs), GFP_KERNEL);
3069
+ if (!ctxs)
3070
+ return -ENOMEM;
3071
+
3072
+ ctxs->queue_ctx = alloc_percpu(struct blk_mq_ctx);
3073
+ if (!ctxs->queue_ctx)
3074
+ goto fail;
3075
+
3076
+ for_each_possible_cpu(cpu) {
3077
+ struct blk_mq_ctx *ctx = per_cpu_ptr(ctxs->queue_ctx, cpu);
3078
+ ctx->ctxs = ctxs;
3079
+ }
3080
+
3081
+ q->mq_kobj = &ctxs->kobj;
3082
+ q->queue_ctx = ctxs->queue_ctx;
3083
+
3084
+ return 0;
3085
+ fail:
3086
+ kfree(ctxs);
3087
+ return -ENOMEM;
24863088 }
24873089
24883090 /*
....@@ -2493,17 +3095,17 @@
24933095 */
24943096 void blk_mq_release(struct request_queue *q)
24953097 {
2496
- struct blk_mq_hw_ctx *hctx;
2497
- unsigned int i;
3098
+ struct blk_mq_hw_ctx *hctx, *next;
3099
+ int i;
24983100
2499
- /* hctx kobj stays in hctx */
2500
- queue_for_each_hw_ctx(q, hctx, i) {
2501
- if (!hctx)
2502
- continue;
3101
+ queue_for_each_hw_ctx(q, hctx, i)
3102
+ WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list));
3103
+
3104
+ /* all hctx are in .unused_hctx_list now */
3105
+ list_for_each_entry_safe(hctx, next, &q->unused_hctx_list, hctx_list) {
3106
+ list_del_init(&hctx->hctx_list);
25033107 kobject_put(&hctx->kobj);
25043108 }
2505
-
2506
- q->mq_map = NULL;
25073109
25083110 kfree(q->queue_hw_ctx);
25093111
....@@ -2512,102 +3114,184 @@
25123114 * both share lifetime with request queue.
25133115 */
25143116 blk_mq_sysfs_deinit(q);
2515
-
2516
- free_percpu(q->queue_ctx);
25173117 }
25183118
2519
-struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
3119
+struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
3120
+ void *queuedata)
25203121 {
25213122 struct request_queue *uninit_q, *q;
25223123
2523
- uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node, NULL);
3124
+ uninit_q = blk_alloc_queue(set->numa_node);
25243125 if (!uninit_q)
25253126 return ERR_PTR(-ENOMEM);
3127
+ uninit_q->queuedata = queuedata;
25263128
2527
- q = blk_mq_init_allocated_queue(set, uninit_q);
3129
+ /*
3130
+ * Initialize the queue without an elevator. device_add_disk() will do
3131
+ * the initialization.
3132
+ */
3133
+ q = blk_mq_init_allocated_queue(set, uninit_q, false);
25283134 if (IS_ERR(q))
25293135 blk_cleanup_queue(uninit_q);
25303136
25313137 return q;
25323138 }
3139
+EXPORT_SYMBOL_GPL(blk_mq_init_queue_data);
3140
+
3141
+struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
3142
+{
3143
+ return blk_mq_init_queue_data(set, NULL);
3144
+}
25333145 EXPORT_SYMBOL(blk_mq_init_queue);
25343146
2535
-static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
3147
+/*
3148
+ * Helper for setting up a queue with mq ops, given queue depth, and
3149
+ * the passed in mq ops flags.
3150
+ */
3151
+struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set,
3152
+ const struct blk_mq_ops *ops,
3153
+ unsigned int queue_depth,
3154
+ unsigned int set_flags)
25363155 {
2537
- int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
3156
+ struct request_queue *q;
3157
+ int ret;
25383158
2539
- BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu),
2540
- __alignof__(struct blk_mq_hw_ctx)) !=
2541
- sizeof(struct blk_mq_hw_ctx));
3159
+ memset(set, 0, sizeof(*set));
3160
+ set->ops = ops;
3161
+ set->nr_hw_queues = 1;
3162
+ set->nr_maps = 1;
3163
+ set->queue_depth = queue_depth;
3164
+ set->numa_node = NUMA_NO_NODE;
3165
+ set->flags = set_flags;
25423166
2543
- if (tag_set->flags & BLK_MQ_F_BLOCKING)
2544
- hw_ctx_size += sizeof(struct srcu_struct);
3167
+ ret = blk_mq_alloc_tag_set(set);
3168
+ if (ret)
3169
+ return ERR_PTR(ret);
25453170
2546
- return hw_ctx_size;
3171
+ q = blk_mq_init_queue(set);
3172
+ if (IS_ERR(q)) {
3173
+ blk_mq_free_tag_set(set);
3174
+ return q;
3175
+ }
3176
+
3177
+ return q;
3178
+}
3179
+EXPORT_SYMBOL(blk_mq_init_sq_queue);
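/*
 * Usage sketch for blk_mq_init_sq_queue() above, assuming a simple
 * single-queue driver; foo_tag_set, the queue depth of 64 and the ops
 * argument are illustrative placeholders.
 */
static struct blk_mq_tag_set foo_tag_set;

/* Create the queue; returns an ERR_PTR() on failure, like the helper. */
static struct request_queue *foo_create_queue(const struct blk_mq_ops *ops)
{
	return blk_mq_init_sq_queue(&foo_tag_set, ops, 64,
				    BLK_MQ_F_SHOULD_MERGE);
}

/* Teardown mirrors the helper: the queue first, then the tag set that
 * blk_mq_init_sq_queue() allocated on our behalf. */
static void foo_destroy_queue(struct request_queue *q)
{
	blk_cleanup_queue(q);
	blk_mq_free_tag_set(&foo_tag_set);
}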
3180
+
3181
+static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx(
3182
+ struct blk_mq_tag_set *set, struct request_queue *q,
3183
+ int hctx_idx, int node)
3184
+{
3185
+ struct blk_mq_hw_ctx *hctx = NULL, *tmp;
3186
+
3187
+ /* reuse dead hctx first */
3188
+ spin_lock(&q->unused_hctx_lock);
3189
+ list_for_each_entry(tmp, &q->unused_hctx_list, hctx_list) {
3190
+ if (tmp->numa_node == node) {
3191
+ hctx = tmp;
3192
+ break;
3193
+ }
3194
+ }
3195
+ if (hctx)
3196
+ list_del_init(&hctx->hctx_list);
3197
+ spin_unlock(&q->unused_hctx_lock);
3198
+
3199
+ if (!hctx)
3200
+ hctx = blk_mq_alloc_hctx(q, set, node);
3201
+ if (!hctx)
3202
+ goto fail;
3203
+
3204
+ if (blk_mq_init_hctx(q, set, hctx, hctx_idx))
3205
+ goto free_hctx;
3206
+
3207
+ return hctx;
3208
+
3209
+ free_hctx:
3210
+ kobject_put(&hctx->kobj);
3211
+ fail:
3212
+ return NULL;
25473213 }
25483214
25493215 static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
25503216 struct request_queue *q)
25513217 {
2552
- int i, j;
3218
+ int i, j, end;
25533219 struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;
25543220
2555
- blk_mq_sysfs_unregister(q);
3221
+ if (q->nr_hw_queues < set->nr_hw_queues) {
3222
+ struct blk_mq_hw_ctx **new_hctxs;
3223
+
3224
+ new_hctxs = kcalloc_node(set->nr_hw_queues,
3225
+ sizeof(*new_hctxs), GFP_KERNEL,
3226
+ set->numa_node);
3227
+ if (!new_hctxs)
3228
+ return;
3229
+ if (hctxs)
3230
+ memcpy(new_hctxs, hctxs, q->nr_hw_queues *
3231
+ sizeof(*hctxs));
3232
+ q->queue_hw_ctx = new_hctxs;
3233
+ kfree(hctxs);
3234
+ hctxs = new_hctxs;
3235
+ }
25563236
25573237 /* protect against switching io scheduler */
25583238 mutex_lock(&q->sysfs_lock);
25593239 for (i = 0; i < set->nr_hw_queues; i++) {
25603240 int node;
3241
+ struct blk_mq_hw_ctx *hctx;
25613242
2562
- if (hctxs[i])
3243
+ node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], i);
3244
+ /*
3245
+ * If the hw queue has been mapped to another numa node,
3246
+ * we need to realloc the hctx. If allocation fails, fallback
3247
+ * to use the previous one.
3248
+ */
3249
+ if (hctxs[i] && (hctxs[i]->numa_node == node))
25633250 continue;
25643251
2565
- node = blk_mq_hw_queue_to_node(q->mq_map, i);
2566
- hctxs[i] = kzalloc_node(blk_mq_hw_ctx_size(set),
2567
- GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
2568
- node);
2569
- if (!hctxs[i])
2570
- break;
2571
-
2572
- if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask,
2573
- GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
2574
- node)) {
2575
- kfree(hctxs[i]);
2576
- hctxs[i] = NULL;
2577
- break;
3252
+ hctx = blk_mq_alloc_and_init_hctx(set, q, i, node);
3253
+ if (hctx) {
3254
+ if (hctxs[i])
3255
+ blk_mq_exit_hctx(q, set, hctxs[i], i);
3256
+ hctxs[i] = hctx;
3257
+ } else {
3258
+ if (hctxs[i])
3259
+ pr_warn("Allocate new hctx on node %d fails,\
3260
+ fallback to previous one on node %d\n",
3261
+ node, hctxs[i]->numa_node);
3262
+ else
3263
+ break;
25783264 }
2579
-
2580
- atomic_set(&hctxs[i]->nr_active, 0);
2581
- hctxs[i]->numa_node = node;
2582
- hctxs[i]->queue_num = i;
2583
-
2584
- if (blk_mq_init_hctx(q, set, hctxs[i], i)) {
2585
- free_cpumask_var(hctxs[i]->cpumask);
2586
- kfree(hctxs[i]);
2587
- hctxs[i] = NULL;
2588
- break;
2589
- }
2590
- blk_mq_hctx_kobj_init(hctxs[i]);
25913265 }
2592
- for (j = i; j < q->nr_hw_queues; j++) {
3266
+ /*
3267
+ * Increasing nr_hw_queues fails. Free the newly allocated
3268
+ * hctxs and keep the previous q->nr_hw_queues.
3269
+ */
3270
+ if (i != set->nr_hw_queues) {
3271
+ j = q->nr_hw_queues;
3272
+ end = i;
3273
+ } else {
3274
+ j = i;
3275
+ end = q->nr_hw_queues;
3276
+ q->nr_hw_queues = set->nr_hw_queues;
3277
+ }
3278
+
3279
+ for (; j < end; j++) {
25933280 struct blk_mq_hw_ctx *hctx = hctxs[j];
25943281
25953282 if (hctx) {
25963283 if (hctx->tags)
25973284 blk_mq_free_map_and_requests(set, j);
25983285 blk_mq_exit_hctx(q, set, hctx, j);
2599
- kobject_put(&hctx->kobj);
26003286 hctxs[j] = NULL;
2601
-
26023287 }
26033288 }
2604
- q->nr_hw_queues = i;
26053289 mutex_unlock(&q->sysfs_lock);
2606
- blk_mq_sysfs_register(q);
26073290 }
26083291
26093292 struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
2610
- struct request_queue *q)
3293
+ struct request_queue *q,
3294
+ bool elevator_init)
26113295 {
26123296 /* mark the queue as mq asap */
26133297 q->mq_ops = set->ops;
....@@ -2618,19 +3302,14 @@
26183302 if (!q->poll_cb)
26193303 goto err_exit;
26203304
2621
- q->queue_ctx = alloc_percpu(struct blk_mq_ctx);
2622
- if (!q->queue_ctx)
2623
- goto err_exit;
3305
+ if (blk_mq_alloc_ctxs(q))
3306
+ goto err_poll;
26243307
26253308 /* init q->mq_kobj and sw queues' kobjects */
26263309 blk_mq_sysfs_init(q);
26273310
2628
- q->queue_hw_ctx = kcalloc_node(nr_cpu_ids, sizeof(*(q->queue_hw_ctx)),
2629
- GFP_KERNEL, set->numa_node);
2630
- if (!q->queue_hw_ctx)
2631
- goto err_percpu;
2632
-
2633
- q->mq_map = set->mq_map;
3311
+ INIT_LIST_HEAD(&q->unused_hctx_list);
3312
+ spin_lock_init(&q->unused_hctx_lock);
26343313
26353314 blk_mq_realloc_hw_ctxs(set, q);
26363315 if (!q->nr_hw_queues)
....@@ -2639,12 +3318,12 @@
26393318 INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
26403319 blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
26413320
2642
- q->nr_queues = nr_cpu_ids;
3321
+ q->tag_set = set;
26433322
26443323 q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
2645
-
2646
- if (!(set->flags & BLK_MQ_F_SG_MERGE))
2647
- queue_flag_set_unlocked(QUEUE_FLAG_NO_SG_MERGE, q);
3324
+ if (set->nr_maps > HCTX_TYPE_POLL &&
3325
+ set->map[HCTX_TYPE_POLL].nr_queues)
3326
+ blk_queue_flag_set(QUEUE_FLAG_POLL, q);
26483327
26493328 q->sg_reserved_size = INT_MAX;
26503329
....@@ -2652,41 +3331,29 @@
26523331 INIT_LIST_HEAD(&q->requeue_list);
26533332 spin_lock_init(&q->requeue_lock);
26543333
2655
- blk_queue_make_request(q, blk_mq_make_request);
2656
- if (q->mq_ops->poll)
2657
- q->poll_fn = blk_mq_poll;
2658
-
2659
- /*
2660
- * Do this after blk_queue_make_request() overrides it...
2661
- */
26623334 q->nr_requests = set->queue_depth;
26633335
26643336 /*
26653337 * Default to classic polling
26663338 */
2667
- q->poll_nsec = -1;
2668
-
2669
- if (set->ops->complete)
2670
- blk_queue_softirq_done(q, set->ops->complete);
3339
+ q->poll_nsec = BLK_MQ_POLL_CLASSIC;
26713340
26723341 blk_mq_init_cpu_queues(q, set->nr_hw_queues);
26733342 blk_mq_add_queue_tag_set(set, q);
26743343 blk_mq_map_swqueue(q);
26753344
2676
- if (!(set->flags & BLK_MQ_F_NO_SCHED)) {
2677
- int ret;
2678
-
2679
- ret = elevator_init_mq(q);
2680
- if (ret)
2681
- return ERR_PTR(ret);
2682
- }
3345
+ if (elevator_init)
3346
+ elevator_init_mq(q);
26833347
26843348 return q;
26853349
26863350 err_hctxs:
26873351 kfree(q->queue_hw_ctx);
2688
-err_percpu:
2689
- free_percpu(q->queue_ctx);
3352
+ q->nr_hw_queues = 0;
3353
+ blk_mq_sysfs_deinit(q);
3354
+err_poll:
3355
+ blk_stat_free_callback(q->poll_cb);
3356
+ q->poll_cb = NULL;
26903357 err_exit:
26913358 q->mq_ops = NULL;
26923359 return ERR_PTR(-ENOMEM);
....@@ -2704,38 +3371,21 @@
27043371 blk_mq_del_queue_tag_set(q);
27053372 }
27063373
2707
-/* Basically redo blk_mq_init_queue with queue frozen */
2708
-static void blk_mq_queue_reinit(struct request_queue *q)
2709
-{
2710
- WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));
2711
-
2712
- blk_mq_debugfs_unregister_hctxs(q);
2713
- blk_mq_sysfs_unregister(q);
2714
-
2715
- /*
2716
- * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
2717
- * we should change hctx numa_node according to the new topology (this
2718
- * involves freeing and re-allocating memory, worth doing?)
2719
- */
2720
- blk_mq_map_swqueue(q);
2721
-
2722
- blk_mq_sysfs_register(q);
2723
- blk_mq_debugfs_register_hctxs(q);
2724
-}
2725
-
27263374 static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
27273375 {
27283376 int i;
27293377
2730
- for (i = 0; i < set->nr_hw_queues; i++)
2731
- if (!__blk_mq_alloc_rq_map(set, i))
3378
+ for (i = 0; i < set->nr_hw_queues; i++) {
3379
+ if (!__blk_mq_alloc_map_and_request(set, i))
27323380 goto out_unwind;
3381
+ cond_resched();
3382
+ }
27333383
27343384 return 0;
27353385
27363386 out_unwind:
27373387 while (--i >= 0)
2738
- blk_mq_free_rq_map(set->tags[i]);
3388
+ blk_mq_free_map_and_requests(set, i);
27393389
27403390 return -ENOMEM;
27413391 }
....@@ -2745,7 +3395,7 @@
27453395 * may reduce the depth asked for, if memory is tight. set->queue_depth
27463396 * will be updated to reflect the allocated depth.
27473397 */
2748
-static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
3398
+static int blk_mq_alloc_map_and_requests(struct blk_mq_tag_set *set)
27493399 {
27503400 unsigned int depth;
27513401 int err;
....@@ -2777,7 +3427,17 @@
27773427
27783428 static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
27793429 {
2780
- if (set->ops->map_queues) {
3430
+ /*
3431
+ * blk_mq_map_queues() and multiple .map_queues() implementations
3432
+ * expect that set->map[HCTX_TYPE_DEFAULT].nr_queues is set to the
3433
+ * number of hardware queues.
3434
+ */
3435
+ if (set->nr_maps == 1)
3436
+ set->map[HCTX_TYPE_DEFAULT].nr_queues = set->nr_hw_queues;
3437
+
3438
+ if (set->ops->map_queues && !is_kdump_kernel()) {
3439
+ int i;
3440
+
27813441 /*
27823442 * transport .map_queues is usually done in the following
27833443 * way:
....@@ -2785,18 +3445,44 @@
27853445 * for (queue = 0; queue < set->nr_hw_queues; queue++) {
27863446 * mask = get_cpu_mask(queue)
27873447 * for_each_cpu(cpu, mask)
2788
- * set->mq_map[cpu] = queue;
3448
+ * set->map[x].mq_map[cpu] = queue;
27893449 * }
27903450 *
27913451 * When we need to remap, the table has to be cleared for
27923452 * killing stale mapping since one CPU may not be mapped
27933453 * to any hw queue.
27943454 */
2795
- blk_mq_clear_mq_map(set);
3455
+ for (i = 0; i < set->nr_maps; i++)
3456
+ blk_mq_clear_mq_map(&set->map[i]);
27963457
27973458 return set->ops->map_queues(set);
2798
- } else
2799
- return blk_mq_map_queues(set);
3459
+ } else {
3460
+ BUG_ON(set->nr_maps > 1);
3461
+ return blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
3462
+ }
3463
+}
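/*
 * A possible .map_queues implementation matching the pattern described in
 * the comment above, sketched under the assumption of a PCI device whose
 * MSI-X vectors were spread across CPUs at probe time; "foo" and its
 * admin-vector offset are placeholders, not a real driver.
 */
#include <linux/blk-mq.h>
#include <linux/blk-mq-pci.h>

struct foo_dev {
	struct pci_dev *pdev;
};

static int foo_map_queues(struct blk_mq_tag_set *set)
{
	struct foo_dev *foo = set->driver_data;

	/*
	 * Follow the device's IRQ affinity so each CPU submits on the hw
	 * queue whose interrupt it services; vector 0 is assumed to be
	 * reserved for admin work, hence the offset of 1.
	 */
	return blk_mq_pci_map_queues(&set->map[HCTX_TYPE_DEFAULT],
				     foo->pdev, 1);
}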
3464
+
3465
+static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set,
3466
+ int cur_nr_hw_queues, int new_nr_hw_queues)
3467
+{
3468
+ struct blk_mq_tags **new_tags;
3469
+
3470
+ if (cur_nr_hw_queues >= new_nr_hw_queues)
3471
+ return 0;
3472
+
3473
+ new_tags = kcalloc_node(new_nr_hw_queues, sizeof(struct blk_mq_tags *),
3474
+ GFP_KERNEL, set->numa_node);
3475
+ if (!new_tags)
3476
+ return -ENOMEM;
3477
+
3478
+ if (set->tags)
3479
+ memcpy(new_tags, set->tags, cur_nr_hw_queues *
3480
+ sizeof(*set->tags));
3481
+ kfree(set->tags);
3482
+ set->tags = new_tags;
3483
+ set->nr_hw_queues = new_nr_hw_queues;
3484
+
3485
+ return 0;
28003486 }
28013487
28023488 /*
....@@ -2807,7 +3493,7 @@
28073493 */
28083494 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
28093495 {
2810
- int ret;
3496
+ int i, ret;
28113497
28123498 BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);
28133499
....@@ -2830,6 +3516,11 @@
28303516 set->queue_depth = BLK_MQ_MAX_DEPTH;
28313517 }
28323518
3519
+ if (!set->nr_maps)
3520
+ set->nr_maps = 1;
3521
+ else if (set->nr_maps > HCTX_MAX_TYPES)
3522
+ return -EINVAL;
3523
+
28333524 /*
28343525 * If a crashdump is active, then we are potentially in a very
28353526 * memory constrained environment. Limit us to 1 queue and
....@@ -2837,42 +3528,59 @@
28373528 */
28383529 if (is_kdump_kernel()) {
28393530 set->nr_hw_queues = 1;
3531
+ set->nr_maps = 1;
28403532 set->queue_depth = min(64U, set->queue_depth);
28413533 }
28423534 /*
2843
- * There is no use for more h/w queues than cpus.
3535
+ * There is no use for more h/w queues than cpus if we just have
3536
+ * a single map
28443537 */
2845
- if (set->nr_hw_queues > nr_cpu_ids)
3538
+ if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids)
28463539 set->nr_hw_queues = nr_cpu_ids;
28473540
2848
- set->tags = kcalloc_node(nr_cpu_ids, sizeof(struct blk_mq_tags *),
2849
- GFP_KERNEL, set->numa_node);
2850
- if (!set->tags)
3541
+ if (blk_mq_realloc_tag_set_tags(set, 0, set->nr_hw_queues) < 0)
28513542 return -ENOMEM;
28523543
28533544 ret = -ENOMEM;
2854
- set->mq_map = kcalloc_node(nr_cpu_ids, sizeof(*set->mq_map),
2855
- GFP_KERNEL, set->numa_node);
2856
- if (!set->mq_map)
2857
- goto out_free_tags;
3545
+ for (i = 0; i < set->nr_maps; i++) {
3546
+ set->map[i].mq_map = kcalloc_node(nr_cpu_ids,
3547
+ sizeof(set->map[i].mq_map[0]),
3548
+ GFP_KERNEL, set->numa_node);
3549
+ if (!set->map[i].mq_map)
3550
+ goto out_free_mq_map;
3551
+ set->map[i].nr_queues = is_kdump_kernel() ? 1 : set->nr_hw_queues;
3552
+ }
28583553
28593554 ret = blk_mq_update_queue_map(set);
28603555 if (ret)
28613556 goto out_free_mq_map;
28623557
2863
- ret = blk_mq_alloc_rq_maps(set);
3558
+ ret = blk_mq_alloc_map_and_requests(set);
28643559 if (ret)
28653560 goto out_free_mq_map;
3561
+
3562
+ if (blk_mq_is_sbitmap_shared(set->flags)) {
3563
+ atomic_set(&set->active_queues_shared_sbitmap, 0);
3564
+
3565
+ if (blk_mq_init_shared_sbitmap(set, set->flags)) {
3566
+ ret = -ENOMEM;
3567
+ goto out_free_mq_rq_maps;
3568
+ }
3569
+ }
28663570
28673571 mutex_init(&set->tag_list_lock);
28683572 INIT_LIST_HEAD(&set->tag_list);
28693573
28703574 return 0;
28713575
3576
+out_free_mq_rq_maps:
3577
+ for (i = 0; i < set->nr_hw_queues; i++)
3578
+ blk_mq_free_map_and_requests(set, i);
28723579 out_free_mq_map:
2873
- kfree(set->mq_map);
2874
- set->mq_map = NULL;
2875
-out_free_tags:
3580
+ for (i = 0; i < set->nr_maps; i++) {
3581
+ kfree(set->map[i].mq_map);
3582
+ set->map[i].mq_map = NULL;
3583
+ }
28763584 kfree(set->tags);
28773585 set->tags = NULL;
28783586 return ret;
....@@ -2881,13 +3589,18 @@
28813589
28823590 void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
28833591 {
2884
- int i;
3592
+ int i, j;
28853593
2886
- for (i = 0; i < nr_cpu_ids; i++)
3594
+ for (i = 0; i < set->nr_hw_queues; i++)
28873595 blk_mq_free_map_and_requests(set, i);
28883596
2889
- kfree(set->mq_map);
2890
- set->mq_map = NULL;
3597
+ if (blk_mq_is_sbitmap_shared(set->flags))
3598
+ blk_mq_exit_shared_sbitmap(set);
3599
+
3600
+ for (j = 0; j < set->nr_maps; j++) {
3601
+ kfree(set->map[j].mq_map);
3602
+ set->map[j].mq_map = NULL;
3603
+ }
28913604
28923605 kfree(set->tags);
28933606 set->tags = NULL;
....@@ -2903,6 +3616,9 @@
29033616 if (!set)
29043617 return -EINVAL;
29053618
3619
+ if (q->nr_requests == nr)
3620
+ return 0;
3621
+
29063622 blk_mq_freeze_queue(q);
29073623 blk_mq_quiesce_queue(q);
29083624
....@@ -2917,14 +3633,16 @@
29173633 if (!hctx->sched_tags) {
29183634 ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr,
29193635 false);
3636
+ if (!ret && blk_mq_is_sbitmap_shared(set->flags))
3637
+ blk_mq_tag_resize_shared_sbitmap(set, nr);
29203638 } else {
29213639 ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
29223640 nr, true);
29233641 }
29243642 if (ret)
29253643 break;
2926
- if (q->elevator && q->elevator->type->ops.mq.depth_updated)
2927
- q->elevator->type->ops.mq.depth_updated(hctx);
3644
+ if (q->elevator && q->elevator->type->ops.depth_updated)
3645
+ q->elevator->type->ops.depth_updated(hctx);
29283646 }
29293647
29303648 if (!ret)
....@@ -3011,20 +3729,19 @@
30113729 {
30123730 struct request_queue *q;
30133731 LIST_HEAD(head);
3732
+ int prev_nr_hw_queues;
30143733
30153734 lockdep_assert_held(&set->tag_list_lock);
30163735
3017
- if (nr_hw_queues > nr_cpu_ids)
3736
+ if (set->nr_maps == 1 && nr_hw_queues > nr_cpu_ids)
30183737 nr_hw_queues = nr_cpu_ids;
3019
- if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues)
3738
+ if (nr_hw_queues < 1)
3739
+ return;
3740
+ if (set->nr_maps == 1 && nr_hw_queues == set->nr_hw_queues)
30203741 return;
30213742
30223743 list_for_each_entry(q, &set->tag_list, tag_set_list)
30233744 blk_mq_freeze_queue(q);
3024
- /*
3025
- * Sync with blk_mq_queue_tag_busy_iter.
3026
- */
3027
- synchronize_rcu();
30283745 /*
30293746 * Switch IO scheduler to 'none', cleaning up the data associated
30303747 * with the previous scheduler. We will switch back once we are done
....@@ -3034,11 +3751,35 @@
30343751 if (!blk_mq_elv_switch_none(&head, q))
30353752 goto switch_back;
30363753
3754
+ list_for_each_entry(q, &set->tag_list, tag_set_list) {
3755
+ blk_mq_debugfs_unregister_hctxs(q);
3756
+ blk_mq_sysfs_unregister(q);
3757
+ }
3758
+
3759
+ prev_nr_hw_queues = set->nr_hw_queues;
3760
+ if (blk_mq_realloc_tag_set_tags(set, set->nr_hw_queues, nr_hw_queues) <
3761
+ 0)
3762
+ goto reregister;
3763
+
30373764 set->nr_hw_queues = nr_hw_queues;
3765
+fallback:
30383766 blk_mq_update_queue_map(set);
30393767 list_for_each_entry(q, &set->tag_list, tag_set_list) {
30403768 blk_mq_realloc_hw_ctxs(set, q);
3041
- blk_mq_queue_reinit(q);
3769
+ if (q->nr_hw_queues != set->nr_hw_queues) {
3770
+ pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n",
3771
+ nr_hw_queues, prev_nr_hw_queues);
3772
+ set->nr_hw_queues = prev_nr_hw_queues;
3773
+ blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
3774
+ goto fallback;
3775
+ }
3776
+ blk_mq_map_swqueue(q);
3777
+ }
3778
+
3779
+reregister:
3780
+ list_for_each_entry(q, &set->tag_list, tag_set_list) {
3781
+ blk_mq_sysfs_register(q);
3782
+ blk_mq_debugfs_register_hctxs(q);
30423783 }
30433784
30443785 switch_back:
....@@ -3092,7 +3833,6 @@
30923833 }
30933834
30943835 static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
3095
- struct blk_mq_hw_ctx *hctx,
30963836 struct request *rq)
30973837 {
30983838 unsigned long ret = 0;
....@@ -3125,7 +3865,6 @@
31253865 }
31263866
31273867 static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
3128
- struct blk_mq_hw_ctx *hctx,
31293868 struct request *rq)
31303869 {
31313870 struct hrtimer_sleeper hs;
....@@ -3137,18 +3876,15 @@
31373876 return false;
31383877
31393878 /*
3140
- * poll_nsec can be:
3879
+ * If we get here, hybrid polling is enabled. Hence poll_nsec can be:
31413880 *
3142
- * -1: don't ever hybrid sleep
31433881 * 0: use half of prev avg
31443882 * >0: use this specific value
31453883 */
3146
- if (q->poll_nsec == -1)
3147
- return false;
3148
- else if (q->poll_nsec > 0)
3884
+ if (q->poll_nsec > 0)
31493885 nsecs = q->poll_nsec;
31503886 else
3151
- nsecs = blk_mq_poll_nsecs(q, hctx, rq);
3887
+ nsecs = blk_mq_poll_nsecs(q, rq);
31523888
31533889 if (!nsecs)
31543890 return false;
....@@ -3162,14 +3898,14 @@
31623898 kt = nsecs;
31633899
31643900 mode = HRTIMER_MODE_REL;
3165
- hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, mode, current);
3901
+ hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, mode);
31663902 hrtimer_set_expires(&hs.timer, kt);
31673903
31683904 do {
31693905 if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE)
31703906 break;
31713907 set_current_state(TASK_UNINTERRUPTIBLE);
3172
- hrtimer_start_expires(&hs.timer, mode);
3908
+ hrtimer_sleeper_start_expires(&hs, mode);
31733909 if (hs.task)
31743910 io_schedule();
31753911 hrtimer_cancel(&hs.timer);
....@@ -3181,59 +3917,14 @@
31813917 return true;
31823918 }
31833919
3184
-static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq)
3920
+static bool blk_mq_poll_hybrid(struct request_queue *q,
3921
+ struct blk_mq_hw_ctx *hctx, blk_qc_t cookie)
31853922 {
3186
- struct request_queue *q = hctx->queue;
3187
- long state;
3188
-
3189
- /*
3190
- * If we sleep, have the caller restart the poll loop to reset
3191
- * the state. Like for the other success return cases, the
3192
- * caller is responsible for checking if the IO completed. If
3193
- * the IO isn't complete, we'll get called again and will go
3194
- * straight to the busy poll loop.
3195
- */
3196
- if (blk_mq_poll_hybrid_sleep(q, hctx, rq))
3197
- return true;
3198
-
3199
- hctx->poll_considered++;
3200
-
3201
- state = current->state;
3202
- while (!need_resched()) {
3203
- int ret;
3204
-
3205
- hctx->poll_invoked++;
3206
-
3207
- ret = q->mq_ops->poll(hctx, rq->tag);
3208
- if (ret > 0) {
3209
- hctx->poll_success++;
3210
- set_current_state(TASK_RUNNING);
3211
- return true;
3212
- }
3213
-
3214
- if (signal_pending_state(state, current))
3215
- set_current_state(TASK_RUNNING);
3216
-
3217
- if (current->state == TASK_RUNNING)
3218
- return true;
3219
- if (ret < 0)
3220
- break;
3221
- cpu_relax();
3222
- }
3223
-
3224
- __set_current_state(TASK_RUNNING);
3225
- return false;
3226
-}
3227
-
3228
-static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
3229
-{
3230
- struct blk_mq_hw_ctx *hctx;
32313923 struct request *rq;
32323924
3233
- if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
3925
+ if (q->poll_nsec == BLK_MQ_POLL_CLASSIC)
32343926 return false;
32353927
3236
- hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
32373928 if (!blk_qc_t_is_internal(cookie))
32383929 rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
32393930 else {
....@@ -3248,13 +3939,97 @@
32483939 return false;
32493940 }
32503941
3251
- return __blk_mq_poll(hctx, rq);
3942
+ return blk_mq_poll_hybrid_sleep(q, rq);
32523943 }
3944
+
3945
+/**
3946
+ * blk_poll - poll for IO completions
3947
+ * @q: the queue
3948
+ * @cookie: cookie passed back at IO submission time
3949
+ * @spin: whether to spin for completions
3950
+ *
3951
+ * Description:
3952
+ * Poll for completions on the passed in queue. Returns number of
3953
+ * completed entries found. If @spin is true, then blk_poll will continue
3954
+ * looping until at least one completion is found, unless the task is
3955
+ * otherwise marked running (or we need to reschedule).
3956
+ */
3957
+int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
3958
+{
3959
+ struct blk_mq_hw_ctx *hctx;
3960
+ long state;
3961
+
3962
+ if (!blk_qc_t_valid(cookie) ||
3963
+ !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
3964
+ return 0;
3965
+
3966
+ if (current->plug)
3967
+ blk_flush_plug_list(current->plug, false);
3968
+
3969
+ hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
3970
+
3971
+ /*
3972
+ * If we sleep, have the caller restart the poll loop to reset
3973
+ * the state. Like for the other success return cases, the
3974
+ * caller is responsible for checking if the IO completed. If
3975
+ * the IO isn't complete, we'll get called again and will go
3976
+ * straight to the busy poll loop.
3977
+ */
3978
+ if (blk_mq_poll_hybrid(q, hctx, cookie))
3979
+ return 1;
3980
+
3981
+ hctx->poll_considered++;
3982
+
3983
+ state = current->state;
3984
+ do {
3985
+ int ret;
3986
+
3987
+ hctx->poll_invoked++;
3988
+
3989
+ ret = q->mq_ops->poll(hctx);
3990
+ if (ret > 0) {
3991
+ hctx->poll_success++;
3992
+ __set_current_state(TASK_RUNNING);
3993
+ return ret;
3994
+ }
3995
+
3996
+ if (signal_pending_state(state, current))
3997
+ __set_current_state(TASK_RUNNING);
3998
+
3999
+ if (current->state == TASK_RUNNING)
4000
+ return 1;
4001
+ if (ret < 0 || !spin)
4002
+ break;
4003
+ cpu_relax();
4004
+ } while (!need_resched());
4005
+
4006
+ __set_current_state(TASK_RUNNING);
4007
+ return 0;
4008
+}
4009
+EXPORT_SYMBOL_GPL(blk_poll);
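/*
 * Caller-side sketch of blk_poll() above, modelled on the pattern used by
 * polled direct I/O; demo_* names are placeholders, @bio is assumed to stay
 * alive (e.g. on the caller's stack) until this returns, and the queue is
 * assumed to have QUEUE_FLAG_POLL set.
 */
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/sched.h>

static void demo_poll_end_io(struct bio *bio)
{
	struct task_struct *waiter = bio->bi_private;

	WRITE_ONCE(bio->bi_private, NULL);
	wake_up_process(waiter);	/* in case the submitter stopped polling */
}

/* Submit @bio with REQ_HIPRI and busy-poll its queue until it completes. */
static void demo_submit_and_poll(struct request_queue *q, struct bio *bio)
{
	blk_qc_t cookie;

	bio->bi_opf |= REQ_HIPRI;
	bio->bi_private = current;
	bio->bi_end_io = demo_poll_end_io;
	cookie = submit_bio(bio);

	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (!READ_ONCE(bio->bi_private))
			break;
		if (!blk_poll(q, cookie, true))
			io_schedule();
	}
	__set_current_state(TASK_RUNNING);
}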
4010
+
4011
+unsigned int blk_mq_rq_cpu(struct request *rq)
4012
+{
4013
+ return rq->mq_ctx->cpu;
4014
+}
4015
+EXPORT_SYMBOL(blk_mq_rq_cpu);
32534016
32544017 static int __init blk_mq_init(void)
32554018 {
4019
+ int i;
4020
+
4021
+ for_each_possible_cpu(i)
4022
+ init_llist_head(&per_cpu(blk_cpu_done, i));
4023
+ open_softirq(BLOCK_SOFTIRQ, blk_done_softirq);
4024
+
4025
+ cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD,
4026
+ "block/softirq:dead", NULL,
4027
+ blk_softirq_cpu_dead);
32564028 cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
32574029 blk_mq_hctx_notify_dead);
4030
+ cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online",
4031
+ blk_mq_hctx_notify_online,
4032
+ blk_mq_hctx_notify_offline);
32584033 return 0;
32594034 }
32604035 subsys_initcall(blk_mq_init);