2024-02-20 102a0743326a03cd1a1202ceda21e175b7d3575c
kernel/block/blk-mq.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * Block multiqueue core code
  *
@@ -25,30 +26,36 @@
 #include <linux/delay.h>
 #include <linux/crash_dump.h>
 #include <linux/prefetch.h>
+#include <linux/blk-crypto.h>
 
 #include <trace/events/block.h>
 
 #include <linux/blk-mq.h>
+#include <linux/t10-pi.h>
 #include "blk.h"
 #include "blk-mq.h"
 #include "blk-mq-debugfs.h"
 #include "blk-mq-tag.h"
+#include "blk-pm.h"
 #include "blk-stat.h"
 #include "blk-mq-sched.h"
 #include "blk-rq-qos.h"
 
-static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie);
+#include <trace/hooks/block.h>
+
+static DEFINE_PER_CPU(struct list_head, blk_cpu_done);
+
 static void blk_mq_poll_stats_start(struct request_queue *q);
 static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
 
 static int blk_mq_poll_stats_bkt(const struct request *rq)
 {
-	int ddir, bytes, bucket;
+	int ddir, sectors, bucket;
 
 	ddir = rq_data_dir(rq);
-	bytes = blk_rq_bytes(rq);
+	sectors = blk_rq_stats_sectors(rq);
 
-	bucket = ddir + 2*(ilog2(bytes) - 9);
+	bucket = ddir + 2 * ilog2(sectors);
 
 	if (bucket < 0)
 		return -1;
@@ -59,7 +66,8 @@
 }
 
 /*
- * Check if any of the ctx's have pending work in this hardware queue
+ * Check if any of the ctx, dispatch list or elevator
+ * have pending work in this hardware queue.
  */
 static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
 {
@@ -74,75 +82,67 @@
 static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
 				     struct blk_mq_ctx *ctx)
 {
-	if (!sbitmap_test_bit(&hctx->ctx_map, ctx->index_hw))
-		sbitmap_set_bit(&hctx->ctx_map, ctx->index_hw);
+	const int bit = ctx->index_hw[hctx->type];
+
+	if (!sbitmap_test_bit(&hctx->ctx_map, bit))
+		sbitmap_set_bit(&hctx->ctx_map, bit);
 }
 
 static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
 				      struct blk_mq_ctx *ctx)
 {
-	sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw);
+	const int bit = ctx->index_hw[hctx->type];
+
+	sbitmap_clear_bit(&hctx->ctx_map, bit);
 }
 
 struct mq_inflight {
 	struct hd_struct *part;
-	unsigned int *inflight;
+	unsigned int inflight[2];
 };
 
-static void blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
+static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
 				  struct request *rq, void *priv,
 				  bool reserved)
 {
 	struct mq_inflight *mi = priv;
 
-	/*
-	 * index[0] counts the specific partition that was asked for. index[1]
-	 * counts the ones that are active on the whole device, so increment
-	 * that if mi->part is indeed a partition, and not a whole device.
-	 */
-	if (rq->part == mi->part)
-		mi->inflight[0]++;
-	if (mi->part->partno)
-		mi->inflight[1]++;
-}
-
-void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part,
-		      unsigned int inflight[2])
-{
-	struct mq_inflight mi = { .part = part, .inflight = inflight, };
-
-	inflight[0] = inflight[1] = 0;
-	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
-}
-
-static void blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx,
-				     struct request *rq, void *priv,
-				     bool reserved)
-{
-	struct mq_inflight *mi = priv;
-
-	if (rq->part == mi->part)
+	if ((!mi->part->partno || rq->part == mi->part) &&
+	    blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT)
 		mi->inflight[rq_data_dir(rq)]++;
+
+	return true;
+}
+
+unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part)
+{
+	struct mq_inflight mi = { .part = part };
+
+	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
+
+	return mi.inflight[0] + mi.inflight[1];
 }
 
 void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part,
 			 unsigned int inflight[2])
 {
-	struct mq_inflight mi = { .part = part, .inflight = inflight, };
+	struct mq_inflight mi = { .part = part };
 
-	inflight[0] = inflight[1] = 0;
-	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight_rw, &mi);
+	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
+	inflight[0] = mi.inflight[0];
+	inflight[1] = mi.inflight[1];
 }
 
 void blk_freeze_queue_start(struct request_queue *q)
 {
-	int freeze_depth;
-
-	freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
-	if (freeze_depth == 1) {
+	mutex_lock(&q->mq_freeze_lock);
+	if (++q->mq_freeze_depth == 1) {
 		percpu_ref_kill(&q->q_usage_counter);
-		if (q->mq_ops)
+		mutex_unlock(&q->mq_freeze_lock);
+		if (queue_is_mq(q))
 			blk_mq_run_hw_queues(q, false);
+	} else {
+		mutex_unlock(&q->mq_freeze_lock);
 	}
 }
 EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
@@ -176,8 +176,6 @@
 	 * exported to drivers as the only user for unfreeze is blk_mq.
 	 */
 	blk_freeze_queue_start(q);
-	if (!q->mq_ops)
-		blk_drain_queue(q);
 	blk_mq_freeze_queue_wait(q);
 }
 
@@ -193,14 +191,14 @@
 
 void blk_mq_unfreeze_queue(struct request_queue *q)
 {
-	int freeze_depth;
-
-	freeze_depth = atomic_dec_return(&q->mq_freeze_depth);
-	WARN_ON_ONCE(freeze_depth < 0);
-	if (!freeze_depth) {
-		percpu_ref_reinit(&q->q_usage_counter);
+	mutex_lock(&q->mq_freeze_lock);
+	q->mq_freeze_depth--;
+	WARN_ON_ONCE(q->mq_freeze_depth < 0);
+	if (!q->mq_freeze_depth) {
+		percpu_ref_resurrect(&q->q_usage_counter);
 		wake_up_all(&q->mq_freeze_wq);
 	}
+	mutex_unlock(&q->mq_freeze_lock);
 }
 EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
 
@@ -268,40 +266,37 @@
 			blk_mq_tag_wakeup_all(hctx->tags, true);
 }
 
-bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
+/*
+ * Only need start/end time stamping if we have iostat or
+ * blk stats enabled, or using an IO scheduler.
+ */
+static inline bool blk_mq_need_time_stamp(struct request *rq)
 {
-	return blk_mq_has_free_tags(hctx->tags);
+	return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS)) || rq->q->elevator;
 }
-EXPORT_SYMBOL(blk_mq_can_queue);
 
 static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
-		unsigned int tag, unsigned int op)
+		unsigned int tag, u64 alloc_time_ns)
 {
 	struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
 	struct request *rq = tags->static_rqs[tag];
-	req_flags_t rq_flags = 0;
 
-	if (data->flags & BLK_MQ_REQ_INTERNAL) {
-		rq->tag = -1;
+	if (data->q->elevator) {
+		rq->tag = BLK_MQ_NO_TAG;
 		rq->internal_tag = tag;
 	} else {
-		if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) {
-			rq_flags = RQF_MQ_INFLIGHT;
-			atomic_inc(&data->hctx->nr_active);
-		}
 		rq->tag = tag;
-		rq->internal_tag = -1;
-		data->hctx->tags->rqs[rq->tag] = rq;
+		rq->internal_tag = BLK_MQ_NO_TAG;
 	}
 
 	/* csd/requeue_work/fifo_time is initialized before use */
 	rq->q = data->q;
 	rq->mq_ctx = data->ctx;
-	rq->rq_flags = rq_flags;
-	rq->cpu = -1;
-	rq->cmd_flags = op;
-	if (data->flags & BLK_MQ_REQ_PREEMPT)
-		rq->rq_flags |= RQF_PREEMPT;
+	rq->mq_hctx = data->hctx;
+	rq->rq_flags = 0;
+	rq->cmd_flags = data->cmd_flags;
+	if (data->flags & BLK_MQ_REQ_PM)
+		rq->rq_flags |= RQF_PM;
 	if (blk_queue_io_stat(data->q))
 		rq->rq_flags |= RQF_IO_STAT;
 	INIT_LIST_HEAD(&rq->queuelist);
@@ -309,97 +304,110 @@
 	RB_CLEAR_NODE(&rq->rb_node);
 	rq->rq_disk = NULL;
 	rq->part = NULL;
-	rq->start_time_ns = ktime_get_ns();
+#ifdef CONFIG_BLK_RQ_ALLOC_TIME
+	rq->alloc_time_ns = alloc_time_ns;
+#endif
+	if (blk_mq_need_time_stamp(rq))
+		rq->start_time_ns = ktime_get_ns();
+	else
+		rq->start_time_ns = 0;
 	rq->io_start_time_ns = 0;
+	rq->stats_sectors = 0;
 	rq->nr_phys_segments = 0;
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
 	rq->nr_integrity_segments = 0;
 #endif
-	rq->special = NULL;
+	blk_crypto_rq_set_defaults(rq);
 	/* tag was already set */
-	rq->extra_len = 0;
-	rq->__deadline = 0;
+	WRITE_ONCE(rq->deadline, 0);
 
-	INIT_LIST_HEAD(&rq->timeout_list);
 	rq->timeout = 0;
 
 	rq->end_io = NULL;
 	rq->end_io_data = NULL;
-	rq->next_rq = NULL;
 
-#ifdef CONFIG_BLK_CGROUP
-	rq->rl = NULL;
-#endif
-
-	data->ctx->rq_dispatched[op_is_sync(op)]++;
+	data->ctx->rq_dispatched[op_is_sync(data->cmd_flags)]++;
 	refcount_set(&rq->ref, 1);
+
+	if (!op_is_flush(data->cmd_flags)) {
+		struct elevator_queue *e = data->q->elevator;
+
+		rq->elv.icq = NULL;
+		if (e && e->type->ops.prepare_request) {
+			if (e->type->icq_cache)
+				blk_mq_sched_assign_ioc(rq);
+
+			e->type->ops.prepare_request(rq);
+			rq->rq_flags |= RQF_ELVPRIV;
+		}
+	}
+
+	data->hctx->queued++;
+	trace_android_vh_blk_rq_ctx_init(rq, tags, data, alloc_time_ns);
 	return rq;
 }
 
-static struct request *blk_mq_get_request(struct request_queue *q,
-					  struct bio *bio, unsigned int op,
-					  struct blk_mq_alloc_data *data)
+static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data)
 {
+	struct request_queue *q = data->q;
 	struct elevator_queue *e = q->elevator;
-	struct request *rq;
+	u64 alloc_time_ns = 0;
 	unsigned int tag;
-	bool put_ctx_on_error = false;
 
-	blk_queue_enter_live(q);
-	data->q = q;
-	if (likely(!data->ctx)) {
-		data->ctx = blk_mq_get_ctx(q);
-		put_ctx_on_error = true;
-	}
-	if (likely(!data->hctx))
-		data->hctx = blk_mq_map_queue(q, data->ctx->cpu);
-	if (op & REQ_NOWAIT)
+	/* alloc_time includes depth and tag waits */
+	if (blk_queue_rq_alloc_time(q))
+		alloc_time_ns = ktime_get_ns();
+
+	if (data->cmd_flags & REQ_NOWAIT)
 		data->flags |= BLK_MQ_REQ_NOWAIT;
 
 	if (e) {
-		data->flags |= BLK_MQ_REQ_INTERNAL;
-
 		/*
 		 * Flush requests are special and go directly to the
 		 * dispatch list. Don't include reserved tags in the
 		 * limiting, as it isn't useful.
 		 */
-		if (!op_is_flush(op) && e->type->ops.mq.limit_depth &&
+		if (!op_is_flush(data->cmd_flags) &&
+		    e->type->ops.limit_depth &&
 		    !(data->flags & BLK_MQ_REQ_RESERVED))
-			e->type->ops.mq.limit_depth(op, data);
-	} else {
+			e->type->ops.limit_depth(data->cmd_flags, data);
+	}
+
+retry:
+	data->ctx = blk_mq_get_ctx(q);
+	data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx);
+	if (!e)
 		blk_mq_tag_busy(data->hctx);
-	}
 
+	/*
+	 * Waiting allocations only fail because of an inactive hctx. In that
+	 * case just retry the hctx assignment and tag allocation as CPU hotplug
+	 * should have migrated us to an online CPU by now.
+	 */
 	tag = blk_mq_get_tag(data);
-	if (tag == BLK_MQ_TAG_FAIL) {
-		if (put_ctx_on_error) {
-			blk_mq_put_ctx(data->ctx);
-			data->ctx = NULL;
-		}
-		blk_queue_exit(q);
-		return NULL;
-	}
+	if (tag == BLK_MQ_NO_TAG) {
+		if (data->flags & BLK_MQ_REQ_NOWAIT)
+			return NULL;
 
-	rq = blk_mq_rq_ctx_init(data, tag, op);
-	if (!op_is_flush(op)) {
-		rq->elv.icq = NULL;
-		if (e && e->type->ops.mq.prepare_request) {
-			if (e->type->icq_cache && rq_ioc(bio))
-				blk_mq_sched_assign_ioc(rq, bio);
-
-			e->type->ops.mq.prepare_request(rq, bio);
-			rq->rq_flags |= RQF_ELVPRIV;
-		}
+		/*
+		 * Give up the CPU and sleep for a random short time to ensure
+		 * that thread using a realtime scheduling class are migrated
+		 * off the CPU, and thus off the hctx that is going away.
+		 */
+		msleep(3);
+		goto retry;
 	}
-	data->hctx->queued++;
-	return rq;
+	return blk_mq_rq_ctx_init(data, tag, alloc_time_ns);
 }
 
 struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
 		blk_mq_req_flags_t flags)
 {
-	struct blk_mq_alloc_data alloc_data = { .flags = flags };
+	struct blk_mq_alloc_data data = {
+		.q		= q,
+		.flags		= flags,
+		.cmd_flags	= op,
+	};
 	struct request *rq;
 	int ret;
 
@@ -407,28 +415,35 @@
407415 if (ret)
408416 return ERR_PTR(ret);
409417
410
- rq = blk_mq_get_request(q, NULL, op, &alloc_data);
411
- blk_queue_exit(q);
412
-
418
+ rq = __blk_mq_alloc_request(&data);
413419 if (!rq)
414
- return ERR_PTR(-EWOULDBLOCK);
415
-
416
- blk_mq_put_ctx(alloc_data.ctx);
417
-
420
+ goto out_queue_exit;
418421 rq->__data_len = 0;
419422 rq->__sector = (sector_t) -1;
420423 rq->bio = rq->biotail = NULL;
421424 return rq;
425
+out_queue_exit:
426
+ blk_queue_exit(q);
427
+ return ERR_PTR(-EWOULDBLOCK);
422428 }
423429 EXPORT_SYMBOL(blk_mq_alloc_request);
424430
425431 struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
426432 unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx)
427433 {
428
- struct blk_mq_alloc_data alloc_data = { .flags = flags };
429
- struct request *rq;
434
+ struct blk_mq_alloc_data data = {
435
+ .q = q,
436
+ .flags = flags,
437
+ .cmd_flags = op,
438
+ };
439
+ u64 alloc_time_ns = 0;
430440 unsigned int cpu;
441
+ unsigned int tag;
431442 int ret;
443
+
444
+ /* alloc_time includes depth and tag waits */
445
+ if (blk_queue_rq_alloc_time(q))
446
+ alloc_time_ns = ktime_get_ns();
432447
433448 /*
434449 * If the tag allocator sleeps we could get an allocation for a
@@ -436,7 +451,8 @@
436451 * allocator for this for the rare use case of a command tied to
437452 * a specific queue.
438453 */
439
- if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)))
454
+ if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)) ||
455
+ WARN_ON_ONCE(!(flags & BLK_MQ_REQ_RESERVED)))
440456 return ERR_PTR(-EINVAL);
441457
442458 if (hctx_idx >= q->nr_hw_queues)
@@ -450,21 +466,27 @@
450466 * Check if the hardware context is actually mapped to anything.
451467 * If not tell the caller that it should skip this queue.
452468 */
453
- alloc_data.hctx = q->queue_hw_ctx[hctx_idx];
454
- if (!blk_mq_hw_queue_mapped(alloc_data.hctx)) {
455
- blk_queue_exit(q);
456
- return ERR_PTR(-EXDEV);
457
- }
458
- cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask);
459
- alloc_data.ctx = __blk_mq_get_ctx(q, cpu);
469
+ ret = -EXDEV;
470
+ data.hctx = q->queue_hw_ctx[hctx_idx];
471
+ if (!blk_mq_hw_queue_mapped(data.hctx))
472
+ goto out_queue_exit;
473
+ cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask);
474
+ if (cpu >= nr_cpu_ids)
475
+ goto out_queue_exit;
476
+ data.ctx = __blk_mq_get_ctx(q, cpu);
460477
461
- rq = blk_mq_get_request(q, NULL, op, &alloc_data);
478
+ if (!q->elevator)
479
+ blk_mq_tag_busy(data.hctx);
480
+
481
+ ret = -EWOULDBLOCK;
482
+ tag = blk_mq_get_tag(&data);
483
+ if (tag == BLK_MQ_NO_TAG)
484
+ goto out_queue_exit;
485
+ return blk_mq_rq_ctx_init(&data, tag, alloc_time_ns);
486
+
487
+out_queue_exit:
462488 blk_queue_exit(q);
463
-
464
- if (!rq)
465
- return ERR_PTR(-EWOULDBLOCK);
466
-
467
- return rq;
489
+ return ERR_PTR(ret);
468490 }
469491 EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
470492
@@ -472,13 +494,16 @@
472494 {
473495 struct request_queue *q = rq->q;
474496 struct blk_mq_ctx *ctx = rq->mq_ctx;
475
- struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
497
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
476498 const int sched_tag = rq->internal_tag;
477499
478
- if (rq->tag != -1)
479
- blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
480
- if (sched_tag != -1)
481
- blk_mq_put_tag(hctx, hctx->sched_tags, ctx, sched_tag);
500
+ blk_crypto_free_request(rq);
501
+ blk_pm_mark_last_busy(rq);
502
+ rq->mq_hctx = NULL;
503
+ if (rq->tag != BLK_MQ_NO_TAG)
504
+ blk_mq_put_tag(hctx->tags, ctx, rq->tag);
505
+ if (sched_tag != BLK_MQ_NO_TAG)
506
+ blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag);
482507 blk_mq_sched_restart(hctx);
483508 blk_queue_exit(q);
484509 }
@@ -488,11 +513,11 @@
488513 struct request_queue *q = rq->q;
489514 struct elevator_queue *e = q->elevator;
490515 struct blk_mq_ctx *ctx = rq->mq_ctx;
491
- struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
516
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
492517
493518 if (rq->rq_flags & RQF_ELVPRIV) {
494
- if (e && e->type->ops.mq.finish_request)
495
- e->type->ops.mq.finish_request(rq);
519
+ if (e && e->type->ops.finish_request)
520
+ e->type->ops.finish_request(rq);
496521 if (rq->elv.icq) {
497522 put_io_context(rq->elv.icq->ioc);
498523 rq->elv.icq = NULL;
@@ -501,15 +526,12 @@
501526
502527 ctx->rq_completed[rq_is_sync(rq)]++;
503528 if (rq->rq_flags & RQF_MQ_INFLIGHT)
504
- atomic_dec(&hctx->nr_active);
529
+ __blk_mq_dec_active_requests(hctx);
505530
506531 if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
507532 laptop_io_completion(q->backing_dev_info);
508533
509534 rq_qos_done(q, rq);
510
-
511
- if (blk_rq_rl(rq))
512
- blk_put_rl(blk_rq_rl(rq));
513535
514536 WRITE_ONCE(rq->state, MQ_RQ_IDLE);
515537 if (refcount_dec_and_test(&rq->ref))
@@ -519,12 +541,17 @@
519541
520542 inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
521543 {
522
- u64 now = ktime_get_ns();
544
+ u64 now = 0;
545
+
546
+ if (blk_mq_need_time_stamp(rq))
547
+ now = ktime_get_ns();
523548
524549 if (rq->rq_flags & RQF_STATS) {
525550 blk_mq_poll_stats_start(rq->q);
526551 blk_stat_add(rq, now);
527552 }
553
+
554
+ blk_mq_sched_completed_request(rq, now);
528555
529556 blk_account_io_done(rq, now);
530557
@@ -532,8 +559,6 @@
532559 rq_qos_done(rq->q, rq);
533560 rq->end_io(rq, error);
534561 } else {
535
- if (unlikely(blk_bidi_rq(rq)))
536
- blk_mq_free_request(rq->next_rq);
537562 blk_mq_free_request(rq);
538563 }
539564 }
@@ -547,43 +572,139 @@
547572 }
548573 EXPORT_SYMBOL(blk_mq_end_request);
549574
575
+/*
576
+ * Softirq action handler - move entries to local list and loop over them
577
+ * while passing them to the queue registered handler.
578
+ */
579
+static __latent_entropy void blk_done_softirq(struct softirq_action *h)
580
+{
581
+ struct list_head *cpu_list, local_list;
582
+
583
+ local_irq_disable();
584
+ cpu_list = this_cpu_ptr(&blk_cpu_done);
585
+ list_replace_init(cpu_list, &local_list);
586
+ local_irq_enable();
587
+
588
+ while (!list_empty(&local_list)) {
589
+ struct request *rq;
590
+
591
+ rq = list_entry(local_list.next, struct request, ipi_list);
592
+ list_del_init(&rq->ipi_list);
593
+ rq->q->mq_ops->complete(rq);
594
+ }
595
+}
596
+
597
+static void blk_mq_trigger_softirq(struct request *rq)
598
+{
599
+ struct list_head *list;
600
+ unsigned long flags;
601
+
602
+ local_irq_save(flags);
603
+ list = this_cpu_ptr(&blk_cpu_done);
604
+ list_add_tail(&rq->ipi_list, list);
605
+
606
+ /*
607
+ * If the list only contains our just added request, signal a raise of
608
+ * the softirq. If there are already entries there, someone already
609
+ * raised the irq but it hasn't run yet.
610
+ */
611
+ if (list->next == &rq->ipi_list)
612
+ raise_softirq_irqoff(BLOCK_SOFTIRQ);
613
+ local_irq_restore(flags);
614
+}
615
+
616
+static int blk_softirq_cpu_dead(unsigned int cpu)
617
+{
618
+ /*
619
+ * If a CPU goes away, splice its entries to the current CPU
620
+ * and trigger a run of the softirq
621
+ */
622
+ local_irq_disable();
623
+ list_splice_init(&per_cpu(blk_cpu_done, cpu),
624
+ this_cpu_ptr(&blk_cpu_done));
625
+ raise_softirq_irqoff(BLOCK_SOFTIRQ);
626
+ local_irq_enable();
627
+
628
+ return 0;
629
+}
630
+
631
+
550632 static void __blk_mq_complete_request_remote(void *data)
551633 {
552634 struct request *rq = data;
553635
554
- rq->q->softirq_done_fn(rq);
636
+ /*
637
+ * For most of single queue controllers, there is only one irq vector
638
+ * for handling I/O completion, and the only irq's affinity is set
639
+ * to all possible CPUs. On most of ARCHs, this affinity means the irq
640
+ * is handled on one specific CPU.
641
+ *
642
+ * So complete I/O requests in softirq context in case of single queue
643
+ * devices to avoid degrading I/O performance due to irqsoff latency.
644
+ */
645
+ if (rq->q->nr_hw_queues == 1)
646
+ blk_mq_trigger_softirq(rq);
647
+ else
648
+ rq->q->mq_ops->complete(rq);
555649 }
556650
557
-static void __blk_mq_complete_request(struct request *rq)
651
+static inline bool blk_mq_complete_need_ipi(struct request *rq)
558652 {
559
- struct blk_mq_ctx *ctx = rq->mq_ctx;
560
- bool shared = false;
561
- int cpu;
653
+ int cpu = raw_smp_processor_id();
562654
563
- if (!blk_mq_mark_complete(rq))
564
- return;
565
- if (rq->internal_tag != -1)
566
- blk_mq_sched_completed_request(rq);
655
+ if (!IS_ENABLED(CONFIG_SMP) ||
656
+ !test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags))
657
+ return false;
567658
568
- if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) {
569
- rq->q->softirq_done_fn(rq);
570
- return;
571
- }
659
+ /* same CPU or cache domain? Complete locally */
660
+ if (cpu == rq->mq_ctx->cpu ||
661
+ (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) &&
662
+ cpus_share_cache(cpu, rq->mq_ctx->cpu)))
663
+ return false;
572664
573
- cpu = get_cpu();
574
- if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
575
- shared = cpus_share_cache(cpu, ctx->cpu);
665
+ /* don't try to IPI to an offline CPU */
666
+ return cpu_online(rq->mq_ctx->cpu);
667
+}
576668
577
- if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
669
+bool blk_mq_complete_request_remote(struct request *rq)
670
+{
671
+ WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
672
+
673
+ /*
674
+ * For a polled request, always complete locallly, it's pointless
675
+ * to redirect the completion.
676
+ */
677
+ if (rq->cmd_flags & REQ_HIPRI)
678
+ return false;
679
+
680
+ if (blk_mq_complete_need_ipi(rq)) {
578681 rq->csd.func = __blk_mq_complete_request_remote;
579682 rq->csd.info = rq;
580683 rq->csd.flags = 0;
581
- smp_call_function_single_async(ctx->cpu, &rq->csd);
684
+ smp_call_function_single_async(rq->mq_ctx->cpu, &rq->csd);
582685 } else {
583
- rq->q->softirq_done_fn(rq);
686
+ if (rq->q->nr_hw_queues > 1)
687
+ return false;
688
+ blk_mq_trigger_softirq(rq);
584689 }
585
- put_cpu();
690
+
691
+ return true;
586692 }
693
+EXPORT_SYMBOL_GPL(blk_mq_complete_request_remote);
694
+
695
+/**
696
+ * blk_mq_complete_request - end I/O on a request
697
+ * @rq: the request being processed
698
+ *
699
+ * Description:
700
+ * Complete a request by scheduling the ->complete_rq operation.
701
+ **/
702
+void blk_mq_complete_request(struct request *rq)
703
+{
704
+ if (!blk_mq_complete_request_remote(rq))
705
+ rq->q->mq_ops->complete(rq);
706
+}
707
+EXPORT_SYMBOL(blk_mq_complete_request);
587708
588709 static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
589710 __releases(hctx->srcu)
@@ -606,40 +727,22 @@
606727 }
607728
608729 /**
609
- * blk_mq_complete_request - end I/O on a request
610
- * @rq: the request being processed
730
+ * blk_mq_start_request - Start processing a request
731
+ * @rq: Pointer to request to be started
611732 *
612
- * Description:
613
- * Ends all I/O on a request. It does not handle partial completions.
614
- * The actual completion happens out-of-order, through a IPI handler.
615
- **/
616
-void blk_mq_complete_request(struct request *rq)
617
-{
618
- if (unlikely(blk_should_fake_timeout(rq->q)))
619
- return;
620
- __blk_mq_complete_request(rq);
621
-}
622
-EXPORT_SYMBOL(blk_mq_complete_request);
623
-
624
-int blk_mq_request_started(struct request *rq)
625
-{
626
- return blk_mq_rq_state(rq) != MQ_RQ_IDLE;
627
-}
628
-EXPORT_SYMBOL_GPL(blk_mq_request_started);
629
-
733
+ * Function used by device drivers to notify the block layer that a request
734
+ * is going to be processed now, so blk layer can do proper initializations
735
+ * such as starting the timeout timer.
736
+ */
630737 void blk_mq_start_request(struct request *rq)
631738 {
632739 struct request_queue *q = rq->q;
633
-
634
- blk_mq_sched_started_request(rq);
635740
636741 trace_block_rq_issue(q, rq);
637742
638743 if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
639744 rq->io_start_time_ns = ktime_get_ns();
640
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
641
- rq->throtl_size = blk_rq_sectors(rq);
642
-#endif
745
+ rq->stats_sectors = blk_rq_sectors(rq);
643746 rq->rq_flags |= RQF_STATS;
644747 rq_qos_issue(q, rq);
645748 }
@@ -649,14 +752,10 @@
649752 blk_add_timer(rq);
650753 WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT);
651754
652
- if (q->dma_drain_size && blk_rq_bytes(rq)) {
653
- /*
654
- * Make sure space for the drain appears. We know we can do
655
- * this because max_hw_segments has been adjusted to be one
656
- * fewer than the device can handle.
657
- */
658
- rq->nr_phys_segments++;
659
- }
755
+#ifdef CONFIG_BLK_DEV_INTEGRITY
756
+ if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE)
757
+ q->integrity.profile->prepare_fn(rq);
758
+#endif
660759 }
661760 EXPORT_SYMBOL(blk_mq_start_request);
662761
@@ -672,8 +771,6 @@
672771 if (blk_mq_request_started(rq)) {
673772 WRITE_ONCE(rq->state, MQ_RQ_IDLE);
674773 rq->rq_flags &= ~RQF_TIMED_OUT;
675
- if (q->dma_drain_size && blk_rq_bytes(rq))
676
- rq->nr_phys_segments--;
677774 }
678775 }
679776
@@ -684,7 +781,6 @@
684781 /* this request will be re-inserted to io scheduler queue */
685782 blk_mq_sched_requeue_request(rq);
686783
687
- BUG_ON(blk_queued_rq(rq));
688784 blk_mq_add_to_requeue_list(rq, true, kick_requeue_list);
689785 }
690786 EXPORT_SYMBOL(blk_mq_requeue_request);
@@ -712,7 +808,7 @@
712808 * merge.
713809 */
714810 if (rq->rq_flags & RQF_DONTPREP)
715
- blk_mq_request_bypass_insert(rq, false);
811
+ blk_mq_request_bypass_insert(rq, false, false);
716812 else
717813 blk_mq_sched_insert_request(rq, true, false, false);
718814 }
@@ -750,7 +846,6 @@
750846 if (kick_requeue_list)
751847 blk_mq_kick_requeue_list(q);
752848 }
753
-EXPORT_SYMBOL(blk_mq_add_to_requeue_list);
754849
755850 void blk_mq_kick_requeue_list(struct request_queue *q)
756851 {
@@ -777,6 +872,32 @@
777872 }
778873 EXPORT_SYMBOL(blk_mq_tag_to_rq);
779874
875
+static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq,
876
+ void *priv, bool reserved)
877
+{
878
+ /*
879
+ * If we find a request that isn't idle and the queue matches,
880
+ * we know the queue is busy. Return false to stop the iteration.
881
+ */
882
+ if (blk_mq_request_started(rq) && rq->q == hctx->queue) {
883
+ bool *busy = priv;
884
+
885
+ *busy = true;
886
+ return false;
887
+ }
888
+
889
+ return true;
890
+}
891
+
892
+bool blk_mq_queue_inflight(struct request_queue *q)
893
+{
894
+ bool busy = false;
895
+
896
+ blk_mq_queue_tag_busy_iter(q, blk_mq_rq_inflight, &busy);
897
+ return busy;
898
+}
899
+EXPORT_SYMBOL_GPL(blk_mq_queue_inflight);
900
+
780901 static void blk_mq_rq_timed_out(struct request *req, bool reserved)
781902 {
782903 req->rq_flags |= RQF_TIMED_OUT;
@@ -801,7 +922,7 @@
801922 if (rq->rq_flags & RQF_TIMED_OUT)
802923 return false;
803924
804
- deadline = blk_rq_deadline(rq);
925
+ deadline = READ_ONCE(rq->deadline);
805926 if (time_after_eq(jiffies, deadline))
806927 return true;
807928
@@ -812,43 +933,29 @@
812933 return false;
813934 }
814935
815
-static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
936
+void blk_mq_put_rq_ref(struct request *rq)
937
+{
938
+ if (is_flush_rq(rq))
939
+ rq->end_io(rq, 0);
940
+ else if (refcount_dec_and_test(&rq->ref))
941
+ __blk_mq_free_request(rq);
942
+}
943
+
944
+static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
816945 struct request *rq, void *priv, bool reserved)
817946 {
818947 unsigned long *next = priv;
819948
820949 /*
821
- * Just do a quick check if it is expired before locking the request in
822
- * so we're not unnecessarilly synchronizing across CPUs.
823
- */
824
- if (!blk_mq_req_expired(rq, next))
825
- return;
826
-
827
- /*
828
- * We have reason to believe the request may be expired. Take a
829
- * reference on the request to lock this request lifetime into its
830
- * currently allocated context to prevent it from being reallocated in
831
- * the event the completion by-passes this timeout handler.
832
- *
833
- * If the reference was already released, then the driver beat the
834
- * timeout handler to posting a natural completion.
835
- */
836
- if (!refcount_inc_not_zero(&rq->ref))
837
- return;
838
-
839
- /*
840
- * The request is now locked and cannot be reallocated underneath the
841
- * timeout handler's processing. Re-verify this exact request is truly
842
- * expired; if it is not expired, then the request was completed and
843
- * reallocated as a new request.
950
+ * blk_mq_queue_tag_busy_iter() has locked the request, so it cannot
951
+ * be reallocated underneath the timeout handler's processing, then
952
+ * the expire check is reliable. If the request is not expired, then
953
+ * it was completed and reallocated as a new request after returning
954
+ * from blk_mq_check_expired().
844955 */
845956 if (blk_mq_req_expired(rq, next))
846957 blk_mq_rq_timed_out(rq, reserved);
847
-
848
- if (is_flush_rq(rq, hctx))
849
- rq->end_io(rq, 0);
850
- else if (refcount_dec_and_test(&rq->ref))
851
- __blk_mq_free_request(rq);
958
+ return true;
852959 }
853960
854961 static void blk_mq_timeout_work(struct work_struct *work)
@@ -905,9 +1012,10 @@
9051012 struct flush_busy_ctx_data *flush_data = data;
9061013 struct blk_mq_hw_ctx *hctx = flush_data->hctx;
9071014 struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
1015
+ enum hctx_type type = hctx->type;
9081016
9091017 spin_lock(&ctx->lock);
910
- list_splice_tail_init(&ctx->rq_list, flush_data->list);
1018
+ list_splice_tail_init(&ctx->rq_lists[type], flush_data->list);
9111019 sbitmap_clear_bit(sb, bitnr);
9121020 spin_unlock(&ctx->lock);
9131021 return true;
@@ -939,12 +1047,13 @@
9391047 struct dispatch_rq_data *dispatch_data = data;
9401048 struct blk_mq_hw_ctx *hctx = dispatch_data->hctx;
9411049 struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
1050
+ enum hctx_type type = hctx->type;
9421051
9431052 spin_lock(&ctx->lock);
944
- if (!list_empty(&ctx->rq_list)) {
945
- dispatch_data->rq = list_entry_rq(ctx->rq_list.next);
1053
+ if (!list_empty(&ctx->rq_lists[type])) {
1054
+ dispatch_data->rq = list_entry_rq(ctx->rq_lists[type].next);
9461055 list_del_init(&dispatch_data->rq->queuelist);
947
- if (list_empty(&ctx->rq_list))
1056
+ if (list_empty(&ctx->rq_lists[type]))
9481057 sbitmap_clear_bit(sb, bitnr);
9491058 }
9501059 spin_unlock(&ctx->lock);
....@@ -955,7 +1064,7 @@
9551064 struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
9561065 struct blk_mq_ctx *start)
9571066 {
958
- unsigned off = start ? start->index_hw : 0;
1067
+ unsigned off = start ? start->index_hw[hctx->type] : 0;
9591068 struct dispatch_rq_data data = {
9601069 .hctx = hctx,
9611070 .rq = NULL,
....@@ -975,33 +1084,44 @@
9751084 return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
9761085 }
9771086
978
-bool blk_mq_get_driver_tag(struct request *rq)
1087
+static bool __blk_mq_get_driver_tag(struct request *rq)
9791088 {
980
- struct blk_mq_alloc_data data = {
981
- .q = rq->q,
982
- .hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu),
983
- .flags = BLK_MQ_REQ_NOWAIT,
984
- };
985
- bool shared;
1089
+ struct sbitmap_queue *bt = rq->mq_hctx->tags->bitmap_tags;
1090
+ unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
1091
+ int tag;
9861092
987
- if (rq->tag != -1)
988
- goto done;
1093
+ blk_mq_tag_busy(rq->mq_hctx);
9891094
990
- if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag))
991
- data.flags |= BLK_MQ_REQ_RESERVED;
992
-
993
- shared = blk_mq_tag_busy(data.hctx);
994
- rq->tag = blk_mq_get_tag(&data);
995
- if (rq->tag >= 0) {
996
- if (shared) {
997
- rq->rq_flags |= RQF_MQ_INFLIGHT;
998
- atomic_inc(&data.hctx->nr_active);
999
- }
1000
- data.hctx->tags->rqs[rq->tag] = rq;
1095
+ if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) {
1096
+ bt = rq->mq_hctx->tags->breserved_tags;
1097
+ tag_offset = 0;
1098
+ } else {
1099
+ if (!hctx_may_queue(rq->mq_hctx, bt))
1100
+ return false;
10011101 }
10021102
1003
-done:
1004
- return rq->tag != -1;
1103
+ tag = __sbitmap_queue_get(bt);
1104
+ if (tag == BLK_MQ_NO_TAG)
1105
+ return false;
1106
+
1107
+ rq->tag = tag + tag_offset;
1108
+ return true;
1109
+}
1110
+
1111
+static bool blk_mq_get_driver_tag(struct request *rq)
1112
+{
1113
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
1114
+
1115
+ if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_get_driver_tag(rq))
1116
+ return false;
1117
+
1118
+ if ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) &&
1119
+ !(rq->rq_flags & RQF_MQ_INFLIGHT)) {
1120
+ rq->rq_flags |= RQF_MQ_INFLIGHT;
1121
+ __blk_mq_inc_active_requests(hctx);
1122
+ }
1123
+ hctx->tags->rqs[rq->tag] = rq;
1124
+ return true;
10051125 }
10061126
10071127 static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
@@ -1012,7 +1132,13 @@
10121132 hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
10131133
10141134 spin_lock(&hctx->dispatch_wait_lock);
1015
- list_del_init(&wait->entry);
1135
+ if (!list_empty(&wait->entry)) {
1136
+ struct sbitmap_queue *sbq;
1137
+
1138
+ list_del_init(&wait->entry);
1139
+ sbq = hctx->tags->bitmap_tags;
1140
+ atomic_dec(&sbq->ws_active);
1141
+ }
10161142 spin_unlock(&hctx->dispatch_wait_lock);
10171143
10181144 blk_mq_run_hw_queue(hctx, true);
@@ -1028,13 +1154,13 @@
10281154 static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
10291155 struct request *rq)
10301156 {
1157
+ struct sbitmap_queue *sbq = hctx->tags->bitmap_tags;
10311158 struct wait_queue_head *wq;
10321159 wait_queue_entry_t *wait;
10331160 bool ret;
10341161
1035
- if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) {
1036
- if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
1037
- set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
1162
+ if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
1163
+ blk_mq_sched_mark_restart_hctx(hctx);
10381164
10391165 /*
10401166 * It's possible that a tag was freed in the window between the
@@ -1051,7 +1177,7 @@
10511177 if (!list_empty_careful(&wait->entry))
10521178 return false;
10531179
1054
- wq = &bt_wait_ptr(&hctx->tags->bitmap_tags, hctx)->wait;
1180
+ wq = &bt_wait_ptr(sbq, hctx)->wait;
10551181
10561182 spin_lock_irq(&wq->lock);
10571183 spin_lock(&hctx->dispatch_wait_lock);
@@ -1061,6 +1187,7 @@
10611187 return false;
10621188 }
10631189
1190
+ atomic_inc(&sbq->ws_active);
10641191 wait->flags &= ~WQ_FLAG_EXCLUSIVE;
10651192 __add_wait_queue(wq, wait);
10661193
@@ -1081,6 +1208,7 @@
10811208 * someone else gets the wakeup.
10821209 */
10831210 list_del_init(&wait->entry);
1211
+ atomic_dec(&sbq->ws_active);
10841212 spin_unlock(&hctx->dispatch_wait_lock);
10851213 spin_unlock_irq(&wq->lock);
10861214
@@ -1099,9 +1227,6 @@
10991227 static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
11001228 {
11011229 unsigned int ewma;
1102
-
1103
- if (hctx->queue->elevator)
1104
- return;
11051230
11061231 ewma = hctx->dispatch_busy;
11071232
@@ -1135,22 +1260,83 @@
11351260 __blk_mq_requeue_request(rq);
11361261 }
11371262
1263
+static void blk_mq_handle_zone_resource(struct request *rq,
1264
+ struct list_head *zone_list)
1265
+{
1266
+ /*
1267
+ * If we end up here it is because we cannot dispatch a request to a
1268
+ * specific zone due to LLD level zone-write locking or other zone
1269
+ * related resource not being available. In this case, set the request
1270
+ * aside in zone_list for retrying it later.
1271
+ */
1272
+ list_add(&rq->queuelist, zone_list);
1273
+ __blk_mq_requeue_request(rq);
1274
+}
1275
+
1276
+enum prep_dispatch {
1277
+ PREP_DISPATCH_OK,
1278
+ PREP_DISPATCH_NO_TAG,
1279
+ PREP_DISPATCH_NO_BUDGET,
1280
+};
1281
+
1282
+static enum prep_dispatch blk_mq_prep_dispatch_rq(struct request *rq,
1283
+ bool need_budget)
1284
+{
1285
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
1286
+
1287
+ if (need_budget && !blk_mq_get_dispatch_budget(rq->q)) {
1288
+ blk_mq_put_driver_tag(rq);
1289
+ return PREP_DISPATCH_NO_BUDGET;
1290
+ }
1291
+
1292
+ if (!blk_mq_get_driver_tag(rq)) {
1293
+ /*
1294
+ * The initial allocation attempt failed, so we need to
1295
+ * rerun the hardware queue when a tag is freed. The
1296
+ * waitqueue takes care of that. If the queue is run
1297
+ * before we add this entry back on the dispatch list,
1298
+ * we'll re-run it below.
1299
+ */
1300
+ if (!blk_mq_mark_tag_wait(hctx, rq)) {
1301
+ /*
1302
+ * All budgets not got from this function will be put
1303
+ * together during handling partial dispatch
1304
+ */
1305
+ if (need_budget)
1306
+ blk_mq_put_dispatch_budget(rq->q);
1307
+ return PREP_DISPATCH_NO_TAG;
1308
+ }
1309
+ }
1310
+
1311
+ return PREP_DISPATCH_OK;
1312
+}
1313
+
1314
+/* release all allocated budgets before calling to blk_mq_dispatch_rq_list */
1315
+static void blk_mq_release_budgets(struct request_queue *q,
1316
+ unsigned int nr_budgets)
1317
+{
1318
+ int i;
1319
+
1320
+ for (i = 0; i < nr_budgets; i++)
1321
+ blk_mq_put_dispatch_budget(q);
1322
+}
1323
+
11381324 /*
11391325 * Returns true if we did some work AND can potentially do more.
11401326 */
1141
-bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
1142
- bool got_budget)
1327
+bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
1328
+ unsigned int nr_budgets)
11431329 {
1144
- struct blk_mq_hw_ctx *hctx;
1330
+ enum prep_dispatch prep;
1331
+ struct request_queue *q = hctx->queue;
11451332 struct request *rq, *nxt;
1146
- bool no_tag = false;
11471333 int errors, queued;
11481334 blk_status_t ret = BLK_STS_OK;
1335
+ LIST_HEAD(zone_list);
1336
+ bool needs_resource = false;
11491337
11501338 if (list_empty(list))
11511339 return false;
1152
-
1153
- WARN_ON(!list_is_singular(list) && got_budget);
11541340
11551341 /*
11561342 * Now process all the entries, sending them to the driver.
@@ -1161,29 +1347,10 @@
11611347
11621348 rq = list_first_entry(list, struct request, queuelist);
11631349
1164
- hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu);
1165
- if (!got_budget && !blk_mq_get_dispatch_budget(hctx))
1350
+ WARN_ON_ONCE(hctx != rq->mq_hctx);
1351
+ prep = blk_mq_prep_dispatch_rq(rq, !nr_budgets);
1352
+ if (prep != PREP_DISPATCH_OK)
11661353 break;
1167
-
1168
- if (!blk_mq_get_driver_tag(rq)) {
1169
- /*
1170
- * The initial allocation attempt failed, so we need to
1171
- * rerun the hardware queue when a tag is freed. The
1172
- * waitqueue takes care of that. If the queue is run
1173
- * before we add this entry back on the dispatch list,
1174
- * we'll re-run it below.
1175
- */
1176
- if (!blk_mq_mark_tag_wait(hctx, rq)) {
1177
- blk_mq_put_dispatch_budget(hctx);
1178
- /*
1179
- * For non-shared tags, the RESTART check
1180
- * will suffice.
1181
- */
1182
- if (hctx->flags & BLK_MQ_F_TAG_SHARED)
1183
- no_tag = true;
1184
- break;
1185
- }
1186
- }
11871354
11881355 list_del_init(&rq->queuelist);
11891356
@@ -1200,32 +1367,63 @@
12001367 bd.last = !blk_mq_get_driver_tag(nxt);
12011368 }
12021369
1370
+ /*
1371
+ * once the request is queued to lld, no need to cover the
1372
+ * budget any more
1373
+ */
1374
+ if (nr_budgets)
1375
+ nr_budgets--;
12031376 ret = q->mq_ops->queue_rq(hctx, &bd);
1204
- if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {
1205
- blk_mq_handle_dev_resource(rq, list);
1377
+ switch (ret) {
1378
+ case BLK_STS_OK:
1379
+ queued++;
12061380 break;
1207
- }
1208
-
1209
- if (unlikely(ret != BLK_STS_OK)) {
1381
+ case BLK_STS_RESOURCE:
1382
+ needs_resource = true;
1383
+ fallthrough;
1384
+ case BLK_STS_DEV_RESOURCE:
1385
+ blk_mq_handle_dev_resource(rq, list);
1386
+ goto out;
1387
+ case BLK_STS_ZONE_RESOURCE:
1388
+ /*
1389
+ * Move the request to zone_list and keep going through
1390
+ * the dispatch list to find more requests the drive can
1391
+ * accept.
1392
+ */
1393
+ blk_mq_handle_zone_resource(rq, &zone_list);
1394
+ needs_resource = true;
1395
+ break;
1396
+ default:
12101397 errors++;
12111398 blk_mq_end_request(rq, BLK_STS_IOERR);
1212
- continue;
12131399 }
1214
-
1215
- queued++;
12161400 } while (!list_empty(list));
1401
+out:
1402
+ if (!list_empty(&zone_list))
1403
+ list_splice_tail_init(&zone_list, list);
12171404
12181405 hctx->dispatched[queued_to_index(queued)]++;
12191406
1407
+ /* If we didn't flush the entire list, we could have told the driver
1408
+ * there was more coming, but that turned out to be a lie.
1409
+ */
1410
+ if ((!list_empty(list) || errors || needs_resource ||
1411
+ ret == BLK_STS_DEV_RESOURCE) && q->mq_ops->commit_rqs && queued)
1412
+ q->mq_ops->commit_rqs(hctx);
12201413 /*
12211414 * Any items that need requeuing? Stuff them into hctx->dispatch,
12221415 * that is where we will continue on next queue run.
12231416 */
12241417 if (!list_empty(list)) {
12251418 bool needs_restart;
1419
+ /* For non-shared tags, the RESTART check will suffice */
1420
+ bool no_tag = prep == PREP_DISPATCH_NO_TAG &&
1421
+ (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED);
1422
+
1423
+ blk_mq_release_budgets(q, nr_budgets);
12261424
12271425 spin_lock(&hctx->lock);
1228
- list_splice_init(list, &hctx->dispatch);
1426
+ list_splice_tail_init(list, &hctx->dispatch);
12291427 spin_unlock(&hctx->lock);
12301428
12311429 /*
@@ -1259,13 +1457,17 @@
12591457 *
12601458 * If driver returns BLK_STS_RESOURCE and SCHED_RESTART
12611459 * bit is set, run queue after a delay to avoid IO stalls
1262
- * that could otherwise occur if the queue is idle.
1460
+ * that could otherwise occur if the queue is idle. We'll do
1461
+ * similar if we couldn't get budget or couldn't lock a zone
1462
+ * and SCHED_RESTART is set.
12631463 */
12641464 needs_restart = blk_mq_sched_needs_restart(hctx);
1465
+ if (prep == PREP_DISPATCH_NO_BUDGET)
1466
+ needs_resource = true;
12651467 if (!needs_restart ||
12661468 (no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
12671469 blk_mq_run_hw_queue(hctx, true);
1268
- else if (needs_restart && (ret == BLK_STS_RESOURCE))
1470
+ else if (needs_restart && needs_resource)
12691471 blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);
12701472
12711473 blk_mq_update_dispatch_busy(hctx, true);
....@@ -1273,16 +1475,15 @@
12731475 } else
12741476 blk_mq_update_dispatch_busy(hctx, false);
12751477
1276
- /*
1277
- * If the host/device is unable to accept more work, inform the
1278
- * caller of that.
1279
- */
1280
- if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
1281
- return false;
1282
-
12831478 return (queued + errors) != 0;
12841479 }
12851480
1481
+/**
1482
+ * __blk_mq_run_hw_queue - Run a hardware queue.
1483
+ * @hctx: Pointer to the hardware queue to run.
1484
+ *
1485
+ * Send pending requests to the hardware.
1486
+ */
12861487 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
12871488 {
12881489 int srcu_idx;
@@ -1380,6 +1581,15 @@
13801581 return next_cpu;
13811582 }
13821583
1584
+/**
1585
+ * __blk_mq_delay_run_hw_queue - Run (or schedule to run) a hardware queue.
1586
+ * @hctx: Pointer to the hardware queue to run.
1587
+ * @async: If we want to run the queue asynchronously.
1588
+ * @msecs: Microseconds of delay to wait before running the queue.
1589
+ *
1590
+ * If !@async, try to run the queue now. Else, run the queue asynchronously and
1591
+ * with a delay of @msecs.
1592
+ */
13831593 static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
13841594 unsigned long msecs)
13851595 {
@@ -1401,13 +1611,29 @@
14011611 msecs_to_jiffies(msecs));
14021612 }
14031613
1614
+/**
1615
+ * blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously.
1616
+ * @hctx: Pointer to the hardware queue to run.
1617
+ * @msecs: Microseconds of delay to wait before running the queue.
1618
+ *
1619
+ * Run a hardware queue asynchronously with a delay of @msecs.
1620
+ */
14041621 void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
14051622 {
14061623 __blk_mq_delay_run_hw_queue(hctx, true, msecs);
14071624 }
14081625 EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
14091626
1410
-bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
1627
+/**
1628
+ * blk_mq_run_hw_queue - Start to run a hardware queue.
1629
+ * @hctx: Pointer to the hardware queue to run.
1630
+ * @async: If we want to run the queue asynchronously.
1631
+ *
1632
+ * Check if the request queue is not in a quiesced state and if there are
1633
+ * pending requests to be sent. If this is true, run the queue to send requests
1634
+ * to hardware.
1635
+ */
1636
+void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
14111637 {
14121638 int srcu_idx;
14131639 bool need_run;
@@ -1425,28 +1651,101 @@
14251651 blk_mq_hctx_has_pending(hctx);
14261652 hctx_unlock(hctx, srcu_idx);
14271653
1428
- if (need_run) {
1654
+ if (need_run)
14291655 __blk_mq_delay_run_hw_queue(hctx, async, 0);
1430
- return true;
1431
- }
1432
-
1433
- return false;
14341656 }
14351657 EXPORT_SYMBOL(blk_mq_run_hw_queue);
14361658
1659
+/*
1660
+ * Is the request queue handled by an IO scheduler that does not respect
1661
+ * hardware queues when dispatching?
1662
+ */
1663
+static bool blk_mq_has_sqsched(struct request_queue *q)
1664
+{
1665
+ struct elevator_queue *e = q->elevator;
1666
+
1667
+ if (e && e->type->ops.dispatch_request &&
1668
+ !(e->type->elevator_features & ELEVATOR_F_MQ_AWARE))
1669
+ return true;
1670
+ return false;
1671
+}
1672
+
1673
+/*
1674
+ * Return prefered queue to dispatch from (if any) for non-mq aware IO
1675
+ * scheduler.
1676
+ */
1677
+static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q)
1678
+{
1679
+ struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
1680
+ /*
1681
+ * If the IO scheduler does not respect hardware queues when
1682
+ * dispatching, we just don't bother with multiple HW queues and
1683
+ * dispatch from hctx for the current CPU since running multiple queues
1684
+ * just causes lock contention inside the scheduler and pointless cache
1685
+ * bouncing.
1686
+ */
1687
+ struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, 0, ctx);
1688
+
1689
+ if (!blk_mq_hctx_stopped(hctx))
1690
+ return hctx;
1691
+ return NULL;
1692
+}
1693
+
1694
+/**
1695
+ * blk_mq_run_hw_queues - Run all hardware queues in a request queue.
1696
+ * @q: Pointer to the request queue to run.
1697
+ * @async: If we want to run the queue asynchronously.
1698
+ */
14371699 void blk_mq_run_hw_queues(struct request_queue *q, bool async)
14381700 {
1439
- struct blk_mq_hw_ctx *hctx;
1701
+ struct blk_mq_hw_ctx *hctx, *sq_hctx;
14401702 int i;
14411703
1704
+ sq_hctx = NULL;
1705
+ if (blk_mq_has_sqsched(q))
1706
+ sq_hctx = blk_mq_get_sq_hctx(q);
14421707 queue_for_each_hw_ctx(q, hctx, i) {
14431708 if (blk_mq_hctx_stopped(hctx))
14441709 continue;
1445
-
1446
- blk_mq_run_hw_queue(hctx, async);
1710
+ /*
1711
+ * Dispatch from this hctx either if there's no hctx preferred
1712
+ * by IO scheduler or if it has requests that bypass the
1713
+ * scheduler.
1714
+ */
1715
+ if (!sq_hctx || sq_hctx == hctx ||
1716
+ !list_empty_careful(&hctx->dispatch))
1717
+ blk_mq_run_hw_queue(hctx, async);
14471718 }
14481719 }
14491720 EXPORT_SYMBOL(blk_mq_run_hw_queues);
1721
+
1722
+/**
1723
+ * blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously.
1724
+ * @q: Pointer to the request queue to run.
1725
+ * @msecs: Microseconds of delay to wait before running the queues.
1726
+ */
1727
+void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
1728
+{
1729
+ struct blk_mq_hw_ctx *hctx, *sq_hctx;
1730
+ int i;
1731
+
1732
+ sq_hctx = NULL;
1733
+ if (blk_mq_has_sqsched(q))
1734
+ sq_hctx = blk_mq_get_sq_hctx(q);
1735
+ queue_for_each_hw_ctx(q, hctx, i) {
1736
+ if (blk_mq_hctx_stopped(hctx))
1737
+ continue;
1738
+ /*
1739
+ * Dispatch from this hctx either if there's no hctx preferred
1740
+ * by IO scheduler or if it has requests that bypass the
1741
+ * scheduler.
1742
+ */
1743
+ if (!sq_hctx || sq_hctx == hctx ||
1744
+ !list_empty_careful(&hctx->dispatch))
1745
+ blk_mq_delay_run_hw_queue(hctx, msecs);
1746
+ }
1747
+}
1748
+EXPORT_SYMBOL(blk_mq_delay_run_hw_queues);
14501749
14511750 /**
14521751 * blk_mq_queue_stopped() - check whether one or more hctxs have been stopped
@@ -1551,7 +1850,7 @@
15511850 /*
15521851 * If we are stopped, don't run the queue.
15531852 */
1554
- if (test_bit(BLK_MQ_S_STOPPED, &hctx->state))
1853
+ if (blk_mq_hctx_stopped(hctx))
15551854 return;
15561855
15571856 __blk_mq_run_hw_queue(hctx);
@@ -1562,15 +1861,16 @@
15621861 bool at_head)
15631862 {
15641863 struct blk_mq_ctx *ctx = rq->mq_ctx;
1864
+ enum hctx_type type = hctx->type;
15651865
15661866 lockdep_assert_held(&ctx->lock);
15671867
15681868 trace_block_rq_insert(hctx->queue, rq);
15691869
15701870 if (at_head)
1571
- list_add(&rq->queuelist, &ctx->rq_list);
1871
+ list_add(&rq->queuelist, &ctx->rq_lists[type]);
15721872 else
1573
- list_add_tail(&rq->queuelist, &ctx->rq_list);
1873
+ list_add_tail(&rq->queuelist, &ctx->rq_lists[type]);
15741874 }
15751875
15761876 void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
@@ -1584,17 +1884,25 @@
15841884 blk_mq_hctx_mark_pending(hctx, ctx);
15851885 }
15861886
1587
-/*
1887
+/**
1888
+ * blk_mq_request_bypass_insert - Insert a request at dispatch list.
1889
+ * @rq: Pointer to request to be inserted.
1890
+ * @at_head: true if the request should be inserted at the head of the list.
1891
+ * @run_queue: If we should run the hardware queue after inserting the request.
1892
+ *
15881893 * Should only be used carefully, when the caller knows we want to
15891894 * bypass a potential IO scheduler on the target device.
15901895 */
1591
-void blk_mq_request_bypass_insert(struct request *rq, bool run_queue)
1896
+void blk_mq_request_bypass_insert(struct request *rq, bool at_head,
1897
+ bool run_queue)
15921898 {
1593
- struct blk_mq_ctx *ctx = rq->mq_ctx;
1594
- struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
1899
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
15951900
15961901 spin_lock(&hctx->lock);
1597
- list_add_tail(&rq->queuelist, &hctx->dispatch);
1902
+ if (at_head)
1903
+ list_add(&rq->queuelist, &hctx->dispatch);
1904
+ else
1905
+ list_add_tail(&rq->queuelist, &hctx->dispatch);
15981906 spin_unlock(&hctx->lock);
15991907
16001908 if (run_queue)
@@ -1606,6 +1914,7 @@
16061914
16071915 {
16081916 struct request *rq;
1917
+ enum hctx_type type = hctx->type;
16091918
16101919 /*
16111920 * preemption doesn't flush plug list, so it's possible ctx->cpu is
....@@ -1617,95 +1926,87 @@
16171926 }
16181927
16191928 spin_lock(&ctx->lock);
1620
- list_splice_tail_init(list, &ctx->rq_list);
1929
+ list_splice_tail_init(list, &ctx->rq_lists[type]);
16211930 blk_mq_hctx_mark_pending(hctx, ctx);
16221931 spin_unlock(&ctx->lock);
16231932 }
16241933
1625
-static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
1934
+static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
16261935 {
16271936 struct request *rqa = container_of(a, struct request, queuelist);
16281937 struct request *rqb = container_of(b, struct request, queuelist);
16291938
1630
- return !(rqa->mq_ctx < rqb->mq_ctx ||
1631
- (rqa->mq_ctx == rqb->mq_ctx &&
1632
- blk_rq_pos(rqa) < blk_rq_pos(rqb)));
1939
+ if (rqa->mq_ctx != rqb->mq_ctx)
1940
+ return rqa->mq_ctx > rqb->mq_ctx;
1941
+ if (rqa->mq_hctx != rqb->mq_hctx)
1942
+ return rqa->mq_hctx > rqb->mq_hctx;
1943
+
1944
+ return blk_rq_pos(rqa) > blk_rq_pos(rqb);
16331945 }
16341946
16351947 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
16361948 {
1637
- struct blk_mq_ctx *this_ctx;
1638
- struct request_queue *this_q;
1639
- struct request *rq;
16401949 LIST_HEAD(list);
1641
- LIST_HEAD(ctx_list);
1642
- unsigned int depth;
16431950
1951
+ if (list_empty(&plug->mq_list))
1952
+ return;
16441953 list_splice_init(&plug->mq_list, &list);
16451954
1646
- list_sort(NULL, &list, plug_ctx_cmp);
1955
+ if (plug->rq_count > 2 && plug->multiple_queues)
1956
+ list_sort(NULL, &list, plug_rq_cmp);
16471957
1648
- this_q = NULL;
1649
- this_ctx = NULL;
1650
- depth = 0;
1958
+ plug->rq_count = 0;
16511959
1652
- while (!list_empty(&list)) {
1653
- rq = list_entry_rq(list.next);
1654
- list_del_init(&rq->queuelist);
1655
- BUG_ON(!rq->q);
1656
- if (rq->mq_ctx != this_ctx) {
1657
- if (this_ctx) {
1658
- trace_block_unplug(this_q, depth, !from_schedule);
1659
- blk_mq_sched_insert_requests(this_q, this_ctx,
1660
- &ctx_list,
1661
- from_schedule);
1662
- }
1960
+ do {
1961
+ struct list_head rq_list;
1962
+ struct request *rq, *head_rq = list_entry_rq(list.next);
1963
+ struct list_head *pos = &head_rq->queuelist; /* skip first */
1964
+ struct blk_mq_hw_ctx *this_hctx = head_rq->mq_hctx;
1965
+ struct blk_mq_ctx *this_ctx = head_rq->mq_ctx;
1966
+ unsigned int depth = 1;
16631967
1664
- this_ctx = rq->mq_ctx;
1665
- this_q = rq->q;
1666
- depth = 0;
1968
+ list_for_each_continue(pos, &list) {
1969
+ rq = list_entry_rq(pos);
1970
+ BUG_ON(!rq->q);
1971
+ if (rq->mq_hctx != this_hctx || rq->mq_ctx != this_ctx)
1972
+ break;
1973
+ depth++;
16671974 }
16681975
1669
- depth++;
1670
- list_add_tail(&rq->queuelist, &ctx_list);
1671
- }
1672
-
1673
- /*
1674
- * If 'this_ctx' is set, we know we have entries to complete
1675
- * on 'ctx_list'. Do those.
1676
- */
1677
- if (this_ctx) {
1678
- trace_block_unplug(this_q, depth, !from_schedule);
1679
- blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list,
1976
+ list_cut_before(&rq_list, &list, pos);
1977
+ trace_block_unplug(head_rq->q, depth, !from_schedule);
1978
+ blk_mq_sched_insert_requests(this_hctx, this_ctx, &rq_list,
16801979 from_schedule);
1681
- }
1980
+ } while(!list_empty(&list));
16821981 }
16831982
1684
-static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
1983
+static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
1984
+ unsigned int nr_segs)
16851985 {
1686
- blk_init_request_from_bio(rq, bio);
1986
+ int err;
16871987
1688
- blk_rq_set_rl(rq, blk_get_rl(rq->q, bio));
1988
+ if (bio->bi_opf & REQ_RAHEAD)
1989
+ rq->cmd_flags |= REQ_FAILFAST_MASK;
16891990
1690
- blk_account_io_start(rq, true);
1691
-}
1991
+ rq->__sector = bio->bi_iter.bi_sector;
1992
+ rq->write_hint = bio->bi_write_hint;
1993
+ blk_rq_bio_prep(rq, bio, nr_segs);
16921994
1693
-static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq)
1694
-{
1695
- if (rq->tag != -1)
1696
- return blk_tag_to_qc_t(rq->tag, hctx->queue_num, false);
1995
+ /* This can't fail, since GFP_NOIO includes __GFP_DIRECT_RECLAIM. */
1996
+ err = blk_crypto_rq_bio_prep(rq, bio, GFP_NOIO);
1997
+ WARN_ON_ONCE(err);
16971998
1698
- return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true);
1999
+ blk_account_io_start(rq);
16992000 }
17002001
17012002 static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
17022003 struct request *rq,
1703
- blk_qc_t *cookie)
2004
+ blk_qc_t *cookie, bool last)
17042005 {
17052006 struct request_queue *q = rq->q;
17062007 struct blk_mq_queue_data bd = {
17072008 .rq = rq,
1708
- .last = true,
2009
+ .last = last,
17092010 };
17102011 blk_qc_t new_cookie;
17112012 blk_status_t ret;
....@@ -1740,7 +2041,7 @@
17402041 static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
17412042 struct request *rq,
17422043 blk_qc_t *cookie,
1743
- bool bypass_insert)
2044
+ bool bypass_insert, bool last)
17442045 {
17452046 struct request_queue *q = rq->q;
17462047 bool run_queue = true;
@@ -1761,23 +2062,35 @@
17612062 if (q->elevator && !bypass_insert)
17622063 goto insert;
17632064
1764
- if (!blk_mq_get_dispatch_budget(hctx))
2065
+ if (!blk_mq_get_dispatch_budget(q))
17652066 goto insert;
17662067
17672068 if (!blk_mq_get_driver_tag(rq)) {
1768
- blk_mq_put_dispatch_budget(hctx);
2069
+ blk_mq_put_dispatch_budget(q);
17692070 goto insert;
17702071 }
17712072
1772
- return __blk_mq_issue_directly(hctx, rq, cookie);
2073
+ return __blk_mq_issue_directly(hctx, rq, cookie, last);
17732074 insert:
17742075 if (bypass_insert)
17752076 return BLK_STS_RESOURCE;
17762077
1777
- blk_mq_request_bypass_insert(rq, run_queue);
2078
+ blk_mq_sched_insert_request(rq, false, run_queue, false);
2079
+
17782080 return BLK_STS_OK;
17792081 }
17802082
2083
+/**
2084
+ * blk_mq_try_issue_directly - Try to send a request directly to device driver.
2085
+ * @hctx: Pointer of the associated hardware queue.
2086
+ * @rq: Pointer to request to be sent.
2087
+ * @cookie: Request queue cookie.
2088
+ *
2089
+ * If the device has enough resources to accept a new request now, send the
2090
+ * request directly to the device driver. Else, insert it into the hctx->dispatch
2091
+ * queue, so we can try to send it again in the future. Requests inserted into
2092
+ * this queue have higher priority.
2093
+ */
17812094 static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
17822095 struct request *rq, blk_qc_t *cookie)
17832096 {
....@@ -1788,25 +2101,24 @@
17882101
17892102 hctx_lock(hctx, &srcu_idx);
17902103
1791
- ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false);
2104
+ ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false, true);
17922105 if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
1793
- blk_mq_request_bypass_insert(rq, true);
2106
+ blk_mq_request_bypass_insert(rq, false, true);
17942107 else if (ret != BLK_STS_OK)
17952108 blk_mq_end_request(rq, ret);
17962109
17972110 hctx_unlock(hctx, srcu_idx);
17982111 }
17992112
1800
-blk_status_t blk_mq_request_issue_directly(struct request *rq)
2113
+blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
18012114 {
18022115 blk_status_t ret;
18032116 int srcu_idx;
18042117 blk_qc_t unused_cookie;
1805
- struct blk_mq_ctx *ctx = rq->mq_ctx;
1806
- struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
2118
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
18072119
18082120 hctx_lock(hctx, &srcu_idx);
1809
- ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true);
2121
+ ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true, last);
18102122 hctx_unlock(hctx, srcu_idx);
18112123
18122124 return ret;
....@@ -1815,104 +2127,169 @@
18152127 void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
18162128 struct list_head *list)
18172129 {
2130
+ int queued = 0;
2131
+ int errors = 0;
2132
+
18182133 while (!list_empty(list)) {
18192134 blk_status_t ret;
18202135 struct request *rq = list_first_entry(list, struct request,
18212136 queuelist);
18222137
18232138 list_del_init(&rq->queuelist);
1824
- ret = blk_mq_request_issue_directly(rq);
2139
+ ret = blk_mq_request_issue_directly(rq, list_empty(list));
18252140 if (ret != BLK_STS_OK) {
2141
+ errors++;
18262142 if (ret == BLK_STS_RESOURCE ||
18272143 ret == BLK_STS_DEV_RESOURCE) {
1828
- blk_mq_request_bypass_insert(rq,
2144
+ blk_mq_request_bypass_insert(rq, false,
18292145 list_empty(list));
18302146 break;
18312147 }
18322148 blk_mq_end_request(rq, ret);
1833
- }
2149
+ } else
2150
+ queued++;
2151
+ }
2152
+
2153
+ /*
2154
+ * If we didn't flush the entire list, we could have told
2155
+ * the driver there was more coming, but that turned out to
2156
+ * be a lie.
2157
+ */
2158
+ if ((!list_empty(list) || errors) &&
2159
+ hctx->queue->mq_ops->commit_rqs && queued)
2160
+ hctx->queue->mq_ops->commit_rqs(hctx);
2161
+}
2162
+
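/*
 * Editor's illustrative sketch, not part of this patch: the ->commit_rqs()
 * contract referenced in the comment above is usually implemented by ringing
 * the hardware doorbell once for a whole batch of requests that were queued
 * with bd->last == false. Every name below (example_queue, sq_tail,
 * sq_doorbell, example_commit_rqs) is a hypothetical placeholder, not a real
 * kernel symbol.
 */
struct example_queue {				/* hypothetical per-hctx driver state */
	u32		sq_tail;
	void __iomem	*sq_doorbell;
};

static void example_commit_rqs(struct blk_mq_hw_ctx *hctx)
{
	struct example_queue *eq = hctx->driver_data;

	/* Kick the device once for everything queued since the last doorbell. */
	writel(eq->sq_tail, eq->sq_doorbell);
}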
2163
+static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
2164
+{
2165
+ list_add_tail(&rq->queuelist, &plug->mq_list);
2166
+ plug->rq_count++;
2167
+ if (!plug->multiple_queues && !list_is_singular(&plug->mq_list)) {
2168
+ struct request *tmp;
2169
+
2170
+ tmp = list_first_entry(&plug->mq_list, struct request,
2171
+ queuelist);
2172
+ if (tmp->q != rq->q)
2173
+ plug->multiple_queues = true;
18342174 }
18352175 }
18362176
1837
-static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
2177
+/*
2178
+ * Allow 2x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple
2179
+ * queues. This is important for md arrays to benefit from merging
2180
+ * requests.
2181
+ */
2182
+static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug)
18382183 {
2184
+ if (plug->multiple_queues)
2185
+ return BLK_MAX_REQUEST_COUNT * 2;
2186
+ return BLK_MAX_REQUEST_COUNT;
2187
+}
2188
+
2189
+/**
2190
+ * blk_mq_submit_bio - Create and send a request to block device.
2191
+ * @bio: Bio pointer.
2192
+ *
2193
+ * Builds up a request structure from @q and @bio and sends it to the device. The
2194
+ * request may not be queued directly to hardware if:
2195
+ * * This request can be merged with another one
2196
+ * * We want to place request at plug queue for possible future merging
2197
+ * * There is an IO scheduler active at this queue
2198
+ *
2199
+ * It will not queue the request if there is an error with the bio, or at the
2200
+ * request creation.
2201
+ *
2202
+ * Returns: Request queue cookie.
2203
+ */
2204
+blk_qc_t blk_mq_submit_bio(struct bio *bio)
2205
+{
2206
+ struct request_queue *q = bio->bi_disk->queue;
18392207 const int is_sync = op_is_sync(bio->bi_opf);
18402208 const int is_flush_fua = op_is_flush(bio->bi_opf);
1841
- struct blk_mq_alloc_data data = { .flags = 0 };
2209
+ struct blk_mq_alloc_data data = {
2210
+ .q = q,
2211
+ };
18422212 struct request *rq;
1843
- unsigned int request_count = 0;
18442213 struct blk_plug *plug;
18452214 struct request *same_queue_rq = NULL;
2215
+ unsigned int nr_segs;
18462216 blk_qc_t cookie;
2217
+ blk_status_t ret;
18472218
18482219 blk_queue_bounce(q, &bio);
1849
-
1850
- blk_queue_split(q, &bio);
2220
+ __blk_queue_split(&bio, &nr_segs);
18512221
18522222 if (!bio_integrity_prep(bio))
1853
- return BLK_QC_T_NONE;
2223
+ goto queue_exit;
18542224
18552225 if (!is_flush_fua && !blk_queue_nomerges(q) &&
1856
- blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
1857
- return BLK_QC_T_NONE;
2226
+ blk_attempt_plug_merge(q, bio, nr_segs, &same_queue_rq))
2227
+ goto queue_exit;
18582228
1859
- if (blk_mq_sched_bio_merge(q, bio))
1860
- return BLK_QC_T_NONE;
2229
+ if (blk_mq_sched_bio_merge(q, bio, nr_segs))
2230
+ goto queue_exit;
18612231
1862
- rq_qos_throttle(q, bio, NULL);
2232
+ rq_qos_throttle(q, bio);
18632233
1864
- trace_block_getrq(q, bio, bio->bi_opf);
1865
-
1866
- rq = blk_mq_get_request(q, bio, bio->bi_opf, &data);
2234
+ data.cmd_flags = bio->bi_opf;
2235
+ rq = __blk_mq_alloc_request(&data);
18672236 if (unlikely(!rq)) {
18682237 rq_qos_cleanup(q, bio);
18692238 if (bio->bi_opf & REQ_NOWAIT)
18702239 bio_wouldblock_error(bio);
1871
- return BLK_QC_T_NONE;
2240
+ goto queue_exit;
18722241 }
2242
+
2243
+ trace_block_getrq(q, bio, bio->bi_opf);
18732244
18742245 rq_qos_track(q, rq, bio);
18752246
18762247 cookie = request_to_qc_t(data.hctx, rq);
18772248
1878
- plug = current->plug;
1879
- if (unlikely(is_flush_fua)) {
1880
- blk_mq_put_ctx(data.ctx);
1881
- blk_mq_bio_to_request(rq, bio);
2249
+ blk_mq_bio_to_request(rq, bio, nr_segs);
18822250
1883
- /* bypass scheduler for flush rq */
2251
+ ret = blk_crypto_rq_get_keyslot(rq);
2252
+ if (ret != BLK_STS_OK) {
2253
+ bio->bi_status = ret;
2254
+ bio_endio(bio);
2255
+ blk_mq_free_request(rq);
2256
+ return BLK_QC_T_NONE;
2257
+ }
2258
+
2259
+ plug = blk_mq_plug(q, bio);
2260
+ if (unlikely(is_flush_fua)) {
2261
+ /* Bypass scheduler for flush requests */
18842262 blk_insert_flush(rq);
18852263 blk_mq_run_hw_queue(data.hctx, true);
1886
- } else if (plug && q->nr_hw_queues == 1) {
1887
- struct request *last = NULL;
1888
-
1889
- blk_mq_put_ctx(data.ctx);
1890
- blk_mq_bio_to_request(rq, bio);
1891
-
2264
+ } else if (plug && (q->nr_hw_queues == 1 ||
2265
+ blk_mq_is_sbitmap_shared(rq->mq_hctx->flags) ||
2266
+ q->mq_ops->commit_rqs || !blk_queue_nonrot(q))) {
18922267 /*
1893
- * @request_count may become stale because of schedule
1894
- * out, so check the list again.
2268
+ * Use plugging if we have a ->commit_rqs() hook as well, as
2269
+ * we know the driver uses bd->last in a smart fashion.
2270
+ *
2271
+ * Use normal plugging if this disk is a slow HDD, as sequential
2272
+ * IO may benefit a lot from plug merging.
18952273 */
1896
- if (list_empty(&plug->mq_list))
1897
- request_count = 0;
1898
- else if (blk_queue_nomerges(q))
1899
- request_count = blk_plug_queued_count(q);
2274
+ unsigned int request_count = plug->rq_count;
2275
+ struct request *last = NULL;
19002276
19012277 if (!request_count)
19022278 trace_block_plug(q);
19032279 else
19042280 last = list_entry_rq(plug->mq_list.prev);
19052281
1906
- if (request_count >= BLK_MAX_REQUEST_COUNT || (last &&
2282
+ if (request_count >= blk_plug_max_rq_count(plug) || (last &&
19072283 blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
19082284 blk_flush_plug_list(plug, false);
19092285 trace_block_plug(q);
19102286 }
19112287
1912
- list_add_tail(&rq->queuelist, &plug->mq_list);
2288
+ blk_add_rq_to_plug(plug, rq);
2289
+ } else if (q->elevator) {
2290
+ /* Insert the request at the IO scheduler queue */
2291
+ blk_mq_sched_insert_request(rq, false, true, true);
19132292 } else if (plug && !blk_queue_nomerges(q)) {
1914
- blk_mq_bio_to_request(rq, bio);
1915
-
19162293 /*
19172294 * We do limited plugging. If the bio can be merged, do that.
19182295 * Otherwise the existing request in the plug list will be
....@@ -1922,30 +2299,74 @@
19222299 */
19232300 if (list_empty(&plug->mq_list))
19242301 same_queue_rq = NULL;
1925
- if (same_queue_rq)
2302
+ if (same_queue_rq) {
19262303 list_del_init(&same_queue_rq->queuelist);
1927
- list_add_tail(&rq->queuelist, &plug->mq_list);
1928
-
1929
- blk_mq_put_ctx(data.ctx);
2304
+ plug->rq_count--;
2305
+ }
2306
+ blk_add_rq_to_plug(plug, rq);
2307
+ trace_block_plug(q);
19302308
19312309 if (same_queue_rq) {
1932
- data.hctx = blk_mq_map_queue(q,
1933
- same_queue_rq->mq_ctx->cpu);
2310
+ data.hctx = same_queue_rq->mq_hctx;
2311
+ trace_block_unplug(q, 1, true);
19342312 blk_mq_try_issue_directly(data.hctx, same_queue_rq,
19352313 &cookie);
19362314 }
1937
- } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator &&
1938
- !data.hctx->dispatch_busy)) {
1939
- blk_mq_put_ctx(data.ctx);
1940
- blk_mq_bio_to_request(rq, bio);
2315
+ } else if ((q->nr_hw_queues > 1 && is_sync) ||
2316
+ !data.hctx->dispatch_busy) {
2317
+ /*
2318
+ * There is no scheduler and we can try to send directly
2319
+ * to the hardware.
2320
+ */
19412321 blk_mq_try_issue_directly(data.hctx, rq, &cookie);
19422322 } else {
1943
- blk_mq_put_ctx(data.ctx);
1944
- blk_mq_bio_to_request(rq, bio);
2323
+ /* Default case. */
19452324 blk_mq_sched_insert_request(rq, false, true, true);
19462325 }
19472326
19482327 return cookie;
2328
+queue_exit:
2329
+ blk_queue_exit(q);
2330
+ return BLK_QC_T_NONE;
2331
+}
2332
+
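/*
 * Editor's illustrative sketch, not part of this patch: blk_mq_submit_bio() is
 * not called directly by filesystems or drivers; they build a bio and call
 * submit_bio(), which lands here for blk-mq based disks. example_end_io() and
 * example_read_page() are hypothetical names.
 */
static void example_end_io(struct bio *bio)
{
	/* Runs on completion; a real caller would check bio->bi_status here. */
	bio_put(bio);
}

static void example_read_page(struct block_device *bdev, struct page *page)
{
	struct bio *bio = bio_alloc(GFP_NOIO, 1);	/* room for one bio_vec */

	bio_set_dev(bio, bdev);
	bio->bi_opf = REQ_OP_READ;
	bio->bi_iter.bi_sector = 0;			/* start of the device */
	bio_add_page(bio, page, PAGE_SIZE, 0);
	bio->bi_end_io = example_end_io;

	submit_bio(bio);				/* ends up in blk_mq_submit_bio() */
}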
2333
+static size_t order_to_size(unsigned int order)
2334
+{
2335
+ return (size_t)PAGE_SIZE << order;
2336
+}
2337
+
2338
+/* called before freeing request pool in @tags */
2339
+static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set,
2340
+ struct blk_mq_tags *tags, unsigned int hctx_idx)
2341
+{
2342
+ struct blk_mq_tags *drv_tags = set->tags[hctx_idx];
2343
+ struct page *page;
2344
+ unsigned long flags;
2345
+
2346
+ list_for_each_entry(page, &tags->page_list, lru) {
2347
+ unsigned long start = (unsigned long)page_address(page);
2348
+ unsigned long end = start + order_to_size(page->private);
2349
+ int i;
2350
+
2351
+ for (i = 0; i < set->queue_depth; i++) {
2352
+ struct request *rq = drv_tags->rqs[i];
2353
+ unsigned long rq_addr = (unsigned long)rq;
2354
+
2355
+ if (rq_addr >= start && rq_addr < end) {
2356
+ WARN_ON_ONCE(refcount_read(&rq->ref) != 0);
2357
+ cmpxchg(&drv_tags->rqs[i], rq, NULL);
2358
+ }
2359
+ }
2360
+ }
2361
+
2362
+ /*
2363
+ * Wait until all pending iterations are done.
2364
+ *
2365
+ * Request reference is cleared and it is guaranteed to be observed
2366
+ * after the ->lock is released.
2367
+ */
2368
+ spin_lock_irqsave(&drv_tags->lock, flags);
2369
+ spin_unlock_irqrestore(&drv_tags->lock, flags);
19492370 }
19502371
19512372 void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
....@@ -1966,42 +2387,44 @@
19662387 }
19672388 }
19682389
2390
+ blk_mq_clear_rq_mapping(set, tags, hctx_idx);
2391
+
19692392 while (!list_empty(&tags->page_list)) {
19702393 page = list_first_entry(&tags->page_list, struct page, lru);
19712394 list_del_init(&page->lru);
19722395 /*
19732396 * Remove kmemleak object previously allocated in
1974
- * blk_mq_init_rq_map().
2397
+ * blk_mq_alloc_rqs().
19752398 */
19762399 kmemleak_free(page_address(page));
19772400 __free_pages(page, page->private);
19782401 }
19792402 }
19802403
1981
-void blk_mq_free_rq_map(struct blk_mq_tags *tags)
2404
+void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags)
19822405 {
19832406 kfree(tags->rqs);
19842407 tags->rqs = NULL;
19852408 kfree(tags->static_rqs);
19862409 tags->static_rqs = NULL;
19872410
1988
- blk_mq_free_tags(tags);
2411
+ blk_mq_free_tags(tags, flags);
19892412 }
19902413
19912414 struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
19922415 unsigned int hctx_idx,
19932416 unsigned int nr_tags,
1994
- unsigned int reserved_tags)
2417
+ unsigned int reserved_tags,
2418
+ unsigned int flags)
19952419 {
19962420 struct blk_mq_tags *tags;
19972421 int node;
19982422
1999
- node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx);
2423
+ node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
20002424 if (node == NUMA_NO_NODE)
20012425 node = set->numa_node;
20022426
2003
- tags = blk_mq_init_tags(nr_tags, reserved_tags, node,
2004
- BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
2427
+ tags = blk_mq_init_tags(nr_tags, reserved_tags, node, flags);
20052428 if (!tags)
20062429 return NULL;
20072430
....@@ -2009,7 +2432,7 @@
20092432 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
20102433 node);
20112434 if (!tags->rqs) {
2012
- blk_mq_free_tags(tags);
2435
+ blk_mq_free_tags(tags, flags);
20132436 return NULL;
20142437 }
20152438
....@@ -2018,16 +2441,11 @@
20182441 node);
20192442 if (!tags->static_rqs) {
20202443 kfree(tags->rqs);
2021
- blk_mq_free_tags(tags);
2444
+ blk_mq_free_tags(tags, flags);
20222445 return NULL;
20232446 }
20242447
20252448 return tags;
2026
-}
2027
-
2028
-static size_t order_to_size(unsigned int order)
2029
-{
2030
- return (size_t)PAGE_SIZE << order;
20312449 }
20322450
20332451 static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
....@@ -2052,7 +2470,7 @@
20522470 size_t rq_size, left;
20532471 int node;
20542472
2055
- node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx);
2473
+ node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
20562474 if (node == NUMA_NO_NODE)
20572475 node = set->numa_node;
20582476
....@@ -2064,6 +2482,7 @@
20642482 */
20652483 rq_size = round_up(sizeof(struct request) + set->cmd_size,
20662484 cache_line_size());
2485
+ trace_android_vh_blk_alloc_rqs(&rq_size, set, tags);
20672486 left = rq_size * depth;
20682487
20692488 for (i = 0; i < depth; ) {
....@@ -2122,6 +2541,86 @@
21222541 return -ENOMEM;
21232542 }
21242543
2544
+struct rq_iter_data {
2545
+ struct blk_mq_hw_ctx *hctx;
2546
+ bool has_rq;
2547
+};
2548
+
2549
+static bool blk_mq_has_request(struct request *rq, void *data, bool reserved)
2550
+{
2551
+ struct rq_iter_data *iter_data = data;
2552
+
2553
+ if (rq->mq_hctx != iter_data->hctx)
2554
+ return true;
2555
+ iter_data->has_rq = true;
2556
+ return false;
2557
+}
2558
+
2559
+static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx)
2560
+{
2561
+ struct blk_mq_tags *tags = hctx->sched_tags ?
2562
+ hctx->sched_tags : hctx->tags;
2563
+ struct rq_iter_data data = {
2564
+ .hctx = hctx,
2565
+ };
2566
+
2567
+ blk_mq_all_tag_iter(tags, blk_mq_has_request, &data);
2568
+ return data.has_rq;
2569
+}
2570
+
2571
+static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu,
2572
+ struct blk_mq_hw_ctx *hctx)
2573
+{
2574
+ if (cpumask_next_and(-1, hctx->cpumask, cpu_online_mask) != cpu)
2575
+ return false;
2576
+ if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids)
2577
+ return false;
2578
+ return true;
2579
+}
2580
+
2581
+static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
2582
+{
2583
+ struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
2584
+ struct blk_mq_hw_ctx, cpuhp_online);
2585
+
2586
+ if (!cpumask_test_cpu(cpu, hctx->cpumask) ||
2587
+ !blk_mq_last_cpu_in_hctx(cpu, hctx))
2588
+ return 0;
2589
+
2590
+ /*
2591
+ * Prevent new request from being allocated on the current hctx.
2592
+ *
2593
+ * The smp_mb__after_atomic() Pairs with the implied barrier in
2594
+ * test_and_set_bit_lock in sbitmap_get(). Ensures the inactive flag is
2595
+ * seen once we return from the tag allocator.
2596
+ */
2597
+ set_bit(BLK_MQ_S_INACTIVE, &hctx->state);
2598
+ smp_mb__after_atomic();
2599
+
2600
+ /*
2601
+ * Try to grab a reference to the queue and wait for any outstanding
2602
+ * requests. If we could not grab a reference the queue has been
2603
+ * frozen and there are no requests.
2604
+ */
2605
+ if (percpu_ref_tryget(&hctx->queue->q_usage_counter)) {
2606
+ while (blk_mq_hctx_has_requests(hctx))
2607
+ msleep(5);
2608
+ percpu_ref_put(&hctx->queue->q_usage_counter);
2609
+ }
2610
+
2611
+ return 0;
2612
+}
2613
+
2614
+static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
2615
+{
2616
+ struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
2617
+ struct blk_mq_hw_ctx, cpuhp_online);
2618
+
2619
+ if (cpumask_test_cpu(cpu, hctx->cpumask))
2620
+ clear_bit(BLK_MQ_S_INACTIVE, &hctx->state);
2621
+ return 0;
2622
+}
2623
+
21252624 /*
21262625 * 'cpu' is going away. splice any existing rq_list entries from this
21272626 * software queue to the hw queue dispatch list, and ensure that it
....@@ -2132,13 +2631,18 @@
21322631 struct blk_mq_hw_ctx *hctx;
21332632 struct blk_mq_ctx *ctx;
21342633 LIST_HEAD(tmp);
2634
+ enum hctx_type type;
21352635
21362636 hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
2637
+ if (!cpumask_test_cpu(cpu, hctx->cpumask))
2638
+ return 0;
2639
+
21372640 ctx = __blk_mq_get_ctx(hctx->queue, cpu);
2641
+ type = hctx->type;
21382642
21392643 spin_lock(&ctx->lock);
2140
- if (!list_empty(&ctx->rq_list)) {
2141
- list_splice_init(&ctx->rq_list, &tmp);
2644
+ if (!list_empty(&ctx->rq_lists[type])) {
2645
+ list_splice_init(&ctx->rq_lists[type], &tmp);
21422646 blk_mq_hctx_clear_pending(hctx, ctx);
21432647 }
21442648 spin_unlock(&ctx->lock);
....@@ -2156,8 +2660,40 @@
21562660
21572661 static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
21582662 {
2663
+ if (!(hctx->flags & BLK_MQ_F_STACKING))
2664
+ cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
2665
+ &hctx->cpuhp_online);
21592666 cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD,
21602667 &hctx->cpuhp_dead);
2668
+}
2669
+
2670
+/*
2671
+ * Before freeing hw queue, clearing the flush request reference in
2672
+ * tags->rqs[] for avoiding potential UAF.
2673
+ */
2674
+static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags,
2675
+ unsigned int queue_depth, struct request *flush_rq)
2676
+{
2677
+ int i;
2678
+ unsigned long flags;
2679
+
2680
+ /* The hw queue may not be mapped yet */
2681
+ if (!tags)
2682
+ return;
2683
+
2684
+ WARN_ON_ONCE(refcount_read(&flush_rq->ref) != 0);
2685
+
2686
+ for (i = 0; i < queue_depth; i++)
2687
+ cmpxchg(&tags->rqs[i], flush_rq, NULL);
2688
+
2689
+ /*
2690
+ * Wait until all pending iterations are done.
2691
+ *
2692
+ * Request reference is cleared and it is guaranteed to be observed
2693
+ * after the ->lock is released.
2694
+ */
2695
+ spin_lock_irqsave(&tags->lock, flags);
2696
+ spin_unlock_irqrestore(&tags->lock, flags);
21612697 }
21622698
21632699 /* hctx->ctxs will be freed in queue's release handler */
....@@ -2165,18 +2701,24 @@
21652701 struct blk_mq_tag_set *set,
21662702 struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
21672703 {
2168
- blk_mq_debugfs_unregister_hctx(hctx);
2704
+ struct request *flush_rq = hctx->fq->flush_rq;
21692705
21702706 if (blk_mq_hw_queue_mapped(hctx))
21712707 blk_mq_tag_idle(hctx);
21722708
2709
+ blk_mq_clear_flush_rq_mapping(set->tags[hctx_idx],
2710
+ set->queue_depth, flush_rq);
21732711 if (set->ops->exit_request)
2174
- set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx);
2712
+ set->ops->exit_request(set, flush_rq, hctx_idx);
21752713
21762714 if (set->ops->exit_hctx)
21772715 set->ops->exit_hctx(hctx, hctx_idx);
21782716
21792717 blk_mq_remove_cpuhp(hctx);
2718
+
2719
+ spin_lock(&q->unused_hctx_lock);
2720
+ list_add(&hctx->hctx_list, &q->unused_hctx_list);
2721
+ spin_unlock(&q->unused_hctx_lock);
21802722 }
21812723
21822724 static void blk_mq_exit_hw_queues(struct request_queue *q,
....@@ -2188,112 +2730,160 @@
21882730 queue_for_each_hw_ctx(q, hctx, i) {
21892731 if (i == nr_queue)
21902732 break;
2733
+ blk_mq_debugfs_unregister_hctx(hctx);
21912734 blk_mq_exit_hctx(q, set, hctx, i);
21922735 }
2736
+}
2737
+
2738
+static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
2739
+{
2740
+ int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
2741
+
2742
+ BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu),
2743
+ __alignof__(struct blk_mq_hw_ctx)) !=
2744
+ sizeof(struct blk_mq_hw_ctx));
2745
+
2746
+ if (tag_set->flags & BLK_MQ_F_BLOCKING)
2747
+ hw_ctx_size += sizeof(struct srcu_struct);
2748
+
2749
+ return hw_ctx_size;
21932750 }
21942751
21952752 static int blk_mq_init_hctx(struct request_queue *q,
21962753 struct blk_mq_tag_set *set,
21972754 struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
21982755 {
2199
- int node;
2756
+ hctx->queue_num = hctx_idx;
22002757
2201
- node = hctx->numa_node;
2758
+ if (!(hctx->flags & BLK_MQ_F_STACKING))
2759
+ cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
2760
+ &hctx->cpuhp_online);
2761
+ cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
2762
+
2763
+ hctx->tags = set->tags[hctx_idx];
2764
+
2765
+ if (set->ops->init_hctx &&
2766
+ set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
2767
+ goto unregister_cpu_notifier;
2768
+
2769
+ if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx,
2770
+ hctx->numa_node))
2771
+ goto exit_hctx;
2772
+ return 0;
2773
+
2774
+ exit_hctx:
2775
+ if (set->ops->exit_hctx)
2776
+ set->ops->exit_hctx(hctx, hctx_idx);
2777
+ unregister_cpu_notifier:
2778
+ blk_mq_remove_cpuhp(hctx);
2779
+ return -1;
2780
+}
2781
+
2782
+static struct blk_mq_hw_ctx *
2783
+blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
2784
+ int node)
2785
+{
2786
+ struct blk_mq_hw_ctx *hctx;
2787
+ gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;
2788
+
2789
+ hctx = kzalloc_node(blk_mq_hw_ctx_size(set), gfp, node);
2790
+ if (!hctx)
2791
+ goto fail_alloc_hctx;
2792
+
2793
+ if (!zalloc_cpumask_var_node(&hctx->cpumask, gfp, node))
2794
+ goto free_hctx;
2795
+
2796
+ atomic_set(&hctx->nr_active, 0);
22022797 if (node == NUMA_NO_NODE)
2203
- node = hctx->numa_node = set->numa_node;
2798
+ node = set->numa_node;
2799
+ hctx->numa_node = node;
22042800
22052801 INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
22062802 spin_lock_init(&hctx->lock);
22072803 INIT_LIST_HEAD(&hctx->dispatch);
22082804 hctx->queue = q;
2209
- hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED;
2805
+ hctx->flags = set->flags & ~BLK_MQ_F_TAG_QUEUE_SHARED;
22102806
2211
- cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
2212
-
2213
- hctx->tags = set->tags[hctx_idx];
2807
+ INIT_LIST_HEAD(&hctx->hctx_list);
22142808
22152809 /*
22162810 * Allocate space for all possible cpus to avoid allocation at
22172811 * runtime
22182812 */
22192813 hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *),
2220
- GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node);
2814
+ gfp, node);
22212815 if (!hctx->ctxs)
2222
- goto unregister_cpu_notifier;
2816
+ goto free_cpumask;
22232817
22242818 if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8),
2225
- GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node))
2819
+ gfp, node))
22262820 goto free_ctxs;
2227
-
22282821 hctx->nr_ctx = 0;
22292822
22302823 spin_lock_init(&hctx->dispatch_wait_lock);
22312824 init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
22322825 INIT_LIST_HEAD(&hctx->dispatch_wait.entry);
22332826
2234
- if (set->ops->init_hctx &&
2235
- set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
2236
- goto free_bitmap;
2237
-
2238
- hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size,
2239
- GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY);
2827
+ hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp);
22402828 if (!hctx->fq)
2241
- goto exit_hctx;
2242
-
2243
- if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, node))
2244
- goto free_fq;
2829
+ goto free_bitmap;
22452830
22462831 if (hctx->flags & BLK_MQ_F_BLOCKING)
22472832 init_srcu_struct(hctx->srcu);
2833
+ blk_mq_hctx_kobj_init(hctx);
22482834
2249
- blk_mq_debugfs_register_hctx(q, hctx);
2835
+ return hctx;
22502836
2251
- return 0;
2252
-
2253
- free_fq:
2254
- blk_free_flush_queue(hctx->fq);
2255
- exit_hctx:
2256
- if (set->ops->exit_hctx)
2257
- set->ops->exit_hctx(hctx, hctx_idx);
22582837 free_bitmap:
22592838 sbitmap_free(&hctx->ctx_map);
22602839 free_ctxs:
22612840 kfree(hctx->ctxs);
2262
- unregister_cpu_notifier:
2263
- blk_mq_remove_cpuhp(hctx);
2264
- return -1;
2841
+ free_cpumask:
2842
+ free_cpumask_var(hctx->cpumask);
2843
+ free_hctx:
2844
+ kfree(hctx);
2845
+ fail_alloc_hctx:
2846
+ return NULL;
22652847 }
22662848
22672849 static void blk_mq_init_cpu_queues(struct request_queue *q,
22682850 unsigned int nr_hw_queues)
22692851 {
2270
- unsigned int i;
2852
+ struct blk_mq_tag_set *set = q->tag_set;
2853
+ unsigned int i, j;
22712854
22722855 for_each_possible_cpu(i) {
22732856 struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
22742857 struct blk_mq_hw_ctx *hctx;
2858
+ int k;
22752859
22762860 __ctx->cpu = i;
22772861 spin_lock_init(&__ctx->lock);
2278
- INIT_LIST_HEAD(&__ctx->rq_list);
2862
+ for (k = HCTX_TYPE_DEFAULT; k < HCTX_MAX_TYPES; k++)
2863
+ INIT_LIST_HEAD(&__ctx->rq_lists[k]);
2864
+
22792865 __ctx->queue = q;
22802866
22812867 /*
22822868 * Set local node, IFF we have more than one hw queue. If
22832869 * not, we remain on the home node of the device
22842870 */
2285
- hctx = blk_mq_map_queue(q, i);
2286
- if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
2287
- hctx->numa_node = local_memory_node(cpu_to_node(i));
2871
+ for (j = 0; j < set->nr_maps; j++) {
2872
+ hctx = blk_mq_map_queue_type(q, j, i);
2873
+ if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
2874
+ hctx->numa_node = cpu_to_node(i);
2875
+ }
22882876 }
22892877 }
22902878
2291
-static bool __blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, int hctx_idx)
2879
+static bool __blk_mq_alloc_map_and_request(struct blk_mq_tag_set *set,
2880
+ int hctx_idx)
22922881 {
2882
+ unsigned int flags = set->flags;
22932883 int ret = 0;
22942884
22952885 set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx,
2296
- set->queue_depth, set->reserved_tags);
2886
+ set->queue_depth, set->reserved_tags, flags);
22972887 if (!set->tags[hctx_idx])
22982888 return false;
22992889
....@@ -2302,7 +2892,7 @@
23022892 if (!ret)
23032893 return true;
23042894
2305
- blk_mq_free_rq_map(set->tags[hctx_idx]);
2895
+ blk_mq_free_rq_map(set->tags[hctx_idx], flags);
23062896 set->tags[hctx_idx] = NULL;
23072897 return false;
23082898 }
....@@ -2310,16 +2900,18 @@
23102900 static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
23112901 unsigned int hctx_idx)
23122902 {
2313
- if (set->tags[hctx_idx]) {
2903
+ unsigned int flags = set->flags;
2904
+
2905
+ if (set->tags && set->tags[hctx_idx]) {
23142906 blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
2315
- blk_mq_free_rq_map(set->tags[hctx_idx]);
2907
+ blk_mq_free_rq_map(set->tags[hctx_idx], flags);
23162908 set->tags[hctx_idx] = NULL;
23172909 }
23182910 }
23192911
23202912 static void blk_mq_map_swqueue(struct request_queue *q)
23212913 {
2322
- unsigned int i, hctx_idx;
2914
+ unsigned int i, j, hctx_idx;
23232915 struct blk_mq_hw_ctx *hctx;
23242916 struct blk_mq_ctx *ctx;
23252917 struct blk_mq_tag_set *set = q->tag_set;
....@@ -2336,25 +2928,52 @@
23362928 * If the cpu isn't present, the cpu is mapped to first hctx.
23372929 */
23382930 for_each_possible_cpu(i) {
2339
- hctx_idx = q->mq_map[i];
2340
- /* unmapped hw queue can be remapped after CPU topo changed */
2341
- if (!set->tags[hctx_idx] &&
2342
- !__blk_mq_alloc_rq_map(set, hctx_idx)) {
2343
- /*
2344
- * If tags initialization fail for some hctx,
2345
- * that hctx won't be brought online. In this
2346
- * case, remap the current ctx to hctx[0] which
2347
- * is guaranteed to always have tags allocated
2348
- */
2349
- q->mq_map[i] = 0;
2350
- }
23512931
23522932 ctx = per_cpu_ptr(q->queue_ctx, i);
2353
- hctx = blk_mq_map_queue(q, i);
2933
+ for (j = 0; j < set->nr_maps; j++) {
2934
+ if (!set->map[j].nr_queues) {
2935
+ ctx->hctxs[j] = blk_mq_map_queue_type(q,
2936
+ HCTX_TYPE_DEFAULT, i);
2937
+ continue;
2938
+ }
2939
+ hctx_idx = set->map[j].mq_map[i];
2940
+ /* unmapped hw queue can be remapped after CPU topo changed */
2941
+ if (!set->tags[hctx_idx] &&
2942
+ !__blk_mq_alloc_map_and_request(set, hctx_idx)) {
2943
+ /*
2944
+ * If tags initialization fail for some hctx,
2945
+ * that hctx won't be brought online. In this
2946
+ * case, remap the current ctx to hctx[0] which
2947
+ * is guaranteed to always have tags allocated
2948
+ */
2949
+ set->map[j].mq_map[i] = 0;
2950
+ }
23542951
2355
- cpumask_set_cpu(i, hctx->cpumask);
2356
- ctx->index_hw = hctx->nr_ctx;
2357
- hctx->ctxs[hctx->nr_ctx++] = ctx;
2952
+ hctx = blk_mq_map_queue_type(q, j, i);
2953
+ ctx->hctxs[j] = hctx;
2954
+ /*
2955
+ * If the CPU is already set in the mask, then we've
2956
+ * mapped this one already. This can happen if
2957
+ * devices share queues across queue maps.
2958
+ */
2959
+ if (cpumask_test_cpu(i, hctx->cpumask))
2960
+ continue;
2961
+
2962
+ cpumask_set_cpu(i, hctx->cpumask);
2963
+ hctx->type = j;
2964
+ ctx->index_hw[hctx->type] = hctx->nr_ctx;
2965
+ hctx->ctxs[hctx->nr_ctx++] = ctx;
2966
+
2967
+ /*
2968
+ * If the nr_ctx type overflows, we have exceeded the
2969
+ * number of sw queues we can support.
2970
+ */
2971
+ BUG_ON(!hctx->nr_ctx);
2972
+ }
2973
+
2974
+ for (; j < HCTX_MAX_TYPES; j++)
2975
+ ctx->hctxs[j] = blk_mq_map_queue_type(q,
2976
+ HCTX_TYPE_DEFAULT, i);
23582977 }
23592978
23602979 queue_for_each_hw_ctx(q, hctx, i) {
....@@ -2403,14 +3022,14 @@
24033022
24043023 queue_for_each_hw_ctx(q, hctx, i) {
24053024 if (shared)
2406
- hctx->flags |= BLK_MQ_F_TAG_SHARED;
3025
+ hctx->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
24073026 else
2408
- hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
3027
+ hctx->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
24093028 }
24103029 }
24113030
2412
-static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set,
2413
- bool shared)
3031
+static void blk_mq_update_tag_set_shared(struct blk_mq_tag_set *set,
3032
+ bool shared)
24143033 {
24153034 struct request_queue *q;
24163035
....@@ -2428,12 +3047,12 @@
24283047 struct blk_mq_tag_set *set = q->tag_set;
24293048
24303049 mutex_lock(&set->tag_list_lock);
2431
- list_del_rcu(&q->tag_set_list);
3050
+ list_del(&q->tag_set_list);
24323051 if (list_is_singular(&set->tag_list)) {
24333052 /* just transitioned to unshared */
2434
- set->flags &= ~BLK_MQ_F_TAG_SHARED;
3053
+ set->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
24353054 /* update existing queue */
2436
- blk_mq_update_tag_set_depth(set, false);
3055
+ blk_mq_update_tag_set_shared(set, false);
24373056 }
24383057 mutex_unlock(&set->tag_list_lock);
24393058 INIT_LIST_HEAD(&q->tag_set_list);
....@@ -2442,24 +3061,50 @@
24423061 static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
24433062 struct request_queue *q)
24443063 {
2445
- q->tag_set = set;
2446
-
24473064 mutex_lock(&set->tag_list_lock);
24483065
24493066 /*
24503067 * Check to see if we're transitioning to shared (from 1 to 2 queues).
24513068 */
24523069 if (!list_empty(&set->tag_list) &&
2453
- !(set->flags & BLK_MQ_F_TAG_SHARED)) {
2454
- set->flags |= BLK_MQ_F_TAG_SHARED;
3070
+ !(set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
3071
+ set->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
24553072 /* update existing queue */
2456
- blk_mq_update_tag_set_depth(set, true);
3073
+ blk_mq_update_tag_set_shared(set, true);
24573074 }
2458
- if (set->flags & BLK_MQ_F_TAG_SHARED)
3075
+ if (set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
24593076 queue_set_hctx_shared(q, true);
2460
- list_add_tail_rcu(&q->tag_set_list, &set->tag_list);
3077
+ list_add_tail(&q->tag_set_list, &set->tag_list);
24613078
24623079 mutex_unlock(&set->tag_list_lock);
3080
+}
3081
+
3082
+/* All allocations will be freed in release handler of q->mq_kobj */
3083
+static int blk_mq_alloc_ctxs(struct request_queue *q)
3084
+{
3085
+ struct blk_mq_ctxs *ctxs;
3086
+ int cpu;
3087
+
3088
+ ctxs = kzalloc(sizeof(*ctxs), GFP_KERNEL);
3089
+ if (!ctxs)
3090
+ return -ENOMEM;
3091
+
3092
+ ctxs->queue_ctx = alloc_percpu(struct blk_mq_ctx);
3093
+ if (!ctxs->queue_ctx)
3094
+ goto fail;
3095
+
3096
+ for_each_possible_cpu(cpu) {
3097
+ struct blk_mq_ctx *ctx = per_cpu_ptr(ctxs->queue_ctx, cpu);
3098
+ ctx->ctxs = ctxs;
3099
+ }
3100
+
3101
+ q->mq_kobj = &ctxs->kobj;
3102
+ q->queue_ctx = ctxs->queue_ctx;
3103
+
3104
+ return 0;
3105
+ fail:
3106
+ kfree(ctxs);
3107
+ return -ENOMEM;
24633108 }
24643109
24653110 /*
....@@ -2470,17 +3115,17 @@
24703115 */
24713116 void blk_mq_release(struct request_queue *q)
24723117 {
2473
- struct blk_mq_hw_ctx *hctx;
2474
- unsigned int i;
3118
+ struct blk_mq_hw_ctx *hctx, *next;
3119
+ int i;
24753120
2476
- /* hctx kobj stays in hctx */
2477
- queue_for_each_hw_ctx(q, hctx, i) {
2478
- if (!hctx)
2479
- continue;
3121
+ queue_for_each_hw_ctx(q, hctx, i)
3122
+ WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list));
3123
+
3124
+ /* all hctx are in .unused_hctx_list now */
3125
+ list_for_each_entry_safe(hctx, next, &q->unused_hctx_list, hctx_list) {
3126
+ list_del_init(&hctx->hctx_list);
24803127 kobject_put(&hctx->kobj);
24813128 }
2482
-
2483
- q->mq_map = NULL;
24843129
24853130 kfree(q->queue_hw_ctx);
24863131
....@@ -2489,102 +3134,184 @@
24893134 * both share lifetime with request queue.
24903135 */
24913136 blk_mq_sysfs_deinit(q);
2492
-
2493
- free_percpu(q->queue_ctx);
24943137 }
24953138
2496
-struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
3139
+struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
3140
+ void *queuedata)
24973141 {
24983142 struct request_queue *uninit_q, *q;
24993143
2500
- uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node, NULL);
3144
+ uninit_q = blk_alloc_queue(set->numa_node);
25013145 if (!uninit_q)
25023146 return ERR_PTR(-ENOMEM);
3147
+ uninit_q->queuedata = queuedata;
25033148
2504
- q = blk_mq_init_allocated_queue(set, uninit_q);
3149
+ /*
3150
+ * Initialize the queue without an elevator. device_add_disk() will do
3151
+ * the initialization.
3152
+ */
3153
+ q = blk_mq_init_allocated_queue(set, uninit_q, false);
25053154 if (IS_ERR(q))
25063155 blk_cleanup_queue(uninit_q);
25073156
25083157 return q;
25093158 }
3159
+EXPORT_SYMBOL_GPL(blk_mq_init_queue_data);
3160
+
3161
+struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
3162
+{
3163
+ return blk_mq_init_queue_data(set, NULL);
3164
+}
25103165 EXPORT_SYMBOL(blk_mq_init_queue);
25113166
2512
-static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
3167
+/*
3168
+ * Helper for setting up a queue with mq ops, given queue depth, and
3169
+ * the passed in mq ops flags.
3170
+ */
3171
+struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set,
3172
+ const struct blk_mq_ops *ops,
3173
+ unsigned int queue_depth,
3174
+ unsigned int set_flags)
25133175 {
2514
- int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
3176
+ struct request_queue *q;
3177
+ int ret;
25153178
2516
- BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu),
2517
- __alignof__(struct blk_mq_hw_ctx)) !=
2518
- sizeof(struct blk_mq_hw_ctx));
3179
+ memset(set, 0, sizeof(*set));
3180
+ set->ops = ops;
3181
+ set->nr_hw_queues = 1;
3182
+ set->nr_maps = 1;
3183
+ set->queue_depth = queue_depth;
3184
+ set->numa_node = NUMA_NO_NODE;
3185
+ set->flags = set_flags;
25193186
2520
- if (tag_set->flags & BLK_MQ_F_BLOCKING)
2521
- hw_ctx_size += sizeof(struct srcu_struct);
3187
+ ret = blk_mq_alloc_tag_set(set);
3188
+ if (ret)
3189
+ return ERR_PTR(ret);
25223190
2523
- return hw_ctx_size;
3191
+ q = blk_mq_init_queue(set);
3192
+ if (IS_ERR(q)) {
3193
+ blk_mq_free_tag_set(set);
3194
+ return q;
3195
+ }
3196
+
3197
+ return q;
3198
+}
3199
+EXPORT_SYMBOL(blk_mq_init_sq_queue);
3200
+
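/*
 * Editor's illustrative sketch, not part of this patch: how a simple
 * single-queue driver might use blk_mq_init_sq_queue() above. my_tag_set,
 * my_mq_ops, my_queue_rq and my_create_queue are hypothetical driver symbols.
 */
static struct blk_mq_tag_set my_tag_set;

static blk_status_t my_queue_rq(struct blk_mq_hw_ctx *hctx,
				const struct blk_mq_queue_data *bd)
{
	/* A real driver would hand bd->rq to its hardware here. */
	blk_mq_start_request(bd->rq);
	return BLK_STS_OK;
}

static const struct blk_mq_ops my_mq_ops = {
	.queue_rq	= my_queue_rq,
};

static struct request_queue *my_create_queue(void)
{
	/* One hw queue, depth 128, allow bio merging; caller checks IS_ERR(). */
	return blk_mq_init_sq_queue(&my_tag_set, &my_mq_ops, 128,
				    BLK_MQ_F_SHOULD_MERGE);
}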
3201
+static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx(
3202
+ struct blk_mq_tag_set *set, struct request_queue *q,
3203
+ int hctx_idx, int node)
3204
+{
3205
+ struct blk_mq_hw_ctx *hctx = NULL, *tmp;
3206
+
3207
+ /* reuse dead hctx first */
3208
+ spin_lock(&q->unused_hctx_lock);
3209
+ list_for_each_entry(tmp, &q->unused_hctx_list, hctx_list) {
3210
+ if (tmp->numa_node == node) {
3211
+ hctx = tmp;
3212
+ break;
3213
+ }
3214
+ }
3215
+ if (hctx)
3216
+ list_del_init(&hctx->hctx_list);
3217
+ spin_unlock(&q->unused_hctx_lock);
3218
+
3219
+ if (!hctx)
3220
+ hctx = blk_mq_alloc_hctx(q, set, node);
3221
+ if (!hctx)
3222
+ goto fail;
3223
+
3224
+ if (blk_mq_init_hctx(q, set, hctx, hctx_idx))
3225
+ goto free_hctx;
3226
+
3227
+ return hctx;
3228
+
3229
+ free_hctx:
3230
+ kobject_put(&hctx->kobj);
3231
+ fail:
3232
+ return NULL;
25243233 }
25253234
25263235 static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
25273236 struct request_queue *q)
25283237 {
2529
- int i, j;
3238
+ int i, j, end;
25303239 struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;
25313240
2532
- blk_mq_sysfs_unregister(q);
3241
+ if (q->nr_hw_queues < set->nr_hw_queues) {
3242
+ struct blk_mq_hw_ctx **new_hctxs;
3243
+
3244
+ new_hctxs = kcalloc_node(set->nr_hw_queues,
3245
+ sizeof(*new_hctxs), GFP_KERNEL,
3246
+ set->numa_node);
3247
+ if (!new_hctxs)
3248
+ return;
3249
+ if (hctxs)
3250
+ memcpy(new_hctxs, hctxs, q->nr_hw_queues *
3251
+ sizeof(*hctxs));
3252
+ q->queue_hw_ctx = new_hctxs;
3253
+ kfree(hctxs);
3254
+ hctxs = new_hctxs;
3255
+ }
25333256
25343257 /* protect against switching io scheduler */
25353258 mutex_lock(&q->sysfs_lock);
25363259 for (i = 0; i < set->nr_hw_queues; i++) {
25373260 int node;
3261
+ struct blk_mq_hw_ctx *hctx;
25383262
2539
- if (hctxs[i])
3263
+ node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], i);
3264
+ /*
3265
+ * If the hw queue has been mapped to another numa node,
3266
+ * we need to realloc the hctx. If allocation fails, fall back
3267
+ * to the previous one.
3268
+ */
3269
+ if (hctxs[i] && (hctxs[i]->numa_node == node))
25403270 continue;
25413271
2542
- node = blk_mq_hw_queue_to_node(q->mq_map, i);
2543
- hctxs[i] = kzalloc_node(blk_mq_hw_ctx_size(set),
2544
- GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
2545
- node);
2546
- if (!hctxs[i])
2547
- break;
2548
-
2549
- if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask,
2550
- GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
2551
- node)) {
2552
- kfree(hctxs[i]);
2553
- hctxs[i] = NULL;
2554
- break;
3272
+ hctx = blk_mq_alloc_and_init_hctx(set, q, i, node);
3273
+ if (hctx) {
3274
+ if (hctxs[i])
3275
+ blk_mq_exit_hctx(q, set, hctxs[i], i);
3276
+ hctxs[i] = hctx;
3277
+ } else {
3278
+ if (hctxs[i])
3279
+ pr_warn("Allocate new hctx on node %d fails,\
3280
+ fallback to previous one on node %d\n",
3281
+ node, hctxs[i]->numa_node);
3282
+ else
3283
+ break;
25553284 }
2556
-
2557
- atomic_set(&hctxs[i]->nr_active, 0);
2558
- hctxs[i]->numa_node = node;
2559
- hctxs[i]->queue_num = i;
2560
-
2561
- if (blk_mq_init_hctx(q, set, hctxs[i], i)) {
2562
- free_cpumask_var(hctxs[i]->cpumask);
2563
- kfree(hctxs[i]);
2564
- hctxs[i] = NULL;
2565
- break;
2566
- }
2567
- blk_mq_hctx_kobj_init(hctxs[i]);
25683285 }
2569
- for (j = i; j < q->nr_hw_queues; j++) {
3286
+ /*
3287
+ * Increasing nr_hw_queues fails. Free the newly allocated
3288
+ * hctxs and keep the previous q->nr_hw_queues.
3289
+ */
3290
+ if (i != set->nr_hw_queues) {
3291
+ j = q->nr_hw_queues;
3292
+ end = i;
3293
+ } else {
3294
+ j = i;
3295
+ end = q->nr_hw_queues;
3296
+ q->nr_hw_queues = set->nr_hw_queues;
3297
+ }
3298
+
3299
+ for (; j < end; j++) {
25703300 struct blk_mq_hw_ctx *hctx = hctxs[j];
25713301
25723302 if (hctx) {
25733303 if (hctx->tags)
25743304 blk_mq_free_map_and_requests(set, j);
25753305 blk_mq_exit_hctx(q, set, hctx, j);
2576
- kobject_put(&hctx->kobj);
25773306 hctxs[j] = NULL;
2578
-
25793307 }
25803308 }
2581
- q->nr_hw_queues = i;
25823309 mutex_unlock(&q->sysfs_lock);
2583
- blk_mq_sysfs_register(q);
25843310 }
25853311
25863312 struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
2587
- struct request_queue *q)
3313
+ struct request_queue *q,
3314
+ bool elevator_init)
25883315 {
25893316 /* mark the queue as mq asap */
25903317 q->mq_ops = set->ops;
....@@ -2595,19 +3322,14 @@
25953322 if (!q->poll_cb)
25963323 goto err_exit;
25973324
2598
- q->queue_ctx = alloc_percpu(struct blk_mq_ctx);
2599
- if (!q->queue_ctx)
2600
- goto err_exit;
3325
+ if (blk_mq_alloc_ctxs(q))
3326
+ goto err_poll;
26013327
26023328 /* init q->mq_kobj and sw queues' kobjects */
26033329 blk_mq_sysfs_init(q);
26043330
2605
- q->queue_hw_ctx = kcalloc_node(nr_cpu_ids, sizeof(*(q->queue_hw_ctx)),
2606
- GFP_KERNEL, set->numa_node);
2607
- if (!q->queue_hw_ctx)
2608
- goto err_percpu;
2609
-
2610
- q->mq_map = set->mq_map;
3331
+ INIT_LIST_HEAD(&q->unused_hctx_list);
3332
+ spin_lock_init(&q->unused_hctx_lock);
26113333
26123334 blk_mq_realloc_hw_ctxs(set, q);
26133335 if (!q->nr_hw_queues)
....@@ -2616,12 +3338,12 @@
26163338 INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
26173339 blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
26183340
2619
- q->nr_queues = nr_cpu_ids;
3341
+ q->tag_set = set;
26203342
26213343 q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
2622
-
2623
- if (!(set->flags & BLK_MQ_F_SG_MERGE))
2624
- queue_flag_set_unlocked(QUEUE_FLAG_NO_SG_MERGE, q);
3344
+ if (set->nr_maps > HCTX_TYPE_POLL &&
3345
+ set->map[HCTX_TYPE_POLL].nr_queues)
3346
+ blk_queue_flag_set(QUEUE_FLAG_POLL, q);
26253347
26263348 q->sg_reserved_size = INT_MAX;
26273349
....@@ -2629,41 +3351,29 @@
26293351 INIT_LIST_HEAD(&q->requeue_list);
26303352 spin_lock_init(&q->requeue_lock);
26313353
2632
- blk_queue_make_request(q, blk_mq_make_request);
2633
- if (q->mq_ops->poll)
2634
- q->poll_fn = blk_mq_poll;
2635
-
2636
- /*
2637
- * Do this after blk_queue_make_request() overrides it...
2638
- */
26393354 q->nr_requests = set->queue_depth;
26403355
26413356 /*
26423357 * Default to classic polling
26433358 */
2644
- q->poll_nsec = -1;
2645
-
2646
- if (set->ops->complete)
2647
- blk_queue_softirq_done(q, set->ops->complete);
3359
+ q->poll_nsec = BLK_MQ_POLL_CLASSIC;
26483360
26493361 blk_mq_init_cpu_queues(q, set->nr_hw_queues);
26503362 blk_mq_add_queue_tag_set(set, q);
26513363 blk_mq_map_swqueue(q);
26523364
2653
- if (!(set->flags & BLK_MQ_F_NO_SCHED)) {
2654
- int ret;
2655
-
2656
- ret = elevator_init_mq(q);
2657
- if (ret)
2658
- return ERR_PTR(ret);
2659
- }
3365
+ if (elevator_init)
3366
+ elevator_init_mq(q);
26603367
26613368 return q;
26623369
26633370 err_hctxs:
26643371 kfree(q->queue_hw_ctx);
2665
-err_percpu:
2666
- free_percpu(q->queue_ctx);
3372
+ q->nr_hw_queues = 0;
3373
+ blk_mq_sysfs_deinit(q);
3374
+err_poll:
3375
+ blk_stat_free_callback(q->poll_cb);
3376
+ q->poll_cb = NULL;
26673377 err_exit:
26683378 q->mq_ops = NULL;
26693379 return ERR_PTR(-ENOMEM);
....@@ -2681,38 +3391,21 @@
26813391 blk_mq_del_queue_tag_set(q);
26823392 }
26833393
2684
-/* Basically redo blk_mq_init_queue with queue frozen */
2685
-static void blk_mq_queue_reinit(struct request_queue *q)
2686
-{
2687
- WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));
2688
-
2689
- blk_mq_debugfs_unregister_hctxs(q);
2690
- blk_mq_sysfs_unregister(q);
2691
-
2692
- /*
2693
- * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
2694
- * we should change hctx numa_node according to the new topology (this
2695
- * involves freeing and re-allocating memory, worth doing?)
2696
- */
2697
- blk_mq_map_swqueue(q);
2698
-
2699
- blk_mq_sysfs_register(q);
2700
- blk_mq_debugfs_register_hctxs(q);
2701
-}
2702
-
27033394 static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
27043395 {
27053396 int i;
27063397
2707
- for (i = 0; i < set->nr_hw_queues; i++)
2708
- if (!__blk_mq_alloc_rq_map(set, i))
3398
+ for (i = 0; i < set->nr_hw_queues; i++) {
3399
+ if (!__blk_mq_alloc_map_and_request(set, i))
27093400 goto out_unwind;
3401
+ cond_resched();
3402
+ }
27103403
27113404 return 0;
27123405
27133406 out_unwind:
27143407 while (--i >= 0)
2715
- blk_mq_free_rq_map(set->tags[i]);
3408
+ blk_mq_free_map_and_requests(set, i);
27163409
27173410 return -ENOMEM;
27183411 }
....@@ -2722,7 +3415,7 @@
27223415 * may reduce the depth asked for, if memory is tight. set->queue_depth
27233416 * will be updated to reflect the allocated depth.
27243417 */
2725
-static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
3418
+static int blk_mq_alloc_map_and_requests(struct blk_mq_tag_set *set)
27263419 {
27273420 unsigned int depth;
27283421 int err;
....@@ -2754,7 +3447,17 @@
27543447
27553448 static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
27563449 {
2757
- if (set->ops->map_queues) {
3450
+ /*
3451
+ * blk_mq_map_queues() and multiple .map_queues() implementations
3452
+ * expect that set->map[HCTX_TYPE_DEFAULT].nr_queues is set to the
3453
+ * number of hardware queues.
3454
+ */
3455
+ if (set->nr_maps == 1)
3456
+ set->map[HCTX_TYPE_DEFAULT].nr_queues = set->nr_hw_queues;
3457
+
3458
+ if (set->ops->map_queues && !is_kdump_kernel()) {
3459
+ int i;
3460
+
27583461 /*
27593462 * transport .map_queues is usually done in the following
27603463 * way:
....@@ -2762,18 +3465,44 @@
27623465 * for (queue = 0; queue < set->nr_hw_queues; queue++) {
27633466 * mask = get_cpu_mask(queue)
27643467 * for_each_cpu(cpu, mask)
2765
- * set->mq_map[cpu] = queue;
3468
+ * set->map[x].mq_map[cpu] = queue;
27663469 * }
27673470 *
27683471 * When we need to remap, the table has to be cleared for
27693472 * killing stale mapping since one CPU may not be mapped
27703473 * to any hw queue.
27713474 */
2772
- blk_mq_clear_mq_map(set);
3475
+ for (i = 0; i < set->nr_maps; i++)
3476
+ blk_mq_clear_mq_map(&set->map[i]);
27733477
27743478 return set->ops->map_queues(set);
2775
- } else
2776
- return blk_mq_map_queues(set);
3479
+ } else {
3480
+ BUG_ON(set->nr_maps > 1);
3481
+ return blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
3482
+ }
3483
+}
3484
+
3485
+static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set,
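/*
 * Editor's illustrative sketch, not part of this patch: a transport
 * ->map_queues() implementation following the pattern spelled out in the
 * comment above. example_get_queue_affinity() is a hypothetical helper that
 * returns the CPU mask a hardware queue's interrupt is affine to.
 */
static int example_map_queues(struct blk_mq_tag_set *set)
{
	struct blk_mq_queue_map *qmap = &set->map[HCTX_TYPE_DEFAULT];
	unsigned int queue, cpu;

	for (queue = 0; queue < set->nr_hw_queues; queue++) {
		const struct cpumask *mask = example_get_queue_affinity(queue);

		for_each_cpu(cpu, mask)
			qmap->mq_map[cpu] = queue;
	}
	return 0;
}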
3486
+ int cur_nr_hw_queues, int new_nr_hw_queues)
3487
+{
3488
+ struct blk_mq_tags **new_tags;
3489
+
3490
+ if (cur_nr_hw_queues >= new_nr_hw_queues)
3491
+ return 0;
3492
+
3493
+ new_tags = kcalloc_node(new_nr_hw_queues, sizeof(struct blk_mq_tags *),
3494
+ GFP_KERNEL, set->numa_node);
3495
+ if (!new_tags)
3496
+ return -ENOMEM;
3497
+
3498
+ if (set->tags)
3499
+ memcpy(new_tags, set->tags, cur_nr_hw_queues *
3500
+ sizeof(*set->tags));
3501
+ kfree(set->tags);
3502
+ set->tags = new_tags;
3503
+ set->nr_hw_queues = new_nr_hw_queues;
3504
+
3505
+ return 0;
27773506 }
27783507
27793508 /*
....@@ -2784,7 +3513,7 @@
27843513 */
27853514 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
27863515 {
2787
- int ret;
3516
+ int i, ret;
27883517
27893518 BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);
27903519
....@@ -2807,6 +3536,11 @@
28073536 set->queue_depth = BLK_MQ_MAX_DEPTH;
28083537 }
28093538
3539
+ if (!set->nr_maps)
3540
+ set->nr_maps = 1;
3541
+ else if (set->nr_maps > HCTX_MAX_TYPES)
3542
+ return -EINVAL;
3543
+
28103544 /*
28113545 * If a crashdump is active, then we are potentially in a very
28123546 * memory constrained environment. Limit us to 1 queue and
....@@ -2814,42 +3548,59 @@
28143548 */
28153549 if (is_kdump_kernel()) {
28163550 set->nr_hw_queues = 1;
3551
+ set->nr_maps = 1;
28173552 set->queue_depth = min(64U, set->queue_depth);
28183553 }
28193554 /*
2820
- * There is no use for more h/w queues than cpus.
3555
+ * There is no use for more h/w queues than cpus if we just have
3556
+ * a single map
28213557 */
2822
- if (set->nr_hw_queues > nr_cpu_ids)
3558
+ if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids)
28233559 set->nr_hw_queues = nr_cpu_ids;
28243560
2825
- set->tags = kcalloc_node(nr_cpu_ids, sizeof(struct blk_mq_tags *),
2826
- GFP_KERNEL, set->numa_node);
2827
- if (!set->tags)
3561
+ if (blk_mq_realloc_tag_set_tags(set, 0, set->nr_hw_queues) < 0)
28283562 return -ENOMEM;
28293563
28303564 ret = -ENOMEM;
2831
- set->mq_map = kcalloc_node(nr_cpu_ids, sizeof(*set->mq_map),
2832
- GFP_KERNEL, set->numa_node);
2833
- if (!set->mq_map)
2834
- goto out_free_tags;
3565
+ for (i = 0; i < set->nr_maps; i++) {
3566
+ set->map[i].mq_map = kcalloc_node(nr_cpu_ids,
3567
+ sizeof(set->map[i].mq_map[0]),
3568
+ GFP_KERNEL, set->numa_node);
3569
+ if (!set->map[i].mq_map)
3570
+ goto out_free_mq_map;
3571
+ set->map[i].nr_queues = is_kdump_kernel() ? 1 : set->nr_hw_queues;
3572
+ }
28353573
28363574 ret = blk_mq_update_queue_map(set);
28373575 if (ret)
28383576 goto out_free_mq_map;
28393577
2840
- ret = blk_mq_alloc_rq_maps(set);
3578
+ ret = blk_mq_alloc_map_and_requests(set);
28413579 if (ret)
28423580 goto out_free_mq_map;
3581
+
3582
+ if (blk_mq_is_sbitmap_shared(set->flags)) {
3583
+ atomic_set(&set->active_queues_shared_sbitmap, 0);
3584
+
3585
+ if (blk_mq_init_shared_sbitmap(set, set->flags)) {
3586
+ ret = -ENOMEM;
3587
+ goto out_free_mq_rq_maps;
3588
+ }
3589
+ }
28433590
28443591 mutex_init(&set->tag_list_lock);
28453592 INIT_LIST_HEAD(&set->tag_list);
28463593
28473594 return 0;
28483595
3596
+out_free_mq_rq_maps:
3597
+ for (i = 0; i < set->nr_hw_queues; i++)
3598
+ blk_mq_free_map_and_requests(set, i);
28493599 out_free_mq_map:
2850
- kfree(set->mq_map);
2851
- set->mq_map = NULL;
2852
-out_free_tags:
3600
+ for (i = 0; i < set->nr_maps; i++) {
3601
+ kfree(set->map[i].mq_map);
3602
+ set->map[i].mq_map = NULL;
3603
+ }
28533604 kfree(set->tags);
28543605 set->tags = NULL;
28553606 return ret;
....@@ -2858,13 +3609,18 @@
28583609
28593610 void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
28603611 {
2861
- int i;
3612
+ int i, j;
28623613
2863
- for (i = 0; i < nr_cpu_ids; i++)
3614
+ for (i = 0; i < set->nr_hw_queues; i++)
28643615 blk_mq_free_map_and_requests(set, i);
28653616
2866
- kfree(set->mq_map);
2867
- set->mq_map = NULL;
3617
+ if (blk_mq_is_sbitmap_shared(set->flags))
3618
+ blk_mq_exit_shared_sbitmap(set);
3619
+
3620
+ for (j = 0; j < set->nr_maps; j++) {
3621
+ kfree(set->map[j].mq_map);
3622
+ set->map[j].mq_map = NULL;
3623
+ }
28683624
28693625 kfree(set->tags);
28703626 set->tags = NULL;
....@@ -2880,6 +3636,9 @@
28803636 if (!set)
28813637 return -EINVAL;
28823638
3639
+ if (q->nr_requests == nr)
3640
+ return 0;
3641
+
28833642 blk_mq_freeze_queue(q);
28843643 blk_mq_quiesce_queue(q);
28853644
....@@ -2894,14 +3653,16 @@
28943653 if (!hctx->sched_tags) {
28953654 ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr,
28963655 false);
3656
+ if (!ret && blk_mq_is_sbitmap_shared(set->flags))
3657
+ blk_mq_tag_resize_shared_sbitmap(set, nr);
28973658 } else {
28983659 ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
28993660 nr, true);
29003661 }
29013662 if (ret)
29023663 break;
2903
- if (q->elevator && q->elevator->type->ops.mq.depth_updated)
2904
- q->elevator->type->ops.mq.depth_updated(hctx);
3664
+ if (q->elevator && q->elevator->type->ops.depth_updated)
3665
+ q->elevator->type->ops.depth_updated(hctx);
29053666 }
29063667
29073668 if (!ret)
....@@ -2988,20 +3749,19 @@
29883749 {
29893750 struct request_queue *q;
29903751 LIST_HEAD(head);
3752
+ int prev_nr_hw_queues;
29913753
29923754 lockdep_assert_held(&set->tag_list_lock);
29933755
2994
- if (nr_hw_queues > nr_cpu_ids)
3756
+ if (set->nr_maps == 1 && nr_hw_queues > nr_cpu_ids)
29953757 nr_hw_queues = nr_cpu_ids;
2996
- if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues)
3758
+ if (nr_hw_queues < 1)
3759
+ return;
3760
+ if (set->nr_maps == 1 && nr_hw_queues == set->nr_hw_queues)
29973761 return;
29983762
29993763 list_for_each_entry(q, &set->tag_list, tag_set_list)
30003764 blk_mq_freeze_queue(q);
3001
- /*
3002
- * Sync with blk_mq_queue_tag_busy_iter.
3003
- */
3004
- synchronize_rcu();
30053765 /*
30063766 * Switch IO scheduler to 'none', cleaning up the data associated
30073767 * with the previous scheduler. We will switch back once we are done
....@@ -3011,11 +3771,35 @@
30113771 if (!blk_mq_elv_switch_none(&head, q))
30123772 goto switch_back;
30133773
3774
+ list_for_each_entry(q, &set->tag_list, tag_set_list) {
3775
+ blk_mq_debugfs_unregister_hctxs(q);
3776
+ blk_mq_sysfs_unregister(q);
3777
+ }
3778
+
3779
+ prev_nr_hw_queues = set->nr_hw_queues;
3780
+ if (blk_mq_realloc_tag_set_tags(set, set->nr_hw_queues, nr_hw_queues) <
3781
+ 0)
3782
+ goto reregister;
3783
+
30143784 set->nr_hw_queues = nr_hw_queues;
3785
+fallback:
30153786 blk_mq_update_queue_map(set);
30163787 list_for_each_entry(q, &set->tag_list, tag_set_list) {
30173788 blk_mq_realloc_hw_ctxs(set, q);
3018
- blk_mq_queue_reinit(q);
3789
+ if (q->nr_hw_queues != set->nr_hw_queues) {
3790
+ pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n",
3791
+ nr_hw_queues, prev_nr_hw_queues);
3792
+ set->nr_hw_queues = prev_nr_hw_queues;
3793
+ blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
3794
+ goto fallback;
3795
+ }
3796
+ blk_mq_map_swqueue(q);
3797
+ }
3798
+
3799
+reregister:
3800
+ list_for_each_entry(q, &set->tag_list, tag_set_list) {
3801
+ blk_mq_sysfs_register(q);
3802
+ blk_mq_debugfs_register_hctxs(q);
30193803 }
30203804
30213805 switch_back:
....@@ -3069,7 +3853,6 @@
30693853 }
30703854
30713855 static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
3072
- struct blk_mq_hw_ctx *hctx,
30733856 struct request *rq)
30743857 {
30753858 unsigned long ret = 0;
....@@ -3102,7 +3885,6 @@
31023885 }
31033886
31043887 static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
3105
- struct blk_mq_hw_ctx *hctx,
31063888 struct request *rq)
31073889 {
31083890 struct hrtimer_sleeper hs;
....@@ -3114,18 +3896,15 @@
31143896 return false;
31153897
31163898 /*
3117
- * poll_nsec can be:
3899
+ * If we get here, hybrid polling is enabled. Hence poll_nsec can be:
31183900 *
3119
- * -1: don't ever hybrid sleep
31203901 * 0: use half of prev avg
31213902 * >0: use this specific value
31223903 */
3123
- if (q->poll_nsec == -1)
3124
- return false;
3125
- else if (q->poll_nsec > 0)
3904
+ if (q->poll_nsec > 0)
31263905 nsecs = q->poll_nsec;
31273906 else
3128
- nsecs = blk_mq_poll_nsecs(q, hctx, rq);
3907
+ nsecs = blk_mq_poll_nsecs(q, rq);
31293908
31303909 if (!nsecs)
31313910 return false;
....@@ -3139,15 +3918,14 @@
31393918 kt = nsecs;
31403919
31413920 mode = HRTIMER_MODE_REL;
3142
- hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, mode);
3921
+ hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, mode);
31433922 hrtimer_set_expires(&hs.timer, kt);
31443923
3145
- hrtimer_init_sleeper(&hs, current);
31463924 do {
31473925 if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE)
31483926 break;
31493927 set_current_state(TASK_UNINTERRUPTIBLE);
3150
- hrtimer_start_expires(&hs.timer, mode);
3928
+ hrtimer_sleeper_start_expires(&hs, mode);
31513929 if (hs.task)
31523930 io_schedule();
31533931 hrtimer_cancel(&hs.timer);
....@@ -3159,59 +3937,14 @@
31593937 return true;
31603938 }
31613939
3162
-static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq)
3940
+static bool blk_mq_poll_hybrid(struct request_queue *q,
3941
+ struct blk_mq_hw_ctx *hctx, blk_qc_t cookie)
31633942 {
3164
- struct request_queue *q = hctx->queue;
3165
- long state;
3166
-
3167
- /*
3168
- * If we sleep, have the caller restart the poll loop to reset
3169
- * the state. Like for the other success return cases, the
3170
- * caller is responsible for checking if the IO completed. If
3171
- * the IO isn't complete, we'll get called again and will go
3172
- * straight to the busy poll loop.
3173
- */
3174
- if (blk_mq_poll_hybrid_sleep(q, hctx, rq))
3175
- return true;
3176
-
3177
- hctx->poll_considered++;
3178
-
3179
- state = current->state;
3180
- while (!need_resched()) {
3181
- int ret;
3182
-
3183
- hctx->poll_invoked++;
3184
-
3185
- ret = q->mq_ops->poll(hctx, rq->tag);
3186
- if (ret > 0) {
3187
- hctx->poll_success++;
3188
- set_current_state(TASK_RUNNING);
3189
- return true;
3190
- }
3191
-
3192
- if (signal_pending_state(state, current))
3193
- set_current_state(TASK_RUNNING);
3194
-
3195
- if (current->state == TASK_RUNNING)
3196
- return true;
3197
- if (ret < 0)
3198
- break;
3199
- cpu_relax();
3200
- }
3201
-
3202
- __set_current_state(TASK_RUNNING);
3203
- return false;
3204
-}
3205
-
3206
-static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
3207
-{
3208
- struct blk_mq_hw_ctx *hctx;
32093943 struct request *rq;
32103944
3211
- if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
3945
+ if (q->poll_nsec == BLK_MQ_POLL_CLASSIC)
32123946 return false;
32133947
3214
- hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
32153948 if (!blk_qc_t_is_internal(cookie))
32163949 rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
32173950 else {
....@@ -3226,13 +3959,97 @@
32263959 return false;
32273960 }
32283961
3229
- return __blk_mq_poll(hctx, rq);
3962
+ return blk_mq_poll_hybrid_sleep(q, rq);
32303963 }
3964
+
3965
+/**
3966
+ * blk_poll - poll for IO completions
3967
+ * @q: the queue
3968
+ * @cookie: cookie passed back at IO submission time
3969
+ * @spin: whether to spin for completions
3970
+ *
3971
+ * Description:
3972
+ * Poll for completions on the passed in queue. Returns number of
3973
+ * completed entries found. If @spin is true, then blk_poll will continue
3974
+ * looping until at least one completion is found, unless the task is
3975
+ * otherwise marked running (or we need to reschedule).
3976
+ */
3977
+int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
3978
+{
3979
+ struct blk_mq_hw_ctx *hctx;
3980
+ long state;
3981
+
3982
+ if (!blk_qc_t_valid(cookie) ||
3983
+ !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
3984
+ return 0;
3985
+
3986
+ if (current->plug)
3987
+ blk_flush_plug_list(current->plug, false);
3988
+
3989
+ hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
3990
+
3991
+ /*
3992
+ * If we sleep, have the caller restart the poll loop to reset
3993
+ * the state. Like for the other success return cases, the
3994
+ * caller is responsible for checking if the IO completed. If
3995
+ * the IO isn't complete, we'll get called again and will go
3996
+ * straight to the busy poll loop.
3997
+ */
3998
+ if (blk_mq_poll_hybrid(q, hctx, cookie))
3999
+ return 1;
4000
+
4001
+ hctx->poll_considered++;
4002
+
4003
+ state = current->state;
4004
+ do {
4005
+ int ret;
4006
+
4007
+ hctx->poll_invoked++;
4008
+
4009
+ ret = q->mq_ops->poll(hctx);
4010
+ if (ret > 0) {
4011
+ hctx->poll_success++;
4012
+ __set_current_state(TASK_RUNNING);
4013
+ return ret;
4014
+ }
4015
+
4016
+ if (signal_pending_state(state, current))
4017
+ __set_current_state(TASK_RUNNING);
4018
+
4019
+ if (current->state == TASK_RUNNING)
4020
+ return 1;
4021
+ if (ret < 0 || !spin)
4022
+ break;
4023
+ cpu_relax();
4024
+ } while (!need_resched());
4025
+
4026
+ __set_current_state(TASK_RUNNING);
4027
+ return 0;
4028
+}
4029
+EXPORT_SYMBOL_GPL(blk_poll);
4030
+
4031
+unsigned int blk_mq_rq_cpu(struct request *rq)
4032
+{
4033
+ return rq->mq_ctx->cpu;
4034
+}
4035
+EXPORT_SYMBOL(blk_mq_rq_cpu);
32314036
32324037 static int __init blk_mq_init(void)
32334038 {
4039
+ int i;
4040
+
4041
+ for_each_possible_cpu(i)
4042
+ INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
4043
+ open_softirq(BLOCK_SOFTIRQ, blk_done_softirq);
4044
+
4045
+ cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD,
4046
+ "block/softirq:dead", NULL,
4047
+ blk_softirq_cpu_dead);
32344048 cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
32354049 blk_mq_hctx_notify_dead);
4050
+ cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online",
4051
+ blk_mq_hctx_notify_online,
4052
+ blk_mq_hctx_notify_offline);
32364053 return 0;
32374054 }
32384055 subsys_initcall(blk_mq_init);
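
For completeness, the classic (non-hybrid) path that blk_poll() now implements has this shape: keep driving the driver's ->poll() until it reports completions, the task is marked runnable, an error occurs, or a single pass finishes when @spin is false. The standalone model below captures only that control flow; driver_poll() and found_after are invented stand-ins for q->mq_ops->poll(), and the need_resched()/signal handling of the real loop is reduced to a comment.

/* Illustrative model only -- not kernel code. */
#include <stdbool.h>
#include <stdio.h>

/* Fake driver ->poll(): reports one completion on the third call. */
static int found_after = 3;

static int driver_poll(void)
{
	return --found_after <= 0 ? 1 : 0;
}

/*
 * Model of the blk_poll() loop: returns the number of completions found,
 * or 0 if the caller should retry (or gave up after one pass when spin
 * is false). A driver error also ends the loop.
 */
static int poll_for_completions(bool spin)
{
	do {
		int ret = driver_poll();

		if (ret > 0)
			return ret;	/* completions found */
		if (ret < 0 || !spin)
			break;		/* error, or single-pass mode */
		/* cpu_relax() in the kernel; nothing to do here. */
	} while (true);		/* the kernel also stops on need_resched() */

	return 0;
}

int main(void)
{
	printf("non-spinning poll: %d\n", poll_for_completions(false));
	printf("spinning poll:     %d\n", poll_for_completions(true));
	return 0;
}
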