2023-12-11 d2ccde1c8e90d38cee87a1b0309ad2827f3fd30d
kernel/block/blk-mq.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * Block multiqueue core code
  *
@@ -25,30 +26,36 @@
 #include <linux/delay.h>
 #include <linux/crash_dump.h>
 #include <linux/prefetch.h>
+#include <linux/blk-crypto.h>

 #include <trace/events/block.h>

 #include <linux/blk-mq.h>
+#include <linux/t10-pi.h>
 #include "blk.h"
 #include "blk-mq.h"
 #include "blk-mq-debugfs.h"
 #include "blk-mq-tag.h"
+#include "blk-pm.h"
 #include "blk-stat.h"
 #include "blk-mq-sched.h"
 #include "blk-rq-qos.h"

-static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie);
+#include <trace/hooks/block.h>
+
+static DEFINE_PER_CPU(struct llist_head, blk_cpu_done);
+
 static void blk_mq_poll_stats_start(struct request_queue *q);
 static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);

 static int blk_mq_poll_stats_bkt(const struct request *rq)
 {
-	int ddir, bytes, bucket;
+	int ddir, sectors, bucket;

 	ddir = rq_data_dir(rq);
-	bytes = blk_rq_bytes(rq);
+	sectors = blk_rq_stats_sectors(rq);

-	bucket = ddir + 2*(ilog2(bytes) - 9);
+	bucket = ddir + 2 * ilog2(sectors);

 	if (bucket < 0)
 		return -1;
@@ -59,7 +66,8 @@
 }

 /*
- * Check if any of the ctx's have pending work in this hardware queue
+ * Check if any of the ctx, dispatch list or elevator
+ * have pending work in this hardware queue.
  */
 static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
 {
@@ -74,75 +82,67 @@
 static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
				     struct blk_mq_ctx *ctx)
 {
-	if (!sbitmap_test_bit(&hctx->ctx_map, ctx->index_hw))
-		sbitmap_set_bit(&hctx->ctx_map, ctx->index_hw);
+	const int bit = ctx->index_hw[hctx->type];
+
+	if (!sbitmap_test_bit(&hctx->ctx_map, bit))
+		sbitmap_set_bit(&hctx->ctx_map, bit);
 }

 static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
				      struct blk_mq_ctx *ctx)
 {
-	sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw);
+	const int bit = ctx->index_hw[hctx->type];
+
+	sbitmap_clear_bit(&hctx->ctx_map, bit);
 }

 struct mq_inflight {
 	struct hd_struct *part;
-	unsigned int *inflight;
+	unsigned int inflight[2];
 };

-static void blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
+static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
				  struct request *rq, void *priv,
				  bool reserved)
 {
 	struct mq_inflight *mi = priv;

-	/*
-	 * index[0] counts the specific partition that was asked for. index[1]
-	 * counts the ones that are active on the whole device, so increment
-	 * that if mi->part is indeed a partition, and not a whole device.
-	 */
-	if (rq->part == mi->part)
-		mi->inflight[0]++;
-	if (mi->part->partno)
-		mi->inflight[1]++;
-}
-
-void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part,
-		      unsigned int inflight[2])
-{
-	struct mq_inflight mi = { .part = part, .inflight = inflight, };
-
-	inflight[0] = inflight[1] = 0;
-	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
-}
-
-static void blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx,
-				     struct request *rq, void *priv,
-				     bool reserved)
-{
-	struct mq_inflight *mi = priv;
-
-	if (rq->part == mi->part)
+	if ((!mi->part->partno || rq->part == mi->part) &&
+	    blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT)
 		mi->inflight[rq_data_dir(rq)]++;
+
+	return true;
+}
+
+unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part)
+{
+	struct mq_inflight mi = { .part = part };
+
+	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
+
+	return mi.inflight[0] + mi.inflight[1];
 }

 void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part,
			 unsigned int inflight[2])
 {
-	struct mq_inflight mi = { .part = part, .inflight = inflight, };
+	struct mq_inflight mi = { .part = part };

-	inflight[0] = inflight[1] = 0;
-	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight_rw, &mi);
+	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
+	inflight[0] = mi.inflight[0];
+	inflight[1] = mi.inflight[1];
 }

 void blk_freeze_queue_start(struct request_queue *q)
 {
-	int freeze_depth;
-
-	freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
-	if (freeze_depth == 1) {
+	mutex_lock(&q->mq_freeze_lock);
+	if (++q->mq_freeze_depth == 1) {
 		percpu_ref_kill(&q->q_usage_counter);
-		if (q->mq_ops)
+		mutex_unlock(&q->mq_freeze_lock);
+		if (queue_is_mq(q))
 			blk_mq_run_hw_queues(q, false);
+	} else {
+		mutex_unlock(&q->mq_freeze_lock);
 	}
 }
 EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
@@ -176,8 +176,6 @@
	 * exported to drivers as the only user for unfreeze is blk_mq.
	 */
 	blk_freeze_queue_start(q);
-	if (!q->mq_ops)
-		blk_drain_queue(q);
 	blk_mq_freeze_queue_wait(q);
 }

@@ -193,14 +191,14 @@

 void blk_mq_unfreeze_queue(struct request_queue *q)
 {
-	int freeze_depth;
-
-	freeze_depth = atomic_dec_return(&q->mq_freeze_depth);
-	WARN_ON_ONCE(freeze_depth < 0);
-	if (!freeze_depth) {
-		percpu_ref_reinit(&q->q_usage_counter);
+	mutex_lock(&q->mq_freeze_lock);
+	q->mq_freeze_depth--;
+	WARN_ON_ONCE(q->mq_freeze_depth < 0);
+	if (!q->mq_freeze_depth) {
+		percpu_ref_resurrect(&q->q_usage_counter);
 		wake_up_all(&q->mq_freeze_wq);
 	}
+	mutex_unlock(&q->mq_freeze_lock);
 }
 EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);

@@ -268,40 +266,37 @@
 	blk_mq_tag_wakeup_all(hctx->tags, true);
 }

-bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
+/*
+ * Only need start/end time stamping if we have iostat or
+ * blk stats enabled, or using an IO scheduler.
+ */
+static inline bool blk_mq_need_time_stamp(struct request *rq)
 {
-	return blk_mq_has_free_tags(hctx->tags);
+	return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS)) || rq->q->elevator;
 }
-EXPORT_SYMBOL(blk_mq_can_queue);

 static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
-		unsigned int tag, unsigned int op)
+		unsigned int tag, u64 alloc_time_ns)
 {
 	struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
 	struct request *rq = tags->static_rqs[tag];
-	req_flags_t rq_flags = 0;

-	if (data->flags & BLK_MQ_REQ_INTERNAL) {
-		rq->tag = -1;
+	if (data->q->elevator) {
+		rq->tag = BLK_MQ_NO_TAG;
 		rq->internal_tag = tag;
 	} else {
-		if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) {
-			rq_flags = RQF_MQ_INFLIGHT;
-			atomic_inc(&data->hctx->nr_active);
-		}
 		rq->tag = tag;
-		rq->internal_tag = -1;
-		data->hctx->tags->rqs[rq->tag] = rq;
+		rq->internal_tag = BLK_MQ_NO_TAG;
 	}

 	/* csd/requeue_work/fifo_time is initialized before use */
 	rq->q = data->q;
 	rq->mq_ctx = data->ctx;
-	rq->rq_flags = rq_flags;
-	rq->cpu = -1;
-	rq->cmd_flags = op;
-	if (data->flags & BLK_MQ_REQ_PREEMPT)
-		rq->rq_flags |= RQF_PREEMPT;
+	rq->mq_hctx = data->hctx;
+	rq->rq_flags = 0;
+	rq->cmd_flags = data->cmd_flags;
+	if (data->flags & BLK_MQ_REQ_PM)
+		rq->rq_flags |= RQF_PM;
 	if (blk_queue_io_stat(data->q))
 		rq->rq_flags |= RQF_IO_STAT;
 	INIT_LIST_HEAD(&rq->queuelist);
....@@ -309,97 +304,110 @@
309304 RB_CLEAR_NODE(&rq->rb_node);
310305 rq->rq_disk = NULL;
311306 rq->part = NULL;
312
- rq->start_time_ns = ktime_get_ns();
307
+#ifdef CONFIG_BLK_RQ_ALLOC_TIME
308
+ rq->alloc_time_ns = alloc_time_ns;
309
+#endif
310
+ if (blk_mq_need_time_stamp(rq))
311
+ rq->start_time_ns = ktime_get_ns();
312
+ else
313
+ rq->start_time_ns = 0;
313314 rq->io_start_time_ns = 0;
315
+ rq->stats_sectors = 0;
314316 rq->nr_phys_segments = 0;
315317 #if defined(CONFIG_BLK_DEV_INTEGRITY)
316318 rq->nr_integrity_segments = 0;
317319 #endif
318
- rq->special = NULL;
320
+ blk_crypto_rq_set_defaults(rq);
319321 /* tag was already set */
320
- rq->extra_len = 0;
321
- rq->__deadline = 0;
322
+ WRITE_ONCE(rq->deadline, 0);
322323
323
- INIT_LIST_HEAD(&rq->timeout_list);
324324 rq->timeout = 0;
325325
326326 rq->end_io = NULL;
327327 rq->end_io_data = NULL;
328
- rq->next_rq = NULL;
329328
330
-#ifdef CONFIG_BLK_CGROUP
331
- rq->rl = NULL;
332
-#endif
333
-
334
- data->ctx->rq_dispatched[op_is_sync(op)]++;
329
+ data->ctx->rq_dispatched[op_is_sync(data->cmd_flags)]++;
335330 refcount_set(&rq->ref, 1);
331
+
332
+ if (!op_is_flush(data->cmd_flags)) {
333
+ struct elevator_queue *e = data->q->elevator;
334
+
335
+ rq->elv.icq = NULL;
336
+ if (e && e->type->ops.prepare_request) {
337
+ if (e->type->icq_cache)
338
+ blk_mq_sched_assign_ioc(rq);
339
+
340
+ e->type->ops.prepare_request(rq);
341
+ rq->rq_flags |= RQF_ELVPRIV;
342
+ }
343
+ }
344
+
345
+ data->hctx->queued++;
346
+ trace_android_vh_blk_rq_ctx_init(rq, tags, data, alloc_time_ns);
336347 return rq;
337348 }
338349
339
-static struct request *blk_mq_get_request(struct request_queue *q,
340
- struct bio *bio, unsigned int op,
341
- struct blk_mq_alloc_data *data)
350
+static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data)
342351 {
352
+ struct request_queue *q = data->q;
343353 struct elevator_queue *e = q->elevator;
344
- struct request *rq;
354
+ u64 alloc_time_ns = 0;
345355 unsigned int tag;
346
- bool put_ctx_on_error = false;
347356
348
- blk_queue_enter_live(q);
349
- data->q = q;
350
- if (likely(!data->ctx)) {
351
- data->ctx = blk_mq_get_ctx(q);
352
- put_ctx_on_error = true;
353
- }
354
- if (likely(!data->hctx))
355
- data->hctx = blk_mq_map_queue(q, data->ctx->cpu);
356
- if (op & REQ_NOWAIT)
357
+ /* alloc_time includes depth and tag waits */
358
+ if (blk_queue_rq_alloc_time(q))
359
+ alloc_time_ns = ktime_get_ns();
360
+
361
+ if (data->cmd_flags & REQ_NOWAIT)
357362 data->flags |= BLK_MQ_REQ_NOWAIT;
358363
359364 if (e) {
360
- data->flags |= BLK_MQ_REQ_INTERNAL;
361
-
362365 /*
363366 * Flush requests are special and go directly to the
364367 * dispatch list. Don't include reserved tags in the
365368 * limiting, as it isn't useful.
366369 */
367
- if (!op_is_flush(op) && e->type->ops.mq.limit_depth &&
370
+ if (!op_is_flush(data->cmd_flags) &&
371
+ e->type->ops.limit_depth &&
368372 !(data->flags & BLK_MQ_REQ_RESERVED))
369
- e->type->ops.mq.limit_depth(op, data);
370
- } else {
373
+ e->type->ops.limit_depth(data->cmd_flags, data);
374
+ }
375
+
376
+retry:
377
+ data->ctx = blk_mq_get_ctx(q);
378
+ data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx);
379
+ if (!e)
371380 blk_mq_tag_busy(data->hctx);
372
- }
373381
382
+ /*
383
+ * Waiting allocations only fail because of an inactive hctx. In that
384
+ * case just retry the hctx assignment and tag allocation as CPU hotplug
385
+ * should have migrated us to an online CPU by now.
386
+ */
374387 tag = blk_mq_get_tag(data);
375
- if (tag == BLK_MQ_TAG_FAIL) {
376
- if (put_ctx_on_error) {
377
- blk_mq_put_ctx(data->ctx);
378
- data->ctx = NULL;
379
- }
380
- blk_queue_exit(q);
381
- return NULL;
382
- }
388
+ if (tag == BLK_MQ_NO_TAG) {
389
+ if (data->flags & BLK_MQ_REQ_NOWAIT)
390
+ return NULL;
383391
384
- rq = blk_mq_rq_ctx_init(data, tag, op);
385
- if (!op_is_flush(op)) {
386
- rq->elv.icq = NULL;
387
- if (e && e->type->ops.mq.prepare_request) {
388
- if (e->type->icq_cache && rq_ioc(bio))
389
- blk_mq_sched_assign_ioc(rq, bio);
390
-
391
- e->type->ops.mq.prepare_request(rq, bio);
392
- rq->rq_flags |= RQF_ELVPRIV;
393
- }
392
+ /*
393
+ * Give up the CPU and sleep for a random short time to ensure
394
+ * that thread using a realtime scheduling class are migrated
395
+ * off the CPU, and thus off the hctx that is going away.
396
+ */
397
+ msleep(3);
398
+ goto retry;
394399 }
395
- data->hctx->queued++;
396
- return rq;
400
+ return blk_mq_rq_ctx_init(data, tag, alloc_time_ns);
397401 }
398402
399403 struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
400404 blk_mq_req_flags_t flags)
401405 {
402
- struct blk_mq_alloc_data alloc_data = { .flags = flags };
406
+ struct blk_mq_alloc_data data = {
407
+ .q = q,
408
+ .flags = flags,
409
+ .cmd_flags = op,
410
+ };
403411 struct request *rq;
404412 int ret;
405413
....@@ -407,28 +415,35 @@
407415 if (ret)
408416 return ERR_PTR(ret);
409417
410
- rq = blk_mq_get_request(q, NULL, op, &alloc_data);
411
- blk_queue_exit(q);
412
-
418
+ rq = __blk_mq_alloc_request(&data);
413419 if (!rq)
414
- return ERR_PTR(-EWOULDBLOCK);
415
-
416
- blk_mq_put_ctx(alloc_data.ctx);
417
-
420
+ goto out_queue_exit;
418421 rq->__data_len = 0;
419422 rq->__sector = (sector_t) -1;
420423 rq->bio = rq->biotail = NULL;
421424 return rq;
425
+out_queue_exit:
426
+ blk_queue_exit(q);
427
+ return ERR_PTR(-EWOULDBLOCK);
422428 }
423429 EXPORT_SYMBOL(blk_mq_alloc_request);
424430
425431 struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
426432 unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx)
427433 {
428
- struct blk_mq_alloc_data alloc_data = { .flags = flags };
429
- struct request *rq;
434
+ struct blk_mq_alloc_data data = {
435
+ .q = q,
436
+ .flags = flags,
437
+ .cmd_flags = op,
438
+ };
439
+ u64 alloc_time_ns = 0;
430440 unsigned int cpu;
441
+ unsigned int tag;
431442 int ret;
443
+
444
+ /* alloc_time includes depth and tag waits */
445
+ if (blk_queue_rq_alloc_time(q))
446
+ alloc_time_ns = ktime_get_ns();
432447
433448 /*
434449 * If the tag allocator sleeps we could get an allocation for a
....@@ -436,7 +451,7 @@
436451 * allocator for this for the rare use case of a command tied to
437452 * a specific queue.
438453 */
439
- if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)))
454
+ if (WARN_ON_ONCE(!(flags & (BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED))))
440455 return ERR_PTR(-EINVAL);
441456
442457 if (hctx_idx >= q->nr_hw_queues)
....@@ -450,21 +465,27 @@
450465 * Check if the hardware context is actually mapped to anything.
451466 * If not tell the caller that it should skip this queue.
452467 */
453
- alloc_data.hctx = q->queue_hw_ctx[hctx_idx];
454
- if (!blk_mq_hw_queue_mapped(alloc_data.hctx)) {
455
- blk_queue_exit(q);
456
- return ERR_PTR(-EXDEV);
457
- }
458
- cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask);
459
- alloc_data.ctx = __blk_mq_get_ctx(q, cpu);
468
+ ret = -EXDEV;
469
+ data.hctx = q->queue_hw_ctx[hctx_idx];
470
+ if (!blk_mq_hw_queue_mapped(data.hctx))
471
+ goto out_queue_exit;
472
+ cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask);
473
+ if (cpu >= nr_cpu_ids)
474
+ goto out_queue_exit;
475
+ data.ctx = __blk_mq_get_ctx(q, cpu);
460476
461
- rq = blk_mq_get_request(q, NULL, op, &alloc_data);
477
+ if (!q->elevator)
478
+ blk_mq_tag_busy(data.hctx);
479
+
480
+ ret = -EWOULDBLOCK;
481
+ tag = blk_mq_get_tag(&data);
482
+ if (tag == BLK_MQ_NO_TAG)
483
+ goto out_queue_exit;
484
+ return blk_mq_rq_ctx_init(&data, tag, alloc_time_ns);
485
+
486
+out_queue_exit:
462487 blk_queue_exit(q);
463
-
464
- if (!rq)
465
- return ERR_PTR(-EWOULDBLOCK);
466
-
467
- return rq;
488
+ return ERR_PTR(ret);
468489 }
469490 EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
470491
....@@ -472,13 +493,16 @@
472493 {
473494 struct request_queue *q = rq->q;
474495 struct blk_mq_ctx *ctx = rq->mq_ctx;
475
- struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
496
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
476497 const int sched_tag = rq->internal_tag;
477498
478
- if (rq->tag != -1)
479
- blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
480
- if (sched_tag != -1)
481
- blk_mq_put_tag(hctx, hctx->sched_tags, ctx, sched_tag);
499
+ blk_crypto_free_request(rq);
500
+ blk_pm_mark_last_busy(rq);
501
+ rq->mq_hctx = NULL;
502
+ if (rq->tag != BLK_MQ_NO_TAG)
503
+ blk_mq_put_tag(hctx->tags, ctx, rq->tag);
504
+ if (sched_tag != BLK_MQ_NO_TAG)
505
+ blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag);
482506 blk_mq_sched_restart(hctx);
483507 blk_queue_exit(q);
484508 }
....@@ -488,11 +512,11 @@
488512 struct request_queue *q = rq->q;
489513 struct elevator_queue *e = q->elevator;
490514 struct blk_mq_ctx *ctx = rq->mq_ctx;
491
- struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
515
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
492516
493517 if (rq->rq_flags & RQF_ELVPRIV) {
494
- if (e && e->type->ops.mq.finish_request)
495
- e->type->ops.mq.finish_request(rq);
518
+ if (e && e->type->ops.finish_request)
519
+ e->type->ops.finish_request(rq);
496520 if (rq->elv.icq) {
497521 put_io_context(rq->elv.icq->ioc);
498522 rq->elv.icq = NULL;
....@@ -501,15 +525,12 @@
501525
502526 ctx->rq_completed[rq_is_sync(rq)]++;
503527 if (rq->rq_flags & RQF_MQ_INFLIGHT)
504
- atomic_dec(&hctx->nr_active);
528
+ __blk_mq_dec_active_requests(hctx);
505529
506530 if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
507531 laptop_io_completion(q->backing_dev_info);
508532
509533 rq_qos_done(q, rq);
510
-
511
- if (blk_rq_rl(rq))
512
- blk_put_rl(blk_rq_rl(rq));
513534
514535 WRITE_ONCE(rq->state, MQ_RQ_IDLE);
515536 if (refcount_dec_and_test(&rq->ref))
....@@ -519,12 +540,17 @@
519540
520541 inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
521542 {
522
- u64 now = ktime_get_ns();
543
+ u64 now = 0;
544
+
545
+ if (blk_mq_need_time_stamp(rq))
546
+ now = ktime_get_ns();
523547
524548 if (rq->rq_flags & RQF_STATS) {
525549 blk_mq_poll_stats_start(rq->q);
526550 blk_stat_add(rq, now);
527551 }
552
+
553
+ blk_mq_sched_completed_request(rq, now);
528554
529555 blk_account_io_done(rq, now);
530556
....@@ -532,8 +558,6 @@
532558 rq_qos_done(rq->q, rq);
533559 rq->end_io(rq, error);
534560 } else {
535
- if (unlikely(blk_bidi_rq(rq)))
536
- blk_mq_free_request(rq->next_rq);
537561 blk_mq_free_request(rq);
538562 }
539563 }
....@@ -547,43 +571,120 @@
547571 }
548572 EXPORT_SYMBOL(blk_mq_end_request);
549573
550
-static void __blk_mq_complete_request_remote(void *data)
574
+static void blk_complete_reqs(struct llist_head *list)
551575 {
552
- struct request *rq = data;
576
+ struct llist_node *entry = llist_reverse_order(llist_del_all(list));
577
+ struct request *rq, *next;
553578
554
- rq->q->softirq_done_fn(rq);
579
+ llist_for_each_entry_safe(rq, next, entry, ipi_list)
580
+ rq->q->mq_ops->complete(rq);
555581 }
556582
557
-static void __blk_mq_complete_request(struct request *rq)
583
+static __latent_entropy void blk_done_softirq(struct softirq_action *h)
558584 {
559
- struct blk_mq_ctx *ctx = rq->mq_ctx;
560
- bool shared = false;
561
- int cpu;
585
+ blk_complete_reqs(this_cpu_ptr(&blk_cpu_done));
586
+}
562587
563
- if (!blk_mq_mark_complete(rq))
564
- return;
565
- if (rq->internal_tag != -1)
566
- blk_mq_sched_completed_request(rq);
588
+static int blk_softirq_cpu_dead(unsigned int cpu)
589
+{
590
+ blk_complete_reqs(&per_cpu(blk_cpu_done, cpu));
591
+ return 0;
592
+}
567593
568
- if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) {
569
- rq->q->softirq_done_fn(rq);
570
- return;
571
- }
594
+static void __blk_mq_complete_request_remote(void *data)
595
+{
596
+ __raise_softirq_irqoff(BLOCK_SOFTIRQ);
597
+}
572598
573
- cpu = get_cpu();
574
- if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
575
- shared = cpus_share_cache(cpu, ctx->cpu);
599
+static inline bool blk_mq_complete_need_ipi(struct request *rq)
600
+{
601
+ int cpu = raw_smp_processor_id();
576602
577
- if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
603
+ if (!IS_ENABLED(CONFIG_SMP) ||
604
+ !test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags))
605
+ return false;
606
+ /*
607
+ * With force threaded interrupts enabled, raising softirq from an SMP
608
+ * function call will always result in waking the ksoftirqd thread.
609
+ * This is probably worse than completing the request on a different
610
+ * cache domain.
611
+ */
612
+ if (force_irqthreads)
613
+ return false;
614
+
615
+ /* same CPU or cache domain? Complete locally */
616
+ if (cpu == rq->mq_ctx->cpu ||
617
+ (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) &&
618
+ cpus_share_cache(cpu, rq->mq_ctx->cpu)))
619
+ return false;
620
+
621
+ /* don't try to IPI to an offline CPU */
622
+ return cpu_online(rq->mq_ctx->cpu);
623
+}
624
+
625
+static void blk_mq_complete_send_ipi(struct request *rq)
626
+{
627
+ struct llist_head *list;
628
+ unsigned int cpu;
629
+
630
+ cpu = rq->mq_ctx->cpu;
631
+ list = &per_cpu(blk_cpu_done, cpu);
632
+ if (llist_add(&rq->ipi_list, list)) {
578633 rq->csd.func = __blk_mq_complete_request_remote;
579634 rq->csd.info = rq;
580635 rq->csd.flags = 0;
581
- smp_call_function_single_async(ctx->cpu, &rq->csd);
582
- } else {
583
- rq->q->softirq_done_fn(rq);
636
+ smp_call_function_single_async(cpu, &rq->csd);
584637 }
585
- put_cpu();
586638 }
639
+
640
+static void blk_mq_raise_softirq(struct request *rq)
641
+{
642
+ struct llist_head *list;
643
+
644
+ preempt_disable();
645
+ list = this_cpu_ptr(&blk_cpu_done);
646
+ if (llist_add(&rq->ipi_list, list))
647
+ raise_softirq(BLOCK_SOFTIRQ);
648
+ preempt_enable();
649
+}
650
+
651
+bool blk_mq_complete_request_remote(struct request *rq)
652
+{
653
+ WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
654
+
655
+ /*
656
+ * For a polled request, always complete locally, it's pointless
657
+ * to redirect the completion.
658
+ */
659
+ if (rq->cmd_flags & REQ_HIPRI)
660
+ return false;
661
+
662
+ if (blk_mq_complete_need_ipi(rq)) {
663
+ blk_mq_complete_send_ipi(rq);
664
+ return true;
665
+ }
666
+
667
+ if (rq->q->nr_hw_queues == 1) {
668
+ blk_mq_raise_softirq(rq);
669
+ return true;
670
+ }
671
+ return false;
672
+}
673
+EXPORT_SYMBOL_GPL(blk_mq_complete_request_remote);
674
+
675
+/**
676
+ * blk_mq_complete_request - end I/O on a request
677
+ * @rq: the request being processed
678
+ *
679
+ * Description:
680
+ * Complete a request by scheduling the ->complete_rq operation.
681
+ **/
682
+void blk_mq_complete_request(struct request *rq)
683
+{
684
+ if (!blk_mq_complete_request_remote(rq))
685
+ rq->q->mq_ops->complete(rq);
686
+}
687
+EXPORT_SYMBOL(blk_mq_complete_request);
587688
588689 static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
589690 __releases(hctx->srcu)
....@@ -606,40 +707,22 @@
606707 }
607708
608709 /**
609
- * blk_mq_complete_request - end I/O on a request
610
- * @rq: the request being processed
710
+ * blk_mq_start_request - Start processing a request
711
+ * @rq: Pointer to request to be started
611712 *
612
- * Description:
613
- * Ends all I/O on a request. It does not handle partial completions.
614
- * The actual completion happens out-of-order, through a IPI handler.
615
- **/
616
-void blk_mq_complete_request(struct request *rq)
617
-{
618
- if (unlikely(blk_should_fake_timeout(rq->q)))
619
- return;
620
- __blk_mq_complete_request(rq);
621
-}
622
-EXPORT_SYMBOL(blk_mq_complete_request);
623
-
624
-int blk_mq_request_started(struct request *rq)
625
-{
626
- return blk_mq_rq_state(rq) != MQ_RQ_IDLE;
627
-}
628
-EXPORT_SYMBOL_GPL(blk_mq_request_started);
629
-
713
+ * Function used by device drivers to notify the block layer that a request
714
+ * is going to be processed now, so blk layer can do proper initializations
715
+ * such as starting the timeout timer.
716
+ */
630717 void blk_mq_start_request(struct request *rq)
631718 {
632719 struct request_queue *q = rq->q;
633
-
634
- blk_mq_sched_started_request(rq);
635720
636721 trace_block_rq_issue(q, rq);
637722
638723 if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
639724 rq->io_start_time_ns = ktime_get_ns();
640
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
641
- rq->throtl_size = blk_rq_sectors(rq);
642
-#endif
725
+ rq->stats_sectors = blk_rq_sectors(rq);
643726 rq->rq_flags |= RQF_STATS;
644727 rq_qos_issue(q, rq);
645728 }
....@@ -649,14 +732,10 @@
649732 blk_add_timer(rq);
650733 WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT);
651734
652
- if (q->dma_drain_size && blk_rq_bytes(rq)) {
653
- /*
654
- * Make sure space for the drain appears. We know we can do
655
- * this because max_hw_segments has been adjusted to be one
656
- * fewer than the device can handle.
657
- */
658
- rq->nr_phys_segments++;
659
- }
735
+#ifdef CONFIG_BLK_DEV_INTEGRITY
736
+ if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE)
737
+ q->integrity.profile->prepare_fn(rq);
738
+#endif
660739 }
661740 EXPORT_SYMBOL(blk_mq_start_request);
662741
....@@ -672,8 +751,6 @@
672751 if (blk_mq_request_started(rq)) {
673752 WRITE_ONCE(rq->state, MQ_RQ_IDLE);
674753 rq->rq_flags &= ~RQF_TIMED_OUT;
675
- if (q->dma_drain_size && blk_rq_bytes(rq))
676
- rq->nr_phys_segments--;
677754 }
678755 }
679756
....@@ -684,7 +761,6 @@
684761 /* this request will be re-inserted to io scheduler queue */
685762 blk_mq_sched_requeue_request(rq);
686763
687
- BUG_ON(blk_queued_rq(rq));
688764 blk_mq_add_to_requeue_list(rq, true, kick_requeue_list);
689765 }
690766 EXPORT_SYMBOL(blk_mq_requeue_request);
....@@ -712,7 +788,7 @@
712788 * merge.
713789 */
714790 if (rq->rq_flags & RQF_DONTPREP)
715
- blk_mq_request_bypass_insert(rq, false);
791
+ blk_mq_request_bypass_insert(rq, false, false);
716792 else
717793 blk_mq_sched_insert_request(rq, true, false, false);
718794 }
....@@ -750,7 +826,6 @@
750826 if (kick_requeue_list)
751827 blk_mq_kick_requeue_list(q);
752828 }
753
-EXPORT_SYMBOL(blk_mq_add_to_requeue_list);
754829
755830 void blk_mq_kick_requeue_list(struct request_queue *q)
756831 {
....@@ -777,6 +852,32 @@
777852 }
778853 EXPORT_SYMBOL(blk_mq_tag_to_rq);
779854
855
+static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq,
856
+ void *priv, bool reserved)
857
+{
858
+ /*
859
+ * If we find a request that isn't idle and the queue matches,
860
+ * we know the queue is busy. Return false to stop the iteration.
861
+ */
862
+ if (blk_mq_request_started(rq) && rq->q == hctx->queue) {
863
+ bool *busy = priv;
864
+
865
+ *busy = true;
866
+ return false;
867
+ }
868
+
869
+ return true;
870
+}
871
+
872
+bool blk_mq_queue_inflight(struct request_queue *q)
873
+{
874
+ bool busy = false;
875
+
876
+ blk_mq_queue_tag_busy_iter(q, blk_mq_rq_inflight, &busy);
877
+ return busy;
878
+}
879
+EXPORT_SYMBOL_GPL(blk_mq_queue_inflight);
880
+
780881 static void blk_mq_rq_timed_out(struct request *req, bool reserved)
781882 {
782883 req->rq_flags |= RQF_TIMED_OUT;
....@@ -801,7 +902,7 @@
801902 if (rq->rq_flags & RQF_TIMED_OUT)
802903 return false;
803904
804
- deadline = blk_rq_deadline(rq);
905
+ deadline = READ_ONCE(rq->deadline);
805906 if (time_after_eq(jiffies, deadline))
806907 return true;
807908
....@@ -812,43 +913,29 @@
812913 return false;
813914 }
814915
815
-static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
916
+void blk_mq_put_rq_ref(struct request *rq)
917
+{
918
+ if (is_flush_rq(rq))
919
+ rq->end_io(rq, 0);
920
+ else if (refcount_dec_and_test(&rq->ref))
921
+ __blk_mq_free_request(rq);
922
+}
923
+
924
+static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
816925 struct request *rq, void *priv, bool reserved)
817926 {
818927 unsigned long *next = priv;
819928
820929 /*
821
- * Just do a quick check if it is expired before locking the request in
822
- * so we're not unnecessarilly synchronizing across CPUs.
823
- */
824
- if (!blk_mq_req_expired(rq, next))
825
- return;
826
-
827
- /*
828
- * We have reason to believe the request may be expired. Take a
829
- * reference on the request to lock this request lifetime into its
830
- * currently allocated context to prevent it from being reallocated in
831
- * the event the completion by-passes this timeout handler.
832
- *
833
- * If the reference was already released, then the driver beat the
834
- * timeout handler to posting a natural completion.
835
- */
836
- if (!refcount_inc_not_zero(&rq->ref))
837
- return;
838
-
839
- /*
840
- * The request is now locked and cannot be reallocated underneath the
841
- * timeout handler's processing. Re-verify this exact request is truly
842
- * expired; if it is not expired, then the request was completed and
843
- * reallocated as a new request.
930
+ * blk_mq_queue_tag_busy_iter() has locked the request, so it cannot
931
+ * be reallocated underneath the timeout handler's processing, then
932
+ * the expire check is reliable. If the request is not expired, then
933
+ * it was completed and reallocated as a new request after returning
934
+ * from blk_mq_check_expired().
844935 */
845936 if (blk_mq_req_expired(rq, next))
846937 blk_mq_rq_timed_out(rq, reserved);
847
-
848
- if (is_flush_rq(rq, hctx))
849
- rq->end_io(rq, 0);
850
- else if (refcount_dec_and_test(&rq->ref))
851
- __blk_mq_free_request(rq);
938
+ return true;
852939 }
853940
854941 static void blk_mq_timeout_work(struct work_struct *work)
....@@ -905,9 +992,10 @@
905992 struct flush_busy_ctx_data *flush_data = data;
906993 struct blk_mq_hw_ctx *hctx = flush_data->hctx;
907994 struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
995
+ enum hctx_type type = hctx->type;
908996
909997 spin_lock(&ctx->lock);
910
- list_splice_tail_init(&ctx->rq_list, flush_data->list);
998
+ list_splice_tail_init(&ctx->rq_lists[type], flush_data->list);
911999 sbitmap_clear_bit(sb, bitnr);
9121000 spin_unlock(&ctx->lock);
9131001 return true;
....@@ -939,12 +1027,13 @@
9391027 struct dispatch_rq_data *dispatch_data = data;
9401028 struct blk_mq_hw_ctx *hctx = dispatch_data->hctx;
9411029 struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
1030
+ enum hctx_type type = hctx->type;
9421031
9431032 spin_lock(&ctx->lock);
944
- if (!list_empty(&ctx->rq_list)) {
945
- dispatch_data->rq = list_entry_rq(ctx->rq_list.next);
1033
+ if (!list_empty(&ctx->rq_lists[type])) {
1034
+ dispatch_data->rq = list_entry_rq(ctx->rq_lists[type].next);
9461035 list_del_init(&dispatch_data->rq->queuelist);
947
- if (list_empty(&ctx->rq_list))
1036
+ if (list_empty(&ctx->rq_lists[type]))
9481037 sbitmap_clear_bit(sb, bitnr);
9491038 }
9501039 spin_unlock(&ctx->lock);
....@@ -955,7 +1044,7 @@
9551044 struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
9561045 struct blk_mq_ctx *start)
9571046 {
958
- unsigned off = start ? start->index_hw : 0;
1047
+ unsigned off = start ? start->index_hw[hctx->type] : 0;
9591048 struct dispatch_rq_data data = {
9601049 .hctx = hctx,
9611050 .rq = NULL,
....@@ -975,33 +1064,44 @@
9751064 return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
9761065 }
9771066
978
-bool blk_mq_get_driver_tag(struct request *rq)
1067
+static bool __blk_mq_get_driver_tag(struct request *rq)
9791068 {
980
- struct blk_mq_alloc_data data = {
981
- .q = rq->q,
982
- .hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu),
983
- .flags = BLK_MQ_REQ_NOWAIT,
984
- };
985
- bool shared;
1069
+ struct sbitmap_queue *bt = rq->mq_hctx->tags->bitmap_tags;
1070
+ unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
1071
+ int tag;
9861072
987
- if (rq->tag != -1)
988
- goto done;
1073
+ blk_mq_tag_busy(rq->mq_hctx);
9891074
990
- if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag))
991
- data.flags |= BLK_MQ_REQ_RESERVED;
992
-
993
- shared = blk_mq_tag_busy(data.hctx);
994
- rq->tag = blk_mq_get_tag(&data);
995
- if (rq->tag >= 0) {
996
- if (shared) {
997
- rq->rq_flags |= RQF_MQ_INFLIGHT;
998
- atomic_inc(&data.hctx->nr_active);
999
- }
1000
- data.hctx->tags->rqs[rq->tag] = rq;
1075
+ if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) {
1076
+ bt = rq->mq_hctx->tags->breserved_tags;
1077
+ tag_offset = 0;
1078
+ } else {
1079
+ if (!hctx_may_queue(rq->mq_hctx, bt))
1080
+ return false;
10011081 }
10021082
1003
-done:
1004
- return rq->tag != -1;
1083
+ tag = __sbitmap_queue_get(bt);
1084
+ if (tag == BLK_MQ_NO_TAG)
1085
+ return false;
1086
+
1087
+ rq->tag = tag + tag_offset;
1088
+ return true;
1089
+}
1090
+
1091
+static bool blk_mq_get_driver_tag(struct request *rq)
1092
+{
1093
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
1094
+
1095
+ if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_get_driver_tag(rq))
1096
+ return false;
1097
+
1098
+ if ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) &&
1099
+ !(rq->rq_flags & RQF_MQ_INFLIGHT)) {
1100
+ rq->rq_flags |= RQF_MQ_INFLIGHT;
1101
+ __blk_mq_inc_active_requests(hctx);
1102
+ }
1103
+ hctx->tags->rqs[rq->tag] = rq;
1104
+ return true;
10051105 }
10061106
10071107 static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
....@@ -1012,7 +1112,13 @@
10121112 hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
10131113
10141114 spin_lock(&hctx->dispatch_wait_lock);
1015
- list_del_init(&wait->entry);
1115
+ if (!list_empty(&wait->entry)) {
1116
+ struct sbitmap_queue *sbq;
1117
+
1118
+ list_del_init(&wait->entry);
1119
+ sbq = hctx->tags->bitmap_tags;
1120
+ atomic_dec(&sbq->ws_active);
1121
+ }
10161122 spin_unlock(&hctx->dispatch_wait_lock);
10171123
10181124 blk_mq_run_hw_queue(hctx, true);
....@@ -1028,13 +1134,13 @@
10281134 static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
10291135 struct request *rq)
10301136 {
1137
+ struct sbitmap_queue *sbq = hctx->tags->bitmap_tags;
10311138 struct wait_queue_head *wq;
10321139 wait_queue_entry_t *wait;
10331140 bool ret;
10341141
1035
- if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) {
1036
- if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
1037
- set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
1142
+ if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
1143
+ blk_mq_sched_mark_restart_hctx(hctx);
10381144
10391145 /*
10401146 * It's possible that a tag was freed in the window between the
....@@ -1051,7 +1157,7 @@
10511157 if (!list_empty_careful(&wait->entry))
10521158 return false;
10531159
1054
- wq = &bt_wait_ptr(&hctx->tags->bitmap_tags, hctx)->wait;
1160
+ wq = &bt_wait_ptr(sbq, hctx)->wait;
10551161
10561162 spin_lock_irq(&wq->lock);
10571163 spin_lock(&hctx->dispatch_wait_lock);
....@@ -1061,6 +1167,7 @@
10611167 return false;
10621168 }
10631169
1170
+ atomic_inc(&sbq->ws_active);
10641171 wait->flags &= ~WQ_FLAG_EXCLUSIVE;
10651172 __add_wait_queue(wq, wait);
10661173
....@@ -1081,6 +1188,7 @@
10811188 * someone else gets the wakeup.
10821189 */
10831190 list_del_init(&wait->entry);
1191
+ atomic_dec(&sbq->ws_active);
10841192 spin_unlock(&hctx->dispatch_wait_lock);
10851193 spin_unlock_irq(&wq->lock);
10861194
....@@ -1099,9 +1207,6 @@
10991207 static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
11001208 {
11011209 unsigned int ewma;
1102
-
1103
- if (hctx->queue->elevator)
1104
- return;
11051210
11061211 ewma = hctx->dispatch_busy;
11071212
....@@ -1135,22 +1240,83 @@
11351240 __blk_mq_requeue_request(rq);
11361241 }
11371242
1243
+static void blk_mq_handle_zone_resource(struct request *rq,
1244
+ struct list_head *zone_list)
1245
+{
1246
+ /*
1247
+ * If we end up here it is because we cannot dispatch a request to a
1248
+ * specific zone due to LLD level zone-write locking or other zone
1249
+ * related resource not being available. In this case, set the request
1250
+ * aside in zone_list for retrying it later.
1251
+ */
1252
+ list_add(&rq->queuelist, zone_list);
1253
+ __blk_mq_requeue_request(rq);
1254
+}
1255
+
1256
+enum prep_dispatch {
1257
+ PREP_DISPATCH_OK,
1258
+ PREP_DISPATCH_NO_TAG,
1259
+ PREP_DISPATCH_NO_BUDGET,
1260
+};
1261
+
1262
+static enum prep_dispatch blk_mq_prep_dispatch_rq(struct request *rq,
1263
+ bool need_budget)
1264
+{
1265
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
1266
+
1267
+ if (need_budget && !blk_mq_get_dispatch_budget(rq->q)) {
1268
+ blk_mq_put_driver_tag(rq);
1269
+ return PREP_DISPATCH_NO_BUDGET;
1270
+ }
1271
+
1272
+ if (!blk_mq_get_driver_tag(rq)) {
1273
+ /*
1274
+ * The initial allocation attempt failed, so we need to
1275
+ * rerun the hardware queue when a tag is freed. The
1276
+ * waitqueue takes care of that. If the queue is run
1277
+ * before we add this entry back on the dispatch list,
1278
+ * we'll re-run it below.
1279
+ */
1280
+ if (!blk_mq_mark_tag_wait(hctx, rq)) {
1281
+ /*
1282
+ * All budgets not got from this function will be put
1283
+ * together during handling partial dispatch
1284
+ */
1285
+ if (need_budget)
1286
+ blk_mq_put_dispatch_budget(rq->q);
1287
+ return PREP_DISPATCH_NO_TAG;
1288
+ }
1289
+ }
1290
+
1291
+ return PREP_DISPATCH_OK;
1292
+}
1293
+
1294
+/* release all allocated budgets before calling to blk_mq_dispatch_rq_list */
1295
+static void blk_mq_release_budgets(struct request_queue *q,
1296
+ unsigned int nr_budgets)
1297
+{
1298
+ int i;
1299
+
1300
+ for (i = 0; i < nr_budgets; i++)
1301
+ blk_mq_put_dispatch_budget(q);
1302
+}
1303
+
11381304 /*
11391305 * Returns true if we did some work AND can potentially do more.
11401306 */
1141
-bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
1142
- bool got_budget)
1307
+bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
1308
+ unsigned int nr_budgets)
11431309 {
1144
- struct blk_mq_hw_ctx *hctx;
1310
+ enum prep_dispatch prep;
1311
+ struct request_queue *q = hctx->queue;
11451312 struct request *rq, *nxt;
1146
- bool no_tag = false;
11471313 int errors, queued;
11481314 blk_status_t ret = BLK_STS_OK;
1315
+ LIST_HEAD(zone_list);
1316
+ bool needs_resource = false;
11491317
11501318 if (list_empty(list))
11511319 return false;
1152
-
1153
- WARN_ON(!list_is_singular(list) && got_budget);
11541320
11551321 /*
11561322 * Now process all the entries, sending them to the driver.
....@@ -1161,29 +1327,10 @@
11611327
11621328 rq = list_first_entry(list, struct request, queuelist);
11631329
1164
- hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu);
1165
- if (!got_budget && !blk_mq_get_dispatch_budget(hctx))
1330
+ WARN_ON_ONCE(hctx != rq->mq_hctx);
1331
+ prep = blk_mq_prep_dispatch_rq(rq, !nr_budgets);
1332
+ if (prep != PREP_DISPATCH_OK)
11661333 break;
1167
-
1168
- if (!blk_mq_get_driver_tag(rq)) {
1169
- /*
1170
- * The initial allocation attempt failed, so we need to
1171
- * rerun the hardware queue when a tag is freed. The
1172
- * waitqueue takes care of that. If the queue is run
1173
- * before we add this entry back on the dispatch list,
1174
- * we'll re-run it below.
1175
- */
1176
- if (!blk_mq_mark_tag_wait(hctx, rq)) {
1177
- blk_mq_put_dispatch_budget(hctx);
1178
- /*
1179
- * For non-shared tags, the RESTART check
1180
- * will suffice.
1181
- */
1182
- if (hctx->flags & BLK_MQ_F_TAG_SHARED)
1183
- no_tag = true;
1184
- break;
1185
- }
1186
- }
11871334
11881335 list_del_init(&rq->queuelist);
11891336
....@@ -1200,32 +1347,63 @@
12001347 bd.last = !blk_mq_get_driver_tag(nxt);
12011348 }
12021349
1350
+ /*
1351
+ * once the request is queued to lld, no need to cover the
1352
+ * budget any more
1353
+ */
1354
+ if (nr_budgets)
1355
+ nr_budgets--;
12031356 ret = q->mq_ops->queue_rq(hctx, &bd);
1204
- if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {
1205
- blk_mq_handle_dev_resource(rq, list);
1357
+ switch (ret) {
1358
+ case BLK_STS_OK:
1359
+ queued++;
12061360 break;
1207
- }
1208
-
1209
- if (unlikely(ret != BLK_STS_OK)) {
1361
+ case BLK_STS_RESOURCE:
1362
+ needs_resource = true;
1363
+ fallthrough;
1364
+ case BLK_STS_DEV_RESOURCE:
1365
+ blk_mq_handle_dev_resource(rq, list);
1366
+ goto out;
1367
+ case BLK_STS_ZONE_RESOURCE:
1368
+ /*
1369
+ * Move the request to zone_list and keep going through
1370
+ * the dispatch list to find more requests the drive can
1371
+ * accept.
1372
+ */
1373
+ blk_mq_handle_zone_resource(rq, &zone_list);
1374
+ needs_resource = true;
1375
+ break;
1376
+ default:
12101377 errors++;
12111378 blk_mq_end_request(rq, BLK_STS_IOERR);
1212
- continue;
12131379 }
1214
-
1215
- queued++;
12161380 } while (!list_empty(list));
1381
+out:
1382
+ if (!list_empty(&zone_list))
1383
+ list_splice_tail_init(&zone_list, list);
12171384
12181385 hctx->dispatched[queued_to_index(queued)]++;
12191386
1387
+ /* If we didn't flush the entire list, we could have told the driver
1388
+ * there was more coming, but that turned out to be a lie.
1389
+ */
1390
+ if ((!list_empty(list) || errors || needs_resource ||
1391
+ ret == BLK_STS_DEV_RESOURCE) && q->mq_ops->commit_rqs && queued)
1392
+ q->mq_ops->commit_rqs(hctx);
12201393 /*
12211394 * Any items that need requeuing? Stuff them into hctx->dispatch,
12221395 * that is where we will continue on next queue run.
12231396 */
12241397 if (!list_empty(list)) {
12251398 bool needs_restart;
1399
+ /* For non-shared tags, the RESTART check will suffice */
1400
+ bool no_tag = prep == PREP_DISPATCH_NO_TAG &&
1401
+ (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED);
1402
+
1403
+ blk_mq_release_budgets(q, nr_budgets);
12261404
12271405 spin_lock(&hctx->lock);
1228
- list_splice_init(list, &hctx->dispatch);
1406
+ list_splice_tail_init(list, &hctx->dispatch);
12291407 spin_unlock(&hctx->lock);
12301408
12311409 /*
....@@ -1259,13 +1437,17 @@
12591437 *
12601438 * If driver returns BLK_STS_RESOURCE and SCHED_RESTART
12611439 * bit is set, run queue after a delay to avoid IO stalls
1262
- * that could otherwise occur if the queue is idle.
1440
+ * that could otherwise occur if the queue is idle. We'll do
1441
+ * similar if we couldn't get budget or couldn't lock a zone
1442
+ * and SCHED_RESTART is set.
12631443 */
12641444 needs_restart = blk_mq_sched_needs_restart(hctx);
1445
+ if (prep == PREP_DISPATCH_NO_BUDGET)
1446
+ needs_resource = true;
12651447 if (!needs_restart ||
12661448 (no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
12671449 blk_mq_run_hw_queue(hctx, true);
1268
- else if (needs_restart && (ret == BLK_STS_RESOURCE))
1450
+ else if (needs_restart && needs_resource)
12691451 blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);
12701452
12711453 blk_mq_update_dispatch_busy(hctx, true);
....@@ -1273,16 +1455,15 @@
12731455 } else
12741456 blk_mq_update_dispatch_busy(hctx, false);
12751457
1276
- /*
1277
- * If the host/device is unable to accept more work, inform the
1278
- * caller of that.
1279
- */
1280
- if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
1281
- return false;
1282
-
12831458 return (queued + errors) != 0;
12841459 }
12851460
1461
+/**
1462
+ * __blk_mq_run_hw_queue - Run a hardware queue.
1463
+ * @hctx: Pointer to the hardware queue to run.
1464
+ *
1465
+ * Send pending requests to the hardware.
1466
+ */
12861467 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
12871468 {
12881469 int srcu_idx;
....@@ -1380,6 +1561,15 @@
13801561 return next_cpu;
13811562 }
13821563
1564
+/**
1565
+ * __blk_mq_delay_run_hw_queue - Run (or schedule to run) a hardware queue.
1566
+ * @hctx: Pointer to the hardware queue to run.
1567
+ * @async: If we want to run the queue asynchronously.
1568
+ * @msecs: Microseconds of delay to wait before running the queue.
1569
+ *
1570
+ * If !@async, try to run the queue now. Else, run the queue asynchronously and
1571
+ * with a delay of @msecs.
1572
+ */
13831573 static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
13841574 unsigned long msecs)
13851575 {
....@@ -1387,27 +1577,43 @@
13871577 return;
13881578
13891579 if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
1390
- int cpu = get_cpu();
1580
+ int cpu = get_cpu_light();
13911581 if (cpumask_test_cpu(cpu, hctx->cpumask)) {
13921582 __blk_mq_run_hw_queue(hctx);
1393
- put_cpu();
1583
+ put_cpu_light();
13941584 return;
13951585 }
13961586
1397
- put_cpu();
1587
+ put_cpu_light();
13981588 }
13991589
14001590 kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work,
14011591 msecs_to_jiffies(msecs));
14021592 }
14031593
1594
+/**
1595
+ * blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously.
1596
+ * @hctx: Pointer to the hardware queue to run.
1597
+ * @msecs: Milliseconds of delay to wait before running the queue.
1598
+ *
1599
+ * Run a hardware queue asynchronously with a delay of @msecs.
1600
+ */
14041601 void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
14051602 {
14061603 __blk_mq_delay_run_hw_queue(hctx, true, msecs);
14071604 }
14081605 EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
14091606
1410
-bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
1607
+/**
1608
+ * blk_mq_run_hw_queue - Start to run a hardware queue.
1609
+ * @hctx: Pointer to the hardware queue to run.
1610
+ * @async: If we want to run the queue asynchronously.
1611
+ *
1612
+ * Check if the request queue is not in a quiesced state and if there are
1613
+ * pending requests to be sent. If this is true, run the queue to send requests
1614
+ * to hardware.
1615
+ */
1616
+void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
14111617 {
14121618 int srcu_idx;
14131619 bool need_run;
....@@ -1425,28 +1631,101 @@
14251631 blk_mq_hctx_has_pending(hctx);
14261632 hctx_unlock(hctx, srcu_idx);
14271633
1428
- if (need_run) {
1634
+ if (need_run)
14291635 __blk_mq_delay_run_hw_queue(hctx, async, 0);
1430
- return true;
1431
- }
1432
-
1433
- return false;
14341636 }
14351637 EXPORT_SYMBOL(blk_mq_run_hw_queue);
14361638
1639
+/*
1640
+ * Is the request queue handled by an IO scheduler that does not respect
1641
+ * hardware queues when dispatching?
1642
+ */
1643
+static bool blk_mq_has_sqsched(struct request_queue *q)
1644
+{
1645
+ struct elevator_queue *e = q->elevator;
1646
+
1647
+ if (e && e->type->ops.dispatch_request &&
1648
+ !(e->type->elevator_features & ELEVATOR_F_MQ_AWARE))
1649
+ return true;
1650
+ return false;
1651
+}
1652
+
1653
+/*
1654
+ * Return preferred queue to dispatch from (if any) for non-mq aware IO
1655
+ * scheduler.
1656
+ */
1657
+static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q)
1658
+{
1659
+ struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
1660
+ /*
1661
+ * If the IO scheduler does not respect hardware queues when
1662
+ * dispatching, we just don't bother with multiple HW queues and
1663
+ * dispatch from hctx for the current CPU since running multiple queues
1664
+ * just causes lock contention inside the scheduler and pointless cache
1665
+ * bouncing.
1666
+ */
1667
+ struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, 0, ctx);
1668
+
1669
+ if (!blk_mq_hctx_stopped(hctx))
1670
+ return hctx;
1671
+ return NULL;
1672
+}
1673
+
1674
+/**
1675
+ * blk_mq_run_hw_queues - Run all hardware queues in a request queue.
1676
+ * @q: Pointer to the request queue to run.
1677
+ * @async: If we want to run the queue asynchronously.
1678
+ */
14371679 void blk_mq_run_hw_queues(struct request_queue *q, bool async)
14381680 {
1439
- struct blk_mq_hw_ctx *hctx;
1681
+ struct blk_mq_hw_ctx *hctx, *sq_hctx;
14401682 int i;
14411683
1684
+ sq_hctx = NULL;
1685
+ if (blk_mq_has_sqsched(q))
1686
+ sq_hctx = blk_mq_get_sq_hctx(q);
14421687 queue_for_each_hw_ctx(q, hctx, i) {
14431688 if (blk_mq_hctx_stopped(hctx))
14441689 continue;
1445
-
1446
- blk_mq_run_hw_queue(hctx, async);
1690
+ /*
1691
+ * Dispatch from this hctx either if there's no hctx preferred
1692
+ * by IO scheduler or if it has requests that bypass the
1693
+ * scheduler.
1694
+ */
1695
+ if (!sq_hctx || sq_hctx == hctx ||
1696
+ !list_empty_careful(&hctx->dispatch))
1697
+ blk_mq_run_hw_queue(hctx, async);
14471698 }
14481699 }
14491700 EXPORT_SYMBOL(blk_mq_run_hw_queues);
1701
+
1702
+/**
1703
+ * blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously.
1704
+ * @q: Pointer to the request queue to run.
1705
+ * @msecs: Milliseconds of delay to wait before running the queues.
1706
+ */
1707
+void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
1708
+{
1709
+ struct blk_mq_hw_ctx *hctx, *sq_hctx;
1710
+ int i;
1711
+
1712
+ sq_hctx = NULL;
1713
+ if (blk_mq_has_sqsched(q))
1714
+ sq_hctx = blk_mq_get_sq_hctx(q);
1715
+ queue_for_each_hw_ctx(q, hctx, i) {
1716
+ if (blk_mq_hctx_stopped(hctx))
1717
+ continue;
1718
+ /*
1719
+ * Dispatch from this hctx either if there's no hctx preferred
1720
+ * by IO scheduler or if it has requests that bypass the
1721
+ * scheduler.
1722
+ */
1723
+ if (!sq_hctx || sq_hctx == hctx ||
1724
+ !list_empty_careful(&hctx->dispatch))
1725
+ blk_mq_delay_run_hw_queue(hctx, msecs);
1726
+ }
1727
+}
1728
+EXPORT_SYMBOL(blk_mq_delay_run_hw_queues);
14501729
14511730 /**
14521731 * blk_mq_queue_stopped() - check whether one or more hctxs have been stopped
....@@ -1551,7 +1830,7 @@
15511830 /*
15521831 * If we are stopped, don't run the queue.
15531832 */
1554
- if (test_bit(BLK_MQ_S_STOPPED, &hctx->state))
1833
+ if (blk_mq_hctx_stopped(hctx))
15551834 return;
15561835
15571836 __blk_mq_run_hw_queue(hctx);
....@@ -1562,15 +1841,16 @@
15621841 bool at_head)
15631842 {
15641843 struct blk_mq_ctx *ctx = rq->mq_ctx;
1844
+ enum hctx_type type = hctx->type;
15651845
15661846 lockdep_assert_held(&ctx->lock);
15671847
15681848 trace_block_rq_insert(hctx->queue, rq);
15691849
15701850 if (at_head)
1571
- list_add(&rq->queuelist, &ctx->rq_list);
1851
+ list_add(&rq->queuelist, &ctx->rq_lists[type]);
15721852 else
1573
- list_add_tail(&rq->queuelist, &ctx->rq_list);
1853
+ list_add_tail(&rq->queuelist, &ctx->rq_lists[type]);
15741854 }
15751855
15761856 void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
....@@ -1584,17 +1864,25 @@
15841864 blk_mq_hctx_mark_pending(hctx, ctx);
15851865 }
15861866
1587
-/*
1867
+/**
1868
+ * blk_mq_request_bypass_insert - Insert a request at dispatch list.
1869
+ * @rq: Pointer to request to be inserted.
1870
+ * @at_head: true if the request should be inserted at the head of the list.
1871
+ * @run_queue: If we should run the hardware queue after inserting the request.
1872
+ *
15881873 * Should only be used carefully, when the caller knows we want to
15891874 * bypass a potential IO scheduler on the target device.
15901875 */
1591
-void blk_mq_request_bypass_insert(struct request *rq, bool run_queue)
1876
+void blk_mq_request_bypass_insert(struct request *rq, bool at_head,
1877
+ bool run_queue)
15921878 {
1593
- struct blk_mq_ctx *ctx = rq->mq_ctx;
1594
- struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
1879
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
15951880
15961881 spin_lock(&hctx->lock);
1597
- list_add_tail(&rq->queuelist, &hctx->dispatch);
1882
+ if (at_head)
1883
+ list_add(&rq->queuelist, &hctx->dispatch);
1884
+ else
1885
+ list_add_tail(&rq->queuelist, &hctx->dispatch);
15981886 spin_unlock(&hctx->lock);
15991887
16001888 if (run_queue)
....@@ -1606,6 +1894,7 @@
16061894
16071895 {
16081896 struct request *rq;
1897
+ enum hctx_type type = hctx->type;
16091898
16101899 /*
16111900 * preemption doesn't flush plug list, so it's possible ctx->cpu is
....@@ -1617,95 +1906,87 @@
16171906 }
16181907
16191908 spin_lock(&ctx->lock);
1620
- list_splice_tail_init(list, &ctx->rq_list);
1909
+ list_splice_tail_init(list, &ctx->rq_lists[type]);
16211910 blk_mq_hctx_mark_pending(hctx, ctx);
16221911 spin_unlock(&ctx->lock);
16231912 }
16241913
1625
-static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
1914
+static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
16261915 {
16271916 struct request *rqa = container_of(a, struct request, queuelist);
16281917 struct request *rqb = container_of(b, struct request, queuelist);
16291918
1630
- return !(rqa->mq_ctx < rqb->mq_ctx ||
1631
- (rqa->mq_ctx == rqb->mq_ctx &&
1632
- blk_rq_pos(rqa) < blk_rq_pos(rqb)));
1919
+ if (rqa->mq_ctx != rqb->mq_ctx)
1920
+ return rqa->mq_ctx > rqb->mq_ctx;
1921
+ if (rqa->mq_hctx != rqb->mq_hctx)
1922
+ return rqa->mq_hctx > rqb->mq_hctx;
1923
+
1924
+ return blk_rq_pos(rqa) > blk_rq_pos(rqb);
16331925 }
16341926
16351927 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
16361928 {
1637
- struct blk_mq_ctx *this_ctx;
1638
- struct request_queue *this_q;
1639
- struct request *rq;
16401929 LIST_HEAD(list);
1641
- LIST_HEAD(ctx_list);
1642
- unsigned int depth;
16431930
1931
+ if (list_empty(&plug->mq_list))
1932
+ return;
16441933 list_splice_init(&plug->mq_list, &list);
16451934
1646
- list_sort(NULL, &list, plug_ctx_cmp);
1935
+ if (plug->rq_count > 2 && plug->multiple_queues)
1936
+ list_sort(NULL, &list, plug_rq_cmp);
16471937
1648
- this_q = NULL;
1649
- this_ctx = NULL;
1650
- depth = 0;
1938
+ plug->rq_count = 0;
16511939
1652
- while (!list_empty(&list)) {
1653
- rq = list_entry_rq(list.next);
1654
- list_del_init(&rq->queuelist);
1655
- BUG_ON(!rq->q);
1656
- if (rq->mq_ctx != this_ctx) {
1657
- if (this_ctx) {
1658
- trace_block_unplug(this_q, depth, !from_schedule);
1659
- blk_mq_sched_insert_requests(this_q, this_ctx,
1660
- &ctx_list,
1661
- from_schedule);
1662
- }
1940
+ do {
1941
+ struct list_head rq_list;
1942
+ struct request *rq, *head_rq = list_entry_rq(list.next);
1943
+ struct list_head *pos = &head_rq->queuelist; /* skip first */
1944
+ struct blk_mq_hw_ctx *this_hctx = head_rq->mq_hctx;
1945
+ struct blk_mq_ctx *this_ctx = head_rq->mq_ctx;
1946
+ unsigned int depth = 1;
16631947
1664
- this_ctx = rq->mq_ctx;
1665
- this_q = rq->q;
1666
- depth = 0;
1948
+ list_for_each_continue(pos, &list) {
1949
+ rq = list_entry_rq(pos);
1950
+ BUG_ON(!rq->q);
1951
+ if (rq->mq_hctx != this_hctx || rq->mq_ctx != this_ctx)
1952
+ break;
1953
+ depth++;
16671954 }
16681955
1669
- depth++;
1670
- list_add_tail(&rq->queuelist, &ctx_list);
1671
- }
1672
-
1673
- /*
1674
- * If 'this_ctx' is set, we know we have entries to complete
1675
- * on 'ctx_list'. Do those.
1676
- */
1677
- if (this_ctx) {
1678
- trace_block_unplug(this_q, depth, !from_schedule);
1679
- blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list,
1956
+ list_cut_before(&rq_list, &list, pos);
1957
+ trace_block_unplug(head_rq->q, depth, !from_schedule);
1958
+ blk_mq_sched_insert_requests(this_hctx, this_ctx, &rq_list,
16801959 from_schedule);
1681
- }
1960
+ } while(!list_empty(&list));
16821961 }
16831962
1684
-static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
1963
+static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
1964
+ unsigned int nr_segs)
16851965 {
1686
- blk_init_request_from_bio(rq, bio);
1966
+ int err;
16871967
1688
- blk_rq_set_rl(rq, blk_get_rl(rq->q, bio));
1968
+ if (bio->bi_opf & REQ_RAHEAD)
1969
+ rq->cmd_flags |= REQ_FAILFAST_MASK;
16891970
1690
- blk_account_io_start(rq, true);
1691
-}
1971
+ rq->__sector = bio->bi_iter.bi_sector;
1972
+ rq->write_hint = bio->bi_write_hint;
1973
+ blk_rq_bio_prep(rq, bio, nr_segs);
16921974
1693
-static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq)
1694
-{
1695
- if (rq->tag != -1)
1696
- return blk_tag_to_qc_t(rq->tag, hctx->queue_num, false);
1975
+ /* This can't fail, since GFP_NOIO includes __GFP_DIRECT_RECLAIM. */
1976
+ err = blk_crypto_rq_bio_prep(rq, bio, GFP_NOIO);
1977
+ WARN_ON_ONCE(err);
16971978
1698
- return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true);
1979
+ blk_account_io_start(rq);
16991980 }
17001981
17011982 static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
17021983 struct request *rq,
1703
- blk_qc_t *cookie)
1984
+ blk_qc_t *cookie, bool last)
17041985 {
17051986 struct request_queue *q = rq->q;
17061987 struct blk_mq_queue_data bd = {
17071988 .rq = rq,
1708
- .last = true,
1989
+ .last = last,
17091990 };
17101991 blk_qc_t new_cookie;
17111992 blk_status_t ret;
....@@ -1740,7 +2021,7 @@
17402021 static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
17412022 struct request *rq,
17422023 blk_qc_t *cookie,
1743
- bool bypass_insert)
2024
+ bool bypass_insert, bool last)
17442025 {
17452026 struct request_queue *q = rq->q;
17462027 bool run_queue = true;
....@@ -1761,23 +2042,35 @@
17612042 if (q->elevator && !bypass_insert)
17622043 goto insert;
17632044
1764
- if (!blk_mq_get_dispatch_budget(hctx))
2045
+ if (!blk_mq_get_dispatch_budget(q))
17652046 goto insert;
17662047
17672048 if (!blk_mq_get_driver_tag(rq)) {
1768
- blk_mq_put_dispatch_budget(hctx);
2049
+ blk_mq_put_dispatch_budget(q);
17692050 goto insert;
17702051 }
17712052
1772
- return __blk_mq_issue_directly(hctx, rq, cookie);
2053
+ return __blk_mq_issue_directly(hctx, rq, cookie, last);
17732054 insert:
17742055 if (bypass_insert)
17752056 return BLK_STS_RESOURCE;
17762057
1777
- blk_mq_request_bypass_insert(rq, run_queue);
2058
+ blk_mq_sched_insert_request(rq, false, run_queue, false);
2059
+
17782060 return BLK_STS_OK;
17792061 }
17802062
2063
+/**
2064
+ * blk_mq_try_issue_directly - Try to send a request directly to device driver.
2065
+ * @hctx: Pointer of the associated hardware queue.
2066
+ * @rq: Pointer to request to be sent.
2067
+ * @cookie: Request queue cookie.
2068
+ *
2069
+ * If the device has enough resources to accept a new request now, send the
2070
+ * request directly to the device driver. Else, insert it at the hctx->dispatch
2071
+ * queue, so we can try to send it again later. Requests inserted at this
2072
+ * queue have higher priority.
2073
+ */
17812074 static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
17822075 struct request *rq, blk_qc_t *cookie)
17832076 {
....@@ -1788,25 +2081,24 @@
17882081
17892082 hctx_lock(hctx, &srcu_idx);
17902083
1791
- ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false);
2084
+ ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false, true);
17922085 if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
1793
- blk_mq_request_bypass_insert(rq, true);
2086
+ blk_mq_request_bypass_insert(rq, false, true);
17942087 else if (ret != BLK_STS_OK)
17952088 blk_mq_end_request(rq, ret);
17962089
17972090 hctx_unlock(hctx, srcu_idx);
17982091 }
17992092
1800
-blk_status_t blk_mq_request_issue_directly(struct request *rq)
2093
+blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
18012094 {
18022095 blk_status_t ret;
18032096 int srcu_idx;
18042097 blk_qc_t unused_cookie;
1805
- struct blk_mq_ctx *ctx = rq->mq_ctx;
1806
- struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
2098
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
18072099
18082100 hctx_lock(hctx, &srcu_idx);
1809
- ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true);
2101
+ ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true, last);
18102102 hctx_unlock(hctx, srcu_idx);
18112103
18122104 return ret;
....@@ -1815,104 +2107,169 @@
18152107 void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
18162108 struct list_head *list)
18172109 {
2110
+ int queued = 0;
2111
+ int errors = 0;
2112
+
18182113 while (!list_empty(list)) {
18192114 blk_status_t ret;
18202115 struct request *rq = list_first_entry(list, struct request,
18212116 queuelist);
18222117
18232118 list_del_init(&rq->queuelist);
1824
- ret = blk_mq_request_issue_directly(rq);
2119
+ ret = blk_mq_request_issue_directly(rq, list_empty(list));
18252120 if (ret != BLK_STS_OK) {
2121
+ errors++;
18262122 if (ret == BLK_STS_RESOURCE ||
18272123 ret == BLK_STS_DEV_RESOURCE) {
1828
- blk_mq_request_bypass_insert(rq,
2124
+ blk_mq_request_bypass_insert(rq, false,
18292125 list_empty(list));
18302126 break;
18312127 }
18322128 blk_mq_end_request(rq, ret);
1833
- }
2129
+ } else
2130
+ queued++;
2131
+ }
2132
+
2133
+ /*
2134
+ * If we didn't flush the entire list, we could have told
2135
+ * the driver there was more coming, but that turned out to
2136
+ * be a lie.
2137
+ */
2138
+ if ((!list_empty(list) || errors) &&
2139
+ hctx->queue->mq_ops->commit_rqs && queued)
2140
+ hctx->queue->mq_ops->commit_rqs(hctx);
2141
+}
2142
+
2143
+static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
2144
+{
2145
+ list_add_tail(&rq->queuelist, &plug->mq_list);
2146
+ plug->rq_count++;
2147
+ if (!plug->multiple_queues && !list_is_singular(&plug->mq_list)) {
2148
+ struct request *tmp;
2149
+
2150
+ tmp = list_first_entry(&plug->mq_list, struct request,
2151
+ queuelist);
2152
+ if (tmp->q != rq->q)
2153
+ plug->multiple_queues = true;
18342154 }
18352155 }
18362156
1837
-static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
2157
+/*
2158
+ * Allow 2x BLK_MAX_REQUEST_COUNT requests on the plug queue for multiple
2159
+ * queues. This is important for md arrays to benefit from merging
2160
+ * requests.
2161
+ */
2162
+static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug)
18382163 {
2164
+ if (plug->multiple_queues)
2165
+ return BLK_MAX_REQUEST_COUNT * 2;
2166
+ return BLK_MAX_REQUEST_COUNT;
2167
+}
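/*
 * Editor's illustrative sketch (not part of the patch above): how a submitter
 * benefits from the plug limits described here. Requests queued while a plug
 * is active are collected on plug->mq_list and flushed as one batch (capped
 * by blk_plug_max_rq_count()) when the plug is finished. The "example_" names
 * below are hypothetical.
 */
static void example_submit_batch(struct bio **bios, int nr)
{
	struct blk_plug plug;
	int i;

	blk_start_plug(&plug);
	for (i = 0; i < nr; i++)
		submit_bio(bios[i]);	/* each bio ends up in blk_mq_submit_bio() */
	blk_finish_plug(&plug);		/* flushes the plugged requests to the driver */
}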
2168
+
2169
+/**
2170
+ * blk_mq_submit_bio - Create and send a request to the block device.
2171
+ * @bio: Bio pointer.
2172
+ *
2173
+ * Builds up a request structure from @q and @bio and sends it to the device.
2174
+ * The request may not be queued directly to hardware if:
2175
+ * * This request can be merged with another one
2176
+ * * We want to place the request on the plug queue for possible future merging
2177
+ * * There is an IO scheduler active on this queue
2178
+ *
2179
+ * It will not queue the request if there is an error with the bio or at
2180
+ * request creation.
2181
+ *
2182
+ * Returns: Request queue cookie.
2183
+ */
2184
+blk_qc_t blk_mq_submit_bio(struct bio *bio)
2185
+{
2186
+ struct request_queue *q = bio->bi_disk->queue;
18392187 const int is_sync = op_is_sync(bio->bi_opf);
18402188 const int is_flush_fua = op_is_flush(bio->bi_opf);
1841
- struct blk_mq_alloc_data data = { .flags = 0 };
2189
+ struct blk_mq_alloc_data data = {
2190
+ .q = q,
2191
+ };
18422192 struct request *rq;
1843
- unsigned int request_count = 0;
18442193 struct blk_plug *plug;
18452194 struct request *same_queue_rq = NULL;
2195
+ unsigned int nr_segs;
18462196 blk_qc_t cookie;
2197
+ blk_status_t ret;
18472198
18482199 blk_queue_bounce(q, &bio);
1849
-
1850
- blk_queue_split(q, &bio);
2200
+ __blk_queue_split(&bio, &nr_segs);
18512201
18522202 if (!bio_integrity_prep(bio))
1853
- return BLK_QC_T_NONE;
2203
+ goto queue_exit;
18542204
18552205 if (!is_flush_fua && !blk_queue_nomerges(q) &&
1856
- blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
1857
- return BLK_QC_T_NONE;
2206
+ blk_attempt_plug_merge(q, bio, nr_segs, &same_queue_rq))
2207
+ goto queue_exit;
18582208
1859
- if (blk_mq_sched_bio_merge(q, bio))
1860
- return BLK_QC_T_NONE;
2209
+ if (blk_mq_sched_bio_merge(q, bio, nr_segs))
2210
+ goto queue_exit;
18612211
1862
- rq_qos_throttle(q, bio, NULL);
2212
+ rq_qos_throttle(q, bio);
18632213
1864
- trace_block_getrq(q, bio, bio->bi_opf);
1865
-
1866
- rq = blk_mq_get_request(q, bio, bio->bi_opf, &data);
2214
+ data.cmd_flags = bio->bi_opf;
2215
+ rq = __blk_mq_alloc_request(&data);
18672216 if (unlikely(!rq)) {
18682217 rq_qos_cleanup(q, bio);
18692218 if (bio->bi_opf & REQ_NOWAIT)
18702219 bio_wouldblock_error(bio);
1871
- return BLK_QC_T_NONE;
2220
+ goto queue_exit;
18722221 }
2222
+
2223
+ trace_block_getrq(q, bio, bio->bi_opf);
18732224
18742225 rq_qos_track(q, rq, bio);
18752226
18762227 cookie = request_to_qc_t(data.hctx, rq);
18772228
1878
- plug = current->plug;
1879
- if (unlikely(is_flush_fua)) {
1880
- blk_mq_put_ctx(data.ctx);
1881
- blk_mq_bio_to_request(rq, bio);
2229
+ blk_mq_bio_to_request(rq, bio, nr_segs);
18822230
1883
- /* bypass scheduler for flush rq */
2231
+ ret = blk_crypto_init_request(rq);
2232
+ if (ret != BLK_STS_OK) {
2233
+ bio->bi_status = ret;
2234
+ bio_endio(bio);
2235
+ blk_mq_free_request(rq);
2236
+ return BLK_QC_T_NONE;
2237
+ }
2238
+
2239
+ plug = blk_mq_plug(q, bio);
2240
+ if (unlikely(is_flush_fua)) {
2241
+ /* Bypass scheduler for flush requests */
18842242 blk_insert_flush(rq);
18852243 blk_mq_run_hw_queue(data.hctx, true);
1886
- } else if (plug && q->nr_hw_queues == 1) {
1887
- struct request *last = NULL;
1888
-
1889
- blk_mq_put_ctx(data.ctx);
1890
- blk_mq_bio_to_request(rq, bio);
1891
-
2244
+ } else if (plug && (q->nr_hw_queues == 1 ||
2245
+ blk_mq_is_sbitmap_shared(rq->mq_hctx->flags) ||
2246
+ q->mq_ops->commit_rqs || !blk_queue_nonrot(q))) {
18922247 /*
1893
- * @request_count may become stale because of schedule
1894
- * out, so check the list again.
2248
+ * Use plugging if we have a ->commit_rqs() hook as well, as
2249
+ * we know the driver uses bd->last in a smart fashion.
2250
+ *
2251
+ * Use normal plugging if this disk is a slow HDD, as sequential
2252
+ * IO may benefit a lot from plug merging.
18952253 */
1896
- if (list_empty(&plug->mq_list))
1897
- request_count = 0;
1898
- else if (blk_queue_nomerges(q))
1899
- request_count = blk_plug_queued_count(q);
2254
+ unsigned int request_count = plug->rq_count;
2255
+ struct request *last = NULL;
19002256
19012257 if (!request_count)
19022258 trace_block_plug(q);
19032259 else
19042260 last = list_entry_rq(plug->mq_list.prev);
19052261
1906
- if (request_count >= BLK_MAX_REQUEST_COUNT || (last &&
2262
+ if (request_count >= blk_plug_max_rq_count(plug) || (last &&
19072263 blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
19082264 blk_flush_plug_list(plug, false);
19092265 trace_block_plug(q);
19102266 }
19112267
1912
- list_add_tail(&rq->queuelist, &plug->mq_list);
2268
+ blk_add_rq_to_plug(plug, rq);
2269
+ } else if (q->elevator) {
2270
+ /* Insert the request at the IO scheduler queue */
2271
+ blk_mq_sched_insert_request(rq, false, true, true);
19132272 } else if (plug && !blk_queue_nomerges(q)) {
1914
- blk_mq_bio_to_request(rq, bio);
1915
-
19162273 /*
19172274 * We do limited plugging. If the bio can be merged, do that.
19182275 * Otherwise the existing request in the plug list will be
....@@ -1922,30 +2279,74 @@
19222279 */
19232280 if (list_empty(&plug->mq_list))
19242281 same_queue_rq = NULL;
1925
- if (same_queue_rq)
2282
+ if (same_queue_rq) {
19262283 list_del_init(&same_queue_rq->queuelist);
1927
- list_add_tail(&rq->queuelist, &plug->mq_list);
1928
-
1929
- blk_mq_put_ctx(data.ctx);
2284
+ plug->rq_count--;
2285
+ }
2286
+ blk_add_rq_to_plug(plug, rq);
2287
+ trace_block_plug(q);
19302288
19312289 if (same_queue_rq) {
1932
- data.hctx = blk_mq_map_queue(q,
1933
- same_queue_rq->mq_ctx->cpu);
2290
+ data.hctx = same_queue_rq->mq_hctx;
2291
+ trace_block_unplug(q, 1, true);
19342292 blk_mq_try_issue_directly(data.hctx, same_queue_rq,
19352293 &cookie);
19362294 }
1937
- } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator &&
1938
- !data.hctx->dispatch_busy)) {
1939
- blk_mq_put_ctx(data.ctx);
1940
- blk_mq_bio_to_request(rq, bio);
2295
+ } else if ((q->nr_hw_queues > 1 && is_sync) ||
2296
+ !data.hctx->dispatch_busy) {
2297
+ /*
2298
+ * There is no scheduler and we can try to send directly
2299
+ * to the hardware.
2300
+ */
19412301 blk_mq_try_issue_directly(data.hctx, rq, &cookie);
19422302 } else {
1943
- blk_mq_put_ctx(data.ctx);
1944
- blk_mq_bio_to_request(rq, bio);
2303
+ /* Default case. */
19452304 blk_mq_sched_insert_request(rq, false, true, true);
19462305 }
19472306
19482307 return cookie;
2308
+queue_exit:
2309
+ blk_queue_exit(q);
2310
+ return BLK_QC_T_NONE;
2311
+}
2312
+
2313
+static size_t order_to_size(unsigned int order)
2314
+{
2315
+ return (size_t)PAGE_SIZE << order;
2316
+}
2317
+
2318
+/* called before freeing request pool in @tags */
2319
+static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set,
2320
+ struct blk_mq_tags *tags, unsigned int hctx_idx)
2321
+{
2322
+ struct blk_mq_tags *drv_tags = set->tags[hctx_idx];
2323
+ struct page *page;
2324
+ unsigned long flags;
2325
+
2326
+ list_for_each_entry(page, &tags->page_list, lru) {
2327
+ unsigned long start = (unsigned long)page_address(page);
2328
+ unsigned long end = start + order_to_size(page->private);
2329
+ int i;
2330
+
2331
+ for (i = 0; i < set->queue_depth; i++) {
2332
+ struct request *rq = drv_tags->rqs[i];
2333
+ unsigned long rq_addr = (unsigned long)rq;
2334
+
2335
+ if (rq_addr >= start && rq_addr < end) {
2336
+ WARN_ON_ONCE(refcount_read(&rq->ref) != 0);
2337
+ cmpxchg(&drv_tags->rqs[i], rq, NULL);
2338
+ }
2339
+ }
2340
+ }
2341
+
2342
+ /*
2343
+ * Wait until all pending iterations are done.
2344
+ *
2345
+ * The request references have been cleared, and the clearing is guaranteed
2346
+ * to be observed after the ->lock is released.
2347
+ */
2348
+ spin_lock_irqsave(&drv_tags->lock, flags);
2349
+ spin_unlock_irqrestore(&drv_tags->lock, flags);
19492350 }
19502351
19512352 void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
....@@ -1966,42 +2367,44 @@
19662367 }
19672368 }
19682369
2370
+ blk_mq_clear_rq_mapping(set, tags, hctx_idx);
2371
+
19692372 while (!list_empty(&tags->page_list)) {
19702373 page = list_first_entry(&tags->page_list, struct page, lru);
19712374 list_del_init(&page->lru);
19722375 /*
19732376 * Remove kmemleak object previously allocated in
1974
- * blk_mq_init_rq_map().
2377
+ * blk_mq_alloc_rqs().
19752378 */
19762379 kmemleak_free(page_address(page));
19772380 __free_pages(page, page->private);
19782381 }
19792382 }
19802383
1981
-void blk_mq_free_rq_map(struct blk_mq_tags *tags)
2384
+void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags)
19822385 {
19832386 kfree(tags->rqs);
19842387 tags->rqs = NULL;
19852388 kfree(tags->static_rqs);
19862389 tags->static_rqs = NULL;
19872390
1988
- blk_mq_free_tags(tags);
2391
+ blk_mq_free_tags(tags, flags);
19892392 }
19902393
19912394 struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
19922395 unsigned int hctx_idx,
19932396 unsigned int nr_tags,
1994
- unsigned int reserved_tags)
2397
+ unsigned int reserved_tags,
2398
+ unsigned int flags)
19952399 {
19962400 struct blk_mq_tags *tags;
19972401 int node;
19982402
1999
- node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx);
2403
+ node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
20002404 if (node == NUMA_NO_NODE)
20012405 node = set->numa_node;
20022406
2003
- tags = blk_mq_init_tags(nr_tags, reserved_tags, node,
2004
- BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
2407
+ tags = blk_mq_init_tags(nr_tags, reserved_tags, node, flags);
20052408 if (!tags)
20062409 return NULL;
20072410
....@@ -2009,7 +2412,7 @@
20092412 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
20102413 node);
20112414 if (!tags->rqs) {
2012
- blk_mq_free_tags(tags);
2415
+ blk_mq_free_tags(tags, flags);
20132416 return NULL;
20142417 }
20152418
....@@ -2018,16 +2421,11 @@
20182421 node);
20192422 if (!tags->static_rqs) {
20202423 kfree(tags->rqs);
2021
- blk_mq_free_tags(tags);
2424
+ blk_mq_free_tags(tags, flags);
20222425 return NULL;
20232426 }
20242427
20252428 return tags;
2026
-}
2027
-
2028
-static size_t order_to_size(unsigned int order)
2029
-{
2030
- return (size_t)PAGE_SIZE << order;
20312429 }
20322430
20332431 static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
....@@ -2052,7 +2450,7 @@
20522450 size_t rq_size, left;
20532451 int node;
20542452
2055
- node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx);
2453
+ node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
20562454 if (node == NUMA_NO_NODE)
20572455 node = set->numa_node;
20582456
....@@ -2064,6 +2462,7 @@
20642462 */
20652463 rq_size = round_up(sizeof(struct request) + set->cmd_size,
20662464 cache_line_size());
2465
+ trace_android_vh_blk_alloc_rqs(&rq_size, set, tags);
20672466 left = rq_size * depth;
20682467
20692468 for (i = 0; i < depth; ) {
....@@ -2122,6 +2521,86 @@
21222521 return -ENOMEM;
21232522 }
21242523
2524
+struct rq_iter_data {
2525
+ struct blk_mq_hw_ctx *hctx;
2526
+ bool has_rq;
2527
+};
2528
+
2529
+static bool blk_mq_has_request(struct request *rq, void *data, bool reserved)
2530
+{
2531
+ struct rq_iter_data *iter_data = data;
2532
+
2533
+ if (rq->mq_hctx != iter_data->hctx)
2534
+ return true;
2535
+ iter_data->has_rq = true;
2536
+ return false;
2537
+}
2538
+
2539
+static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx)
2540
+{
2541
+ struct blk_mq_tags *tags = hctx->sched_tags ?
2542
+ hctx->sched_tags : hctx->tags;
2543
+ struct rq_iter_data data = {
2544
+ .hctx = hctx,
2545
+ };
2546
+
2547
+ blk_mq_all_tag_iter(tags, blk_mq_has_request, &data);
2548
+ return data.has_rq;
2549
+}
2550
+
2551
+static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu,
2552
+ struct blk_mq_hw_ctx *hctx)
2553
+{
2554
+ if (cpumask_next_and(-1, hctx->cpumask, cpu_online_mask) != cpu)
2555
+ return false;
2556
+ if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids)
2557
+ return false;
2558
+ return true;
2559
+}
2560
+
2561
+static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
2562
+{
2563
+ struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
2564
+ struct blk_mq_hw_ctx, cpuhp_online);
2565
+
2566
+ if (!cpumask_test_cpu(cpu, hctx->cpumask) ||
2567
+ !blk_mq_last_cpu_in_hctx(cpu, hctx))
2568
+ return 0;
2569
+
2570
+ /*
2571
+ * Prevent new requests from being allocated on the current hctx.
2572
+ *
2573
+ * The smp_mb__after_atomic() pairs with the implied barrier in
2574
+ * test_and_set_bit_lock() in sbitmap_get(). It ensures the inactive flag
2575
+ * is seen once we return from the tag allocator.
2576
+ */
2577
+ set_bit(BLK_MQ_S_INACTIVE, &hctx->state);
2578
+ smp_mb__after_atomic();
2579
+
2580
+ /*
2581
+ * Try to grab a reference to the queue and wait for any outstanding
2582
+ * requests. If we could not grab a reference, the queue has been
2583
+ * frozen and there are no requests.
2584
+ */
2585
+ if (percpu_ref_tryget(&hctx->queue->q_usage_counter)) {
2586
+ while (blk_mq_hctx_has_requests(hctx))
2587
+ msleep(5);
2588
+ percpu_ref_put(&hctx->queue->q_usage_counter);
2589
+ }
2590
+
2591
+ return 0;
2592
+}
2593
+
2594
+static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
2595
+{
2596
+ struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
2597
+ struct blk_mq_hw_ctx, cpuhp_online);
2598
+
2599
+ if (cpumask_test_cpu(cpu, hctx->cpumask))
2600
+ clear_bit(BLK_MQ_S_INACTIVE, &hctx->state);
2601
+ return 0;
2602
+}
2603
+
21252604 /*
21262605 * 'cpu' is going away. splice any existing rq_list entries from this
21272606 * software queue to the hw queue dispatch list, and ensure that it
....@@ -2132,13 +2611,18 @@
21322611 struct blk_mq_hw_ctx *hctx;
21332612 struct blk_mq_ctx *ctx;
21342613 LIST_HEAD(tmp);
2614
+ enum hctx_type type;
21352615
21362616 hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
2617
+ if (!cpumask_test_cpu(cpu, hctx->cpumask))
2618
+ return 0;
2619
+
21372620 ctx = __blk_mq_get_ctx(hctx->queue, cpu);
2621
+ type = hctx->type;
21382622
21392623 spin_lock(&ctx->lock);
2140
- if (!list_empty(&ctx->rq_list)) {
2141
- list_splice_init(&ctx->rq_list, &tmp);
2624
+ if (!list_empty(&ctx->rq_lists[type])) {
2625
+ list_splice_init(&ctx->rq_lists[type], &tmp);
21422626 blk_mq_hctx_clear_pending(hctx, ctx);
21432627 }
21442628 spin_unlock(&ctx->lock);
....@@ -2156,8 +2640,40 @@
21562640
21572641 static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
21582642 {
2643
+ if (!(hctx->flags & BLK_MQ_F_STACKING))
2644
+ cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
2645
+ &hctx->cpuhp_online);
21592646 cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD,
21602647 &hctx->cpuhp_dead);
2648
+}
2649
+
2650
+/*
2651
+ * Before freeing the hw queue, clear the flush request reference in
2652
+ * tags->rqs[] to avoid a potential use-after-free.
2653
+ */
2654
+static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags,
2655
+ unsigned int queue_depth, struct request *flush_rq)
2656
+{
2657
+ int i;
2658
+ unsigned long flags;
2659
+
2660
+ /* The hw queue may not be mapped yet */
2661
+ if (!tags)
2662
+ return;
2663
+
2664
+ WARN_ON_ONCE(refcount_read(&flush_rq->ref) != 0);
2665
+
2666
+ for (i = 0; i < queue_depth; i++)
2667
+ cmpxchg(&tags->rqs[i], flush_rq, NULL);
2668
+
2669
+ /*
2670
+ * Wait until all pending iterations are done.
2671
+ *
2672
+ * The request reference has been cleared, and the clearing is guaranteed
2673
+ * to be observed after the ->lock is released.
2674
+ */
2675
+ spin_lock_irqsave(&tags->lock, flags);
2676
+ spin_unlock_irqrestore(&tags->lock, flags);
21612677 }
21622678
21632679 /* hctx->ctxs will be freed in queue's release handler */
....@@ -2165,18 +2681,24 @@
21652681 struct blk_mq_tag_set *set,
21662682 struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
21672683 {
2168
- blk_mq_debugfs_unregister_hctx(hctx);
2684
+ struct request *flush_rq = hctx->fq->flush_rq;
21692685
21702686 if (blk_mq_hw_queue_mapped(hctx))
21712687 blk_mq_tag_idle(hctx);
21722688
2689
+ blk_mq_clear_flush_rq_mapping(set->tags[hctx_idx],
2690
+ set->queue_depth, flush_rq);
21732691 if (set->ops->exit_request)
2174
- set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx);
2692
+ set->ops->exit_request(set, flush_rq, hctx_idx);
21752693
21762694 if (set->ops->exit_hctx)
21772695 set->ops->exit_hctx(hctx, hctx_idx);
21782696
21792697 blk_mq_remove_cpuhp(hctx);
2698
+
2699
+ spin_lock(&q->unused_hctx_lock);
2700
+ list_add(&hctx->hctx_list, &q->unused_hctx_list);
2701
+ spin_unlock(&q->unused_hctx_lock);
21802702 }
21812703
21822704 static void blk_mq_exit_hw_queues(struct request_queue *q,
....@@ -2188,112 +2710,160 @@
21882710 queue_for_each_hw_ctx(q, hctx, i) {
21892711 if (i == nr_queue)
21902712 break;
2713
+ blk_mq_debugfs_unregister_hctx(hctx);
21912714 blk_mq_exit_hctx(q, set, hctx, i);
21922715 }
2716
+}
2717
+
2718
+static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
2719
+{
2720
+ int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
2721
+
2722
+ BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu),
2723
+ __alignof__(struct blk_mq_hw_ctx)) !=
2724
+ sizeof(struct blk_mq_hw_ctx));
2725
+
2726
+ if (tag_set->flags & BLK_MQ_F_BLOCKING)
2727
+ hw_ctx_size += sizeof(struct srcu_struct);
2728
+
2729
+ return hw_ctx_size;
21932730 }
21942731
21952732 static int blk_mq_init_hctx(struct request_queue *q,
21962733 struct blk_mq_tag_set *set,
21972734 struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
21982735 {
2199
- int node;
2736
+ hctx->queue_num = hctx_idx;
22002737
2201
- node = hctx->numa_node;
2738
+ if (!(hctx->flags & BLK_MQ_F_STACKING))
2739
+ cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
2740
+ &hctx->cpuhp_online);
2741
+ cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
2742
+
2743
+ hctx->tags = set->tags[hctx_idx];
2744
+
2745
+ if (set->ops->init_hctx &&
2746
+ set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
2747
+ goto unregister_cpu_notifier;
2748
+
2749
+ if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx,
2750
+ hctx->numa_node))
2751
+ goto exit_hctx;
2752
+ return 0;
2753
+
2754
+ exit_hctx:
2755
+ if (set->ops->exit_hctx)
2756
+ set->ops->exit_hctx(hctx, hctx_idx);
2757
+ unregister_cpu_notifier:
2758
+ blk_mq_remove_cpuhp(hctx);
2759
+ return -1;
2760
+}
2761
+
2762
+static struct blk_mq_hw_ctx *
2763
+blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
2764
+ int node)
2765
+{
2766
+ struct blk_mq_hw_ctx *hctx;
2767
+ gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;
2768
+
2769
+ hctx = kzalloc_node(blk_mq_hw_ctx_size(set), gfp, node);
2770
+ if (!hctx)
2771
+ goto fail_alloc_hctx;
2772
+
2773
+ if (!zalloc_cpumask_var_node(&hctx->cpumask, gfp, node))
2774
+ goto free_hctx;
2775
+
2776
+ atomic_set(&hctx->nr_active, 0);
22022777 if (node == NUMA_NO_NODE)
2203
- node = hctx->numa_node = set->numa_node;
2778
+ node = set->numa_node;
2779
+ hctx->numa_node = node;
22042780
22052781 INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
22062782 spin_lock_init(&hctx->lock);
22072783 INIT_LIST_HEAD(&hctx->dispatch);
22082784 hctx->queue = q;
2209
- hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED;
2785
+ hctx->flags = set->flags & ~BLK_MQ_F_TAG_QUEUE_SHARED;
22102786
2211
- cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
2212
-
2213
- hctx->tags = set->tags[hctx_idx];
2787
+ INIT_LIST_HEAD(&hctx->hctx_list);
22142788
22152789 /*
22162790 * Allocate space for all possible cpus to avoid allocation at
22172791 * runtime
22182792 */
22192793 hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *),
2220
- GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node);
2794
+ gfp, node);
22212795 if (!hctx->ctxs)
2222
- goto unregister_cpu_notifier;
2796
+ goto free_cpumask;
22232797
22242798 if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8),
2225
- GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node))
2799
+ gfp, node))
22262800 goto free_ctxs;
2227
-
22282801 hctx->nr_ctx = 0;
22292802
22302803 spin_lock_init(&hctx->dispatch_wait_lock);
22312804 init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
22322805 INIT_LIST_HEAD(&hctx->dispatch_wait.entry);
22332806
2234
- if (set->ops->init_hctx &&
2235
- set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
2236
- goto free_bitmap;
2237
-
2238
- hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size,
2239
- GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY);
2807
+ hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp);
22402808 if (!hctx->fq)
2241
- goto exit_hctx;
2242
-
2243
- if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, node))
2244
- goto free_fq;
2809
+ goto free_bitmap;
22452810
22462811 if (hctx->flags & BLK_MQ_F_BLOCKING)
22472812 init_srcu_struct(hctx->srcu);
2813
+ blk_mq_hctx_kobj_init(hctx);
22482814
2249
- blk_mq_debugfs_register_hctx(q, hctx);
2815
+ return hctx;
22502816
2251
- return 0;
2252
-
2253
- free_fq:
2254
- blk_free_flush_queue(hctx->fq);
2255
- exit_hctx:
2256
- if (set->ops->exit_hctx)
2257
- set->ops->exit_hctx(hctx, hctx_idx);
22582817 free_bitmap:
22592818 sbitmap_free(&hctx->ctx_map);
22602819 free_ctxs:
22612820 kfree(hctx->ctxs);
2262
- unregister_cpu_notifier:
2263
- blk_mq_remove_cpuhp(hctx);
2264
- return -1;
2821
+ free_cpumask:
2822
+ free_cpumask_var(hctx->cpumask);
2823
+ free_hctx:
2824
+ kfree(hctx);
2825
+ fail_alloc_hctx:
2826
+ return NULL;
22652827 }
22662828
22672829 static void blk_mq_init_cpu_queues(struct request_queue *q,
22682830 unsigned int nr_hw_queues)
22692831 {
2270
- unsigned int i;
2832
+ struct blk_mq_tag_set *set = q->tag_set;
2833
+ unsigned int i, j;
22712834
22722835 for_each_possible_cpu(i) {
22732836 struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
22742837 struct blk_mq_hw_ctx *hctx;
2838
+ int k;
22752839
22762840 __ctx->cpu = i;
22772841 spin_lock_init(&__ctx->lock);
2278
- INIT_LIST_HEAD(&__ctx->rq_list);
2842
+ for (k = HCTX_TYPE_DEFAULT; k < HCTX_MAX_TYPES; k++)
2843
+ INIT_LIST_HEAD(&__ctx->rq_lists[k]);
2844
+
22792845 __ctx->queue = q;
22802846
22812847 /*
22822848 * Set local node, IFF we have more than one hw queue. If
22832849 * not, we remain on the home node of the device
22842850 */
2285
- hctx = blk_mq_map_queue(q, i);
2286
- if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
2287
- hctx->numa_node = local_memory_node(cpu_to_node(i));
2851
+ for (j = 0; j < set->nr_maps; j++) {
2852
+ hctx = blk_mq_map_queue_type(q, j, i);
2853
+ if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
2854
+ hctx->numa_node = cpu_to_node(i);
2855
+ }
22882856 }
22892857 }
22902858
2291
-static bool __blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, int hctx_idx)
2859
+static bool __blk_mq_alloc_map_and_request(struct blk_mq_tag_set *set,
2860
+ int hctx_idx)
22922861 {
2862
+ unsigned int flags = set->flags;
22932863 int ret = 0;
22942864
22952865 set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx,
2296
- set->queue_depth, set->reserved_tags);
2866
+ set->queue_depth, set->reserved_tags, flags);
22972867 if (!set->tags[hctx_idx])
22982868 return false;
22992869
....@@ -2302,7 +2872,7 @@
23022872 if (!ret)
23032873 return true;
23042874
2305
- blk_mq_free_rq_map(set->tags[hctx_idx]);
2875
+ blk_mq_free_rq_map(set->tags[hctx_idx], flags);
23062876 set->tags[hctx_idx] = NULL;
23072877 return false;
23082878 }
....@@ -2310,16 +2880,18 @@
23102880 static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
23112881 unsigned int hctx_idx)
23122882 {
2313
- if (set->tags[hctx_idx]) {
2883
+ unsigned int flags = set->flags;
2884
+
2885
+ if (set->tags && set->tags[hctx_idx]) {
23142886 blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
2315
- blk_mq_free_rq_map(set->tags[hctx_idx]);
2887
+ blk_mq_free_rq_map(set->tags[hctx_idx], flags);
23162888 set->tags[hctx_idx] = NULL;
23172889 }
23182890 }
23192891
23202892 static void blk_mq_map_swqueue(struct request_queue *q)
23212893 {
2322
- unsigned int i, hctx_idx;
2894
+ unsigned int i, j, hctx_idx;
23232895 struct blk_mq_hw_ctx *hctx;
23242896 struct blk_mq_ctx *ctx;
23252897 struct blk_mq_tag_set *set = q->tag_set;
....@@ -2336,25 +2908,52 @@
23362908 * If the cpu isn't present, the cpu is mapped to first hctx.
23372909 */
23382910 for_each_possible_cpu(i) {
2339
- hctx_idx = q->mq_map[i];
2340
- /* unmapped hw queue can be remapped after CPU topo changed */
2341
- if (!set->tags[hctx_idx] &&
2342
- !__blk_mq_alloc_rq_map(set, hctx_idx)) {
2343
- /*
2344
- * If tags initialization fail for some hctx,
2345
- * that hctx won't be brought online. In this
2346
- * case, remap the current ctx to hctx[0] which
2347
- * is guaranteed to always have tags allocated
2348
- */
2349
- q->mq_map[i] = 0;
2350
- }
23512911
23522912 ctx = per_cpu_ptr(q->queue_ctx, i);
2353
- hctx = blk_mq_map_queue(q, i);
2913
+ for (j = 0; j < set->nr_maps; j++) {
2914
+ if (!set->map[j].nr_queues) {
2915
+ ctx->hctxs[j] = blk_mq_map_queue_type(q,
2916
+ HCTX_TYPE_DEFAULT, i);
2917
+ continue;
2918
+ }
2919
+ hctx_idx = set->map[j].mq_map[i];
2920
+ /* unmapped hw queue can be remapped after CPU topo changed */
2921
+ if (!set->tags[hctx_idx] &&
2922
+ !__blk_mq_alloc_map_and_request(set, hctx_idx)) {
2923
+ /*
2924
+ * If tags initialization fail for some hctx,
2925
+ * that hctx won't be brought online. In this
2926
+ * case, remap the current ctx to hctx[0] which
2927
+ * is guaranteed to always have tags allocated
2928
+ */
2929
+ set->map[j].mq_map[i] = 0;
2930
+ }
23542931
2355
- cpumask_set_cpu(i, hctx->cpumask);
2356
- ctx->index_hw = hctx->nr_ctx;
2357
- hctx->ctxs[hctx->nr_ctx++] = ctx;
2932
+ hctx = blk_mq_map_queue_type(q, j, i);
2933
+ ctx->hctxs[j] = hctx;
2934
+ /*
2935
+ * If the CPU is already set in the mask, then we've
2936
+ * mapped this one already. This can happen if
2937
+ * devices share queues across queue maps.
2938
+ */
2939
+ if (cpumask_test_cpu(i, hctx->cpumask))
2940
+ continue;
2941
+
2942
+ cpumask_set_cpu(i, hctx->cpumask);
2943
+ hctx->type = j;
2944
+ ctx->index_hw[hctx->type] = hctx->nr_ctx;
2945
+ hctx->ctxs[hctx->nr_ctx++] = ctx;
2946
+
2947
+ /*
2948
+ * If the nr_ctx type overflows, we have exceeded the
2949
+ * amount of sw queues we can support.
2950
+ */
2951
+ BUG_ON(!hctx->nr_ctx);
2952
+ }
2953
+
2954
+ for (; j < HCTX_MAX_TYPES; j++)
2955
+ ctx->hctxs[j] = blk_mq_map_queue_type(q,
2956
+ HCTX_TYPE_DEFAULT, i);
23582957 }
23592958
23602959 queue_for_each_hw_ctx(q, hctx, i) {
....@@ -2403,14 +3002,14 @@
24033002
24043003 queue_for_each_hw_ctx(q, hctx, i) {
24053004 if (shared)
2406
- hctx->flags |= BLK_MQ_F_TAG_SHARED;
3005
+ hctx->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
24073006 else
2408
- hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
3007
+ hctx->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
24093008 }
24103009 }
24113010
2412
-static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set,
2413
- bool shared)
3011
+static void blk_mq_update_tag_set_shared(struct blk_mq_tag_set *set,
3012
+ bool shared)
24143013 {
24153014 struct request_queue *q;
24163015
....@@ -2428,12 +3027,12 @@
24283027 struct blk_mq_tag_set *set = q->tag_set;
24293028
24303029 mutex_lock(&set->tag_list_lock);
2431
- list_del_rcu(&q->tag_set_list);
3030
+ list_del(&q->tag_set_list);
24323031 if (list_is_singular(&set->tag_list)) {
24333032 /* just transitioned to unshared */
2434
- set->flags &= ~BLK_MQ_F_TAG_SHARED;
3033
+ set->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
24353034 /* update existing queue */
2436
- blk_mq_update_tag_set_depth(set, false);
3035
+ blk_mq_update_tag_set_shared(set, false);
24373036 }
24383037 mutex_unlock(&set->tag_list_lock);
24393038 INIT_LIST_HEAD(&q->tag_set_list);
....@@ -2442,24 +3041,50 @@
24423041 static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
24433042 struct request_queue *q)
24443043 {
2445
- q->tag_set = set;
2446
-
24473044 mutex_lock(&set->tag_list_lock);
24483045
24493046 /*
24503047 * Check to see if we're transitioning to shared (from 1 to 2 queues).
24513048 */
24523049 if (!list_empty(&set->tag_list) &&
2453
- !(set->flags & BLK_MQ_F_TAG_SHARED)) {
2454
- set->flags |= BLK_MQ_F_TAG_SHARED;
3050
+ !(set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
3051
+ set->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
24553052 /* update existing queue */
2456
- blk_mq_update_tag_set_depth(set, true);
3053
+ blk_mq_update_tag_set_shared(set, true);
24573054 }
2458
- if (set->flags & BLK_MQ_F_TAG_SHARED)
3055
+ if (set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
24593056 queue_set_hctx_shared(q, true);
2460
- list_add_tail_rcu(&q->tag_set_list, &set->tag_list);
3057
+ list_add_tail(&q->tag_set_list, &set->tag_list);
24613058
24623059 mutex_unlock(&set->tag_list_lock);
3060
+}
3061
+
3062
+/* All allocations will be freed in release handler of q->mq_kobj */
3063
+static int blk_mq_alloc_ctxs(struct request_queue *q)
3064
+{
3065
+ struct blk_mq_ctxs *ctxs;
3066
+ int cpu;
3067
+
3068
+ ctxs = kzalloc(sizeof(*ctxs), GFP_KERNEL);
3069
+ if (!ctxs)
3070
+ return -ENOMEM;
3071
+
3072
+ ctxs->queue_ctx = alloc_percpu(struct blk_mq_ctx);
3073
+ if (!ctxs->queue_ctx)
3074
+ goto fail;
3075
+
3076
+ for_each_possible_cpu(cpu) {
3077
+ struct blk_mq_ctx *ctx = per_cpu_ptr(ctxs->queue_ctx, cpu);
3078
+ ctx->ctxs = ctxs;
3079
+ }
3080
+
3081
+ q->mq_kobj = &ctxs->kobj;
3082
+ q->queue_ctx = ctxs->queue_ctx;
3083
+
3084
+ return 0;
3085
+ fail:
3086
+ kfree(ctxs);
3087
+ return -ENOMEM;
24633088 }
24643089
24653090 /*
....@@ -2470,17 +3095,17 @@
24703095 */
24713096 void blk_mq_release(struct request_queue *q)
24723097 {
2473
- struct blk_mq_hw_ctx *hctx;
2474
- unsigned int i;
3098
+ struct blk_mq_hw_ctx *hctx, *next;
3099
+ int i;
24753100
2476
- /* hctx kobj stays in hctx */
2477
- queue_for_each_hw_ctx(q, hctx, i) {
2478
- if (!hctx)
2479
- continue;
3101
+ queue_for_each_hw_ctx(q, hctx, i)
3102
+ WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list));
3103
+
3104
+ /* all hctx are in .unused_hctx_list now */
3105
+ list_for_each_entry_safe(hctx, next, &q->unused_hctx_list, hctx_list) {
3106
+ list_del_init(&hctx->hctx_list);
24803107 kobject_put(&hctx->kobj);
24813108 }
2482
-
2483
- q->mq_map = NULL;
24843109
24853110 kfree(q->queue_hw_ctx);
24863111
....@@ -2489,102 +3114,184 @@
24893114 * both share lifetime with request queue.
24903115 */
24913116 blk_mq_sysfs_deinit(q);
2492
-
2493
- free_percpu(q->queue_ctx);
24943117 }
24953118
2496
-struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
3119
+struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
3120
+ void *queuedata)
24973121 {
24983122 struct request_queue *uninit_q, *q;
24993123
2500
- uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node, NULL);
3124
+ uninit_q = blk_alloc_queue(set->numa_node);
25013125 if (!uninit_q)
25023126 return ERR_PTR(-ENOMEM);
3127
+ uninit_q->queuedata = queuedata;
25033128
2504
- q = blk_mq_init_allocated_queue(set, uninit_q);
3129
+ /*
3130
+ * Initialize the queue without an elevator. device_add_disk() will do
3131
+ * the initialization.
3132
+ */
3133
+ q = blk_mq_init_allocated_queue(set, uninit_q, false);
25053134 if (IS_ERR(q))
25063135 blk_cleanup_queue(uninit_q);
25073136
25083137 return q;
25093138 }
3139
+EXPORT_SYMBOL_GPL(blk_mq_init_queue_data);
3140
+
3141
+struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
3142
+{
3143
+ return blk_mq_init_queue_data(set, NULL);
3144
+}
25103145 EXPORT_SYMBOL(blk_mq_init_queue);
25113146
2512
-static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
3147
+/*
3148
+ * Helper for setting up a queue with mq ops, given queue depth, and
3149
+ * the passed in mq ops flags.
3150
+ */
3151
+struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set,
3152
+ const struct blk_mq_ops *ops,
3153
+ unsigned int queue_depth,
3154
+ unsigned int set_flags)
25133155 {
2514
- int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
3156
+ struct request_queue *q;
3157
+ int ret;
25153158
2516
- BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu),
2517
- __alignof__(struct blk_mq_hw_ctx)) !=
2518
- sizeof(struct blk_mq_hw_ctx));
3159
+ memset(set, 0, sizeof(*set));
3160
+ set->ops = ops;
3161
+ set->nr_hw_queues = 1;
3162
+ set->nr_maps = 1;
3163
+ set->queue_depth = queue_depth;
3164
+ set->numa_node = NUMA_NO_NODE;
3165
+ set->flags = set_flags;
25193166
2520
- if (tag_set->flags & BLK_MQ_F_BLOCKING)
2521
- hw_ctx_size += sizeof(struct srcu_struct);
3167
+ ret = blk_mq_alloc_tag_set(set);
3168
+ if (ret)
3169
+ return ERR_PTR(ret);
25223170
2523
- return hw_ctx_size;
3171
+ q = blk_mq_init_queue(set);
3172
+ if (IS_ERR(q)) {
3173
+ blk_mq_free_tag_set(set);
3174
+ return q;
3175
+ }
3176
+
3177
+ return q;
3178
+}
3179
+EXPORT_SYMBOL(blk_mq_init_sq_queue);
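/*
 * Editor's illustrative sketch (not part of the patch above): a minimal,
 * hypothetical single-queue driver using blk_mq_init_sq_queue(). All
 * "example_" identifiers are assumptions for illustration only.
 */
static blk_status_t example_queue_rq(struct blk_mq_hw_ctx *hctx,
				     const struct blk_mq_queue_data *bd)
{
	blk_mq_start_request(bd->rq);
	/* Hand bd->rq to the hardware here, then complete it when done. */
	blk_mq_end_request(bd->rq, BLK_STS_OK);
	return BLK_STS_OK;
}

static const struct blk_mq_ops example_mq_ops = {
	.queue_rq	= example_queue_rq,
};

static struct blk_mq_tag_set example_tag_set;

static struct request_queue *example_create_queue(void)
{
	/* 128-deep queue, allow merging; the helper zeroes the tag set first. */
	return blk_mq_init_sq_queue(&example_tag_set, &example_mq_ops,
				    128, BLK_MQ_F_SHOULD_MERGE);
}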
3180
+
3181
+static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx(
3182
+ struct blk_mq_tag_set *set, struct request_queue *q,
3183
+ int hctx_idx, int node)
3184
+{
3185
+ struct blk_mq_hw_ctx *hctx = NULL, *tmp;
3186
+
3187
+ /* reuse dead hctx first */
3188
+ spin_lock(&q->unused_hctx_lock);
3189
+ list_for_each_entry(tmp, &q->unused_hctx_list, hctx_list) {
3190
+ if (tmp->numa_node == node) {
3191
+ hctx = tmp;
3192
+ break;
3193
+ }
3194
+ }
3195
+ if (hctx)
3196
+ list_del_init(&hctx->hctx_list);
3197
+ spin_unlock(&q->unused_hctx_lock);
3198
+
3199
+ if (!hctx)
3200
+ hctx = blk_mq_alloc_hctx(q, set, node);
3201
+ if (!hctx)
3202
+ goto fail;
3203
+
3204
+ if (blk_mq_init_hctx(q, set, hctx, hctx_idx))
3205
+ goto free_hctx;
3206
+
3207
+ return hctx;
3208
+
3209
+ free_hctx:
3210
+ kobject_put(&hctx->kobj);
3211
+ fail:
3212
+ return NULL;
25243213 }
25253214
25263215 static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
25273216 struct request_queue *q)
25283217 {
2529
- int i, j;
3218
+ int i, j, end;
25303219 struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;
25313220
2532
- blk_mq_sysfs_unregister(q);
3221
+ if (q->nr_hw_queues < set->nr_hw_queues) {
3222
+ struct blk_mq_hw_ctx **new_hctxs;
3223
+
3224
+ new_hctxs = kcalloc_node(set->nr_hw_queues,
3225
+ sizeof(*new_hctxs), GFP_KERNEL,
3226
+ set->numa_node);
3227
+ if (!new_hctxs)
3228
+ return;
3229
+ if (hctxs)
3230
+ memcpy(new_hctxs, hctxs, q->nr_hw_queues *
3231
+ sizeof(*hctxs));
3232
+ q->queue_hw_ctx = new_hctxs;
3233
+ kfree(hctxs);
3234
+ hctxs = new_hctxs;
3235
+ }
25333236
25343237 /* protect against switching io scheduler */
25353238 mutex_lock(&q->sysfs_lock);
25363239 for (i = 0; i < set->nr_hw_queues; i++) {
25373240 int node;
3241
+ struct blk_mq_hw_ctx *hctx;
25383242
2539
- if (hctxs[i])
3243
+ node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], i);
3244
+ /*
3245
+ * If the hw queue has been mapped to another numa node,
3246
+ * we need to realloc the hctx. If allocation fails, fallback
3247
+ * to use the previous one.
3248
+ */
3249
+ if (hctxs[i] && (hctxs[i]->numa_node == node))
25403250 continue;
25413251
2542
- node = blk_mq_hw_queue_to_node(q->mq_map, i);
2543
- hctxs[i] = kzalloc_node(blk_mq_hw_ctx_size(set),
2544
- GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
2545
- node);
2546
- if (!hctxs[i])
2547
- break;
2548
-
2549
- if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask,
2550
- GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
2551
- node)) {
2552
- kfree(hctxs[i]);
2553
- hctxs[i] = NULL;
2554
- break;
3252
+ hctx = blk_mq_alloc_and_init_hctx(set, q, i, node);
3253
+ if (hctx) {
3254
+ if (hctxs[i])
3255
+ blk_mq_exit_hctx(q, set, hctxs[i], i);
3256
+ hctxs[i] = hctx;
3257
+ } else {
3258
+ if (hctxs[i])
3259
+ pr_warn("Allocate new hctx on node %d fails,\
3260
+ fallback to previous one on node %d\n",
3261
+ node, hctxs[i]->numa_node);
3262
+ else
3263
+ break;
25553264 }
2556
-
2557
- atomic_set(&hctxs[i]->nr_active, 0);
2558
- hctxs[i]->numa_node = node;
2559
- hctxs[i]->queue_num = i;
2560
-
2561
- if (blk_mq_init_hctx(q, set, hctxs[i], i)) {
2562
- free_cpumask_var(hctxs[i]->cpumask);
2563
- kfree(hctxs[i]);
2564
- hctxs[i] = NULL;
2565
- break;
2566
- }
2567
- blk_mq_hctx_kobj_init(hctxs[i]);
25683265 }
2569
- for (j = i; j < q->nr_hw_queues; j++) {
3266
+ /*
3267
+ * Increasing nr_hw_queues fails. Free the newly allocated
3268
+ * hctxs and keep the previous q->nr_hw_queues.
3269
+ */
3270
+ if (i != set->nr_hw_queues) {
3271
+ j = q->nr_hw_queues;
3272
+ end = i;
3273
+ } else {
3274
+ j = i;
3275
+ end = q->nr_hw_queues;
3276
+ q->nr_hw_queues = set->nr_hw_queues;
3277
+ }
3278
+
3279
+ for (; j < end; j++) {
25703280 struct blk_mq_hw_ctx *hctx = hctxs[j];
25713281
25723282 if (hctx) {
25733283 if (hctx->tags)
25743284 blk_mq_free_map_and_requests(set, j);
25753285 blk_mq_exit_hctx(q, set, hctx, j);
2576
- kobject_put(&hctx->kobj);
25773286 hctxs[j] = NULL;
2578
-
25793287 }
25803288 }
2581
- q->nr_hw_queues = i;
25823289 mutex_unlock(&q->sysfs_lock);
2583
- blk_mq_sysfs_register(q);
25843290 }
25853291
25863292 struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
2587
- struct request_queue *q)
3293
+ struct request_queue *q,
3294
+ bool elevator_init)
25883295 {
25893296 /* mark the queue as mq asap */
25903297 q->mq_ops = set->ops;
....@@ -2595,19 +3302,14 @@
25953302 if (!q->poll_cb)
25963303 goto err_exit;
25973304
2598
- q->queue_ctx = alloc_percpu(struct blk_mq_ctx);
2599
- if (!q->queue_ctx)
2600
- goto err_exit;
3305
+ if (blk_mq_alloc_ctxs(q))
3306
+ goto err_poll;
26013307
26023308 /* init q->mq_kobj and sw queues' kobjects */
26033309 blk_mq_sysfs_init(q);
26043310
2605
- q->queue_hw_ctx = kcalloc_node(nr_cpu_ids, sizeof(*(q->queue_hw_ctx)),
2606
- GFP_KERNEL, set->numa_node);
2607
- if (!q->queue_hw_ctx)
2608
- goto err_percpu;
2609
-
2610
- q->mq_map = set->mq_map;
3311
+ INIT_LIST_HEAD(&q->unused_hctx_list);
3312
+ spin_lock_init(&q->unused_hctx_lock);
26113313
26123314 blk_mq_realloc_hw_ctxs(set, q);
26133315 if (!q->nr_hw_queues)
....@@ -2616,12 +3318,12 @@
26163318 INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
26173319 blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
26183320
2619
- q->nr_queues = nr_cpu_ids;
3321
+ q->tag_set = set;
26203322
26213323 q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
2622
-
2623
- if (!(set->flags & BLK_MQ_F_SG_MERGE))
2624
- queue_flag_set_unlocked(QUEUE_FLAG_NO_SG_MERGE, q);
3324
+ if (set->nr_maps > HCTX_TYPE_POLL &&
3325
+ set->map[HCTX_TYPE_POLL].nr_queues)
3326
+ blk_queue_flag_set(QUEUE_FLAG_POLL, q);
26253327
26263328 q->sg_reserved_size = INT_MAX;
26273329
....@@ -2629,41 +3331,29 @@
26293331 INIT_LIST_HEAD(&q->requeue_list);
26303332 spin_lock_init(&q->requeue_lock);
26313333
2632
- blk_queue_make_request(q, blk_mq_make_request);
2633
- if (q->mq_ops->poll)
2634
- q->poll_fn = blk_mq_poll;
2635
-
2636
- /*
2637
- * Do this after blk_queue_make_request() overrides it...
2638
- */
26393334 q->nr_requests = set->queue_depth;
26403335
26413336 /*
26423337 * Default to classic polling
26433338 */
2644
- q->poll_nsec = -1;
2645
-
2646
- if (set->ops->complete)
2647
- blk_queue_softirq_done(q, set->ops->complete);
3339
+ q->poll_nsec = BLK_MQ_POLL_CLASSIC;
26483340
26493341 blk_mq_init_cpu_queues(q, set->nr_hw_queues);
26503342 blk_mq_add_queue_tag_set(set, q);
26513343 blk_mq_map_swqueue(q);
26523344
2653
- if (!(set->flags & BLK_MQ_F_NO_SCHED)) {
2654
- int ret;
2655
-
2656
- ret = elevator_init_mq(q);
2657
- if (ret)
2658
- return ERR_PTR(ret);
2659
- }
3345
+ if (elevator_init)
3346
+ elevator_init_mq(q);
26603347
26613348 return q;
26623349
26633350 err_hctxs:
26643351 kfree(q->queue_hw_ctx);
2665
-err_percpu:
2666
- free_percpu(q->queue_ctx);
3352
+ q->nr_hw_queues = 0;
3353
+ blk_mq_sysfs_deinit(q);
3354
+err_poll:
3355
+ blk_stat_free_callback(q->poll_cb);
3356
+ q->poll_cb = NULL;
26673357 err_exit:
26683358 q->mq_ops = NULL;
26693359 return ERR_PTR(-ENOMEM);
....@@ -2681,38 +3371,21 @@
26813371 blk_mq_del_queue_tag_set(q);
26823372 }
26833373
2684
-/* Basically redo blk_mq_init_queue with queue frozen */
2685
-static void blk_mq_queue_reinit(struct request_queue *q)
2686
-{
2687
- WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));
2688
-
2689
- blk_mq_debugfs_unregister_hctxs(q);
2690
- blk_mq_sysfs_unregister(q);
2691
-
2692
- /*
2693
- * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
2694
- * we should change hctx numa_node according to the new topology (this
2695
- * involves freeing and re-allocating memory, worth doing?)
2696
- */
2697
- blk_mq_map_swqueue(q);
2698
-
2699
- blk_mq_sysfs_register(q);
2700
- blk_mq_debugfs_register_hctxs(q);
2701
-}
2702
-
27033374 static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
27043375 {
27053376 int i;
27063377
2707
- for (i = 0; i < set->nr_hw_queues; i++)
2708
- if (!__blk_mq_alloc_rq_map(set, i))
3378
+ for (i = 0; i < set->nr_hw_queues; i++) {
3379
+ if (!__blk_mq_alloc_map_and_request(set, i))
27093380 goto out_unwind;
3381
+ cond_resched();
3382
+ }
27103383
27113384 return 0;
27123385
27133386 out_unwind:
27143387 while (--i >= 0)
2715
- blk_mq_free_rq_map(set->tags[i]);
3388
+ blk_mq_free_map_and_requests(set, i);
27163389
27173390 return -ENOMEM;
27183391 }
....@@ -2722,7 +3395,7 @@
27223395 * may reduce the depth asked for, if memory is tight. set->queue_depth
27233396 * will be updated to reflect the allocated depth.
27243397 */
2725
-static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
3398
+static int blk_mq_alloc_map_and_requests(struct blk_mq_tag_set *set)
27263399 {
27273400 unsigned int depth;
27283401 int err;
....@@ -2754,7 +3427,17 @@
27543427
27553428 static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
27563429 {
2757
- if (set->ops->map_queues) {
3430
+ /*
3431
+ * blk_mq_map_queues() and multiple .map_queues() implementations
3432
+ * expect that set->map[HCTX_TYPE_DEFAULT].nr_queues is set to the
3433
+ * number of hardware queues.
3434
+ */
3435
+ if (set->nr_maps == 1)
3436
+ set->map[HCTX_TYPE_DEFAULT].nr_queues = set->nr_hw_queues;
3437
+
3438
+ if (set->ops->map_queues && !is_kdump_kernel()) {
3439
+ int i;
3440
+
27583441 /*
27593442 * transport .map_queues is usually done in the following
27603443 * way:
....@@ -2762,18 +3445,44 @@
27623445 * for (queue = 0; queue < set->nr_hw_queues; queue++) {
27633446 * mask = get_cpu_mask(queue)
27643447 * for_each_cpu(cpu, mask)
2765
- * set->mq_map[cpu] = queue;
3448
+ * set->map[x].mq_map[cpu] = queue;
27663449 * }
27673450 *
27683451 * When we need to remap, the table has to be cleared for
27693452 * killing stale mapping since one CPU may not be mapped
27703453 * to any hw queue.
27713454 */
2772
- blk_mq_clear_mq_map(set);
3455
+ for (i = 0; i < set->nr_maps; i++)
3456
+ blk_mq_clear_mq_map(&set->map[i]);
27733457
27743458 return set->ops->map_queues(set);
2775
- } else
2776
- return blk_mq_map_queues(set);
3459
+ } else {
3460
+ BUG_ON(set->nr_maps > 1);
3461
+ return blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
3462
+ }
3463
+}
3464
+
3465
+static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set,
3466
+ int cur_nr_hw_queues, int new_nr_hw_queues)
3467
+{
3468
+ struct blk_mq_tags **new_tags;
3469
+
3470
+ if (cur_nr_hw_queues >= new_nr_hw_queues)
3471
+ return 0;
3472
+
3473
+ new_tags = kcalloc_node(new_nr_hw_queues, sizeof(struct blk_mq_tags *),
3474
+ GFP_KERNEL, set->numa_node);
3475
+ if (!new_tags)
3476
+ return -ENOMEM;
3477
+
3478
+ if (set->tags)
3479
+ memcpy(new_tags, set->tags, cur_nr_hw_queues *
3480
+ sizeof(*set->tags));
3481
+ kfree(set->tags);
3482
+ set->tags = new_tags;
3483
+ set->nr_hw_queues = new_nr_hw_queues;
3484
+
3485
+ return 0;
27773486 }
27783487
27793488 /*
....@@ -2784,7 +3493,7 @@
27843493 */
27853494 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
27863495 {
2787
- int ret;
3496
+ int i, ret;
27883497
27893498 BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);
27903499
....@@ -2807,6 +3516,11 @@
28073516 set->queue_depth = BLK_MQ_MAX_DEPTH;
28083517 }
28093518
3519
+ if (!set->nr_maps)
3520
+ set->nr_maps = 1;
3521
+ else if (set->nr_maps > HCTX_MAX_TYPES)
3522
+ return -EINVAL;
3523
+
28103524 /*
28113525 * If a crashdump is active, then we are potentially in a very
28123526 * memory constrained environment. Limit us to 1 queue and
....@@ -2814,42 +3528,59 @@
28143528 */
28153529 if (is_kdump_kernel()) {
28163530 set->nr_hw_queues = 1;
3531
+ set->nr_maps = 1;
28173532 set->queue_depth = min(64U, set->queue_depth);
28183533 }
28193534 /*
2820
- * There is no use for more h/w queues than cpus.
3535
+ * There is no use for more h/w queues than cpus if we just have
3536
+ * a single map
28213537 */
2822
- if (set->nr_hw_queues > nr_cpu_ids)
3538
+ if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids)
28233539 set->nr_hw_queues = nr_cpu_ids;
28243540
2825
- set->tags = kcalloc_node(nr_cpu_ids, sizeof(struct blk_mq_tags *),
2826
- GFP_KERNEL, set->numa_node);
2827
- if (!set->tags)
3541
+ if (blk_mq_realloc_tag_set_tags(set, 0, set->nr_hw_queues) < 0)
28283542 return -ENOMEM;
28293543
28303544 ret = -ENOMEM;
2831
- set->mq_map = kcalloc_node(nr_cpu_ids, sizeof(*set->mq_map),
2832
- GFP_KERNEL, set->numa_node);
2833
- if (!set->mq_map)
2834
- goto out_free_tags;
3545
+ for (i = 0; i < set->nr_maps; i++) {
3546
+ set->map[i].mq_map = kcalloc_node(nr_cpu_ids,
3547
+ sizeof(set->map[i].mq_map[0]),
3548
+ GFP_KERNEL, set->numa_node);
3549
+ if (!set->map[i].mq_map)
3550
+ goto out_free_mq_map;
3551
+ set->map[i].nr_queues = is_kdump_kernel() ? 1 : set->nr_hw_queues;
3552
+ }
28353553
28363554 ret = blk_mq_update_queue_map(set);
28373555 if (ret)
28383556 goto out_free_mq_map;
28393557
2840
- ret = blk_mq_alloc_rq_maps(set);
3558
+ ret = blk_mq_alloc_map_and_requests(set);
28413559 if (ret)
28423560 goto out_free_mq_map;
3561
+
3562
+ if (blk_mq_is_sbitmap_shared(set->flags)) {
3563
+ atomic_set(&set->active_queues_shared_sbitmap, 0);
3564
+
3565
+ if (blk_mq_init_shared_sbitmap(set, set->flags)) {
3566
+ ret = -ENOMEM;
3567
+ goto out_free_mq_rq_maps;
3568
+ }
3569
+ }
28433570
28443571 mutex_init(&set->tag_list_lock);
28453572 INIT_LIST_HEAD(&set->tag_list);
28463573
28473574 return 0;
28483575
3576
+out_free_mq_rq_maps:
3577
+ for (i = 0; i < set->nr_hw_queues; i++)
3578
+ blk_mq_free_map_and_requests(set, i);
28493579 out_free_mq_map:
2850
- kfree(set->mq_map);
2851
- set->mq_map = NULL;
2852
-out_free_tags:
3580
+ for (i = 0; i < set->nr_maps; i++) {
3581
+ kfree(set->map[i].mq_map);
3582
+ set->map[i].mq_map = NULL;
3583
+ }
28533584 kfree(set->tags);
28543585 set->tags = NULL;
28553586 return ret;
....@@ -2858,13 +3589,18 @@
28583589
28593590 void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
28603591 {
2861
- int i;
3592
+ int i, j;
28623593
2863
- for (i = 0; i < nr_cpu_ids; i++)
3594
+ for (i = 0; i < set->nr_hw_queues; i++)
28643595 blk_mq_free_map_and_requests(set, i);
28653596
2866
- kfree(set->mq_map);
2867
- set->mq_map = NULL;
3597
+ if (blk_mq_is_sbitmap_shared(set->flags))
3598
+ blk_mq_exit_shared_sbitmap(set);
3599
+
3600
+ for (j = 0; j < set->nr_maps; j++) {
3601
+ kfree(set->map[j].mq_map);
3602
+ set->map[j].mq_map = NULL;
3603
+ }
28683604
28693605 kfree(set->tags);
28703606 set->tags = NULL;
....@@ -2880,6 +3616,9 @@
28803616 if (!set)
28813617 return -EINVAL;
28823618
3619
+ if (q->nr_requests == nr)
3620
+ return 0;
3621
+
28833622 blk_mq_freeze_queue(q);
28843623 blk_mq_quiesce_queue(q);
28853624
....@@ -2894,14 +3633,16 @@
28943633 if (!hctx->sched_tags) {
28953634 ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr,
28963635 false);
3636
+ if (!ret && blk_mq_is_sbitmap_shared(set->flags))
3637
+ blk_mq_tag_resize_shared_sbitmap(set, nr);
28973638 } else {
28983639 ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
28993640 nr, true);
29003641 }
29013642 if (ret)
29023643 break;
2903
- if (q->elevator && q->elevator->type->ops.mq.depth_updated)
2904
- q->elevator->type->ops.mq.depth_updated(hctx);
3644
+ if (q->elevator && q->elevator->type->ops.depth_updated)
3645
+ q->elevator->type->ops.depth_updated(hctx);
29053646 }
29063647
29073648 if (!ret)
....@@ -2988,20 +3729,19 @@
29883729 {
29893730 struct request_queue *q;
29903731 LIST_HEAD(head);
3732
+ int prev_nr_hw_queues;
29913733
29923734 lockdep_assert_held(&set->tag_list_lock);
29933735
2994
- if (nr_hw_queues > nr_cpu_ids)
3736
+ if (set->nr_maps == 1 && nr_hw_queues > nr_cpu_ids)
29953737 nr_hw_queues = nr_cpu_ids;
2996
- if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues)
3738
+ if (nr_hw_queues < 1)
3739
+ return;
3740
+ if (set->nr_maps == 1 && nr_hw_queues == set->nr_hw_queues)
29973741 return;
29983742
29993743 list_for_each_entry(q, &set->tag_list, tag_set_list)
30003744 blk_mq_freeze_queue(q);
3001
- /*
3002
- * Sync with blk_mq_queue_tag_busy_iter.
3003
- */
3004
- synchronize_rcu();
30053745 /*
30063746 * Switch IO scheduler to 'none', cleaning up the data associated
30073747 * with the previous scheduler. We will switch back once we are done
....@@ -3011,11 +3751,35 @@
30113751 if (!blk_mq_elv_switch_none(&head, q))
30123752 goto switch_back;
30133753
3754
+ list_for_each_entry(q, &set->tag_list, tag_set_list) {
3755
+ blk_mq_debugfs_unregister_hctxs(q);
3756
+ blk_mq_sysfs_unregister(q);
3757
+ }
3758
+
3759
+ prev_nr_hw_queues = set->nr_hw_queues;
3760
+ if (blk_mq_realloc_tag_set_tags(set, set->nr_hw_queues, nr_hw_queues) <
3761
+ 0)
3762
+ goto reregister;
3763
+
30143764 set->nr_hw_queues = nr_hw_queues;
3765
+fallback:
30153766 blk_mq_update_queue_map(set);
30163767 list_for_each_entry(q, &set->tag_list, tag_set_list) {
30173768 blk_mq_realloc_hw_ctxs(set, q);
3018
- blk_mq_queue_reinit(q);
3769
+ if (q->nr_hw_queues != set->nr_hw_queues) {
3770
+ pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n",
3771
+ nr_hw_queues, prev_nr_hw_queues);
3772
+ set->nr_hw_queues = prev_nr_hw_queues;
3773
+ blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
3774
+ goto fallback;
3775
+ }
3776
+ blk_mq_map_swqueue(q);
3777
+ }
3778
+
3779
+reregister:
3780
+ list_for_each_entry(q, &set->tag_list, tag_set_list) {
3781
+ blk_mq_sysfs_register(q);
3782
+ blk_mq_debugfs_register_hctxs(q);
30193783 }
30203784
30213785 switch_back:
....@@ -3069,7 +3833,6 @@
30693833 }
30703834
30713835 static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
3072
- struct blk_mq_hw_ctx *hctx,
30733836 struct request *rq)
30743837 {
30753838 unsigned long ret = 0;
....@@ -3102,7 +3865,6 @@
31023865 }
31033866
31043867 static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
3105
- struct blk_mq_hw_ctx *hctx,
31063868 struct request *rq)
31073869 {
31083870 struct hrtimer_sleeper hs;
....@@ -3114,18 +3876,15 @@
31143876 return false;
31153877
31163878 /*
3117
- * poll_nsec can be:
3879
+ * If we get here, hybrid polling is enabled. Hence poll_nsec can be:
31183880 *
3119
- * -1: don't ever hybrid sleep
31203881 * 0: use half of prev avg
31213882 * >0: use this specific value
31223883 */
3123
- if (q->poll_nsec == -1)
3124
- return false;
3125
- else if (q->poll_nsec > 0)
3884
+ if (q->poll_nsec > 0)
31263885 nsecs = q->poll_nsec;
31273886 else
3128
- nsecs = blk_mq_poll_nsecs(q, hctx, rq);
3887
+ nsecs = blk_mq_poll_nsecs(q, rq);
31293888
31303889 if (!nsecs)
31313890 return false;
....@@ -3139,15 +3898,14 @@
31393898 kt = nsecs;
31403899
31413900 mode = HRTIMER_MODE_REL;
3142
- hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, mode);
3901
+ hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, mode);
31433902 hrtimer_set_expires(&hs.timer, kt);
31443903
3145
- hrtimer_init_sleeper(&hs, current);
31463904 do {
31473905 if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE)
31483906 break;
31493907 set_current_state(TASK_UNINTERRUPTIBLE);
3150
- hrtimer_start_expires(&hs.timer, mode);
3908
+ hrtimer_sleeper_start_expires(&hs, mode);
31513909 if (hs.task)
31523910 io_schedule();
31533911 hrtimer_cancel(&hs.timer);
....@@ -3159,59 +3917,14 @@
31593917 return true;
31603918 }
31613919
3162
-static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq)
3920
+static bool blk_mq_poll_hybrid(struct request_queue *q,
3921
+ struct blk_mq_hw_ctx *hctx, blk_qc_t cookie)
31633922 {
3164
- struct request_queue *q = hctx->queue;
3165
- long state;
3166
-
3167
- /*
3168
- * If we sleep, have the caller restart the poll loop to reset
3169
- * the state. Like for the other success return cases, the
3170
- * caller is responsible for checking if the IO completed. If
3171
- * the IO isn't complete, we'll get called again and will go
3172
- * straight to the busy poll loop.
3173
- */
3174
- if (blk_mq_poll_hybrid_sleep(q, hctx, rq))
3175
- return true;
3176
-
3177
- hctx->poll_considered++;
3178
-
3179
- state = current->state;
3180
- while (!need_resched()) {
3181
- int ret;
3182
-
3183
- hctx->poll_invoked++;
3184
-
3185
- ret = q->mq_ops->poll(hctx, rq->tag);
3186
- if (ret > 0) {
3187
- hctx->poll_success++;
3188
- set_current_state(TASK_RUNNING);
3189
- return true;
3190
- }
3191
-
3192
- if (signal_pending_state(state, current))
3193
- set_current_state(TASK_RUNNING);
3194
-
3195
- if (current->state == TASK_RUNNING)
3196
- return true;
3197
- if (ret < 0)
3198
- break;
3199
- cpu_relax();
3200
- }
3201
-
3202
- __set_current_state(TASK_RUNNING);
3203
- return false;
3204
-}
3205
-
3206
-static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
3207
-{
3208
- struct blk_mq_hw_ctx *hctx;
32093923 struct request *rq;
32103924
3211
- if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
3925
+ if (q->poll_nsec == BLK_MQ_POLL_CLASSIC)
32123926 return false;
32133927
3214
- hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
32153928 if (!blk_qc_t_is_internal(cookie))
32163929 rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
32173930 else {
....@@ -3226,13 +3939,97 @@
32263939 return false;
32273940 }
32283941
3229
- return __blk_mq_poll(hctx, rq);
3942
+ return blk_mq_poll_hybrid_sleep(q, rq);
32303943 }
3944
+
3945
+/**
3946
+ * blk_poll - poll for IO completions
3947
+ * @q: the queue
3948
+ * @cookie: cookie passed back at IO submission time
3949
+ * @spin: whether to spin for completions
3950
+ *
3951
+ * Description:
3952
+ * Poll for completions on the passed in queue. Returns number of
3953
+ * completed entries found. If @spin is true, then blk_poll will continue
3954
+ * looping until at least one completion is found, unless the task is
3955
+ * otherwise marked running (or we need to reschedule).
3956
+ */
3957
+int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
3958
+{
3959
+ struct blk_mq_hw_ctx *hctx;
3960
+ long state;
3961
+
3962
+ if (!blk_qc_t_valid(cookie) ||
3963
+ !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
3964
+ return 0;
3965
+
3966
+ if (current->plug)
3967
+ blk_flush_plug_list(current->plug, false);
3968
+
3969
+ hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
3970
+
3971
+ /*
3972
+ * If we sleep, have the caller restart the poll loop to reset
3973
+ * the state. Like for the other success return cases, the
3974
+ * caller is responsible for checking if the IO completed. If
3975
+ * the IO isn't complete, we'll get called again and will go
3976
+ * straight to the busy poll loop.
3977
+ */
3978
+ if (blk_mq_poll_hybrid(q, hctx, cookie))
3979
+ return 1;
3980
+
3981
+ hctx->poll_considered++;
3982
+
3983
+ state = current->state;
3984
+ do {
3985
+ int ret;
3986
+
3987
+ hctx->poll_invoked++;
3988
+
3989
+ ret = q->mq_ops->poll(hctx);
3990
+ if (ret > 0) {
3991
+ hctx->poll_success++;
3992
+ __set_current_state(TASK_RUNNING);
3993
+ return ret;
3994
+ }
3995
+
3996
+ if (signal_pending_state(state, current))
3997
+ __set_current_state(TASK_RUNNING);
3998
+
3999
+ if (current->state == TASK_RUNNING)
4000
+ return 1;
4001
+ if (ret < 0 || !spin)
4002
+ break;
4003
+ cpu_relax();
4004
+ } while (!need_resched());
4005
+
4006
+ __set_current_state(TASK_RUNNING);
4007
+ return 0;
4008
+}
4009
+EXPORT_SYMBOL_GPL(blk_poll);
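/*
 * Editor's illustrative sketch (not part of the patch above): how a caller of
 * polled IO might use the cookie returned at submission together with
 * blk_poll(), mirroring the pattern used by polled direct IO. The "done" flag
 * and the "example_" name are assumptions for illustration only.
 */
static void example_wait_for_polled_io(struct request_queue *q, blk_qc_t cookie,
				       bool *done)
{
	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (READ_ONCE(*done))
			break;
		/* Spin for completions; sleep if polling made no progress. */
		if (blk_poll(q, cookie, true) <= 0)
			io_schedule();
	}
	__set_current_state(TASK_RUNNING);
}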
4010
+
4011
+unsigned int blk_mq_rq_cpu(struct request *rq)
4012
+{
4013
+ return rq->mq_ctx->cpu;
4014
+}
4015
+EXPORT_SYMBOL(blk_mq_rq_cpu);
32314016
32324017 static int __init blk_mq_init(void)
32334018 {
4019
+ int i;
4020
+
4021
+ for_each_possible_cpu(i)
4022
+ init_llist_head(&per_cpu(blk_cpu_done, i));
4023
+ open_softirq(BLOCK_SOFTIRQ, blk_done_softirq);
4024
+
4025
+ cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD,
4026
+ "block/softirq:dead", NULL,
4027
+ blk_softirq_cpu_dead);
32344028 cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
32354029 blk_mq_hctx_notify_dead);
4030
+ cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online",
4031
+ blk_mq_hctx_notify_online,
4032
+ blk_mq_hctx_notify_offline);
32364033 return 0;
32374034 }
32384035 subsys_initcall(blk_mq_init);