2024-02-20 102a0743326a03cd1a1202ceda21e175b7d3575c
kernel/block/blk-mq.c
@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0
12 /*
23 * Block multiqueue core code
34 *
@@ -25,30 +26,36 @@
2526 #include <linux/delay.h>
2627 #include <linux/crash_dump.h>
2728 #include <linux/prefetch.h>
29
+#include <linux/blk-crypto.h>
2830
2931 #include <trace/events/block.h>
3032
3133 #include <linux/blk-mq.h>
34
+#include <linux/t10-pi.h>
3235 #include "blk.h"
3336 #include "blk-mq.h"
3437 #include "blk-mq-debugfs.h"
3538 #include "blk-mq-tag.h"
39
+#include "blk-pm.h"
3640 #include "blk-stat.h"
3741 #include "blk-mq-sched.h"
3842 #include "blk-rq-qos.h"
3943
40
-static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie);
44
+#include <trace/hooks/block.h>
45
+
46
+static DEFINE_PER_CPU(struct list_head, blk_cpu_done);
47
+
4148 static void blk_mq_poll_stats_start(struct request_queue *q);
4249 static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
4350
4451 static int blk_mq_poll_stats_bkt(const struct request *rq)
4552 {
46
- int ddir, bytes, bucket;
53
+ int ddir, sectors, bucket;
4754
4855 ddir = rq_data_dir(rq);
49
- bytes = blk_rq_bytes(rq);
56
+ sectors = blk_rq_stats_sectors(rq);
5057
51
- bucket = ddir + 2*(ilog2(bytes) - 9);
58
+ bucket = ddir + 2 * ilog2(sectors);
5259
5360 if (bucket < 0)
5461 return -1;
@@ -59,7 +66,8 @@
5966 }
6067
6168 /*
62
- * Check if any of the ctx's have pending work in this hardware queue
69
+ * Check if any of the ctx, dispatch list or elevator
70
+ * have pending work in this hardware queue.
6371 */
6472 static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
6573 {
@@ -74,75 +82,67 @@
7482 static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
7583 struct blk_mq_ctx *ctx)
7684 {
77
- if (!sbitmap_test_bit(&hctx->ctx_map, ctx->index_hw))
78
- sbitmap_set_bit(&hctx->ctx_map, ctx->index_hw);
85
+ const int bit = ctx->index_hw[hctx->type];
86
+
87
+ if (!sbitmap_test_bit(&hctx->ctx_map, bit))
88
+ sbitmap_set_bit(&hctx->ctx_map, bit);
7989 }
8090
8191 static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
8292 struct blk_mq_ctx *ctx)
8393 {
84
- sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw);
94
+ const int bit = ctx->index_hw[hctx->type];
95
+
96
+ sbitmap_clear_bit(&hctx->ctx_map, bit);
8597 }
8698
8799 struct mq_inflight {
88100 struct hd_struct *part;
89
- unsigned int *inflight;
101
+ unsigned int inflight[2];
90102 };
91103
92
-static void blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
104
+static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
93105 struct request *rq, void *priv,
94106 bool reserved)
95107 {
96108 struct mq_inflight *mi = priv;
97109
98
- /*
99
- * index[0] counts the specific partition that was asked for. index[1]
100
- * counts the ones that are active on the whole device, so increment
101
- * that if mi->part is indeed a partition, and not a whole device.
102
- */
103
- if (rq->part == mi->part)
104
- mi->inflight[0]++;
105
- if (mi->part->partno)
106
- mi->inflight[1]++;
107
-}
108
-
109
-void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part,
110
- unsigned int inflight[2])
111
-{
112
- struct mq_inflight mi = { .part = part, .inflight = inflight, };
113
-
114
- inflight[0] = inflight[1] = 0;
115
- blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
116
-}
117
-
118
-static void blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx,
119
- struct request *rq, void *priv,
120
- bool reserved)
121
-{
122
- struct mq_inflight *mi = priv;
123
-
124
- if (rq->part == mi->part)
110
+ if ((!mi->part->partno || rq->part == mi->part) &&
111
+ blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT)
125112 mi->inflight[rq_data_dir(rq)]++;
113
+
114
+ return true;
115
+}
116
+
117
+unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part)
118
+{
119
+ struct mq_inflight mi = { .part = part };
120
+
121
+ blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
122
+
123
+ return mi.inflight[0] + mi.inflight[1];
126124 }
127125
128126 void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part,
129127 unsigned int inflight[2])
130128 {
131
- struct mq_inflight mi = { .part = part, .inflight = inflight, };
129
+ struct mq_inflight mi = { .part = part };
132130
133
- inflight[0] = inflight[1] = 0;
134
- blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight_rw, &mi);
131
+ blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
132
+ inflight[0] = mi.inflight[0];
133
+ inflight[1] = mi.inflight[1];
135134 }
136135
137136 void blk_freeze_queue_start(struct request_queue *q)
138137 {
139
- int freeze_depth;
140
-
141
- freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
142
- if (freeze_depth == 1) {
138
+ mutex_lock(&q->mq_freeze_lock);
139
+ if (++q->mq_freeze_depth == 1) {
143140 percpu_ref_kill(&q->q_usage_counter);
144
- if (q->mq_ops)
141
+ mutex_unlock(&q->mq_freeze_lock);
142
+ if (queue_is_mq(q))
145143 blk_mq_run_hw_queues(q, false);
144
+ } else {
145
+ mutex_unlock(&q->mq_freeze_lock);
146146 }
147147 }
148148 EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
@@ -176,8 +176,6 @@
176176 * exported to drivers as the only user for unfreeze is blk_mq.
177177 */
178178 blk_freeze_queue_start(q);
179
- if (!q->mq_ops)
180
- blk_drain_queue(q);
181179 blk_mq_freeze_queue_wait(q);
182180 }
183181
@@ -193,14 +191,14 @@
193191
194192 void blk_mq_unfreeze_queue(struct request_queue *q)
195193 {
196
- int freeze_depth;
197
-
198
- freeze_depth = atomic_dec_return(&q->mq_freeze_depth);
199
- WARN_ON_ONCE(freeze_depth < 0);
200
- if (!freeze_depth) {
201
- percpu_ref_reinit(&q->q_usage_counter);
194
+ mutex_lock(&q->mq_freeze_lock);
195
+ q->mq_freeze_depth--;
196
+ WARN_ON_ONCE(q->mq_freeze_depth < 0);
197
+ if (!q->mq_freeze_depth) {
198
+ percpu_ref_resurrect(&q->q_usage_counter);
202199 wake_up_all(&q->mq_freeze_wq);
203200 }
201
+ mutex_unlock(&q->mq_freeze_lock);
204202 }
205203 EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
206204
@@ -268,40 +266,37 @@
268266 blk_mq_tag_wakeup_all(hctx->tags, true);
269267 }
270268
271
-bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
269
+/*
270
+ * Only need start/end time stamping if we have iostat or
271
+ * blk stats enabled, or using an IO scheduler.
272
+ */
273
+static inline bool blk_mq_need_time_stamp(struct request *rq)
272274 {
273
- return blk_mq_has_free_tags(hctx->tags);
275
+ return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS)) || rq->q->elevator;
274276 }
275
-EXPORT_SYMBOL(blk_mq_can_queue);
276277
277278 static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
278
- unsigned int tag, unsigned int op)
279
+ unsigned int tag, u64 alloc_time_ns)
279280 {
280281 struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
281282 struct request *rq = tags->static_rqs[tag];
282
- req_flags_t rq_flags = 0;
283283
284
- if (data->flags & BLK_MQ_REQ_INTERNAL) {
285
- rq->tag = -1;
284
+ if (data->q->elevator) {
285
+ rq->tag = BLK_MQ_NO_TAG;
286286 rq->internal_tag = tag;
287287 } else {
288
- if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) {
289
- rq_flags = RQF_MQ_INFLIGHT;
290
- atomic_inc(&data->hctx->nr_active);
291
- }
292288 rq->tag = tag;
293
- rq->internal_tag = -1;
294
- data->hctx->tags->rqs[rq->tag] = rq;
289
+ rq->internal_tag = BLK_MQ_NO_TAG;
295290 }
296291
297292 /* csd/requeue_work/fifo_time is initialized before use */
298293 rq->q = data->q;
299294 rq->mq_ctx = data->ctx;
300
- rq->rq_flags = rq_flags;
301
- rq->cpu = -1;
302
- rq->cmd_flags = op;
303
- if (data->flags & BLK_MQ_REQ_PREEMPT)
304
- rq->rq_flags |= RQF_PREEMPT;
295
+ rq->mq_hctx = data->hctx;
296
+ rq->rq_flags = 0;
297
+ rq->cmd_flags = data->cmd_flags;
298
+ if (data->flags & BLK_MQ_REQ_PM)
299
+ rq->rq_flags |= RQF_PM;
305300 if (blk_queue_io_stat(data->q))
306301 rq->rq_flags |= RQF_IO_STAT;
307302 INIT_LIST_HEAD(&rq->queuelist);
@@ -309,100 +304,110 @@
309304 RB_CLEAR_NODE(&rq->rb_node);
310305 rq->rq_disk = NULL;
311306 rq->part = NULL;
312
- rq->start_time_ns = ktime_get_ns();
307
+#ifdef CONFIG_BLK_RQ_ALLOC_TIME
308
+ rq->alloc_time_ns = alloc_time_ns;
309
+#endif
310
+ if (blk_mq_need_time_stamp(rq))
311
+ rq->start_time_ns = ktime_get_ns();
312
+ else
313
+ rq->start_time_ns = 0;
313314 rq->io_start_time_ns = 0;
315
+ rq->stats_sectors = 0;
314316 rq->nr_phys_segments = 0;
315317 #if defined(CONFIG_BLK_DEV_INTEGRITY)
316318 rq->nr_integrity_segments = 0;
317319 #endif
318
- rq->special = NULL;
320
+ blk_crypto_rq_set_defaults(rq);
319321 /* tag was already set */
320
- rq->extra_len = 0;
321
- rq->__deadline = 0;
322
+ WRITE_ONCE(rq->deadline, 0);
322323
323
-#ifdef CONFIG_PREEMPT_RT_FULL
324
- INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work);
325
-#endif
326
- INIT_LIST_HEAD(&rq->timeout_list);
327324 rq->timeout = 0;
328325
329326 rq->end_io = NULL;
330327 rq->end_io_data = NULL;
331
- rq->next_rq = NULL;
332328
333
-#ifdef CONFIG_BLK_CGROUP
334
- rq->rl = NULL;
335
-#endif
336
-
337
- data->ctx->rq_dispatched[op_is_sync(op)]++;
329
+ data->ctx->rq_dispatched[op_is_sync(data->cmd_flags)]++;
338330 refcount_set(&rq->ref, 1);
331
+
332
+ if (!op_is_flush(data->cmd_flags)) {
333
+ struct elevator_queue *e = data->q->elevator;
334
+
335
+ rq->elv.icq = NULL;
336
+ if (e && e->type->ops.prepare_request) {
337
+ if (e->type->icq_cache)
338
+ blk_mq_sched_assign_ioc(rq);
339
+
340
+ e->type->ops.prepare_request(rq);
341
+ rq->rq_flags |= RQF_ELVPRIV;
342
+ }
343
+ }
344
+
345
+ data->hctx->queued++;
346
+ trace_android_vh_blk_rq_ctx_init(rq, tags, data, alloc_time_ns);
339347 return rq;
340348 }
341349
342
-static struct request *blk_mq_get_request(struct request_queue *q,
343
- struct bio *bio, unsigned int op,
344
- struct blk_mq_alloc_data *data)
350
+static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data)
345351 {
352
+ struct request_queue *q = data->q;
346353 struct elevator_queue *e = q->elevator;
347
- struct request *rq;
354
+ u64 alloc_time_ns = 0;
348355 unsigned int tag;
349
- bool put_ctx_on_error = false;
350356
351
- blk_queue_enter_live(q);
352
- data->q = q;
353
- if (likely(!data->ctx)) {
354
- data->ctx = blk_mq_get_ctx(q);
355
- put_ctx_on_error = true;
356
- }
357
- if (likely(!data->hctx))
358
- data->hctx = blk_mq_map_queue(q, data->ctx->cpu);
359
- if (op & REQ_NOWAIT)
357
+ /* alloc_time includes depth and tag waits */
358
+ if (blk_queue_rq_alloc_time(q))
359
+ alloc_time_ns = ktime_get_ns();
360
+
361
+ if (data->cmd_flags & REQ_NOWAIT)
360362 data->flags |= BLK_MQ_REQ_NOWAIT;
361363
362364 if (e) {
363
- data->flags |= BLK_MQ_REQ_INTERNAL;
364
-
365365 /*
366366 * Flush requests are special and go directly to the
367367 * dispatch list. Don't include reserved tags in the
368368 * limiting, as it isn't useful.
369369 */
370
- if (!op_is_flush(op) && e->type->ops.mq.limit_depth &&
370
+ if (!op_is_flush(data->cmd_flags) &&
371
+ e->type->ops.limit_depth &&
371372 !(data->flags & BLK_MQ_REQ_RESERVED))
372
- e->type->ops.mq.limit_depth(op, data);
373
- } else {
373
+ e->type->ops.limit_depth(data->cmd_flags, data);
374
+ }
375
+
376
+retry:
377
+ data->ctx = blk_mq_get_ctx(q);
378
+ data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx);
379
+ if (!e)
374380 blk_mq_tag_busy(data->hctx);
375
- }
376381
382
+ /*
383
+ * Waiting allocations only fail because of an inactive hctx. In that
384
+ * case just retry the hctx assignment and tag allocation as CPU hotplug
385
+ * should have migrated us to an online CPU by now.
386
+ */
377387 tag = blk_mq_get_tag(data);
378
- if (tag == BLK_MQ_TAG_FAIL) {
379
- if (put_ctx_on_error) {
380
- blk_mq_put_ctx(data->ctx);
381
- data->ctx = NULL;
382
- }
383
- blk_queue_exit(q);
384
- return NULL;
385
- }
388
+ if (tag == BLK_MQ_NO_TAG) {
389
+ if (data->flags & BLK_MQ_REQ_NOWAIT)
390
+ return NULL;
386391
387
- rq = blk_mq_rq_ctx_init(data, tag, op);
388
- if (!op_is_flush(op)) {
389
- rq->elv.icq = NULL;
390
- if (e && e->type->ops.mq.prepare_request) {
391
- if (e->type->icq_cache && rq_ioc(bio))
392
- blk_mq_sched_assign_ioc(rq, bio);
393
-
394
- e->type->ops.mq.prepare_request(rq, bio);
395
- rq->rq_flags |= RQF_ELVPRIV;
396
- }
392
+ /*
393
+ * Give up the CPU and sleep for a random short time to ensure
394
+ * that thread using a realtime scheduling class are migrated
395
+ * off the CPU, and thus off the hctx that is going away.
396
+ */
397
+ msleep(3);
398
+ goto retry;
397399 }
398
- data->hctx->queued++;
399
- return rq;
400
+ return blk_mq_rq_ctx_init(data, tag, alloc_time_ns);
400401 }
401402
402403 struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
403404 blk_mq_req_flags_t flags)
404405 {
405
- struct blk_mq_alloc_data alloc_data = { .flags = flags };
406
+ struct blk_mq_alloc_data data = {
407
+ .q = q,
408
+ .flags = flags,
409
+ .cmd_flags = op,
410
+ };
406411 struct request *rq;
407412 int ret;
408413
@@ -410,28 +415,35 @@
410415 if (ret)
411416 return ERR_PTR(ret);
412417
413
- rq = blk_mq_get_request(q, NULL, op, &alloc_data);
414
- blk_queue_exit(q);
415
-
418
+ rq = __blk_mq_alloc_request(&data);
416419 if (!rq)
417
- return ERR_PTR(-EWOULDBLOCK);
418
-
419
- blk_mq_put_ctx(alloc_data.ctx);
420
-
420
+ goto out_queue_exit;
421421 rq->__data_len = 0;
422422 rq->__sector = (sector_t) -1;
423423 rq->bio = rq->biotail = NULL;
424424 return rq;
425
+out_queue_exit:
426
+ blk_queue_exit(q);
427
+ return ERR_PTR(-EWOULDBLOCK);
425428 }
426429 EXPORT_SYMBOL(blk_mq_alloc_request);
427430
428431 struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
429432 unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx)
430433 {
431
- struct blk_mq_alloc_data alloc_data = { .flags = flags };
432
- struct request *rq;
434
+ struct blk_mq_alloc_data data = {
435
+ .q = q,
436
+ .flags = flags,
437
+ .cmd_flags = op,
438
+ };
439
+ u64 alloc_time_ns = 0;
433440 unsigned int cpu;
441
+ unsigned int tag;
434442 int ret;
443
+
444
+ /* alloc_time includes depth and tag waits */
445
+ if (blk_queue_rq_alloc_time(q))
446
+ alloc_time_ns = ktime_get_ns();
435447
436448 /*
437449 * If the tag allocator sleeps we could get an allocation for a
@@ -439,7 +451,8 @@
439451 * allocator for this for the rare use case of a command tied to
440452 * a specific queue.
441453 */
442
- if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)))
454
+ if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)) ||
455
+ WARN_ON_ONCE(!(flags & BLK_MQ_REQ_RESERVED)))
443456 return ERR_PTR(-EINVAL);
444457
445458 if (hctx_idx >= q->nr_hw_queues)
@@ -453,21 +466,27 @@
453466 * Check if the hardware context is actually mapped to anything.
454467 * If not tell the caller that it should skip this queue.
455468 */
456
- alloc_data.hctx = q->queue_hw_ctx[hctx_idx];
457
- if (!blk_mq_hw_queue_mapped(alloc_data.hctx)) {
458
- blk_queue_exit(q);
459
- return ERR_PTR(-EXDEV);
460
- }
461
- cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask);
462
- alloc_data.ctx = __blk_mq_get_ctx(q, cpu);
469
+ ret = -EXDEV;
470
+ data.hctx = q->queue_hw_ctx[hctx_idx];
471
+ if (!blk_mq_hw_queue_mapped(data.hctx))
472
+ goto out_queue_exit;
473
+ cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask);
474
+ if (cpu >= nr_cpu_ids)
475
+ goto out_queue_exit;
476
+ data.ctx = __blk_mq_get_ctx(q, cpu);
463477
464
- rq = blk_mq_get_request(q, NULL, op, &alloc_data);
478
+ if (!q->elevator)
479
+ blk_mq_tag_busy(data.hctx);
480
+
481
+ ret = -EWOULDBLOCK;
482
+ tag = blk_mq_get_tag(&data);
483
+ if (tag == BLK_MQ_NO_TAG)
484
+ goto out_queue_exit;
485
+ return blk_mq_rq_ctx_init(&data, tag, alloc_time_ns);
486
+
487
+out_queue_exit:
465488 blk_queue_exit(q);
466
-
467
- if (!rq)
468
- return ERR_PTR(-EWOULDBLOCK);
469
-
470
- return rq;
489
+ return ERR_PTR(ret);
471490 }
472491 EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
473492
@@ -475,13 +494,16 @@
475494 {
476495 struct request_queue *q = rq->q;
477496 struct blk_mq_ctx *ctx = rq->mq_ctx;
478
- struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
497
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
479498 const int sched_tag = rq->internal_tag;
480499
481
- if (rq->tag != -1)
482
- blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
483
- if (sched_tag != -1)
484
- blk_mq_put_tag(hctx, hctx->sched_tags, ctx, sched_tag);
500
+ blk_crypto_free_request(rq);
501
+ blk_pm_mark_last_busy(rq);
502
+ rq->mq_hctx = NULL;
503
+ if (rq->tag != BLK_MQ_NO_TAG)
504
+ blk_mq_put_tag(hctx->tags, ctx, rq->tag);
505
+ if (sched_tag != BLK_MQ_NO_TAG)
506
+ blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag);
485507 blk_mq_sched_restart(hctx);
486508 blk_queue_exit(q);
487509 }
@@ -491,11 +513,11 @@
491513 struct request_queue *q = rq->q;
492514 struct elevator_queue *e = q->elevator;
493515 struct blk_mq_ctx *ctx = rq->mq_ctx;
494
- struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
516
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
495517
496518 if (rq->rq_flags & RQF_ELVPRIV) {
497
- if (e && e->type->ops.mq.finish_request)
498
- e->type->ops.mq.finish_request(rq);
519
+ if (e && e->type->ops.finish_request)
520
+ e->type->ops.finish_request(rq);
499521 if (rq->elv.icq) {
500522 put_io_context(rq->elv.icq->ioc);
501523 rq->elv.icq = NULL;
@@ -504,15 +526,12 @@
504526
505527 ctx->rq_completed[rq_is_sync(rq)]++;
506528 if (rq->rq_flags & RQF_MQ_INFLIGHT)
507
- atomic_dec(&hctx->nr_active);
529
+ __blk_mq_dec_active_requests(hctx);
508530
509531 if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
510532 laptop_io_completion(q->backing_dev_info);
511533
512534 rq_qos_done(q, rq);
513
-
514
- if (blk_rq_rl(rq))
515
- blk_put_rl(blk_rq_rl(rq));
516535
517536 WRITE_ONCE(rq->state, MQ_RQ_IDLE);
518537 if (refcount_dec_and_test(&rq->ref))
@@ -522,12 +541,17 @@
522541
523542 inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
524543 {
525
- u64 now = ktime_get_ns();
544
+ u64 now = 0;
545
+
546
+ if (blk_mq_need_time_stamp(rq))
547
+ now = ktime_get_ns();
526548
527549 if (rq->rq_flags & RQF_STATS) {
528550 blk_mq_poll_stats_start(rq->q);
529551 blk_stat_add(rq, now);
530552 }
553
+
554
+ blk_mq_sched_completed_request(rq, now);
531555
532556 blk_account_io_done(rq, now);
533557
@@ -535,8 +559,6 @@
535559 rq_qos_done(rq->q, rq);
536560 rq->end_io(rq, error);
537561 } else {
538
- if (unlikely(blk_bidi_rq(rq)))
539
- blk_mq_free_request(rq->next_rq);
540562 blk_mq_free_request(rq);
541563 }
542564 }
@@ -550,63 +572,139 @@
550572 }
551573 EXPORT_SYMBOL(blk_mq_end_request);
552574
553
-#ifdef CONFIG_PREEMPT_RT_FULL
554
-
555
-void __blk_mq_complete_request_remote_work(struct work_struct *work)
575
+/*
576
+ * Softirq action handler - move entries to local list and loop over them
577
+ * while passing them to the queue registered handler.
578
+ */
579
+static __latent_entropy void blk_done_softirq(struct softirq_action *h)
556580 {
557
- struct request *rq = container_of(work, struct request, work);
581
+ struct list_head *cpu_list, local_list;
558582
559
- rq->q->softirq_done_fn(rq);
583
+ local_irq_disable();
584
+ cpu_list = this_cpu_ptr(&blk_cpu_done);
585
+ list_replace_init(cpu_list, &local_list);
586
+ local_irq_enable();
587
+
588
+ while (!list_empty(&local_list)) {
589
+ struct request *rq;
590
+
591
+ rq = list_entry(local_list.next, struct request, ipi_list);
592
+ list_del_init(&rq->ipi_list);
593
+ rq->q->mq_ops->complete(rq);
594
+ }
560595 }
561596
562
-#else
597
+static void blk_mq_trigger_softirq(struct request *rq)
598
+{
599
+ struct list_head *list;
600
+ unsigned long flags;
601
+
602
+ local_irq_save(flags);
603
+ list = this_cpu_ptr(&blk_cpu_done);
604
+ list_add_tail(&rq->ipi_list, list);
605
+
606
+ /*
607
+ * If the list only contains our just added request, signal a raise of
608
+ * the softirq. If there are already entries there, someone already
609
+ * raised the irq but it hasn't run yet.
610
+ */
611
+ if (list->next == &rq->ipi_list)
612
+ raise_softirq_irqoff(BLOCK_SOFTIRQ);
613
+ local_irq_restore(flags);
614
+}
615
+
616
+static int blk_softirq_cpu_dead(unsigned int cpu)
617
+{
618
+ /*
619
+ * If a CPU goes away, splice its entries to the current CPU
620
+ * and trigger a run of the softirq
621
+ */
622
+ local_irq_disable();
623
+ list_splice_init(&per_cpu(blk_cpu_done, cpu),
624
+ this_cpu_ptr(&blk_cpu_done));
625
+ raise_softirq_irqoff(BLOCK_SOFTIRQ);
626
+ local_irq_enable();
627
+
628
+ return 0;
629
+}
630
+
563631
564632 static void __blk_mq_complete_request_remote(void *data)
565633 {
566634 struct request *rq = data;
567635
568
- rq->q->softirq_done_fn(rq);
636
+ /*
637
+ * For most of single queue controllers, there is only one irq vector
638
+ * for handling I/O completion, and the only irq's affinity is set
639
+ * to all possible CPUs. On most of ARCHs, this affinity means the irq
640
+ * is handled on one specific CPU.
641
+ *
642
+ * So complete I/O requests in softirq context in case of single queue
643
+ * devices to avoid degrading I/O performance due to irqsoff latency.
644
+ */
645
+ if (rq->q->nr_hw_queues == 1)
646
+ blk_mq_trigger_softirq(rq);
647
+ else
648
+ rq->q->mq_ops->complete(rq);
569649 }
570
-#endif
571650
572
-static void __blk_mq_complete_request(struct request *rq)
651
+static inline bool blk_mq_complete_need_ipi(struct request *rq)
573652 {
574
- struct blk_mq_ctx *ctx = rq->mq_ctx;
575
- bool shared = false;
576
- int cpu;
653
+ int cpu = raw_smp_processor_id();
577654
578
- if (!blk_mq_mark_complete(rq))
579
- return;
580
- if (rq->internal_tag != -1)
581
- blk_mq_sched_completed_request(rq);
655
+ if (!IS_ENABLED(CONFIG_SMP) ||
656
+ !test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags))
657
+ return false;
582658
583
- if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) {
584
- rq->q->softirq_done_fn(rq);
585
- return;
586
- }
659
+ /* same CPU or cache domain? Complete locally */
660
+ if (cpu == rq->mq_ctx->cpu ||
661
+ (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) &&
662
+ cpus_share_cache(cpu, rq->mq_ctx->cpu)))
663
+ return false;
587664
588
- cpu = get_cpu_light();
589
- if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
590
- shared = cpus_share_cache(cpu, ctx->cpu);
665
+ /* don't try to IPI to an offline CPU */
666
+ return cpu_online(rq->mq_ctx->cpu);
667
+}
591668
592
- if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
593
-#ifdef CONFIG_PREEMPT_RT_FULL
594
- /*
595
- * We could force QUEUE_FLAG_SAME_FORCE then we would not get in
596
- * here. But we could try to invoke it one the CPU like this.
597
- */
598
- schedule_work_on(ctx->cpu, &rq->work);
599
-#else
669
+bool blk_mq_complete_request_remote(struct request *rq)
670
+{
671
+ WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
672
+
673
+ /*
674
+ * For a polled request, always complete locallly, it's pointless
675
+ * to redirect the completion.
676
+ */
677
+ if (rq->cmd_flags & REQ_HIPRI)
678
+ return false;
679
+
680
+ if (blk_mq_complete_need_ipi(rq)) {
600681 rq->csd.func = __blk_mq_complete_request_remote;
601682 rq->csd.info = rq;
602683 rq->csd.flags = 0;
603
- smp_call_function_single_async(ctx->cpu, &rq->csd);
604
-#endif
684
+ smp_call_function_single_async(rq->mq_ctx->cpu, &rq->csd);
605685 } else {
606
- rq->q->softirq_done_fn(rq);
686
+ if (rq->q->nr_hw_queues > 1)
687
+ return false;
688
+ blk_mq_trigger_softirq(rq);
607689 }
608
- put_cpu_light();
690
+
691
+ return true;
609692 }
693
+EXPORT_SYMBOL_GPL(blk_mq_complete_request_remote);
694
+
695
+/**
696
+ * blk_mq_complete_request - end I/O on a request
697
+ * @rq: the request being processed
698
+ *
699
+ * Description:
700
+ * Complete a request by scheduling the ->complete_rq operation.
701
+ **/
702
+void blk_mq_complete_request(struct request *rq)
703
+{
704
+ if (!blk_mq_complete_request_remote(rq))
705
+ rq->q->mq_ops->complete(rq);
706
+}
707
+EXPORT_SYMBOL(blk_mq_complete_request);
610708
611709 static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
612710 __releases(hctx->srcu)
@@ -629,40 +727,22 @@
629727 }
630728
631729 /**
632
- * blk_mq_complete_request - end I/O on a request
633
- * @rq: the request being processed
730
+ * blk_mq_start_request - Start processing a request
731
+ * @rq: Pointer to request to be started
634732 *
635
- * Description:
636
- * Ends all I/O on a request. It does not handle partial completions.
637
- * The actual completion happens out-of-order, through a IPI handler.
638
- **/
639
-void blk_mq_complete_request(struct request *rq)
640
-{
641
- if (unlikely(blk_should_fake_timeout(rq->q)))
642
- return;
643
- __blk_mq_complete_request(rq);
644
-}
645
-EXPORT_SYMBOL(blk_mq_complete_request);
646
-
647
-int blk_mq_request_started(struct request *rq)
648
-{
649
- return blk_mq_rq_state(rq) != MQ_RQ_IDLE;
650
-}
651
-EXPORT_SYMBOL_GPL(blk_mq_request_started);
652
-
733
+ * Function used by device drivers to notify the block layer that a request
734
+ * is going to be processed now, so blk layer can do proper initializations
735
+ * such as starting the timeout timer.
736
+ */
653737 void blk_mq_start_request(struct request *rq)
654738 {
655739 struct request_queue *q = rq->q;
656
-
657
- blk_mq_sched_started_request(rq);
658740
659741 trace_block_rq_issue(q, rq);
660742
661743 if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
662744 rq->io_start_time_ns = ktime_get_ns();
663
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
664
- rq->throtl_size = blk_rq_sectors(rq);
665
-#endif
745
+ rq->stats_sectors = blk_rq_sectors(rq);
666746 rq->rq_flags |= RQF_STATS;
667747 rq_qos_issue(q, rq);
668748 }
@@ -672,14 +752,10 @@
672752 blk_add_timer(rq);
673753 WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT);
674754
675
- if (q->dma_drain_size && blk_rq_bytes(rq)) {
676
- /*
677
- * Make sure space for the drain appears. We know we can do
678
- * this because max_hw_segments has been adjusted to be one
679
- * fewer than the device can handle.
680
- */
681
- rq->nr_phys_segments++;
682
- }
755
+#ifdef CONFIG_BLK_DEV_INTEGRITY
756
+ if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE)
757
+ q->integrity.profile->prepare_fn(rq);
758
+#endif
683759 }
684760 EXPORT_SYMBOL(blk_mq_start_request);
685761
@@ -695,8 +771,6 @@
695771 if (blk_mq_request_started(rq)) {
696772 WRITE_ONCE(rq->state, MQ_RQ_IDLE);
697773 rq->rq_flags &= ~RQF_TIMED_OUT;
698
- if (q->dma_drain_size && blk_rq_bytes(rq))
699
- rq->nr_phys_segments--;
700774 }
701775 }
702776
@@ -707,7 +781,6 @@
707781 /* this request will be re-inserted to io scheduler queue */
708782 blk_mq_sched_requeue_request(rq);
709783
710
- BUG_ON(blk_queued_rq(rq));
711784 blk_mq_add_to_requeue_list(rq, true, kick_requeue_list);
712785 }
713786 EXPORT_SYMBOL(blk_mq_requeue_request);
@@ -735,7 +808,7 @@
735808 * merge.
736809 */
737810 if (rq->rq_flags & RQF_DONTPREP)
738
- blk_mq_request_bypass_insert(rq, false);
811
+ blk_mq_request_bypass_insert(rq, false, false);
739812 else
740813 blk_mq_sched_insert_request(rq, true, false, false);
741814 }
@@ -773,7 +846,6 @@
773846 if (kick_requeue_list)
774847 blk_mq_kick_requeue_list(q);
775848 }
776
-EXPORT_SYMBOL(blk_mq_add_to_requeue_list);
777849
778850 void blk_mq_kick_requeue_list(struct request_queue *q)
779851 {
@@ -800,6 +872,32 @@
800872 }
801873 EXPORT_SYMBOL(blk_mq_tag_to_rq);
802874
875
+static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq,
876
+ void *priv, bool reserved)
877
+{
878
+ /*
879
+ * If we find a request that isn't idle and the queue matches,
880
+ * we know the queue is busy. Return false to stop the iteration.
881
+ */
882
+ if (blk_mq_request_started(rq) && rq->q == hctx->queue) {
883
+ bool *busy = priv;
884
+
885
+ *busy = true;
886
+ return false;
887
+ }
888
+
889
+ return true;
890
+}
891
+
892
+bool blk_mq_queue_inflight(struct request_queue *q)
893
+{
894
+ bool busy = false;
895
+
896
+ blk_mq_queue_tag_busy_iter(q, blk_mq_rq_inflight, &busy);
897
+ return busy;
898
+}
899
+EXPORT_SYMBOL_GPL(blk_mq_queue_inflight);
900
+
803901 static void blk_mq_rq_timed_out(struct request *req, bool reserved)
804902 {
805903 req->rq_flags |= RQF_TIMED_OUT;
@@ -824,7 +922,7 @@
824922 if (rq->rq_flags & RQF_TIMED_OUT)
825923 return false;
826924
827
- deadline = blk_rq_deadline(rq);
925
+ deadline = READ_ONCE(rq->deadline);
828926 if (time_after_eq(jiffies, deadline))
829927 return true;
830928
@@ -835,43 +933,29 @@
835933 return false;
836934 }
837935
838
-static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
936
+void blk_mq_put_rq_ref(struct request *rq)
937
+{
938
+ if (is_flush_rq(rq))
939
+ rq->end_io(rq, 0);
940
+ else if (refcount_dec_and_test(&rq->ref))
941
+ __blk_mq_free_request(rq);
942
+}
943
+
944
+static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
839945 struct request *rq, void *priv, bool reserved)
840946 {
841947 unsigned long *next = priv;
842948
843949 /*
844
- * Just do a quick check if it is expired before locking the request in
845
- * so we're not unnecessarilly synchronizing across CPUs.
846
- */
847
- if (!blk_mq_req_expired(rq, next))
848
- return;
849
-
850
- /*
851
- * We have reason to believe the request may be expired. Take a
852
- * reference on the request to lock this request lifetime into its
853
- * currently allocated context to prevent it from being reallocated in
854
- * the event the completion by-passes this timeout handler.
855
- *
856
- * If the reference was already released, then the driver beat the
857
- * timeout handler to posting a natural completion.
858
- */
859
- if (!refcount_inc_not_zero(&rq->ref))
860
- return;
861
-
862
- /*
863
- * The request is now locked and cannot be reallocated underneath the
864
- * timeout handler's processing. Re-verify this exact request is truly
865
- * expired; if it is not expired, then the request was completed and
866
- * reallocated as a new request.
950
+ * blk_mq_queue_tag_busy_iter() has locked the request, so it cannot
951
+ * be reallocated underneath the timeout handler's processing, then
952
+ * the expire check is reliable. If the request is not expired, then
953
+ * it was completed and reallocated as a new request after returning
954
+ * from blk_mq_check_expired().
867955 */
868956 if (blk_mq_req_expired(rq, next))
869957 blk_mq_rq_timed_out(rq, reserved);
870
-
871
- if (is_flush_rq(rq, hctx))
872
- rq->end_io(rq, 0);
873
- else if (refcount_dec_and_test(&rq->ref))
874
- __blk_mq_free_request(rq);
958
+ return true;
875959 }
876960
877961 static void blk_mq_timeout_work(struct work_struct *work)
@@ -928,9 +1012,10 @@
9281012 struct flush_busy_ctx_data *flush_data = data;
9291013 struct blk_mq_hw_ctx *hctx = flush_data->hctx;
9301014 struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
1015
+ enum hctx_type type = hctx->type;
9311016
9321017 spin_lock(&ctx->lock);
933
- list_splice_tail_init(&ctx->rq_list, flush_data->list);
1018
+ list_splice_tail_init(&ctx->rq_lists[type], flush_data->list);
9341019 sbitmap_clear_bit(sb, bitnr);
9351020 spin_unlock(&ctx->lock);
9361021 return true;
@@ -962,12 +1047,13 @@
9621047 struct dispatch_rq_data *dispatch_data = data;
9631048 struct blk_mq_hw_ctx *hctx = dispatch_data->hctx;
9641049 struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
1050
+ enum hctx_type type = hctx->type;
9651051
9661052 spin_lock(&ctx->lock);
967
- if (!list_empty(&ctx->rq_list)) {
968
- dispatch_data->rq = list_entry_rq(ctx->rq_list.next);
1053
+ if (!list_empty(&ctx->rq_lists[type])) {
1054
+ dispatch_data->rq = list_entry_rq(ctx->rq_lists[type].next);
9691055 list_del_init(&dispatch_data->rq->queuelist);
970
- if (list_empty(&ctx->rq_list))
1056
+ if (list_empty(&ctx->rq_lists[type]))
9711057 sbitmap_clear_bit(sb, bitnr);
9721058 }
9731059 spin_unlock(&ctx->lock);
@@ -978,7 +1064,7 @@
9781064 struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
9791065 struct blk_mq_ctx *start)
9801066 {
981
- unsigned off = start ? start->index_hw : 0;
1067
+ unsigned off = start ? start->index_hw[hctx->type] : 0;
9821068 struct dispatch_rq_data data = {
9831069 .hctx = hctx,
9841070 .rq = NULL,
@@ -998,33 +1084,44 @@
9981084 return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
9991085 }
10001086
1001
-bool blk_mq_get_driver_tag(struct request *rq)
1087
+static bool __blk_mq_get_driver_tag(struct request *rq)
10021088 {
1003
- struct blk_mq_alloc_data data = {
1004
- .q = rq->q,
1005
- .hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu),
1006
- .flags = BLK_MQ_REQ_NOWAIT,
1007
- };
1008
- bool shared;
1089
+ struct sbitmap_queue *bt = rq->mq_hctx->tags->bitmap_tags;
1090
+ unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
1091
+ int tag;
10091092
1010
- if (rq->tag != -1)
1011
- goto done;
1093
+ blk_mq_tag_busy(rq->mq_hctx);
10121094
1013
- if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag))
1014
- data.flags |= BLK_MQ_REQ_RESERVED;
1015
-
1016
- shared = blk_mq_tag_busy(data.hctx);
1017
- rq->tag = blk_mq_get_tag(&data);
1018
- if (rq->tag >= 0) {
1019
- if (shared) {
1020
- rq->rq_flags |= RQF_MQ_INFLIGHT;
1021
- atomic_inc(&data.hctx->nr_active);
1022
- }
1023
- data.hctx->tags->rqs[rq->tag] = rq;
1095
+ if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) {
1096
+ bt = rq->mq_hctx->tags->breserved_tags;
1097
+ tag_offset = 0;
1098
+ } else {
1099
+ if (!hctx_may_queue(rq->mq_hctx, bt))
1100
+ return false;
10241101 }
10251102
1026
-done:
1027
- return rq->tag != -1;
1103
+ tag = __sbitmap_queue_get(bt);
1104
+ if (tag == BLK_MQ_NO_TAG)
1105
+ return false;
1106
+
1107
+ rq->tag = tag + tag_offset;
1108
+ return true;
1109
+}
1110
+
1111
+static bool blk_mq_get_driver_tag(struct request *rq)
1112
+{
1113
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
1114
+
1115
+ if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_get_driver_tag(rq))
1116
+ return false;
1117
+
1118
+ if ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) &&
1119
+ !(rq->rq_flags & RQF_MQ_INFLIGHT)) {
1120
+ rq->rq_flags |= RQF_MQ_INFLIGHT;
1121
+ __blk_mq_inc_active_requests(hctx);
1122
+ }
1123
+ hctx->tags->rqs[rq->tag] = rq;
1124
+ return true;
10281125 }
10291126
10301127 static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
@@ -1035,7 +1132,13 @@
10351132 hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
10361133
10371134 spin_lock(&hctx->dispatch_wait_lock);
1038
- list_del_init(&wait->entry);
1135
+ if (!list_empty(&wait->entry)) {
1136
+ struct sbitmap_queue *sbq;
1137
+
1138
+ list_del_init(&wait->entry);
1139
+ sbq = hctx->tags->bitmap_tags;
1140
+ atomic_dec(&sbq->ws_active);
1141
+ }
10391142 spin_unlock(&hctx->dispatch_wait_lock);
10401143
10411144 blk_mq_run_hw_queue(hctx, true);
@@ -1051,13 +1154,13 @@
10511154 static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
10521155 struct request *rq)
10531156 {
1157
+ struct sbitmap_queue *sbq = hctx->tags->bitmap_tags;
10541158 struct wait_queue_head *wq;
10551159 wait_queue_entry_t *wait;
10561160 bool ret;
10571161
1058
- if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) {
1059
- if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
1060
- set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
1162
+ if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
1163
+ blk_mq_sched_mark_restart_hctx(hctx);
10611164
10621165 /*
10631166 * It's possible that a tag was freed in the window between the
@@ -1074,7 +1177,7 @@
10741177 if (!list_empty_careful(&wait->entry))
10751178 return false;
10761179
1077
- wq = &bt_wait_ptr(&hctx->tags->bitmap_tags, hctx)->wait;
1180
+ wq = &bt_wait_ptr(sbq, hctx)->wait;
10781181
10791182 spin_lock_irq(&wq->lock);
10801183 spin_lock(&hctx->dispatch_wait_lock);
@@ -1084,6 +1187,7 @@
10841187 return false;
10851188 }
10861189
1190
+ atomic_inc(&sbq->ws_active);
10871191 wait->flags &= ~WQ_FLAG_EXCLUSIVE;
10881192 __add_wait_queue(wq, wait);
10891193
@@ -1104,6 +1208,7 @@
11041208 * someone else gets the wakeup.
11051209 */
11061210 list_del_init(&wait->entry);
1211
+ atomic_dec(&sbq->ws_active);
11071212 spin_unlock(&hctx->dispatch_wait_lock);
11081213 spin_unlock_irq(&wq->lock);
11091214
@@ -1122,9 +1227,6 @@
11221227 static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
11231228 {
11241229 unsigned int ewma;
1125
-
1126
- if (hctx->queue->elevator)
1127
- return;
11281230
11291231 ewma = hctx->dispatch_busy;
11301232
@@ -1158,22 +1260,83 @@
11581260 __blk_mq_requeue_request(rq);
11591261 }
11601262
1263
+static void blk_mq_handle_zone_resource(struct request *rq,
1264
+ struct list_head *zone_list)
1265
+{
1266
+ /*
1267
+ * If we end up here it is because we cannot dispatch a request to a
1268
+ * specific zone due to LLD level zone-write locking or other zone
1269
+ * related resource not being available. In this case, set the request
1270
+ * aside in zone_list for retrying it later.
1271
+ */
1272
+ list_add(&rq->queuelist, zone_list);
1273
+ __blk_mq_requeue_request(rq);
1274
+}
1275
+
1276
+enum prep_dispatch {
1277
+ PREP_DISPATCH_OK,
1278
+ PREP_DISPATCH_NO_TAG,
1279
+ PREP_DISPATCH_NO_BUDGET,
1280
+};
1281
+
1282
+static enum prep_dispatch blk_mq_prep_dispatch_rq(struct request *rq,
1283
+ bool need_budget)
1284
+{
1285
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
1286
+
1287
+ if (need_budget && !blk_mq_get_dispatch_budget(rq->q)) {
1288
+ blk_mq_put_driver_tag(rq);
1289
+ return PREP_DISPATCH_NO_BUDGET;
1290
+ }
1291
+
1292
+ if (!blk_mq_get_driver_tag(rq)) {
1293
+ /*
1294
+ * The initial allocation attempt failed, so we need to
1295
+ * rerun the hardware queue when a tag is freed. The
1296
+ * waitqueue takes care of that. If the queue is run
1297
+ * before we add this entry back on the dispatch list,
1298
+ * we'll re-run it below.
1299
+ */
1300
+ if (!blk_mq_mark_tag_wait(hctx, rq)) {
1301
+ /*
1302
+ * All budgets not got from this function will be put
1303
+ * together during handling partial dispatch
1304
+ */
1305
+ if (need_budget)
1306
+ blk_mq_put_dispatch_budget(rq->q);
1307
+ return PREP_DISPATCH_NO_TAG;
1308
+ }
1309
+ }
1310
+
1311
+ return PREP_DISPATCH_OK;
1312
+}
1313
+
1314
+/* release all allocated budgets before calling to blk_mq_dispatch_rq_list */
1315
+static void blk_mq_release_budgets(struct request_queue *q,
1316
+ unsigned int nr_budgets)
1317
+{
1318
+ int i;
1319
+
1320
+ for (i = 0; i < nr_budgets; i++)
1321
+ blk_mq_put_dispatch_budget(q);
1322
+}
1323
+
11611324 /*
11621325 * Returns true if we did some work AND can potentially do more.
11631326 */
1164
-bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
1165
- bool got_budget)
1327
+bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
1328
+ unsigned int nr_budgets)
11661329 {
1167
- struct blk_mq_hw_ctx *hctx;
1330
+ enum prep_dispatch prep;
1331
+ struct request_queue *q = hctx->queue;
11681332 struct request *rq, *nxt;
1169
- bool no_tag = false;
11701333 int errors, queued;
11711334 blk_status_t ret = BLK_STS_OK;
1335
+ LIST_HEAD(zone_list);
1336
+ bool needs_resource = false;
11721337
11731338 if (list_empty(list))
11741339 return false;
1175
-
1176
- WARN_ON(!list_is_singular(list) && got_budget);
11771340
11781341 /*
11791342 * Now process all the entries, sending them to the driver.
@@ -1184,29 +1347,10 @@
11841347
11851348 rq = list_first_entry(list, struct request, queuelist);
11861349
1187
- hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu);
1188
- if (!got_budget && !blk_mq_get_dispatch_budget(hctx))
1350
+ WARN_ON_ONCE(hctx != rq->mq_hctx);
1351
+ prep = blk_mq_prep_dispatch_rq(rq, !nr_budgets);
1352
+ if (prep != PREP_DISPATCH_OK)
11891353 break;
1190
-
1191
- if (!blk_mq_get_driver_tag(rq)) {
1192
- /*
1193
- * The initial allocation attempt failed, so we need to
1194
- * rerun the hardware queue when a tag is freed. The
1195
- * waitqueue takes care of that. If the queue is run
1196
- * before we add this entry back on the dispatch list,
1197
- * we'll re-run it below.
1198
- */
1199
- if (!blk_mq_mark_tag_wait(hctx, rq)) {
1200
- blk_mq_put_dispatch_budget(hctx);
1201
- /*
1202
- * For non-shared tags, the RESTART check
1203
- * will suffice.
1204
- */
1205
- if (hctx->flags & BLK_MQ_F_TAG_SHARED)
1206
- no_tag = true;
1207
- break;
1208
- }
1209
- }
12101354
12111355 list_del_init(&rq->queuelist);
12121356
@@ -1223,32 +1367,63 @@
12231367 bd.last = !blk_mq_get_driver_tag(nxt);
12241368 }
12251369
1370
+ /*
1371
+ * once the request is queued to lld, no need to cover the
1372
+ * budget any more
1373
+ */
1374
+ if (nr_budgets)
1375
+ nr_budgets--;
12261376 ret = q->mq_ops->queue_rq(hctx, &bd);
1227
- if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {
1228
- blk_mq_handle_dev_resource(rq, list);
1377
+ switch (ret) {
1378
+ case BLK_STS_OK:
1379
+ queued++;
12291380 break;
1230
- }
1231
-
1232
- if (unlikely(ret != BLK_STS_OK)) {
1381
+ case BLK_STS_RESOURCE:
1382
+ needs_resource = true;
1383
+ fallthrough;
1384
+ case BLK_STS_DEV_RESOURCE:
1385
+ blk_mq_handle_dev_resource(rq, list);
1386
+ goto out;
1387
+ case BLK_STS_ZONE_RESOURCE:
1388
+ /*
1389
+ * Move the request to zone_list and keep going through
1390
+ * the dispatch list to find more requests the drive can
1391
+ * accept.
1392
+ */
1393
+ blk_mq_handle_zone_resource(rq, &zone_list);
1394
+ needs_resource = true;
1395
+ break;
1396
+ default:
12331397 errors++;
12341398 blk_mq_end_request(rq, BLK_STS_IOERR);
1235
- continue;
12361399 }
1237
-
1238
- queued++;
12391400 } while (!list_empty(list));
1401
+out:
1402
+ if (!list_empty(&zone_list))
1403
+ list_splice_tail_init(&zone_list, list);
12401404
12411405 hctx->dispatched[queued_to_index(queued)]++;
12421406
1407
+ /* If we didn't flush the entire list, we could have told the driver
1408
+ * there was more coming, but that turned out to be a lie.
1409
+ */
1410
+ if ((!list_empty(list) || errors || needs_resource ||
1411
+ ret == BLK_STS_DEV_RESOURCE) && q->mq_ops->commit_rqs && queued)
1412
+ q->mq_ops->commit_rqs(hctx);
12431413 /*
12441414 * Any items that need requeuing? Stuff them into hctx->dispatch,
12451415 * that is where we will continue on next queue run.
12461416 */
12471417 if (!list_empty(list)) {
12481418 bool needs_restart;
1419
+ /* For non-shared tags, the RESTART check will suffice */
1420
+ bool no_tag = prep == PREP_DISPATCH_NO_TAG &&
1421
+ (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED);
1422
+
1423
+ blk_mq_release_budgets(q, nr_budgets);
12491424
12501425 spin_lock(&hctx->lock);
1251
- list_splice_init(list, &hctx->dispatch);
1426
+ list_splice_tail_init(list, &hctx->dispatch);
12521427 spin_unlock(&hctx->lock);
12531428
12541429 /*
@@ -1282,13 +1457,17 @@
12821457 *
12831458 * If driver returns BLK_STS_RESOURCE and SCHED_RESTART
12841459 * bit is set, run queue after a delay to avoid IO stalls
1285
- * that could otherwise occur if the queue is idle.
1460
+ * that could otherwise occur if the queue is idle. We'll do
1461
+ * similar if we couldn't get budget or couldn't lock a zone
1462
+ * and SCHED_RESTART is set.
12861463 */
12871464 needs_restart = blk_mq_sched_needs_restart(hctx);
1465
+ if (prep == PREP_DISPATCH_NO_BUDGET)
1466
+ needs_resource = true;
12881467 if (!needs_restart ||
12891468 (no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
12901469 blk_mq_run_hw_queue(hctx, true);
1291
- else if (needs_restart && (ret == BLK_STS_RESOURCE))
1470
+ else if (needs_restart && needs_resource)
12921471 blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);
12931472
12941473 blk_mq_update_dispatch_busy(hctx, true);
@@ -1296,16 +1475,15 @@
12961475 } else
12971476 blk_mq_update_dispatch_busy(hctx, false);
12981477
1299
- /*
1300
- * If the host/device is unable to accept more work, inform the
1301
- * caller of that.
1302
- */
1303
- if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
1304
- return false;
1305
-
13061478 return (queued + errors) != 0;
13071479 }
13081480
1481
+/**
1482
+ * __blk_mq_run_hw_queue - Run a hardware queue.
1483
+ * @hctx: Pointer to the hardware queue to run.
1484
+ *
1485
+ * Send pending requests to the hardware.
1486
+ */
13091487 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
13101488 {
13111489 int srcu_idx;
@@ -1403,6 +1581,15 @@
14031581 return next_cpu;
14041582 }
14051583
1584
+/**
1585
+ * __blk_mq_delay_run_hw_queue - Run (or schedule to run) a hardware queue.
1586
+ * @hctx: Pointer to the hardware queue to run.
1587
+ * @async: If we want to run the queue asynchronously.
1588
+ * @msecs: Microseconds of delay to wait before running the queue.
1589
+ *
1590
+ * If !@async, try to run the queue now. Else, run the queue asynchronously and
1591
+ * with a delay of @msecs.
1592
+ */
14061593 static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
14071594 unsigned long msecs)
14081595 {
@@ -1410,27 +1597,43 @@
14101597 return;
14111598
14121599 if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
1413
- int cpu = get_cpu_light();
1600
+ int cpu = get_cpu();
14141601 if (cpumask_test_cpu(cpu, hctx->cpumask)) {
14151602 __blk_mq_run_hw_queue(hctx);
1416
- put_cpu_light();
1603
+ put_cpu();
14171604 return;
14181605 }
14191606
1420
- put_cpu_light();
1607
+ put_cpu();
14211608 }
14221609
14231610 kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work,
14241611 msecs_to_jiffies(msecs));
14251612 }
14261613
1614
+/**
1615
+ * blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously.
1616
+ * @hctx: Pointer to the hardware queue to run.
1617
+ * @msecs: Microseconds of delay to wait before running the queue.
1618
+ *
1619
+ * Run a hardware queue asynchronously with a delay of @msecs.
1620
+ */
14271621 void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
14281622 {
14291623 __blk_mq_delay_run_hw_queue(hctx, true, msecs);
14301624 }
14311625 EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
14321626
1433
-bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
1627
+/**
1628
+ * blk_mq_run_hw_queue - Start to run a hardware queue.
1629
+ * @hctx: Pointer to the hardware queue to run.
1630
+ * @async: If we want to run the queue asynchronously.
1631
+ *
1632
+ * Check if the request queue is not in a quiesced state and if there are
1633
+ * pending requests to be sent. If this is true, run the queue to send requests
1634
+ * to hardware.
1635
+ */
1636
+void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
14341637 {
14351638 int srcu_idx;
14361639 bool need_run;
@@ -1448,28 +1651,101 @@
14481651 blk_mq_hctx_has_pending(hctx);
14491652 hctx_unlock(hctx, srcu_idx);
14501653
1451
- if (need_run) {
1654
+ if (need_run)
14521655 __blk_mq_delay_run_hw_queue(hctx, async, 0);
1453
- return true;
1454
- }
1455
-
1456
- return false;
14571656 }
14581657 EXPORT_SYMBOL(blk_mq_run_hw_queue);
14591658
1659
+/*
1660
+ * Is the request queue handled by an IO scheduler that does not respect
1661
+ * hardware queues when dispatching?
1662
+ */
1663
+static bool blk_mq_has_sqsched(struct request_queue *q)
1664
+{
1665
+ struct elevator_queue *e = q->elevator;
1666
+
1667
+ if (e && e->type->ops.dispatch_request &&
1668
+ !(e->type->elevator_features & ELEVATOR_F_MQ_AWARE))
1669
+ return true;
1670
+ return false;
1671
+}
1672
+
1673
+/*
1674
+ * Return prefered queue to dispatch from (if any) for non-mq aware IO
1675
+ * scheduler.
1676
+ */
1677
+static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q)
1678
+{
1679
+ struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
1680
+ /*
1681
+ * If the IO scheduler does not respect hardware queues when
1682
+ * dispatching, we just don't bother with multiple HW queues and
1683
+ * dispatch from hctx for the current CPU since running multiple queues
1684
+ * just causes lock contention inside the scheduler and pointless cache
1685
+ * bouncing.
1686
+ */
1687
+ struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, 0, ctx);
1688
+
1689
+ if (!blk_mq_hctx_stopped(hctx))
1690
+ return hctx;
1691
+ return NULL;
1692
+}
1693
+
1694
+/**
1695
+ * blk_mq_run_hw_queues - Run all hardware queues in a request queue.
1696
+ * @q: Pointer to the request queue to run.
1697
+ * @async: If we want to run the queue asynchronously.
1698
+ */
14601699 void blk_mq_run_hw_queues(struct request_queue *q, bool async)
14611700 {
1462
- struct blk_mq_hw_ctx *hctx;
1701
+ struct blk_mq_hw_ctx *hctx, *sq_hctx;
14631702 int i;
14641703
1704
+ sq_hctx = NULL;
1705
+ if (blk_mq_has_sqsched(q))
1706
+ sq_hctx = blk_mq_get_sq_hctx(q);
14651707 queue_for_each_hw_ctx(q, hctx, i) {
14661708 if (blk_mq_hctx_stopped(hctx))
14671709 continue;
1468
-
1469
- blk_mq_run_hw_queue(hctx, async);
1710
+ /*
1711
+ * Dispatch from this hctx either if there's no hctx preferred
1712
+ * by IO scheduler or if it has requests that bypass the
1713
+ * scheduler.
1714
+ */
1715
+ if (!sq_hctx || sq_hctx == hctx ||
1716
+ !list_empty_careful(&hctx->dispatch))
1717
+ blk_mq_run_hw_queue(hctx, async);
14701718 }
14711719 }
14721720 EXPORT_SYMBOL(blk_mq_run_hw_queues);
1721
+
1722
+/**
1723
+ * blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously.
1724
+ * @q: Pointer to the request queue to run.
1725
+ * @msecs: Microseconds of delay to wait before running the queues.
1726
+ */
1727
+void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
1728
+{
1729
+ struct blk_mq_hw_ctx *hctx, *sq_hctx;
1730
+ int i;
1731
+
1732
+ sq_hctx = NULL;
1733
+ if (blk_mq_has_sqsched(q))
1734
+ sq_hctx = blk_mq_get_sq_hctx(q);
1735
+ queue_for_each_hw_ctx(q, hctx, i) {
1736
+ if (blk_mq_hctx_stopped(hctx))
1737
+ continue;
1738
+ /*
1739
+ * Dispatch from this hctx either if there's no hctx preferred
1740
+ * by IO scheduler or if it has requests that bypass the
1741
+ * scheduler.
1742
+ */
1743
+ if (!sq_hctx || sq_hctx == hctx ||
1744
+ !list_empty_careful(&hctx->dispatch))
1745
+ blk_mq_delay_run_hw_queue(hctx, msecs);
1746
+ }
1747
+}
1748
+EXPORT_SYMBOL(blk_mq_delay_run_hw_queues);
14731749
14741750 /**
14751751 * blk_mq_queue_stopped() - check whether one or more hctxs have been stopped
@@ -1574,7 +1850,7 @@
15741850 /*
15751851 * If we are stopped, don't run the queue.
15761852 */
1577
- if (test_bit(BLK_MQ_S_STOPPED, &hctx->state))
1853
+ if (blk_mq_hctx_stopped(hctx))
15781854 return;
15791855
15801856 __blk_mq_run_hw_queue(hctx);
@@ -1585,15 +1861,16 @@
15851861 bool at_head)
15861862 {
15871863 struct blk_mq_ctx *ctx = rq->mq_ctx;
1864
+ enum hctx_type type = hctx->type;
15881865
15891866 lockdep_assert_held(&ctx->lock);
15901867
15911868 trace_block_rq_insert(hctx->queue, rq);
15921869
15931870 if (at_head)
1594
- list_add(&rq->queuelist, &ctx->rq_list);
1871
+ list_add(&rq->queuelist, &ctx->rq_lists[type]);
15951872 else
1596
- list_add_tail(&rq->queuelist, &ctx->rq_list);
1873
+ list_add_tail(&rq->queuelist, &ctx->rq_lists[type]);
15971874 }
15981875
15991876 void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
@@ -1607,17 +1884,25 @@
16071884 blk_mq_hctx_mark_pending(hctx, ctx);
16081885 }
16091886
1610
-/*
1887
+/**
1888
+ * blk_mq_request_bypass_insert - Insert a request at dispatch list.
1889
+ * @rq: Pointer to request to be inserted.
1890
+ * @at_head: true if the request should be inserted at the head of the list.
1891
+ * @run_queue: If we should run the hardware queue after inserting the request.
1892
+ *
16111893 * Should only be used carefully, when the caller knows we want to
16121894 * bypass a potential IO scheduler on the target device.
16131895 */
1614
-void blk_mq_request_bypass_insert(struct request *rq, bool run_queue)
1896
+void blk_mq_request_bypass_insert(struct request *rq, bool at_head,
1897
+ bool run_queue)
16151898 {
1616
- struct blk_mq_ctx *ctx = rq->mq_ctx;
1617
- struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
1899
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
16181900
16191901 spin_lock(&hctx->lock);
1620
- list_add_tail(&rq->queuelist, &hctx->dispatch);
1902
+ if (at_head)
1903
+ list_add(&rq->queuelist, &hctx->dispatch);
1904
+ else
1905
+ list_add_tail(&rq->queuelist, &hctx->dispatch);
16211906 spin_unlock(&hctx->lock);
16221907
16231908 if (run_queue)
@@ -1629,6 +1914,7 @@
16291914
16301915 {
16311916 struct request *rq;
1917
+ enum hctx_type type = hctx->type;
16321918
16331919 /*
16341920 * preemption doesn't flush plug list, so it's possible ctx->cpu is
@@ -1640,95 +1926,87 @@
16401926 }
16411927
16421928 spin_lock(&ctx->lock);
1643
- list_splice_tail_init(list, &ctx->rq_list);
1929
+ list_splice_tail_init(list, &ctx->rq_lists[type]);
16441930 blk_mq_hctx_mark_pending(hctx, ctx);
16451931 spin_unlock(&ctx->lock);
16461932 }
16471933
1648
-static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
1934
+static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
16491935 {
16501936 struct request *rqa = container_of(a, struct request, queuelist);
16511937 struct request *rqb = container_of(b, struct request, queuelist);
16521938
1653
- return !(rqa->mq_ctx < rqb->mq_ctx ||
1654
- (rqa->mq_ctx == rqb->mq_ctx &&
1655
- blk_rq_pos(rqa) < blk_rq_pos(rqb)));
1939
+ if (rqa->mq_ctx != rqb->mq_ctx)
1940
+ return rqa->mq_ctx > rqb->mq_ctx;
1941
+ if (rqa->mq_hctx != rqb->mq_hctx)
1942
+ return rqa->mq_hctx > rqb->mq_hctx;
1943
+
1944
+ return blk_rq_pos(rqa) > blk_rq_pos(rqb);
16561945 }
16571946
16581947 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
16591948 {
1660
- struct blk_mq_ctx *this_ctx;
1661
- struct request_queue *this_q;
1662
- struct request *rq;
16631949 LIST_HEAD(list);
1664
- LIST_HEAD(ctx_list);
1665
- unsigned int depth;
16661950
1951
+ if (list_empty(&plug->mq_list))
1952
+ return;
16671953 list_splice_init(&plug->mq_list, &list);
16681954
1669
- list_sort(NULL, &list, plug_ctx_cmp);
1955
+ if (plug->rq_count > 2 && plug->multiple_queues)
1956
+ list_sort(NULL, &list, plug_rq_cmp);
16701957
1671
- this_q = NULL;
1672
- this_ctx = NULL;
1673
- depth = 0;
1958
+ plug->rq_count = 0;
16741959
1675
- while (!list_empty(&list)) {
1676
- rq = list_entry_rq(list.next);
1677
- list_del_init(&rq->queuelist);
1678
- BUG_ON(!rq->q);
1679
- if (rq->mq_ctx != this_ctx) {
1680
- if (this_ctx) {
1681
- trace_block_unplug(this_q, depth, !from_schedule);
1682
- blk_mq_sched_insert_requests(this_q, this_ctx,
1683
- &ctx_list,
1684
- from_schedule);
1685
- }
1960
+ do {
1961
+ struct list_head rq_list;
1962
+ struct request *rq, *head_rq = list_entry_rq(list.next);
1963
+ struct list_head *pos = &head_rq->queuelist; /* skip first */
1964
+ struct blk_mq_hw_ctx *this_hctx = head_rq->mq_hctx;
1965
+ struct blk_mq_ctx *this_ctx = head_rq->mq_ctx;
1966
+ unsigned int depth = 1;
16861967
1687
- this_ctx = rq->mq_ctx;
1688
- this_q = rq->q;
1689
- depth = 0;
1968
+ list_for_each_continue(pos, &list) {
1969
+ rq = list_entry_rq(pos);
1970
+ BUG_ON(!rq->q);
1971
+ if (rq->mq_hctx != this_hctx || rq->mq_ctx != this_ctx)
1972
+ break;
1973
+ depth++;
16901974 }
16911975
1692
- depth++;
1693
- list_add_tail(&rq->queuelist, &ctx_list);
1694
- }
1695
-
1696
- /*
1697
- * If 'this_ctx' is set, we know we have entries to complete
1698
- * on 'ctx_list'. Do those.
1699
- */
1700
- if (this_ctx) {
1701
- trace_block_unplug(this_q, depth, !from_schedule);
1702
- blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list,
1976
+ list_cut_before(&rq_list, &list, pos);
1977
+ trace_block_unplug(head_rq->q, depth, !from_schedule);
1978
+ blk_mq_sched_insert_requests(this_hctx, this_ctx, &rq_list,
17031979 from_schedule);
1704
- }
1980
+ } while(!list_empty(&list));
17051981 }
17061982
1707
-static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
1983
+static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
1984
+ unsigned int nr_segs)
17081985 {
1709
- blk_init_request_from_bio(rq, bio);
1986
+ int err;
17101987
1711
- blk_rq_set_rl(rq, blk_get_rl(rq->q, bio));
1988
+ if (bio->bi_opf & REQ_RAHEAD)
1989
+ rq->cmd_flags |= REQ_FAILFAST_MASK;
17121990
1713
- blk_account_io_start(rq, true);
1714
-}
1991
+ rq->__sector = bio->bi_iter.bi_sector;
1992
+ rq->write_hint = bio->bi_write_hint;
1993
+ blk_rq_bio_prep(rq, bio, nr_segs);
17151994
1716
-static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq)
1717
-{
1718
- if (rq->tag != -1)
1719
- return blk_tag_to_qc_t(rq->tag, hctx->queue_num, false);
1995
+ /* This can't fail, since GFP_NOIO includes __GFP_DIRECT_RECLAIM. */
1996
+ err = blk_crypto_rq_bio_prep(rq, bio, GFP_NOIO);
1997
+ WARN_ON_ONCE(err);
17201998
1721
- return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true);
1999
+ blk_account_io_start(rq);
17222000 }
17232001
17242002 static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
17252003 struct request *rq,
1726
- blk_qc_t *cookie)
2004
+ blk_qc_t *cookie, bool last)
17272005 {
17282006 struct request_queue *q = rq->q;
17292007 struct blk_mq_queue_data bd = {
17302008 .rq = rq,
1731
- .last = true,
2009
+ .last = last,
17322010 };
17332011 blk_qc_t new_cookie;
17342012 blk_status_t ret;
....@@ -1763,7 +2041,7 @@
17632041 static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
17642042 struct request *rq,
17652043 blk_qc_t *cookie,
1766
- bool bypass_insert)
2044
+ bool bypass_insert, bool last)
17672045 {
17682046 struct request_queue *q = rq->q;
17692047 bool run_queue = true;
....@@ -1784,23 +2062,35 @@
17842062 if (q->elevator && !bypass_insert)
17852063 goto insert;
17862064
1787
- if (!blk_mq_get_dispatch_budget(hctx))
2065
+ if (!blk_mq_get_dispatch_budget(q))
17882066 goto insert;
17892067
17902068 if (!blk_mq_get_driver_tag(rq)) {
1791
- blk_mq_put_dispatch_budget(hctx);
2069
+ blk_mq_put_dispatch_budget(q);
17922070 goto insert;
17932071 }
17942072
1795
- return __blk_mq_issue_directly(hctx, rq, cookie);
2073
+ return __blk_mq_issue_directly(hctx, rq, cookie, last);
17962074 insert:
17972075 if (bypass_insert)
17982076 return BLK_STS_RESOURCE;
17992077
1800
- blk_mq_request_bypass_insert(rq, run_queue);
2078
+ blk_mq_sched_insert_request(rq, false, run_queue, false);
2079
+
18012080 return BLK_STS_OK;
18022081 }
18032082
2083
+/**
2084
+ * blk_mq_try_issue_directly - Try to send a request directly to the device driver.
2085
+ * @hctx: Pointer to the associated hardware queue.
2086
+ * @rq: Pointer to the request to be sent.
2087
+ * @cookie: Request queue cookie.
2088
+ *
2089
+ * If the device has enough resources to accept a new request now, send the
2090
+ * request directly to the device driver. Else, insert it into the
2091
+ * hctx->dispatch queue, so we can try to send it again in the future.
2092
+ * Requests inserted into this queue have higher priority.
2093
+ */
18042094 static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
18052095 struct request *rq, blk_qc_t *cookie)
18062096 {
....@@ -1811,25 +2101,24 @@
18112101
18122102 hctx_lock(hctx, &srcu_idx);
18132103
1814
- ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false);
2104
+ ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false, true);
18152105 if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
1816
- blk_mq_request_bypass_insert(rq, true);
2106
+ blk_mq_request_bypass_insert(rq, false, true);
18172107 else if (ret != BLK_STS_OK)
18182108 blk_mq_end_request(rq, ret);
18192109
18202110 hctx_unlock(hctx, srcu_idx);
18212111 }
18222112
1823
-blk_status_t blk_mq_request_issue_directly(struct request *rq)
2113
+blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
18242114 {
18252115 blk_status_t ret;
18262116 int srcu_idx;
18272117 blk_qc_t unused_cookie;
1828
- struct blk_mq_ctx *ctx = rq->mq_ctx;
1829
- struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
2118
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
18302119
18312120 hctx_lock(hctx, &srcu_idx);
1832
- ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true);
2121
+ ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true, last);
18332122 hctx_unlock(hctx, srcu_idx);
18342123
18352124 return ret;
....@@ -1838,104 +2127,169 @@
18382127 void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
18392128 struct list_head *list)
18402129 {
2130
+ int queued = 0;
2131
+ int errors = 0;
2132
+
18412133 while (!list_empty(list)) {
18422134 blk_status_t ret;
18432135 struct request *rq = list_first_entry(list, struct request,
18442136 queuelist);
18452137
18462138 list_del_init(&rq->queuelist);
1847
- ret = blk_mq_request_issue_directly(rq);
2139
+ ret = blk_mq_request_issue_directly(rq, list_empty(list));
18482140 if (ret != BLK_STS_OK) {
2141
+ errors++;
18492142 if (ret == BLK_STS_RESOURCE ||
18502143 ret == BLK_STS_DEV_RESOURCE) {
1851
- blk_mq_request_bypass_insert(rq,
2144
+ blk_mq_request_bypass_insert(rq, false,
18522145 list_empty(list));
18532146 break;
18542147 }
18552148 blk_mq_end_request(rq, ret);
1856
- }
2149
+ } else
2150
+ queued++;
2151
+ }
2152
+
2153
+ /*
2154
+ * If we didn't flush the entire list, we could have told
2155
+ * the driver there was more coming, but that turned out to
2156
+ * be a lie.
2157
+ */
2158
+ if ((!list_empty(list) || errors) &&
2159
+ hctx->queue->mq_ops->commit_rqs && queued)
2160
+ hctx->queue->mq_ops->commit_rqs(hctx);
2161
+}
2162
+
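The comment above describes the bd->last contract: blk_mq_request_issue_directly() tells the driver whether more requests follow, and ->commit_rqs() exists so a driver that defers its doorbell until bd->last is set still gets kicked when the block layer stops issuing early. A hedged driver-side sketch, not from this patch (struct demo_dev and the demo_* helpers are hypothetical):

	struct demo_dev;					/* hypothetical driver state */
	static void demo_post_descriptor(struct demo_dev *dev, struct request *rq);
	static void demo_ring_doorbell(struct demo_dev *dev);

	static blk_status_t demo_queue_rq(struct blk_mq_hw_ctx *hctx,
					  const struct blk_mq_queue_data *bd)
	{
		struct demo_dev *dev = hctx->driver_data;

		demo_post_descriptor(dev, bd->rq);	/* queue it in the ring */
		if (bd->last)
			demo_ring_doorbell(dev);	/* kick hardware on the last one */
		return BLK_STS_OK;
	}

	static void demo_commit_rqs(struct blk_mq_hw_ctx *hctx)
	{
		/* Runs when bd->last turned out to be a lie: flush what was queued. */
		demo_ring_doorbell(hctx->driver_data);
	}

This is why the code above only calls ->commit_rqs() when something was actually queued and the list was cut short by an error or resource shortage.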
2163
+static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
2164
+{
2165
+ list_add_tail(&rq->queuelist, &plug->mq_list);
2166
+ plug->rq_count++;
2167
+ if (!plug->multiple_queues && !list_is_singular(&plug->mq_list)) {
2168
+ struct request *tmp;
2169
+
2170
+ tmp = list_first_entry(&plug->mq_list, struct request,
2171
+ queuelist);
2172
+ if (tmp->q != rq->q)
2173
+ plug->multiple_queues = true;
18572174 }
18582175 }
18592176
1860
-static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
2177
+/*
2178
+ * Allow 2x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple
2179
+ * queues. This is important for md arrays to benefit from merging
2180
+ * requests.
2181
+ */
2182
+static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug)
18612183 {
2184
+ if (plug->multiple_queues)
2185
+ return BLK_MAX_REQUEST_COUNT * 2;
2186
+ return BLK_MAX_REQUEST_COUNT;
2187
+}
2188
+
2189
+/**
2190
+ * blk_mq_submit_bio - Create and send a request to the block device.
2191
+ * @bio: Bio pointer.
2192
+ *
2193
+ * Builds up a request structure from the request queue and @bio and sends it
2194
+ * to the device. The request may not be queued directly to hardware if:
2195
+ * * This request can be merged with another one
2196
+ * * We want to place the request on the plug queue for possible future merging
2197
+ * * There is an IO scheduler active on this queue
2198
+ *
2199
+ * It will not queue the request if there is an error with the bio, or at
2200
+ * request creation.
2201
+ *
2202
+ * Returns: Request queue cookie.
2203
+ */
2204
+blk_qc_t blk_mq_submit_bio(struct bio *bio)
2205
+{
2206
+ struct request_queue *q = bio->bi_disk->queue;
18622207 const int is_sync = op_is_sync(bio->bi_opf);
18632208 const int is_flush_fua = op_is_flush(bio->bi_opf);
1864
- struct blk_mq_alloc_data data = { .flags = 0 };
2209
+ struct blk_mq_alloc_data data = {
2210
+ .q = q,
2211
+ };
18652212 struct request *rq;
1866
- unsigned int request_count = 0;
18672213 struct blk_plug *plug;
18682214 struct request *same_queue_rq = NULL;
2215
+ unsigned int nr_segs;
18692216 blk_qc_t cookie;
2217
+ blk_status_t ret;
18702218
18712219 blk_queue_bounce(q, &bio);
1872
-
1873
- blk_queue_split(q, &bio);
2220
+ __blk_queue_split(&bio, &nr_segs);
18742221
18752222 if (!bio_integrity_prep(bio))
1876
- return BLK_QC_T_NONE;
2223
+ goto queue_exit;
18772224
18782225 if (!is_flush_fua && !blk_queue_nomerges(q) &&
1879
- blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
1880
- return BLK_QC_T_NONE;
2226
+ blk_attempt_plug_merge(q, bio, nr_segs, &same_queue_rq))
2227
+ goto queue_exit;
18812228
1882
- if (blk_mq_sched_bio_merge(q, bio))
1883
- return BLK_QC_T_NONE;
2229
+ if (blk_mq_sched_bio_merge(q, bio, nr_segs))
2230
+ goto queue_exit;
18842231
1885
- rq_qos_throttle(q, bio, NULL);
2232
+ rq_qos_throttle(q, bio);
18862233
1887
- trace_block_getrq(q, bio, bio->bi_opf);
1888
-
1889
- rq = blk_mq_get_request(q, bio, bio->bi_opf, &data);
2234
+ data.cmd_flags = bio->bi_opf;
2235
+ rq = __blk_mq_alloc_request(&data);
18902236 if (unlikely(!rq)) {
18912237 rq_qos_cleanup(q, bio);
18922238 if (bio->bi_opf & REQ_NOWAIT)
18932239 bio_wouldblock_error(bio);
1894
- return BLK_QC_T_NONE;
2240
+ goto queue_exit;
18952241 }
2242
+
2243
+ trace_block_getrq(q, bio, bio->bi_opf);
18962244
18972245 rq_qos_track(q, rq, bio);
18982246
18992247 cookie = request_to_qc_t(data.hctx, rq);
19002248
1901
- plug = current->plug;
1902
- if (unlikely(is_flush_fua)) {
1903
- blk_mq_put_ctx(data.ctx);
1904
- blk_mq_bio_to_request(rq, bio);
2249
+ blk_mq_bio_to_request(rq, bio, nr_segs);
19052250
1906
- /* bypass scheduler for flush rq */
2251
+ ret = blk_crypto_rq_get_keyslot(rq);
2252
+ if (ret != BLK_STS_OK) {
2253
+ bio->bi_status = ret;
2254
+ bio_endio(bio);
2255
+ blk_mq_free_request(rq);
2256
+ return BLK_QC_T_NONE;
2257
+ }
2258
+
2259
+ plug = blk_mq_plug(q, bio);
2260
+ if (unlikely(is_flush_fua)) {
2261
+ /* Bypass scheduler for flush requests */
19072262 blk_insert_flush(rq);
19082263 blk_mq_run_hw_queue(data.hctx, true);
1909
- } else if (plug && q->nr_hw_queues == 1) {
1910
- struct request *last = NULL;
1911
-
1912
- blk_mq_put_ctx(data.ctx);
1913
- blk_mq_bio_to_request(rq, bio);
1914
-
2264
+ } else if (plug && (q->nr_hw_queues == 1 ||
2265
+ blk_mq_is_sbitmap_shared(rq->mq_hctx->flags) ||
2266
+ q->mq_ops->commit_rqs || !blk_queue_nonrot(q))) {
19152267 /*
1916
- * @request_count may become stale because of schedule
1917
- * out, so check the list again.
2268
+ * Use plugging if we have a ->commit_rqs() hook as well, as
2269
+ * we know the driver uses bd->last in a smart fashion.
2270
+ *
2271
+ * Use normal plugging if this disk is a slow HDD, as sequential
2272
+ * IO may benefit a lot from plug merging.
19182273 */
1919
- if (list_empty(&plug->mq_list))
1920
- request_count = 0;
1921
- else if (blk_queue_nomerges(q))
1922
- request_count = blk_plug_queued_count(q);
2274
+ unsigned int request_count = plug->rq_count;
2275
+ struct request *last = NULL;
19232276
19242277 if (!request_count)
19252278 trace_block_plug(q);
19262279 else
19272280 last = list_entry_rq(plug->mq_list.prev);
19282281
1929
- if (request_count >= BLK_MAX_REQUEST_COUNT || (last &&
2282
+ if (request_count >= blk_plug_max_rq_count(plug) || (last &&
19302283 blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
19312284 blk_flush_plug_list(plug, false);
19322285 trace_block_plug(q);
19332286 }
19342287
1935
- list_add_tail(&rq->queuelist, &plug->mq_list);
2288
+ blk_add_rq_to_plug(plug, rq);
2289
+ } else if (q->elevator) {
2290
+ /* Insert the request at the IO scheduler queue */
2291
+ blk_mq_sched_insert_request(rq, false, true, true);
19362292 } else if (plug && !blk_queue_nomerges(q)) {
1937
- blk_mq_bio_to_request(rq, bio);
1938
-
19392293 /*
19402294 * We do limited plugging. If the bio can be merged, do that.
19412295 * Otherwise the existing request in the plug list will be
....@@ -1945,30 +2299,74 @@
19452299 */
19462300 if (list_empty(&plug->mq_list))
19472301 same_queue_rq = NULL;
1948
- if (same_queue_rq)
2302
+ if (same_queue_rq) {
19492303 list_del_init(&same_queue_rq->queuelist);
1950
- list_add_tail(&rq->queuelist, &plug->mq_list);
1951
-
1952
- blk_mq_put_ctx(data.ctx);
2304
+ plug->rq_count--;
2305
+ }
2306
+ blk_add_rq_to_plug(plug, rq);
2307
+ trace_block_plug(q);
19532308
19542309 if (same_queue_rq) {
1955
- data.hctx = blk_mq_map_queue(q,
1956
- same_queue_rq->mq_ctx->cpu);
2310
+ data.hctx = same_queue_rq->mq_hctx;
2311
+ trace_block_unplug(q, 1, true);
19572312 blk_mq_try_issue_directly(data.hctx, same_queue_rq,
19582313 &cookie);
19592314 }
1960
- } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator &&
1961
- !data.hctx->dispatch_busy)) {
1962
- blk_mq_put_ctx(data.ctx);
1963
- blk_mq_bio_to_request(rq, bio);
2315
+ } else if ((q->nr_hw_queues > 1 && is_sync) ||
2316
+ !data.hctx->dispatch_busy) {
2317
+ /*
2318
+ * There is no scheduler and we can try to send directly
2319
+ * to the hardware.
2320
+ */
19642321 blk_mq_try_issue_directly(data.hctx, rq, &cookie);
19652322 } else {
1966
- blk_mq_put_ctx(data.ctx);
1967
- blk_mq_bio_to_request(rq, bio);
2323
+ /* Default case. */
19682324 blk_mq_sched_insert_request(rq, false, true, true);
19692325 }
19702326
19712327 return cookie;
2328
+queue_exit:
2329
+ blk_queue_exit(q);
2330
+ return BLK_QC_T_NONE;
2331
+}
2332
+
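For context on the plug branch above: with a plug installed, requests built by this function accumulate on plug->mq_list (bounded by blk_plug_max_rq_count()) and only reach the hardware queues when the plug is flushed. A hedged submitter-side sketch, not part of the diff (bio_a and bio_b stand for bios the caller prepared elsewhere):

	static void demo_submit_pair(struct bio *bio_a, struct bio *bio_b)
	{
		struct blk_plug plug;

		blk_start_plug(&plug);
		submit_bio(bio_a);	/* lands on plug->mq_list via blk_mq_submit_bio() */
		submit_bio(bio_b);	/* may merge with bio_a or join the same batch */
		blk_finish_plug(&plug);	/* drains the plug via blk_flush_plug_list() */
	}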
2333
+static size_t order_to_size(unsigned int order)
2334
+{
2335
+ return (size_t)PAGE_SIZE << order;
2336
+}
2337
+
2338
+/* called before freeing request pool in @tags */
2339
+static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set,
2340
+ struct blk_mq_tags *tags, unsigned int hctx_idx)
2341
+{
2342
+ struct blk_mq_tags *drv_tags = set->tags[hctx_idx];
2343
+ struct page *page;
2344
+ unsigned long flags;
2345
+
2346
+ list_for_each_entry(page, &tags->page_list, lru) {
2347
+ unsigned long start = (unsigned long)page_address(page);
2348
+ unsigned long end = start + order_to_size(page->private);
2349
+ int i;
2350
+
2351
+ for (i = 0; i < set->queue_depth; i++) {
2352
+ struct request *rq = drv_tags->rqs[i];
2353
+ unsigned long rq_addr = (unsigned long)rq;
2354
+
2355
+ if (rq_addr >= start && rq_addr < end) {
2356
+ WARN_ON_ONCE(refcount_read(&rq->ref) != 0);
2357
+ cmpxchg(&drv_tags->rqs[i], rq, NULL);
2358
+ }
2359
+ }
2360
+ }
2361
+
2362
+ /*
2363
+ * Wait until all pending iteration is done.
2364
+ *
2365
+ * Request reference is cleared and it is guaranteed to be observed
2366
+ * after the ->lock is released.
2367
+ */
2368
+ spin_lock_irqsave(&drv_tags->lock, flags);
2369
+ spin_unlock_irqrestore(&drv_tags->lock, flags);
19722370 }
19732371
19742372 void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
....@@ -1989,42 +2387,44 @@
19892387 }
19902388 }
19912389
2390
+ blk_mq_clear_rq_mapping(set, tags, hctx_idx);
2391
+
19922392 while (!list_empty(&tags->page_list)) {
19932393 page = list_first_entry(&tags->page_list, struct page, lru);
19942394 list_del_init(&page->lru);
19952395 /*
19962396 * Remove kmemleak object previously allocated in
1997
- * blk_mq_init_rq_map().
2397
+ * blk_mq_alloc_rqs().
19982398 */
19992399 kmemleak_free(page_address(page));
20002400 __free_pages(page, page->private);
20012401 }
20022402 }
20032403
2004
-void blk_mq_free_rq_map(struct blk_mq_tags *tags)
2404
+void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags)
20052405 {
20062406 kfree(tags->rqs);
20072407 tags->rqs = NULL;
20082408 kfree(tags->static_rqs);
20092409 tags->static_rqs = NULL;
20102410
2011
- blk_mq_free_tags(tags);
2411
+ blk_mq_free_tags(tags, flags);
20122412 }
20132413
20142414 struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
20152415 unsigned int hctx_idx,
20162416 unsigned int nr_tags,
2017
- unsigned int reserved_tags)
2417
+ unsigned int reserved_tags,
2418
+ unsigned int flags)
20182419 {
20192420 struct blk_mq_tags *tags;
20202421 int node;
20212422
2022
- node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx);
2423
+ node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
20232424 if (node == NUMA_NO_NODE)
20242425 node = set->numa_node;
20252426
2026
- tags = blk_mq_init_tags(nr_tags, reserved_tags, node,
2027
- BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
2427
+ tags = blk_mq_init_tags(nr_tags, reserved_tags, node, flags);
20282428 if (!tags)
20292429 return NULL;
20302430
....@@ -2032,7 +2432,7 @@
20322432 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
20332433 node);
20342434 if (!tags->rqs) {
2035
- blk_mq_free_tags(tags);
2435
+ blk_mq_free_tags(tags, flags);
20362436 return NULL;
20372437 }
20382438
....@@ -2041,16 +2441,11 @@
20412441 node);
20422442 if (!tags->static_rqs) {
20432443 kfree(tags->rqs);
2044
- blk_mq_free_tags(tags);
2444
+ blk_mq_free_tags(tags, flags);
20452445 return NULL;
20462446 }
20472447
20482448 return tags;
2049
-}
2050
-
2051
-static size_t order_to_size(unsigned int order)
2052
-{
2053
- return (size_t)PAGE_SIZE << order;
20542449 }
20552450
20562451 static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
....@@ -2075,7 +2470,7 @@
20752470 size_t rq_size, left;
20762471 int node;
20772472
2078
- node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx);
2473
+ node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
20792474 if (node == NUMA_NO_NODE)
20802475 node = set->numa_node;
20812476
....@@ -2087,6 +2482,7 @@
20872482 */
20882483 rq_size = round_up(sizeof(struct request) + set->cmd_size,
20892484 cache_line_size());
2485
+ trace_android_vh_blk_alloc_rqs(&rq_size, set, tags);
20902486 left = rq_size * depth;
20912487
20922488 for (i = 0; i < depth; ) {
....@@ -2145,6 +2541,86 @@
21452541 return -ENOMEM;
21462542 }
21472543
2544
+struct rq_iter_data {
2545
+ struct blk_mq_hw_ctx *hctx;
2546
+ bool has_rq;
2547
+};
2548
+
2549
+static bool blk_mq_has_request(struct request *rq, void *data, bool reserved)
2550
+{
2551
+ struct rq_iter_data *iter_data = data;
2552
+
2553
+ if (rq->mq_hctx != iter_data->hctx)
2554
+ return true;
2555
+ iter_data->has_rq = true;
2556
+ return false;
2557
+}
2558
+
2559
+static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx)
2560
+{
2561
+ struct blk_mq_tags *tags = hctx->sched_tags ?
2562
+ hctx->sched_tags : hctx->tags;
2563
+ struct rq_iter_data data = {
2564
+ .hctx = hctx,
2565
+ };
2566
+
2567
+ blk_mq_all_tag_iter(tags, blk_mq_has_request, &data);
2568
+ return data.has_rq;
2569
+}
2570
+
2571
+static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu,
2572
+ struct blk_mq_hw_ctx *hctx)
2573
+{
2574
+ if (cpumask_next_and(-1, hctx->cpumask, cpu_online_mask) != cpu)
2575
+ return false;
2576
+ if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids)
2577
+ return false;
2578
+ return true;
2579
+}
2580
+
2581
+static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
2582
+{
2583
+ struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
2584
+ struct blk_mq_hw_ctx, cpuhp_online);
2585
+
2586
+ if (!cpumask_test_cpu(cpu, hctx->cpumask) ||
2587
+ !blk_mq_last_cpu_in_hctx(cpu, hctx))
2588
+ return 0;
2589
+
2590
+ /*
2591
+ * Prevent new requests from being allocated on the current hctx.
2592
+ *
2593
+ * The smp_mb__after_atomic() pairs with the implied barrier in
2594
+ * test_and_set_bit_lock() in sbitmap_get(). This ensures the inactive
2595
+ * flag is seen once we return from the tag allocator.
2596
+ */
2597
+ set_bit(BLK_MQ_S_INACTIVE, &hctx->state);
2598
+ smp_mb__after_atomic();
2599
+
2600
+ /*
2601
+ * Try to grab a reference to the queue and wait for any outstanding
2602
+ * requests. If we could not grab a reference the queue has been
2603
+ * frozen and there are no requests.
2604
+ */
2605
+ if (percpu_ref_tryget(&hctx->queue->q_usage_counter)) {
2606
+ while (blk_mq_hctx_has_requests(hctx))
2607
+ msleep(5);
2608
+ percpu_ref_put(&hctx->queue->q_usage_counter);
2609
+ }
2610
+
2611
+ return 0;
2612
+}
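The set_bit()/smp_mb__after_atomic() above only works because the tag allocator re-checks the flag after winning a tag (the implied barrier in test_and_set_bit_lock() orders the two). A hedged paraphrase of that allocator-side tail, shown here for illustration rather than copied verbatim:

	/* At the end of tag allocation, after the bit was won: */
	if (unlikely(test_bit(BLK_MQ_S_INACTIVE, &data->hctx->state))) {
		/* hctx is going away: give the tag back and retry elsewhere */
		blk_mq_put_tag(tags, data->ctx, tag + tag_offset);
		return BLK_MQ_NO_TAG;
	}
	return tag + tag_offset;

Together with the reference/wait loop above, this guarantees that once the CPU goes offline no new request can be bound to the now-inactive hctx.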
2613
+
2614
+static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
2615
+{
2616
+ struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
2617
+ struct blk_mq_hw_ctx, cpuhp_online);
2618
+
2619
+ if (cpumask_test_cpu(cpu, hctx->cpumask))
2620
+ clear_bit(BLK_MQ_S_INACTIVE, &hctx->state);
2621
+ return 0;
2622
+}
2623
+
21482624 /*
21492625 * 'cpu' is going away. splice any existing rq_list entries from this
21502626 * software queue to the hw queue dispatch list, and ensure that it
....@@ -2155,13 +2631,18 @@
21552631 struct blk_mq_hw_ctx *hctx;
21562632 struct blk_mq_ctx *ctx;
21572633 LIST_HEAD(tmp);
2634
+ enum hctx_type type;
21582635
21592636 hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
2637
+ if (!cpumask_test_cpu(cpu, hctx->cpumask))
2638
+ return 0;
2639
+
21602640 ctx = __blk_mq_get_ctx(hctx->queue, cpu);
2641
+ type = hctx->type;
21612642
21622643 spin_lock(&ctx->lock);
2163
- if (!list_empty(&ctx->rq_list)) {
2164
- list_splice_init(&ctx->rq_list, &tmp);
2644
+ if (!list_empty(&ctx->rq_lists[type])) {
2645
+ list_splice_init(&ctx->rq_lists[type], &tmp);
21652646 blk_mq_hctx_clear_pending(hctx, ctx);
21662647 }
21672648 spin_unlock(&ctx->lock);
....@@ -2179,8 +2660,40 @@
21792660
21802661 static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
21812662 {
2663
+ if (!(hctx->flags & BLK_MQ_F_STACKING))
2664
+ cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
2665
+ &hctx->cpuhp_online);
21822666 cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD,
21832667 &hctx->cpuhp_dead);
2668
+}
2669
+
2670
+/*
2671
+ * Before freeing the hw queue, clear the flush request reference in
2672
+ * tags->rqs[] to avoid a potential use-after-free.
2673
+ */
2674
+static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags,
2675
+ unsigned int queue_depth, struct request *flush_rq)
2676
+{
2677
+ int i;
2678
+ unsigned long flags;
2679
+
2680
+ /* The hw queue may not be mapped yet */
2681
+ if (!tags)
2682
+ return;
2683
+
2684
+ WARN_ON_ONCE(refcount_read(&flush_rq->ref) != 0);
2685
+
2686
+ for (i = 0; i < queue_depth; i++)
2687
+ cmpxchg(&tags->rqs[i], flush_rq, NULL);
2688
+
2689
+ /*
2690
+ * Wait until all pending iterations are done.
2691
+ *
2692
+ * The request references are cleared and this is guaranteed to be
2693
+ * observed after the ->lock is released.
2694
+ */
2695
+ spin_lock_irqsave(&tags->lock, flags);
2696
+ spin_unlock_irqrestore(&tags->lock, flags);
21842697 }
21852698
21862699 /* hctx->ctxs will be freed in queue's release handler */
....@@ -2188,18 +2701,24 @@
21882701 struct blk_mq_tag_set *set,
21892702 struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
21902703 {
2191
- blk_mq_debugfs_unregister_hctx(hctx);
2704
+ struct request *flush_rq = hctx->fq->flush_rq;
21922705
21932706 if (blk_mq_hw_queue_mapped(hctx))
21942707 blk_mq_tag_idle(hctx);
21952708
2709
+ blk_mq_clear_flush_rq_mapping(set->tags[hctx_idx],
2710
+ set->queue_depth, flush_rq);
21962711 if (set->ops->exit_request)
2197
- set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx);
2712
+ set->ops->exit_request(set, flush_rq, hctx_idx);
21982713
21992714 if (set->ops->exit_hctx)
22002715 set->ops->exit_hctx(hctx, hctx_idx);
22012716
22022717 blk_mq_remove_cpuhp(hctx);
2718
+
2719
+ spin_lock(&q->unused_hctx_lock);
2720
+ list_add(&hctx->hctx_list, &q->unused_hctx_list);
2721
+ spin_unlock(&q->unused_hctx_lock);
22032722 }
22042723
22052724 static void blk_mq_exit_hw_queues(struct request_queue *q,
....@@ -2211,112 +2730,160 @@
22112730 queue_for_each_hw_ctx(q, hctx, i) {
22122731 if (i == nr_queue)
22132732 break;
2733
+ blk_mq_debugfs_unregister_hctx(hctx);
22142734 blk_mq_exit_hctx(q, set, hctx, i);
22152735 }
2736
+}
2737
+
2738
+static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
2739
+{
2740
+ int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
2741
+
2742
+ BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu),
2743
+ __alignof__(struct blk_mq_hw_ctx)) !=
2744
+ sizeof(struct blk_mq_hw_ctx));
2745
+
2746
+ if (tag_set->flags & BLK_MQ_F_BLOCKING)
2747
+ hw_ctx_size += sizeof(struct srcu_struct);
2748
+
2749
+ return hw_ctx_size;
22162750 }
22172751
22182752 static int blk_mq_init_hctx(struct request_queue *q,
22192753 struct blk_mq_tag_set *set,
22202754 struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
22212755 {
2222
- int node;
2756
+ hctx->queue_num = hctx_idx;
22232757
2224
- node = hctx->numa_node;
2758
+ if (!(hctx->flags & BLK_MQ_F_STACKING))
2759
+ cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
2760
+ &hctx->cpuhp_online);
2761
+ cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
2762
+
2763
+ hctx->tags = set->tags[hctx_idx];
2764
+
2765
+ if (set->ops->init_hctx &&
2766
+ set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
2767
+ goto unregister_cpu_notifier;
2768
+
2769
+ if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx,
2770
+ hctx->numa_node))
2771
+ goto exit_hctx;
2772
+ return 0;
2773
+
2774
+ exit_hctx:
2775
+ if (set->ops->exit_hctx)
2776
+ set->ops->exit_hctx(hctx, hctx_idx);
2777
+ unregister_cpu_notifier:
2778
+ blk_mq_remove_cpuhp(hctx);
2779
+ return -1;
2780
+}
2781
+
2782
+static struct blk_mq_hw_ctx *
2783
+blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
2784
+ int node)
2785
+{
2786
+ struct blk_mq_hw_ctx *hctx;
2787
+ gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;
2788
+
2789
+ hctx = kzalloc_node(blk_mq_hw_ctx_size(set), gfp, node);
2790
+ if (!hctx)
2791
+ goto fail_alloc_hctx;
2792
+
2793
+ if (!zalloc_cpumask_var_node(&hctx->cpumask, gfp, node))
2794
+ goto free_hctx;
2795
+
2796
+ atomic_set(&hctx->nr_active, 0);
22252797 if (node == NUMA_NO_NODE)
2226
- node = hctx->numa_node = set->numa_node;
2798
+ node = set->numa_node;
2799
+ hctx->numa_node = node;
22272800
22282801 INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
22292802 spin_lock_init(&hctx->lock);
22302803 INIT_LIST_HEAD(&hctx->dispatch);
22312804 hctx->queue = q;
2232
- hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED;
2805
+ hctx->flags = set->flags & ~BLK_MQ_F_TAG_QUEUE_SHARED;
22332806
2234
- cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
2235
-
2236
- hctx->tags = set->tags[hctx_idx];
2807
+ INIT_LIST_HEAD(&hctx->hctx_list);
22372808
22382809 /*
22392810 * Allocate space for all possible cpus to avoid allocation at
22402811 * runtime
22412812 */
22422813 hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *),
2243
- GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node);
2814
+ gfp, node);
22442815 if (!hctx->ctxs)
2245
- goto unregister_cpu_notifier;
2816
+ goto free_cpumask;
22462817
22472818 if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8),
2248
- GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node))
2819
+ gfp, node))
22492820 goto free_ctxs;
2250
-
22512821 hctx->nr_ctx = 0;
22522822
22532823 spin_lock_init(&hctx->dispatch_wait_lock);
22542824 init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
22552825 INIT_LIST_HEAD(&hctx->dispatch_wait.entry);
22562826
2257
- if (set->ops->init_hctx &&
2258
- set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
2259
- goto free_bitmap;
2260
-
2261
- hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size,
2262
- GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY);
2827
+ hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp);
22632828 if (!hctx->fq)
2264
- goto exit_hctx;
2265
-
2266
- if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, node))
2267
- goto free_fq;
2829
+ goto free_bitmap;
22682830
22692831 if (hctx->flags & BLK_MQ_F_BLOCKING)
22702832 init_srcu_struct(hctx->srcu);
2833
+ blk_mq_hctx_kobj_init(hctx);
22712834
2272
- blk_mq_debugfs_register_hctx(q, hctx);
2835
+ return hctx;
22732836
2274
- return 0;
2275
-
2276
- free_fq:
2277
- blk_free_flush_queue(hctx->fq);
2278
- exit_hctx:
2279
- if (set->ops->exit_hctx)
2280
- set->ops->exit_hctx(hctx, hctx_idx);
22812837 free_bitmap:
22822838 sbitmap_free(&hctx->ctx_map);
22832839 free_ctxs:
22842840 kfree(hctx->ctxs);
2285
- unregister_cpu_notifier:
2286
- blk_mq_remove_cpuhp(hctx);
2287
- return -1;
2841
+ free_cpumask:
2842
+ free_cpumask_var(hctx->cpumask);
2843
+ free_hctx:
2844
+ kfree(hctx);
2845
+ fail_alloc_hctx:
2846
+ return NULL;
22882847 }
22892848
22902849 static void blk_mq_init_cpu_queues(struct request_queue *q,
22912850 unsigned int nr_hw_queues)
22922851 {
2293
- unsigned int i;
2852
+ struct blk_mq_tag_set *set = q->tag_set;
2853
+ unsigned int i, j;
22942854
22952855 for_each_possible_cpu(i) {
22962856 struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
22972857 struct blk_mq_hw_ctx *hctx;
2858
+ int k;
22982859
22992860 __ctx->cpu = i;
23002861 spin_lock_init(&__ctx->lock);
2301
- INIT_LIST_HEAD(&__ctx->rq_list);
2862
+ for (k = HCTX_TYPE_DEFAULT; k < HCTX_MAX_TYPES; k++)
2863
+ INIT_LIST_HEAD(&__ctx->rq_lists[k]);
2864
+
23022865 __ctx->queue = q;
23032866
23042867 /*
23052868 * Set local node, IFF we have more than one hw queue. If
23062869 * not, we remain on the home node of the device
23072870 */
2308
- hctx = blk_mq_map_queue(q, i);
2309
- if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
2310
- hctx->numa_node = local_memory_node(cpu_to_node(i));
2871
+ for (j = 0; j < set->nr_maps; j++) {
2872
+ hctx = blk_mq_map_queue_type(q, j, i);
2873
+ if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
2874
+ hctx->numa_node = cpu_to_node(i);
2875
+ }
23112876 }
23122877 }
23132878
2314
-static bool __blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, int hctx_idx)
2879
+static bool __blk_mq_alloc_map_and_request(struct blk_mq_tag_set *set,
2880
+ int hctx_idx)
23152881 {
2882
+ unsigned int flags = set->flags;
23162883 int ret = 0;
23172884
23182885 set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx,
2319
- set->queue_depth, set->reserved_tags);
2886
+ set->queue_depth, set->reserved_tags, flags);
23202887 if (!set->tags[hctx_idx])
23212888 return false;
23222889
....@@ -2325,7 +2892,7 @@
23252892 if (!ret)
23262893 return true;
23272894
2328
- blk_mq_free_rq_map(set->tags[hctx_idx]);
2895
+ blk_mq_free_rq_map(set->tags[hctx_idx], flags);
23292896 set->tags[hctx_idx] = NULL;
23302897 return false;
23312898 }
....@@ -2333,16 +2900,18 @@
23332900 static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
23342901 unsigned int hctx_idx)
23352902 {
2336
- if (set->tags[hctx_idx]) {
2903
+ unsigned int flags = set->flags;
2904
+
2905
+ if (set->tags && set->tags[hctx_idx]) {
23372906 blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
2338
- blk_mq_free_rq_map(set->tags[hctx_idx]);
2907
+ blk_mq_free_rq_map(set->tags[hctx_idx], flags);
23392908 set->tags[hctx_idx] = NULL;
23402909 }
23412910 }
23422911
23432912 static void blk_mq_map_swqueue(struct request_queue *q)
23442913 {
2345
- unsigned int i, hctx_idx;
2914
+ unsigned int i, j, hctx_idx;
23462915 struct blk_mq_hw_ctx *hctx;
23472916 struct blk_mq_ctx *ctx;
23482917 struct blk_mq_tag_set *set = q->tag_set;
....@@ -2359,25 +2928,52 @@
23592928 * If the cpu isn't present, the cpu is mapped to first hctx.
23602929 */
23612930 for_each_possible_cpu(i) {
2362
- hctx_idx = q->mq_map[i];
2363
- /* unmapped hw queue can be remapped after CPU topo changed */
2364
- if (!set->tags[hctx_idx] &&
2365
- !__blk_mq_alloc_rq_map(set, hctx_idx)) {
2366
- /*
2367
- * If tags initialization fail for some hctx,
2368
- * that hctx won't be brought online. In this
2369
- * case, remap the current ctx to hctx[0] which
2370
- * is guaranteed to always have tags allocated
2371
- */
2372
- q->mq_map[i] = 0;
2373
- }
23742931
23752932 ctx = per_cpu_ptr(q->queue_ctx, i);
2376
- hctx = blk_mq_map_queue(q, i);
2933
+ for (j = 0; j < set->nr_maps; j++) {
2934
+ if (!set->map[j].nr_queues) {
2935
+ ctx->hctxs[j] = blk_mq_map_queue_type(q,
2936
+ HCTX_TYPE_DEFAULT, i);
2937
+ continue;
2938
+ }
2939
+ hctx_idx = set->map[j].mq_map[i];
2940
+ /* unmapped hw queue can be remapped after CPU topo changed */
2941
+ if (!set->tags[hctx_idx] &&
2942
+ !__blk_mq_alloc_map_and_request(set, hctx_idx)) {
2943
+ /*
2944
+ * If tags initialization fail for some hctx,
2945
+ * that hctx won't be brought online. In this
2946
+ * case, remap the current ctx to hctx[0] which
2947
+ * is guaranteed to always have tags allocated
2948
+ */
2949
+ set->map[j].mq_map[i] = 0;
2950
+ }
23772951
2378
- cpumask_set_cpu(i, hctx->cpumask);
2379
- ctx->index_hw = hctx->nr_ctx;
2380
- hctx->ctxs[hctx->nr_ctx++] = ctx;
2952
+ hctx = blk_mq_map_queue_type(q, j, i);
2953
+ ctx->hctxs[j] = hctx;
2954
+ /*
2955
+ * If the CPU is already set in the mask, then we've
2956
+ * mapped this one already. This can happen if
2957
+ * devices share queues across queue maps.
2958
+ */
2959
+ if (cpumask_test_cpu(i, hctx->cpumask))
2960
+ continue;
2961
+
2962
+ cpumask_set_cpu(i, hctx->cpumask);
2963
+ hctx->type = j;
2964
+ ctx->index_hw[hctx->type] = hctx->nr_ctx;
2965
+ hctx->ctxs[hctx->nr_ctx++] = ctx;
2966
+
2967
+ /*
2968
+ * If the nr_ctx type overflows, we have exceeded the
2969
+ * amount of sw queues we can support.
2970
+ */
2971
+ BUG_ON(!hctx->nr_ctx);
2972
+ }
2973
+
2974
+ for (; j < HCTX_MAX_TYPES; j++)
2975
+ ctx->hctxs[j] = blk_mq_map_queue_type(q,
2976
+ HCTX_TYPE_DEFAULT, i);
23812977 }
23822978
23832979 queue_for_each_hw_ctx(q, hctx, i) {
....@@ -2426,14 +3022,14 @@
24263022
24273023 queue_for_each_hw_ctx(q, hctx, i) {
24283024 if (shared)
2429
- hctx->flags |= BLK_MQ_F_TAG_SHARED;
3025
+ hctx->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
24303026 else
2431
- hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
3027
+ hctx->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
24323028 }
24333029 }
24343030
2435
-static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set,
2436
- bool shared)
3031
+static void blk_mq_update_tag_set_shared(struct blk_mq_tag_set *set,
3032
+ bool shared)
24373033 {
24383034 struct request_queue *q;
24393035
....@@ -2451,12 +3047,12 @@
24513047 struct blk_mq_tag_set *set = q->tag_set;
24523048
24533049 mutex_lock(&set->tag_list_lock);
2454
- list_del_rcu(&q->tag_set_list);
3050
+ list_del(&q->tag_set_list);
24553051 if (list_is_singular(&set->tag_list)) {
24563052 /* just transitioned to unshared */
2457
- set->flags &= ~BLK_MQ_F_TAG_SHARED;
3053
+ set->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
24583054 /* update existing queue */
2459
- blk_mq_update_tag_set_depth(set, false);
3055
+ blk_mq_update_tag_set_shared(set, false);
24603056 }
24613057 mutex_unlock(&set->tag_list_lock);
24623058 INIT_LIST_HEAD(&q->tag_set_list);
....@@ -2465,24 +3061,50 @@
24653061 static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
24663062 struct request_queue *q)
24673063 {
2468
- q->tag_set = set;
2469
-
24703064 mutex_lock(&set->tag_list_lock);
24713065
24723066 /*
24733067 * Check to see if we're transitioning to shared (from 1 to 2 queues).
24743068 */
24753069 if (!list_empty(&set->tag_list) &&
2476
- !(set->flags & BLK_MQ_F_TAG_SHARED)) {
2477
- set->flags |= BLK_MQ_F_TAG_SHARED;
3070
+ !(set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
3071
+ set->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
24783072 /* update existing queue */
2479
- blk_mq_update_tag_set_depth(set, true);
3073
+ blk_mq_update_tag_set_shared(set, true);
24803074 }
2481
- if (set->flags & BLK_MQ_F_TAG_SHARED)
3075
+ if (set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
24823076 queue_set_hctx_shared(q, true);
2483
- list_add_tail_rcu(&q->tag_set_list, &set->tag_list);
3077
+ list_add_tail(&q->tag_set_list, &set->tag_list);
24843078
24853079 mutex_unlock(&set->tag_list_lock);
3080
+}
3081
+
3082
+/* All allocations will be freed in release handler of q->mq_kobj */
3083
+static int blk_mq_alloc_ctxs(struct request_queue *q)
3084
+{
3085
+ struct blk_mq_ctxs *ctxs;
3086
+ int cpu;
3087
+
3088
+ ctxs = kzalloc(sizeof(*ctxs), GFP_KERNEL);
3089
+ if (!ctxs)
3090
+ return -ENOMEM;
3091
+
3092
+ ctxs->queue_ctx = alloc_percpu(struct blk_mq_ctx);
3093
+ if (!ctxs->queue_ctx)
3094
+ goto fail;
3095
+
3096
+ for_each_possible_cpu(cpu) {
3097
+ struct blk_mq_ctx *ctx = per_cpu_ptr(ctxs->queue_ctx, cpu);
3098
+ ctx->ctxs = ctxs;
3099
+ }
3100
+
3101
+ q->mq_kobj = &ctxs->kobj;
3102
+ q->queue_ctx = ctxs->queue_ctx;
3103
+
3104
+ return 0;
3105
+ fail:
3106
+ kfree(ctxs);
3107
+ return -ENOMEM;
24863108 }
24873109
24883110 /*
....@@ -2493,17 +3115,17 @@
24933115 */
24943116 void blk_mq_release(struct request_queue *q)
24953117 {
2496
- struct blk_mq_hw_ctx *hctx;
2497
- unsigned int i;
3118
+ struct blk_mq_hw_ctx *hctx, *next;
3119
+ int i;
24983120
2499
- /* hctx kobj stays in hctx */
2500
- queue_for_each_hw_ctx(q, hctx, i) {
2501
- if (!hctx)
2502
- continue;
3121
+ queue_for_each_hw_ctx(q, hctx, i)
3122
+ WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list));
3123
+
3124
+ /* all hctx are in .unused_hctx_list now */
3125
+ list_for_each_entry_safe(hctx, next, &q->unused_hctx_list, hctx_list) {
3126
+ list_del_init(&hctx->hctx_list);
25033127 kobject_put(&hctx->kobj);
25043128 }
2505
-
2506
- q->mq_map = NULL;
25073129
25083130 kfree(q->queue_hw_ctx);
25093131
....@@ -2512,102 +3134,184 @@
25123134 * both share lifetime with request queue.
25133135 */
25143136 blk_mq_sysfs_deinit(q);
2515
-
2516
- free_percpu(q->queue_ctx);
25173137 }
25183138
2519
-struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
3139
+struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
3140
+ void *queuedata)
25203141 {
25213142 struct request_queue *uninit_q, *q;
25223143
2523
- uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node, NULL);
3144
+ uninit_q = blk_alloc_queue(set->numa_node);
25243145 if (!uninit_q)
25253146 return ERR_PTR(-ENOMEM);
3147
+ uninit_q->queuedata = queuedata;
25263148
2527
- q = blk_mq_init_allocated_queue(set, uninit_q);
3149
+ /*
3150
+ * Initialize the queue without an elevator. device_add_disk() will do
3151
+ * the initialization.
3152
+ */
3153
+ q = blk_mq_init_allocated_queue(set, uninit_q, false);
25283154 if (IS_ERR(q))
25293155 blk_cleanup_queue(uninit_q);
25303156
25313157 return q;
25323158 }
3159
+EXPORT_SYMBOL_GPL(blk_mq_init_queue_data);
3160
+
3161
+struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
3162
+{
3163
+ return blk_mq_init_queue_data(set, NULL);
3164
+}
25333165 EXPORT_SYMBOL(blk_mq_init_queue);
25343166
2535
-static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
3167
+/*
3168
+ * Helper for setting up a queue with mq ops, given queue depth, and
3169
+ * the passed in mq ops flags.
3170
+ */
3171
+struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set,
3172
+ const struct blk_mq_ops *ops,
3173
+ unsigned int queue_depth,
3174
+ unsigned int set_flags)
25363175 {
2537
- int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
3176
+ struct request_queue *q;
3177
+ int ret;
25383178
2539
- BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu),
2540
- __alignof__(struct blk_mq_hw_ctx)) !=
2541
- sizeof(struct blk_mq_hw_ctx));
3179
+ memset(set, 0, sizeof(*set));
3180
+ set->ops = ops;
3181
+ set->nr_hw_queues = 1;
3182
+ set->nr_maps = 1;
3183
+ set->queue_depth = queue_depth;
3184
+ set->numa_node = NUMA_NO_NODE;
3185
+ set->flags = set_flags;
25423186
2543
- if (tag_set->flags & BLK_MQ_F_BLOCKING)
2544
- hw_ctx_size += sizeof(struct srcu_struct);
3187
+ ret = blk_mq_alloc_tag_set(set);
3188
+ if (ret)
3189
+ return ERR_PTR(ret);
25453190
2546
- return hw_ctx_size;
3191
+ q = blk_mq_init_queue(set);
3192
+ if (IS_ERR(q)) {
3193
+ blk_mq_free_tag_set(set);
3194
+ return q;
3195
+ }
3196
+
3197
+ return q;
3198
+}
3199
+EXPORT_SYMBOL(blk_mq_init_sq_queue);
3200
+
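blk_mq_init_sq_queue() above bundles tag-set initialization and queue creation for simple single-queue users. A hedged usage sketch, not from this patch (demo_mq_ops stands for the driver's blk_mq_ops, e.g. one providing .queue_rq; the depth and flags are illustrative):

	static const struct blk_mq_ops demo_mq_ops;	/* hypothetical, .queue_rq etc. */
	static struct blk_mq_tag_set demo_tag_set;	/* must outlive the queue */

	static struct request_queue *demo_create_queue(void)
	{
		struct request_queue *q;

		q = blk_mq_init_sq_queue(&demo_tag_set, &demo_mq_ops, 128,
					 BLK_MQ_F_SHOULD_MERGE);
		if (IS_ERR(q))
			return NULL;	/* the helper already freed the tag set */
		return q;
	}

On teardown the caller still pairs blk_cleanup_queue() with blk_mq_free_tag_set(), since only the failure path above cleans up the set for you.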
3201
+static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx(
3202
+ struct blk_mq_tag_set *set, struct request_queue *q,
3203
+ int hctx_idx, int node)
3204
+{
3205
+ struct blk_mq_hw_ctx *hctx = NULL, *tmp;
3206
+
3207
+ /* reuse dead hctx first */
3208
+ spin_lock(&q->unused_hctx_lock);
3209
+ list_for_each_entry(tmp, &q->unused_hctx_list, hctx_list) {
3210
+ if (tmp->numa_node == node) {
3211
+ hctx = tmp;
3212
+ break;
3213
+ }
3214
+ }
3215
+ if (hctx)
3216
+ list_del_init(&hctx->hctx_list);
3217
+ spin_unlock(&q->unused_hctx_lock);
3218
+
3219
+ if (!hctx)
3220
+ hctx = blk_mq_alloc_hctx(q, set, node);
3221
+ if (!hctx)
3222
+ goto fail;
3223
+
3224
+ if (blk_mq_init_hctx(q, set, hctx, hctx_idx))
3225
+ goto free_hctx;
3226
+
3227
+ return hctx;
3228
+
3229
+ free_hctx:
3230
+ kobject_put(&hctx->kobj);
3231
+ fail:
3232
+ return NULL;
25473233 }
25483234
25493235 static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
25503236 struct request_queue *q)
25513237 {
2552
- int i, j;
3238
+ int i, j, end;
25533239 struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;
25543240
2555
- blk_mq_sysfs_unregister(q);
3241
+ if (q->nr_hw_queues < set->nr_hw_queues) {
3242
+ struct blk_mq_hw_ctx **new_hctxs;
3243
+
3244
+ new_hctxs = kcalloc_node(set->nr_hw_queues,
3245
+ sizeof(*new_hctxs), GFP_KERNEL,
3246
+ set->numa_node);
3247
+ if (!new_hctxs)
3248
+ return;
3249
+ if (hctxs)
3250
+ memcpy(new_hctxs, hctxs, q->nr_hw_queues *
3251
+ sizeof(*hctxs));
3252
+ q->queue_hw_ctx = new_hctxs;
3253
+ kfree(hctxs);
3254
+ hctxs = new_hctxs;
3255
+ }
25563256
25573257 /* protect against switching io scheduler */
25583258 mutex_lock(&q->sysfs_lock);
25593259 for (i = 0; i < set->nr_hw_queues; i++) {
25603260 int node;
3261
+ struct blk_mq_hw_ctx *hctx;
25613262
2562
- if (hctxs[i])
3263
+ node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], i);
3264
+ /*
3265
+ * If the hw queue has been mapped to another numa node,
3266
+ * we need to realloc the hctx. If allocation fails, fallback
3267
+ * to use the previous one.
3268
+ */
3269
+ if (hctxs[i] && (hctxs[i]->numa_node == node))
25633270 continue;
25643271
2565
- node = blk_mq_hw_queue_to_node(q->mq_map, i);
2566
- hctxs[i] = kzalloc_node(blk_mq_hw_ctx_size(set),
2567
- GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
2568
- node);
2569
- if (!hctxs[i])
2570
- break;
2571
-
2572
- if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask,
2573
- GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
2574
- node)) {
2575
- kfree(hctxs[i]);
2576
- hctxs[i] = NULL;
2577
- break;
3272
+ hctx = blk_mq_alloc_and_init_hctx(set, q, i, node);
3273
+ if (hctx) {
3274
+ if (hctxs[i])
3275
+ blk_mq_exit_hctx(q, set, hctxs[i], i);
3276
+ hctxs[i] = hctx;
3277
+ } else {
3278
+ if (hctxs[i])
3279
+ pr_warn("Allocate new hctx on node %d fails,\
3280
+ fallback to previous one on node %d\n",
3281
+ node, hctxs[i]->numa_node);
3282
+ else
3283
+ break;
25783284 }
2579
-
2580
- atomic_set(&hctxs[i]->nr_active, 0);
2581
- hctxs[i]->numa_node = node;
2582
- hctxs[i]->queue_num = i;
2583
-
2584
- if (blk_mq_init_hctx(q, set, hctxs[i], i)) {
2585
- free_cpumask_var(hctxs[i]->cpumask);
2586
- kfree(hctxs[i]);
2587
- hctxs[i] = NULL;
2588
- break;
2589
- }
2590
- blk_mq_hctx_kobj_init(hctxs[i]);
25913285 }
2592
- for (j = i; j < q->nr_hw_queues; j++) {
3286
+ /*
3287
+ * Increasing nr_hw_queues fails. Free the newly allocated
3288
+ * hctxs and keep the previous q->nr_hw_queues.
3289
+ */
3290
+ if (i != set->nr_hw_queues) {
3291
+ j = q->nr_hw_queues;
3292
+ end = i;
3293
+ } else {
3294
+ j = i;
3295
+ end = q->nr_hw_queues;
3296
+ q->nr_hw_queues = set->nr_hw_queues;
3297
+ }
3298
+
3299
+ for (; j < end; j++) {
25933300 struct blk_mq_hw_ctx *hctx = hctxs[j];
25943301
25953302 if (hctx) {
25963303 if (hctx->tags)
25973304 blk_mq_free_map_and_requests(set, j);
25983305 blk_mq_exit_hctx(q, set, hctx, j);
2599
- kobject_put(&hctx->kobj);
26003306 hctxs[j] = NULL;
2601
-
26023307 }
26033308 }
2604
- q->nr_hw_queues = i;
26053309 mutex_unlock(&q->sysfs_lock);
2606
- blk_mq_sysfs_register(q);
26073310 }
26083311
26093312 struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
2610
- struct request_queue *q)
3313
+ struct request_queue *q,
3314
+ bool elevator_init)
26113315 {
26123316 /* mark the queue as mq asap */
26133317 q->mq_ops = set->ops;
....@@ -2618,19 +3322,14 @@
26183322 if (!q->poll_cb)
26193323 goto err_exit;
26203324
2621
- q->queue_ctx = alloc_percpu(struct blk_mq_ctx);
2622
- if (!q->queue_ctx)
2623
- goto err_exit;
3325
+ if (blk_mq_alloc_ctxs(q))
3326
+ goto err_poll;
26243327
26253328 /* init q->mq_kobj and sw queues' kobjects */
26263329 blk_mq_sysfs_init(q);
26273330
2628
- q->queue_hw_ctx = kcalloc_node(nr_cpu_ids, sizeof(*(q->queue_hw_ctx)),
2629
- GFP_KERNEL, set->numa_node);
2630
- if (!q->queue_hw_ctx)
2631
- goto err_percpu;
2632
-
2633
- q->mq_map = set->mq_map;
3331
+ INIT_LIST_HEAD(&q->unused_hctx_list);
3332
+ spin_lock_init(&q->unused_hctx_lock);
26343333
26353334 blk_mq_realloc_hw_ctxs(set, q);
26363335 if (!q->nr_hw_queues)
....@@ -2639,12 +3338,12 @@
26393338 INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
26403339 blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
26413340
2642
- q->nr_queues = nr_cpu_ids;
3341
+ q->tag_set = set;
26433342
26443343 q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
2645
-
2646
- if (!(set->flags & BLK_MQ_F_SG_MERGE))
2647
- queue_flag_set_unlocked(QUEUE_FLAG_NO_SG_MERGE, q);
3344
+ if (set->nr_maps > HCTX_TYPE_POLL &&
3345
+ set->map[HCTX_TYPE_POLL].nr_queues)
3346
+ blk_queue_flag_set(QUEUE_FLAG_POLL, q);
26483347
26493348 q->sg_reserved_size = INT_MAX;
26503349
....@@ -2652,41 +3351,29 @@
26523351 INIT_LIST_HEAD(&q->requeue_list);
26533352 spin_lock_init(&q->requeue_lock);
26543353
2655
- blk_queue_make_request(q, blk_mq_make_request);
2656
- if (q->mq_ops->poll)
2657
- q->poll_fn = blk_mq_poll;
2658
-
2659
- /*
2660
- * Do this after blk_queue_make_request() overrides it...
2661
- */
26623354 q->nr_requests = set->queue_depth;
26633355
26643356 /*
26653357 * Default to classic polling
26663358 */
2667
- q->poll_nsec = -1;
2668
-
2669
- if (set->ops->complete)
2670
- blk_queue_softirq_done(q, set->ops->complete);
3359
+ q->poll_nsec = BLK_MQ_POLL_CLASSIC;
26713360
26723361 blk_mq_init_cpu_queues(q, set->nr_hw_queues);
26733362 blk_mq_add_queue_tag_set(set, q);
26743363 blk_mq_map_swqueue(q);
26753364
2676
- if (!(set->flags & BLK_MQ_F_NO_SCHED)) {
2677
- int ret;
2678
-
2679
- ret = elevator_init_mq(q);
2680
- if (ret)
2681
- return ERR_PTR(ret);
2682
- }
3365
+ if (elevator_init)
3366
+ elevator_init_mq(q);
26833367
26843368 return q;
26853369
26863370 err_hctxs:
26873371 kfree(q->queue_hw_ctx);
2688
-err_percpu:
2689
- free_percpu(q->queue_ctx);
3372
+ q->nr_hw_queues = 0;
3373
+ blk_mq_sysfs_deinit(q);
3374
+err_poll:
3375
+ blk_stat_free_callback(q->poll_cb);
3376
+ q->poll_cb = NULL;
26903377 err_exit:
26913378 q->mq_ops = NULL;
26923379 return ERR_PTR(-ENOMEM);
....@@ -2704,38 +3391,21 @@
27043391 blk_mq_del_queue_tag_set(q);
27053392 }
27063393
2707
-/* Basically redo blk_mq_init_queue with queue frozen */
2708
-static void blk_mq_queue_reinit(struct request_queue *q)
2709
-{
2710
- WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));
2711
-
2712
- blk_mq_debugfs_unregister_hctxs(q);
2713
- blk_mq_sysfs_unregister(q);
2714
-
2715
- /*
2716
- * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
2717
- * we should change hctx numa_node according to the new topology (this
2718
- * involves freeing and re-allocating memory, worth doing?)
2719
- */
2720
- blk_mq_map_swqueue(q);
2721
-
2722
- blk_mq_sysfs_register(q);
2723
- blk_mq_debugfs_register_hctxs(q);
2724
-}
2725
-
27263394 static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
27273395 {
27283396 int i;
27293397
2730
- for (i = 0; i < set->nr_hw_queues; i++)
2731
- if (!__blk_mq_alloc_rq_map(set, i))
3398
+ for (i = 0; i < set->nr_hw_queues; i++) {
3399
+ if (!__blk_mq_alloc_map_and_request(set, i))
27323400 goto out_unwind;
3401
+ cond_resched();
3402
+ }
27333403
27343404 return 0;
27353405
27363406 out_unwind:
27373407 while (--i >= 0)
2738
- blk_mq_free_rq_map(set->tags[i]);
3408
+ blk_mq_free_map_and_requests(set, i);
27393409
27403410 return -ENOMEM;
27413411 }
....@@ -2745,7 +3415,7 @@
27453415 * may reduce the depth asked for, if memory is tight. set->queue_depth
27463416 * will be updated to reflect the allocated depth.
27473417 */
2748
-static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
3418
+static int blk_mq_alloc_map_and_requests(struct blk_mq_tag_set *set)
27493419 {
27503420 unsigned int depth;
27513421 int err;
....@@ -2777,7 +3447,17 @@
27773447
27783448 static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
27793449 {
2780
- if (set->ops->map_queues) {
3450
+ /*
3451
+ * blk_mq_map_queues() and multiple .map_queues() implementations
3452
+ * expect that set->map[HCTX_TYPE_DEFAULT].nr_queues is set to the
3453
+ * number of hardware queues.
3454
+ */
3455
+ if (set->nr_maps == 1)
3456
+ set->map[HCTX_TYPE_DEFAULT].nr_queues = set->nr_hw_queues;
3457
+
3458
+ if (set->ops->map_queues && !is_kdump_kernel()) {
3459
+ int i;
3460
+
27813461 /*
27823462 * transport .map_queues is usually done in the following
27833463 * way:
....@@ -2785,18 +3465,44 @@
27853465 * for (queue = 0; queue < set->nr_hw_queues; queue++) {
27863466 * mask = get_cpu_mask(queue)
27873467 * for_each_cpu(cpu, mask)
2788
- * set->mq_map[cpu] = queue;
3468
+ * set->map[x].mq_map[cpu] = queue;
27893469 * }
27903470 *
27913471 * When we need to remap, the table has to be cleared for
27923472 * killing stale mapping since one CPU may not be mapped
27933473 * to any hw queue.
27943474 */
2795
- blk_mq_clear_mq_map(set);
3475
+ for (i = 0; i < set->nr_maps; i++)
3476
+ blk_mq_clear_mq_map(&set->map[i]);
27963477
27973478 return set->ops->map_queues(set);
2798
- } else
2799
- return blk_mq_map_queues(set);
3479
+ } else {
3480
+ BUG_ON(set->nr_maps > 1);
3481
+ return blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
3482
+ }
3483
+}
3484
+
3485
+static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set,
3486
+ int cur_nr_hw_queues, int new_nr_hw_queues)
3487
+{
3488
+ struct blk_mq_tags **new_tags;
3489
+
3490
+ if (cur_nr_hw_queues >= new_nr_hw_queues)
3491
+ return 0;
3492
+
3493
+ new_tags = kcalloc_node(new_nr_hw_queues, sizeof(struct blk_mq_tags *),
3494
+ GFP_KERNEL, set->numa_node);
3495
+ if (!new_tags)
3496
+ return -ENOMEM;
3497
+
3498
+ if (set->tags)
3499
+ memcpy(new_tags, set->tags, cur_nr_hw_queues *
3500
+ sizeof(*set->tags));
3501
+ kfree(set->tags);
3502
+ set->tags = new_tags;
3503
+ set->nr_hw_queues = new_nr_hw_queues;
3504
+
3505
+ return 0;
28003506 }
28013507
28023508 /*
....@@ -2807,7 +3513,7 @@
28073513 */
28083514 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
28093515 {
2810
- int ret;
3516
+ int i, ret;
28113517
28123518 BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);
28133519
....@@ -2830,6 +3536,11 @@
28303536 set->queue_depth = BLK_MQ_MAX_DEPTH;
28313537 }
28323538
3539
+ if (!set->nr_maps)
3540
+ set->nr_maps = 1;
3541
+ else if (set->nr_maps > HCTX_MAX_TYPES)
3542
+ return -EINVAL;
3543
+
28333544 /*
28343545 * If a crashdump is active, then we are potentially in a very
28353546 * memory constrained environment. Limit us to 1 queue and
....@@ -2837,42 +3548,59 @@
28373548 */
28383549 if (is_kdump_kernel()) {
28393550 set->nr_hw_queues = 1;
3551
+ set->nr_maps = 1;
28403552 set->queue_depth = min(64U, set->queue_depth);
28413553 }
28423554 /*
2843
- * There is no use for more h/w queues than cpus.
3555
+ * There is no use for more h/w queues than cpus if we just have
3556
+ * a single map.
28443557 */
2845
- if (set->nr_hw_queues > nr_cpu_ids)
3558
+ if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids)
28463559 set->nr_hw_queues = nr_cpu_ids;
28473560
2848
- set->tags = kcalloc_node(nr_cpu_ids, sizeof(struct blk_mq_tags *),
2849
- GFP_KERNEL, set->numa_node);
2850
- if (!set->tags)
3561
+ if (blk_mq_realloc_tag_set_tags(set, 0, set->nr_hw_queues) < 0)
28513562 return -ENOMEM;
28523563
28533564 ret = -ENOMEM;
2854
- set->mq_map = kcalloc_node(nr_cpu_ids, sizeof(*set->mq_map),
2855
- GFP_KERNEL, set->numa_node);
2856
- if (!set->mq_map)
2857
- goto out_free_tags;
3565
+ for (i = 0; i < set->nr_maps; i++) {
3566
+ set->map[i].mq_map = kcalloc_node(nr_cpu_ids,
3567
+ sizeof(set->map[i].mq_map[0]),
3568
+ GFP_KERNEL, set->numa_node);
3569
+ if (!set->map[i].mq_map)
3570
+ goto out_free_mq_map;
3571
+ set->map[i].nr_queues = is_kdump_kernel() ? 1 : set->nr_hw_queues;
3572
+ }
28583573
28593574 ret = blk_mq_update_queue_map(set);
28603575 if (ret)
28613576 goto out_free_mq_map;
28623577
2863
- ret = blk_mq_alloc_rq_maps(set);
3578
+ ret = blk_mq_alloc_map_and_requests(set);
28643579 if (ret)
28653580 goto out_free_mq_map;
3581
+
3582
+ if (blk_mq_is_sbitmap_shared(set->flags)) {
3583
+ atomic_set(&set->active_queues_shared_sbitmap, 0);
3584
+
3585
+ if (blk_mq_init_shared_sbitmap(set, set->flags)) {
3586
+ ret = -ENOMEM;
3587
+ goto out_free_mq_rq_maps;
3588
+ }
3589
+ }
28663590
28673591 mutex_init(&set->tag_list_lock);
28683592 INIT_LIST_HEAD(&set->tag_list);
28693593
28703594 return 0;
28713595
3596
+out_free_mq_rq_maps:
3597
+ for (i = 0; i < set->nr_hw_queues; i++)
3598
+ blk_mq_free_map_and_requests(set, i);
28723599 out_free_mq_map:
2873
- kfree(set->mq_map);
2874
- set->mq_map = NULL;
2875
-out_free_tags:
3600
+ for (i = 0; i < set->nr_maps; i++) {
3601
+ kfree(set->map[i].mq_map);
3602
+ set->map[i].mq_map = NULL;
3603
+ }
28763604 kfree(set->tags);
28773605 set->tags = NULL;
28783606 return ret;
....@@ -2881,13 +3609,18 @@
28813609
28823610 void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
28833611 {
2884
- int i;
3612
+ int i, j;
28853613
2886
- for (i = 0; i < nr_cpu_ids; i++)
3614
+ for (i = 0; i < set->nr_hw_queues; i++)
28873615 blk_mq_free_map_and_requests(set, i);
28883616
2889
- kfree(set->mq_map);
2890
- set->mq_map = NULL;
3617
+ if (blk_mq_is_sbitmap_shared(set->flags))
3618
+ blk_mq_exit_shared_sbitmap(set);
3619
+
3620
+ for (j = 0; j < set->nr_maps; j++) {
3621
+ kfree(set->map[j].mq_map);
3622
+ set->map[j].mq_map = NULL;
3623
+ }
28913624
28923625 kfree(set->tags);
28933626 set->tags = NULL;
....@@ -2903,6 +3636,9 @@
29033636 if (!set)
29043637 return -EINVAL;
29053638
3639
+ if (q->nr_requests == nr)
3640
+ return 0;
3641
+
29063642 blk_mq_freeze_queue(q);
29073643 blk_mq_quiesce_queue(q);
29083644
....@@ -2917,14 +3653,16 @@
29173653 if (!hctx->sched_tags) {
29183654 ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr,
29193655 false);
3656
+ if (!ret && blk_mq_is_sbitmap_shared(set->flags))
3657
+ blk_mq_tag_resize_shared_sbitmap(set, nr);
29203658 } else {
29213659 ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
29223660 nr, true);
29233661 }
29243662 if (ret)
29253663 break;
2926
- if (q->elevator && q->elevator->type->ops.mq.depth_updated)
2927
- q->elevator->type->ops.mq.depth_updated(hctx);
3664
+ if (q->elevator && q->elevator->type->ops.depth_updated)
3665
+ q->elevator->type->ops.depth_updated(hctx);
29283666 }
29293667
29303668 if (!ret)
....@@ -3011,20 +3749,19 @@
30113749 {
30123750 struct request_queue *q;
30133751 LIST_HEAD(head);
3752
+ int prev_nr_hw_queues;
30143753
30153754 lockdep_assert_held(&set->tag_list_lock);
30163755
3017
- if (nr_hw_queues > nr_cpu_ids)
3756
+ if (set->nr_maps == 1 && nr_hw_queues > nr_cpu_ids)
30183757 nr_hw_queues = nr_cpu_ids;
3019
- if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues)
3758
+ if (nr_hw_queues < 1)
3759
+ return;
3760
+ if (set->nr_maps == 1 && nr_hw_queues == set->nr_hw_queues)
30203761 return;
30213762
30223763 list_for_each_entry(q, &set->tag_list, tag_set_list)
30233764 blk_mq_freeze_queue(q);
3024
- /*
3025
- * Sync with blk_mq_queue_tag_busy_iter.
3026
- */
3027
- synchronize_rcu();
30283765 /*
30293766 * Switch IO scheduler to 'none', cleaning up the data associated
30303767 * with the previous scheduler. We will switch back once we are done
....@@ -3034,11 +3771,35 @@
30343771 if (!blk_mq_elv_switch_none(&head, q))
30353772 goto switch_back;
30363773
3774
+ list_for_each_entry(q, &set->tag_list, tag_set_list) {
3775
+ blk_mq_debugfs_unregister_hctxs(q);
3776
+ blk_mq_sysfs_unregister(q);
3777
+ }
3778
+
3779
+ prev_nr_hw_queues = set->nr_hw_queues;
3780
+ if (blk_mq_realloc_tag_set_tags(set, set->nr_hw_queues, nr_hw_queues) <
3781
+ 0)
3782
+ goto reregister;
3783
+
30373784 set->nr_hw_queues = nr_hw_queues;
3785
+fallback:
30383786 blk_mq_update_queue_map(set);
30393787 list_for_each_entry(q, &set->tag_list, tag_set_list) {
30403788 blk_mq_realloc_hw_ctxs(set, q);
3041
- blk_mq_queue_reinit(q);
3789
+ if (q->nr_hw_queues != set->nr_hw_queues) {
3790
+ pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n",
3791
+ nr_hw_queues, prev_nr_hw_queues);
3792
+ set->nr_hw_queues = prev_nr_hw_queues;
3793
+ blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
3794
+ goto fallback;
3795
+ }
3796
+ blk_mq_map_swqueue(q);
3797
+ }
3798
+
3799
+reregister:
3800
+ list_for_each_entry(q, &set->tag_list, tag_set_list) {
3801
+ blk_mq_sysfs_register(q);
3802
+ blk_mq_debugfs_register_hctxs(q);
30423803 }
30433804
30443805 switch_back:
....@@ -3092,7 +3853,6 @@
30923853 }
30933854
30943855 static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
3095
- struct blk_mq_hw_ctx *hctx,
30963856 struct request *rq)
30973857 {
30983858 unsigned long ret = 0;
....@@ -3125,7 +3885,6 @@
31253885 }
31263886
31273887 static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
3128
- struct blk_mq_hw_ctx *hctx,
31293888 struct request *rq)
31303889 {
31313890 struct hrtimer_sleeper hs;
....@@ -3137,18 +3896,15 @@
31373896 return false;
31383897
31393898 /*
3140
- * poll_nsec can be:
3899
+ * If we get here, hybrid polling is enabled. Hence poll_nsec can be:
31413900 *
3142
- * -1: don't ever hybrid sleep
31433901 * 0: use half of prev avg
31443902 * >0: use this specific value
31453903 */
3146
- if (q->poll_nsec == -1)
3147
- return false;
3148
- else if (q->poll_nsec > 0)
3904
+ if (q->poll_nsec > 0)
31493905 nsecs = q->poll_nsec;
31503906 else
3151
- nsecs = blk_mq_poll_nsecs(q, hctx, rq);
3907
+ nsecs = blk_mq_poll_nsecs(q, rq);
31523908
31533909 if (!nsecs)
31543910 return false;
....@@ -3162,14 +3918,14 @@
31623918 kt = nsecs;
31633919
31643920 mode = HRTIMER_MODE_REL;
3165
- hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, mode, current);
3921
+ hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, mode);
31663922 hrtimer_set_expires(&hs.timer, kt);
31673923
31683924 do {
31693925 if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE)
31703926 break;
31713927 set_current_state(TASK_UNINTERRUPTIBLE);
3172
- hrtimer_start_expires(&hs.timer, mode);
3928
+ hrtimer_sleeper_start_expires(&hs, mode);
31733929 if (hs.task)
31743930 io_schedule();
31753931 hrtimer_cancel(&hs.timer);
....@@ -3181,59 +3937,14 @@
31813937 return true;
31823938 }
31833939
3184
-static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq)
3940
+static bool blk_mq_poll_hybrid(struct request_queue *q,
3941
+ struct blk_mq_hw_ctx *hctx, blk_qc_t cookie)
31853942 {
3186
- struct request_queue *q = hctx->queue;
3187
- long state;
3188
-
3189
- /*
3190
- * If we sleep, have the caller restart the poll loop to reset
3191
- * the state. Like for the other success return cases, the
3192
- * caller is responsible for checking if the IO completed. If
3193
- * the IO isn't complete, we'll get called again and will go
3194
- * straight to the busy poll loop.
3195
- */
3196
- if (blk_mq_poll_hybrid_sleep(q, hctx, rq))
3197
- return true;
3198
-
3199
- hctx->poll_considered++;
3200
-
3201
- state = current->state;
3202
- while (!need_resched()) {
3203
- int ret;
3204
-
3205
- hctx->poll_invoked++;
3206
-
3207
- ret = q->mq_ops->poll(hctx, rq->tag);
3208
- if (ret > 0) {
3209
- hctx->poll_success++;
3210
- set_current_state(TASK_RUNNING);
3211
- return true;
3212
- }
3213
-
3214
- if (signal_pending_state(state, current))
3215
- set_current_state(TASK_RUNNING);
3216
-
3217
- if (current->state == TASK_RUNNING)
3218
- return true;
3219
- if (ret < 0)
3220
- break;
3221
- cpu_relax();
3222
- }
3223
-
3224
- __set_current_state(TASK_RUNNING);
3225
- return false;
3226
-}
3227
-
3228
-static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
3229
-{
3230
- struct blk_mq_hw_ctx *hctx;
32313943 struct request *rq;
32323944
3233
- if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
3945
+ if (q->poll_nsec == BLK_MQ_POLL_CLASSIC)
32343946 return false;
32353947
3236
- hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
32373948 if (!blk_qc_t_is_internal(cookie))
32383949 rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
32393950 else {
....@@ -3248,13 +3959,97 @@
32483959 return false;
32493960 }
32503961
3251
- return __blk_mq_poll(hctx, rq);
3962
+ return blk_mq_poll_hybrid_sleep(q, rq);
32523963 }
3964
+
3965
+/**
3966
+ * blk_poll - poll for IO completions
3967
+ * @q: the queue
3968
+ * @cookie: cookie passed back at IO submission time
3969
+ * @spin: whether to spin for completions
3970
+ *
3971
+ * Description:
3972
+ * Poll for completions on the passed in queue. Returns number of
3973
+ * completed entries found. If @spin is true, then blk_poll will continue
3974
+ * looping until at least one completion is found, unless the task is
3975
+ * otherwise marked running (or we need to reschedule).
3976
+ */
3977
+int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
3978
+{
3979
+ struct blk_mq_hw_ctx *hctx;
3980
+ long state;
3981
+
3982
+ if (!blk_qc_t_valid(cookie) ||
3983
+ !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
3984
+ return 0;
3985
+
3986
+ if (current->plug)
3987
+ blk_flush_plug_list(current->plug, false);
3988
+
3989
+ hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
3990
+
3991
+ /*
3992
+ * If we sleep, have the caller restart the poll loop to reset
3993
+ * the state. Like for the other success return cases, the
3994
+ * caller is responsible for checking if the IO completed. If
3995
+ * the IO isn't complete, we'll get called again and will go
3996
+ * straight to the busy poll loop.
3997
+ */
3998
+ if (blk_mq_poll_hybrid(q, hctx, cookie))
3999
+ return 1;
4000
+
4001
+ hctx->poll_considered++;
4002
+
4003
+ state = current->state;
4004
+ do {
4005
+ int ret;
4006
+
4007
+ hctx->poll_invoked++;
4008
+
4009
+ ret = q->mq_ops->poll(hctx);
4010
+ if (ret > 0) {
4011
+ hctx->poll_success++;
4012
+ __set_current_state(TASK_RUNNING);
4013
+ return ret;
4014
+ }
4015
+
4016
+ if (signal_pending_state(state, current))
4017
+ __set_current_state(TASK_RUNNING);
4018
+
4019
+ if (current->state == TASK_RUNNING)
4020
+ return 1;
4021
+ if (ret < 0 || !spin)
4022
+ break;
4023
+ cpu_relax();
4024
+ } while (!need_resched());
4025
+
4026
+ __set_current_state(TASK_RUNNING);
4027
+ return 0;
4028
+}
4029
+EXPORT_SYMBOL_GPL(blk_poll);
4030
+
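A caller-side sketch of the contract spelled out in the blk_poll() kernel-doc above, modelled on how synchronous polled direct I/O typically drives it; the done flag is hypothetical and stands in for whatever the caller's bio end_io handler sets.

/* Sketch: wait for a polled bio, alternating blk_poll() with real sleeps. */
static void poll_wait_for_bio(struct request_queue *q, blk_qc_t cookie,
			      bool *done)
{
	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (READ_ONCE(*done))
			break;
		/* blk_poll() made no progress: give up the CPU until the
		 * end_io handler wakes us instead. */
		if (blk_poll(q, cookie, true) <= 0)
			io_schedule();
	}
	__set_current_state(TASK_RUNNING);
}

Returning the number of reaped completions, rather than a bool, is what lets spinning callers tell "found work" apart from "spun without progress".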
4031
+unsigned int blk_mq_rq_cpu(struct request *rq)
4032
+{
4033
+ return rq->mq_ctx->cpu;
4034
+}
4035
+EXPORT_SYMBOL(blk_mq_rq_cpu);
32544037 static int __init blk_mq_init(void)
32554038 {
4039
+ int i;
4040
+
4041
+ for_each_possible_cpu(i)
4042
+ INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
4043
+ open_softirq(BLOCK_SOFTIRQ, blk_done_softirq);
4044
+
4045
+ cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD,
4046
+ "block/softirq:dead", NULL,
4047
+ blk_softirq_cpu_dead);
32564048 cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
32574049 blk_mq_hctx_notify_dead);
4050
+ cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online",
4051
+ blk_mq_hctx_notify_online,
4052
+ blk_mq_hctx_notify_offline);
32584053 return 0;
32594054 }
32604055 subsys_initcall(blk_mq_init);
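One note on the two *_multi hotplug states registered above: they are per-instance, so each consumer embeds an hlist_node, attaches it to the state, and the dead/online callbacks recover the owning object from that node (the per-CPU blk_cpu_done lists initialised here feed the BLOCK_SOFTIRQ completion handler). A generic sketch of that instance pairing; the structure, field and callback names are illustrative.

#include <linux/cpuhotplug.h>
#include <linux/list.h>

/* Illustrative per-hardware-context object with its hotplug hook. */
struct my_hw_ctx {
	struct hlist_node cpuhp_dead;
	/* ... per-context resources ... */
};

/* Callback shape expected by cpuhp_setup_state_multi(): called once per
 * registered instance when a CPU goes away. */
static int my_hw_ctx_notify_dead(unsigned int cpu, struct hlist_node *node)
{
	struct my_hw_ctx *ctx = hlist_entry_safe(node, struct my_hw_ctx,
						 cpuhp_dead);

	/* Drain or requeue whatever was parked on the now-dead CPU. */
	(void)ctx;
	return 0;
}

/*
 * Each instance is attached when its context is created, e.g.:
 *
 *	cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &ctx->cpuhp_dead);
 *
 * and detached again with cpuhp_state_remove_instance_nocalls() on teardown.
 */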