2023-12-11 d2ccde1c8e90d38cee87a1b0309ad2827f3fd30d
kernel/block/blk-mq.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * Block multiqueue core code
  *
@@ -25,30 +26,36 @@
 #include <linux/delay.h>
 #include <linux/crash_dump.h>
 #include <linux/prefetch.h>
+#include <linux/blk-crypto.h>

 #include <trace/events/block.h>

 #include <linux/blk-mq.h>
+#include <linux/t10-pi.h>
 #include "blk.h"
 #include "blk-mq.h"
 #include "blk-mq-debugfs.h"
 #include "blk-mq-tag.h"
+#include "blk-pm.h"
 #include "blk-stat.h"
 #include "blk-mq-sched.h"
 #include "blk-rq-qos.h"

-static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie);
+#include <trace/hooks/block.h>
+
+static DEFINE_PER_CPU(struct llist_head, blk_cpu_done);
+
 static void blk_mq_poll_stats_start(struct request_queue *q);
 static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);

 static int blk_mq_poll_stats_bkt(const struct request *rq)
 {
-	int ddir, bytes, bucket;
+	int ddir, sectors, bucket;

 	ddir = rq_data_dir(rq);
-	bytes = blk_rq_bytes(rq);
+	sectors = blk_rq_stats_sectors(rq);

-	bucket = ddir + 2*(ilog2(bytes) - 9);
+	bucket = ddir + 2 * ilog2(sectors);

 	if (bucket < 0)
 		return -1;
@@ -59,7 +66,8 @@
 }

 /*
- * Check if any of the ctx's have pending work in this hardware queue
+ * Check if any of the ctx, dispatch list or elevator
+ * have pending work in this hardware queue.
  */
 static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
 {
@@ -74,75 +82,67 @@
 static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
				     struct blk_mq_ctx *ctx)
 {
-	if (!sbitmap_test_bit(&hctx->ctx_map, ctx->index_hw))
-		sbitmap_set_bit(&hctx->ctx_map, ctx->index_hw);
+	const int bit = ctx->index_hw[hctx->type];
+
+	if (!sbitmap_test_bit(&hctx->ctx_map, bit))
+		sbitmap_set_bit(&hctx->ctx_map, bit);
 }

 static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
				      struct blk_mq_ctx *ctx)
 {
-	sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw);
+	const int bit = ctx->index_hw[hctx->type];
+
+	sbitmap_clear_bit(&hctx->ctx_map, bit);
 }

 struct mq_inflight {
 	struct hd_struct *part;
-	unsigned int *inflight;
+	unsigned int inflight[2];
 };

-static void blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
+static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
				  struct request *rq, void *priv,
				  bool reserved)
 {
 	struct mq_inflight *mi = priv;

-	/*
-	 * index[0] counts the specific partition that was asked for. index[1]
-	 * counts the ones that are active on the whole device, so increment
-	 * that if mi->part is indeed a partition, and not a whole device.
-	 */
-	if (rq->part == mi->part)
-		mi->inflight[0]++;
-	if (mi->part->partno)
-		mi->inflight[1]++;
-}
-
-void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part,
-		      unsigned int inflight[2])
-{
-	struct mq_inflight mi = { .part = part, .inflight = inflight, };
-
-	inflight[0] = inflight[1] = 0;
-	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
-}
-
-static void blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx,
-				     struct request *rq, void *priv,
-				     bool reserved)
-{
-	struct mq_inflight *mi = priv;
-
-	if (rq->part == mi->part)
+	if ((!mi->part->partno || rq->part == mi->part) &&
+	    blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT)
 		mi->inflight[rq_data_dir(rq)]++;
+
+	return true;
+}
+
+unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part)
+{
+	struct mq_inflight mi = { .part = part };
+
+	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
+
+	return mi.inflight[0] + mi.inflight[1];
 }

 void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part,
			 unsigned int inflight[2])
 {
-	struct mq_inflight mi = { .part = part, .inflight = inflight, };
+	struct mq_inflight mi = { .part = part };

-	inflight[0] = inflight[1] = 0;
-	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight_rw, &mi);
+	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
+	inflight[0] = mi.inflight[0];
+	inflight[1] = mi.inflight[1];
 }

 void blk_freeze_queue_start(struct request_queue *q)
 {
-	int freeze_depth;
-
-	freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
-	if (freeze_depth == 1) {
+	mutex_lock(&q->mq_freeze_lock);
+	if (++q->mq_freeze_depth == 1) {
 		percpu_ref_kill(&q->q_usage_counter);
-		if (q->mq_ops)
+		mutex_unlock(&q->mq_freeze_lock);
+		if (queue_is_mq(q))
 			blk_mq_run_hw_queues(q, false);
+	} else {
+		mutex_unlock(&q->mq_freeze_lock);
 	}
 }
 EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
@@ -176,8 +176,6 @@
	 * exported to drivers as the only user for unfreeze is blk_mq.
	 */
 	blk_freeze_queue_start(q);
-	if (!q->mq_ops)
-		blk_drain_queue(q);
 	blk_mq_freeze_queue_wait(q);
 }

@@ -193,14 +191,14 @@

 void blk_mq_unfreeze_queue(struct request_queue *q)
 {
-	int freeze_depth;
-
-	freeze_depth = atomic_dec_return(&q->mq_freeze_depth);
-	WARN_ON_ONCE(freeze_depth < 0);
-	if (!freeze_depth) {
-		percpu_ref_reinit(&q->q_usage_counter);
+	mutex_lock(&q->mq_freeze_lock);
+	q->mq_freeze_depth--;
+	WARN_ON_ONCE(q->mq_freeze_depth < 0);
+	if (!q->mq_freeze_depth) {
+		percpu_ref_resurrect(&q->q_usage_counter);
 		wake_up_all(&q->mq_freeze_wq);
 	}
+	mutex_unlock(&q->mq_freeze_lock);
 }
 EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);

@@ -268,40 +266,37 @@
 	blk_mq_tag_wakeup_all(hctx->tags, true);
 }

-bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
+/*
+ * Only need start/end time stamping if we have iostat or
+ * blk stats enabled, or using an IO scheduler.
+ */
+static inline bool blk_mq_need_time_stamp(struct request *rq)
 {
-	return blk_mq_has_free_tags(hctx->tags);
+	return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS)) || rq->q->elevator;
 }
-EXPORT_SYMBOL(blk_mq_can_queue);

 static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
-		unsigned int tag, unsigned int op)
+		unsigned int tag, u64 alloc_time_ns)
 {
 	struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
 	struct request *rq = tags->static_rqs[tag];
-	req_flags_t rq_flags = 0;

-	if (data->flags & BLK_MQ_REQ_INTERNAL) {
-		rq->tag = -1;
+	if (data->q->elevator) {
+		rq->tag = BLK_MQ_NO_TAG;
 		rq->internal_tag = tag;
 	} else {
-		if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) {
-			rq_flags = RQF_MQ_INFLIGHT;
-			atomic_inc(&data->hctx->nr_active);
-		}
 		rq->tag = tag;
-		rq->internal_tag = -1;
-		data->hctx->tags->rqs[rq->tag] = rq;
+		rq->internal_tag = BLK_MQ_NO_TAG;
 	}

 	/* csd/requeue_work/fifo_time is initialized before use */
 	rq->q = data->q;
 	rq->mq_ctx = data->ctx;
-	rq->rq_flags = rq_flags;
-	rq->cpu = -1;
-	rq->cmd_flags = op;
-	if (data->flags & BLK_MQ_REQ_PREEMPT)
-		rq->rq_flags |= RQF_PREEMPT;
+	rq->mq_hctx = data->hctx;
+	rq->rq_flags = 0;
+	rq->cmd_flags = data->cmd_flags;
+	if (data->flags & BLK_MQ_REQ_PM)
+		rq->rq_flags |= RQF_PM;
 	if (blk_queue_io_stat(data->q))
 		rq->rq_flags |= RQF_IO_STAT;
 	INIT_LIST_HEAD(&rq->queuelist);
....@@ -309,97 +304,110 @@
309304 RB_CLEAR_NODE(&rq->rb_node);
310305 rq->rq_disk = NULL;
311306 rq->part = NULL;
312
- rq->start_time_ns = ktime_get_ns();
307
+#ifdef CONFIG_BLK_RQ_ALLOC_TIME
308
+ rq->alloc_time_ns = alloc_time_ns;
309
+#endif
310
+ if (blk_mq_need_time_stamp(rq))
311
+ rq->start_time_ns = ktime_get_ns();
312
+ else
313
+ rq->start_time_ns = 0;
313314 rq->io_start_time_ns = 0;
315
+ rq->stats_sectors = 0;
314316 rq->nr_phys_segments = 0;
315317 #if defined(CONFIG_BLK_DEV_INTEGRITY)
316318 rq->nr_integrity_segments = 0;
317319 #endif
318
- rq->special = NULL;
320
+ blk_crypto_rq_set_defaults(rq);
319321 /* tag was already set */
320
- rq->extra_len = 0;
321
- rq->__deadline = 0;
322
+ WRITE_ONCE(rq->deadline, 0);
322323
323
- INIT_LIST_HEAD(&rq->timeout_list);
324324 rq->timeout = 0;
325325
326326 rq->end_io = NULL;
327327 rq->end_io_data = NULL;
328
- rq->next_rq = NULL;
329328
330
-#ifdef CONFIG_BLK_CGROUP
331
- rq->rl = NULL;
332
-#endif
333
-
334
- data->ctx->rq_dispatched[op_is_sync(op)]++;
329
+ data->ctx->rq_dispatched[op_is_sync(data->cmd_flags)]++;
335330 refcount_set(&rq->ref, 1);
331
+
332
+ if (!op_is_flush(data->cmd_flags)) {
333
+ struct elevator_queue *e = data->q->elevator;
334
+
335
+ rq->elv.icq = NULL;
336
+ if (e && e->type->ops.prepare_request) {
337
+ if (e->type->icq_cache)
338
+ blk_mq_sched_assign_ioc(rq);
339
+
340
+ e->type->ops.prepare_request(rq);
341
+ rq->rq_flags |= RQF_ELVPRIV;
342
+ }
343
+ }
344
+
345
+ data->hctx->queued++;
346
+ trace_android_vh_blk_rq_ctx_init(rq, tags, data, alloc_time_ns);
336347 return rq;
337348 }
338349
339
-static struct request *blk_mq_get_request(struct request_queue *q,
340
- struct bio *bio, unsigned int op,
341
- struct blk_mq_alloc_data *data)
350
+static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data)
342351 {
352
+ struct request_queue *q = data->q;
343353 struct elevator_queue *e = q->elevator;
344
- struct request *rq;
354
+ u64 alloc_time_ns = 0;
345355 unsigned int tag;
346
- bool put_ctx_on_error = false;
347356
348
- blk_queue_enter_live(q);
349
- data->q = q;
350
- if (likely(!data->ctx)) {
351
- data->ctx = blk_mq_get_ctx(q);
352
- put_ctx_on_error = true;
353
- }
354
- if (likely(!data->hctx))
355
- data->hctx = blk_mq_map_queue(q, data->ctx->cpu);
356
- if (op & REQ_NOWAIT)
357
+ /* alloc_time includes depth and tag waits */
358
+ if (blk_queue_rq_alloc_time(q))
359
+ alloc_time_ns = ktime_get_ns();
360
+
361
+ if (data->cmd_flags & REQ_NOWAIT)
357362 data->flags |= BLK_MQ_REQ_NOWAIT;
358363
359364 if (e) {
360
- data->flags |= BLK_MQ_REQ_INTERNAL;
361
-
362365 /*
363366 * Flush requests are special and go directly to the
364367 * dispatch list. Don't include reserved tags in the
365368 * limiting, as it isn't useful.
366369 */
367
- if (!op_is_flush(op) && e->type->ops.mq.limit_depth &&
370
+ if (!op_is_flush(data->cmd_flags) &&
371
+ e->type->ops.limit_depth &&
368372 !(data->flags & BLK_MQ_REQ_RESERVED))
369
- e->type->ops.mq.limit_depth(op, data);
370
- } else {
373
+ e->type->ops.limit_depth(data->cmd_flags, data);
374
+ }
375
+
376
+retry:
377
+ data->ctx = blk_mq_get_ctx(q);
378
+ data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx);
379
+ if (!e)
371380 blk_mq_tag_busy(data->hctx);
372
- }
373381
382
+ /*
383
+ * Waiting allocations only fail because of an inactive hctx. In that
384
+ * case just retry the hctx assignment and tag allocation as CPU hotplug
385
+ * should have migrated us to an online CPU by now.
386
+ */
374387 tag = blk_mq_get_tag(data);
375
- if (tag == BLK_MQ_TAG_FAIL) {
376
- if (put_ctx_on_error) {
377
- blk_mq_put_ctx(data->ctx);
378
- data->ctx = NULL;
379
- }
380
- blk_queue_exit(q);
381
- return NULL;
382
- }
388
+ if (tag == BLK_MQ_NO_TAG) {
389
+ if (data->flags & BLK_MQ_REQ_NOWAIT)
390
+ return NULL;
383391
384
- rq = blk_mq_rq_ctx_init(data, tag, op);
385
- if (!op_is_flush(op)) {
386
- rq->elv.icq = NULL;
387
- if (e && e->type->ops.mq.prepare_request) {
388
- if (e->type->icq_cache && rq_ioc(bio))
389
- blk_mq_sched_assign_ioc(rq, bio);
390
-
391
- e->type->ops.mq.prepare_request(rq, bio);
392
- rq->rq_flags |= RQF_ELVPRIV;
393
- }
392
+ /*
393
+ * Give up the CPU and sleep for a random short time to ensure
394
+ * that thread using a realtime scheduling class are migrated
395
+ * off the CPU, and thus off the hctx that is going away.
396
+ */
397
+ msleep(3);
398
+ goto retry;
394399 }
395
- data->hctx->queued++;
396
- return rq;
400
+ return blk_mq_rq_ctx_init(data, tag, alloc_time_ns);
397401 }
398402
399403 struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
400404 blk_mq_req_flags_t flags)
401405 {
402
- struct blk_mq_alloc_data alloc_data = { .flags = flags };
406
+ struct blk_mq_alloc_data data = {
407
+ .q = q,
408
+ .flags = flags,
409
+ .cmd_flags = op,
410
+ };
403411 struct request *rq;
404412 int ret;
405413
....@@ -407,28 +415,35 @@
407415 if (ret)
408416 return ERR_PTR(ret);
409417
410
- rq = blk_mq_get_request(q, NULL, op, &alloc_data);
411
- blk_queue_exit(q);
412
-
418
+ rq = __blk_mq_alloc_request(&data);
413419 if (!rq)
414
- return ERR_PTR(-EWOULDBLOCK);
415
-
416
- blk_mq_put_ctx(alloc_data.ctx);
417
-
420
+ goto out_queue_exit;
418421 rq->__data_len = 0;
419422 rq->__sector = (sector_t) -1;
420423 rq->bio = rq->biotail = NULL;
421424 return rq;
425
+out_queue_exit:
426
+ blk_queue_exit(q);
427
+ return ERR_PTR(-EWOULDBLOCK);
422428 }
423429 EXPORT_SYMBOL(blk_mq_alloc_request);
424430
425431 struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
426432 unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx)
427433 {
428
- struct blk_mq_alloc_data alloc_data = { .flags = flags };
429
- struct request *rq;
434
+ struct blk_mq_alloc_data data = {
435
+ .q = q,
436
+ .flags = flags,
437
+ .cmd_flags = op,
438
+ };
439
+ u64 alloc_time_ns = 0;
430440 unsigned int cpu;
441
+ unsigned int tag;
431442 int ret;
443
+
444
+ /* alloc_time includes depth and tag waits */
445
+ if (blk_queue_rq_alloc_time(q))
446
+ alloc_time_ns = ktime_get_ns();
432447
433448 /*
434449 * If the tag allocator sleeps we could get an allocation for a
....@@ -436,7 +451,7 @@
436451 * allocator for this for the rare use case of a command tied to
437452 * a specific queue.
438453 */
439
- if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)))
454
+ if (WARN_ON_ONCE(!(flags & (BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED))))
440455 return ERR_PTR(-EINVAL);
441456
442457 if (hctx_idx >= q->nr_hw_queues)
....@@ -450,21 +465,27 @@
450465 * Check if the hardware context is actually mapped to anything.
451466 * If not tell the caller that it should skip this queue.
452467 */
453
- alloc_data.hctx = q->queue_hw_ctx[hctx_idx];
454
- if (!blk_mq_hw_queue_mapped(alloc_data.hctx)) {
455
- blk_queue_exit(q);
456
- return ERR_PTR(-EXDEV);
457
- }
458
- cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask);
459
- alloc_data.ctx = __blk_mq_get_ctx(q, cpu);
468
+ ret = -EXDEV;
469
+ data.hctx = q->queue_hw_ctx[hctx_idx];
470
+ if (!blk_mq_hw_queue_mapped(data.hctx))
471
+ goto out_queue_exit;
472
+ cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask);
473
+ if (cpu >= nr_cpu_ids)
474
+ goto out_queue_exit;
475
+ data.ctx = __blk_mq_get_ctx(q, cpu);
460476
461
- rq = blk_mq_get_request(q, NULL, op, &alloc_data);
477
+ if (!q->elevator)
478
+ blk_mq_tag_busy(data.hctx);
479
+
480
+ ret = -EWOULDBLOCK;
481
+ tag = blk_mq_get_tag(&data);
482
+ if (tag == BLK_MQ_NO_TAG)
483
+ goto out_queue_exit;
484
+ return blk_mq_rq_ctx_init(&data, tag, alloc_time_ns);
485
+
486
+out_queue_exit:
462487 blk_queue_exit(q);
463
-
464
- if (!rq)
465
- return ERR_PTR(-EWOULDBLOCK);
466
-
467
- return rq;
488
+ return ERR_PTR(ret);
468489 }
469490 EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
470491
....@@ -472,13 +493,16 @@
472493 {
473494 struct request_queue *q = rq->q;
474495 struct blk_mq_ctx *ctx = rq->mq_ctx;
475
- struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
496
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
476497 const int sched_tag = rq->internal_tag;
477498
478
- if (rq->tag != -1)
479
- blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
480
- if (sched_tag != -1)
481
- blk_mq_put_tag(hctx, hctx->sched_tags, ctx, sched_tag);
499
+ blk_crypto_free_request(rq);
500
+ blk_pm_mark_last_busy(rq);
501
+ rq->mq_hctx = NULL;
502
+ if (rq->tag != BLK_MQ_NO_TAG)
503
+ blk_mq_put_tag(hctx->tags, ctx, rq->tag);
504
+ if (sched_tag != BLK_MQ_NO_TAG)
505
+ blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag);
482506 blk_mq_sched_restart(hctx);
483507 blk_queue_exit(q);
484508 }
....@@ -488,11 +512,11 @@
488512 struct request_queue *q = rq->q;
489513 struct elevator_queue *e = q->elevator;
490514 struct blk_mq_ctx *ctx = rq->mq_ctx;
491
- struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
515
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
492516
493517 if (rq->rq_flags & RQF_ELVPRIV) {
494
- if (e && e->type->ops.mq.finish_request)
495
- e->type->ops.mq.finish_request(rq);
518
+ if (e && e->type->ops.finish_request)
519
+ e->type->ops.finish_request(rq);
496520 if (rq->elv.icq) {
497521 put_io_context(rq->elv.icq->ioc);
498522 rq->elv.icq = NULL;
....@@ -501,15 +525,12 @@
501525
502526 ctx->rq_completed[rq_is_sync(rq)]++;
503527 if (rq->rq_flags & RQF_MQ_INFLIGHT)
504
- atomic_dec(&hctx->nr_active);
528
+ __blk_mq_dec_active_requests(hctx);
505529
506530 if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
507531 laptop_io_completion(q->backing_dev_info);
508532
509533 rq_qos_done(q, rq);
510
-
511
- if (blk_rq_rl(rq))
512
- blk_put_rl(blk_rq_rl(rq));
513534
514535 WRITE_ONCE(rq->state, MQ_RQ_IDLE);
515536 if (refcount_dec_and_test(&rq->ref))
....@@ -519,12 +540,17 @@
519540
520541 inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
521542 {
522
- u64 now = ktime_get_ns();
543
+ u64 now = 0;
544
+
545
+ if (blk_mq_need_time_stamp(rq))
546
+ now = ktime_get_ns();
523547
524548 if (rq->rq_flags & RQF_STATS) {
525549 blk_mq_poll_stats_start(rq->q);
526550 blk_stat_add(rq, now);
527551 }
552
+
553
+ blk_mq_sched_completed_request(rq, now);
528554
529555 blk_account_io_done(rq, now);
530556
....@@ -532,8 +558,6 @@
532558 rq_qos_done(rq->q, rq);
533559 rq->end_io(rq, error);
534560 } else {
535
- if (unlikely(blk_bidi_rq(rq)))
536
- blk_mq_free_request(rq->next_rq);
537561 blk_mq_free_request(rq);
538562 }
539563 }
....@@ -547,43 +571,120 @@
547571 }
548572 EXPORT_SYMBOL(blk_mq_end_request);
549573
550
-static void __blk_mq_complete_request_remote(void *data)
574
+static void blk_complete_reqs(struct llist_head *list)
551575 {
552
- struct request *rq = data;
576
+ struct llist_node *entry = llist_reverse_order(llist_del_all(list));
577
+ struct request *rq, *next;
553578
554
- rq->q->softirq_done_fn(rq);
579
+ llist_for_each_entry_safe(rq, next, entry, ipi_list)
580
+ rq->q->mq_ops->complete(rq);
555581 }
556582
557
-static void __blk_mq_complete_request(struct request *rq)
583
+static __latent_entropy void blk_done_softirq(struct softirq_action *h)
558584 {
559
- struct blk_mq_ctx *ctx = rq->mq_ctx;
560
- bool shared = false;
561
- int cpu;
585
+ blk_complete_reqs(this_cpu_ptr(&blk_cpu_done));
586
+}
562587
563
- if (!blk_mq_mark_complete(rq))
564
- return;
565
- if (rq->internal_tag != -1)
566
- blk_mq_sched_completed_request(rq);
588
+static int blk_softirq_cpu_dead(unsigned int cpu)
589
+{
590
+ blk_complete_reqs(&per_cpu(blk_cpu_done, cpu));
591
+ return 0;
592
+}
567593
568
- if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) {
569
- rq->q->softirq_done_fn(rq);
570
- return;
571
- }
594
+static void __blk_mq_complete_request_remote(void *data)
595
+{
596
+ __raise_softirq_irqoff(BLOCK_SOFTIRQ);
597
+}
572598
573
- cpu = get_cpu();
574
- if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
575
- shared = cpus_share_cache(cpu, ctx->cpu);
599
+static inline bool blk_mq_complete_need_ipi(struct request *rq)
600
+{
601
+ int cpu = raw_smp_processor_id();
576602
577
- if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
603
+ if (!IS_ENABLED(CONFIG_SMP) ||
604
+ !test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags))
605
+ return false;
606
+ /*
607
+ * With force threaded interrupts enabled, raising softirq from an SMP
608
+ * function call will always result in waking the ksoftirqd thread.
609
+ * This is probably worse than completing the request on a different
610
+ * cache domain.
611
+ */
612
+ if (force_irqthreads)
613
+ return false;
614
+
615
+ /* same CPU or cache domain? Complete locally */
616
+ if (cpu == rq->mq_ctx->cpu ||
617
+ (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) &&
618
+ cpus_share_cache(cpu, rq->mq_ctx->cpu)))
619
+ return false;
620
+
621
+ /* don't try to IPI to an offline CPU */
622
+ return cpu_online(rq->mq_ctx->cpu);
623
+}
624
+
625
+static void blk_mq_complete_send_ipi(struct request *rq)
626
+{
627
+ struct llist_head *list;
628
+ unsigned int cpu;
629
+
630
+ cpu = rq->mq_ctx->cpu;
631
+ list = &per_cpu(blk_cpu_done, cpu);
632
+ if (llist_add(&rq->ipi_list, list)) {
578633 rq->csd.func = __blk_mq_complete_request_remote;
579634 rq->csd.info = rq;
580635 rq->csd.flags = 0;
581
- smp_call_function_single_async(ctx->cpu, &rq->csd);
582
- } else {
583
- rq->q->softirq_done_fn(rq);
636
+ smp_call_function_single_async(cpu, &rq->csd);
584637 }
585
- put_cpu();
586638 }
639
+
640
+static void blk_mq_raise_softirq(struct request *rq)
641
+{
642
+ struct llist_head *list;
643
+
644
+ preempt_disable();
645
+ list = this_cpu_ptr(&blk_cpu_done);
646
+ if (llist_add(&rq->ipi_list, list))
647
+ raise_softirq(BLOCK_SOFTIRQ);
648
+ preempt_enable();
649
+}
650
+
651
+bool blk_mq_complete_request_remote(struct request *rq)
652
+{
653
+ WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
654
+
655
+ /*
656
+ * For a polled request, always complete locally, it's pointless
657
+ * to redirect the completion.
658
+ */
659
+ if (rq->cmd_flags & REQ_HIPRI)
660
+ return false;
661
+
662
+ if (blk_mq_complete_need_ipi(rq)) {
663
+ blk_mq_complete_send_ipi(rq);
664
+ return true;
665
+ }
666
+
667
+ if (rq->q->nr_hw_queues == 1) {
668
+ blk_mq_raise_softirq(rq);
669
+ return true;
670
+ }
671
+ return false;
672
+}
673
+EXPORT_SYMBOL_GPL(blk_mq_complete_request_remote);
674
+
675
+/**
676
+ * blk_mq_complete_request - end I/O on a request
677
+ * @rq: the request being processed
678
+ *
679
+ * Description:
680
+ * Complete a request by scheduling the ->complete_rq operation.
681
+ **/
682
+void blk_mq_complete_request(struct request *rq)
683
+{
684
+ if (!blk_mq_complete_request_remote(rq))
685
+ rq->q->mq_ops->complete(rq);
686
+}
687
+EXPORT_SYMBOL(blk_mq_complete_request);
587688
588689 static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
589690 __releases(hctx->srcu)
....@@ -606,40 +707,22 @@
606707 }
607708
608709 /**
609
- * blk_mq_complete_request - end I/O on a request
610
- * @rq: the request being processed
710
+ * blk_mq_start_request - Start processing a request
711
+ * @rq: Pointer to request to be started
611712 *
612
- * Description:
613
- * Ends all I/O on a request. It does not handle partial completions.
614
- * The actual completion happens out-of-order, through a IPI handler.
615
- **/
616
-void blk_mq_complete_request(struct request *rq)
617
-{
618
- if (unlikely(blk_should_fake_timeout(rq->q)))
619
- return;
620
- __blk_mq_complete_request(rq);
621
-}
622
-EXPORT_SYMBOL(blk_mq_complete_request);
623
-
624
-int blk_mq_request_started(struct request *rq)
625
-{
626
- return blk_mq_rq_state(rq) != MQ_RQ_IDLE;
627
-}
628
-EXPORT_SYMBOL_GPL(blk_mq_request_started);
629
-
713
+ * Function used by device drivers to notify the block layer that a request
714
+ * is going to be processed now, so blk layer can do proper initializations
715
+ * such as starting the timeout timer.
716
+ */
630717 void blk_mq_start_request(struct request *rq)
631718 {
632719 struct request_queue *q = rq->q;
633
-
634
- blk_mq_sched_started_request(rq);
635720
636721 trace_block_rq_issue(q, rq);
637722
638723 if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
639724 rq->io_start_time_ns = ktime_get_ns();
640
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
641
- rq->throtl_size = blk_rq_sectors(rq);
642
-#endif
725
+ rq->stats_sectors = blk_rq_sectors(rq);
643726 rq->rq_flags |= RQF_STATS;
644727 rq_qos_issue(q, rq);
645728 }
....@@ -649,14 +732,10 @@
649732 blk_add_timer(rq);
650733 WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT);
651734
652
- if (q->dma_drain_size && blk_rq_bytes(rq)) {
653
- /*
654
- * Make sure space for the drain appears. We know we can do
655
- * this because max_hw_segments has been adjusted to be one
656
- * fewer than the device can handle.
657
- */
658
- rq->nr_phys_segments++;
659
- }
735
+#ifdef CONFIG_BLK_DEV_INTEGRITY
736
+ if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE)
737
+ q->integrity.profile->prepare_fn(rq);
738
+#endif
660739 }
661740 EXPORT_SYMBOL(blk_mq_start_request);
662741
....@@ -672,8 +751,6 @@
672751 if (blk_mq_request_started(rq)) {
673752 WRITE_ONCE(rq->state, MQ_RQ_IDLE);
674753 rq->rq_flags &= ~RQF_TIMED_OUT;
675
- if (q->dma_drain_size && blk_rq_bytes(rq))
676
- rq->nr_phys_segments--;
677754 }
678755 }
679756
....@@ -684,7 +761,6 @@
684761 /* this request will be re-inserted to io scheduler queue */
685762 blk_mq_sched_requeue_request(rq);
686763
687
- BUG_ON(blk_queued_rq(rq));
688764 blk_mq_add_to_requeue_list(rq, true, kick_requeue_list);
689765 }
690766 EXPORT_SYMBOL(blk_mq_requeue_request);
....@@ -712,7 +788,7 @@
712788 * merge.
713789 */
714790 if (rq->rq_flags & RQF_DONTPREP)
715
- blk_mq_request_bypass_insert(rq, false);
791
+ blk_mq_request_bypass_insert(rq, false, false);
716792 else
717793 blk_mq_sched_insert_request(rq, true, false, false);
718794 }
....@@ -750,7 +826,6 @@
750826 if (kick_requeue_list)
751827 blk_mq_kick_requeue_list(q);
752828 }
753
-EXPORT_SYMBOL(blk_mq_add_to_requeue_list);
754829
755830 void blk_mq_kick_requeue_list(struct request_queue *q)
756831 {
....@@ -777,6 +852,32 @@
777852 }
778853 EXPORT_SYMBOL(blk_mq_tag_to_rq);
779854
855
+static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq,
856
+ void *priv, bool reserved)
857
+{
858
+ /*
859
+ * If we find a request that isn't idle and the queue matches,
860
+ * we know the queue is busy. Return false to stop the iteration.
861
+ */
862
+ if (blk_mq_request_started(rq) && rq->q == hctx->queue) {
863
+ bool *busy = priv;
864
+
865
+ *busy = true;
866
+ return false;
867
+ }
868
+
869
+ return true;
870
+}
871
+
872
+bool blk_mq_queue_inflight(struct request_queue *q)
873
+{
874
+ bool busy = false;
875
+
876
+ blk_mq_queue_tag_busy_iter(q, blk_mq_rq_inflight, &busy);
877
+ return busy;
878
+}
879
+EXPORT_SYMBOL_GPL(blk_mq_queue_inflight);
880
+
780881 static void blk_mq_rq_timed_out(struct request *req, bool reserved)
781882 {
782883 req->rq_flags |= RQF_TIMED_OUT;
....@@ -801,7 +902,7 @@
801902 if (rq->rq_flags & RQF_TIMED_OUT)
802903 return false;
803904
804
- deadline = blk_rq_deadline(rq);
905
+ deadline = READ_ONCE(rq->deadline);
805906 if (time_after_eq(jiffies, deadline))
806907 return true;
807908
....@@ -812,43 +913,29 @@
812913 return false;
813914 }
814915
815
-static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
916
+void blk_mq_put_rq_ref(struct request *rq)
917
+{
918
+ if (is_flush_rq(rq))
919
+ rq->end_io(rq, 0);
920
+ else if (refcount_dec_and_test(&rq->ref))
921
+ __blk_mq_free_request(rq);
922
+}
923
+
924
+static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
816925 struct request *rq, void *priv, bool reserved)
817926 {
818927 unsigned long *next = priv;
819928
820929 /*
821
- * Just do a quick check if it is expired before locking the request in
822
- * so we're not unnecessarilly synchronizing across CPUs.
823
- */
824
- if (!blk_mq_req_expired(rq, next))
825
- return;
826
-
827
- /*
828
- * We have reason to believe the request may be expired. Take a
829
- * reference on the request to lock this request lifetime into its
830
- * currently allocated context to prevent it from being reallocated in
831
- * the event the completion by-passes this timeout handler.
832
- *
833
- * If the reference was already released, then the driver beat the
834
- * timeout handler to posting a natural completion.
835
- */
836
- if (!refcount_inc_not_zero(&rq->ref))
837
- return;
838
-
839
- /*
840
- * The request is now locked and cannot be reallocated underneath the
841
- * timeout handler's processing. Re-verify this exact request is truly
842
- * expired; if it is not expired, then the request was completed and
843
- * reallocated as a new request.
930
+ * blk_mq_queue_tag_busy_iter() has locked the request, so it cannot
931
+ * be reallocated underneath the timeout handler's processing, then
932
+ * the expire check is reliable. If the request is not expired, then
933
+ * it was completed and reallocated as a new request after returning
934
+ * from blk_mq_check_expired().
844935 */
845936 if (blk_mq_req_expired(rq, next))
846937 blk_mq_rq_timed_out(rq, reserved);
847
-
848
- if (is_flush_rq(rq, hctx))
849
- rq->end_io(rq, 0);
850
- else if (refcount_dec_and_test(&rq->ref))
851
- __blk_mq_free_request(rq);
938
+ return true;
852939 }
853940
854941 static void blk_mq_timeout_work(struct work_struct *work)
....@@ -905,9 +992,10 @@
905992 struct flush_busy_ctx_data *flush_data = data;
906993 struct blk_mq_hw_ctx *hctx = flush_data->hctx;
907994 struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
995
+ enum hctx_type type = hctx->type;
908996
909997 spin_lock(&ctx->lock);
910
- list_splice_tail_init(&ctx->rq_list, flush_data->list);
998
+ list_splice_tail_init(&ctx->rq_lists[type], flush_data->list);
911999 sbitmap_clear_bit(sb, bitnr);
9121000 spin_unlock(&ctx->lock);
9131001 return true;
....@@ -939,12 +1027,13 @@
9391027 struct dispatch_rq_data *dispatch_data = data;
9401028 struct blk_mq_hw_ctx *hctx = dispatch_data->hctx;
9411029 struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
1030
+ enum hctx_type type = hctx->type;
9421031
9431032 spin_lock(&ctx->lock);
944
- if (!list_empty(&ctx->rq_list)) {
945
- dispatch_data->rq = list_entry_rq(ctx->rq_list.next);
1033
+ if (!list_empty(&ctx->rq_lists[type])) {
1034
+ dispatch_data->rq = list_entry_rq(ctx->rq_lists[type].next);
9461035 list_del_init(&dispatch_data->rq->queuelist);
947
- if (list_empty(&ctx->rq_list))
1036
+ if (list_empty(&ctx->rq_lists[type]))
9481037 sbitmap_clear_bit(sb, bitnr);
9491038 }
9501039 spin_unlock(&ctx->lock);
....@@ -955,7 +1044,7 @@
9551044 struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
9561045 struct blk_mq_ctx *start)
9571046 {
958
- unsigned off = start ? start->index_hw : 0;
1047
+ unsigned off = start ? start->index_hw[hctx->type] : 0;
9591048 struct dispatch_rq_data data = {
9601049 .hctx = hctx,
9611050 .rq = NULL,
....@@ -975,33 +1064,44 @@
9751064 return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
9761065 }
9771066
978
-bool blk_mq_get_driver_tag(struct request *rq)
1067
+static bool __blk_mq_get_driver_tag(struct request *rq)
9791068 {
980
- struct blk_mq_alloc_data data = {
981
- .q = rq->q,
982
- .hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu),
983
- .flags = BLK_MQ_REQ_NOWAIT,
984
- };
985
- bool shared;
1069
+ struct sbitmap_queue *bt = rq->mq_hctx->tags->bitmap_tags;
1070
+ unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
1071
+ int tag;
9861072
987
- if (rq->tag != -1)
988
- goto done;
1073
+ blk_mq_tag_busy(rq->mq_hctx);
9891074
990
- if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag))
991
- data.flags |= BLK_MQ_REQ_RESERVED;
992
-
993
- shared = blk_mq_tag_busy(data.hctx);
994
- rq->tag = blk_mq_get_tag(&data);
995
- if (rq->tag >= 0) {
996
- if (shared) {
997
- rq->rq_flags |= RQF_MQ_INFLIGHT;
998
- atomic_inc(&data.hctx->nr_active);
999
- }
1000
- data.hctx->tags->rqs[rq->tag] = rq;
1075
+ if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) {
1076
+ bt = rq->mq_hctx->tags->breserved_tags;
1077
+ tag_offset = 0;
1078
+ } else {
1079
+ if (!hctx_may_queue(rq->mq_hctx, bt))
1080
+ return false;
10011081 }
10021082
1003
-done:
1004
- return rq->tag != -1;
1083
+ tag = __sbitmap_queue_get(bt);
1084
+ if (tag == BLK_MQ_NO_TAG)
1085
+ return false;
1086
+
1087
+ rq->tag = tag + tag_offset;
1088
+ return true;
1089
+}
1090
+
1091
+static bool blk_mq_get_driver_tag(struct request *rq)
1092
+{
1093
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
1094
+
1095
+ if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_get_driver_tag(rq))
1096
+ return false;
1097
+
1098
+ if ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) &&
1099
+ !(rq->rq_flags & RQF_MQ_INFLIGHT)) {
1100
+ rq->rq_flags |= RQF_MQ_INFLIGHT;
1101
+ __blk_mq_inc_active_requests(hctx);
1102
+ }
1103
+ hctx->tags->rqs[rq->tag] = rq;
1104
+ return true;
10051105 }
10061106
10071107 static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
....@@ -1012,7 +1112,13 @@
10121112 hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
10131113
10141114 spin_lock(&hctx->dispatch_wait_lock);
1015
- list_del_init(&wait->entry);
1115
+ if (!list_empty(&wait->entry)) {
1116
+ struct sbitmap_queue *sbq;
1117
+
1118
+ list_del_init(&wait->entry);
1119
+ sbq = hctx->tags->bitmap_tags;
1120
+ atomic_dec(&sbq->ws_active);
1121
+ }
10161122 spin_unlock(&hctx->dispatch_wait_lock);
10171123
10181124 blk_mq_run_hw_queue(hctx, true);
....@@ -1028,13 +1134,13 @@
10281134 static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
10291135 struct request *rq)
10301136 {
1137
+ struct sbitmap_queue *sbq = hctx->tags->bitmap_tags;
10311138 struct wait_queue_head *wq;
10321139 wait_queue_entry_t *wait;
10331140 bool ret;
10341141
1035
- if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) {
1036
- if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
1037
- set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
1142
+ if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
1143
+ blk_mq_sched_mark_restart_hctx(hctx);
10381144
10391145 /*
10401146 * It's possible that a tag was freed in the window between the
....@@ -1051,7 +1157,7 @@
10511157 if (!list_empty_careful(&wait->entry))
10521158 return false;
10531159
1054
- wq = &bt_wait_ptr(&hctx->tags->bitmap_tags, hctx)->wait;
1160
+ wq = &bt_wait_ptr(sbq, hctx)->wait;
10551161
10561162 spin_lock_irq(&wq->lock);
10571163 spin_lock(&hctx->dispatch_wait_lock);
....@@ -1061,6 +1167,7 @@
10611167 return false;
10621168 }
10631169
1170
+ atomic_inc(&sbq->ws_active);
10641171 wait->flags &= ~WQ_FLAG_EXCLUSIVE;
10651172 __add_wait_queue(wq, wait);
10661173
....@@ -1081,6 +1188,7 @@
10811188 * someone else gets the wakeup.
10821189 */
10831190 list_del_init(&wait->entry);
1191
+ atomic_dec(&sbq->ws_active);
10841192 spin_unlock(&hctx->dispatch_wait_lock);
10851193 spin_unlock_irq(&wq->lock);
10861194
....@@ -1099,9 +1207,6 @@
10991207 static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
11001208 {
11011209 unsigned int ewma;
1102
-
1103
- if (hctx->queue->elevator)
1104
- return;
11051210
11061211 ewma = hctx->dispatch_busy;
11071212
....@@ -1135,22 +1240,83 @@
11351240 __blk_mq_requeue_request(rq);
11361241 }
11371242
1243
+static void blk_mq_handle_zone_resource(struct request *rq,
1244
+ struct list_head *zone_list)
1245
+{
1246
+ /*
1247
+ * If we end up here it is because we cannot dispatch a request to a
1248
+ * specific zone due to LLD level zone-write locking or other zone
1249
+ * related resource not being available. In this case, set the request
1250
+ * aside in zone_list for retrying it later.
1251
+ */
1252
+ list_add(&rq->queuelist, zone_list);
1253
+ __blk_mq_requeue_request(rq);
1254
+}
1255
+
1256
+enum prep_dispatch {
1257
+ PREP_DISPATCH_OK,
1258
+ PREP_DISPATCH_NO_TAG,
1259
+ PREP_DISPATCH_NO_BUDGET,
1260
+};
1261
+
1262
+static enum prep_dispatch blk_mq_prep_dispatch_rq(struct request *rq,
1263
+ bool need_budget)
1264
+{
1265
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
1266
+
1267
+ if (need_budget && !blk_mq_get_dispatch_budget(rq->q)) {
1268
+ blk_mq_put_driver_tag(rq);
1269
+ return PREP_DISPATCH_NO_BUDGET;
1270
+ }
1271
+
1272
+ if (!blk_mq_get_driver_tag(rq)) {
1273
+ /*
1274
+ * The initial allocation attempt failed, so we need to
1275
+ * rerun the hardware queue when a tag is freed. The
1276
+ * waitqueue takes care of that. If the queue is run
1277
+ * before we add this entry back on the dispatch list,
1278
+ * we'll re-run it below.
1279
+ */
1280
+ if (!blk_mq_mark_tag_wait(hctx, rq)) {
1281
+ /*
1282
+ * All budgets not got from this function will be put
1283
+ * together during handling partial dispatch
1284
+ */
1285
+ if (need_budget)
1286
+ blk_mq_put_dispatch_budget(rq->q);
1287
+ return PREP_DISPATCH_NO_TAG;
1288
+ }
1289
+ }
1290
+
1291
+ return PREP_DISPATCH_OK;
1292
+}
1293
+
1294
+/* release all allocated budgets before calling to blk_mq_dispatch_rq_list */
1295
+static void blk_mq_release_budgets(struct request_queue *q,
1296
+ unsigned int nr_budgets)
1297
+{
1298
+ int i;
1299
+
1300
+ for (i = 0; i < nr_budgets; i++)
1301
+ blk_mq_put_dispatch_budget(q);
1302
+}
1303
+
11381304 /*
11391305 * Returns true if we did some work AND can potentially do more.
11401306 */
1141
-bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
1142
- bool got_budget)
1307
+bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
1308
+ unsigned int nr_budgets)
11431309 {
1144
- struct blk_mq_hw_ctx *hctx;
1310
+ enum prep_dispatch prep;
1311
+ struct request_queue *q = hctx->queue;
11451312 struct request *rq, *nxt;
1146
- bool no_tag = false;
11471313 int errors, queued;
11481314 blk_status_t ret = BLK_STS_OK;
1315
+ LIST_HEAD(zone_list);
1316
+ bool needs_resource = false;
11491317
11501318 if (list_empty(list))
11511319 return false;
1152
-
1153
- WARN_ON(!list_is_singular(list) && got_budget);
11541320
11551321 /*
11561322 * Now process all the entries, sending them to the driver.
....@@ -1161,29 +1327,10 @@
11611327
11621328 rq = list_first_entry(list, struct request, queuelist);
11631329
1164
- hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu);
1165
- if (!got_budget && !blk_mq_get_dispatch_budget(hctx))
1330
+ WARN_ON_ONCE(hctx != rq->mq_hctx);
1331
+ prep = blk_mq_prep_dispatch_rq(rq, !nr_budgets);
1332
+ if (prep != PREP_DISPATCH_OK)
11661333 break;
1167
-
1168
- if (!blk_mq_get_driver_tag(rq)) {
1169
- /*
1170
- * The initial allocation attempt failed, so we need to
1171
- * rerun the hardware queue when a tag is freed. The
1172
- * waitqueue takes care of that. If the queue is run
1173
- * before we add this entry back on the dispatch list,
1174
- * we'll re-run it below.
1175
- */
1176
- if (!blk_mq_mark_tag_wait(hctx, rq)) {
1177
- blk_mq_put_dispatch_budget(hctx);
1178
- /*
1179
- * For non-shared tags, the RESTART check
1180
- * will suffice.
1181
- */
1182
- if (hctx->flags & BLK_MQ_F_TAG_SHARED)
1183
- no_tag = true;
1184
- break;
1185
- }
1186
- }
11871334
11881335 list_del_init(&rq->queuelist);
11891336
....@@ -1200,32 +1347,63 @@
12001347 bd.last = !blk_mq_get_driver_tag(nxt);
12011348 }
12021349
1350
+ /*
1351
+ * once the request is queued to lld, no need to cover the
1352
+ * budget any more
1353
+ */
1354
+ if (nr_budgets)
1355
+ nr_budgets--;
12031356 ret = q->mq_ops->queue_rq(hctx, &bd);
1204
- if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {
1205
- blk_mq_handle_dev_resource(rq, list);
1357
+ switch (ret) {
1358
+ case BLK_STS_OK:
1359
+ queued++;
12061360 break;
1207
- }
1208
-
1209
- if (unlikely(ret != BLK_STS_OK)) {
1361
+ case BLK_STS_RESOURCE:
1362
+ needs_resource = true;
1363
+ fallthrough;
1364
+ case BLK_STS_DEV_RESOURCE:
1365
+ blk_mq_handle_dev_resource(rq, list);
1366
+ goto out;
1367
+ case BLK_STS_ZONE_RESOURCE:
1368
+ /*
1369
+ * Move the request to zone_list and keep going through
1370
+ * the dispatch list to find more requests the drive can
1371
+ * accept.
1372
+ */
1373
+ blk_mq_handle_zone_resource(rq, &zone_list);
1374
+ needs_resource = true;
1375
+ break;
1376
+ default:
12101377 errors++;
12111378 blk_mq_end_request(rq, BLK_STS_IOERR);
1212
- continue;
12131379 }
1214
-
1215
- queued++;
12161380 } while (!list_empty(list));
1381
+out:
1382
+ if (!list_empty(&zone_list))
1383
+ list_splice_tail_init(&zone_list, list);
12171384
12181385 hctx->dispatched[queued_to_index(queued)]++;
12191386
1387
+ /* If we didn't flush the entire list, we could have told the driver
1388
+ * there was more coming, but that turned out to be a lie.
1389
+ */
1390
+ if ((!list_empty(list) || errors || needs_resource ||
1391
+ ret == BLK_STS_DEV_RESOURCE) && q->mq_ops->commit_rqs && queued)
1392
+ q->mq_ops->commit_rqs(hctx);
12201393 /*
12211394 * Any items that need requeuing? Stuff them into hctx->dispatch,
12221395 * that is where we will continue on next queue run.
12231396 */
12241397 if (!list_empty(list)) {
12251398 bool needs_restart;
1399
+ /* For non-shared tags, the RESTART check will suffice */
1400
+ bool no_tag = prep == PREP_DISPATCH_NO_TAG &&
1401
+ (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED);
1402
+
1403
+ blk_mq_release_budgets(q, nr_budgets);
12261404
12271405 spin_lock(&hctx->lock);
1228
- list_splice_init(list, &hctx->dispatch);
1406
+ list_splice_tail_init(list, &hctx->dispatch);
12291407 spin_unlock(&hctx->lock);
12301408
12311409 /*
....@@ -1259,13 +1437,17 @@
12591437 *
12601438 * If driver returns BLK_STS_RESOURCE and SCHED_RESTART
12611439 * bit is set, run queue after a delay to avoid IO stalls
1262
- * that could otherwise occur if the queue is idle.
1440
+ * that could otherwise occur if the queue is idle. We'll do
1441
+ * similar if we couldn't get budget or couldn't lock a zone
1442
+ * and SCHED_RESTART is set.
12631443 */
12641444 needs_restart = blk_mq_sched_needs_restart(hctx);
1445
+ if (prep == PREP_DISPATCH_NO_BUDGET)
1446
+ needs_resource = true;
12651447 if (!needs_restart ||
12661448 (no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
12671449 blk_mq_run_hw_queue(hctx, true);
1268
- else if (needs_restart && (ret == BLK_STS_RESOURCE))
1450
+ else if (needs_restart && needs_resource)
12691451 blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);
12701452
12711453 blk_mq_update_dispatch_busy(hctx, true);
....@@ -1273,16 +1455,15 @@
12731455 } else
12741456 blk_mq_update_dispatch_busy(hctx, false);
12751457
1276
- /*
1277
- * If the host/device is unable to accept more work, inform the
1278
- * caller of that.
1279
- */
1280
- if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
1281
- return false;
1282
-
12831458 return (queued + errors) != 0;
12841459 }
12851460
1461
+/**
1462
+ * __blk_mq_run_hw_queue - Run a hardware queue.
1463
+ * @hctx: Pointer to the hardware queue to run.
1464
+ *
1465
+ * Send pending requests to the hardware.
1466
+ */
12861467 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
12871468 {
12881469 int srcu_idx;
....@@ -1380,6 +1561,15 @@
13801561 return next_cpu;
13811562 }
13821563
1564
+/**
1565
+ * __blk_mq_delay_run_hw_queue - Run (or schedule to run) a hardware queue.
1566
+ * @hctx: Pointer to the hardware queue to run.
1567
+ * @async: If we want to run the queue asynchronously.
1568
+ * @msecs: Microseconds of delay to wait before running the queue.
1569
+ *
1570
+ * If !@async, try to run the queue now. Else, run the queue asynchronously and
1571
+ * with a delay of @msecs.
1572
+ */
13831573 static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
13841574 unsigned long msecs)
13851575 {
....@@ -1387,27 +1577,43 @@
13871577 return;
13881578
13891579 if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
1390
- int cpu = get_cpu();
1580
+ int cpu = get_cpu_light();
13911581 if (cpumask_test_cpu(cpu, hctx->cpumask)) {
13921582 __blk_mq_run_hw_queue(hctx);
1393
- put_cpu();
1583
+ put_cpu_light();
13941584 return;
13951585 }
13961586
1397
- put_cpu();
1587
+ put_cpu_light();
13981588 }
13991589
14001590 kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work,
14011591 msecs_to_jiffies(msecs));
14021592 }
14031593
1594
+/**
1595
+ * blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously.
1596
+ * @hctx: Pointer to the hardware queue to run.
1597
+ * @msecs: Milliseconds of delay to wait before running the queue.
1598
+ *
1599
+ * Run a hardware queue asynchronously with a delay of @msecs.
1600
+ */
14041601 void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
14051602 {
14061603 __blk_mq_delay_run_hw_queue(hctx, true, msecs);
14071604 }
14081605 EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
14091606
1410
-bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
1607
+/**
1608
+ * blk_mq_run_hw_queue - Start to run a hardware queue.
1609
+ * @hctx: Pointer to the hardware queue to run.
1610
+ * @async: If we want to run the queue asynchronously.
1611
+ *
1612
+ * Check if the request queue is not in a quiesced state and if there are
1613
+ * pending requests to be sent. If this is true, run the queue to send requests
1614
+ * to hardware.
1615
+ */
1616
+void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
14111617 {
14121618 int srcu_idx;
14131619 bool need_run;
....@@ -1425,28 +1631,101 @@
14251631 blk_mq_hctx_has_pending(hctx);
14261632 hctx_unlock(hctx, srcu_idx);
14271633
1428
- if (need_run) {
1634
+ if (need_run)
14291635 __blk_mq_delay_run_hw_queue(hctx, async, 0);
1430
- return true;
1431
- }
1432
-
1433
- return false;
14341636 }
14351637 EXPORT_SYMBOL(blk_mq_run_hw_queue);
14361638
1639
+/*
1640
+ * Is the request queue handled by an IO scheduler that does not respect
1641
+ * hardware queues when dispatching?
1642
+ */
1643
+static bool blk_mq_has_sqsched(struct request_queue *q)
1644
+{
1645
+ struct elevator_queue *e = q->elevator;
1646
+
1647
+ if (e && e->type->ops.dispatch_request &&
1648
+ !(e->type->elevator_features & ELEVATOR_F_MQ_AWARE))
1649
+ return true;
1650
+ return false;
1651
+}
1652
+
1653
+/*
1654
+ * Return preferred queue to dispatch from (if any) for non-mq aware IO
1655
+ * scheduler.
1656
+ */
1657
+static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q)
1658
+{
1659
+ struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
1660
+ /*
1661
+ * If the IO scheduler does not respect hardware queues when
1662
+ * dispatching, we just don't bother with multiple HW queues and
1663
+ * dispatch from hctx for the current CPU since running multiple queues
1664
+ * just causes lock contention inside the scheduler and pointless cache
1665
+ * bouncing.
1666
+ */
1667
+ struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, 0, ctx);
1668
+
1669
+ if (!blk_mq_hctx_stopped(hctx))
1670
+ return hctx;
1671
+ return NULL;
1672
+}
1673
+
1674
+/**
1675
+ * blk_mq_run_hw_queues - Run all hardware queues in a request queue.
1676
+ * @q: Pointer to the request queue to run.
1677
+ * @async: If we want to run the queue asynchronously.
1678
+ */
14371679 void blk_mq_run_hw_queues(struct request_queue *q, bool async)
14381680 {
1439
- struct blk_mq_hw_ctx *hctx;
1681
+ struct blk_mq_hw_ctx *hctx, *sq_hctx;
14401682 int i;
14411683
1684
+ sq_hctx = NULL;
1685
+ if (blk_mq_has_sqsched(q))
1686
+ sq_hctx = blk_mq_get_sq_hctx(q);
14421687 queue_for_each_hw_ctx(q, hctx, i) {
14431688 if (blk_mq_hctx_stopped(hctx))
14441689 continue;
1445
-
1446
- blk_mq_run_hw_queue(hctx, async);
1690
+ /*
1691
+ * Dispatch from this hctx either if there's no hctx preferred
1692
+ * by IO scheduler or if it has requests that bypass the
1693
+ * scheduler.
1694
+ */
1695
+ if (!sq_hctx || sq_hctx == hctx ||
1696
+ !list_empty_careful(&hctx->dispatch))
1697
+ blk_mq_run_hw_queue(hctx, async);
14471698 }
14481699 }
14491700 EXPORT_SYMBOL(blk_mq_run_hw_queues);
1701
+
1702
+/**
1703
+ * blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously.
1704
+ * @q: Pointer to the request queue to run.
1705
+ * @msecs: Milliseconds of delay to wait before running the queues.
1706
+ */
1707
+void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
1708
+{
1709
+ struct blk_mq_hw_ctx *hctx, *sq_hctx;
1710
+ int i;
1711
+
1712
+ sq_hctx = NULL;
1713
+ if (blk_mq_has_sqsched(q))
1714
+ sq_hctx = blk_mq_get_sq_hctx(q);
1715
+ queue_for_each_hw_ctx(q, hctx, i) {
1716
+ if (blk_mq_hctx_stopped(hctx))
1717
+ continue;
1718
+ /*
1719
+ * Dispatch from this hctx either if there's no hctx preferred
1720
+ * by IO scheduler or if it has requests that bypass the
1721
+ * scheduler.
1722
+ */
1723
+ if (!sq_hctx || sq_hctx == hctx ||
1724
+ !list_empty_careful(&hctx->dispatch))
1725
+ blk_mq_delay_run_hw_queue(hctx, msecs);
1726
+ }
1727
+}
1728
+EXPORT_SYMBOL(blk_mq_delay_run_hw_queues);
14501729
14511730 /**
14521731 * blk_mq_queue_stopped() - check whether one or more hctxs have been stopped
....@@ -1551,7 +1830,7 @@
15511830 /*
15521831 * If we are stopped, don't run the queue.
15531832 */
1554
- if (test_bit(BLK_MQ_S_STOPPED, &hctx->state))
1833
+ if (blk_mq_hctx_stopped(hctx))
15551834 return;
15561835
15571836 __blk_mq_run_hw_queue(hctx);
....@@ -1562,15 +1841,16 @@
15621841 bool at_head)
15631842 {
15641843 struct blk_mq_ctx *ctx = rq->mq_ctx;
1844
+ enum hctx_type type = hctx->type;
15651845
15661846 lockdep_assert_held(&ctx->lock);
15671847
15681848 trace_block_rq_insert(hctx->queue, rq);
15691849
15701850 if (at_head)
1571
- list_add(&rq->queuelist, &ctx->rq_list);
1851
+ list_add(&rq->queuelist, &ctx->rq_lists[type]);
15721852 else
1573
- list_add_tail(&rq->queuelist, &ctx->rq_list);
1853
+ list_add_tail(&rq->queuelist, &ctx->rq_lists[type]);
15741854 }
15751855
15761856 void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
....@@ -1584,17 +1864,25 @@
15841864 blk_mq_hctx_mark_pending(hctx, ctx);
15851865 }
15861866
1587
-/*
1867
+/**
1868
+ * blk_mq_request_bypass_insert - Insert a request at dispatch list.
1869
+ * @rq: Pointer to request to be inserted.
1870
+ * @at_head: true if the request should be inserted at the head of the list.
1871
+ * @run_queue: If we should run the hardware queue after inserting the request.
1872
+ *
15881873 * Should only be used carefully, when the caller knows we want to
15891874 * bypass a potential IO scheduler on the target device.
15901875 */
1591
-void blk_mq_request_bypass_insert(struct request *rq, bool run_queue)
1876
+void blk_mq_request_bypass_insert(struct request *rq, bool at_head,
1877
+ bool run_queue)
15921878 {
1593
- struct blk_mq_ctx *ctx = rq->mq_ctx;
1594
- struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
1879
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
15951880
15961881 spin_lock(&hctx->lock);
1597
- list_add_tail(&rq->queuelist, &hctx->dispatch);
1882
+ if (at_head)
1883
+ list_add(&rq->queuelist, &hctx->dispatch);
1884
+ else
1885
+ list_add_tail(&rq->queuelist, &hctx->dispatch);
15981886 spin_unlock(&hctx->lock);
15991887
16001888 if (run_queue)
....@@ -1606,6 +1894,7 @@
16061894
16071895 {
16081896 struct request *rq;
1897
+ enum hctx_type type = hctx->type;
16091898
16101899 /*
16111900 * preemption doesn't flush plug list, so it's possible ctx->cpu is
....@@ -1617,95 +1906,87 @@
16171906 }
16181907
16191908 spin_lock(&ctx->lock);
1620
- list_splice_tail_init(list, &ctx->rq_list);
1909
+ list_splice_tail_init(list, &ctx->rq_lists[type]);
16211910 blk_mq_hctx_mark_pending(hctx, ctx);
16221911 spin_unlock(&ctx->lock);
16231912 }
16241913
1625
-static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
1914
+static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
16261915 {
16271916 struct request *rqa = container_of(a, struct request, queuelist);
16281917 struct request *rqb = container_of(b, struct request, queuelist);
16291918
1630
- return !(rqa->mq_ctx < rqb->mq_ctx ||
1631
- (rqa->mq_ctx == rqb->mq_ctx &&
1632
- blk_rq_pos(rqa) < blk_rq_pos(rqb)));
1919
+ if (rqa->mq_ctx != rqb->mq_ctx)
1920
+ return rqa->mq_ctx > rqb->mq_ctx;
1921
+ if (rqa->mq_hctx != rqb->mq_hctx)
1922
+ return rqa->mq_hctx > rqb->mq_hctx;
1923
+
1924
+ return blk_rq_pos(rqa) > blk_rq_pos(rqb);
16331925 }
16341926
16351927 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
16361928 {
1637
- struct blk_mq_ctx *this_ctx;
1638
- struct request_queue *this_q;
1639
- struct request *rq;
16401929 LIST_HEAD(list);
1641
- LIST_HEAD(ctx_list);
1642
- unsigned int depth;
16431930
1931
+ if (list_empty(&plug->mq_list))
1932
+ return;
16441933 list_splice_init(&plug->mq_list, &list);
16451934
1646
- list_sort(NULL, &list, plug_ctx_cmp);
1935
+ if (plug->rq_count > 2 && plug->multiple_queues)
1936
+ list_sort(NULL, &list, plug_rq_cmp);
16471937
1648
- this_q = NULL;
1649
- this_ctx = NULL;
1650
- depth = 0;
1938
+ plug->rq_count = 0;
16511939
1652
- while (!list_empty(&list)) {
1653
- rq = list_entry_rq(list.next);
1654
- list_del_init(&rq->queuelist);
1655
- BUG_ON(!rq->q);
1656
- if (rq->mq_ctx != this_ctx) {
1657
- if (this_ctx) {
1658
- trace_block_unplug(this_q, depth, !from_schedule);
1659
- blk_mq_sched_insert_requests(this_q, this_ctx,
1660
- &ctx_list,
1661
- from_schedule);
1662
- }
1940
+ do {
1941
+ struct list_head rq_list;
1942
+ struct request *rq, *head_rq = list_entry_rq(list.next);
1943
+ struct list_head *pos = &head_rq->queuelist; /* skip first */
1944
+ struct blk_mq_hw_ctx *this_hctx = head_rq->mq_hctx;
1945
+ struct blk_mq_ctx *this_ctx = head_rq->mq_ctx;
1946
+ unsigned int depth = 1;
16631947
1664
- this_ctx = rq->mq_ctx;
1665
- this_q = rq->q;
1666
- depth = 0;
1948
+ list_for_each_continue(pos, &list) {
1949
+ rq = list_entry_rq(pos);
1950
+ BUG_ON(!rq->q);
1951
+ if (rq->mq_hctx != this_hctx || rq->mq_ctx != this_ctx)
1952
+ break;
1953
+ depth++;
16671954 }
16681955
1669
- depth++;
1670
- list_add_tail(&rq->queuelist, &ctx_list);
1671
- }
1672
-
1673
- /*
1674
- * If 'this_ctx' is set, we know we have entries to complete
1675
- * on 'ctx_list'. Do those.
1676
- */
1677
- if (this_ctx) {
1678
- trace_block_unplug(this_q, depth, !from_schedule);
1679
- blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list,
1956
+ list_cut_before(&rq_list, &list, pos);
1957
+ trace_block_unplug(head_rq->q, depth, !from_schedule);
1958
+ blk_mq_sched_insert_requests(this_hctx, this_ctx, &rq_list,
16801959 from_schedule);
1681
- }
1960
+ } while(!list_empty(&list));
16821961 }
16831962
1684
-static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
1963
+static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
1964
+ unsigned int nr_segs)
16851965 {
1686
- blk_init_request_from_bio(rq, bio);
1966
+ int err;
16871967
1688
- blk_rq_set_rl(rq, blk_get_rl(rq->q, bio));
1968
+ if (bio->bi_opf & REQ_RAHEAD)
1969
+ rq->cmd_flags |= REQ_FAILFAST_MASK;
16891970
1690
- blk_account_io_start(rq, true);
1691
-}
1971
+ rq->__sector = bio->bi_iter.bi_sector;
1972
+ rq->write_hint = bio->bi_write_hint;
1973
+ blk_rq_bio_prep(rq, bio, nr_segs);
16921974
1693
-static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq)
1694
-{
1695
- if (rq->tag != -1)
1696
- return blk_tag_to_qc_t(rq->tag, hctx->queue_num, false);
1975
+ /* This can't fail, since GFP_NOIO includes __GFP_DIRECT_RECLAIM. */
1976
+ err = blk_crypto_rq_bio_prep(rq, bio, GFP_NOIO);
1977
+ WARN_ON_ONCE(err);
16971978
1698
- return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true);
1979
+ blk_account_io_start(rq);
16991980 }
17001981
17011982 static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
17021983 struct request *rq,
1703
- blk_qc_t *cookie)
1984
+ blk_qc_t *cookie, bool last)
17041985 {
17051986 struct request_queue *q = rq->q;
17061987 struct blk_mq_queue_data bd = {
17071988 .rq = rq,
1708
- .last = true,
1989
+ .last = last,
17091990 };
17101991 blk_qc_t new_cookie;
17111992 blk_status_t ret;
....@@ -1740,7 +2021,7 @@
17402021 static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
17412022 struct request *rq,
17422023 blk_qc_t *cookie,
1743
- bool bypass_insert)
2024
+ bool bypass_insert, bool last)
17442025 {
17452026 struct request_queue *q = rq->q;
17462027 bool run_queue = true;
....@@ -1761,23 +2042,35 @@
17612042 if (q->elevator && !bypass_insert)
17622043 goto insert;
17632044
1764
- if (!blk_mq_get_dispatch_budget(hctx))
2045
+ if (!blk_mq_get_dispatch_budget(q))
17652046 goto insert;
17662047
17672048 if (!blk_mq_get_driver_tag(rq)) {
1768
- blk_mq_put_dispatch_budget(hctx);
2049
+ blk_mq_put_dispatch_budget(q);
17692050 goto insert;
17702051 }
17712052
1772
- return __blk_mq_issue_directly(hctx, rq, cookie);
2053
+ return __blk_mq_issue_directly(hctx, rq, cookie, last);
17732054 insert:
17742055 if (bypass_insert)
17752056 return BLK_STS_RESOURCE;
17762057
1777
- blk_mq_request_bypass_insert(rq, run_queue);
2058
+ blk_mq_sched_insert_request(rq, false, run_queue, false);
2059
+
17782060 return BLK_STS_OK;
17792061 }
17802062
2063
+/**
2064
+ * blk_mq_try_issue_directly - Try to send a request directly to device driver.
2065
+ * @hctx: Pointer of the associated hardware queue.
2066
+ * @rq: Pointer to request to be sent.
2067
+ * @cookie: Request queue cookie.
2068
+ *
2069
+ * If the device has enough resources to accept a new request now, send the
2070
+ * request directly to the device driver. Else, insert it at the hctx->dispatch
2071
+ * queue, so we can try to send it again later. Requests inserted at this
2072
+ * queue have higher priority.
2073
+ */
17812074 static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
17822075 struct request *rq, blk_qc_t *cookie)
17832076 {
....@@ -1788,25 +2081,24 @@
17882081
17892082 hctx_lock(hctx, &srcu_idx);
17902083
1791
- ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false);
2084
+ ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false, true);
17922085 if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
1793
- blk_mq_request_bypass_insert(rq, true);
2086
+ blk_mq_request_bypass_insert(rq, false, true);
17942087 else if (ret != BLK_STS_OK)
17952088 blk_mq_end_request(rq, ret);
17962089
17972090 hctx_unlock(hctx, srcu_idx);
17982091 }
17992092
1800
-blk_status_t blk_mq_request_issue_directly(struct request *rq)
2093
+blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
18012094 {
18022095 blk_status_t ret;
18032096 int srcu_idx;
18042097 blk_qc_t unused_cookie;
1805
- struct blk_mq_ctx *ctx = rq->mq_ctx;
1806
- struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
2098
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
18072099
18082100 hctx_lock(hctx, &srcu_idx);
1809
- ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true);
2101
+ ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true, last);
18102102 hctx_unlock(hctx, srcu_idx);
18112103
18122104 return ret;
....@@ -1815,104 +2107,169 @@
18152107 void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
18162108 struct list_head *list)
18172109 {
2110
+ int queued = 0;
2111
+ int errors = 0;
2112
+
18182113 while (!list_empty(list)) {
18192114 blk_status_t ret;
18202115 struct request *rq = list_first_entry(list, struct request,
18212116 queuelist);
18222117
18232118 list_del_init(&rq->queuelist);
1824
- ret = blk_mq_request_issue_directly(rq);
2119
+ ret = blk_mq_request_issue_directly(rq, list_empty(list));
18252120 if (ret != BLK_STS_OK) {
2121
+ errors++;
18262122 if (ret == BLK_STS_RESOURCE ||
18272123 ret == BLK_STS_DEV_RESOURCE) {
1828
- blk_mq_request_bypass_insert(rq,
2124
+ blk_mq_request_bypass_insert(rq, false,
18292125 list_empty(list));
18302126 break;
18312127 }
18322128 blk_mq_end_request(rq, ret);
1833
- }
2129
+ } else
2130
+ queued++;
2131
+ }
2132
+
2133
+ /*
2134
+ * If we didn't flush the entire list, we could have told
2135
+ * the driver there was more coming, but that turned out to
2136
+ * be a lie.
2137
+ */
2138
+ if ((!list_empty(list) || errors) &&
2139
+ hctx->queue->mq_ops->commit_rqs && queued)
2140
+ hctx->queue->mq_ops->commit_rqs(hctx);
2141
+}
2142
+
2143
+static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
2144
+{
2145
+ list_add_tail(&rq->queuelist, &plug->mq_list);
2146
+ plug->rq_count++;
2147
+ if (!plug->multiple_queues && !list_is_singular(&plug->mq_list)) {
2148
+ struct request *tmp;
2149
+
2150
+ tmp = list_first_entry(&plug->mq_list, struct request,
2151
+ queuelist);
2152
+ if (tmp->q != rq->q)
2153
+ plug->multiple_queues = true;
18342154 }
18352155 }
18362156
1837
-static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
2157
+/*
2158
+ * Allow 2x BLK_MAX_REQUEST_COUNT requests on the plug queue for multiple
2159
+ * queues. This is important for md arrays to benefit from merging
2160
+ * requests.
2161
+ */
2162
+static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug)
18382163 {
2164
+ if (plug->multiple_queues)
2165
+ return BLK_MAX_REQUEST_COUNT * 2;
2166
+ return BLK_MAX_REQUEST_COUNT;
2167
+}
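/*
 * Editor's illustrative sketch (not part of the patch above): how a submitter
 * benefits from the plug limits described here. Requests queued while a plug
 * is active are collected on plug->mq_list and flushed as one batch (capped
 * by blk_plug_max_rq_count()) when the plug is finished. The "example_" names
 * below are hypothetical.
 */
static void example_submit_batch(struct bio **bios, int nr)
{
	struct blk_plug plug;
	int i;

	blk_start_plug(&plug);
	for (i = 0; i < nr; i++)
		submit_bio(bios[i]);	/* each bio ends up in blk_mq_submit_bio() */
	blk_finish_plug(&plug);		/* flushes the plugged requests to the driver */
}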
2168
+
2169
+/**
2170
+ * blk_mq_submit_bio - Create and send a request to the block device.
2171
+ * @bio: Bio pointer.
2172
+ *
2173
+ * Builds up a request structure from @q and @bio and sends it to the device.
2174
+ * The request may not be queued directly to hardware if:
2175
+ * * This request can be merged with another one
2176
+ * * We want to place the request on the plug queue for possible future merging
2177
+ * * There is an IO scheduler active on this queue
2178
+ *
2179
+ * It will not queue the request if there is an error with the bio or at
2180
+ * request creation.
2181
+ *
2182
+ * Returns: Request queue cookie.
2183
+ */
2184
+blk_qc_t blk_mq_submit_bio(struct bio *bio)
2185
+{
2186
+ struct request_queue *q = bio->bi_disk->queue;
18392187 const int is_sync = op_is_sync(bio->bi_opf);
18402188 const int is_flush_fua = op_is_flush(bio->bi_opf);
1841
- struct blk_mq_alloc_data data = { .flags = 0 };
2189
+ struct blk_mq_alloc_data data = {
2190
+ .q = q,
2191
+ };
18422192 struct request *rq;
1843
- unsigned int request_count = 0;
18442193 struct blk_plug *plug;
18452194 struct request *same_queue_rq = NULL;
2195
+ unsigned int nr_segs;
18462196 blk_qc_t cookie;
2197
+ blk_status_t ret;
18472198
18482199 blk_queue_bounce(q, &bio);
1849
-
1850
- blk_queue_split(q, &bio);
2200
+ __blk_queue_split(&bio, &nr_segs);
18512201
18522202 if (!bio_integrity_prep(bio))
1853
- return BLK_QC_T_NONE;
2203
+ goto queue_exit;
18542204
18552205 if (!is_flush_fua && !blk_queue_nomerges(q) &&
1856
- blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
1857
- return BLK_QC_T_NONE;
2206
+ blk_attempt_plug_merge(q, bio, nr_segs, &same_queue_rq))
2207
+ goto queue_exit;
18582208
1859
- if (blk_mq_sched_bio_merge(q, bio))
1860
- return BLK_QC_T_NONE;
2209
+ if (blk_mq_sched_bio_merge(q, bio, nr_segs))
2210
+ goto queue_exit;
18612211
1862
- rq_qos_throttle(q, bio, NULL);
2212
+ rq_qos_throttle(q, bio);
18632213
1864
- trace_block_getrq(q, bio, bio->bi_opf);
1865
-
1866
- rq = blk_mq_get_request(q, bio, bio->bi_opf, &data);
2214
+ data.cmd_flags = bio->bi_opf;
2215
+ rq = __blk_mq_alloc_request(&data);
18672216 if (unlikely(!rq)) {
18682217 rq_qos_cleanup(q, bio);
18692218 if (bio->bi_opf & REQ_NOWAIT)
18702219 bio_wouldblock_error(bio);
1871
- return BLK_QC_T_NONE;
2220
+ goto queue_exit;
18722221 }
2222
+
2223
+ trace_block_getrq(q, bio, bio->bi_opf);
18732224
18742225 rq_qos_track(q, rq, bio);
18752226
18762227 cookie = request_to_qc_t(data.hctx, rq);
18772228
1878
- plug = current->plug;
1879
- if (unlikely(is_flush_fua)) {
1880
- blk_mq_put_ctx(data.ctx);
1881
- blk_mq_bio_to_request(rq, bio);
2229
+ blk_mq_bio_to_request(rq, bio, nr_segs);
18822230
1883
- /* bypass scheduler for flush rq */
2231
+ ret = blk_crypto_init_request(rq);
2232
+ if (ret != BLK_STS_OK) {
2233
+ bio->bi_status = ret;
2234
+ bio_endio(bio);
2235
+ blk_mq_free_request(rq);
2236
+ return BLK_QC_T_NONE;
2237
+ }
2238
+
2239
+ plug = blk_mq_plug(q, bio);
2240
+ if (unlikely(is_flush_fua)) {
2241
+ /* Bypass scheduler for flush requests */
18842242 blk_insert_flush(rq);
18852243 blk_mq_run_hw_queue(data.hctx, true);
1886
- } else if (plug && q->nr_hw_queues == 1) {
1887
- struct request *last = NULL;
1888
-
1889
- blk_mq_put_ctx(data.ctx);
1890
- blk_mq_bio_to_request(rq, bio);
1891
-
2244
+ } else if (plug && (q->nr_hw_queues == 1 ||
2245
+ blk_mq_is_sbitmap_shared(rq->mq_hctx->flags) ||
2246
+ q->mq_ops->commit_rqs || !blk_queue_nonrot(q))) {
18922247 /*
1893
- * @request_count may become stale because of schedule
1894
- * out, so check the list again.
2248
+ * Use plugging if we have a ->commit_rqs() hook as well, as
2249
+ * we know the driver uses bd->last in a smart fashion.
2250
+ *
2251
+ * Use normal plugging if this disk is a slow HDD, as sequential
2252
+ * IO may benefit a lot from plug merging.
18952253 */
1896
- if (list_empty(&plug->mq_list))
1897
- request_count = 0;
1898
- else if (blk_queue_nomerges(q))
1899
- request_count = blk_plug_queued_count(q);
2254
+ unsigned int request_count = plug->rq_count;
2255
+ struct request *last = NULL;
19002256
19012257 if (!request_count)
19022258 trace_block_plug(q);
19032259 else
19042260 last = list_entry_rq(plug->mq_list.prev);
19052261
1906
- if (request_count >= BLK_MAX_REQUEST_COUNT || (last &&
2262
+ if (request_count >= blk_plug_max_rq_count(plug) || (last &&
19072263 blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
19082264 blk_flush_plug_list(plug, false);
19092265 trace_block_plug(q);
19102266 }
19112267
1912
- list_add_tail(&rq->queuelist, &plug->mq_list);
2268
+ blk_add_rq_to_plug(plug, rq);
2269
+ } else if (q->elevator) {
2270
+ /* Insert the request at the IO scheduler queue */
2271
+ blk_mq_sched_insert_request(rq, false, true, true);
19132272 } else if (plug && !blk_queue_nomerges(q)) {
1914
- blk_mq_bio_to_request(rq, bio);
1915
-
19162273 /*
19172274 * We do limited plugging. If the bio can be merged, do that.
19182275 * Otherwise the existing request in the plug list will be
....@@ -1922,30 +2279,74 @@
19222279 */
19232280 if (list_empty(&plug->mq_list))
19242281 same_queue_rq = NULL;
1925
- if (same_queue_rq)
2282
+ if (same_queue_rq) {
19262283 list_del_init(&same_queue_rq->queuelist);
1927
- list_add_tail(&rq->queuelist, &plug->mq_list);
1928
-
1929
- blk_mq_put_ctx(data.ctx);
2284
+ plug->rq_count--;
2285
+ }
2286
+ blk_add_rq_to_plug(plug, rq);
2287
+ trace_block_plug(q);
19302288
19312289 if (same_queue_rq) {
1932
- data.hctx = blk_mq_map_queue(q,
1933
- same_queue_rq->mq_ctx->cpu);
2290
+ data.hctx = same_queue_rq->mq_hctx;
2291
+ trace_block_unplug(q, 1, true);
19342292 blk_mq_try_issue_directly(data.hctx, same_queue_rq,
19352293 &cookie);
19362294 }
1937
- } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator &&
1938
- !data.hctx->dispatch_busy)) {
1939
- blk_mq_put_ctx(data.ctx);
1940
- blk_mq_bio_to_request(rq, bio);
2295
+ } else if ((q->nr_hw_queues > 1 && is_sync) ||
2296
+ !data.hctx->dispatch_busy) {
2297
+ /*
2298
+ * There is no scheduler and we can try to send directly
2299
+ * to the hardware.
2300
+ */
19412301 blk_mq_try_issue_directly(data.hctx, rq, &cookie);
19422302 } else {
1943
- blk_mq_put_ctx(data.ctx);
1944
- blk_mq_bio_to_request(rq, bio);
2303
+ /* Default case. */
19452304 blk_mq_sched_insert_request(rq, false, true, true);
19462305 }
19472306
19482307 return cookie;
2308
+queue_exit:
2309
+ blk_queue_exit(q);
2310
+ return BLK_QC_T_NONE;
2311
+}
2312
+
2313
+static size_t order_to_size(unsigned int order)
2314
+{
2315
+ return (size_t)PAGE_SIZE << order;
2316
+}
2317
+
2318
+/* called before freeing request pool in @tags */
2319
+static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set,
2320
+ struct blk_mq_tags *tags, unsigned int hctx_idx)
2321
+{
2322
+ struct blk_mq_tags *drv_tags = set->tags[hctx_idx];
2323
+ struct page *page;
2324
+ unsigned long flags;
2325
+
2326
+ list_for_each_entry(page, &tags->page_list, lru) {
2327
+ unsigned long start = (unsigned long)page_address(page);
2328
+ unsigned long end = start + order_to_size(page->private);
2329
+ int i;
2330
+
2331
+ for (i = 0; i < set->queue_depth; i++) {
2332
+ struct request *rq = drv_tags->rqs[i];
2333
+ unsigned long rq_addr = (unsigned long)rq;
2334
+
2335
+ if (rq_addr >= start && rq_addr < end) {
2336
+ WARN_ON_ONCE(refcount_read(&rq->ref) != 0);
2337
+ cmpxchg(&drv_tags->rqs[i], rq, NULL);
2338
+ }
2339
+ }
2340
+ }
2341
+
2342
+ /*
2343
+ * Wait until all pending iterations are done.
2344
+ *
2345
+ * The request references have been cleared, and the clearing is guaranteed
2346
+ * to be observed after the ->lock is released.
2347
+ */
2348
+ spin_lock_irqsave(&drv_tags->lock, flags);
2349
+ spin_unlock_irqrestore(&drv_tags->lock, flags);
19492350 }
19502351
19512352 void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
....@@ -1966,42 +2367,44 @@
19662367 }
19672368 }
19682369
2370
+ blk_mq_clear_rq_mapping(set, tags, hctx_idx);
2371
+
19692372 while (!list_empty(&tags->page_list)) {
19702373 page = list_first_entry(&tags->page_list, struct page, lru);
19712374 list_del_init(&page->lru);
19722375 /*
19732376 * Remove kmemleak object previously allocated in
1974
- * blk_mq_init_rq_map().
2377
+ * blk_mq_alloc_rqs().
19752378 */
19762379 kmemleak_free(page_address(page));
19772380 __free_pages(page, page->private);
19782381 }
19792382 }
19802383
1981
-void blk_mq_free_rq_map(struct blk_mq_tags *tags)
2384
+void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags)
19822385 {
19832386 kfree(tags->rqs);
19842387 tags->rqs = NULL;
19852388 kfree(tags->static_rqs);
19862389 tags->static_rqs = NULL;
19872390
1988
- blk_mq_free_tags(tags);
2391
+ blk_mq_free_tags(tags, flags);
19892392 }
19902393
19912394 struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
19922395 unsigned int hctx_idx,
19932396 unsigned int nr_tags,
1994
- unsigned int reserved_tags)
2397
+ unsigned int reserved_tags,
2398
+ unsigned int flags)
19952399 {
19962400 struct blk_mq_tags *tags;
19972401 int node;
19982402
1999
- node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx);
2403
+ node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
20002404 if (node == NUMA_NO_NODE)
20012405 node = set->numa_node;
20022406
2003
- tags = blk_mq_init_tags(nr_tags, reserved_tags, node,
2004
- BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
2407
+ tags = blk_mq_init_tags(nr_tags, reserved_tags, node, flags);
20052408 if (!tags)
20062409 return NULL;
20072410
....@@ -2009,7 +2412,7 @@
20092412 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
20102413 node);
20112414 if (!tags->rqs) {
2012
- blk_mq_free_tags(tags);
2415
+ blk_mq_free_tags(tags, flags);
20132416 return NULL;
20142417 }
20152418
....@@ -2018,16 +2421,11 @@
20182421 node);
20192422 if (!tags->static_rqs) {
20202423 kfree(tags->rqs);
2021
- blk_mq_free_tags(tags);
2424
+ blk_mq_free_tags(tags, flags);
20222425 return NULL;
20232426 }
20242427
20252428 return tags;
2026
-}
2027
-
2028
-static size_t order_to_size(unsigned int order)
2029
-{
2030
- return (size_t)PAGE_SIZE << order;
20312429 }
20322430
20332431 static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
....@@ -2052,7 +2450,7 @@
20522450 size_t rq_size, left;
20532451 int node;
20542452
2055
- node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx);
2453
+ node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
20562454 if (node == NUMA_NO_NODE)
20572455 node = set->numa_node;
20582456
....@@ -2064,6 +2462,7 @@
20642462 */
20652463 rq_size = round_up(sizeof(struct request) + set->cmd_size,
20662464 cache_line_size());
2465
+ trace_android_vh_blk_alloc_rqs(&rq_size, set, tags);
20672466 left = rq_size * depth;
20682467
20692468 for (i = 0; i < depth; ) {
....@@ -2122,6 +2521,86 @@
21222521 return -ENOMEM;
21232522 }
21242523
2524
+struct rq_iter_data {
2525
+ struct blk_mq_hw_ctx *hctx;
2526
+ bool has_rq;
2527
+};
2528
+
2529
+static bool blk_mq_has_request(struct request *rq, void *data, bool reserved)
2530
+{
2531
+ struct rq_iter_data *iter_data = data;
2532
+
2533
+ if (rq->mq_hctx != iter_data->hctx)
2534
+ return true;
2535
+ iter_data->has_rq = true;
2536
+ return false;
2537
+}
2538
+
2539
+static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx)
2540
+{
2541
+ struct blk_mq_tags *tags = hctx->sched_tags ?
2542
+ hctx->sched_tags : hctx->tags;
2543
+ struct rq_iter_data data = {
2544
+ .hctx = hctx,
2545
+ };
2546
+
2547
+ blk_mq_all_tag_iter(tags, blk_mq_has_request, &data);
2548
+ return data.has_rq;
2549
+}
2550
+
2551
+static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu,
2552
+ struct blk_mq_hw_ctx *hctx)
2553
+{
2554
+ if (cpumask_next_and(-1, hctx->cpumask, cpu_online_mask) != cpu)
2555
+ return false;
2556
+ if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids)
2557
+ return false;
2558
+ return true;
2559
+}
2560
+
2561
+static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
2562
+{
2563
+ struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
2564
+ struct blk_mq_hw_ctx, cpuhp_online);
2565
+
2566
+ if (!cpumask_test_cpu(cpu, hctx->cpumask) ||
2567
+ !blk_mq_last_cpu_in_hctx(cpu, hctx))
2568
+ return 0;
2569
+
2570
+ /*
2571
+ * Prevent new requests from being allocated on the current hctx.
2572
+ *
2573
+ * The smp_mb__after_atomic() pairs with the implied barrier in
2574
+ * test_and_set_bit_lock() in sbitmap_get(). It ensures the inactive flag
2575
+ * is seen once we return from the tag allocator.
2576
+ */
2577
+ set_bit(BLK_MQ_S_INACTIVE, &hctx->state);
2578
+ smp_mb__after_atomic();
2579
+
2580
+ /*
2581
+ * Try to grab a reference to the queue and wait for any outstanding
2582
+ * requests. If we could not grab a reference, the queue has been
2583
+ * frozen and there are no requests.
2584
+ */
2585
+ if (percpu_ref_tryget(&hctx->queue->q_usage_counter)) {
2586
+ while (blk_mq_hctx_has_requests(hctx))
2587
+ msleep(5);
2588
+ percpu_ref_put(&hctx->queue->q_usage_counter);
2589
+ }
2590
+
2591
+ return 0;
2592
+}
2593
+
2594
+static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
2595
+{
2596
+ struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
2597
+ struct blk_mq_hw_ctx, cpuhp_online);
2598
+
2599
+ if (cpumask_test_cpu(cpu, hctx->cpumask))
2600
+ clear_bit(BLK_MQ_S_INACTIVE, &hctx->state);
2601
+ return 0;
2602
+}
2603
+
21252604 /*
21262605 * 'cpu' is going away. splice any existing rq_list entries from this
21272606 * software queue to the hw queue dispatch list, and ensure that it
....@@ -2132,13 +2611,18 @@
21322611 struct blk_mq_hw_ctx *hctx;
21332612 struct blk_mq_ctx *ctx;
21342613 LIST_HEAD(tmp);
2614
+ enum hctx_type type;
21352615
21362616 hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
2617
+ if (!cpumask_test_cpu(cpu, hctx->cpumask))
2618
+ return 0;
2619
+
21372620 ctx = __blk_mq_get_ctx(hctx->queue, cpu);
2621
+ type = hctx->type;
21382622
21392623 spin_lock(&ctx->lock);
2140
- if (!list_empty(&ctx->rq_list)) {
2141
- list_splice_init(&ctx->rq_list, &tmp);
2624
+ if (!list_empty(&ctx->rq_lists[type])) {
2625
+ list_splice_init(&ctx->rq_lists[type], &tmp);
21422626 blk_mq_hctx_clear_pending(hctx, ctx);
21432627 }
21442628 spin_unlock(&ctx->lock);
....@@ -2156,8 +2640,40 @@
21562640
21572641 static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
21582642 {
2643
+ if (!(hctx->flags & BLK_MQ_F_STACKING))
2644
+ cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
2645
+ &hctx->cpuhp_online);
21592646 cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD,
21602647 &hctx->cpuhp_dead);
2648
+}
2649
+
2650
+/*
2651
+ * Before freeing the hw queue, clear the flush request reference in
2652
+ * tags->rqs[] to avoid a potential use-after-free.
2653
+ */
2654
+static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags,
2655
+ unsigned int queue_depth, struct request *flush_rq)
2656
+{
2657
+ int i;
2658
+ unsigned long flags;
2659
+
2660
+ /* The hw queue may not be mapped yet */
2661
+ if (!tags)
2662
+ return;
2663
+
2664
+ WARN_ON_ONCE(refcount_read(&flush_rq->ref) != 0);
2665
+
2666
+ for (i = 0; i < queue_depth; i++)
2667
+ cmpxchg(&tags->rqs[i], flush_rq, NULL);
2668
+
2669
+ /*
2670
+ * Wait until all pending iterations are done.
2671
+ *
2672
+ * The request reference has been cleared, and the clearing is guaranteed
2673
+ * to be observed after the ->lock is released.
2674
+ */
2675
+ spin_lock_irqsave(&tags->lock, flags);
2676
+ spin_unlock_irqrestore(&tags->lock, flags);
21612677 }
21622678
21632679 /* hctx->ctxs will be freed in queue's release handler */
....@@ -2165,18 +2681,24 @@
21652681 struct blk_mq_tag_set *set,
21662682 struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
21672683 {
2168
- blk_mq_debugfs_unregister_hctx(hctx);
2684
+ struct request *flush_rq = hctx->fq->flush_rq;
21692685
21702686 if (blk_mq_hw_queue_mapped(hctx))
21712687 blk_mq_tag_idle(hctx);
21722688
2689
+ blk_mq_clear_flush_rq_mapping(set->tags[hctx_idx],
2690
+ set->queue_depth, flush_rq);
21732691 if (set->ops->exit_request)
2174
- set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx);
2692
+ set->ops->exit_request(set, flush_rq, hctx_idx);
21752693
21762694 if (set->ops->exit_hctx)
21772695 set->ops->exit_hctx(hctx, hctx_idx);
21782696
21792697 blk_mq_remove_cpuhp(hctx);
2698
+
2699
+ spin_lock(&q->unused_hctx_lock);
2700
+ list_add(&hctx->hctx_list, &q->unused_hctx_list);
2701
+ spin_unlock(&q->unused_hctx_lock);
21802702 }
21812703
21822704 static void blk_mq_exit_hw_queues(struct request_queue *q,
....@@ -2188,112 +2710,160 @@
21882710 queue_for_each_hw_ctx(q, hctx, i) {
21892711 if (i == nr_queue)
21902712 break;
2713
+ blk_mq_debugfs_unregister_hctx(hctx);
21912714 blk_mq_exit_hctx(q, set, hctx, i);
21922715 }
2716
+}
2717
+
2718
+static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
2719
+{
2720
+ int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
2721
+
2722
+ BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu),
2723
+ __alignof__(struct blk_mq_hw_ctx)) !=
2724
+ sizeof(struct blk_mq_hw_ctx));
2725
+
2726
+ if (tag_set->flags & BLK_MQ_F_BLOCKING)
2727
+ hw_ctx_size += sizeof(struct srcu_struct);
2728
+
2729
+ return hw_ctx_size;
21932730 }
21942731
21952732 static int blk_mq_init_hctx(struct request_queue *q,
21962733 struct blk_mq_tag_set *set,
21972734 struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
21982735 {
2199
- int node;
2736
+ hctx->queue_num = hctx_idx;
22002737
2201
- node = hctx->numa_node;
2738
+ if (!(hctx->flags & BLK_MQ_F_STACKING))
2739
+ cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
2740
+ &hctx->cpuhp_online);
2741
+ cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
2742
+
2743
+ hctx->tags = set->tags[hctx_idx];
2744
+
2745
+ if (set->ops->init_hctx &&
2746
+ set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
2747
+ goto unregister_cpu_notifier;
2748
+
2749
+ if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx,
2750
+ hctx->numa_node))
2751
+ goto exit_hctx;
2752
+ return 0;
2753
+
2754
+ exit_hctx:
2755
+ if (set->ops->exit_hctx)
2756
+ set->ops->exit_hctx(hctx, hctx_idx);
2757
+ unregister_cpu_notifier:
2758
+ blk_mq_remove_cpuhp(hctx);
2759
+ return -1;
2760
+}
2761
+
2762
+static struct blk_mq_hw_ctx *
2763
+blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
2764
+ int node)
2765
+{
2766
+ struct blk_mq_hw_ctx *hctx;
2767
+ gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;
2768
+
2769
+ hctx = kzalloc_node(blk_mq_hw_ctx_size(set), gfp, node);
2770
+ if (!hctx)
2771
+ goto fail_alloc_hctx;
2772
+
2773
+ if (!zalloc_cpumask_var_node(&hctx->cpumask, gfp, node))
2774
+ goto free_hctx;
2775
+
2776
+ atomic_set(&hctx->nr_active, 0);
22022777 if (node == NUMA_NO_NODE)
2203
- node = hctx->numa_node = set->numa_node;
2778
+ node = set->numa_node;
2779
+ hctx->numa_node = node;
22042780
22052781 INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
22062782 spin_lock_init(&hctx->lock);
22072783 INIT_LIST_HEAD(&hctx->dispatch);
22082784 hctx->queue = q;
2209
- hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED;
2785
+ hctx->flags = set->flags & ~BLK_MQ_F_TAG_QUEUE_SHARED;
22102786
2211
- cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
2212
-
2213
- hctx->tags = set->tags[hctx_idx];
2787
+ INIT_LIST_HEAD(&hctx->hctx_list);
22142788
22152789 /*
22162790 * Allocate space for all possible cpus to avoid allocation at
22172791 * runtime
22182792 */
22192793 hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *),
2220
- GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node);
2794
+ gfp, node);
22212795 if (!hctx->ctxs)
2222
- goto unregister_cpu_notifier;
2796
+ goto free_cpumask;
22232797
22242798 if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8),
2225
- GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node))
2799
+ gfp, node))
22262800 goto free_ctxs;
2227
-
22282801 hctx->nr_ctx = 0;
22292802
22302803 spin_lock_init(&hctx->dispatch_wait_lock);
22312804 init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
22322805 INIT_LIST_HEAD(&hctx->dispatch_wait.entry);
22332806
2234
- if (set->ops->init_hctx &&
2235
- set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
2236
- goto free_bitmap;
2237
-
2238
- hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size,
2239
- GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY);
2807
+ hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp);
22402808 if (!hctx->fq)
2241
- goto exit_hctx;
2242
-
2243
- if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, node))
2244
- goto free_fq;
2809
+ goto free_bitmap;
22452810
22462811 if (hctx->flags & BLK_MQ_F_BLOCKING)
22472812 init_srcu_struct(hctx->srcu);
2813
+ blk_mq_hctx_kobj_init(hctx);
22482814
2249
- blk_mq_debugfs_register_hctx(q, hctx);
2815
+ return hctx;
22502816
2251
- return 0;
2252
-
2253
- free_fq:
2254
- blk_free_flush_queue(hctx->fq);
2255
- exit_hctx:
2256
- if (set->ops->exit_hctx)
2257
- set->ops->exit_hctx(hctx, hctx_idx);
22582817 free_bitmap:
22592818 sbitmap_free(&hctx->ctx_map);
22602819 free_ctxs:
22612820 kfree(hctx->ctxs);
2262
- unregister_cpu_notifier:
2263
- blk_mq_remove_cpuhp(hctx);
2264
- return -1;
2821
+ free_cpumask:
2822
+ free_cpumask_var(hctx->cpumask);
2823
+ free_hctx:
2824
+ kfree(hctx);
2825
+ fail_alloc_hctx:
2826
+ return NULL;
22652827 }
22662828
22672829 static void blk_mq_init_cpu_queues(struct request_queue *q,
22682830 unsigned int nr_hw_queues)
22692831 {
2270
- unsigned int i;
2832
+ struct blk_mq_tag_set *set = q->tag_set;
2833
+ unsigned int i, j;
22712834
22722835 for_each_possible_cpu(i) {
22732836 struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
22742837 struct blk_mq_hw_ctx *hctx;
2838
+ int k;
22752839
22762840 __ctx->cpu = i;
22772841 spin_lock_init(&__ctx->lock);
2278
- INIT_LIST_HEAD(&__ctx->rq_list);
2842
+ for (k = HCTX_TYPE_DEFAULT; k < HCTX_MAX_TYPES; k++)
2843
+ INIT_LIST_HEAD(&__ctx->rq_lists[k]);
2844
+
22792845 __ctx->queue = q;
22802846
22812847 /*
22822848 * Set local node, IFF we have more than one hw queue. If
22832849 * not, we remain on the home node of the device
22842850 */
2285
- hctx = blk_mq_map_queue(q, i);
2286
- if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
2287
- hctx->numa_node = local_memory_node(cpu_to_node(i));
2851
+ for (j = 0; j < set->nr_maps; j++) {
2852
+ hctx = blk_mq_map_queue_type(q, j, i);
2853
+ if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
2854
+ hctx->numa_node = cpu_to_node(i);
2855
+ }
22882856 }
22892857 }
22902858
2291
-static bool __blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, int hctx_idx)
2859
+static bool __blk_mq_alloc_map_and_request(struct blk_mq_tag_set *set,
2860
+ int hctx_idx)
22922861 {
2862
+ unsigned int flags = set->flags;
22932863 int ret = 0;
22942864
22952865 set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx,
2296
- set->queue_depth, set->reserved_tags);
2866
+ set->queue_depth, set->reserved_tags, flags);
22972867 if (!set->tags[hctx_idx])
22982868 return false;
22992869
....@@ -2302,7 +2872,7 @@
23022872 if (!ret)
23032873 return true;
23042874
2305
- blk_mq_free_rq_map(set->tags[hctx_idx]);
2875
+ blk_mq_free_rq_map(set->tags[hctx_idx], flags);
23062876 set->tags[hctx_idx] = NULL;
23072877 return false;
23082878 }
....@@ -2310,16 +2880,18 @@
23102880 static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
23112881 unsigned int hctx_idx)
23122882 {
2313
- if (set->tags[hctx_idx]) {
2883
+ unsigned int flags = set->flags;
2884
+
2885
+ if (set->tags && set->tags[hctx_idx]) {
23142886 blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
2315
- blk_mq_free_rq_map(set->tags[hctx_idx]);
2887
+ blk_mq_free_rq_map(set->tags[hctx_idx], flags);
23162888 set->tags[hctx_idx] = NULL;
23172889 }
23182890 }
23192891
23202892 static void blk_mq_map_swqueue(struct request_queue *q)
23212893 {
2322
- unsigned int i, hctx_idx;
2894
+ unsigned int i, j, hctx_idx;
23232895 struct blk_mq_hw_ctx *hctx;
23242896 struct blk_mq_ctx *ctx;
23252897 struct blk_mq_tag_set *set = q->tag_set;
....@@ -2336,25 +2908,52 @@
23362908 * If the cpu isn't present, the cpu is mapped to first hctx.
23372909 */
23382910 for_each_possible_cpu(i) {
2339
- hctx_idx = q->mq_map[i];
2340
- /* unmapped hw queue can be remapped after CPU topo changed */
2341
- if (!set->tags[hctx_idx] &&
2342
- !__blk_mq_alloc_rq_map(set, hctx_idx)) {
2343
- /*
2344
- * If tags initialization fail for some hctx,
2345
- * that hctx won't be brought online. In this
2346
- * case, remap the current ctx to hctx[0] which
2347
- * is guaranteed to always have tags allocated
2348
- */
2349
- q->mq_map[i] = 0;
2350
- }
23512911
23522912 ctx = per_cpu_ptr(q->queue_ctx, i);
2353
- hctx = blk_mq_map_queue(q, i);
2913
+ for (j = 0; j < set->nr_maps; j++) {
2914
+ if (!set->map[j].nr_queues) {
2915
+ ctx->hctxs[j] = blk_mq_map_queue_type(q,
2916
+ HCTX_TYPE_DEFAULT, i);
2917
+ continue;
2918
+ }
2919
+ hctx_idx = set->map[j].mq_map[i];
2920
+ /* unmapped hw queue can be remapped after CPU topo changed */
2921
+ if (!set->tags[hctx_idx] &&
2922
+ !__blk_mq_alloc_map_and_request(set, hctx_idx)) {
2923
+ /*
2924
+ * If tags initialization fail for some hctx,
2925
+ * that hctx won't be brought online. In this
2926
+ * case, remap the current ctx to hctx[0] which
2927
+ * is guaranteed to always have tags allocated
2928
+ */
2929
+ set->map[j].mq_map[i] = 0;
2930
+ }
23542931
2355
- cpumask_set_cpu(i, hctx->cpumask);
2356
- ctx->index_hw = hctx->nr_ctx;
2357
- hctx->ctxs[hctx->nr_ctx++] = ctx;
2932
+ hctx = blk_mq_map_queue_type(q, j, i);
2933
+ ctx->hctxs[j] = hctx;
2934
+ /*
2935
+ * If the CPU is already set in the mask, then we've
2936
+ * mapped this one already. This can happen if
2937
+ * devices share queues across queue maps.
2938
+ */
2939
+ if (cpumask_test_cpu(i, hctx->cpumask))
2940
+ continue;
2941
+
2942
+ cpumask_set_cpu(i, hctx->cpumask);
2943
+ hctx->type = j;
2944
+ ctx->index_hw[hctx->type] = hctx->nr_ctx;
2945
+ hctx->ctxs[hctx->nr_ctx++] = ctx;
2946
+
2947
+ /*
2948
+ * If the nr_ctx type overflows, we have exceeded the
2949
+ * amount of sw queues we can support.
2950
+ */
2951
+ BUG_ON(!hctx->nr_ctx);
2952
+ }
2953
+
2954
+ for (; j < HCTX_MAX_TYPES; j++)
2955
+ ctx->hctxs[j] = blk_mq_map_queue_type(q,
2956
+ HCTX_TYPE_DEFAULT, i);
23582957 }
23592958
23602959 queue_for_each_hw_ctx(q, hctx, i) {
....@@ -2403,14 +3002,14 @@
24033002
24043003 queue_for_each_hw_ctx(q, hctx, i) {
24053004 if (shared)
2406
- hctx->flags |= BLK_MQ_F_TAG_SHARED;
3005
+ hctx->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
24073006 else
2408
- hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
3007
+ hctx->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
24093008 }
24103009 }
24113010
2412
-static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set,
2413
- bool shared)
3011
+static void blk_mq_update_tag_set_shared(struct blk_mq_tag_set *set,
3012
+ bool shared)
24143013 {
24153014 struct request_queue *q;
24163015
....@@ -2428,12 +3027,12 @@
24283027 struct blk_mq_tag_set *set = q->tag_set;
24293028
24303029 mutex_lock(&set->tag_list_lock);
2431
- list_del_rcu(&q->tag_set_list);
3030
+ list_del(&q->tag_set_list);
24323031 if (list_is_singular(&set->tag_list)) {
24333032 /* just transitioned to unshared */
2434
- set->flags &= ~BLK_MQ_F_TAG_SHARED;
3033
+ set->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
24353034 /* update existing queue */
2436
- blk_mq_update_tag_set_depth(set, false);
3035
+ blk_mq_update_tag_set_shared(set, false);
24373036 }
24383037 mutex_unlock(&set->tag_list_lock);
24393038 INIT_LIST_HEAD(&q->tag_set_list);
....@@ -2442,24 +3041,50 @@
24423041 static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
24433042 struct request_queue *q)
24443043 {
2445
- q->tag_set = set;
2446
-
24473044 mutex_lock(&set->tag_list_lock);
24483045
24493046 /*
24503047 * Check to see if we're transitioning to shared (from 1 to 2 queues).
24513048 */
24523049 if (!list_empty(&set->tag_list) &&
2453
- !(set->flags & BLK_MQ_F_TAG_SHARED)) {
2454
- set->flags |= BLK_MQ_F_TAG_SHARED;
3050
+ !(set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
3051
+ set->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
24553052 /* update existing queue */
2456
- blk_mq_update_tag_set_depth(set, true);
3053
+ blk_mq_update_tag_set_shared(set, true);
24573054 }
2458
- if (set->flags & BLK_MQ_F_TAG_SHARED)
3055
+ if (set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
24593056 queue_set_hctx_shared(q, true);
2460
- list_add_tail_rcu(&q->tag_set_list, &set->tag_list);
3057
+ list_add_tail(&q->tag_set_list, &set->tag_list);
24613058
24623059 mutex_unlock(&set->tag_list_lock);
3060
+}
3061
+
3062
+/* All allocations will be freed in release handler of q->mq_kobj */
3063
+static int blk_mq_alloc_ctxs(struct request_queue *q)
3064
+{
3065
+ struct blk_mq_ctxs *ctxs;
3066
+ int cpu;
3067
+
3068
+ ctxs = kzalloc(sizeof(*ctxs), GFP_KERNEL);
3069
+ if (!ctxs)
3070
+ return -ENOMEM;
3071
+
3072
+ ctxs->queue_ctx = alloc_percpu(struct blk_mq_ctx);
3073
+ if (!ctxs->queue_ctx)
3074
+ goto fail;
3075
+
3076
+ for_each_possible_cpu(cpu) {
3077
+ struct blk_mq_ctx *ctx = per_cpu_ptr(ctxs->queue_ctx, cpu);
3078
+ ctx->ctxs = ctxs;
3079
+ }
3080
+
3081
+ q->mq_kobj = &ctxs->kobj;
3082
+ q->queue_ctx = ctxs->queue_ctx;
3083
+
3084
+ return 0;
3085
+ fail:
3086
+ kfree(ctxs);
3087
+ return -ENOMEM;
24633088 }
24643089
24653090 /*
....@@ -2470,17 +3095,17 @@
24703095 */
24713096 void blk_mq_release(struct request_queue *q)
24723097 {
2473
- struct blk_mq_hw_ctx *hctx;
2474
- unsigned int i;
3098
+ struct blk_mq_hw_ctx *hctx, *next;
3099
+ int i;
24753100
2476
- /* hctx kobj stays in hctx */
2477
- queue_for_each_hw_ctx(q, hctx, i) {
2478
- if (!hctx)
2479
- continue;
3101
+ queue_for_each_hw_ctx(q, hctx, i)
3102
+ WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list));
3103
+
3104
+ /* all hctx are in .unused_hctx_list now */
3105
+ list_for_each_entry_safe(hctx, next, &q->unused_hctx_list, hctx_list) {
3106
+ list_del_init(&hctx->hctx_list);
24803107 kobject_put(&hctx->kobj);
24813108 }
2482
-
2483
- q->mq_map = NULL;
24843109
24853110 kfree(q->queue_hw_ctx);
24863111
....@@ -2489,102 +3114,184 @@
24893114 * both share lifetime with request queue.
24903115 */
24913116 blk_mq_sysfs_deinit(q);
2492
-
2493
- free_percpu(q->queue_ctx);
24943117 }
24953118
2496
-struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
3119
+struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
3120
+ void *queuedata)
24973121 {
24983122 struct request_queue *uninit_q, *q;
24993123
2500
- uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node, NULL);
3124
+ uninit_q = blk_alloc_queue(set->numa_node);
25013125 if (!uninit_q)
25023126 return ERR_PTR(-ENOMEM);
3127
+ uninit_q->queuedata = queuedata;
25033128
2504
- q = blk_mq_init_allocated_queue(set, uninit_q);
3129
+ /*
3130
+ * Initialize the queue without an elevator. device_add_disk() will do
3131
+ * the initialization.
3132
+ */
3133
+ q = blk_mq_init_allocated_queue(set, uninit_q, false);
25053134 if (IS_ERR(q))
25063135 blk_cleanup_queue(uninit_q);
25073136
25083137 return q;
25093138 }
3139
+EXPORT_SYMBOL_GPL(blk_mq_init_queue_data);
3140
+
3141
+struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
3142
+{
3143
+ return blk_mq_init_queue_data(set, NULL);
3144
+}
25103145 EXPORT_SYMBOL(blk_mq_init_queue);
25113146
2512
-static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
3147
+/*
3148
+ * Helper for setting up a queue with mq ops, given queue depth, and
3149
+ * the passed in mq ops flags.
3150
+ */
3151
+struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set,
3152
+ const struct blk_mq_ops *ops,
3153
+ unsigned int queue_depth,
3154
+ unsigned int set_flags)
25133155 {
2514
- int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
3156
+ struct request_queue *q;
3157
+ int ret;
25153158
2516
- BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu),
2517
- __alignof__(struct blk_mq_hw_ctx)) !=
2518
- sizeof(struct blk_mq_hw_ctx));
3159
+ memset(set, 0, sizeof(*set));
3160
+ set->ops = ops;
3161
+ set->nr_hw_queues = 1;
3162
+ set->nr_maps = 1;
3163
+ set->queue_depth = queue_depth;
3164
+ set->numa_node = NUMA_NO_NODE;
3165
+ set->flags = set_flags;
25193166
2520
- if (tag_set->flags & BLK_MQ_F_BLOCKING)
2521
- hw_ctx_size += sizeof(struct srcu_struct);
3167
+ ret = blk_mq_alloc_tag_set(set);
3168
+ if (ret)
3169
+ return ERR_PTR(ret);
25223170
2523
- return hw_ctx_size;
3171
+ q = blk_mq_init_queue(set);
3172
+ if (IS_ERR(q)) {
3173
+ blk_mq_free_tag_set(set);
3174
+ return q;
3175
+ }
3176
+
3177
+ return q;
3178
+}
3179
+EXPORT_SYMBOL(blk_mq_init_sq_queue);
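/*
 * Editor's illustrative sketch (not part of the patch above): a minimal,
 * hypothetical single-queue driver using blk_mq_init_sq_queue(). All
 * "example_" identifiers are assumptions for illustration only.
 */
static blk_status_t example_queue_rq(struct blk_mq_hw_ctx *hctx,
				     const struct blk_mq_queue_data *bd)
{
	blk_mq_start_request(bd->rq);
	/* Hand bd->rq to the hardware here, then complete it when done. */
	blk_mq_end_request(bd->rq, BLK_STS_OK);
	return BLK_STS_OK;
}

static const struct blk_mq_ops example_mq_ops = {
	.queue_rq	= example_queue_rq,
};

static struct blk_mq_tag_set example_tag_set;

static struct request_queue *example_create_queue(void)
{
	/* 128-deep queue, allow merging; the helper zeroes the tag set first. */
	return blk_mq_init_sq_queue(&example_tag_set, &example_mq_ops,
				    128, BLK_MQ_F_SHOULD_MERGE);
}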
3180
+
3181
+static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx(
3182
+ struct blk_mq_tag_set *set, struct request_queue *q,
3183
+ int hctx_idx, int node)
3184
+{
3185
+ struct blk_mq_hw_ctx *hctx = NULL, *tmp;
3186
+
3187
+ /* reuse dead hctx first */
3188
+ spin_lock(&q->unused_hctx_lock);
3189
+ list_for_each_entry(tmp, &q->unused_hctx_list, hctx_list) {
3190
+ if (tmp->numa_node == node) {
3191
+ hctx = tmp;
3192
+ break;
3193
+ }
3194
+ }
3195
+ if (hctx)
3196
+ list_del_init(&hctx->hctx_list);
3197
+ spin_unlock(&q->unused_hctx_lock);
3198
+
3199
+ if (!hctx)
3200
+ hctx = blk_mq_alloc_hctx(q, set, node);
3201
+ if (!hctx)
3202
+ goto fail;
3203
+
3204
+ if (blk_mq_init_hctx(q, set, hctx, hctx_idx))
3205
+ goto free_hctx;
3206
+
3207
+ return hctx;
3208
+
3209
+ free_hctx:
3210
+ kobject_put(&hctx->kobj);
3211
+ fail:
3212
+ return NULL;
25243213 }
25253214
25263215 static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
25273216 struct request_queue *q)
25283217 {
2529
- int i, j;
3218
+ int i, j, end;
25303219 struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;
25313220
2532
- blk_mq_sysfs_unregister(q);
3221
+ if (q->nr_hw_queues < set->nr_hw_queues) {
3222
+ struct blk_mq_hw_ctx **new_hctxs;
3223
+
3224
+ new_hctxs = kcalloc_node(set->nr_hw_queues,
3225
+ sizeof(*new_hctxs), GFP_KERNEL,
3226
+ set->numa_node);
3227
+ if (!new_hctxs)
3228
+ return;
3229
+ if (hctxs)
3230
+ memcpy(new_hctxs, hctxs, q->nr_hw_queues *
3231
+ sizeof(*hctxs));
3232
+ q->queue_hw_ctx = new_hctxs;
3233
+ kfree(hctxs);
3234
+ hctxs = new_hctxs;
3235
+ }
25333236
25343237 /* protect against switching io scheduler */
25353238 mutex_lock(&q->sysfs_lock);
25363239 for (i = 0; i < set->nr_hw_queues; i++) {
25373240 int node;
3241
+ struct blk_mq_hw_ctx *hctx;
25383242
2539
- if (hctxs[i])
3243
+ node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], i);
3244
+ /*
3245
+ * If the hw queue has been mapped to another numa node,
3246
+ * we need to realloc the hctx. If allocation fails, fallback
3247
+ * to use the previous one.
3248
+ */
3249
+ if (hctxs[i] && (hctxs[i]->numa_node == node))
25403250 continue;
25413251
2542
- node = blk_mq_hw_queue_to_node(q->mq_map, i);
2543
- hctxs[i] = kzalloc_node(blk_mq_hw_ctx_size(set),
2544
- GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
2545
- node);
2546
- if (!hctxs[i])
2547
- break;
2548
-
2549
- if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask,
2550
- GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
2551
- node)) {
2552
- kfree(hctxs[i]);
2553
- hctxs[i] = NULL;
2554
- break;
3252
+ hctx = blk_mq_alloc_and_init_hctx(set, q, i, node);
3253
+ if (hctx) {
3254
+ if (hctxs[i])
3255
+ blk_mq_exit_hctx(q, set, hctxs[i], i);
3256
+ hctxs[i] = hctx;
3257
+ } else {
3258
+ if (hctxs[i])
3259
+ pr_warn("Allocate new hctx on node %d fails,\
3260
+ fallback to previous one on node %d\n",
3261
+ node, hctxs[i]->numa_node);
3262
+ else
3263
+ break;
25553264 }
2556
-
2557
- atomic_set(&hctxs[i]->nr_active, 0);
2558
- hctxs[i]->numa_node = node;
2559
- hctxs[i]->queue_num = i;
2560
-
2561
- if (blk_mq_init_hctx(q, set, hctxs[i], i)) {
2562
- free_cpumask_var(hctxs[i]->cpumask);
2563
- kfree(hctxs[i]);
2564
- hctxs[i] = NULL;
2565
- break;
2566
- }
2567
- blk_mq_hctx_kobj_init(hctxs[i]);
25683265 }
2569
- for (j = i; j < q->nr_hw_queues; j++) {
3266
+ /*
3267
+ * Increasing nr_hw_queues fails. Free the newly allocated
3268
+ * hctxs and keep the previous q->nr_hw_queues.
3269
+ */
3270
+ if (i != set->nr_hw_queues) {
3271
+ j = q->nr_hw_queues;
3272
+ end = i;
3273
+ } else {
3274
+ j = i;
3275
+ end = q->nr_hw_queues;
3276
+ q->nr_hw_queues = set->nr_hw_queues;
3277
+ }
3278
+
3279
+ for (; j < end; j++) {
25703280 struct blk_mq_hw_ctx *hctx = hctxs[j];
25713281
25723282 if (hctx) {
25733283 if (hctx->tags)
25743284 blk_mq_free_map_and_requests(set, j);
25753285 blk_mq_exit_hctx(q, set, hctx, j);
2576
- kobject_put(&hctx->kobj);
25773286 hctxs[j] = NULL;
2578
-
25793287 }
25803288 }
2581
- q->nr_hw_queues = i;
25823289 mutex_unlock(&q->sysfs_lock);
2583
- blk_mq_sysfs_register(q);
25843290 }
25853291
25863292 struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
2587
- struct request_queue *q)
3293
+ struct request_queue *q,
3294
+ bool elevator_init)
25883295 {
25893296 /* mark the queue as mq asap */
25903297 q->mq_ops = set->ops;
....@@ -2595,19 +3302,14 @@
25953302 if (!q->poll_cb)
25963303 goto err_exit;
25973304
2598
- q->queue_ctx = alloc_percpu(struct blk_mq_ctx);
2599
- if (!q->queue_ctx)
2600
- goto err_exit;
3305
+ if (blk_mq_alloc_ctxs(q))
3306
+ goto err_poll;
26013307
26023308 /* init q->mq_kobj and sw queues' kobjects */
26033309 blk_mq_sysfs_init(q);
26043310
2605
- q->queue_hw_ctx = kcalloc_node(nr_cpu_ids, sizeof(*(q->queue_hw_ctx)),
2606
- GFP_KERNEL, set->numa_node);
2607
- if (!q->queue_hw_ctx)
2608
- goto err_percpu;
2609
-
2610
- q->mq_map = set->mq_map;
3311
+ INIT_LIST_HEAD(&q->unused_hctx_list);
3312
+ spin_lock_init(&q->unused_hctx_lock);
26113313
26123314 blk_mq_realloc_hw_ctxs(set, q);
26133315 if (!q->nr_hw_queues)
....@@ -2616,12 +3318,12 @@
26163318 INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
26173319 blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
26183320
2619
- q->nr_queues = nr_cpu_ids;
3321
+ q->tag_set = set;
26203322
26213323 q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
2622
-
2623
- if (!(set->flags & BLK_MQ_F_SG_MERGE))
2624
- queue_flag_set_unlocked(QUEUE_FLAG_NO_SG_MERGE, q);
3324
+ if (set->nr_maps > HCTX_TYPE_POLL &&
3325
+ set->map[HCTX_TYPE_POLL].nr_queues)
3326
+ blk_queue_flag_set(QUEUE_FLAG_POLL, q);
26253327
26263328 q->sg_reserved_size = INT_MAX;
26273329
....@@ -2629,41 +3331,29 @@
26293331 INIT_LIST_HEAD(&q->requeue_list);
26303332 spin_lock_init(&q->requeue_lock);
26313333
2632
- blk_queue_make_request(q, blk_mq_make_request);
2633
- if (q->mq_ops->poll)
2634
- q->poll_fn = blk_mq_poll;
2635
-
2636
- /*
2637
- * Do this after blk_queue_make_request() overrides it...
2638
- */
26393334 q->nr_requests = set->queue_depth;
26403335
26413336 /*
26423337 * Default to classic polling
26433338 */
2644
- q->poll_nsec = -1;
2645
-
2646
- if (set->ops->complete)
2647
- blk_queue_softirq_done(q, set->ops->complete);
3339
+ q->poll_nsec = BLK_MQ_POLL_CLASSIC;
26483340
26493341 blk_mq_init_cpu_queues(q, set->nr_hw_queues);
26503342 blk_mq_add_queue_tag_set(set, q);
26513343 blk_mq_map_swqueue(q);
26523344
2653
- if (!(set->flags & BLK_MQ_F_NO_SCHED)) {
2654
- int ret;
2655
-
2656
- ret = elevator_init_mq(q);
2657
- if (ret)
2658
- return ERR_PTR(ret);
2659
- }
3345
+ if (elevator_init)
3346
+ elevator_init_mq(q);
26603347
26613348 return q;
26623349
26633350 err_hctxs:
26643351 kfree(q->queue_hw_ctx);
2665
-err_percpu:
2666
- free_percpu(q->queue_ctx);
3352
+ q->nr_hw_queues = 0;
3353
+ blk_mq_sysfs_deinit(q);
3354
+err_poll:
3355
+ blk_stat_free_callback(q->poll_cb);
3356
+ q->poll_cb = NULL;
26673357 err_exit:
26683358 q->mq_ops = NULL;
26693359 return ERR_PTR(-ENOMEM);
....@@ -2681,38 +3371,21 @@
26813371 blk_mq_del_queue_tag_set(q);
26823372 }
26833373
2684
-/* Basically redo blk_mq_init_queue with queue frozen */
2685
-static void blk_mq_queue_reinit(struct request_queue *q)
2686
-{
2687
- WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));
2688
-
2689
- blk_mq_debugfs_unregister_hctxs(q);
2690
- blk_mq_sysfs_unregister(q);
2691
-
2692
- /*
2693
- * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
2694
- * we should change hctx numa_node according to the new topology (this
2695
- * involves freeing and re-allocating memory, worth doing?)
2696
- */
2697
- blk_mq_map_swqueue(q);
2698
-
2699
- blk_mq_sysfs_register(q);
2700
- blk_mq_debugfs_register_hctxs(q);
2701
-}
2702
-
27033374 static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
27043375 {
27053376 int i;
27063377
2707
- for (i = 0; i < set->nr_hw_queues; i++)
2708
- if (!__blk_mq_alloc_rq_map(set, i))
3378
+ for (i = 0; i < set->nr_hw_queues; i++) {
3379
+ if (!__blk_mq_alloc_map_and_request(set, i))
27093380 goto out_unwind;
3381
+ cond_resched();
3382
+ }
27103383
27113384 return 0;
27123385
27133386 out_unwind:
27143387 while (--i >= 0)
2715
- blk_mq_free_rq_map(set->tags[i]);
3388
+ blk_mq_free_map_and_requests(set, i);
27163389
27173390 return -ENOMEM;
27183391 }
....@@ -2722,7 +3395,7 @@
27223395 * may reduce the depth asked for, if memory is tight. set->queue_depth
27233396 * will be updated to reflect the allocated depth.
27243397 */
2725
-static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
3398
+static int blk_mq_alloc_map_and_requests(struct blk_mq_tag_set *set)
27263399 {
27273400 unsigned int depth;
27283401 int err;
....@@ -2754,7 +3427,17 @@
27543427
27553428 static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
27563429 {
2757
- if (set->ops->map_queues) {
3430
+ /*
3431
+ * blk_mq_map_queues() and multiple .map_queues() implementations
3432
+ * expect that set->map[HCTX_TYPE_DEFAULT].nr_queues is set to the
3433
+ * number of hardware queues.
3434
+ */
3435
+ if (set->nr_maps == 1)
3436
+ set->map[HCTX_TYPE_DEFAULT].nr_queues = set->nr_hw_queues;
3437
+
3438
+ if (set->ops->map_queues && !is_kdump_kernel()) {
3439
+ int i;
3440
+
27583441 /*
27593442 * transport .map_queues is usually done in the following
27603443 * way:
....@@ -2762,18 +3445,44 @@
27623445 * for (queue = 0; queue < set->nr_hw_queues; queue++) {
27633446 * mask = get_cpu_mask(queue)
27643447 * for_each_cpu(cpu, mask)
2765
- * set->mq_map[cpu] = queue;
3448
+ * set->map[x].mq_map[cpu] = queue;
27663449 * }
27673450 *
27683451 * When we need to remap, the table has to be cleared for
27693452 * killing stale mapping since one CPU may not be mapped
27703453 * to any hw queue.
27713454 */
2772
- blk_mq_clear_mq_map(set);
3455
+ for (i = 0; i < set->nr_maps; i++)
3456
+ blk_mq_clear_mq_map(&set->map[i]);
27733457
27743458 return set->ops->map_queues(set);
2775
- } else
2776
- return blk_mq_map_queues(set);
3459
+ } else {
3460
+ BUG_ON(set->nr_maps > 1);
3461
+ return blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
3462
+ }
3463
+}
3464
+
3465
+static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set,
3466
+ int cur_nr_hw_queues, int new_nr_hw_queues)
3467
+{
3468
+ struct blk_mq_tags **new_tags;
3469
+
3470
+ if (cur_nr_hw_queues >= new_nr_hw_queues)
3471
+ return 0;
3472
+
3473
+ new_tags = kcalloc_node(new_nr_hw_queues, sizeof(struct blk_mq_tags *),
3474
+ GFP_KERNEL, set->numa_node);
3475
+ if (!new_tags)
3476
+ return -ENOMEM;
3477
+
3478
+ if (set->tags)
3479
+ memcpy(new_tags, set->tags, cur_nr_hw_queues *
3480
+ sizeof(*set->tags));
3481
+ kfree(set->tags);
3482
+ set->tags = new_tags;
3483
+ set->nr_hw_queues = new_nr_hw_queues;
3484
+
3485
+ return 0;
27773486 }
27783487
27793488 /*
....@@ -2784,7 +3493,7 @@
27843493 */
27853494 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
27863495 {
2787
- int ret;
3496
+ int i, ret;
27883497
27893498 BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);
27903499
....@@ -2807,6 +3516,11 @@
28073516 set->queue_depth = BLK_MQ_MAX_DEPTH;
28083517 }
28093518
3519
+ if (!set->nr_maps)
3520
+ set->nr_maps = 1;
3521
+ else if (set->nr_maps > HCTX_MAX_TYPES)
3522
+ return -EINVAL;
3523
+
28103524 /*
28113525 * If a crashdump is active, then we are potentially in a very
28123526 * memory constrained environment. Limit us to 1 queue and
....@@ -2814,42 +3528,59 @@
28143528 */
28153529 if (is_kdump_kernel()) {
28163530 set->nr_hw_queues = 1;
3531
+ set->nr_maps = 1;
28173532 set->queue_depth = min(64U, set->queue_depth);
28183533 }
28193534 /*
2820
- * There is no use for more h/w queues than cpus.
3535
+ * There is no use for more h/w queues than cpus if we just have
3536
+ * a single map
28213537 */
2822
- if (set->nr_hw_queues > nr_cpu_ids)
3538
+ if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids)
28233539 set->nr_hw_queues = nr_cpu_ids;
28243540
2825
- set->tags = kcalloc_node(nr_cpu_ids, sizeof(struct blk_mq_tags *),
2826
- GFP_KERNEL, set->numa_node);
2827
- if (!set->tags)
3541
+ if (blk_mq_realloc_tag_set_tags(set, 0, set->nr_hw_queues) < 0)
28283542 return -ENOMEM;
28293543
28303544 ret = -ENOMEM;
2831
- set->mq_map = kcalloc_node(nr_cpu_ids, sizeof(*set->mq_map),
2832
- GFP_KERNEL, set->numa_node);
2833
- if (!set->mq_map)
2834
- goto out_free_tags;
3545
+ for (i = 0; i < set->nr_maps; i++) {
3546
+ set->map[i].mq_map = kcalloc_node(nr_cpu_ids,
3547
+ sizeof(set->map[i].mq_map[0]),
3548
+ GFP_KERNEL, set->numa_node);
3549
+ if (!set->map[i].mq_map)
3550
+ goto out_free_mq_map;
3551
+ set->map[i].nr_queues = is_kdump_kernel() ? 1 : set->nr_hw_queues;
3552
+ }
28353553
28363554 ret = blk_mq_update_queue_map(set);
28373555 if (ret)
28383556 goto out_free_mq_map;
28393557
2840
- ret = blk_mq_alloc_rq_maps(set);
3558
+ ret = blk_mq_alloc_map_and_requests(set);
28413559 if (ret)
28423560 goto out_free_mq_map;
3561
+
3562
+ if (blk_mq_is_sbitmap_shared(set->flags)) {
3563
+ atomic_set(&set->active_queues_shared_sbitmap, 0);
3564
+
3565
+ if (blk_mq_init_shared_sbitmap(set, set->flags)) {
3566
+ ret = -ENOMEM;
3567
+ goto out_free_mq_rq_maps;
3568
+ }
3569
+ }
28433570
28443571 mutex_init(&set->tag_list_lock);
28453572 INIT_LIST_HEAD(&set->tag_list);
28463573
28473574 return 0;
28483575
3576
+out_free_mq_rq_maps:
3577
+ for (i = 0; i < set->nr_hw_queues; i++)
3578
+ blk_mq_free_map_and_requests(set, i);
28493579 out_free_mq_map:
2850
- kfree(set->mq_map);
2851
- set->mq_map = NULL;
2852
-out_free_tags:
3580
+ for (i = 0; i < set->nr_maps; i++) {
3581
+ kfree(set->map[i].mq_map);
3582
+ set->map[i].mq_map = NULL;
3583
+ }
28533584 kfree(set->tags);
28543585 set->tags = NULL;
28553586 return ret;
....@@ -2858,13 +3589,18 @@
28583589
28593590 void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
28603591 {
2861
- int i;
3592
+ int i, j;
28623593
2863
- for (i = 0; i < nr_cpu_ids; i++)
3594
+ for (i = 0; i < set->nr_hw_queues; i++)
28643595 blk_mq_free_map_and_requests(set, i);
28653596
2866
- kfree(set->mq_map);
2867
- set->mq_map = NULL;
3597
+ if (blk_mq_is_sbitmap_shared(set->flags))
3598
+ blk_mq_exit_shared_sbitmap(set);
3599
+
3600
+ for (j = 0; j < set->nr_maps; j++) {
3601
+ kfree(set->map[j].mq_map);
3602
+ set->map[j].mq_map = NULL;
3603
+ }
28683604
28693605 kfree(set->tags);
28703606 set->tags = NULL;
....@@ -2880,6 +3616,9 @@
28803616 if (!set)
28813617 return -EINVAL;
28823618
3619
+ if (q->nr_requests == nr)
3620
+ return 0;
3621
+
28833622 blk_mq_freeze_queue(q);
28843623 blk_mq_quiesce_queue(q);
28853624
....@@ -2894,14 +3633,16 @@
28943633 if (!hctx->sched_tags) {
28953634 ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr,
28963635 false);
3636
+ if (!ret && blk_mq_is_sbitmap_shared(set->flags))
3637
+ blk_mq_tag_resize_shared_sbitmap(set, nr);
28973638 } else {
28983639 ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
28993640 nr, true);
29003641 }
29013642 if (ret)
29023643 break;
2903
- if (q->elevator && q->elevator->type->ops.mq.depth_updated)
2904
- q->elevator->type->ops.mq.depth_updated(hctx);
3644
+ if (q->elevator && q->elevator->type->ops.depth_updated)
3645
+ q->elevator->type->ops.depth_updated(hctx);
29053646 }
29063647
29073648 if (!ret)
....@@ -2988,20 +3729,19 @@
29883729 {
29893730 struct request_queue *q;
29903731 LIST_HEAD(head);
3732
+ int prev_nr_hw_queues;
29913733
29923734 lockdep_assert_held(&set->tag_list_lock);
29933735
2994
- if (nr_hw_queues > nr_cpu_ids)
3736
+ if (set->nr_maps == 1 && nr_hw_queues > nr_cpu_ids)
29953737 nr_hw_queues = nr_cpu_ids;
2996
- if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues)
3738
+ if (nr_hw_queues < 1)
3739
+ return;
3740
+ if (set->nr_maps == 1 && nr_hw_queues == set->nr_hw_queues)
29973741 return;
29983742
29993743 list_for_each_entry(q, &set->tag_list, tag_set_list)
30003744 blk_mq_freeze_queue(q);
3001
- /*
3002
- * Sync with blk_mq_queue_tag_busy_iter.
3003
- */
3004
- synchronize_rcu();
30053745 /*
30063746 * Switch IO scheduler to 'none', cleaning up the data associated
30073747 * with the previous scheduler. We will switch back once we are done
....@@ -3011,11 +3751,35 @@
30113751 if (!blk_mq_elv_switch_none(&head, q))
30123752 goto switch_back;
30133753
3754
+ list_for_each_entry(q, &set->tag_list, tag_set_list) {
3755
+ blk_mq_debugfs_unregister_hctxs(q);
3756
+ blk_mq_sysfs_unregister(q);
3757
+ }
3758
+
3759
+ prev_nr_hw_queues = set->nr_hw_queues;
3760
+ if (blk_mq_realloc_tag_set_tags(set, set->nr_hw_queues, nr_hw_queues) <
3761
+ 0)
3762
+ goto reregister;
3763
+
30143764 set->nr_hw_queues = nr_hw_queues;
3765
+fallback:
30153766 blk_mq_update_queue_map(set);
30163767 list_for_each_entry(q, &set->tag_list, tag_set_list) {
30173768 blk_mq_realloc_hw_ctxs(set, q);
3018
- blk_mq_queue_reinit(q);
3769
+ if (q->nr_hw_queues != set->nr_hw_queues) {
3770
+ pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n",
3771
+ nr_hw_queues, prev_nr_hw_queues);
3772
+ set->nr_hw_queues = prev_nr_hw_queues;
3773
+ blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
3774
+ goto fallback;
3775
+ }
3776
+ blk_mq_map_swqueue(q);
3777
+ }
3778
+
3779
+reregister:
3780
+ list_for_each_entry(q, &set->tag_list, tag_set_list) {
3781
+ blk_mq_sysfs_register(q);
3782
+ blk_mq_debugfs_register_hctxs(q);
30193783 }
30203784
30213785 switch_back:
....@@ -3069,7 +3833,6 @@
30693833 }
30703834
30713835 static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
3072
- struct blk_mq_hw_ctx *hctx,
30733836 struct request *rq)
30743837 {
30753838 unsigned long ret = 0;
....@@ -3102,7 +3865,6 @@
31023865 }
31033866
31043867 static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
3105
- struct blk_mq_hw_ctx *hctx,
31063868 struct request *rq)
31073869 {
31083870 struct hrtimer_sleeper hs;
....@@ -3114,18 +3876,15 @@
31143876 return false;
31153877
31163878 /*
3117
- * poll_nsec can be:
3879
+ * If we get here, hybrid polling is enabled. Hence poll_nsec can be:
31183880 *
3119
- * -1: don't ever hybrid sleep
31203881 * 0: use half of prev avg
31213882 * >0: use this specific value
31223883 */
3123
- if (q->poll_nsec == -1)
3124
- return false;
3125
- else if (q->poll_nsec > 0)
3884
+ if (q->poll_nsec > 0)
31263885 nsecs = q->poll_nsec;
31273886 else
3128
- nsecs = blk_mq_poll_nsecs(q, hctx, rq);
3887
+ nsecs = blk_mq_poll_nsecs(q, rq);
31293888
31303889 if (!nsecs)
31313890 return false;
....@@ -3139,15 +3898,14 @@
31393898 kt = nsecs;
31403899
31413900 mode = HRTIMER_MODE_REL;
3142
- hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, mode);
3901
+ hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, mode);
31433902 hrtimer_set_expires(&hs.timer, kt);
31443903
3145
- hrtimer_init_sleeper(&hs, current);
31463904 do {
31473905 if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE)
31483906 break;
31493907 set_current_state(TASK_UNINTERRUPTIBLE);
3150
- hrtimer_start_expires(&hs.timer, mode);
3908
+ hrtimer_sleeper_start_expires(&hs, mode);
31513909 if (hs.task)
31523910 io_schedule();
31533911 hrtimer_cancel(&hs.timer);
....@@ -3159,59 +3917,14 @@
31593917 return true;
31603918 }
31613919
3162
-static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq)
3920
+static bool blk_mq_poll_hybrid(struct request_queue *q,
3921
+ struct blk_mq_hw_ctx *hctx, blk_qc_t cookie)
31633922 {
3164
- struct request_queue *q = hctx->queue;
3165
- long state;
3166
-
3167
- /*
3168
- * If we sleep, have the caller restart the poll loop to reset
3169
- * the state. Like for the other success return cases, the
3170
- * caller is responsible for checking if the IO completed. If
3171
- * the IO isn't complete, we'll get called again and will go
3172
- * straight to the busy poll loop.
3173
- */
3174
- if (blk_mq_poll_hybrid_sleep(q, hctx, rq))
3175
- return true;
3176
-
3177
- hctx->poll_considered++;
3178
-
3179
- state = current->state;
3180
- while (!need_resched()) {
3181
- int ret;
3182
-
3183
- hctx->poll_invoked++;
3184
-
3185
- ret = q->mq_ops->poll(hctx, rq->tag);
3186
- if (ret > 0) {
3187
- hctx->poll_success++;
3188
- set_current_state(TASK_RUNNING);
3189
- return true;
3190
- }
3191
-
3192
- if (signal_pending_state(state, current))
3193
- set_current_state(TASK_RUNNING);
3194
-
3195
- if (current->state == TASK_RUNNING)
3196
- return true;
3197
- if (ret < 0)
3198
- break;
3199
- cpu_relax();
3200
- }
3201
-
3202
- __set_current_state(TASK_RUNNING);
3203
- return false;
3204
-}
3205
-
3206
-static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
3207
-{
3208
- struct blk_mq_hw_ctx *hctx;
32093923 struct request *rq;
32103924
3211
- if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
3925
+ if (q->poll_nsec == BLK_MQ_POLL_CLASSIC)
32123926 return false;
32133927
3214
- hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
32153928 if (!blk_qc_t_is_internal(cookie))
32163929 rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
32173930 else {
....@@ -3226,13 +3939,97 @@
32263939 return false;
32273940 }
32283941
3229
- return __blk_mq_poll(hctx, rq);
3942
+ return blk_mq_poll_hybrid_sleep(q, rq);
32303943 }
3944
+
3945
+/**
3946
+ * blk_poll - poll for IO completions
3947
+ * @q: the queue
3948
+ * @cookie: cookie passed back at IO submission time
3949
+ * @spin: whether to spin for completions
3950
+ *
3951
+ * Description:
3952
+ * Poll for completions on the passed in queue. Returns number of
3953
+ * completed entries found. If @spin is true, then blk_poll will continue
3954
+ * looping until at least one completion is found, unless the task is
3955
+ * otherwise marked running (or we need to reschedule).
3956
+ */
3957
+int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
3958
+{
3959
+ struct blk_mq_hw_ctx *hctx;
3960
+ long state;
3961
+
3962
+ if (!blk_qc_t_valid(cookie) ||
3963
+ !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
3964
+ return 0;
3965
+
3966
+ if (current->plug)
3967
+ blk_flush_plug_list(current->plug, false);
3968
+
3969
+ hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
3970
+
3971
+ /*
3972
+ * If we sleep, have the caller restart the poll loop to reset
3973
+ * the state. Like for the other success return cases, the
3974
+ * caller is responsible for checking if the IO completed. If
3975
+ * the IO isn't complete, we'll get called again and will go
3976
+ * straight to the busy poll loop.
3977
+ */
3978
+ if (blk_mq_poll_hybrid(q, hctx, cookie))
3979
+ return 1;
3980
+
3981
+ hctx->poll_considered++;
3982
+
3983
+ state = current->state;
3984
+ do {
3985
+ int ret;
3986
+
3987
+ hctx->poll_invoked++;
3988
+
3989
+ ret = q->mq_ops->poll(hctx);
3990
+ if (ret > 0) {
3991
+ hctx->poll_success++;
3992
+ __set_current_state(TASK_RUNNING);
3993
+ return ret;
3994
+ }
3995
+
3996
+ if (signal_pending_state(state, current))
3997
+ __set_current_state(TASK_RUNNING);
3998
+
3999
+ if (current->state == TASK_RUNNING)
4000
+ return 1;
4001
+ if (ret < 0 || !spin)
4002
+ break;
4003
+ cpu_relax();
4004
+ } while (!need_resched());
4005
+
4006
+ __set_current_state(TASK_RUNNING);
4007
+ return 0;
4008
+}
4009
+EXPORT_SYMBOL_GPL(blk_poll);
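/*
 * Editor's illustrative sketch (not part of the patch above): how a caller of
 * polled IO might use the cookie returned at submission together with
 * blk_poll(), mirroring the pattern used by polled direct IO. The "done" flag
 * and the "example_" name are assumptions for illustration only.
 */
static void example_wait_for_polled_io(struct request_queue *q, blk_qc_t cookie,
				       bool *done)
{
	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (READ_ONCE(*done))
			break;
		/* Spin for completions; sleep if polling made no progress. */
		if (blk_poll(q, cookie, true) <= 0)
			io_schedule();
	}
	__set_current_state(TASK_RUNNING);
}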
4010
+
4011
+unsigned int blk_mq_rq_cpu(struct request *rq)
4012
+{
4013
+ return rq->mq_ctx->cpu;
4014
+}
4015
+EXPORT_SYMBOL(blk_mq_rq_cpu);
32314016
32324017 static int __init blk_mq_init(void)
32334018 {
4019
+ int i;
4020
+
4021
+ for_each_possible_cpu(i)
4022
+ init_llist_head(&per_cpu(blk_cpu_done, i));
4023
+ open_softirq(BLOCK_SOFTIRQ, blk_done_softirq);
4024
+
4025
+ cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD,
4026
+ "block/softirq:dead", NULL,
4027
+ blk_softirq_cpu_dead);
32344028 cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
32354029 blk_mq_hctx_notify_dead);
4030
+ cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online",
4031
+ blk_mq_hctx_notify_online,
4032
+ blk_mq_hctx_notify_offline);
32364033 return 0;
32374034 }
32384035 subsys_initcall(blk_mq_init);