2023-12-11 d2ccde1c8e90d38cee87a1b0309ad2827f3fd30d
kernel/block/blk-mq.c
@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0
12 /*
23 * Block multiqueue core code
34 *
@@ -25,30 +26,36 @@
2526 #include <linux/delay.h>
2627 #include <linux/crash_dump.h>
2728 #include <linux/prefetch.h>
29
+#include <linux/blk-crypto.h>
2830
2931 #include <trace/events/block.h>
3032
3133 #include <linux/blk-mq.h>
34
+#include <linux/t10-pi.h>
3235 #include "blk.h"
3336 #include "blk-mq.h"
3437 #include "blk-mq-debugfs.h"
3538 #include "blk-mq-tag.h"
39
+#include "blk-pm.h"
3640 #include "blk-stat.h"
3741 #include "blk-mq-sched.h"
3842 #include "blk-rq-qos.h"
3943
40
-static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie);
44
+#include <trace/hooks/block.h>
45
+
46
+static DEFINE_PER_CPU(struct llist_head, blk_cpu_done);
47
+
4148 static void blk_mq_poll_stats_start(struct request_queue *q);
4249 static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
4350
4451 static int blk_mq_poll_stats_bkt(const struct request *rq)
4552 {
46
- int ddir, bytes, bucket;
53
+ int ddir, sectors, bucket;
4754
4855 ddir = rq_data_dir(rq);
49
- bytes = blk_rq_bytes(rq);
56
+ sectors = blk_rq_stats_sectors(rq);
5057
51
- bucket = ddir + 2*(ilog2(bytes) - 9);
58
+ bucket = ddir + 2 * ilog2(sectors);
5259
5360 if (bucket < 0)
5461 return -1;
@@ -59,7 +66,8 @@
5966 }
6067
6168 /*
62
- * Check if any of the ctx's have pending work in this hardware queue
69
+ * Check if any of the ctx, dispatch list or elevator
70
+ * have pending work in this hardware queue.
6371 */
6472 static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
6573 {
@@ -74,75 +82,67 @@
7482 static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
7583 struct blk_mq_ctx *ctx)
7684 {
77
- if (!sbitmap_test_bit(&hctx->ctx_map, ctx->index_hw))
78
- sbitmap_set_bit(&hctx->ctx_map, ctx->index_hw);
85
+ const int bit = ctx->index_hw[hctx->type];
86
+
87
+ if (!sbitmap_test_bit(&hctx->ctx_map, bit))
88
+ sbitmap_set_bit(&hctx->ctx_map, bit);
7989 }
8090
8191 static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
8292 struct blk_mq_ctx *ctx)
8393 {
84
- sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw);
94
+ const int bit = ctx->index_hw[hctx->type];
95
+
96
+ sbitmap_clear_bit(&hctx->ctx_map, bit);
8597 }
8698
8799 struct mq_inflight {
88100 struct hd_struct *part;
89
- unsigned int *inflight;
101
+ unsigned int inflight[2];
90102 };
91103
92
-static void blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
104
+static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
93105 struct request *rq, void *priv,
94106 bool reserved)
95107 {
96108 struct mq_inflight *mi = priv;
97109
98
- /*
99
- * index[0] counts the specific partition that was asked for. index[1]
100
- * counts the ones that are active on the whole device, so increment
101
- * that if mi->part is indeed a partition, and not a whole device.
102
- */
103
- if (rq->part == mi->part)
104
- mi->inflight[0]++;
105
- if (mi->part->partno)
106
- mi->inflight[1]++;
107
-}
108
-
109
-void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part,
110
- unsigned int inflight[2])
111
-{
112
- struct mq_inflight mi = { .part = part, .inflight = inflight, };
113
-
114
- inflight[0] = inflight[1] = 0;
115
- blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
116
-}
117
-
118
-static void blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx,
119
- struct request *rq, void *priv,
120
- bool reserved)
121
-{
122
- struct mq_inflight *mi = priv;
123
-
124
- if (rq->part == mi->part)
110
+ if ((!mi->part->partno || rq->part == mi->part) &&
111
+ blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT)
125112 mi->inflight[rq_data_dir(rq)]++;
113
+
114
+ return true;
115
+}
116
+
117
+unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part)
118
+{
119
+ struct mq_inflight mi = { .part = part };
120
+
121
+ blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
122
+
123
+ return mi.inflight[0] + mi.inflight[1];
126124 }
127125
128126 void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part,
129127 unsigned int inflight[2])
130128 {
131
- struct mq_inflight mi = { .part = part, .inflight = inflight, };
129
+ struct mq_inflight mi = { .part = part };
132130
133
- inflight[0] = inflight[1] = 0;
134
- blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight_rw, &mi);
131
+ blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
132
+ inflight[0] = mi.inflight[0];
133
+ inflight[1] = mi.inflight[1];
135134 }
136135
137136 void blk_freeze_queue_start(struct request_queue *q)
138137 {
139
- int freeze_depth;
140
-
141
- freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
142
- if (freeze_depth == 1) {
138
+ mutex_lock(&q->mq_freeze_lock);
139
+ if (++q->mq_freeze_depth == 1) {
143140 percpu_ref_kill(&q->q_usage_counter);
144
- if (q->mq_ops)
141
+ mutex_unlock(&q->mq_freeze_lock);
142
+ if (queue_is_mq(q))
145143 blk_mq_run_hw_queues(q, false);
144
+ } else {
145
+ mutex_unlock(&q->mq_freeze_lock);
146146 }
147147 }
148148 EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
@@ -176,8 +176,6 @@
176176 * exported to drivers as the only user for unfreeze is blk_mq.
177177 */
178178 blk_freeze_queue_start(q);
179
- if (!q->mq_ops)
180
- blk_drain_queue(q);
181179 blk_mq_freeze_queue_wait(q);
182180 }
183181
@@ -193,14 +191,14 @@
193191
194192 void blk_mq_unfreeze_queue(struct request_queue *q)
195193 {
196
- int freeze_depth;
197
-
198
- freeze_depth = atomic_dec_return(&q->mq_freeze_depth);
199
- WARN_ON_ONCE(freeze_depth < 0);
200
- if (!freeze_depth) {
201
- percpu_ref_reinit(&q->q_usage_counter);
194
+ mutex_lock(&q->mq_freeze_lock);
195
+ q->mq_freeze_depth--;
196
+ WARN_ON_ONCE(q->mq_freeze_depth < 0);
197
+ if (!q->mq_freeze_depth) {
198
+ percpu_ref_resurrect(&q->q_usage_counter);
202199 wake_up_all(&q->mq_freeze_wq);
203200 }
201
+ mutex_unlock(&q->mq_freeze_lock);
204202 }
205203 EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
206204
@@ -268,40 +266,37 @@
268266 blk_mq_tag_wakeup_all(hctx->tags, true);
269267 }
270268
271
-bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
269
+/*
270
+ * Only need start/end time stamping if we have iostat or
271
+ * blk stats enabled, or using an IO scheduler.
272
+ */
273
+static inline bool blk_mq_need_time_stamp(struct request *rq)
272274 {
273
- return blk_mq_has_free_tags(hctx->tags);
275
+ return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS)) || rq->q->elevator;
274276 }
275
-EXPORT_SYMBOL(blk_mq_can_queue);
276277
277278 static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
278
- unsigned int tag, unsigned int op)
279
+ unsigned int tag, u64 alloc_time_ns)
279280 {
280281 struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
281282 struct request *rq = tags->static_rqs[tag];
282
- req_flags_t rq_flags = 0;
283283
284
- if (data->flags & BLK_MQ_REQ_INTERNAL) {
285
- rq->tag = -1;
284
+ if (data->q->elevator) {
285
+ rq->tag = BLK_MQ_NO_TAG;
286286 rq->internal_tag = tag;
287287 } else {
288
- if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) {
289
- rq_flags = RQF_MQ_INFLIGHT;
290
- atomic_inc(&data->hctx->nr_active);
291
- }
292288 rq->tag = tag;
293
- rq->internal_tag = -1;
294
- data->hctx->tags->rqs[rq->tag] = rq;
289
+ rq->internal_tag = BLK_MQ_NO_TAG;
295290 }
296291
297292 /* csd/requeue_work/fifo_time is initialized before use */
298293 rq->q = data->q;
299294 rq->mq_ctx = data->ctx;
300
- rq->rq_flags = rq_flags;
301
- rq->cpu = -1;
302
- rq->cmd_flags = op;
303
- if (data->flags & BLK_MQ_REQ_PREEMPT)
304
- rq->rq_flags |= RQF_PREEMPT;
295
+ rq->mq_hctx = data->hctx;
296
+ rq->rq_flags = 0;
297
+ rq->cmd_flags = data->cmd_flags;
298
+ if (data->flags & BLK_MQ_REQ_PM)
299
+ rq->rq_flags |= RQF_PM;
305300 if (blk_queue_io_stat(data->q))
306301 rq->rq_flags |= RQF_IO_STAT;
307302 INIT_LIST_HEAD(&rq->queuelist);
@@ -309,100 +304,110 @@
309304 RB_CLEAR_NODE(&rq->rb_node);
310305 rq->rq_disk = NULL;
311306 rq->part = NULL;
312
- rq->start_time_ns = ktime_get_ns();
307
+#ifdef CONFIG_BLK_RQ_ALLOC_TIME
308
+ rq->alloc_time_ns = alloc_time_ns;
309
+#endif
310
+ if (blk_mq_need_time_stamp(rq))
311
+ rq->start_time_ns = ktime_get_ns();
312
+ else
313
+ rq->start_time_ns = 0;
313314 rq->io_start_time_ns = 0;
315
+ rq->stats_sectors = 0;
314316 rq->nr_phys_segments = 0;
315317 #if defined(CONFIG_BLK_DEV_INTEGRITY)
316318 rq->nr_integrity_segments = 0;
317319 #endif
318
- rq->special = NULL;
320
+ blk_crypto_rq_set_defaults(rq);
319321 /* tag was already set */
320
- rq->extra_len = 0;
321
- rq->__deadline = 0;
322
+ WRITE_ONCE(rq->deadline, 0);
322323
323
-#ifdef CONFIG_PREEMPT_RT_FULL
324
- INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work);
325
-#endif
326
- INIT_LIST_HEAD(&rq->timeout_list);
327324 rq->timeout = 0;
328325
329326 rq->end_io = NULL;
330327 rq->end_io_data = NULL;
331
- rq->next_rq = NULL;
332328
333
-#ifdef CONFIG_BLK_CGROUP
334
- rq->rl = NULL;
335
-#endif
336
-
337
- data->ctx->rq_dispatched[op_is_sync(op)]++;
329
+ data->ctx->rq_dispatched[op_is_sync(data->cmd_flags)]++;
338330 refcount_set(&rq->ref, 1);
331
+
332
+ if (!op_is_flush(data->cmd_flags)) {
333
+ struct elevator_queue *e = data->q->elevator;
334
+
335
+ rq->elv.icq = NULL;
336
+ if (e && e->type->ops.prepare_request) {
337
+ if (e->type->icq_cache)
338
+ blk_mq_sched_assign_ioc(rq);
339
+
340
+ e->type->ops.prepare_request(rq);
341
+ rq->rq_flags |= RQF_ELVPRIV;
342
+ }
343
+ }
344
+
345
+ data->hctx->queued++;
346
+ trace_android_vh_blk_rq_ctx_init(rq, tags, data, alloc_time_ns);
339347 return rq;
340348 }
341349
342
-static struct request *blk_mq_get_request(struct request_queue *q,
343
- struct bio *bio, unsigned int op,
344
- struct blk_mq_alloc_data *data)
350
+static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data)
345351 {
352
+ struct request_queue *q = data->q;
346353 struct elevator_queue *e = q->elevator;
347
- struct request *rq;
354
+ u64 alloc_time_ns = 0;
348355 unsigned int tag;
349
- bool put_ctx_on_error = false;
350356
351
- blk_queue_enter_live(q);
352
- data->q = q;
353
- if (likely(!data->ctx)) {
354
- data->ctx = blk_mq_get_ctx(q);
355
- put_ctx_on_error = true;
356
- }
357
- if (likely(!data->hctx))
358
- data->hctx = blk_mq_map_queue(q, data->ctx->cpu);
359
- if (op & REQ_NOWAIT)
357
+ /* alloc_time includes depth and tag waits */
358
+ if (blk_queue_rq_alloc_time(q))
359
+ alloc_time_ns = ktime_get_ns();
360
+
361
+ if (data->cmd_flags & REQ_NOWAIT)
360362 data->flags |= BLK_MQ_REQ_NOWAIT;
361363
362364 if (e) {
363
- data->flags |= BLK_MQ_REQ_INTERNAL;
364
-
365365 /*
366366 * Flush requests are special and go directly to the
367367 * dispatch list. Don't include reserved tags in the
368368 * limiting, as it isn't useful.
369369 */
370
- if (!op_is_flush(op) && e->type->ops.mq.limit_depth &&
370
+ if (!op_is_flush(data->cmd_flags) &&
371
+ e->type->ops.limit_depth &&
371372 !(data->flags & BLK_MQ_REQ_RESERVED))
372
- e->type->ops.mq.limit_depth(op, data);
373
- } else {
373
+ e->type->ops.limit_depth(data->cmd_flags, data);
374
+ }
375
+
376
+retry:
377
+ data->ctx = blk_mq_get_ctx(q);
378
+ data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx);
379
+ if (!e)
374380 blk_mq_tag_busy(data->hctx);
375
- }
376381
382
+ /*
383
+ * Waiting allocations only fail because of an inactive hctx. In that
384
+ * case just retry the hctx assignment and tag allocation as CPU hotplug
385
+ * should have migrated us to an online CPU by now.
386
+ */
377387 tag = blk_mq_get_tag(data);
378
- if (tag == BLK_MQ_TAG_FAIL) {
379
- if (put_ctx_on_error) {
380
- blk_mq_put_ctx(data->ctx);
381
- data->ctx = NULL;
382
- }
383
- blk_queue_exit(q);
384
- return NULL;
385
- }
388
+ if (tag == BLK_MQ_NO_TAG) {
389
+ if (data->flags & BLK_MQ_REQ_NOWAIT)
390
+ return NULL;
386391
387
- rq = blk_mq_rq_ctx_init(data, tag, op);
388
- if (!op_is_flush(op)) {
389
- rq->elv.icq = NULL;
390
- if (e && e->type->ops.mq.prepare_request) {
391
- if (e->type->icq_cache && rq_ioc(bio))
392
- blk_mq_sched_assign_ioc(rq, bio);
393
-
394
- e->type->ops.mq.prepare_request(rq, bio);
395
- rq->rq_flags |= RQF_ELVPRIV;
396
- }
392
+ /*
393
+ * Give up the CPU and sleep for a random short time to ensure
394
+ * that thread using a realtime scheduling class are migrated
395
+ * off the CPU, and thus off the hctx that is going away.
396
+ */
397
+ msleep(3);
398
+ goto retry;
397399 }
398
- data->hctx->queued++;
399
- return rq;
400
+ return blk_mq_rq_ctx_init(data, tag, alloc_time_ns);
400401 }
401402
402403 struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
403404 blk_mq_req_flags_t flags)
404405 {
405
- struct blk_mq_alloc_data alloc_data = { .flags = flags };
406
+ struct blk_mq_alloc_data data = {
407
+ .q = q,
408
+ .flags = flags,
409
+ .cmd_flags = op,
410
+ };
406411 struct request *rq;
407412 int ret;
408413
@@ -410,28 +415,35 @@
410415 if (ret)
411416 return ERR_PTR(ret);
412417
413
- rq = blk_mq_get_request(q, NULL, op, &alloc_data);
414
- blk_queue_exit(q);
415
-
418
+ rq = __blk_mq_alloc_request(&data);
416419 if (!rq)
417
- return ERR_PTR(-EWOULDBLOCK);
418
-
419
- blk_mq_put_ctx(alloc_data.ctx);
420
-
420
+ goto out_queue_exit;
421421 rq->__data_len = 0;
422422 rq->__sector = (sector_t) -1;
423423 rq->bio = rq->biotail = NULL;
424424 return rq;
425
+out_queue_exit:
426
+ blk_queue_exit(q);
427
+ return ERR_PTR(-EWOULDBLOCK);
425428 }
426429 EXPORT_SYMBOL(blk_mq_alloc_request);
427430
428431 struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
429432 unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx)
430433 {
431
- struct blk_mq_alloc_data alloc_data = { .flags = flags };
432
- struct request *rq;
434
+ struct blk_mq_alloc_data data = {
435
+ .q = q,
436
+ .flags = flags,
437
+ .cmd_flags = op,
438
+ };
439
+ u64 alloc_time_ns = 0;
433440 unsigned int cpu;
441
+ unsigned int tag;
434442 int ret;
443
+
444
+ /* alloc_time includes depth and tag waits */
445
+ if (blk_queue_rq_alloc_time(q))
446
+ alloc_time_ns = ktime_get_ns();
435447
436448 /*
437449 * If the tag allocator sleeps we could get an allocation for a
@@ -439,7 +451,7 @@
439451 * allocator for this for the rare use case of a command tied to
440452 * a specific queue.
441453 */
442
- if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)))
454
+ if (WARN_ON_ONCE(!(flags & (BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED))))
443455 return ERR_PTR(-EINVAL);
444456
445457 if (hctx_idx >= q->nr_hw_queues)
@@ -453,21 +465,27 @@
453465 * Check if the hardware context is actually mapped to anything.
454466 * If not tell the caller that it should skip this queue.
455467 */
456
- alloc_data.hctx = q->queue_hw_ctx[hctx_idx];
457
- if (!blk_mq_hw_queue_mapped(alloc_data.hctx)) {
458
- blk_queue_exit(q);
459
- return ERR_PTR(-EXDEV);
460
- }
461
- cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask);
462
- alloc_data.ctx = __blk_mq_get_ctx(q, cpu);
468
+ ret = -EXDEV;
469
+ data.hctx = q->queue_hw_ctx[hctx_idx];
470
+ if (!blk_mq_hw_queue_mapped(data.hctx))
471
+ goto out_queue_exit;
472
+ cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask);
473
+ if (cpu >= nr_cpu_ids)
474
+ goto out_queue_exit;
475
+ data.ctx = __blk_mq_get_ctx(q, cpu);
463476
464
- rq = blk_mq_get_request(q, NULL, op, &alloc_data);
477
+ if (!q->elevator)
478
+ blk_mq_tag_busy(data.hctx);
479
+
480
+ ret = -EWOULDBLOCK;
481
+ tag = blk_mq_get_tag(&data);
482
+ if (tag == BLK_MQ_NO_TAG)
483
+ goto out_queue_exit;
484
+ return blk_mq_rq_ctx_init(&data, tag, alloc_time_ns);
485
+
486
+out_queue_exit:
465487 blk_queue_exit(q);
466
-
467
- if (!rq)
468
- return ERR_PTR(-EWOULDBLOCK);
469
-
470
- return rq;
488
+ return ERR_PTR(ret);
471489 }
472490 EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
473491
@@ -475,13 +493,16 @@
475493 {
476494 struct request_queue *q = rq->q;
477495 struct blk_mq_ctx *ctx = rq->mq_ctx;
478
- struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
496
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
479497 const int sched_tag = rq->internal_tag;
480498
481
- if (rq->tag != -1)
482
- blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
483
- if (sched_tag != -1)
484
- blk_mq_put_tag(hctx, hctx->sched_tags, ctx, sched_tag);
499
+ blk_crypto_free_request(rq);
500
+ blk_pm_mark_last_busy(rq);
501
+ rq->mq_hctx = NULL;
502
+ if (rq->tag != BLK_MQ_NO_TAG)
503
+ blk_mq_put_tag(hctx->tags, ctx, rq->tag);
504
+ if (sched_tag != BLK_MQ_NO_TAG)
505
+ blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag);
485506 blk_mq_sched_restart(hctx);
486507 blk_queue_exit(q);
487508 }
@@ -491,11 +512,11 @@
491512 struct request_queue *q = rq->q;
492513 struct elevator_queue *e = q->elevator;
493514 struct blk_mq_ctx *ctx = rq->mq_ctx;
494
- struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
515
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
495516
496517 if (rq->rq_flags & RQF_ELVPRIV) {
497
- if (e && e->type->ops.mq.finish_request)
498
- e->type->ops.mq.finish_request(rq);
518
+ if (e && e->type->ops.finish_request)
519
+ e->type->ops.finish_request(rq);
499520 if (rq->elv.icq) {
500521 put_io_context(rq->elv.icq->ioc);
501522 rq->elv.icq = NULL;
@@ -504,15 +525,12 @@
504525
505526 ctx->rq_completed[rq_is_sync(rq)]++;
506527 if (rq->rq_flags & RQF_MQ_INFLIGHT)
507
- atomic_dec(&hctx->nr_active);
528
+ __blk_mq_dec_active_requests(hctx);
508529
509530 if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
510531 laptop_io_completion(q->backing_dev_info);
511532
512533 rq_qos_done(q, rq);
513
-
514
- if (blk_rq_rl(rq))
515
- blk_put_rl(blk_rq_rl(rq));
516534
517535 WRITE_ONCE(rq->state, MQ_RQ_IDLE);
518536 if (refcount_dec_and_test(&rq->ref))
@@ -522,12 +540,17 @@
522540
523541 inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
524542 {
525
- u64 now = ktime_get_ns();
543
+ u64 now = 0;
544
+
545
+ if (blk_mq_need_time_stamp(rq))
546
+ now = ktime_get_ns();
526547
527548 if (rq->rq_flags & RQF_STATS) {
528549 blk_mq_poll_stats_start(rq->q);
529550 blk_stat_add(rq, now);
530551 }
552
+
553
+ blk_mq_sched_completed_request(rq, now);
531554
532555 blk_account_io_done(rq, now);
533556
@@ -535,8 +558,6 @@
535558 rq_qos_done(rq->q, rq);
536559 rq->end_io(rq, error);
537560 } else {
538
- if (unlikely(blk_bidi_rq(rq)))
539
- blk_mq_free_request(rq->next_rq);
540561 blk_mq_free_request(rq);
541562 }
542563 }
@@ -550,63 +571,120 @@
550571 }
551572 EXPORT_SYMBOL(blk_mq_end_request);
552573
553
-#ifdef CONFIG_PREEMPT_RT_FULL
554
-
555
-void __blk_mq_complete_request_remote_work(struct work_struct *work)
574
+static void blk_complete_reqs(struct llist_head *list)
556575 {
557
- struct request *rq = container_of(work, struct request, work);
576
+ struct llist_node *entry = llist_reverse_order(llist_del_all(list));
577
+ struct request *rq, *next;
558578
559
- rq->q->softirq_done_fn(rq);
579
+ llist_for_each_entry_safe(rq, next, entry, ipi_list)
580
+ rq->q->mq_ops->complete(rq);
560581 }
561582
562
-#else
583
+static __latent_entropy void blk_done_softirq(struct softirq_action *h)
584
+{
585
+ blk_complete_reqs(this_cpu_ptr(&blk_cpu_done));
586
+}
587
+
588
+static int blk_softirq_cpu_dead(unsigned int cpu)
589
+{
590
+ blk_complete_reqs(&per_cpu(blk_cpu_done, cpu));
591
+ return 0;
592
+}
563593
564594 static void __blk_mq_complete_request_remote(void *data)
565595 {
566
- struct request *rq = data;
567
-
568
- rq->q->softirq_done_fn(rq);
596
+ __raise_softirq_irqoff(BLOCK_SOFTIRQ);
569597 }
570
-#endif
571598
572
-static void __blk_mq_complete_request(struct request *rq)
599
+static inline bool blk_mq_complete_need_ipi(struct request *rq)
573600 {
574
- struct blk_mq_ctx *ctx = rq->mq_ctx;
575
- bool shared = false;
576
- int cpu;
601
+ int cpu = raw_smp_processor_id();
577602
578
- if (!blk_mq_mark_complete(rq))
579
- return;
580
- if (rq->internal_tag != -1)
581
- blk_mq_sched_completed_request(rq);
603
+ if (!IS_ENABLED(CONFIG_SMP) ||
604
+ !test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags))
605
+ return false;
606
+ /*
607
+ * With force threaded interrupts enabled, raising softirq from an SMP
608
+ * function call will always result in waking the ksoftirqd thread.
609
+ * This is probably worse than completing the request on a different
610
+ * cache domain.
611
+ */
612
+ if (force_irqthreads)
613
+ return false;
582614
583
- if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) {
584
- rq->q->softirq_done_fn(rq);
585
- return;
586
- }
615
+ /* same CPU or cache domain? Complete locally */
616
+ if (cpu == rq->mq_ctx->cpu ||
617
+ (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) &&
618
+ cpus_share_cache(cpu, rq->mq_ctx->cpu)))
619
+ return false;
587620
588
- cpu = get_cpu_light();
589
- if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
590
- shared = cpus_share_cache(cpu, ctx->cpu);
621
+ /* don't try to IPI to an offline CPU */
622
+ return cpu_online(rq->mq_ctx->cpu);
623
+}
591624
592
- if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
593
-#ifdef CONFIG_PREEMPT_RT_FULL
594
- /*
595
- * We could force QUEUE_FLAG_SAME_FORCE then we would not get in
596
- * here. But we could try to invoke it one the CPU like this.
597
- */
598
- schedule_work_on(ctx->cpu, &rq->work);
599
-#else
625
+static void blk_mq_complete_send_ipi(struct request *rq)
626
+{
627
+ struct llist_head *list;
628
+ unsigned int cpu;
629
+
630
+ cpu = rq->mq_ctx->cpu;
631
+ list = &per_cpu(blk_cpu_done, cpu);
632
+ if (llist_add(&rq->ipi_list, list)) {
600633 rq->csd.func = __blk_mq_complete_request_remote;
601634 rq->csd.info = rq;
602635 rq->csd.flags = 0;
603
- smp_call_function_single_async(ctx->cpu, &rq->csd);
604
-#endif
605
- } else {
606
- rq->q->softirq_done_fn(rq);
636
+ smp_call_function_single_async(cpu, &rq->csd);
607637 }
608
- put_cpu_light();
609638 }
639
+
640
+static void blk_mq_raise_softirq(struct request *rq)
641
+{
642
+ struct llist_head *list;
643
+
644
+ preempt_disable();
645
+ list = this_cpu_ptr(&blk_cpu_done);
646
+ if (llist_add(&rq->ipi_list, list))
647
+ raise_softirq(BLOCK_SOFTIRQ);
648
+ preempt_enable();
649
+}
650
+
651
+bool blk_mq_complete_request_remote(struct request *rq)
652
+{
653
+ WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
654
+
655
+ /*
656
+ * For a polled request, always complete locallly, it's pointless
657
+ * to redirect the completion.
658
+ */
659
+ if (rq->cmd_flags & REQ_HIPRI)
660
+ return false;
661
+
662
+ if (blk_mq_complete_need_ipi(rq)) {
663
+ blk_mq_complete_send_ipi(rq);
664
+ return true;
665
+ }
666
+
667
+ if (rq->q->nr_hw_queues == 1) {
668
+ blk_mq_raise_softirq(rq);
669
+ return true;
670
+ }
671
+ return false;
672
+}
673
+EXPORT_SYMBOL_GPL(blk_mq_complete_request_remote);
674
+
675
+/**
676
+ * blk_mq_complete_request - end I/O on a request
677
+ * @rq: the request being processed
678
+ *
679
+ * Description:
680
+ * Complete a request by scheduling the ->complete_rq operation.
681
+ **/
682
+void blk_mq_complete_request(struct request *rq)
683
+{
684
+ if (!blk_mq_complete_request_remote(rq))
685
+ rq->q->mq_ops->complete(rq);
686
+}
687
+EXPORT_SYMBOL(blk_mq_complete_request);
610688
611689 static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
612690 __releases(hctx->srcu)
@@ -629,40 +707,22 @@
629707 }
630708
631709 /**
632
- * blk_mq_complete_request - end I/O on a request
633
- * @rq: the request being processed
710
+ * blk_mq_start_request - Start processing a request
711
+ * @rq: Pointer to request to be started
634712 *
635
- * Description:
636
- * Ends all I/O on a request. It does not handle partial completions.
637
- * The actual completion happens out-of-order, through a IPI handler.
638
- **/
639
-void blk_mq_complete_request(struct request *rq)
640
-{
641
- if (unlikely(blk_should_fake_timeout(rq->q)))
642
- return;
643
- __blk_mq_complete_request(rq);
644
-}
645
-EXPORT_SYMBOL(blk_mq_complete_request);
646
-
647
-int blk_mq_request_started(struct request *rq)
648
-{
649
- return blk_mq_rq_state(rq) != MQ_RQ_IDLE;
650
-}
651
-EXPORT_SYMBOL_GPL(blk_mq_request_started);
652
-
713
+ * Function used by device drivers to notify the block layer that a request
714
+ * is going to be processed now, so blk layer can do proper initializations
715
+ * such as starting the timeout timer.
716
+ */
653717 void blk_mq_start_request(struct request *rq)
654718 {
655719 struct request_queue *q = rq->q;
656
-
657
- blk_mq_sched_started_request(rq);
658720
659721 trace_block_rq_issue(q, rq);
660722
661723 if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
662724 rq->io_start_time_ns = ktime_get_ns();
663
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
664
- rq->throtl_size = blk_rq_sectors(rq);
665
-#endif
725
+ rq->stats_sectors = blk_rq_sectors(rq);
666726 rq->rq_flags |= RQF_STATS;
667727 rq_qos_issue(q, rq);
668728 }
@@ -672,14 +732,10 @@
672732 blk_add_timer(rq);
673733 WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT);
674734
675
- if (q->dma_drain_size && blk_rq_bytes(rq)) {
676
- /*
677
- * Make sure space for the drain appears. We know we can do
678
- * this because max_hw_segments has been adjusted to be one
679
- * fewer than the device can handle.
680
- */
681
- rq->nr_phys_segments++;
682
- }
735
+#ifdef CONFIG_BLK_DEV_INTEGRITY
736
+ if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE)
737
+ q->integrity.profile->prepare_fn(rq);
738
+#endif
683739 }
684740 EXPORT_SYMBOL(blk_mq_start_request);
685741
@@ -695,8 +751,6 @@
695751 if (blk_mq_request_started(rq)) {
696752 WRITE_ONCE(rq->state, MQ_RQ_IDLE);
697753 rq->rq_flags &= ~RQF_TIMED_OUT;
698
- if (q->dma_drain_size && blk_rq_bytes(rq))
699
- rq->nr_phys_segments--;
700754 }
701755 }
702756
@@ -707,7 +761,6 @@
707761 /* this request will be re-inserted to io scheduler queue */
708762 blk_mq_sched_requeue_request(rq);
709763
710
- BUG_ON(blk_queued_rq(rq));
711764 blk_mq_add_to_requeue_list(rq, true, kick_requeue_list);
712765 }
713766 EXPORT_SYMBOL(blk_mq_requeue_request);
@@ -735,7 +788,7 @@
735788 * merge.
736789 */
737790 if (rq->rq_flags & RQF_DONTPREP)
738
- blk_mq_request_bypass_insert(rq, false);
791
+ blk_mq_request_bypass_insert(rq, false, false);
739792 else
740793 blk_mq_sched_insert_request(rq, true, false, false);
741794 }
@@ -773,7 +826,6 @@
773826 if (kick_requeue_list)
774827 blk_mq_kick_requeue_list(q);
775828 }
776
-EXPORT_SYMBOL(blk_mq_add_to_requeue_list);
777829
778830 void blk_mq_kick_requeue_list(struct request_queue *q)
779831 {
@@ -800,6 +852,32 @@
800852 }
801853 EXPORT_SYMBOL(blk_mq_tag_to_rq);
802854
855
+static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq,
856
+ void *priv, bool reserved)
857
+{
858
+ /*
859
+ * If we find a request that isn't idle and the queue matches,
860
+ * we know the queue is busy. Return false to stop the iteration.
861
+ */
862
+ if (blk_mq_request_started(rq) && rq->q == hctx->queue) {
863
+ bool *busy = priv;
864
+
865
+ *busy = true;
866
+ return false;
867
+ }
868
+
869
+ return true;
870
+}
871
+
872
+bool blk_mq_queue_inflight(struct request_queue *q)
873
+{
874
+ bool busy = false;
875
+
876
+ blk_mq_queue_tag_busy_iter(q, blk_mq_rq_inflight, &busy);
877
+ return busy;
878
+}
879
+EXPORT_SYMBOL_GPL(blk_mq_queue_inflight);
880
+
803881 static void blk_mq_rq_timed_out(struct request *req, bool reserved)
804882 {
805883 req->rq_flags |= RQF_TIMED_OUT;
@@ -824,7 +902,7 @@
824902 if (rq->rq_flags & RQF_TIMED_OUT)
825903 return false;
826904
827
- deadline = blk_rq_deadline(rq);
905
+ deadline = READ_ONCE(rq->deadline);
828906 if (time_after_eq(jiffies, deadline))
829907 return true;
830908
@@ -835,43 +913,29 @@
835913 return false;
836914 }
837915
838
-static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
916
+void blk_mq_put_rq_ref(struct request *rq)
917
+{
918
+ if (is_flush_rq(rq))
919
+ rq->end_io(rq, 0);
920
+ else if (refcount_dec_and_test(&rq->ref))
921
+ __blk_mq_free_request(rq);
922
+}
923
+
924
+static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
839925 struct request *rq, void *priv, bool reserved)
840926 {
841927 unsigned long *next = priv;
842928
843929 /*
844
- * Just do a quick check if it is expired before locking the request in
845
- * so we're not unnecessarilly synchronizing across CPUs.
846
- */
847
- if (!blk_mq_req_expired(rq, next))
848
- return;
849
-
850
- /*
851
- * We have reason to believe the request may be expired. Take a
852
- * reference on the request to lock this request lifetime into its
853
- * currently allocated context to prevent it from being reallocated in
854
- * the event the completion by-passes this timeout handler.
855
- *
856
- * If the reference was already released, then the driver beat the
857
- * timeout handler to posting a natural completion.
858
- */
859
- if (!refcount_inc_not_zero(&rq->ref))
860
- return;
861
-
862
- /*
863
- * The request is now locked and cannot be reallocated underneath the
864
- * timeout handler's processing. Re-verify this exact request is truly
865
- * expired; if it is not expired, then the request was completed and
866
- * reallocated as a new request.
930
+ * blk_mq_queue_tag_busy_iter() has locked the request, so it cannot
931
+ * be reallocated underneath the timeout handler's processing, then
932
+ * the expire check is reliable. If the request is not expired, then
933
+ * it was completed and reallocated as a new request after returning
934
+ * from blk_mq_check_expired().
867935 */
868936 if (blk_mq_req_expired(rq, next))
869937 blk_mq_rq_timed_out(rq, reserved);
870
-
871
- if (is_flush_rq(rq, hctx))
872
- rq->end_io(rq, 0);
873
- else if (refcount_dec_and_test(&rq->ref))
874
- __blk_mq_free_request(rq);
938
+ return true;
875939 }
876940
877941 static void blk_mq_timeout_work(struct work_struct *work)
@@ -928,9 +992,10 @@
928992 struct flush_busy_ctx_data *flush_data = data;
929993 struct blk_mq_hw_ctx *hctx = flush_data->hctx;
930994 struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
995
+ enum hctx_type type = hctx->type;
931996
932997 spin_lock(&ctx->lock);
933
- list_splice_tail_init(&ctx->rq_list, flush_data->list);
998
+ list_splice_tail_init(&ctx->rq_lists[type], flush_data->list);
934999 sbitmap_clear_bit(sb, bitnr);
9351000 spin_unlock(&ctx->lock);
9361001 return true;
@@ -962,12 +1027,13 @@
9621027 struct dispatch_rq_data *dispatch_data = data;
9631028 struct blk_mq_hw_ctx *hctx = dispatch_data->hctx;
9641029 struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
1030
+ enum hctx_type type = hctx->type;
9651031
9661032 spin_lock(&ctx->lock);
967
- if (!list_empty(&ctx->rq_list)) {
968
- dispatch_data->rq = list_entry_rq(ctx->rq_list.next);
1033
+ if (!list_empty(&ctx->rq_lists[type])) {
1034
+ dispatch_data->rq = list_entry_rq(ctx->rq_lists[type].next);
9691035 list_del_init(&dispatch_data->rq->queuelist);
970
- if (list_empty(&ctx->rq_list))
1036
+ if (list_empty(&ctx->rq_lists[type]))
9711037 sbitmap_clear_bit(sb, bitnr);
9721038 }
9731039 spin_unlock(&ctx->lock);
@@ -978,7 +1044,7 @@
9781044 struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
9791045 struct blk_mq_ctx *start)
9801046 {
981
- unsigned off = start ? start->index_hw : 0;
1047
+ unsigned off = start ? start->index_hw[hctx->type] : 0;
9821048 struct dispatch_rq_data data = {
9831049 .hctx = hctx,
9841050 .rq = NULL,
@@ -998,33 +1064,44 @@
9981064 return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
9991065 }
10001066
1001
-bool blk_mq_get_driver_tag(struct request *rq)
1067
+static bool __blk_mq_get_driver_tag(struct request *rq)
10021068 {
1003
- struct blk_mq_alloc_data data = {
1004
- .q = rq->q,
1005
- .hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu),
1006
- .flags = BLK_MQ_REQ_NOWAIT,
1007
- };
1008
- bool shared;
1069
+ struct sbitmap_queue *bt = rq->mq_hctx->tags->bitmap_tags;
1070
+ unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
1071
+ int tag;
10091072
1010
- if (rq->tag != -1)
1011
- goto done;
1073
+ blk_mq_tag_busy(rq->mq_hctx);
10121074
1013
- if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag))
1014
- data.flags |= BLK_MQ_REQ_RESERVED;
1015
-
1016
- shared = blk_mq_tag_busy(data.hctx);
1017
- rq->tag = blk_mq_get_tag(&data);
1018
- if (rq->tag >= 0) {
1019
- if (shared) {
1020
- rq->rq_flags |= RQF_MQ_INFLIGHT;
1021
- atomic_inc(&data.hctx->nr_active);
1022
- }
1023
- data.hctx->tags->rqs[rq->tag] = rq;
1075
+ if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) {
1076
+ bt = rq->mq_hctx->tags->breserved_tags;
1077
+ tag_offset = 0;
1078
+ } else {
1079
+ if (!hctx_may_queue(rq->mq_hctx, bt))
1080
+ return false;
10241081 }
10251082
1026
-done:
1027
- return rq->tag != -1;
1083
+ tag = __sbitmap_queue_get(bt);
1084
+ if (tag == BLK_MQ_NO_TAG)
1085
+ return false;
1086
+
1087
+ rq->tag = tag + tag_offset;
1088
+ return true;
1089
+}
1090
+
1091
+static bool blk_mq_get_driver_tag(struct request *rq)
1092
+{
1093
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
1094
+
1095
+ if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_get_driver_tag(rq))
1096
+ return false;
1097
+
1098
+ if ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) &&
1099
+ !(rq->rq_flags & RQF_MQ_INFLIGHT)) {
1100
+ rq->rq_flags |= RQF_MQ_INFLIGHT;
1101
+ __blk_mq_inc_active_requests(hctx);
1102
+ }
1103
+ hctx->tags->rqs[rq->tag] = rq;
1104
+ return true;
10281105 }
10291106
10301107 static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
@@ -1035,7 +1112,13 @@
10351112 hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
10361113
10371114 spin_lock(&hctx->dispatch_wait_lock);
1038
- list_del_init(&wait->entry);
1115
+ if (!list_empty(&wait->entry)) {
1116
+ struct sbitmap_queue *sbq;
1117
+
1118
+ list_del_init(&wait->entry);
1119
+ sbq = hctx->tags->bitmap_tags;
1120
+ atomic_dec(&sbq->ws_active);
1121
+ }
10391122 spin_unlock(&hctx->dispatch_wait_lock);
10401123
10411124 blk_mq_run_hw_queue(hctx, true);
@@ -1051,13 +1134,13 @@
10511134 static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
10521135 struct request *rq)
10531136 {
1137
+ struct sbitmap_queue *sbq = hctx->tags->bitmap_tags;
10541138 struct wait_queue_head *wq;
10551139 wait_queue_entry_t *wait;
10561140 bool ret;
10571141
1058
- if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) {
1059
- if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
1060
- set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
1142
+ if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
1143
+ blk_mq_sched_mark_restart_hctx(hctx);
10611144
10621145 /*
10631146 * It's possible that a tag was freed in the window between the
@@ -1074,7 +1157,7 @@
10741157 if (!list_empty_careful(&wait->entry))
10751158 return false;
10761159
1077
- wq = &bt_wait_ptr(&hctx->tags->bitmap_tags, hctx)->wait;
1160
+ wq = &bt_wait_ptr(sbq, hctx)->wait;
10781161
10791162 spin_lock_irq(&wq->lock);
10801163 spin_lock(&hctx->dispatch_wait_lock);
@@ -1084,6 +1167,7 @@
10841167 return false;
10851168 }
10861169
1170
+ atomic_inc(&sbq->ws_active);
10871171 wait->flags &= ~WQ_FLAG_EXCLUSIVE;
10881172 __add_wait_queue(wq, wait);
10891173
@@ -1104,6 +1188,7 @@
11041188 * someone else gets the wakeup.
11051189 */
11061190 list_del_init(&wait->entry);
1191
+ atomic_dec(&sbq->ws_active);
11071192 spin_unlock(&hctx->dispatch_wait_lock);
11081193 spin_unlock_irq(&wq->lock);
11091194
@@ -1122,9 +1207,6 @@
11221207 static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
11231208 {
11241209 unsigned int ewma;
1125
-
1126
- if (hctx->queue->elevator)
1127
- return;
11281210
11291211 ewma = hctx->dispatch_busy;
11301212
@@ -1158,22 +1240,83 @@
11581240 __blk_mq_requeue_request(rq);
11591241 }
11601242
1243
+static void blk_mq_handle_zone_resource(struct request *rq,
1244
+ struct list_head *zone_list)
1245
+{
1246
+ /*
1247
+ * If we end up here it is because we cannot dispatch a request to a
1248
+ * specific zone due to LLD level zone-write locking or other zone
1249
+ * related resource not being available. In this case, set the request
1250
+ * aside in zone_list for retrying it later.
1251
+ */
1252
+ list_add(&rq->queuelist, zone_list);
1253
+ __blk_mq_requeue_request(rq);
1254
+}
1255
+
1256
+enum prep_dispatch {
1257
+ PREP_DISPATCH_OK,
1258
+ PREP_DISPATCH_NO_TAG,
1259
+ PREP_DISPATCH_NO_BUDGET,
1260
+};
1261
+
1262
+static enum prep_dispatch blk_mq_prep_dispatch_rq(struct request *rq,
1263
+ bool need_budget)
1264
+{
1265
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
1266
+
1267
+ if (need_budget && !blk_mq_get_dispatch_budget(rq->q)) {
1268
+ blk_mq_put_driver_tag(rq);
1269
+ return PREP_DISPATCH_NO_BUDGET;
1270
+ }
1271
+
1272
+ if (!blk_mq_get_driver_tag(rq)) {
1273
+ /*
1274
+ * The initial allocation attempt failed, so we need to
1275
+ * rerun the hardware queue when a tag is freed. The
1276
+ * waitqueue takes care of that. If the queue is run
1277
+ * before we add this entry back on the dispatch list,
1278
+ * we'll re-run it below.
1279
+ */
1280
+ if (!blk_mq_mark_tag_wait(hctx, rq)) {
1281
+ /*
1282
+ * All budgets not got from this function will be put
1283
+ * together during handling partial dispatch
1284
+ */
1285
+ if (need_budget)
1286
+ blk_mq_put_dispatch_budget(rq->q);
1287
+ return PREP_DISPATCH_NO_TAG;
1288
+ }
1289
+ }
1290
+
1291
+ return PREP_DISPATCH_OK;
1292
+}
1293
+
1294
+/* release all allocated budgets before calling to blk_mq_dispatch_rq_list */
1295
+static void blk_mq_release_budgets(struct request_queue *q,
1296
+ unsigned int nr_budgets)
1297
+{
1298
+ int i;
1299
+
1300
+ for (i = 0; i < nr_budgets; i++)
1301
+ blk_mq_put_dispatch_budget(q);
1302
+}
1303
+
11611304 /*
11621305 * Returns true if we did some work AND can potentially do more.
11631306 */
1164
-bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
1165
- bool got_budget)
1307
+bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
1308
+ unsigned int nr_budgets)
11661309 {
1167
- struct blk_mq_hw_ctx *hctx;
1310
+ enum prep_dispatch prep;
1311
+ struct request_queue *q = hctx->queue;
11681312 struct request *rq, *nxt;
1169
- bool no_tag = false;
11701313 int errors, queued;
11711314 blk_status_t ret = BLK_STS_OK;
1315
+ LIST_HEAD(zone_list);
1316
+ bool needs_resource = false;
11721317
11731318 if (list_empty(list))
11741319 return false;
1175
-
1176
- WARN_ON(!list_is_singular(list) && got_budget);
11771320
11781321 /*
11791322 * Now process all the entries, sending them to the driver.
@@ -1184,29 +1327,10 @@
11841327
11851328 rq = list_first_entry(list, struct request, queuelist);
11861329
1187
- hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu);
1188
- if (!got_budget && !blk_mq_get_dispatch_budget(hctx))
1330
+ WARN_ON_ONCE(hctx != rq->mq_hctx);
1331
+ prep = blk_mq_prep_dispatch_rq(rq, !nr_budgets);
1332
+ if (prep != PREP_DISPATCH_OK)
11891333 break;
1190
-
1191
- if (!blk_mq_get_driver_tag(rq)) {
1192
- /*
1193
- * The initial allocation attempt failed, so we need to
1194
- * rerun the hardware queue when a tag is freed. The
1195
- * waitqueue takes care of that. If the queue is run
1196
- * before we add this entry back on the dispatch list,
1197
- * we'll re-run it below.
1198
- */
1199
- if (!blk_mq_mark_tag_wait(hctx, rq)) {
1200
- blk_mq_put_dispatch_budget(hctx);
1201
- /*
1202
- * For non-shared tags, the RESTART check
1203
- * will suffice.
1204
- */
1205
- if (hctx->flags & BLK_MQ_F_TAG_SHARED)
1206
- no_tag = true;
1207
- break;
1208
- }
1209
- }
12101334
12111335 list_del_init(&rq->queuelist);
12121336
@@ -1223,32 +1347,63 @@
12231347 bd.last = !blk_mq_get_driver_tag(nxt);
12241348 }
12251349
1350
+ /*
1351
+ * once the request is queued to lld, no need to cover the
1352
+ * budget any more
1353
+ */
1354
+ if (nr_budgets)
1355
+ nr_budgets--;
12261356 ret = q->mq_ops->queue_rq(hctx, &bd);
1227
- if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {
1228
- blk_mq_handle_dev_resource(rq, list);
1357
+ switch (ret) {
1358
+ case BLK_STS_OK:
1359
+ queued++;
12291360 break;
1230
- }
1231
-
1232
- if (unlikely(ret != BLK_STS_OK)) {
1361
+ case BLK_STS_RESOURCE:
1362
+ needs_resource = true;
1363
+ fallthrough;
1364
+ case BLK_STS_DEV_RESOURCE:
1365
+ blk_mq_handle_dev_resource(rq, list);
1366
+ goto out;
1367
+ case BLK_STS_ZONE_RESOURCE:
1368
+ /*
1369
+ * Move the request to zone_list and keep going through
1370
+ * the dispatch list to find more requests the drive can
1371
+ * accept.
1372
+ */
1373
+ blk_mq_handle_zone_resource(rq, &zone_list);
1374
+ needs_resource = true;
1375
+ break;
1376
+ default:
12331377 errors++;
12341378 blk_mq_end_request(rq, BLK_STS_IOERR);
1235
- continue;
12361379 }
1237
-
1238
- queued++;
12391380 } while (!list_empty(list));
1381
+out:
1382
+ if (!list_empty(&zone_list))
1383
+ list_splice_tail_init(&zone_list, list);
12401384
12411385 hctx->dispatched[queued_to_index(queued)]++;
12421386
1387
+ /* If we didn't flush the entire list, we could have told the driver
1388
+ * there was more coming, but that turned out to be a lie.
1389
+ */
1390
+ if ((!list_empty(list) || errors || needs_resource ||
1391
+ ret == BLK_STS_DEV_RESOURCE) && q->mq_ops->commit_rqs && queued)
1392
+ q->mq_ops->commit_rqs(hctx);
12431393 /*
12441394 * Any items that need requeuing? Stuff them into hctx->dispatch,
12451395 * that is where we will continue on next queue run.
12461396 */
12471397 if (!list_empty(list)) {
12481398 bool needs_restart;
1399
+ /* For non-shared tags, the RESTART check will suffice */
1400
+ bool no_tag = prep == PREP_DISPATCH_NO_TAG &&
1401
+ (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED);
1402
+
1403
+ blk_mq_release_budgets(q, nr_budgets);
12491404
12501405 spin_lock(&hctx->lock);
1251
- list_splice_init(list, &hctx->dispatch);
1406
+ list_splice_tail_init(list, &hctx->dispatch);
12521407 spin_unlock(&hctx->lock);
12531408
12541409 /*
@@ -1282,13 +1437,17 @@
12821437 *
12831438 * If driver returns BLK_STS_RESOURCE and SCHED_RESTART
12841439 * bit is set, run queue after a delay to avoid IO stalls
1285
- * that could otherwise occur if the queue is idle.
1440
+ * that could otherwise occur if the queue is idle. We'll do
1441
+ * similar if we couldn't get budget or couldn't lock a zone
1442
+ * and SCHED_RESTART is set.
12861443 */
12871444 needs_restart = blk_mq_sched_needs_restart(hctx);
1445
+ if (prep == PREP_DISPATCH_NO_BUDGET)
1446
+ needs_resource = true;
12881447 if (!needs_restart ||
12891448 (no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
12901449 blk_mq_run_hw_queue(hctx, true);
1291
- else if (needs_restart && (ret == BLK_STS_RESOURCE))
1450
+ else if (needs_restart && needs_resource)
12921451 blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);
12931452
12941453 blk_mq_update_dispatch_busy(hctx, true);
@@ -1296,16 +1455,15 @@
12961455 } else
12971456 blk_mq_update_dispatch_busy(hctx, false);
12981457
1299
- /*
1300
- * If the host/device is unable to accept more work, inform the
1301
- * caller of that.
1302
- */
1303
- if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
1304
- return false;
1305
-
13061458 return (queued + errors) != 0;
13071459 }
13081460
1461
+/**
1462
+ * __blk_mq_run_hw_queue - Run a hardware queue.
1463
+ * @hctx: Pointer to the hardware queue to run.
1464
+ *
1465
+ * Send pending requests to the hardware.
1466
+ */
13091467 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
13101468 {
13111469 int srcu_idx;
@@ -1403,6 +1561,15 @@
14031561 return next_cpu;
14041562 }
14051563
1564
+/**
1565
+ * __blk_mq_delay_run_hw_queue - Run (or schedule to run) a hardware queue.
1566
+ * @hctx: Pointer to the hardware queue to run.
1567
+ * @async: If we want to run the queue asynchronously.
1568
+ * @msecs: Microseconds of delay to wait before running the queue.
1569
+ *
1570
+ * If !@async, try to run the queue now. Else, run the queue asynchronously and
1571
+ * with a delay of @msecs.
1572
+ */
14061573 static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
14071574 unsigned long msecs)
14081575 {
@@ -1424,13 +1591,29 @@
14241591 msecs_to_jiffies(msecs));
14251592 }
14261593
1594
+/**
1595
+ * blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously.
1596
+ * @hctx: Pointer to the hardware queue to run.
1597
+ * @msecs: Microseconds of delay to wait before running the queue.
1598
+ *
1599
+ * Run a hardware queue asynchronously with a delay of @msecs.
1600
+ */
14271601 void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
14281602 {
14291603 __blk_mq_delay_run_hw_queue(hctx, true, msecs);
14301604 }
14311605 EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
14321606
1433
-bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
1607
+/**
1608
+ * blk_mq_run_hw_queue - Start to run a hardware queue.
1609
+ * @hctx: Pointer to the hardware queue to run.
1610
+ * @async: If we want to run the queue asynchronously.
1611
+ *
1612
+ * Check if the request queue is not in a quiesced state and if there are
1613
+ * pending requests to be sent. If this is true, run the queue to send requests
1614
+ * to hardware.
1615
+ */
1616
+void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
14341617 {
14351618 int srcu_idx;
14361619 bool need_run;
@@ -1448,28 +1631,101 @@
14481631 blk_mq_hctx_has_pending(hctx);
14491632 hctx_unlock(hctx, srcu_idx);
14501633
1451
- if (need_run) {
1634
+ if (need_run)
14521635 __blk_mq_delay_run_hw_queue(hctx, async, 0);
1453
- return true;
1454
- }
1455
-
1456
- return false;
14571636 }
14581637 EXPORT_SYMBOL(blk_mq_run_hw_queue);
14591638
1639
+/*
1640
+ * Is the request queue handled by an IO scheduler that does not respect
1641
+ * hardware queues when dispatching?
1642
+ */
1643
+static bool blk_mq_has_sqsched(struct request_queue *q)
1644
+{
1645
+ struct elevator_queue *e = q->elevator;
1646
+
1647
+ if (e && e->type->ops.dispatch_request &&
1648
+ !(e->type->elevator_features & ELEVATOR_F_MQ_AWARE))
1649
+ return true;
1650
+ return false;
1651
+}
1652
+
1653
+/*
1654
+ * Return prefered queue to dispatch from (if any) for non-mq aware IO
1655
+ * scheduler.
1656
+ */
1657
+static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q)
1658
+{
1659
+ struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
1660
+ /*
1661
+ * If the IO scheduler does not respect hardware queues when
1662
+ * dispatching, we just don't bother with multiple HW queues and
1663
+ * dispatch from hctx for the current CPU since running multiple queues
1664
+ * just causes lock contention inside the scheduler and pointless cache
1665
+ * bouncing.
1666
+ */
1667
+ struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, 0, ctx);
1668
+
1669
+ if (!blk_mq_hctx_stopped(hctx))
1670
+ return hctx;
1671
+ return NULL;
1672
+}
1673
+
1674
+/**
1675
+ * blk_mq_run_hw_queues - Run all hardware queues in a request queue.
1676
+ * @q: Pointer to the request queue to run.
1677
+ * @async: If we want to run the queue asynchronously.
1678
+ */
14601679 void blk_mq_run_hw_queues(struct request_queue *q, bool async)
14611680 {
1462
- struct blk_mq_hw_ctx *hctx;
1681
+ struct blk_mq_hw_ctx *hctx, *sq_hctx;
14631682 int i;
14641683
1684
+ sq_hctx = NULL;
1685
+ if (blk_mq_has_sqsched(q))
1686
+ sq_hctx = blk_mq_get_sq_hctx(q);
14651687 queue_for_each_hw_ctx(q, hctx, i) {
14661688 if (blk_mq_hctx_stopped(hctx))
14671689 continue;
1468
-
1469
- blk_mq_run_hw_queue(hctx, async);
1690
+ /*
1691
+ * Dispatch from this hctx either if there's no hctx preferred
1692
+ * by IO scheduler or if it has requests that bypass the
1693
+ * scheduler.
1694
+ */
1695
+ if (!sq_hctx || sq_hctx == hctx ||
1696
+ !list_empty_careful(&hctx->dispatch))
1697
+ blk_mq_run_hw_queue(hctx, async);
14701698 }
14711699 }
14721700 EXPORT_SYMBOL(blk_mq_run_hw_queues);
1701
+
1702
+/**
1703
+ * blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously.
1704
+ * @q: Pointer to the request queue to run.
1705
+ * @msecs: Microseconds of delay to wait before running the queues.
1706
+ */
1707
+void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
1708
+{
1709
+ struct blk_mq_hw_ctx *hctx, *sq_hctx;
1710
+ int i;
1711
+
1712
+ sq_hctx = NULL;
1713
+ if (blk_mq_has_sqsched(q))
1714
+ sq_hctx = blk_mq_get_sq_hctx(q);
1715
+ queue_for_each_hw_ctx(q, hctx, i) {
1716
+ if (blk_mq_hctx_stopped(hctx))
1717
+ continue;
1718
+ /*
1719
+ * Dispatch from this hctx either if there's no hctx preferred
1720
+ * by IO scheduler or if it has requests that bypass the
1721
+ * scheduler.
1722
+ */
1723
+ if (!sq_hctx || sq_hctx == hctx ||
1724
+ !list_empty_careful(&hctx->dispatch))
1725
+ blk_mq_delay_run_hw_queue(hctx, msecs);
1726
+ }
1727
+}
1728
+EXPORT_SYMBOL(blk_mq_delay_run_hw_queues);
14731729
14741730 /**
14751731 * blk_mq_queue_stopped() - check whether one or more hctxs have been stopped
@@ -1574,7 +1830,7 @@
15741830 /*
15751831 * If we are stopped, don't run the queue.
15761832 */
1577
- if (test_bit(BLK_MQ_S_STOPPED, &hctx->state))
1833
+ if (blk_mq_hctx_stopped(hctx))
15781834 return;
15791835
15801836 __blk_mq_run_hw_queue(hctx);
@@ -1585,15 +1841,16 @@
15851841 bool at_head)
15861842 {
15871843 struct blk_mq_ctx *ctx = rq->mq_ctx;
1844
+ enum hctx_type type = hctx->type;
15881845
15891846 lockdep_assert_held(&ctx->lock);
15901847
15911848 trace_block_rq_insert(hctx->queue, rq);
15921849
15931850 if (at_head)
1594
- list_add(&rq->queuelist, &ctx->rq_list);
1851
+ list_add(&rq->queuelist, &ctx->rq_lists[type]);
15951852 else
1596
- list_add_tail(&rq->queuelist, &ctx->rq_list);
1853
+ list_add_tail(&rq->queuelist, &ctx->rq_lists[type]);
15971854 }
15981855
15991856 void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
@@ -1607,17 +1864,25 @@
16071864 blk_mq_hctx_mark_pending(hctx, ctx);
16081865 }
16091866
1610
-/*
1867
+/**
1868
+ * blk_mq_request_bypass_insert - Insert a request at dispatch list.
1869
+ * @rq: Pointer to request to be inserted.
1870
+ * @at_head: true if the request should be inserted at the head of the list.
1871
+ * @run_queue: If we should run the hardware queue after inserting the request.
1872
+ *
16111873 * Should only be used carefully, when the caller knows we want to
16121874 * bypass a potential IO scheduler on the target device.
16131875 */
1614
-void blk_mq_request_bypass_insert(struct request *rq, bool run_queue)
1876
+void blk_mq_request_bypass_insert(struct request *rq, bool at_head,
1877
+ bool run_queue)
16151878 {
1616
- struct blk_mq_ctx *ctx = rq->mq_ctx;
1617
- struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
1879
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
16181880
16191881 spin_lock(&hctx->lock);
1620
- list_add_tail(&rq->queuelist, &hctx->dispatch);
1882
+ if (at_head)
1883
+ list_add(&rq->queuelist, &hctx->dispatch);
1884
+ else
1885
+ list_add_tail(&rq->queuelist, &hctx->dispatch);
16211886 spin_unlock(&hctx->lock);
16221887
16231888 if (run_queue)
@@ -1629,6 +1894,7 @@
16291894
16301895 {
16311896 struct request *rq;
1897
+ enum hctx_type type = hctx->type;
16321898
16331899 /*
16341900 * preemption doesn't flush plug list, so it's possible ctx->cpu is
....@@ -1640,95 +1906,87 @@
16401906 }
16411907
16421908 spin_lock(&ctx->lock);
1643
- list_splice_tail_init(list, &ctx->rq_list);
1909
+ list_splice_tail_init(list, &ctx->rq_lists[type]);
16441910 blk_mq_hctx_mark_pending(hctx, ctx);
16451911 spin_unlock(&ctx->lock);
16461912 }
16471913
1648
-static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
1914
+static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
16491915 {
16501916 struct request *rqa = container_of(a, struct request, queuelist);
16511917 struct request *rqb = container_of(b, struct request, queuelist);
16521918
1653
- return !(rqa->mq_ctx < rqb->mq_ctx ||
1654
- (rqa->mq_ctx == rqb->mq_ctx &&
1655
- blk_rq_pos(rqa) < blk_rq_pos(rqb)));
1919
+ if (rqa->mq_ctx != rqb->mq_ctx)
1920
+ return rqa->mq_ctx > rqb->mq_ctx;
1921
+ if (rqa->mq_hctx != rqb->mq_hctx)
1922
+ return rqa->mq_hctx > rqb->mq_hctx;
1923
+
1924
+ return blk_rq_pos(rqa) > blk_rq_pos(rqb);
16561925 }
16571926
16581927 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
16591928 {
1660
- struct blk_mq_ctx *this_ctx;
1661
- struct request_queue *this_q;
1662
- struct request *rq;
16631929 LIST_HEAD(list);
1664
- LIST_HEAD(ctx_list);
1665
- unsigned int depth;
16661930
1931
+ if (list_empty(&plug->mq_list))
1932
+ return;
16671933 list_splice_init(&plug->mq_list, &list);
16681934
1669
- list_sort(NULL, &list, plug_ctx_cmp);
1935
+ if (plug->rq_count > 2 && plug->multiple_queues)
1936
+ list_sort(NULL, &list, plug_rq_cmp);
16701937
1671
- this_q = NULL;
1672
- this_ctx = NULL;
1673
- depth = 0;
1938
+ plug->rq_count = 0;
16741939
1675
- while (!list_empty(&list)) {
1676
- rq = list_entry_rq(list.next);
1677
- list_del_init(&rq->queuelist);
1678
- BUG_ON(!rq->q);
1679
- if (rq->mq_ctx != this_ctx) {
1680
- if (this_ctx) {
1681
- trace_block_unplug(this_q, depth, !from_schedule);
1682
- blk_mq_sched_insert_requests(this_q, this_ctx,
1683
- &ctx_list,
1684
- from_schedule);
1685
- }
1940
+ do {
1941
+ struct list_head rq_list;
1942
+ struct request *rq, *head_rq = list_entry_rq(list.next);
1943
+ struct list_head *pos = &head_rq->queuelist; /* skip first */
1944
+ struct blk_mq_hw_ctx *this_hctx = head_rq->mq_hctx;
1945
+ struct blk_mq_ctx *this_ctx = head_rq->mq_ctx;
1946
+ unsigned int depth = 1;
16861947
1687
- this_ctx = rq->mq_ctx;
1688
- this_q = rq->q;
1689
- depth = 0;
1948
+ list_for_each_continue(pos, &list) {
1949
+ rq = list_entry_rq(pos);
1950
+ BUG_ON(!rq->q);
1951
+ if (rq->mq_hctx != this_hctx || rq->mq_ctx != this_ctx)
1952
+ break;
1953
+ depth++;
16901954 }
16911955
1692
- depth++;
1693
- list_add_tail(&rq->queuelist, &ctx_list);
1694
- }
1695
-
1696
- /*
1697
- * If 'this_ctx' is set, we know we have entries to complete
1698
- * on 'ctx_list'. Do those.
1699
- */
1700
- if (this_ctx) {
1701
- trace_block_unplug(this_q, depth, !from_schedule);
1702
- blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list,
1956
+ list_cut_before(&rq_list, &list, pos);
1957
+ trace_block_unplug(head_rq->q, depth, !from_schedule);
1958
+ blk_mq_sched_insert_requests(this_hctx, this_ctx, &rq_list,
17031959 from_schedule);
1704
- }
1960
+ } while(!list_empty(&list));
17051961 }
17061962
1707
-static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
1963
+static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
1964
+ unsigned int nr_segs)
17081965 {
1709
- blk_init_request_from_bio(rq, bio);
1966
+ int err;
17101967
1711
- blk_rq_set_rl(rq, blk_get_rl(rq->q, bio));
1968
+ if (bio->bi_opf & REQ_RAHEAD)
1969
+ rq->cmd_flags |= REQ_FAILFAST_MASK;
17121970
1713
- blk_account_io_start(rq, true);
1714
-}
1971
+ rq->__sector = bio->bi_iter.bi_sector;
1972
+ rq->write_hint = bio->bi_write_hint;
1973
+ blk_rq_bio_prep(rq, bio, nr_segs);
17151974
1716
-static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq)
1717
-{
1718
- if (rq->tag != -1)
1719
- return blk_tag_to_qc_t(rq->tag, hctx->queue_num, false);
1975
+ /* This can't fail, since GFP_NOIO includes __GFP_DIRECT_RECLAIM. */
1976
+ err = blk_crypto_rq_bio_prep(rq, bio, GFP_NOIO);
1977
+ WARN_ON_ONCE(err);
17201978
1721
- return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true);
1979
+ blk_account_io_start(rq);
17221980 }
17231981
17241982 static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
17251983 struct request *rq,
1726
- blk_qc_t *cookie)
1984
+ blk_qc_t *cookie, bool last)
17271985 {
17281986 struct request_queue *q = rq->q;
17291987 struct blk_mq_queue_data bd = {
17301988 .rq = rq,
1731
- .last = true,
1989
+ .last = last,
17321990 };
17331991 blk_qc_t new_cookie;
17341992 blk_status_t ret;
@@ -1763,7 +2021,7 @@
17632021 static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
17642022 struct request *rq,
17652023 blk_qc_t *cookie,
1766
- bool bypass_insert)
2024
+ bool bypass_insert, bool last)
17672025 {
17682026 struct request_queue *q = rq->q;
17692027 bool run_queue = true;
@@ -1784,23 +2042,35 @@
17842042 if (q->elevator && !bypass_insert)
17852043 goto insert;
17862044
1787
- if (!blk_mq_get_dispatch_budget(hctx))
2045
+ if (!blk_mq_get_dispatch_budget(q))
17882046 goto insert;
17892047
17902048 if (!blk_mq_get_driver_tag(rq)) {
1791
- blk_mq_put_dispatch_budget(hctx);
2049
+ blk_mq_put_dispatch_budget(q);
17922050 goto insert;
17932051 }
17942052
1795
- return __blk_mq_issue_directly(hctx, rq, cookie);
2053
+ return __blk_mq_issue_directly(hctx, rq, cookie, last);
17962054 insert:
17972055 if (bypass_insert)
17982056 return BLK_STS_RESOURCE;
17992057
1800
- blk_mq_request_bypass_insert(rq, run_queue);
2058
+ blk_mq_sched_insert_request(rq, false, run_queue, false);
2059
+
18012060 return BLK_STS_OK;
18022061 }
18032062
2063
+/**
2064
+ * blk_mq_try_issue_directly - Try to send a request directly to device driver.
2065
+ * @hctx: Pointer to the associated hardware queue.
2066
+ * @rq: Pointer to request to be sent.
2067
+ * @cookie: Request queue cookie.
2068
+ *
2069
+ * If the device has enough resources to accept a new request now, send the
2070
+ * request directly to the device driver. Otherwise, insert it into the
2071
+ * hctx->dispatch queue, so we can try to send it again in the future. Requests
2072
+ * inserted into this queue have higher priority.
2073
+ */
18042074 static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
18052075 struct request *rq, blk_qc_t *cookie)
18062076 {
....@@ -1811,25 +2081,24 @@
18112081
18122082 hctx_lock(hctx, &srcu_idx);
18132083
1814
- ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false);
2084
+ ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false, true);
18152085 if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
1816
- blk_mq_request_bypass_insert(rq, true);
2086
+ blk_mq_request_bypass_insert(rq, false, true);
18172087 else if (ret != BLK_STS_OK)
18182088 blk_mq_end_request(rq, ret);
18192089
18202090 hctx_unlock(hctx, srcu_idx);
18212091 }
18222092
1823
-blk_status_t blk_mq_request_issue_directly(struct request *rq)
2093
+blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
18242094 {
18252095 blk_status_t ret;
18262096 int srcu_idx;
18272097 blk_qc_t unused_cookie;
1828
- struct blk_mq_ctx *ctx = rq->mq_ctx;
1829
- struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
2098
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
18302099
18312100 hctx_lock(hctx, &srcu_idx);
1832
- ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true);
2101
+ ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true, last);
18332102 hctx_unlock(hctx, srcu_idx);
18342103
18352104 return ret;
....@@ -1838,104 +2107,169 @@
18382107 void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
18392108 struct list_head *list)
18402109 {
2110
+ int queued = 0;
2111
+ int errors = 0;
2112
+
18412113 while (!list_empty(list)) {
18422114 blk_status_t ret;
18432115 struct request *rq = list_first_entry(list, struct request,
18442116 queuelist);
18452117
18462118 list_del_init(&rq->queuelist);
1847
- ret = blk_mq_request_issue_directly(rq);
2119
+ ret = blk_mq_request_issue_directly(rq, list_empty(list));
18482120 if (ret != BLK_STS_OK) {
2121
+ errors++;
18492122 if (ret == BLK_STS_RESOURCE ||
18502123 ret == BLK_STS_DEV_RESOURCE) {
1851
- blk_mq_request_bypass_insert(rq,
2124
+ blk_mq_request_bypass_insert(rq, false,
18522125 list_empty(list));
18532126 break;
18542127 }
18552128 blk_mq_end_request(rq, ret);
1856
- }
2129
+ } else
2130
+ queued++;
2131
+ }
2132
+
2133
+ /*
2134
+ * If we didn't flush the entire list, we could have told
2135
+ * the driver there was more coming, but that turned out to
2136
+ * be a lie.
2137
+ */
2138
+ if ((!list_empty(list) || errors) &&
2139
+ hctx->queue->mq_ops->commit_rqs && queued)
2140
+ hctx->queue->mq_ops->commit_rqs(hctx);
2141
+}
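/*
 * Illustrative sketch (not from this patch): how a driver might pair
 * ->queue_rq() with the ->commit_rqs() hook that the code above falls back
 * to when an earlier bd->last == false turned out to be wrong.  "foo", its
 * doorbell register and ring layout are made-up placeholders.
 */
#include <linux/blk-mq.h>
#include <linux/io.h>

struct foo_queue {
	void __iomem *doorbell;		/* hypothetical submission doorbell */
	u16 tail;			/* next free slot in the sq ring */
};

/* Queue one request; only notify the device when bd->last marks the end
 * of the batch. */
static blk_status_t foo_queue_rq(struct blk_mq_hw_ctx *hctx,
				 const struct blk_mq_queue_data *bd)
{
	struct foo_queue *fq = hctx->driver_data;

	blk_mq_start_request(bd->rq);
	/* ... copy bd->rq into the submission ring, advance fq->tail ... */
	if (bd->last)
		writel(fq->tail, fq->doorbell);
	return BLK_STS_OK;
}

/* Catch-up notification used by blk_mq_try_issue_list_directly() above when
 * "more requests are coming" turned out to be a lie. */
static void foo_commit_rqs(struct blk_mq_hw_ctx *hctx)
{
	struct foo_queue *fq = hctx->driver_data;

	writel(fq->tail, fq->doorbell);
}

static const struct blk_mq_ops foo_mq_ops = {
	.queue_rq	= foo_queue_rq,
	.commit_rqs	= foo_commit_rqs,
};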
2142
+
2143
+static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
2144
+{
2145
+ list_add_tail(&rq->queuelist, &plug->mq_list);
2146
+ plug->rq_count++;
2147
+ if (!plug->multiple_queues && !list_is_singular(&plug->mq_list)) {
2148
+ struct request *tmp;
2149
+
2150
+ tmp = list_first_entry(&plug->mq_list, struct request,
2151
+ queuelist);
2152
+ if (tmp->q != rq->q)
2153
+ plug->multiple_queues = true;
18572154 }
18582155 }
18592156
1860
-static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
2157
+/*
2158
+ * Allow 2x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple
2159
+ * queues. This is important for md arrays to benefit from merging
2160
+ * requests.
2161
+ */
2162
+static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug)
18612163 {
2164
+ if (plug->multiple_queues)
2165
+ return BLK_MAX_REQUEST_COUNT * 2;
2166
+ return BLK_MAX_REQUEST_COUNT;
2167
+}
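/*
 * Submitter-side sketch (illustrative, not from this patch): the plug
 * handling above, and the plug branches in blk_mq_submit_bio() below, only
 * trigger when the caller wrapped its submissions in blk_start_plug() /
 * blk_finish_plug().  demo_submit_batch() and its arguments are placeholders.
 */
#include <linux/blkdev.h>

static void demo_submit_batch(struct bio **bios, int nr)
{
	struct blk_plug plug;
	int i;

	blk_start_plug(&plug);		/* requests collect on current->plug */
	for (i = 0; i < nr; i++)
		submit_bio(bios[i]);
	blk_finish_plug(&plug);		/* flushed via blk_flush_plug_list() */
}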
2168
+
2169
+/**
2170
+ * blk_mq_submit_bio - Create and send a request to block device.
2171
+ * @bio: Bio pointer.
2172
+ *
2173
+ * Builds up a request structure from @q and @bio and sends it to the device. The
2174
+ * request may not be queued directly to hardware if:
2175
+ * * This request can be merged with another one
2176
+ * * We want to place request at plug queue for possible future merging
2177
+ * * There is an IO scheduler active at this queue
2178
+ *
2179
+ * It will not queue the request if there is an error with the bio or during
2180
+ * request creation.
2181
+ *
2182
+ * Returns: Request queue cookie.
2183
+ */
2184
+blk_qc_t blk_mq_submit_bio(struct bio *bio)
2185
+{
2186
+ struct request_queue *q = bio->bi_disk->queue;
18622187 const int is_sync = op_is_sync(bio->bi_opf);
18632188 const int is_flush_fua = op_is_flush(bio->bi_opf);
1864
- struct blk_mq_alloc_data data = { .flags = 0 };
2189
+ struct blk_mq_alloc_data data = {
2190
+ .q = q,
2191
+ };
18652192 struct request *rq;
1866
- unsigned int request_count = 0;
18672193 struct blk_plug *plug;
18682194 struct request *same_queue_rq = NULL;
2195
+ unsigned int nr_segs;
18692196 blk_qc_t cookie;
2197
+ blk_status_t ret;
18702198
18712199 blk_queue_bounce(q, &bio);
1872
-
1873
- blk_queue_split(q, &bio);
2200
+ __blk_queue_split(&bio, &nr_segs);
18742201
18752202 if (!bio_integrity_prep(bio))
1876
- return BLK_QC_T_NONE;
2203
+ goto queue_exit;
18772204
18782205 if (!is_flush_fua && !blk_queue_nomerges(q) &&
1879
- blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
1880
- return BLK_QC_T_NONE;
2206
+ blk_attempt_plug_merge(q, bio, nr_segs, &same_queue_rq))
2207
+ goto queue_exit;
18812208
1882
- if (blk_mq_sched_bio_merge(q, bio))
1883
- return BLK_QC_T_NONE;
2209
+ if (blk_mq_sched_bio_merge(q, bio, nr_segs))
2210
+ goto queue_exit;
18842211
1885
- rq_qos_throttle(q, bio, NULL);
2212
+ rq_qos_throttle(q, bio);
18862213
1887
- trace_block_getrq(q, bio, bio->bi_opf);
1888
-
1889
- rq = blk_mq_get_request(q, bio, bio->bi_opf, &data);
2214
+ data.cmd_flags = bio->bi_opf;
2215
+ rq = __blk_mq_alloc_request(&data);
18902216 if (unlikely(!rq)) {
18912217 rq_qos_cleanup(q, bio);
18922218 if (bio->bi_opf & REQ_NOWAIT)
18932219 bio_wouldblock_error(bio);
1894
- return BLK_QC_T_NONE;
2220
+ goto queue_exit;
18952221 }
2222
+
2223
+ trace_block_getrq(q, bio, bio->bi_opf);
18962224
18972225 rq_qos_track(q, rq, bio);
18982226
18992227 cookie = request_to_qc_t(data.hctx, rq);
19002228
1901
- plug = current->plug;
1902
- if (unlikely(is_flush_fua)) {
1903
- blk_mq_put_ctx(data.ctx);
1904
- blk_mq_bio_to_request(rq, bio);
2229
+ blk_mq_bio_to_request(rq, bio, nr_segs);
19052230
1906
- /* bypass scheduler for flush rq */
2231
+ ret = blk_crypto_init_request(rq);
2232
+ if (ret != BLK_STS_OK) {
2233
+ bio->bi_status = ret;
2234
+ bio_endio(bio);
2235
+ blk_mq_free_request(rq);
2236
+ return BLK_QC_T_NONE;
2237
+ }
2238
+
2239
+ plug = blk_mq_plug(q, bio);
2240
+ if (unlikely(is_flush_fua)) {
2241
+ /* Bypass scheduler for flush requests */
19072242 blk_insert_flush(rq);
19082243 blk_mq_run_hw_queue(data.hctx, true);
1909
- } else if (plug && q->nr_hw_queues == 1) {
1910
- struct request *last = NULL;
1911
-
1912
- blk_mq_put_ctx(data.ctx);
1913
- blk_mq_bio_to_request(rq, bio);
1914
-
2244
+ } else if (plug && (q->nr_hw_queues == 1 ||
2245
+ blk_mq_is_sbitmap_shared(rq->mq_hctx->flags) ||
2246
+ q->mq_ops->commit_rqs || !blk_queue_nonrot(q))) {
19152247 /*
1916
- * @request_count may become stale because of schedule
1917
- * out, so check the list again.
2248
+ * Use plugging if we have a ->commit_rqs() hook as well, as
2249
+ * we know the driver uses bd->last in a smart fashion.
2250
+ *
2251
+ * Use normal plugging if this disk is a slow HDD, as sequential
2252
+ * IO may benefit a lot from plug merging.
19182253 */
1919
- if (list_empty(&plug->mq_list))
1920
- request_count = 0;
1921
- else if (blk_queue_nomerges(q))
1922
- request_count = blk_plug_queued_count(q);
2254
+ unsigned int request_count = plug->rq_count;
2255
+ struct request *last = NULL;
19232256
19242257 if (!request_count)
19252258 trace_block_plug(q);
19262259 else
19272260 last = list_entry_rq(plug->mq_list.prev);
19282261
1929
- if (request_count >= BLK_MAX_REQUEST_COUNT || (last &&
2262
+ if (request_count >= blk_plug_max_rq_count(plug) || (last &&
19302263 blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
19312264 blk_flush_plug_list(plug, false);
19322265 trace_block_plug(q);
19332266 }
19342267
1935
- list_add_tail(&rq->queuelist, &plug->mq_list);
2268
+ blk_add_rq_to_plug(plug, rq);
2269
+ } else if (q->elevator) {
2270
+ /* Insert the request at the IO scheduler queue */
2271
+ blk_mq_sched_insert_request(rq, false, true, true);
19362272 } else if (plug && !blk_queue_nomerges(q)) {
1937
- blk_mq_bio_to_request(rq, bio);
1938
-
19392273 /*
19402274 * We do limited plugging. If the bio can be merged, do that.
19412275 * Otherwise the existing request in the plug list will be
....@@ -1945,30 +2279,74 @@
19452279 */
19462280 if (list_empty(&plug->mq_list))
19472281 same_queue_rq = NULL;
1948
- if (same_queue_rq)
2282
+ if (same_queue_rq) {
19492283 list_del_init(&same_queue_rq->queuelist);
1950
- list_add_tail(&rq->queuelist, &plug->mq_list);
1951
-
1952
- blk_mq_put_ctx(data.ctx);
2284
+ plug->rq_count--;
2285
+ }
2286
+ blk_add_rq_to_plug(plug, rq);
2287
+ trace_block_plug(q);
19532288
19542289 if (same_queue_rq) {
1955
- data.hctx = blk_mq_map_queue(q,
1956
- same_queue_rq->mq_ctx->cpu);
2290
+ data.hctx = same_queue_rq->mq_hctx;
2291
+ trace_block_unplug(q, 1, true);
19572292 blk_mq_try_issue_directly(data.hctx, same_queue_rq,
19582293 &cookie);
19592294 }
1960
- } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator &&
1961
- !data.hctx->dispatch_busy)) {
1962
- blk_mq_put_ctx(data.ctx);
1963
- blk_mq_bio_to_request(rq, bio);
2295
+ } else if ((q->nr_hw_queues > 1 && is_sync) ||
2296
+ !data.hctx->dispatch_busy) {
2297
+ /*
2298
+ * There is no scheduler and we can try to send directly
2299
+ * to the hardware.
2300
+ */
19642301 blk_mq_try_issue_directly(data.hctx, rq, &cookie);
19652302 } else {
1966
- blk_mq_put_ctx(data.ctx);
1967
- blk_mq_bio_to_request(rq, bio);
2303
+ /* Default case. */
19682304 blk_mq_sched_insert_request(rq, false, true, true);
19692305 }
19702306
19712307 return cookie;
2308
+queue_exit:
2309
+ blk_queue_exit(q);
2310
+ return BLK_QC_T_NONE;
2311
+}
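/*
 * Context sketch (an illustration, not part of this patch): blk_mq_submit_bio()
 * above is normally reached through submit_bio() rather than called directly.
 * demo_end_io(), the page and the block_device are placeholders.
 */
#include <linux/bio.h>
#include <linux/blkdev.h>

static void demo_end_io(struct bio *bio)
{
	/* Runs once the request built by blk_mq_submit_bio() completes. */
	pr_info("demo read done: %d\n", blk_status_to_errno(bio->bi_status));
	bio_put(bio);
}

/* Read one page at sector 0 of @bdev; the bio flows through submit_bio()
 * into blk_mq_submit_bio(). */
static void demo_submit_one(struct block_device *bdev, struct page *page)
{
	struct bio *bio = bio_alloc(GFP_KERNEL, 1);

	bio_set_dev(bio, bdev);
	bio->bi_opf = REQ_OP_READ;
	bio->bi_iter.bi_sector = 0;
	bio_add_page(bio, page, PAGE_SIZE, 0);
	bio->bi_end_io = demo_end_io;
	submit_bio(bio);
}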
2312
+
2313
+static size_t order_to_size(unsigned int order)
2314
+{
2315
+ return (size_t)PAGE_SIZE << order;
2316
+}
2317
+
2318
+/* called before freeing request pool in @tags */
2319
+static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set,
2320
+ struct blk_mq_tags *tags, unsigned int hctx_idx)
2321
+{
2322
+ struct blk_mq_tags *drv_tags = set->tags[hctx_idx];
2323
+ struct page *page;
2324
+ unsigned long flags;
2325
+
2326
+ list_for_each_entry(page, &tags->page_list, lru) {
2327
+ unsigned long start = (unsigned long)page_address(page);
2328
+ unsigned long end = start + order_to_size(page->private);
2329
+ int i;
2330
+
2331
+ for (i = 0; i < set->queue_depth; i++) {
2332
+ struct request *rq = drv_tags->rqs[i];
2333
+ unsigned long rq_addr = (unsigned long)rq;
2334
+
2335
+ if (rq_addr >= start && rq_addr < end) {
2336
+ WARN_ON_ONCE(refcount_read(&rq->ref) != 0);
2337
+ cmpxchg(&drv_tags->rqs[i], rq, NULL);
2338
+ }
2339
+ }
2340
+ }
2341
+
2342
+ /*
2343
+ * Wait until all pending iteration is done.
2344
+ *
2345
+ * Request reference is cleared and it is guaranteed to be observed
2346
+ * after the ->lock is released.
2347
+ */
2348
+ spin_lock_irqsave(&drv_tags->lock, flags);
2349
+ spin_unlock_irqrestore(&drv_tags->lock, flags);
19722350 }
19732351
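/*
 * Reader-side sketch of the drain idiom used in blk_mq_clear_rq_mapping()
 * above (a hypothetical demo_* helper, not the real tag iterator): lookups of
 * tags->rqs[] happen under tags->lock, so the empty lock/unlock pair cannot
 * return while such a lookup is still dereferencing a request that is about
 * to be freed.
 */
static struct request *demo_find_and_get_req(struct blk_mq_tags *tags,
					     unsigned int bitnr)
{
	struct request *rq;
	unsigned long flags;

	spin_lock_irqsave(&tags->lock, flags);
	rq = tags->rqs[bitnr];
	if (rq && !refcount_inc_not_zero(&rq->ref))
		rq = NULL;
	spin_unlock_irqrestore(&tags->lock, flags);
	return rq;
}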
19742352 void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
....@@ -1989,42 +2367,44 @@
19892367 }
19902368 }
19912369
2370
+ blk_mq_clear_rq_mapping(set, tags, hctx_idx);
2371
+
19922372 while (!list_empty(&tags->page_list)) {
19932373 page = list_first_entry(&tags->page_list, struct page, lru);
19942374 list_del_init(&page->lru);
19952375 /*
19962376 * Remove kmemleak object previously allocated in
1997
- * blk_mq_init_rq_map().
2377
+ * blk_mq_alloc_rqs().
19982378 */
19992379 kmemleak_free(page_address(page));
20002380 __free_pages(page, page->private);
20012381 }
20022382 }
20032383
2004
-void blk_mq_free_rq_map(struct blk_mq_tags *tags)
2384
+void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags)
20052385 {
20062386 kfree(tags->rqs);
20072387 tags->rqs = NULL;
20082388 kfree(tags->static_rqs);
20092389 tags->static_rqs = NULL;
20102390
2011
- blk_mq_free_tags(tags);
2391
+ blk_mq_free_tags(tags, flags);
20122392 }
20132393
20142394 struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
20152395 unsigned int hctx_idx,
20162396 unsigned int nr_tags,
2017
- unsigned int reserved_tags)
2397
+ unsigned int reserved_tags,
2398
+ unsigned int flags)
20182399 {
20192400 struct blk_mq_tags *tags;
20202401 int node;
20212402
2022
- node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx);
2403
+ node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
20232404 if (node == NUMA_NO_NODE)
20242405 node = set->numa_node;
20252406
2026
- tags = blk_mq_init_tags(nr_tags, reserved_tags, node,
2027
- BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
2407
+ tags = blk_mq_init_tags(nr_tags, reserved_tags, node, flags);
20282408 if (!tags)
20292409 return NULL;
20302410
....@@ -2032,7 +2412,7 @@
20322412 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
20332413 node);
20342414 if (!tags->rqs) {
2035
- blk_mq_free_tags(tags);
2415
+ blk_mq_free_tags(tags, flags);
20362416 return NULL;
20372417 }
20382418
....@@ -2041,16 +2421,11 @@
20412421 node);
20422422 if (!tags->static_rqs) {
20432423 kfree(tags->rqs);
2044
- blk_mq_free_tags(tags);
2424
+ blk_mq_free_tags(tags, flags);
20452425 return NULL;
20462426 }
20472427
20482428 return tags;
2049
-}
2050
-
2051
-static size_t order_to_size(unsigned int order)
2052
-{
2053
- return (size_t)PAGE_SIZE << order;
20542429 }
20552430
20562431 static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
....@@ -2075,7 +2450,7 @@
20752450 size_t rq_size, left;
20762451 int node;
20772452
2078
- node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx);
2453
+ node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
20792454 if (node == NUMA_NO_NODE)
20802455 node = set->numa_node;
20812456
....@@ -2087,6 +2462,7 @@
20872462 */
20882463 rq_size = round_up(sizeof(struct request) + set->cmd_size,
20892464 cache_line_size());
2465
+ trace_android_vh_blk_alloc_rqs(&rq_size, set, tags);
20902466 left = rq_size * depth;
20912467
20922468 for (i = 0; i < depth; ) {
....@@ -2145,6 +2521,86 @@
21452521 return -ENOMEM;
21462522 }
21472523
2524
+struct rq_iter_data {
2525
+ struct blk_mq_hw_ctx *hctx;
2526
+ bool has_rq;
2527
+};
2528
+
2529
+static bool blk_mq_has_request(struct request *rq, void *data, bool reserved)
2530
+{
2531
+ struct rq_iter_data *iter_data = data;
2532
+
2533
+ if (rq->mq_hctx != iter_data->hctx)
2534
+ return true;
2535
+ iter_data->has_rq = true;
2536
+ return false;
2537
+}
2538
+
2539
+static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx)
2540
+{
2541
+ struct blk_mq_tags *tags = hctx->sched_tags ?
2542
+ hctx->sched_tags : hctx->tags;
2543
+ struct rq_iter_data data = {
2544
+ .hctx = hctx,
2545
+ };
2546
+
2547
+ blk_mq_all_tag_iter(tags, blk_mq_has_request, &data);
2548
+ return data.has_rq;
2549
+}
2550
+
2551
+static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu,
2552
+ struct blk_mq_hw_ctx *hctx)
2553
+{
2554
+ if (cpumask_next_and(-1, hctx->cpumask, cpu_online_mask) != cpu)
2555
+ return false;
2556
+ if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids)
2557
+ return false;
2558
+ return true;
2559
+}
2560
+
2561
+static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
2562
+{
2563
+ struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
2564
+ struct blk_mq_hw_ctx, cpuhp_online);
2565
+
2566
+ if (!cpumask_test_cpu(cpu, hctx->cpumask) ||
2567
+ !blk_mq_last_cpu_in_hctx(cpu, hctx))
2568
+ return 0;
2569
+
2570
+ /*
2571
+ * Prevent new request from being allocated on the current hctx.
2572
+ *
2573
+ * The smp_mb__after_atomic() pairs with the implied barrier in
2574
+ * test_and_set_bit_lock() in sbitmap_get(). It ensures the inactive flag is
2575
+ * seen once we return from the tag allocator.
2576
+ */
2577
+ set_bit(BLK_MQ_S_INACTIVE, &hctx->state);
2578
+ smp_mb__after_atomic();
2579
+
2580
+ /*
2581
+ * Try to grab a reference to the queue and wait for any outstanding
2582
+ * requests. If we could not grab a reference the queue has been
2583
+ * frozen and there are no requests.
2584
+ */
2585
+ if (percpu_ref_tryget(&hctx->queue->q_usage_counter)) {
2586
+ while (blk_mq_hctx_has_requests(hctx))
2587
+ msleep(5);
2588
+ percpu_ref_put(&hctx->queue->q_usage_counter);
2589
+ }
2590
+
2591
+ return 0;
2592
+}
2593
+
2594
+static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
2595
+{
2596
+ struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
2597
+ struct blk_mq_hw_ctx, cpuhp_online);
2598
+
2599
+ if (cpumask_test_cpu(cpu, hctx->cpumask))
2600
+ clear_bit(BLK_MQ_S_INACTIVE, &hctx->state);
2601
+ return 0;
2602
+}
2603
+
21482604 /*
21492605 * 'cpu' is going away. splice any existing rq_list entries from this
21502606 * software queue to the hw queue dispatch list, and ensure that it
....@@ -2155,13 +2611,18 @@
21552611 struct blk_mq_hw_ctx *hctx;
21562612 struct blk_mq_ctx *ctx;
21572613 LIST_HEAD(tmp);
2614
+ enum hctx_type type;
21582615
21592616 hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
2617
+ if (!cpumask_test_cpu(cpu, hctx->cpumask))
2618
+ return 0;
2619
+
21602620 ctx = __blk_mq_get_ctx(hctx->queue, cpu);
2621
+ type = hctx->type;
21612622
21622623 spin_lock(&ctx->lock);
2163
- if (!list_empty(&ctx->rq_list)) {
2164
- list_splice_init(&ctx->rq_list, &tmp);
2624
+ if (!list_empty(&ctx->rq_lists[type])) {
2625
+ list_splice_init(&ctx->rq_lists[type], &tmp);
21652626 blk_mq_hctx_clear_pending(hctx, ctx);
21662627 }
21672628 spin_unlock(&ctx->lock);
....@@ -2179,8 +2640,40 @@
21792640
21802641 static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
21812642 {
2643
+ if (!(hctx->flags & BLK_MQ_F_STACKING))
2644
+ cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
2645
+ &hctx->cpuhp_online);
21822646 cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD,
21832647 &hctx->cpuhp_dead);
2648
+}
2649
+
2650
+/*
2651
+ * Before freeing the hw queue, clear the flush request reference in
2652
+ * tags->rqs[] to avoid a potential use-after-free.
2653
+ */
2654
+static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags,
2655
+ unsigned int queue_depth, struct request *flush_rq)
2656
+{
2657
+ int i;
2658
+ unsigned long flags;
2659
+
2660
+ /* The hw queue may not be mapped yet */
2661
+ if (!tags)
2662
+ return;
2663
+
2664
+ WARN_ON_ONCE(refcount_read(&flush_rq->ref) != 0);
2665
+
2666
+ for (i = 0; i < queue_depth; i++)
2667
+ cmpxchg(&tags->rqs[i], flush_rq, NULL);
2668
+
2669
+ /*
2670
+ * Wait until all pending iteration is done.
2671
+ *
2672
+ * Request reference is cleared and it is guaranteed to be observed
2673
+ * after the ->lock is released.
2674
+ */
2675
+ spin_lock_irqsave(&tags->lock, flags);
2676
+ spin_unlock_irqrestore(&tags->lock, flags);
21842677 }
21852678
21862679 /* hctx->ctxs will be freed in queue's release handler */
....@@ -2188,18 +2681,24 @@
21882681 struct blk_mq_tag_set *set,
21892682 struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
21902683 {
2191
- blk_mq_debugfs_unregister_hctx(hctx);
2684
+ struct request *flush_rq = hctx->fq->flush_rq;
21922685
21932686 if (blk_mq_hw_queue_mapped(hctx))
21942687 blk_mq_tag_idle(hctx);
21952688
2689
+ blk_mq_clear_flush_rq_mapping(set->tags[hctx_idx],
2690
+ set->queue_depth, flush_rq);
21962691 if (set->ops->exit_request)
2197
- set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx);
2692
+ set->ops->exit_request(set, flush_rq, hctx_idx);
21982693
21992694 if (set->ops->exit_hctx)
22002695 set->ops->exit_hctx(hctx, hctx_idx);
22012696
22022697 blk_mq_remove_cpuhp(hctx);
2698
+
2699
+ spin_lock(&q->unused_hctx_lock);
2700
+ list_add(&hctx->hctx_list, &q->unused_hctx_list);
2701
+ spin_unlock(&q->unused_hctx_lock);
22032702 }
22042703
22052704 static void blk_mq_exit_hw_queues(struct request_queue *q,
....@@ -2211,112 +2710,160 @@
22112710 queue_for_each_hw_ctx(q, hctx, i) {
22122711 if (i == nr_queue)
22132712 break;
2713
+ blk_mq_debugfs_unregister_hctx(hctx);
22142714 blk_mq_exit_hctx(q, set, hctx, i);
22152715 }
2716
+}
2717
+
2718
+static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
2719
+{
2720
+ int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
2721
+
2722
+ BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu),
2723
+ __alignof__(struct blk_mq_hw_ctx)) !=
2724
+ sizeof(struct blk_mq_hw_ctx));
2725
+
2726
+ if (tag_set->flags & BLK_MQ_F_BLOCKING)
2727
+ hw_ctx_size += sizeof(struct srcu_struct);
2728
+
2729
+ return hw_ctx_size;
22162730 }
22172731
22182732 static int blk_mq_init_hctx(struct request_queue *q,
22192733 struct blk_mq_tag_set *set,
22202734 struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
22212735 {
2222
- int node;
2736
+ hctx->queue_num = hctx_idx;
22232737
2224
- node = hctx->numa_node;
2738
+ if (!(hctx->flags & BLK_MQ_F_STACKING))
2739
+ cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
2740
+ &hctx->cpuhp_online);
2741
+ cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
2742
+
2743
+ hctx->tags = set->tags[hctx_idx];
2744
+
2745
+ if (set->ops->init_hctx &&
2746
+ set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
2747
+ goto unregister_cpu_notifier;
2748
+
2749
+ if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx,
2750
+ hctx->numa_node))
2751
+ goto exit_hctx;
2752
+ return 0;
2753
+
2754
+ exit_hctx:
2755
+ if (set->ops->exit_hctx)
2756
+ set->ops->exit_hctx(hctx, hctx_idx);
2757
+ unregister_cpu_notifier:
2758
+ blk_mq_remove_cpuhp(hctx);
2759
+ return -1;
2760
+}
2761
+
2762
+static struct blk_mq_hw_ctx *
2763
+blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
2764
+ int node)
2765
+{
2766
+ struct blk_mq_hw_ctx *hctx;
2767
+ gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;
2768
+
2769
+ hctx = kzalloc_node(blk_mq_hw_ctx_size(set), gfp, node);
2770
+ if (!hctx)
2771
+ goto fail_alloc_hctx;
2772
+
2773
+ if (!zalloc_cpumask_var_node(&hctx->cpumask, gfp, node))
2774
+ goto free_hctx;
2775
+
2776
+ atomic_set(&hctx->nr_active, 0);
22252777 if (node == NUMA_NO_NODE)
2226
- node = hctx->numa_node = set->numa_node;
2778
+ node = set->numa_node;
2779
+ hctx->numa_node = node;
22272780
22282781 INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
22292782 spin_lock_init(&hctx->lock);
22302783 INIT_LIST_HEAD(&hctx->dispatch);
22312784 hctx->queue = q;
2232
- hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED;
2785
+ hctx->flags = set->flags & ~BLK_MQ_F_TAG_QUEUE_SHARED;
22332786
2234
- cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
2235
-
2236
- hctx->tags = set->tags[hctx_idx];
2787
+ INIT_LIST_HEAD(&hctx->hctx_list);
22372788
22382789 /*
22392790 * Allocate space for all possible cpus to avoid allocation at
22402791 * runtime
22412792 */
22422793 hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *),
2243
- GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node);
2794
+ gfp, node);
22442795 if (!hctx->ctxs)
2245
- goto unregister_cpu_notifier;
2796
+ goto free_cpumask;
22462797
22472798 if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8),
2248
- GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node))
2799
+ gfp, node))
22492800 goto free_ctxs;
2250
-
22512801 hctx->nr_ctx = 0;
22522802
22532803 spin_lock_init(&hctx->dispatch_wait_lock);
22542804 init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
22552805 INIT_LIST_HEAD(&hctx->dispatch_wait.entry);
22562806
2257
- if (set->ops->init_hctx &&
2258
- set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
2259
- goto free_bitmap;
2260
-
2261
- hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size,
2262
- GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY);
2807
+ hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp);
22632808 if (!hctx->fq)
2264
- goto exit_hctx;
2265
-
2266
- if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, node))
2267
- goto free_fq;
2809
+ goto free_bitmap;
22682810
22692811 if (hctx->flags & BLK_MQ_F_BLOCKING)
22702812 init_srcu_struct(hctx->srcu);
2813
+ blk_mq_hctx_kobj_init(hctx);
22712814
2272
- blk_mq_debugfs_register_hctx(q, hctx);
2815
+ return hctx;
22732816
2274
- return 0;
2275
-
2276
- free_fq:
2277
- blk_free_flush_queue(hctx->fq);
2278
- exit_hctx:
2279
- if (set->ops->exit_hctx)
2280
- set->ops->exit_hctx(hctx, hctx_idx);
22812817 free_bitmap:
22822818 sbitmap_free(&hctx->ctx_map);
22832819 free_ctxs:
22842820 kfree(hctx->ctxs);
2285
- unregister_cpu_notifier:
2286
- blk_mq_remove_cpuhp(hctx);
2287
- return -1;
2821
+ free_cpumask:
2822
+ free_cpumask_var(hctx->cpumask);
2823
+ free_hctx:
2824
+ kfree(hctx);
2825
+ fail_alloc_hctx:
2826
+ return NULL;
22882827 }
22892828
22902829 static void blk_mq_init_cpu_queues(struct request_queue *q,
22912830 unsigned int nr_hw_queues)
22922831 {
2293
- unsigned int i;
2832
+ struct blk_mq_tag_set *set = q->tag_set;
2833
+ unsigned int i, j;
22942834
22952835 for_each_possible_cpu(i) {
22962836 struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
22972837 struct blk_mq_hw_ctx *hctx;
2838
+ int k;
22982839
22992840 __ctx->cpu = i;
23002841 spin_lock_init(&__ctx->lock);
2301
- INIT_LIST_HEAD(&__ctx->rq_list);
2842
+ for (k = HCTX_TYPE_DEFAULT; k < HCTX_MAX_TYPES; k++)
2843
+ INIT_LIST_HEAD(&__ctx->rq_lists[k]);
2844
+
23022845 __ctx->queue = q;
23032846
23042847 /*
23052848 * Set local node, IFF we have more than one hw queue. If
23062849 * not, we remain on the home node of the device
23072850 */
2308
- hctx = blk_mq_map_queue(q, i);
2309
- if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
2310
- hctx->numa_node = local_memory_node(cpu_to_node(i));
2851
+ for (j = 0; j < set->nr_maps; j++) {
2852
+ hctx = blk_mq_map_queue_type(q, j, i);
2853
+ if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
2854
+ hctx->numa_node = cpu_to_node(i);
2855
+ }
23112856 }
23122857 }
23132858
2314
-static bool __blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, int hctx_idx)
2859
+static bool __blk_mq_alloc_map_and_request(struct blk_mq_tag_set *set,
2860
+ int hctx_idx)
23152861 {
2862
+ unsigned int flags = set->flags;
23162863 int ret = 0;
23172864
23182865 set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx,
2319
- set->queue_depth, set->reserved_tags);
2866
+ set->queue_depth, set->reserved_tags, flags);
23202867 if (!set->tags[hctx_idx])
23212868 return false;
23222869
....@@ -2325,7 +2872,7 @@
23252872 if (!ret)
23262873 return true;
23272874
2328
- blk_mq_free_rq_map(set->tags[hctx_idx]);
2875
+ blk_mq_free_rq_map(set->tags[hctx_idx], flags);
23292876 set->tags[hctx_idx] = NULL;
23302877 return false;
23312878 }
....@@ -2333,16 +2880,18 @@
23332880 static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
23342881 unsigned int hctx_idx)
23352882 {
2336
- if (set->tags[hctx_idx]) {
2883
+ unsigned int flags = set->flags;
2884
+
2885
+ if (set->tags && set->tags[hctx_idx]) {
23372886 blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
2338
- blk_mq_free_rq_map(set->tags[hctx_idx]);
2887
+ blk_mq_free_rq_map(set->tags[hctx_idx], flags);
23392888 set->tags[hctx_idx] = NULL;
23402889 }
23412890 }
23422891
23432892 static void blk_mq_map_swqueue(struct request_queue *q)
23442893 {
2345
- unsigned int i, hctx_idx;
2894
+ unsigned int i, j, hctx_idx;
23462895 struct blk_mq_hw_ctx *hctx;
23472896 struct blk_mq_ctx *ctx;
23482897 struct blk_mq_tag_set *set = q->tag_set;
....@@ -2359,25 +2908,52 @@
23592908 * If the cpu isn't present, the cpu is mapped to first hctx.
23602909 */
23612910 for_each_possible_cpu(i) {
2362
- hctx_idx = q->mq_map[i];
2363
- /* unmapped hw queue can be remapped after CPU topo changed */
2364
- if (!set->tags[hctx_idx] &&
2365
- !__blk_mq_alloc_rq_map(set, hctx_idx)) {
2366
- /*
2367
- * If tags initialization fail for some hctx,
2368
- * that hctx won't be brought online. In this
2369
- * case, remap the current ctx to hctx[0] which
2370
- * is guaranteed to always have tags allocated
2371
- */
2372
- q->mq_map[i] = 0;
2373
- }
23742911
23752912 ctx = per_cpu_ptr(q->queue_ctx, i);
2376
- hctx = blk_mq_map_queue(q, i);
2913
+ for (j = 0; j < set->nr_maps; j++) {
2914
+ if (!set->map[j].nr_queues) {
2915
+ ctx->hctxs[j] = blk_mq_map_queue_type(q,
2916
+ HCTX_TYPE_DEFAULT, i);
2917
+ continue;
2918
+ }
2919
+ hctx_idx = set->map[j].mq_map[i];
2920
+ /* unmapped hw queue can be remapped after CPU topo changed */
2921
+ if (!set->tags[hctx_idx] &&
2922
+ !__blk_mq_alloc_map_and_request(set, hctx_idx)) {
2923
+ /*
2924
+ * If tags initialization fails for some hctx,
2925
+ * that hctx won't be brought online. In this
2926
+ * case, remap the current ctx to hctx[0] which
2927
+ * is guaranteed to always have tags allocated
2928
+ */
2929
+ set->map[j].mq_map[i] = 0;
2930
+ }
23772931
2378
- cpumask_set_cpu(i, hctx->cpumask);
2379
- ctx->index_hw = hctx->nr_ctx;
2380
- hctx->ctxs[hctx->nr_ctx++] = ctx;
2932
+ hctx = blk_mq_map_queue_type(q, j, i);
2933
+ ctx->hctxs[j] = hctx;
2934
+ /*
2935
+ * If the CPU is already set in the mask, then we've
2936
+ * mapped this one already. This can happen if
2937
+ * devices share queues across queue maps.
2938
+ */
2939
+ if (cpumask_test_cpu(i, hctx->cpumask))
2940
+ continue;
2941
+
2942
+ cpumask_set_cpu(i, hctx->cpumask);
2943
+ hctx->type = j;
2944
+ ctx->index_hw[hctx->type] = hctx->nr_ctx;
2945
+ hctx->ctxs[hctx->nr_ctx++] = ctx;
2946
+
2947
+ /*
2948
+ * If the nr_ctx type overflows, we have exceeded the
2949
+ * amount of sw queues we can support.
2950
+ */
2951
+ BUG_ON(!hctx->nr_ctx);
2952
+ }
2953
+
2954
+ for (; j < HCTX_MAX_TYPES; j++)
2955
+ ctx->hctxs[j] = blk_mq_map_queue_type(q,
2956
+ HCTX_TYPE_DEFAULT, i);
23812957 }
23822958
23832959 queue_for_each_hw_ctx(q, hctx, i) {
....@@ -2426,14 +3002,14 @@
24263002
24273003 queue_for_each_hw_ctx(q, hctx, i) {
24283004 if (shared)
2429
- hctx->flags |= BLK_MQ_F_TAG_SHARED;
3005
+ hctx->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
24303006 else
2431
- hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
3007
+ hctx->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
24323008 }
24333009 }
24343010
2435
-static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set,
2436
- bool shared)
3011
+static void blk_mq_update_tag_set_shared(struct blk_mq_tag_set *set,
3012
+ bool shared)
24373013 {
24383014 struct request_queue *q;
24393015
....@@ -2451,12 +3027,12 @@
24513027 struct blk_mq_tag_set *set = q->tag_set;
24523028
24533029 mutex_lock(&set->tag_list_lock);
2454
- list_del_rcu(&q->tag_set_list);
3030
+ list_del(&q->tag_set_list);
24553031 if (list_is_singular(&set->tag_list)) {
24563032 /* just transitioned to unshared */
2457
- set->flags &= ~BLK_MQ_F_TAG_SHARED;
3033
+ set->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
24583034 /* update existing queue */
2459
- blk_mq_update_tag_set_depth(set, false);
3035
+ blk_mq_update_tag_set_shared(set, false);
24603036 }
24613037 mutex_unlock(&set->tag_list_lock);
24623038 INIT_LIST_HEAD(&q->tag_set_list);
....@@ -2465,24 +3041,50 @@
24653041 static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
24663042 struct request_queue *q)
24673043 {
2468
- q->tag_set = set;
2469
-
24703044 mutex_lock(&set->tag_list_lock);
24713045
24723046 /*
24733047 * Check to see if we're transitioning to shared (from 1 to 2 queues).
24743048 */
24753049 if (!list_empty(&set->tag_list) &&
2476
- !(set->flags & BLK_MQ_F_TAG_SHARED)) {
2477
- set->flags |= BLK_MQ_F_TAG_SHARED;
3050
+ !(set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
3051
+ set->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
24783052 /* update existing queue */
2479
- blk_mq_update_tag_set_depth(set, true);
3053
+ blk_mq_update_tag_set_shared(set, true);
24803054 }
2481
- if (set->flags & BLK_MQ_F_TAG_SHARED)
3055
+ if (set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
24823056 queue_set_hctx_shared(q, true);
2483
- list_add_tail_rcu(&q->tag_set_list, &set->tag_list);
3057
+ list_add_tail(&q->tag_set_list, &set->tag_list);
24843058
24853059 mutex_unlock(&set->tag_list_lock);
3060
+}
3061
+
3062
+/* All allocations will be freed in release handler of q->mq_kobj */
3063
+static int blk_mq_alloc_ctxs(struct request_queue *q)
3064
+{
3065
+ struct blk_mq_ctxs *ctxs;
3066
+ int cpu;
3067
+
3068
+ ctxs = kzalloc(sizeof(*ctxs), GFP_KERNEL);
3069
+ if (!ctxs)
3070
+ return -ENOMEM;
3071
+
3072
+ ctxs->queue_ctx = alloc_percpu(struct blk_mq_ctx);
3073
+ if (!ctxs->queue_ctx)
3074
+ goto fail;
3075
+
3076
+ for_each_possible_cpu(cpu) {
3077
+ struct blk_mq_ctx *ctx = per_cpu_ptr(ctxs->queue_ctx, cpu);
3078
+ ctx->ctxs = ctxs;
3079
+ }
3080
+
3081
+ q->mq_kobj = &ctxs->kobj;
3082
+ q->queue_ctx = ctxs->queue_ctx;
3083
+
3084
+ return 0;
3085
+ fail:
3086
+ kfree(ctxs);
3087
+ return -ENOMEM;
24863088 }
24873089
24883090 /*
....@@ -2493,17 +3095,17 @@
24933095 */
24943096 void blk_mq_release(struct request_queue *q)
24953097 {
2496
- struct blk_mq_hw_ctx *hctx;
2497
- unsigned int i;
3098
+ struct blk_mq_hw_ctx *hctx, *next;
3099
+ int i;
24983100
2499
- /* hctx kobj stays in hctx */
2500
- queue_for_each_hw_ctx(q, hctx, i) {
2501
- if (!hctx)
2502
- continue;
3101
+ queue_for_each_hw_ctx(q, hctx, i)
3102
+ WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list));
3103
+
3104
+ /* all hctx are in .unused_hctx_list now */
3105
+ list_for_each_entry_safe(hctx, next, &q->unused_hctx_list, hctx_list) {
3106
+ list_del_init(&hctx->hctx_list);
25033107 kobject_put(&hctx->kobj);
25043108 }
2505
-
2506
- q->mq_map = NULL;
25073109
25083110 kfree(q->queue_hw_ctx);
25093111
....@@ -2512,102 +3114,184 @@
25123114 * both share lifetime with request queue.
25133115 */
25143116 blk_mq_sysfs_deinit(q);
2515
-
2516
- free_percpu(q->queue_ctx);
25173117 }
25183118
2519
-struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
3119
+struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
3120
+ void *queuedata)
25203121 {
25213122 struct request_queue *uninit_q, *q;
25223123
2523
- uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node, NULL);
3124
+ uninit_q = blk_alloc_queue(set->numa_node);
25243125 if (!uninit_q)
25253126 return ERR_PTR(-ENOMEM);
3127
+ uninit_q->queuedata = queuedata;
25263128
2527
- q = blk_mq_init_allocated_queue(set, uninit_q);
3129
+ /*
3130
+ * Initialize the queue without an elevator. device_add_disk() will do
3131
+ * the initialization.
3132
+ */
3133
+ q = blk_mq_init_allocated_queue(set, uninit_q, false);
25283134 if (IS_ERR(q))
25293135 blk_cleanup_queue(uninit_q);
25303136
25313137 return q;
25323138 }
3139
+EXPORT_SYMBOL_GPL(blk_mq_init_queue_data);
3140
+
3141
+struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
3142
+{
3143
+ return blk_mq_init_queue_data(set, NULL);
3144
+}
25333145 EXPORT_SYMBOL(blk_mq_init_queue);
25343146
2535
-static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
3147
+/*
3148
+ * Helper for setting up a queue with mq ops, given queue depth, and
3149
+ * the passed in mq ops flags.
3150
+ */
3151
+struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set,
3152
+ const struct blk_mq_ops *ops,
3153
+ unsigned int queue_depth,
3154
+ unsigned int set_flags)
25363155 {
2537
- int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
3156
+ struct request_queue *q;
3157
+ int ret;
25383158
2539
- BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu),
2540
- __alignof__(struct blk_mq_hw_ctx)) !=
2541
- sizeof(struct blk_mq_hw_ctx));
3159
+ memset(set, 0, sizeof(*set));
3160
+ set->ops = ops;
3161
+ set->nr_hw_queues = 1;
3162
+ set->nr_maps = 1;
3163
+ set->queue_depth = queue_depth;
3164
+ set->numa_node = NUMA_NO_NODE;
3165
+ set->flags = set_flags;
25423166
2543
- if (tag_set->flags & BLK_MQ_F_BLOCKING)
2544
- hw_ctx_size += sizeof(struct srcu_struct);
3167
+ ret = blk_mq_alloc_tag_set(set);
3168
+ if (ret)
3169
+ return ERR_PTR(ret);
25453170
2546
- return hw_ctx_size;
3171
+ q = blk_mq_init_queue(set);
3172
+ if (IS_ERR(q)) {
3173
+ blk_mq_free_tag_set(set);
3174
+ return q;
3175
+ }
3176
+
3177
+ return q;
3178
+}
3179
+EXPORT_SYMBOL(blk_mq_init_sq_queue);
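/*
 * Usage sketch for blk_mq_init_sq_queue() above, assuming a simple
 * single-queue driver; foo_tag_set, the queue depth of 64 and the ops
 * argument are illustrative placeholders.
 */
static struct blk_mq_tag_set foo_tag_set;

/* Create the queue; returns an ERR_PTR() on failure, like the helper. */
static struct request_queue *foo_create_queue(const struct blk_mq_ops *ops)
{
	return blk_mq_init_sq_queue(&foo_tag_set, ops, 64,
				    BLK_MQ_F_SHOULD_MERGE);
}

/* Teardown mirrors the helper: the queue first, then the tag set that
 * blk_mq_init_sq_queue() allocated on our behalf. */
static void foo_destroy_queue(struct request_queue *q)
{
	blk_cleanup_queue(q);
	blk_mq_free_tag_set(&foo_tag_set);
}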
3180
+
3181
+static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx(
3182
+ struct blk_mq_tag_set *set, struct request_queue *q,
3183
+ int hctx_idx, int node)
3184
+{
3185
+ struct blk_mq_hw_ctx *hctx = NULL, *tmp;
3186
+
3187
+ /* reuse dead hctx first */
3188
+ spin_lock(&q->unused_hctx_lock);
3189
+ list_for_each_entry(tmp, &q->unused_hctx_list, hctx_list) {
3190
+ if (tmp->numa_node == node) {
3191
+ hctx = tmp;
3192
+ break;
3193
+ }
3194
+ }
3195
+ if (hctx)
3196
+ list_del_init(&hctx->hctx_list);
3197
+ spin_unlock(&q->unused_hctx_lock);
3198
+
3199
+ if (!hctx)
3200
+ hctx = blk_mq_alloc_hctx(q, set, node);
3201
+ if (!hctx)
3202
+ goto fail;
3203
+
3204
+ if (blk_mq_init_hctx(q, set, hctx, hctx_idx))
3205
+ goto free_hctx;
3206
+
3207
+ return hctx;
3208
+
3209
+ free_hctx:
3210
+ kobject_put(&hctx->kobj);
3211
+ fail:
3212
+ return NULL;
25473213 }
25483214
25493215 static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
25503216 struct request_queue *q)
25513217 {
2552
- int i, j;
3218
+ int i, j, end;
25533219 struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;
25543220
2555
- blk_mq_sysfs_unregister(q);
3221
+ if (q->nr_hw_queues < set->nr_hw_queues) {
3222
+ struct blk_mq_hw_ctx **new_hctxs;
3223
+
3224
+ new_hctxs = kcalloc_node(set->nr_hw_queues,
3225
+ sizeof(*new_hctxs), GFP_KERNEL,
3226
+ set->numa_node);
3227
+ if (!new_hctxs)
3228
+ return;
3229
+ if (hctxs)
3230
+ memcpy(new_hctxs, hctxs, q->nr_hw_queues *
3231
+ sizeof(*hctxs));
3232
+ q->queue_hw_ctx = new_hctxs;
3233
+ kfree(hctxs);
3234
+ hctxs = new_hctxs;
3235
+ }
25563236
25573237 /* protect against switching io scheduler */
25583238 mutex_lock(&q->sysfs_lock);
25593239 for (i = 0; i < set->nr_hw_queues; i++) {
25603240 int node;
3241
+ struct blk_mq_hw_ctx *hctx;
25613242
2562
- if (hctxs[i])
3243
+ node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], i);
3244
+ /*
3245
+ * If the hw queue has been mapped to another numa node,
3246
+ * we need to realloc the hctx. If allocation fails, fallback
3247
+ * to use the previous one.
3248
+ */
3249
+ if (hctxs[i] && (hctxs[i]->numa_node == node))
25633250 continue;
25643251
2565
- node = blk_mq_hw_queue_to_node(q->mq_map, i);
2566
- hctxs[i] = kzalloc_node(blk_mq_hw_ctx_size(set),
2567
- GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
2568
- node);
2569
- if (!hctxs[i])
2570
- break;
2571
-
2572
- if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask,
2573
- GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
2574
- node)) {
2575
- kfree(hctxs[i]);
2576
- hctxs[i] = NULL;
2577
- break;
3252
+ hctx = blk_mq_alloc_and_init_hctx(set, q, i, node);
3253
+ if (hctx) {
3254
+ if (hctxs[i])
3255
+ blk_mq_exit_hctx(q, set, hctxs[i], i);
3256
+ hctxs[i] = hctx;
3257
+ } else {
3258
+ if (hctxs[i])
3259
+ pr_warn("Allocate new hctx on node %d fails,\
3260
+ fallback to previous one on node %d\n",
3261
+ node, hctxs[i]->numa_node);
3262
+ else
3263
+ break;
25783264 }
2579
-
2580
- atomic_set(&hctxs[i]->nr_active, 0);
2581
- hctxs[i]->numa_node = node;
2582
- hctxs[i]->queue_num = i;
2583
-
2584
- if (blk_mq_init_hctx(q, set, hctxs[i], i)) {
2585
- free_cpumask_var(hctxs[i]->cpumask);
2586
- kfree(hctxs[i]);
2587
- hctxs[i] = NULL;
2588
- break;
2589
- }
2590
- blk_mq_hctx_kobj_init(hctxs[i]);
25913265 }
2592
- for (j = i; j < q->nr_hw_queues; j++) {
3266
+ /*
3267
+ * Increasing nr_hw_queues fails. Free the newly allocated
3268
+ * hctxs and keep the previous q->nr_hw_queues.
3269
+ */
3270
+ if (i != set->nr_hw_queues) {
3271
+ j = q->nr_hw_queues;
3272
+ end = i;
3273
+ } else {
3274
+ j = i;
3275
+ end = q->nr_hw_queues;
3276
+ q->nr_hw_queues = set->nr_hw_queues;
3277
+ }
3278
+
3279
+ for (; j < end; j++) {
25933280 struct blk_mq_hw_ctx *hctx = hctxs[j];
25943281
25953282 if (hctx) {
25963283 if (hctx->tags)
25973284 blk_mq_free_map_and_requests(set, j);
25983285 blk_mq_exit_hctx(q, set, hctx, j);
2599
- kobject_put(&hctx->kobj);
26003286 hctxs[j] = NULL;
2601
-
26023287 }
26033288 }
2604
- q->nr_hw_queues = i;
26053289 mutex_unlock(&q->sysfs_lock);
2606
- blk_mq_sysfs_register(q);
26073290 }
26083291
26093292 struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
2610
- struct request_queue *q)
3293
+ struct request_queue *q,
3294
+ bool elevator_init)
26113295 {
26123296 /* mark the queue as mq asap */
26133297 q->mq_ops = set->ops;
....@@ -2618,19 +3302,14 @@
26183302 if (!q->poll_cb)
26193303 goto err_exit;
26203304
2621
- q->queue_ctx = alloc_percpu(struct blk_mq_ctx);
2622
- if (!q->queue_ctx)
2623
- goto err_exit;
3305
+ if (blk_mq_alloc_ctxs(q))
3306
+ goto err_poll;
26243307
26253308 /* init q->mq_kobj and sw queues' kobjects */
26263309 blk_mq_sysfs_init(q);
26273310
2628
- q->queue_hw_ctx = kcalloc_node(nr_cpu_ids, sizeof(*(q->queue_hw_ctx)),
2629
- GFP_KERNEL, set->numa_node);
2630
- if (!q->queue_hw_ctx)
2631
- goto err_percpu;
2632
-
2633
- q->mq_map = set->mq_map;
3311
+ INIT_LIST_HEAD(&q->unused_hctx_list);
3312
+ spin_lock_init(&q->unused_hctx_lock);
26343313
26353314 blk_mq_realloc_hw_ctxs(set, q);
26363315 if (!q->nr_hw_queues)
....@@ -2639,12 +3318,12 @@
26393318 INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
26403319 blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
26413320
2642
- q->nr_queues = nr_cpu_ids;
3321
+ q->tag_set = set;
26433322
26443323 q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
2645
-
2646
- if (!(set->flags & BLK_MQ_F_SG_MERGE))
2647
- queue_flag_set_unlocked(QUEUE_FLAG_NO_SG_MERGE, q);
3324
+ if (set->nr_maps > HCTX_TYPE_POLL &&
3325
+ set->map[HCTX_TYPE_POLL].nr_queues)
3326
+ blk_queue_flag_set(QUEUE_FLAG_POLL, q);
26483327
26493328 q->sg_reserved_size = INT_MAX;
26503329
....@@ -2652,41 +3331,29 @@
26523331 INIT_LIST_HEAD(&q->requeue_list);
26533332 spin_lock_init(&q->requeue_lock);
26543333
2655
- blk_queue_make_request(q, blk_mq_make_request);
2656
- if (q->mq_ops->poll)
2657
- q->poll_fn = blk_mq_poll;
2658
-
2659
- /*
2660
- * Do this after blk_queue_make_request() overrides it...
2661
- */
26623334 q->nr_requests = set->queue_depth;
26633335
26643336 /*
26653337 * Default to classic polling
26663338 */
2667
- q->poll_nsec = -1;
2668
-
2669
- if (set->ops->complete)
2670
- blk_queue_softirq_done(q, set->ops->complete);
3339
+ q->poll_nsec = BLK_MQ_POLL_CLASSIC;
26713340
26723341 blk_mq_init_cpu_queues(q, set->nr_hw_queues);
26733342 blk_mq_add_queue_tag_set(set, q);
26743343 blk_mq_map_swqueue(q);
26753344
2676
- if (!(set->flags & BLK_MQ_F_NO_SCHED)) {
2677
- int ret;
2678
-
2679
- ret = elevator_init_mq(q);
2680
- if (ret)
2681
- return ERR_PTR(ret);
2682
- }
3345
+ if (elevator_init)
3346
+ elevator_init_mq(q);
26833347
26843348 return q;
26853349
26863350 err_hctxs:
26873351 kfree(q->queue_hw_ctx);
2688
-err_percpu:
2689
- free_percpu(q->queue_ctx);
3352
+ q->nr_hw_queues = 0;
3353
+ blk_mq_sysfs_deinit(q);
3354
+err_poll:
3355
+ blk_stat_free_callback(q->poll_cb);
3356
+ q->poll_cb = NULL;
26903357 err_exit:
26913358 q->mq_ops = NULL;
26923359 return ERR_PTR(-ENOMEM);
....@@ -2704,38 +3371,21 @@
27043371 blk_mq_del_queue_tag_set(q);
27053372 }
27063373
2707
-/* Basically redo blk_mq_init_queue with queue frozen */
2708
-static void blk_mq_queue_reinit(struct request_queue *q)
2709
-{
2710
- WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));
2711
-
2712
- blk_mq_debugfs_unregister_hctxs(q);
2713
- blk_mq_sysfs_unregister(q);
2714
-
2715
- /*
2716
- * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
2717
- * we should change hctx numa_node according to the new topology (this
2718
- * involves freeing and re-allocating memory, worth doing?)
2719
- */
2720
- blk_mq_map_swqueue(q);
2721
-
2722
- blk_mq_sysfs_register(q);
2723
- blk_mq_debugfs_register_hctxs(q);
2724
-}
2725
-
27263374 static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
27273375 {
27283376 int i;
27293377
2730
- for (i = 0; i < set->nr_hw_queues; i++)
2731
- if (!__blk_mq_alloc_rq_map(set, i))
3378
+ for (i = 0; i < set->nr_hw_queues; i++) {
3379
+ if (!__blk_mq_alloc_map_and_request(set, i))
27323380 goto out_unwind;
3381
+ cond_resched();
3382
+ }
27333383
27343384 return 0;
27353385
27363386 out_unwind:
27373387 while (--i >= 0)
2738
- blk_mq_free_rq_map(set->tags[i]);
3388
+ blk_mq_free_map_and_requests(set, i);
27393389
27403390 return -ENOMEM;
27413391 }
....@@ -2745,7 +3395,7 @@
27453395 * may reduce the depth asked for, if memory is tight. set->queue_depth
27463396 * will be updated to reflect the allocated depth.
27473397 */
2748
-static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
3398
+static int blk_mq_alloc_map_and_requests(struct blk_mq_tag_set *set)
27493399 {
27503400 unsigned int depth;
27513401 int err;
....@@ -2777,7 +3427,17 @@
27773427
27783428 static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
27793429 {
2780
- if (set->ops->map_queues) {
3430
+ /*
3431
+ * blk_mq_map_queues() and multiple .map_queues() implementations
3432
+ * expect that set->map[HCTX_TYPE_DEFAULT].nr_queues is set to the
3433
+ * number of hardware queues.
3434
+ */
3435
+ if (set->nr_maps == 1)
3436
+ set->map[HCTX_TYPE_DEFAULT].nr_queues = set->nr_hw_queues;
3437
+
3438
+ if (set->ops->map_queues && !is_kdump_kernel()) {
3439
+ int i;
3440
+
27813441 /*
27823442 * transport .map_queues is usually done in the following
27833443 * way:
....@@ -2785,18 +3445,44 @@
27853445 * for (queue = 0; queue < set->nr_hw_queues; queue++) {
27863446 * mask = get_cpu_mask(queue)
27873447 * for_each_cpu(cpu, mask)
2788
- * set->mq_map[cpu] = queue;
3448
+ * set->map[x].mq_map[cpu] = queue;
27893449 * }
27903450 *
27913451 * When we need to remap, the table has to be cleared for
27923452 * killing stale mapping since one CPU may not be mapped
27933453 * to any hw queue.
27943454 */
2795
- blk_mq_clear_mq_map(set);
3455
+ for (i = 0; i < set->nr_maps; i++)
3456
+ blk_mq_clear_mq_map(&set->map[i]);
27963457
27973458 return set->ops->map_queues(set);
2798
- } else
2799
- return blk_mq_map_queues(set);
3459
+ } else {
3460
+ BUG_ON(set->nr_maps > 1);
3461
+ return blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
3462
+ }
3463
+}
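/*
 * A possible .map_queues implementation matching the pattern described in
 * the comment above, sketched under the assumption of a PCI device whose
 * MSI-X vectors were spread across CPUs at probe time; "foo" and its
 * admin-vector offset are placeholders, not a real driver.
 */
#include <linux/blk-mq.h>
#include <linux/blk-mq-pci.h>

struct foo_dev {
	struct pci_dev *pdev;
};

static int foo_map_queues(struct blk_mq_tag_set *set)
{
	struct foo_dev *foo = set->driver_data;

	/*
	 * Follow the device's IRQ affinity so each CPU submits on the hw
	 * queue whose interrupt it services; vector 0 is assumed to be
	 * reserved for admin work, hence the offset of 1.
	 */
	return blk_mq_pci_map_queues(&set->map[HCTX_TYPE_DEFAULT],
				     foo->pdev, 1);
}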
3464
+
3465
+static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set,
3466
+ int cur_nr_hw_queues, int new_nr_hw_queues)
3467
+{
3468
+ struct blk_mq_tags **new_tags;
3469
+
3470
+ if (cur_nr_hw_queues >= new_nr_hw_queues)
3471
+ return 0;
3472
+
3473
+ new_tags = kcalloc_node(new_nr_hw_queues, sizeof(struct blk_mq_tags *),
3474
+ GFP_KERNEL, set->numa_node);
3475
+ if (!new_tags)
3476
+ return -ENOMEM;
3477
+
3478
+ if (set->tags)
3479
+ memcpy(new_tags, set->tags, cur_nr_hw_queues *
3480
+ sizeof(*set->tags));
3481
+ kfree(set->tags);
3482
+ set->tags = new_tags;
3483
+ set->nr_hw_queues = new_nr_hw_queues;
3484
+
3485
+ return 0;
28003486 }
28013487
28023488 /*
....@@ -2807,7 +3493,7 @@
28073493 */
28083494 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
28093495 {
2810
- int ret;
3496
+ int i, ret;
28113497
28123498 BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);
28133499
....@@ -2830,6 +3516,11 @@
28303516 set->queue_depth = BLK_MQ_MAX_DEPTH;
28313517 }
28323518
3519
+ if (!set->nr_maps)
3520
+ set->nr_maps = 1;
3521
+ else if (set->nr_maps > HCTX_MAX_TYPES)
3522
+ return -EINVAL;
3523
+
28333524 /*
28343525 * If a crashdump is active, then we are potentially in a very
28353526 * memory constrained environment. Limit us to 1 queue and
....@@ -2837,42 +3528,59 @@
28373528 */
28383529 if (is_kdump_kernel()) {
28393530 set->nr_hw_queues = 1;
3531
+ set->nr_maps = 1;
28403532 set->queue_depth = min(64U, set->queue_depth);
28413533 }
28423534 /*
2843
- * There is no use for more h/w queues than cpus.
3535
+ * There is no use for more h/w queues than cpus if we just have
3536
+ * a single map
28443537 */
2845
- if (set->nr_hw_queues > nr_cpu_ids)
3538
+ if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids)
28463539 set->nr_hw_queues = nr_cpu_ids;
28473540
2848
- set->tags = kcalloc_node(nr_cpu_ids, sizeof(struct blk_mq_tags *),
2849
- GFP_KERNEL, set->numa_node);
2850
- if (!set->tags)
3541
+ if (blk_mq_realloc_tag_set_tags(set, 0, set->nr_hw_queues) < 0)
28513542 return -ENOMEM;
28523543
28533544 ret = -ENOMEM;
2854
- set->mq_map = kcalloc_node(nr_cpu_ids, sizeof(*set->mq_map),
2855
- GFP_KERNEL, set->numa_node);
2856
- if (!set->mq_map)
2857
- goto out_free_tags;
3545
+ for (i = 0; i < set->nr_maps; i++) {
3546
+ set->map[i].mq_map = kcalloc_node(nr_cpu_ids,
3547
+ sizeof(set->map[i].mq_map[0]),
3548
+ GFP_KERNEL, set->numa_node);
3549
+ if (!set->map[i].mq_map)
3550
+ goto out_free_mq_map;
3551
+ set->map[i].nr_queues = is_kdump_kernel() ? 1 : set->nr_hw_queues;
3552
+ }
28583553
28593554 ret = blk_mq_update_queue_map(set);
28603555 if (ret)
28613556 goto out_free_mq_map;
28623557
2863
- ret = blk_mq_alloc_rq_maps(set);
3558
+ ret = blk_mq_alloc_map_and_requests(set);
28643559 if (ret)
28653560 goto out_free_mq_map;
3561
+
3562
+ if (blk_mq_is_sbitmap_shared(set->flags)) {
3563
+ atomic_set(&set->active_queues_shared_sbitmap, 0);
3564
+
3565
+ if (blk_mq_init_shared_sbitmap(set, set->flags)) {
3566
+ ret = -ENOMEM;
3567
+ goto out_free_mq_rq_maps;
3568
+ }
3569
+ }
28663570
28673571 mutex_init(&set->tag_list_lock);
28683572 INIT_LIST_HEAD(&set->tag_list);
28693573
28703574 return 0;
28713575
3576
+out_free_mq_rq_maps:
3577
+ for (i = 0; i < set->nr_hw_queues; i++)
3578
+ blk_mq_free_map_and_requests(set, i);
28723579 out_free_mq_map:
2873
- kfree(set->mq_map);
2874
- set->mq_map = NULL;
2875
-out_free_tags:
3580
+ for (i = 0; i < set->nr_maps; i++) {
3581
+ kfree(set->map[i].mq_map);
3582
+ set->map[i].mq_map = NULL;
3583
+ }
28763584 kfree(set->tags);
28773585 set->tags = NULL;
28783586 return ret;
....@@ -2881,13 +3589,18 @@
28813589
28823590 void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
28833591 {
2884
- int i;
3592
+ int i, j;
28853593
2886
- for (i = 0; i < nr_cpu_ids; i++)
3594
+ for (i = 0; i < set->nr_hw_queues; i++)
28873595 blk_mq_free_map_and_requests(set, i);
28883596
2889
- kfree(set->mq_map);
2890
- set->mq_map = NULL;
3597
+ if (blk_mq_is_sbitmap_shared(set->flags))
3598
+ blk_mq_exit_shared_sbitmap(set);
3599
+
3600
+ for (j = 0; j < set->nr_maps; j++) {
3601
+ kfree(set->map[j].mq_map);
3602
+ set->map[j].mq_map = NULL;
3603
+ }
28913604
28923605 kfree(set->tags);
28933606 set->tags = NULL;
....@@ -2903,6 +3616,9 @@
29033616 if (!set)
29043617 return -EINVAL;
29053618
3619
+ if (q->nr_requests == nr)
3620
+ return 0;
3621
+
29063622 blk_mq_freeze_queue(q);
29073623 blk_mq_quiesce_queue(q);
29083624
....@@ -2917,14 +3633,16 @@
29173633 if (!hctx->sched_tags) {
29183634 ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr,
29193635 false);
3636
+ if (!ret && blk_mq_is_sbitmap_shared(set->flags))
3637
+ blk_mq_tag_resize_shared_sbitmap(set, nr);
29203638 } else {
29213639 ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
29223640 nr, true);
29233641 }
29243642 if (ret)
29253643 break;
2926
- if (q->elevator && q->elevator->type->ops.mq.depth_updated)
2927
- q->elevator->type->ops.mq.depth_updated(hctx);
3644
+ if (q->elevator && q->elevator->type->ops.depth_updated)
3645
+ q->elevator->type->ops.depth_updated(hctx);
29283646 }
29293647
29303648 if (!ret)
....@@ -3011,20 +3729,19 @@
30113729 {
30123730 struct request_queue *q;
30133731 LIST_HEAD(head);
3732
+ int prev_nr_hw_queues;
30143733
30153734 lockdep_assert_held(&set->tag_list_lock);
30163735
3017
- if (nr_hw_queues > nr_cpu_ids)
3736
+ if (set->nr_maps == 1 && nr_hw_queues > nr_cpu_ids)
30183737 nr_hw_queues = nr_cpu_ids;
3019
- if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues)
3738
+ if (nr_hw_queues < 1)
3739
+ return;
3740
+ if (set->nr_maps == 1 && nr_hw_queues == set->nr_hw_queues)
30203741 return;
30213742
30223743 list_for_each_entry(q, &set->tag_list, tag_set_list)
30233744 blk_mq_freeze_queue(q);
3024
- /*
3025
- * Sync with blk_mq_queue_tag_busy_iter.
3026
- */
3027
- synchronize_rcu();
30283745 /*
30293746 * Switch IO scheduler to 'none', cleaning up the data associated
30303747 * with the previous scheduler. We will switch back once we are done
....@@ -3034,11 +3751,35 @@
30343751 if (!blk_mq_elv_switch_none(&head, q))
30353752 goto switch_back;
30363753
3754
+ list_for_each_entry(q, &set->tag_list, tag_set_list) {
3755
+ blk_mq_debugfs_unregister_hctxs(q);
3756
+ blk_mq_sysfs_unregister(q);
3757
+ }
3758
+
3759
+ prev_nr_hw_queues = set->nr_hw_queues;
3760
+ if (blk_mq_realloc_tag_set_tags(set, set->nr_hw_queues, nr_hw_queues) <
3761
+ 0)
3762
+ goto reregister;
3763
+
30373764 set->nr_hw_queues = nr_hw_queues;
3765
+fallback:
30383766 blk_mq_update_queue_map(set);
30393767 list_for_each_entry(q, &set->tag_list, tag_set_list) {
30403768 blk_mq_realloc_hw_ctxs(set, q);
3041
- blk_mq_queue_reinit(q);
3769
+ if (q->nr_hw_queues != set->nr_hw_queues) {
3770
+ pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n",
3771
+ nr_hw_queues, prev_nr_hw_queues);
3772
+ set->nr_hw_queues = prev_nr_hw_queues;
3773
+ blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
3774
+ goto fallback;
3775
+ }
3776
+ blk_mq_map_swqueue(q);
3777
+ }
3778
+
3779
+reregister:
3780
+ list_for_each_entry(q, &set->tag_list, tag_set_list) {
3781
+ blk_mq_sysfs_register(q);
3782
+ blk_mq_debugfs_register_hctxs(q);
30423783 }
30433784
30443785 switch_back:
....@@ -3092,7 +3833,6 @@
30923833 }
30933834
30943835 static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
3095
- struct blk_mq_hw_ctx *hctx,
30963836 struct request *rq)
30973837 {
30983838 unsigned long ret = 0;
....@@ -3125,7 +3865,6 @@
31253865 }
31263866
31273867 static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
3128
- struct blk_mq_hw_ctx *hctx,
31293868 struct request *rq)
31303869 {
31313870 struct hrtimer_sleeper hs;
....@@ -3137,18 +3876,15 @@
31373876 return false;
31383877
31393878 /*
3140
- * poll_nsec can be:
3879
+ * If we get here, hybrid polling is enabled. Hence poll_nsec can be:
31413880 *
3142
- * -1: don't ever hybrid sleep
31433881 * 0: use half of prev avg
31443882 * >0: use this specific value
31453883 */
3146
- if (q->poll_nsec == -1)
3147
- return false;
3148
- else if (q->poll_nsec > 0)
3884
+ if (q->poll_nsec > 0)
31493885 nsecs = q->poll_nsec;
31503886 else
3151
- nsecs = blk_mq_poll_nsecs(q, hctx, rq);
3887
+ nsecs = blk_mq_poll_nsecs(q, rq);
31523888
31533889 if (!nsecs)
31543890 return false;
....@@ -3162,14 +3898,14 @@
31623898 kt = nsecs;
31633899
31643900 mode = HRTIMER_MODE_REL;
3165
- hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, mode, current);
3901
+ hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, mode);
31663902 hrtimer_set_expires(&hs.timer, kt);
31673903
31683904 do {
31693905 if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE)
31703906 break;
31713907 set_current_state(TASK_UNINTERRUPTIBLE);
3172
- hrtimer_start_expires(&hs.timer, mode);
3908
+ hrtimer_sleeper_start_expires(&hs, mode);
31733909 if (hs.task)
31743910 io_schedule();
31753911 hrtimer_cancel(&hs.timer);
....@@ -3181,59 +3917,14 @@
31813917 return true;
31823918 }
31833919
3184
-static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq)
3920
+static bool blk_mq_poll_hybrid(struct request_queue *q,
3921
+ struct blk_mq_hw_ctx *hctx, blk_qc_t cookie)
31853922 {
3186
- struct request_queue *q = hctx->queue;
3187
- long state;
3188
-
3189
- /*
3190
- * If we sleep, have the caller restart the poll loop to reset
3191
- * the state. Like for the other success return cases, the
3192
- * caller is responsible for checking if the IO completed. If
3193
- * the IO isn't complete, we'll get called again and will go
3194
- * straight to the busy poll loop.
3195
- */
3196
- if (blk_mq_poll_hybrid_sleep(q, hctx, rq))
3197
- return true;
3198
-
3199
- hctx->poll_considered++;
3200
-
3201
- state = current->state;
3202
- while (!need_resched()) {
3203
- int ret;
3204
-
3205
- hctx->poll_invoked++;
3206
-
3207
- ret = q->mq_ops->poll(hctx, rq->tag);
3208
- if (ret > 0) {
3209
- hctx->poll_success++;
3210
- set_current_state(TASK_RUNNING);
3211
- return true;
3212
- }
3213
-
3214
- if (signal_pending_state(state, current))
3215
- set_current_state(TASK_RUNNING);
3216
-
3217
- if (current->state == TASK_RUNNING)
3218
- return true;
3219
- if (ret < 0)
3220
- break;
3221
- cpu_relax();
3222
- }
3223
-
3224
- __set_current_state(TASK_RUNNING);
3225
- return false;
3226
-}
3227
-
3228
-static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
3229
-{
3230
- struct blk_mq_hw_ctx *hctx;
32313923 struct request *rq;
32323924
3233
- if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
3925
+ if (q->poll_nsec == BLK_MQ_POLL_CLASSIC)
32343926 return false;
32353927
3236
- hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
32373928 if (!blk_qc_t_is_internal(cookie))
32383929 rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
32393930 else {
....@@ -3248,13 +3939,97 @@
32483939 return false;
32493940 }
32503941
3251
- return __blk_mq_poll(hctx, rq);
3942
+ return blk_mq_poll_hybrid_sleep(q, rq);
32523943 }
3944
+
3945
+/**
3946
+ * blk_poll - poll for IO completions
3947
+ * @q: the queue
3948
+ * @cookie: cookie passed back at IO submission time
3949
+ * @spin: whether to spin for completions
3950
+ *
3951
+ * Description:
3952
+ * Poll for completions on the passed in queue. Returns number of
3953
+ * completed entries found. If @spin is true, then blk_poll will continue
3954
+ * looping until at least one completion is found, unless the task is
3955
+ * otherwise marked running (or we need to reschedule).
3956
+ */
3957
+int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
3958
+{
3959
+ struct blk_mq_hw_ctx *hctx;
3960
+ long state;
3961
+
3962
+ if (!blk_qc_t_valid(cookie) ||
3963
+ !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
3964
+ return 0;
3965
+
3966
+ if (current->plug)
3967
+ blk_flush_plug_list(current->plug, false);
3968
+
3969
+ hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
3970
+
3971
+ /*
3972
+ * If we sleep, have the caller restart the poll loop to reset
3973
+ * the state. Like for the other success return cases, the
3974
+ * caller is responsible for checking if the IO completed. If
3975
+ * the IO isn't complete, we'll get called again and will go
3976
+ * straight to the busy poll loop.
3977
+ */
3978
+ if (blk_mq_poll_hybrid(q, hctx, cookie))
3979
+ return 1;
3980
+
3981
+ hctx->poll_considered++;
3982
+
3983
+ state = current->state;
3984
+ do {
3985
+ int ret;
3986
+
3987
+ hctx->poll_invoked++;
3988
+
3989
+ ret = q->mq_ops->poll(hctx);
3990
+ if (ret > 0) {
3991
+ hctx->poll_success++;
3992
+ __set_current_state(TASK_RUNNING);
3993
+ return ret;
3994
+ }
3995
+
3996
+ if (signal_pending_state(state, current))
3997
+ __set_current_state(TASK_RUNNING);
3998
+
3999
+ if (current->state == TASK_RUNNING)
4000
+ return 1;
4001
+ if (ret < 0 || !spin)
4002
+ break;
4003
+ cpu_relax();
4004
+ } while (!need_resched());
4005
+
4006
+ __set_current_state(TASK_RUNNING);
4007
+ return 0;
4008
+}
4009
+EXPORT_SYMBOL_GPL(blk_poll);
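/*
 * Caller-side sketch of blk_poll() above, modelled on the pattern used by
 * polled direct I/O; demo_* names are placeholders, @bio is assumed to stay
 * alive (e.g. on the caller's stack) until this returns, and the queue is
 * assumed to have QUEUE_FLAG_POLL set.
 */
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/sched.h>

static void demo_poll_end_io(struct bio *bio)
{
	struct task_struct *waiter = bio->bi_private;

	WRITE_ONCE(bio->bi_private, NULL);
	wake_up_process(waiter);	/* in case the submitter stopped polling */
}

/* Submit @bio with REQ_HIPRI and busy-poll its queue until it completes. */
static void demo_submit_and_poll(struct request_queue *q, struct bio *bio)
{
	blk_qc_t cookie;

	bio->bi_opf |= REQ_HIPRI;
	bio->bi_private = current;
	bio->bi_end_io = demo_poll_end_io;
	cookie = submit_bio(bio);

	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (!READ_ONCE(bio->bi_private))
			break;
		if (!blk_poll(q, cookie, true))
			io_schedule();
	}
	__set_current_state(TASK_RUNNING);
}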
4010
+
4011
+unsigned int blk_mq_rq_cpu(struct request *rq)
4012
+{
4013
+ return rq->mq_ctx->cpu;
4014
+}
4015
+EXPORT_SYMBOL(blk_mq_rq_cpu);
32534016
32544017 static int __init blk_mq_init(void)
32554018 {
4019
+ int i;
4020
+
4021
+ for_each_possible_cpu(i)
4022
+ init_llist_head(&per_cpu(blk_cpu_done, i));
4023
+ open_softirq(BLOCK_SOFTIRQ, blk_done_softirq);
4024
+
4025
+ cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD,
4026
+ "block/softirq:dead", NULL,
4027
+ blk_softirq_cpu_dead);
32564028 cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
32574029 blk_mq_hctx_notify_dead);
4030
+ cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online",
4031
+ blk_mq_hctx_notify_online,
4032
+ blk_mq_hctx_notify_offline);
32584033 return 0;
32594034 }
32604035 subsys_initcall(blk_mq_init);