2024-02-20 102a0743326a03cd1a1202ceda21e175b7d3575c
kernel/block/blk-mq.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * Block multiqueue core code
  *
@@ -25,30 +26,36 @@
 #include <linux/delay.h>
 #include <linux/crash_dump.h>
 #include <linux/prefetch.h>
+#include <linux/blk-crypto.h>
 
 #include <trace/events/block.h>
 
 #include <linux/blk-mq.h>
+#include <linux/t10-pi.h>
 #include "blk.h"
 #include "blk-mq.h"
 #include "blk-mq-debugfs.h"
 #include "blk-mq-tag.h"
+#include "blk-pm.h"
 #include "blk-stat.h"
 #include "blk-mq-sched.h"
 #include "blk-rq-qos.h"
 
-static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie);
+#include <trace/hooks/block.h>
+
+static DEFINE_PER_CPU(struct list_head, blk_cpu_done);
+
 static void blk_mq_poll_stats_start(struct request_queue *q);
 static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
 
 static int blk_mq_poll_stats_bkt(const struct request *rq)
 {
-	int ddir, bytes, bucket;
+	int ddir, sectors, bucket;
 
 	ddir = rq_data_dir(rq);
-	bytes = blk_rq_bytes(rq);
+	sectors = blk_rq_stats_sectors(rq);
 
-	bucket = ddir + 2*(ilog2(bytes) - 9);
+	bucket = ddir + 2 * ilog2(sectors);
 
 	if (bucket < 0)
 		return -1;
@@ -59,7 +66,8 @@
 }
 
 /*
- * Check if any of the ctx's have pending work in this hardware queue
+ * Check if any of the ctx, dispatch list or elevator
+ * have pending work in this hardware queue.
  */
 static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
 {
@@ -74,75 +82,67 @@
 static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
 				     struct blk_mq_ctx *ctx)
 {
-	if (!sbitmap_test_bit(&hctx->ctx_map, ctx->index_hw))
-		sbitmap_set_bit(&hctx->ctx_map, ctx->index_hw);
+	const int bit = ctx->index_hw[hctx->type];
+
+	if (!sbitmap_test_bit(&hctx->ctx_map, bit))
+		sbitmap_set_bit(&hctx->ctx_map, bit);
 }
 
 static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
 				      struct blk_mq_ctx *ctx)
 {
-	sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw);
+	const int bit = ctx->index_hw[hctx->type];
+
+	sbitmap_clear_bit(&hctx->ctx_map, bit);
 }
 
 struct mq_inflight {
 	struct hd_struct *part;
-	unsigned int *inflight;
+	unsigned int inflight[2];
 };
 
-static void blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
+static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
 				  struct request *rq, void *priv,
 				  bool reserved)
 {
 	struct mq_inflight *mi = priv;
 
-	/*
-	 * index[0] counts the specific partition that was asked for. index[1]
-	 * counts the ones that are active on the whole device, so increment
-	 * that if mi->part is indeed a partition, and not a whole device.
-	 */
-	if (rq->part == mi->part)
-		mi->inflight[0]++;
-	if (mi->part->partno)
-		mi->inflight[1]++;
-}
-
-void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part,
-		      unsigned int inflight[2])
-{
-	struct mq_inflight mi = { .part = part, .inflight = inflight, };
-
-	inflight[0] = inflight[1] = 0;
-	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
-}
-
-static void blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx,
-				     struct request *rq, void *priv,
-				     bool reserved)
-{
-	struct mq_inflight *mi = priv;
-
-	if (rq->part == mi->part)
+	if ((!mi->part->partno || rq->part == mi->part) &&
+	    blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT)
 		mi->inflight[rq_data_dir(rq)]++;
+
+	return true;
+}
+
+unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part)
+{
+	struct mq_inflight mi = { .part = part };
+
+	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
+
+	return mi.inflight[0] + mi.inflight[1];
 }
 
 void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part,
 			 unsigned int inflight[2])
 {
-	struct mq_inflight mi = { .part = part, .inflight = inflight, };
+	struct mq_inflight mi = { .part = part };
 
-	inflight[0] = inflight[1] = 0;
-	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight_rw, &mi);
+	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
+	inflight[0] = mi.inflight[0];
+	inflight[1] = mi.inflight[1];
 }
 
 void blk_freeze_queue_start(struct request_queue *q)
 {
-	int freeze_depth;
-
-	freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
-	if (freeze_depth == 1) {
+	mutex_lock(&q->mq_freeze_lock);
+	if (++q->mq_freeze_depth == 1) {
 		percpu_ref_kill(&q->q_usage_counter);
-		if (q->mq_ops)
+		mutex_unlock(&q->mq_freeze_lock);
+		if (queue_is_mq(q))
 			blk_mq_run_hw_queues(q, false);
+	} else {
+		mutex_unlock(&q->mq_freeze_lock);
 	}
 }
 EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
@@ -176,8 +176,6 @@
 	 * exported to drivers as the only user for unfreeze is blk_mq.
 	 */
 	blk_freeze_queue_start(q);
-	if (!q->mq_ops)
-		blk_drain_queue(q);
 	blk_mq_freeze_queue_wait(q);
 }
 
@@ -193,14 +191,14 @@
 
 void blk_mq_unfreeze_queue(struct request_queue *q)
 {
-	int freeze_depth;
-
-	freeze_depth = atomic_dec_return(&q->mq_freeze_depth);
-	WARN_ON_ONCE(freeze_depth < 0);
-	if (!freeze_depth) {
-		percpu_ref_reinit(&q->q_usage_counter);
+	mutex_lock(&q->mq_freeze_lock);
+	q->mq_freeze_depth--;
+	WARN_ON_ONCE(q->mq_freeze_depth < 0);
+	if (!q->mq_freeze_depth) {
+		percpu_ref_resurrect(&q->q_usage_counter);
 		wake_up_all(&q->mq_freeze_wq);
 	}
+	mutex_unlock(&q->mq_freeze_lock);
 }
 EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
 
@@ -268,40 +266,37 @@
 			blk_mq_tag_wakeup_all(hctx->tags, true);
 }
 
-bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
+/*
+ * Only need start/end time stamping if we have iostat or
+ * blk stats enabled, or using an IO scheduler.
+ */
+static inline bool blk_mq_need_time_stamp(struct request *rq)
 {
-	return blk_mq_has_free_tags(hctx->tags);
+	return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS)) || rq->q->elevator;
 }
-EXPORT_SYMBOL(blk_mq_can_queue);
 
 static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
-		unsigned int tag, unsigned int op)
+		unsigned int tag, u64 alloc_time_ns)
 {
 	struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
 	struct request *rq = tags->static_rqs[tag];
-	req_flags_t rq_flags = 0;
 
-	if (data->flags & BLK_MQ_REQ_INTERNAL) {
-		rq->tag = -1;
+	if (data->q->elevator) {
+		rq->tag = BLK_MQ_NO_TAG;
 		rq->internal_tag = tag;
 	} else {
-		if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) {
-			rq_flags = RQF_MQ_INFLIGHT;
-			atomic_inc(&data->hctx->nr_active);
-		}
 		rq->tag = tag;
-		rq->internal_tag = -1;
-		data->hctx->tags->rqs[rq->tag] = rq;
+		rq->internal_tag = BLK_MQ_NO_TAG;
 	}
 
 	/* csd/requeue_work/fifo_time is initialized before use */
 	rq->q = data->q;
 	rq->mq_ctx = data->ctx;
-	rq->rq_flags = rq_flags;
-	rq->cpu = -1;
-	rq->cmd_flags = op;
-	if (data->flags & BLK_MQ_REQ_PREEMPT)
-		rq->rq_flags |= RQF_PREEMPT;
+	rq->mq_hctx = data->hctx;
+	rq->rq_flags = 0;
+	rq->cmd_flags = data->cmd_flags;
+	if (data->flags & BLK_MQ_REQ_PM)
+		rq->rq_flags |= RQF_PM;
 	if (blk_queue_io_stat(data->q))
 		rq->rq_flags |= RQF_IO_STAT;
 	INIT_LIST_HEAD(&rq->queuelist);
@@ -309,97 +304,110 @@
 	RB_CLEAR_NODE(&rq->rb_node);
 	rq->rq_disk = NULL;
 	rq->part = NULL;
-	rq->start_time_ns = ktime_get_ns();
+#ifdef CONFIG_BLK_RQ_ALLOC_TIME
+	rq->alloc_time_ns = alloc_time_ns;
+#endif
+	if (blk_mq_need_time_stamp(rq))
+		rq->start_time_ns = ktime_get_ns();
+	else
+		rq->start_time_ns = 0;
 	rq->io_start_time_ns = 0;
+	rq->stats_sectors = 0;
 	rq->nr_phys_segments = 0;
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
 	rq->nr_integrity_segments = 0;
 #endif
-	rq->special = NULL;
+	blk_crypto_rq_set_defaults(rq);
 	/* tag was already set */
-	rq->extra_len = 0;
-	rq->__deadline = 0;
+	WRITE_ONCE(rq->deadline, 0);
 
-	INIT_LIST_HEAD(&rq->timeout_list);
 	rq->timeout = 0;
 
 	rq->end_io = NULL;
 	rq->end_io_data = NULL;
-	rq->next_rq = NULL;
 
-#ifdef CONFIG_BLK_CGROUP
-	rq->rl = NULL;
-#endif
-
-	data->ctx->rq_dispatched[op_is_sync(op)]++;
+	data->ctx->rq_dispatched[op_is_sync(data->cmd_flags)]++;
 	refcount_set(&rq->ref, 1);
+
+	if (!op_is_flush(data->cmd_flags)) {
+		struct elevator_queue *e = data->q->elevator;
+
+		rq->elv.icq = NULL;
+		if (e && e->type->ops.prepare_request) {
+			if (e->type->icq_cache)
+				blk_mq_sched_assign_ioc(rq);
+
+			e->type->ops.prepare_request(rq);
+			rq->rq_flags |= RQF_ELVPRIV;
+		}
+	}
+
+	data->hctx->queued++;
+	trace_android_vh_blk_rq_ctx_init(rq, tags, data, alloc_time_ns);
 	return rq;
 }
 
-static struct request *blk_mq_get_request(struct request_queue *q,
-					  struct bio *bio, unsigned int op,
-					  struct blk_mq_alloc_data *data)
+static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data)
 {
+	struct request_queue *q = data->q;
 	struct elevator_queue *e = q->elevator;
-	struct request *rq;
+	u64 alloc_time_ns = 0;
 	unsigned int tag;
-	bool put_ctx_on_error = false;
 
-	blk_queue_enter_live(q);
-	data->q = q;
-	if (likely(!data->ctx)) {
-		data->ctx = blk_mq_get_ctx(q);
-		put_ctx_on_error = true;
-	}
-	if (likely(!data->hctx))
-		data->hctx = blk_mq_map_queue(q, data->ctx->cpu);
-	if (op & REQ_NOWAIT)
+	/* alloc_time includes depth and tag waits */
+	if (blk_queue_rq_alloc_time(q))
+		alloc_time_ns = ktime_get_ns();
+
+	if (data->cmd_flags & REQ_NOWAIT)
 		data->flags |= BLK_MQ_REQ_NOWAIT;
 
 	if (e) {
-		data->flags |= BLK_MQ_REQ_INTERNAL;
-
 		/*
 		 * Flush requests are special and go directly to the
 		 * dispatch list. Don't include reserved tags in the
 		 * limiting, as it isn't useful.
 		 */
-		if (!op_is_flush(op) && e->type->ops.mq.limit_depth &&
+		if (!op_is_flush(data->cmd_flags) &&
+		    e->type->ops.limit_depth &&
 		    !(data->flags & BLK_MQ_REQ_RESERVED))
-			e->type->ops.mq.limit_depth(op, data);
-	} else {
+			e->type->ops.limit_depth(data->cmd_flags, data);
+	}
+
+retry:
+	data->ctx = blk_mq_get_ctx(q);
+	data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx);
+	if (!e)
 		blk_mq_tag_busy(data->hctx);
-	}
 
+	/*
+	 * Waiting allocations only fail because of an inactive hctx. In that
+	 * case just retry the hctx assignment and tag allocation as CPU hotplug
+	 * should have migrated us to an online CPU by now.
+	 */
 	tag = blk_mq_get_tag(data);
-	if (tag == BLK_MQ_TAG_FAIL) {
-		if (put_ctx_on_error) {
-			blk_mq_put_ctx(data->ctx);
-			data->ctx = NULL;
-		}
-		blk_queue_exit(q);
-		return NULL;
-	}
+	if (tag == BLK_MQ_NO_TAG) {
+		if (data->flags & BLK_MQ_REQ_NOWAIT)
+			return NULL;
 
-	rq = blk_mq_rq_ctx_init(data, tag, op);
-	if (!op_is_flush(op)) {
-		rq->elv.icq = NULL;
-		if (e && e->type->ops.mq.prepare_request) {
-			if (e->type->icq_cache && rq_ioc(bio))
-				blk_mq_sched_assign_ioc(rq, bio);
-
-			e->type->ops.mq.prepare_request(rq, bio);
-			rq->rq_flags |= RQF_ELVPRIV;
-		}
+		/*
+		 * Give up the CPU and sleep for a random short time to ensure
+		 * that thread using a realtime scheduling class are migrated
+		 * off the CPU, and thus off the hctx that is going away.
+		 */
+		msleep(3);
+		goto retry;
 	}
-	data->hctx->queued++;
-	return rq;
+	return blk_mq_rq_ctx_init(data, tag, alloc_time_ns);
 }
 
 struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
 		blk_mq_req_flags_t flags)
 {
-	struct blk_mq_alloc_data alloc_data = { .flags = flags };
+	struct blk_mq_alloc_data data = {
+		.q		= q,
+		.flags		= flags,
+		.cmd_flags	= op,
+	};
 	struct request *rq;
 	int ret;
 
@@ -407,28 +415,35 @@
407415 if (ret)
408416 return ERR_PTR(ret);
409417
410
- rq = blk_mq_get_request(q, NULL, op, &alloc_data);
411
- blk_queue_exit(q);
412
-
418
+ rq = __blk_mq_alloc_request(&data);
413419 if (!rq)
414
- return ERR_PTR(-EWOULDBLOCK);
415
-
416
- blk_mq_put_ctx(alloc_data.ctx);
417
-
420
+ goto out_queue_exit;
418421 rq->__data_len = 0;
419422 rq->__sector = (sector_t) -1;
420423 rq->bio = rq->biotail = NULL;
421424 return rq;
425
+out_queue_exit:
426
+ blk_queue_exit(q);
427
+ return ERR_PTR(-EWOULDBLOCK);
422428 }
423429 EXPORT_SYMBOL(blk_mq_alloc_request);
424430
425431 struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
426432 unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx)
427433 {
428
- struct blk_mq_alloc_data alloc_data = { .flags = flags };
429
- struct request *rq;
434
+ struct blk_mq_alloc_data data = {
435
+ .q = q,
436
+ .flags = flags,
437
+ .cmd_flags = op,
438
+ };
439
+ u64 alloc_time_ns = 0;
430440 unsigned int cpu;
441
+ unsigned int tag;
431442 int ret;
443
+
444
+ /* alloc_time includes depth and tag waits */
445
+ if (blk_queue_rq_alloc_time(q))
446
+ alloc_time_ns = ktime_get_ns();
432447
433448 /*
434449 * If the tag allocator sleeps we could get an allocation for a
@@ -436,7 +451,8 @@
436451 * allocator for this for the rare use case of a command tied to
437452 * a specific queue.
438453 */
439
- if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)))
454
+ if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)) ||
455
+ WARN_ON_ONCE(!(flags & BLK_MQ_REQ_RESERVED)))
440456 return ERR_PTR(-EINVAL);
441457
442458 if (hctx_idx >= q->nr_hw_queues)
@@ -450,21 +466,27 @@
450466 * Check if the hardware context is actually mapped to anything.
451467 * If not tell the caller that it should skip this queue.
452468 */
453
- alloc_data.hctx = q->queue_hw_ctx[hctx_idx];
454
- if (!blk_mq_hw_queue_mapped(alloc_data.hctx)) {
455
- blk_queue_exit(q);
456
- return ERR_PTR(-EXDEV);
457
- }
458
- cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask);
459
- alloc_data.ctx = __blk_mq_get_ctx(q, cpu);
469
+ ret = -EXDEV;
470
+ data.hctx = q->queue_hw_ctx[hctx_idx];
471
+ if (!blk_mq_hw_queue_mapped(data.hctx))
472
+ goto out_queue_exit;
473
+ cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask);
474
+ if (cpu >= nr_cpu_ids)
475
+ goto out_queue_exit;
476
+ data.ctx = __blk_mq_get_ctx(q, cpu);
460477
461
- rq = blk_mq_get_request(q, NULL, op, &alloc_data);
478
+ if (!q->elevator)
479
+ blk_mq_tag_busy(data.hctx);
480
+
481
+ ret = -EWOULDBLOCK;
482
+ tag = blk_mq_get_tag(&data);
483
+ if (tag == BLK_MQ_NO_TAG)
484
+ goto out_queue_exit;
485
+ return blk_mq_rq_ctx_init(&data, tag, alloc_time_ns);
486
+
487
+out_queue_exit:
462488 blk_queue_exit(q);
463
-
464
- if (!rq)
465
- return ERR_PTR(-EWOULDBLOCK);
466
-
467
- return rq;
489
+ return ERR_PTR(ret);
468490 }
469491 EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
470492
@@ -472,13 +494,16 @@
472494 {
473495 struct request_queue *q = rq->q;
474496 struct blk_mq_ctx *ctx = rq->mq_ctx;
475
- struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
497
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
476498 const int sched_tag = rq->internal_tag;
477499
478
- if (rq->tag != -1)
479
- blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
480
- if (sched_tag != -1)
481
- blk_mq_put_tag(hctx, hctx->sched_tags, ctx, sched_tag);
500
+ blk_crypto_free_request(rq);
501
+ blk_pm_mark_last_busy(rq);
502
+ rq->mq_hctx = NULL;
503
+ if (rq->tag != BLK_MQ_NO_TAG)
504
+ blk_mq_put_tag(hctx->tags, ctx, rq->tag);
505
+ if (sched_tag != BLK_MQ_NO_TAG)
506
+ blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag);
482507 blk_mq_sched_restart(hctx);
483508 blk_queue_exit(q);
484509 }
@@ -488,11 +513,11 @@
488513 struct request_queue *q = rq->q;
489514 struct elevator_queue *e = q->elevator;
490515 struct blk_mq_ctx *ctx = rq->mq_ctx;
491
- struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
516
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
492517
493518 if (rq->rq_flags & RQF_ELVPRIV) {
494
- if (e && e->type->ops.mq.finish_request)
495
- e->type->ops.mq.finish_request(rq);
519
+ if (e && e->type->ops.finish_request)
520
+ e->type->ops.finish_request(rq);
496521 if (rq->elv.icq) {
497522 put_io_context(rq->elv.icq->ioc);
498523 rq->elv.icq = NULL;
@@ -501,15 +526,12 @@
501526
502527 ctx->rq_completed[rq_is_sync(rq)]++;
503528 if (rq->rq_flags & RQF_MQ_INFLIGHT)
504
- atomic_dec(&hctx->nr_active);
529
+ __blk_mq_dec_active_requests(hctx);
505530
506531 if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
507532 laptop_io_completion(q->backing_dev_info);
508533
509534 rq_qos_done(q, rq);
510
-
511
- if (blk_rq_rl(rq))
512
- blk_put_rl(blk_rq_rl(rq));
513535
514536 WRITE_ONCE(rq->state, MQ_RQ_IDLE);
515537 if (refcount_dec_and_test(&rq->ref))
@@ -519,12 +541,17 @@
519541
520542 inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
521543 {
522
- u64 now = ktime_get_ns();
544
+ u64 now = 0;
545
+
546
+ if (blk_mq_need_time_stamp(rq))
547
+ now = ktime_get_ns();
523548
524549 if (rq->rq_flags & RQF_STATS) {
525550 blk_mq_poll_stats_start(rq->q);
526551 blk_stat_add(rq, now);
527552 }
553
+
554
+ blk_mq_sched_completed_request(rq, now);
528555
529556 blk_account_io_done(rq, now);
530557
@@ -532,8 +559,6 @@
532559 rq_qos_done(rq->q, rq);
533560 rq->end_io(rq, error);
534561 } else {
535
- if (unlikely(blk_bidi_rq(rq)))
536
- blk_mq_free_request(rq->next_rq);
537562 blk_mq_free_request(rq);
538563 }
539564 }
@@ -547,43 +572,139 @@
547572 }
548573 EXPORT_SYMBOL(blk_mq_end_request);
549574
575
+/*
576
+ * Softirq action handler - move entries to local list and loop over them
577
+ * while passing them to the queue registered handler.
578
+ */
579
+static __latent_entropy void blk_done_softirq(struct softirq_action *h)
580
+{
581
+ struct list_head *cpu_list, local_list;
582
+
583
+ local_irq_disable();
584
+ cpu_list = this_cpu_ptr(&blk_cpu_done);
585
+ list_replace_init(cpu_list, &local_list);
586
+ local_irq_enable();
587
+
588
+ while (!list_empty(&local_list)) {
589
+ struct request *rq;
590
+
591
+ rq = list_entry(local_list.next, struct request, ipi_list);
592
+ list_del_init(&rq->ipi_list);
593
+ rq->q->mq_ops->complete(rq);
594
+ }
595
+}
596
+
597
+static void blk_mq_trigger_softirq(struct request *rq)
598
+{
599
+ struct list_head *list;
600
+ unsigned long flags;
601
+
602
+ local_irq_save(flags);
603
+ list = this_cpu_ptr(&blk_cpu_done);
604
+ list_add_tail(&rq->ipi_list, list);
605
+
606
+ /*
607
+ * If the list only contains our just added request, signal a raise of
608
+ * the softirq. If there are already entries there, someone already
609
+ * raised the irq but it hasn't run yet.
610
+ */
611
+ if (list->next == &rq->ipi_list)
612
+ raise_softirq_irqoff(BLOCK_SOFTIRQ);
613
+ local_irq_restore(flags);
614
+}
615
+
616
+static int blk_softirq_cpu_dead(unsigned int cpu)
617
+{
618
+ /*
619
+ * If a CPU goes away, splice its entries to the current CPU
620
+ * and trigger a run of the softirq
621
+ */
622
+ local_irq_disable();
623
+ list_splice_init(&per_cpu(blk_cpu_done, cpu),
624
+ this_cpu_ptr(&blk_cpu_done));
625
+ raise_softirq_irqoff(BLOCK_SOFTIRQ);
626
+ local_irq_enable();
627
+
628
+ return 0;
629
+}
630
+
631
+
550632 static void __blk_mq_complete_request_remote(void *data)
551633 {
552634 struct request *rq = data;
553635
554
- rq->q->softirq_done_fn(rq);
636
+ /*
637
+ * For most of single queue controllers, there is only one irq vector
638
+ * for handling I/O completion, and the only irq's affinity is set
639
+ * to all possible CPUs. On most of ARCHs, this affinity means the irq
640
+ * is handled on one specific CPU.
641
+ *
642
+ * So complete I/O requests in softirq context in case of single queue
643
+ * devices to avoid degrading I/O performance due to irqsoff latency.
644
+ */
645
+ if (rq->q->nr_hw_queues == 1)
646
+ blk_mq_trigger_softirq(rq);
647
+ else
648
+ rq->q->mq_ops->complete(rq);
555649 }
556650
557
-static void __blk_mq_complete_request(struct request *rq)
651
+static inline bool blk_mq_complete_need_ipi(struct request *rq)
558652 {
559
- struct blk_mq_ctx *ctx = rq->mq_ctx;
560
- bool shared = false;
561
- int cpu;
653
+ int cpu = raw_smp_processor_id();
562654
563
- if (!blk_mq_mark_complete(rq))
564
- return;
565
- if (rq->internal_tag != -1)
566
- blk_mq_sched_completed_request(rq);
655
+ if (!IS_ENABLED(CONFIG_SMP) ||
656
+ !test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags))
657
+ return false;
567658
568
- if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) {
569
- rq->q->softirq_done_fn(rq);
570
- return;
571
- }
659
+ /* same CPU or cache domain? Complete locally */
660
+ if (cpu == rq->mq_ctx->cpu ||
661
+ (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) &&
662
+ cpus_share_cache(cpu, rq->mq_ctx->cpu)))
663
+ return false;
572664
573
- cpu = get_cpu();
574
- if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
575
- shared = cpus_share_cache(cpu, ctx->cpu);
665
+ /* don't try to IPI to an offline CPU */
666
+ return cpu_online(rq->mq_ctx->cpu);
667
+}
576668
577
- if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
669
+bool blk_mq_complete_request_remote(struct request *rq)
670
+{
671
+ WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
672
+
673
+ /*
674
+ * For a polled request, always complete locallly, it's pointless
675
+ * to redirect the completion.
676
+ */
677
+ if (rq->cmd_flags & REQ_HIPRI)
678
+ return false;
679
+
680
+ if (blk_mq_complete_need_ipi(rq)) {
578681 rq->csd.func = __blk_mq_complete_request_remote;
579682 rq->csd.info = rq;
580683 rq->csd.flags = 0;
581
- smp_call_function_single_async(ctx->cpu, &rq->csd);
684
+ smp_call_function_single_async(rq->mq_ctx->cpu, &rq->csd);
582685 } else {
583
- rq->q->softirq_done_fn(rq);
686
+ if (rq->q->nr_hw_queues > 1)
687
+ return false;
688
+ blk_mq_trigger_softirq(rq);
584689 }
585
- put_cpu();
690
+
691
+ return true;
586692 }
693
+EXPORT_SYMBOL_GPL(blk_mq_complete_request_remote);
694
+
695
+/**
696
+ * blk_mq_complete_request - end I/O on a request
697
+ * @rq: the request being processed
698
+ *
699
+ * Description:
700
+ * Complete a request by scheduling the ->complete_rq operation.
701
+ **/
702
+void blk_mq_complete_request(struct request *rq)
703
+{
704
+ if (!blk_mq_complete_request_remote(rq))
705
+ rq->q->mq_ops->complete(rq);
706
+}
707
+EXPORT_SYMBOL(blk_mq_complete_request);
587708
588709 static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
589710 __releases(hctx->srcu)
@@ -606,40 +727,22 @@
606727 }
607728
608729 /**
609
- * blk_mq_complete_request - end I/O on a request
610
- * @rq: the request being processed
730
+ * blk_mq_start_request - Start processing a request
731
+ * @rq: Pointer to request to be started
611732 *
612
- * Description:
613
- * Ends all I/O on a request. It does not handle partial completions.
614
- * The actual completion happens out-of-order, through a IPI handler.
615
- **/
616
-void blk_mq_complete_request(struct request *rq)
617
-{
618
- if (unlikely(blk_should_fake_timeout(rq->q)))
619
- return;
620
- __blk_mq_complete_request(rq);
621
-}
622
-EXPORT_SYMBOL(blk_mq_complete_request);
623
-
624
-int blk_mq_request_started(struct request *rq)
625
-{
626
- return blk_mq_rq_state(rq) != MQ_RQ_IDLE;
627
-}
628
-EXPORT_SYMBOL_GPL(blk_mq_request_started);
629
-
733
+ * Function used by device drivers to notify the block layer that a request
734
+ * is going to be processed now, so blk layer can do proper initializations
735
+ * such as starting the timeout timer.
736
+ */
630737 void blk_mq_start_request(struct request *rq)
631738 {
632739 struct request_queue *q = rq->q;
633
-
634
- blk_mq_sched_started_request(rq);
635740
636741 trace_block_rq_issue(q, rq);
637742
638743 if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
639744 rq->io_start_time_ns = ktime_get_ns();
640
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
641
- rq->throtl_size = blk_rq_sectors(rq);
642
-#endif
745
+ rq->stats_sectors = blk_rq_sectors(rq);
643746 rq->rq_flags |= RQF_STATS;
644747 rq_qos_issue(q, rq);
645748 }
@@ -649,14 +752,10 @@
649752 blk_add_timer(rq);
650753 WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT);
651754
652
- if (q->dma_drain_size && blk_rq_bytes(rq)) {
653
- /*
654
- * Make sure space for the drain appears. We know we can do
655
- * this because max_hw_segments has been adjusted to be one
656
- * fewer than the device can handle.
657
- */
658
- rq->nr_phys_segments++;
659
- }
755
+#ifdef CONFIG_BLK_DEV_INTEGRITY
756
+ if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE)
757
+ q->integrity.profile->prepare_fn(rq);
758
+#endif
660759 }
661760 EXPORT_SYMBOL(blk_mq_start_request);
662761
@@ -672,8 +771,6 @@
672771 if (blk_mq_request_started(rq)) {
673772 WRITE_ONCE(rq->state, MQ_RQ_IDLE);
674773 rq->rq_flags &= ~RQF_TIMED_OUT;
675
- if (q->dma_drain_size && blk_rq_bytes(rq))
676
- rq->nr_phys_segments--;
677774 }
678775 }
679776
@@ -684,7 +781,6 @@
684781 /* this request will be re-inserted to io scheduler queue */
685782 blk_mq_sched_requeue_request(rq);
686783
687
- BUG_ON(blk_queued_rq(rq));
688784 blk_mq_add_to_requeue_list(rq, true, kick_requeue_list);
689785 }
690786 EXPORT_SYMBOL(blk_mq_requeue_request);
@@ -712,7 +808,7 @@
712808 * merge.
713809 */
714810 if (rq->rq_flags & RQF_DONTPREP)
715
- blk_mq_request_bypass_insert(rq, false);
811
+ blk_mq_request_bypass_insert(rq, false, false);
716812 else
717813 blk_mq_sched_insert_request(rq, true, false, false);
718814 }
@@ -750,7 +846,6 @@
750846 if (kick_requeue_list)
751847 blk_mq_kick_requeue_list(q);
752848 }
753
-EXPORT_SYMBOL(blk_mq_add_to_requeue_list);
754849
755850 void blk_mq_kick_requeue_list(struct request_queue *q)
756851 {
@@ -777,6 +872,32 @@
777872 }
778873 EXPORT_SYMBOL(blk_mq_tag_to_rq);
779874
875
+static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq,
876
+ void *priv, bool reserved)
877
+{
878
+ /*
879
+ * If we find a request that isn't idle and the queue matches,
880
+ * we know the queue is busy. Return false to stop the iteration.
881
+ */
882
+ if (blk_mq_request_started(rq) && rq->q == hctx->queue) {
883
+ bool *busy = priv;
884
+
885
+ *busy = true;
886
+ return false;
887
+ }
888
+
889
+ return true;
890
+}
891
+
892
+bool blk_mq_queue_inflight(struct request_queue *q)
893
+{
894
+ bool busy = false;
895
+
896
+ blk_mq_queue_tag_busy_iter(q, blk_mq_rq_inflight, &busy);
897
+ return busy;
898
+}
899
+EXPORT_SYMBOL_GPL(blk_mq_queue_inflight);
900
+
780901 static void blk_mq_rq_timed_out(struct request *req, bool reserved)
781902 {
782903 req->rq_flags |= RQF_TIMED_OUT;
@@ -801,7 +922,7 @@
801922 if (rq->rq_flags & RQF_TIMED_OUT)
802923 return false;
803924
804
- deadline = blk_rq_deadline(rq);
925
+ deadline = READ_ONCE(rq->deadline);
805926 if (time_after_eq(jiffies, deadline))
806927 return true;
807928
@@ -812,43 +933,29 @@
812933 return false;
813934 }
814935
815
-static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
936
+void blk_mq_put_rq_ref(struct request *rq)
937
+{
938
+ if (is_flush_rq(rq))
939
+ rq->end_io(rq, 0);
940
+ else if (refcount_dec_and_test(&rq->ref))
941
+ __blk_mq_free_request(rq);
942
+}
943
+
944
+static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
816945 struct request *rq, void *priv, bool reserved)
817946 {
818947 unsigned long *next = priv;
819948
820949 /*
821
- * Just do a quick check if it is expired before locking the request in
822
- * so we're not unnecessarilly synchronizing across CPUs.
823
- */
824
- if (!blk_mq_req_expired(rq, next))
825
- return;
826
-
827
- /*
828
- * We have reason to believe the request may be expired. Take a
829
- * reference on the request to lock this request lifetime into its
830
- * currently allocated context to prevent it from being reallocated in
831
- * the event the completion by-passes this timeout handler.
832
- *
833
- * If the reference was already released, then the driver beat the
834
- * timeout handler to posting a natural completion.
835
- */
836
- if (!refcount_inc_not_zero(&rq->ref))
837
- return;
838
-
839
- /*
840
- * The request is now locked and cannot be reallocated underneath the
841
- * timeout handler's processing. Re-verify this exact request is truly
842
- * expired; if it is not expired, then the request was completed and
843
- * reallocated as a new request.
950
+ * blk_mq_queue_tag_busy_iter() has locked the request, so it cannot
951
+ * be reallocated underneath the timeout handler's processing, then
952
+ * the expire check is reliable. If the request is not expired, then
953
+ * it was completed and reallocated as a new request after returning
954
+ * from blk_mq_check_expired().
844955 */
845956 if (blk_mq_req_expired(rq, next))
846957 blk_mq_rq_timed_out(rq, reserved);
847
-
848
- if (is_flush_rq(rq, hctx))
849
- rq->end_io(rq, 0);
850
- else if (refcount_dec_and_test(&rq->ref))
851
- __blk_mq_free_request(rq);
958
+ return true;
852959 }
853960
854961 static void blk_mq_timeout_work(struct work_struct *work)
@@ -905,9 +1012,10 @@
9051012 struct flush_busy_ctx_data *flush_data = data;
9061013 struct blk_mq_hw_ctx *hctx = flush_data->hctx;
9071014 struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
1015
+ enum hctx_type type = hctx->type;
9081016
9091017 spin_lock(&ctx->lock);
910
- list_splice_tail_init(&ctx->rq_list, flush_data->list);
1018
+ list_splice_tail_init(&ctx->rq_lists[type], flush_data->list);
9111019 sbitmap_clear_bit(sb, bitnr);
9121020 spin_unlock(&ctx->lock);
9131021 return true;
@@ -939,12 +1047,13 @@
9391047 struct dispatch_rq_data *dispatch_data = data;
9401048 struct blk_mq_hw_ctx *hctx = dispatch_data->hctx;
9411049 struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
1050
+ enum hctx_type type = hctx->type;
9421051
9431052 spin_lock(&ctx->lock);
944
- if (!list_empty(&ctx->rq_list)) {
945
- dispatch_data->rq = list_entry_rq(ctx->rq_list.next);
1053
+ if (!list_empty(&ctx->rq_lists[type])) {
1054
+ dispatch_data->rq = list_entry_rq(ctx->rq_lists[type].next);
9461055 list_del_init(&dispatch_data->rq->queuelist);
947
- if (list_empty(&ctx->rq_list))
1056
+ if (list_empty(&ctx->rq_lists[type]))
9481057 sbitmap_clear_bit(sb, bitnr);
9491058 }
9501059 spin_unlock(&ctx->lock);
....@@ -955,7 +1064,7 @@
9551064 struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
9561065 struct blk_mq_ctx *start)
9571066 {
958
- unsigned off = start ? start->index_hw : 0;
1067
+ unsigned off = start ? start->index_hw[hctx->type] : 0;
9591068 struct dispatch_rq_data data = {
9601069 .hctx = hctx,
9611070 .rq = NULL,
....@@ -975,33 +1084,44 @@
9751084 return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
9761085 }
9771086
978
-bool blk_mq_get_driver_tag(struct request *rq)
1087
+static bool __blk_mq_get_driver_tag(struct request *rq)
9791088 {
980
- struct blk_mq_alloc_data data = {
981
- .q = rq->q,
982
- .hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu),
983
- .flags = BLK_MQ_REQ_NOWAIT,
984
- };
985
- bool shared;
1089
+ struct sbitmap_queue *bt = rq->mq_hctx->tags->bitmap_tags;
1090
+ unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
1091
+ int tag;
9861092
987
- if (rq->tag != -1)
988
- goto done;
1093
+ blk_mq_tag_busy(rq->mq_hctx);
9891094
990
- if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag))
991
- data.flags |= BLK_MQ_REQ_RESERVED;
992
-
993
- shared = blk_mq_tag_busy(data.hctx);
994
- rq->tag = blk_mq_get_tag(&data);
995
- if (rq->tag >= 0) {
996
- if (shared) {
997
- rq->rq_flags |= RQF_MQ_INFLIGHT;
998
- atomic_inc(&data.hctx->nr_active);
999
- }
1000
- data.hctx->tags->rqs[rq->tag] = rq;
1095
+ if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) {
1096
+ bt = rq->mq_hctx->tags->breserved_tags;
1097
+ tag_offset = 0;
1098
+ } else {
1099
+ if (!hctx_may_queue(rq->mq_hctx, bt))
1100
+ return false;
10011101 }
10021102
1003
-done:
1004
- return rq->tag != -1;
1103
+ tag = __sbitmap_queue_get(bt);
1104
+ if (tag == BLK_MQ_NO_TAG)
1105
+ return false;
1106
+
1107
+ rq->tag = tag + tag_offset;
1108
+ return true;
1109
+}
1110
+
1111
+static bool blk_mq_get_driver_tag(struct request *rq)
1112
+{
1113
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
1114
+
1115
+ if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_get_driver_tag(rq))
1116
+ return false;
1117
+
1118
+ if ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) &&
1119
+ !(rq->rq_flags & RQF_MQ_INFLIGHT)) {
1120
+ rq->rq_flags |= RQF_MQ_INFLIGHT;
1121
+ __blk_mq_inc_active_requests(hctx);
1122
+ }
1123
+ hctx->tags->rqs[rq->tag] = rq;
1124
+ return true;
10051125 }
10061126
10071127 static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
@@ -1012,7 +1132,13 @@
10121132 hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
10131133
10141134 spin_lock(&hctx->dispatch_wait_lock);
1015
- list_del_init(&wait->entry);
1135
+ if (!list_empty(&wait->entry)) {
1136
+ struct sbitmap_queue *sbq;
1137
+
1138
+ list_del_init(&wait->entry);
1139
+ sbq = hctx->tags->bitmap_tags;
1140
+ atomic_dec(&sbq->ws_active);
1141
+ }
10161142 spin_unlock(&hctx->dispatch_wait_lock);
10171143
10181144 blk_mq_run_hw_queue(hctx, true);
@@ -1028,13 +1154,13 @@
10281154 static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
10291155 struct request *rq)
10301156 {
1157
+ struct sbitmap_queue *sbq = hctx->tags->bitmap_tags;
10311158 struct wait_queue_head *wq;
10321159 wait_queue_entry_t *wait;
10331160 bool ret;
10341161
1035
- if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) {
1036
- if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
1037
- set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
1162
+ if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
1163
+ blk_mq_sched_mark_restart_hctx(hctx);
10381164
10391165 /*
10401166 * It's possible that a tag was freed in the window between the
@@ -1051,7 +1177,7 @@
10511177 if (!list_empty_careful(&wait->entry))
10521178 return false;
10531179
1054
- wq = &bt_wait_ptr(&hctx->tags->bitmap_tags, hctx)->wait;
1180
+ wq = &bt_wait_ptr(sbq, hctx)->wait;
10551181
10561182 spin_lock_irq(&wq->lock);
10571183 spin_lock(&hctx->dispatch_wait_lock);
@@ -1061,6 +1187,7 @@
10611187 return false;
10621188 }
10631189
1190
+ atomic_inc(&sbq->ws_active);
10641191 wait->flags &= ~WQ_FLAG_EXCLUSIVE;
10651192 __add_wait_queue(wq, wait);
10661193
@@ -1081,6 +1208,7 @@
10811208 * someone else gets the wakeup.
10821209 */
10831210 list_del_init(&wait->entry);
1211
+ atomic_dec(&sbq->ws_active);
10841212 spin_unlock(&hctx->dispatch_wait_lock);
10851213 spin_unlock_irq(&wq->lock);
10861214
@@ -1099,9 +1227,6 @@
10991227 static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
11001228 {
11011229 unsigned int ewma;
1102
-
1103
- if (hctx->queue->elevator)
1104
- return;
11051230
11061231 ewma = hctx->dispatch_busy;
11071232
@@ -1135,22 +1260,83 @@
11351260 __blk_mq_requeue_request(rq);
11361261 }
11371262
1263
+static void blk_mq_handle_zone_resource(struct request *rq,
1264
+ struct list_head *zone_list)
1265
+{
1266
+ /*
1267
+ * If we end up here it is because we cannot dispatch a request to a
1268
+ * specific zone due to LLD level zone-write locking or other zone
1269
+ * related resource not being available. In this case, set the request
1270
+ * aside in zone_list for retrying it later.
1271
+ */
1272
+ list_add(&rq->queuelist, zone_list);
1273
+ __blk_mq_requeue_request(rq);
1274
+}
1275
+
1276
+enum prep_dispatch {
1277
+ PREP_DISPATCH_OK,
1278
+ PREP_DISPATCH_NO_TAG,
1279
+ PREP_DISPATCH_NO_BUDGET,
1280
+};
1281
+
1282
+static enum prep_dispatch blk_mq_prep_dispatch_rq(struct request *rq,
1283
+ bool need_budget)
1284
+{
1285
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
1286
+
1287
+ if (need_budget && !blk_mq_get_dispatch_budget(rq->q)) {
1288
+ blk_mq_put_driver_tag(rq);
1289
+ return PREP_DISPATCH_NO_BUDGET;
1290
+ }
1291
+
1292
+ if (!blk_mq_get_driver_tag(rq)) {
1293
+ /*
1294
+ * The initial allocation attempt failed, so we need to
1295
+ * rerun the hardware queue when a tag is freed. The
1296
+ * waitqueue takes care of that. If the queue is run
1297
+ * before we add this entry back on the dispatch list,
1298
+ * we'll re-run it below.
1299
+ */
1300
+ if (!blk_mq_mark_tag_wait(hctx, rq)) {
1301
+ /*
1302
+ * All budgets not got from this function will be put
1303
+ * together during handling partial dispatch
1304
+ */
1305
+ if (need_budget)
1306
+ blk_mq_put_dispatch_budget(rq->q);
1307
+ return PREP_DISPATCH_NO_TAG;
1308
+ }
1309
+ }
1310
+
1311
+ return PREP_DISPATCH_OK;
1312
+}
1313
+
1314
+/* release all allocated budgets before calling to blk_mq_dispatch_rq_list */
1315
+static void blk_mq_release_budgets(struct request_queue *q,
1316
+ unsigned int nr_budgets)
1317
+{
1318
+ int i;
1319
+
1320
+ for (i = 0; i < nr_budgets; i++)
1321
+ blk_mq_put_dispatch_budget(q);
1322
+}
1323
+
11381324 /*
11391325 * Returns true if we did some work AND can potentially do more.
11401326 */
1141
-bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
1142
- bool got_budget)
1327
+bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
1328
+ unsigned int nr_budgets)
11431329 {
1144
- struct blk_mq_hw_ctx *hctx;
1330
+ enum prep_dispatch prep;
1331
+ struct request_queue *q = hctx->queue;
11451332 struct request *rq, *nxt;
1146
- bool no_tag = false;
11471333 int errors, queued;
11481334 blk_status_t ret = BLK_STS_OK;
1335
+ LIST_HEAD(zone_list);
1336
+ bool needs_resource = false;
11491337
11501338 if (list_empty(list))
11511339 return false;
1152
-
1153
- WARN_ON(!list_is_singular(list) && got_budget);
11541340
11551341 /*
11561342 * Now process all the entries, sending them to the driver.
@@ -1161,29 +1347,10 @@
11611347
11621348 rq = list_first_entry(list, struct request, queuelist);
11631349
1164
- hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu);
1165
- if (!got_budget && !blk_mq_get_dispatch_budget(hctx))
1350
+ WARN_ON_ONCE(hctx != rq->mq_hctx);
1351
+ prep = blk_mq_prep_dispatch_rq(rq, !nr_budgets);
1352
+ if (prep != PREP_DISPATCH_OK)
11661353 break;
1167
-
1168
- if (!blk_mq_get_driver_tag(rq)) {
1169
- /*
1170
- * The initial allocation attempt failed, so we need to
1171
- * rerun the hardware queue when a tag is freed. The
1172
- * waitqueue takes care of that. If the queue is run
1173
- * before we add this entry back on the dispatch list,
1174
- * we'll re-run it below.
1175
- */
1176
- if (!blk_mq_mark_tag_wait(hctx, rq)) {
1177
- blk_mq_put_dispatch_budget(hctx);
1178
- /*
1179
- * For non-shared tags, the RESTART check
1180
- * will suffice.
1181
- */
1182
- if (hctx->flags & BLK_MQ_F_TAG_SHARED)
1183
- no_tag = true;
1184
- break;
1185
- }
1186
- }
11871354
11881355 list_del_init(&rq->queuelist);
11891356
@@ -1200,32 +1367,63 @@
12001367 bd.last = !blk_mq_get_driver_tag(nxt);
12011368 }
12021369
1370
+ /*
1371
+ * once the request is queued to lld, no need to cover the
1372
+ * budget any more
1373
+ */
1374
+ if (nr_budgets)
1375
+ nr_budgets--;
12031376 ret = q->mq_ops->queue_rq(hctx, &bd);
1204
- if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {
1205
- blk_mq_handle_dev_resource(rq, list);
1377
+ switch (ret) {
1378
+ case BLK_STS_OK:
1379
+ queued++;
12061380 break;
1207
- }
1208
-
1209
- if (unlikely(ret != BLK_STS_OK)) {
1381
+ case BLK_STS_RESOURCE:
1382
+ needs_resource = true;
1383
+ fallthrough;
1384
+ case BLK_STS_DEV_RESOURCE:
1385
+ blk_mq_handle_dev_resource(rq, list);
1386
+ goto out;
1387
+ case BLK_STS_ZONE_RESOURCE:
1388
+ /*
1389
+ * Move the request to zone_list and keep going through
1390
+ * the dispatch list to find more requests the drive can
1391
+ * accept.
1392
+ */
1393
+ blk_mq_handle_zone_resource(rq, &zone_list);
1394
+ needs_resource = true;
1395
+ break;
1396
+ default:
12101397 errors++;
12111398 blk_mq_end_request(rq, BLK_STS_IOERR);
1212
- continue;
12131399 }
1214
-
1215
- queued++;
12161400 } while (!list_empty(list));
1401
+out:
1402
+ if (!list_empty(&zone_list))
1403
+ list_splice_tail_init(&zone_list, list);
12171404
12181405 hctx->dispatched[queued_to_index(queued)]++;
12191406
1407
+ /* If we didn't flush the entire list, we could have told the driver
1408
+ * there was more coming, but that turned out to be a lie.
1409
+ */
1410
+ if ((!list_empty(list) || errors || needs_resource ||
1411
+ ret == BLK_STS_DEV_RESOURCE) && q->mq_ops->commit_rqs && queued)
1412
+ q->mq_ops->commit_rqs(hctx);
12201413 /*
12211414 * Any items that need requeuing? Stuff them into hctx->dispatch,
12221415 * that is where we will continue on next queue run.
12231416 */
12241417 if (!list_empty(list)) {
12251418 bool needs_restart;
1419
+ /* For non-shared tags, the RESTART check will suffice */
1420
+ bool no_tag = prep == PREP_DISPATCH_NO_TAG &&
1421
+ (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED);
1422
+
1423
+ blk_mq_release_budgets(q, nr_budgets);
12261424
12271425 spin_lock(&hctx->lock);
1228
- list_splice_init(list, &hctx->dispatch);
1426
+ list_splice_tail_init(list, &hctx->dispatch);
12291427 spin_unlock(&hctx->lock);
12301428
12311429 /*
@@ -1259,13 +1457,17 @@
12591457 *
12601458 * If driver returns BLK_STS_RESOURCE and SCHED_RESTART
12611459 * bit is set, run queue after a delay to avoid IO stalls
1262
- * that could otherwise occur if the queue is idle.
1460
+ * that could otherwise occur if the queue is idle. We'll do
1461
+ * similar if we couldn't get budget or couldn't lock a zone
1462
+ * and SCHED_RESTART is set.
12631463 */
12641464 needs_restart = blk_mq_sched_needs_restart(hctx);
1465
+ if (prep == PREP_DISPATCH_NO_BUDGET)
1466
+ needs_resource = true;
12651467 if (!needs_restart ||
12661468 (no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
12671469 blk_mq_run_hw_queue(hctx, true);
1268
- else if (needs_restart && (ret == BLK_STS_RESOURCE))
1470
+ else if (needs_restart && needs_resource)
12691471 blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);
12701472
12711473 blk_mq_update_dispatch_busy(hctx, true);
....@@ -1273,16 +1475,15 @@
12731475 } else
12741476 blk_mq_update_dispatch_busy(hctx, false);
12751477
1276
- /*
1277
- * If the host/device is unable to accept more work, inform the
1278
- * caller of that.
1279
- */
1280
- if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
1281
- return false;
1282
-
12831478 return (queued + errors) != 0;
12841479 }
12851480
1481
+/**
1482
+ * __blk_mq_run_hw_queue - Run a hardware queue.
1483
+ * @hctx: Pointer to the hardware queue to run.
1484
+ *
1485
+ * Send pending requests to the hardware.
1486
+ */
12861487 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
12871488 {
12881489 int srcu_idx;
@@ -1380,6 +1581,15 @@
13801581 return next_cpu;
13811582 }
13821583
1584
+/**
1585
+ * __blk_mq_delay_run_hw_queue - Run (or schedule to run) a hardware queue.
1586
+ * @hctx: Pointer to the hardware queue to run.
1587
+ * @async: If we want to run the queue asynchronously.
1588
+ * @msecs: Microseconds of delay to wait before running the queue.
1589
+ *
1590
+ * If !@async, try to run the queue now. Else, run the queue asynchronously and
1591
+ * with a delay of @msecs.
1592
+ */
13831593 static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
13841594 unsigned long msecs)
13851595 {
@@ -1401,13 +1611,29 @@
14011611 msecs_to_jiffies(msecs));
14021612 }
14031613
1614
+/**
1615
+ * blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously.
1616
+ * @hctx: Pointer to the hardware queue to run.
1617
+ * @msecs: Microseconds of delay to wait before running the queue.
1618
+ *
1619
+ * Run a hardware queue asynchronously with a delay of @msecs.
1620
+ */
14041621 void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
14051622 {
14061623 __blk_mq_delay_run_hw_queue(hctx, true, msecs);
14071624 }
14081625 EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
14091626
1410
-bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
1627
+/**
1628
+ * blk_mq_run_hw_queue - Start to run a hardware queue.
1629
+ * @hctx: Pointer to the hardware queue to run.
1630
+ * @async: If we want to run the queue asynchronously.
1631
+ *
1632
+ * Check if the request queue is not in a quiesced state and if there are
1633
+ * pending requests to be sent. If this is true, run the queue to send requests
1634
+ * to hardware.
1635
+ */
1636
+void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
14111637 {
14121638 int srcu_idx;
14131639 bool need_run;
@@ -1425,28 +1651,101 @@
14251651 blk_mq_hctx_has_pending(hctx);
14261652 hctx_unlock(hctx, srcu_idx);
14271653
1428
- if (need_run) {
1654
+ if (need_run)
14291655 __blk_mq_delay_run_hw_queue(hctx, async, 0);
1430
- return true;
1431
- }
1432
-
1433
- return false;
14341656 }
14351657 EXPORT_SYMBOL(blk_mq_run_hw_queue);
14361658
1659
+/*
1660
+ * Is the request queue handled by an IO scheduler that does not respect
1661
+ * hardware queues when dispatching?
1662
+ */
1663
+static bool blk_mq_has_sqsched(struct request_queue *q)
1664
+{
1665
+ struct elevator_queue *e = q->elevator;
1666
+
1667
+ if (e && e->type->ops.dispatch_request &&
1668
+ !(e->type->elevator_features & ELEVATOR_F_MQ_AWARE))
1669
+ return true;
1670
+ return false;
1671
+}
1672
+
1673
+/*
1674
+ * Return prefered queue to dispatch from (if any) for non-mq aware IO
1675
+ * scheduler.
1676
+ */
1677
+static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q)
1678
+{
1679
+ struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
1680
+ /*
1681
+ * If the IO scheduler does not respect hardware queues when
1682
+ * dispatching, we just don't bother with multiple HW queues and
1683
+ * dispatch from hctx for the current CPU since running multiple queues
1684
+ * just causes lock contention inside the scheduler and pointless cache
1685
+ * bouncing.
1686
+ */
1687
+ struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, 0, ctx);
1688
+
1689
+ if (!blk_mq_hctx_stopped(hctx))
1690
+ return hctx;
1691
+ return NULL;
1692
+}
1693
+
1694
+/**
1695
+ * blk_mq_run_hw_queues - Run all hardware queues in a request queue.
1696
+ * @q: Pointer to the request queue to run.
1697
+ * @async: If we want to run the queue asynchronously.
1698
+ */
14371699 void blk_mq_run_hw_queues(struct request_queue *q, bool async)
14381700 {
1439
- struct blk_mq_hw_ctx *hctx;
1701
+ struct blk_mq_hw_ctx *hctx, *sq_hctx;
14401702 int i;
14411703
1704
+ sq_hctx = NULL;
1705
+ if (blk_mq_has_sqsched(q))
1706
+ sq_hctx = blk_mq_get_sq_hctx(q);
14421707 queue_for_each_hw_ctx(q, hctx, i) {
14431708 if (blk_mq_hctx_stopped(hctx))
14441709 continue;
1445
-
1446
- blk_mq_run_hw_queue(hctx, async);
1710
+ /*
1711
+ * Dispatch from this hctx either if there's no hctx preferred
1712
+ * by IO scheduler or if it has requests that bypass the
1713
+ * scheduler.
1714
+ */
1715
+ if (!sq_hctx || sq_hctx == hctx ||
1716
+ !list_empty_careful(&hctx->dispatch))
1717
+ blk_mq_run_hw_queue(hctx, async);
14471718 }
14481719 }
14491720 EXPORT_SYMBOL(blk_mq_run_hw_queues);
1721
+
1722
+/**
1723
+ * blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously.
1724
+ * @q: Pointer to the request queue to run.
1725
+ * @msecs: Microseconds of delay to wait before running the queues.
1726
+ */
1727
+void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
1728
+{
1729
+ struct blk_mq_hw_ctx *hctx, *sq_hctx;
1730
+ int i;
1731
+
1732
+ sq_hctx = NULL;
1733
+ if (blk_mq_has_sqsched(q))
1734
+ sq_hctx = blk_mq_get_sq_hctx(q);
1735
+ queue_for_each_hw_ctx(q, hctx, i) {
1736
+ if (blk_mq_hctx_stopped(hctx))
1737
+ continue;
1738
+ /*
1739
+ * Dispatch from this hctx either if there's no hctx preferred
1740
+ * by IO scheduler or if it has requests that bypass the
1741
+ * scheduler.
1742
+ */
1743
+ if (!sq_hctx || sq_hctx == hctx ||
1744
+ !list_empty_careful(&hctx->dispatch))
1745
+ blk_mq_delay_run_hw_queue(hctx, msecs);
1746
+ }
1747
+}
1748
+EXPORT_SYMBOL(blk_mq_delay_run_hw_queues);
14501749
14511750 /**
14521751 * blk_mq_queue_stopped() - check whether one or more hctxs have been stopped
@@ -1551,7 +1850,7 @@
15511850 /*
15521851 * If we are stopped, don't run the queue.
15531852 */
1554
- if (test_bit(BLK_MQ_S_STOPPED, &hctx->state))
1853
+ if (blk_mq_hctx_stopped(hctx))
15551854 return;
15561855
15571856 __blk_mq_run_hw_queue(hctx);
@@ -1562,15 +1861,16 @@
15621861 bool at_head)
15631862 {
15641863 struct blk_mq_ctx *ctx = rq->mq_ctx;
1864
+ enum hctx_type type = hctx->type;
15651865
15661866 lockdep_assert_held(&ctx->lock);
15671867
15681868 trace_block_rq_insert(hctx->queue, rq);
15691869
15701870 if (at_head)
1571
- list_add(&rq->queuelist, &ctx->rq_list);
1871
+ list_add(&rq->queuelist, &ctx->rq_lists[type]);
15721872 else
1573
- list_add_tail(&rq->queuelist, &ctx->rq_list);
1873
+ list_add_tail(&rq->queuelist, &ctx->rq_lists[type]);
15741874 }
15751875
15761876 void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
@@ -1584,17 +1884,25 @@
15841884 blk_mq_hctx_mark_pending(hctx, ctx);
15851885 }
15861886
1587
-/*
1887
+/**
1888
+ * blk_mq_request_bypass_insert - Insert a request at dispatch list.
1889
+ * @rq: Pointer to request to be inserted.
1890
+ * @at_head: true if the request should be inserted at the head of the list.
1891
+ * @run_queue: If we should run the hardware queue after inserting the request.
1892
+ *
15881893 * Should only be used carefully, when the caller knows we want to
15891894 * bypass a potential IO scheduler on the target device.
15901895 */
1591
-void blk_mq_request_bypass_insert(struct request *rq, bool run_queue)
1896
+void blk_mq_request_bypass_insert(struct request *rq, bool at_head,
1897
+ bool run_queue)
15921898 {
1593
- struct blk_mq_ctx *ctx = rq->mq_ctx;
1594
- struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
1899
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
15951900
15961901 spin_lock(&hctx->lock);
1597
- list_add_tail(&rq->queuelist, &hctx->dispatch);
1902
+ if (at_head)
1903
+ list_add(&rq->queuelist, &hctx->dispatch);
1904
+ else
1905
+ list_add_tail(&rq->queuelist, &hctx->dispatch);
15981906 spin_unlock(&hctx->lock);
15991907
16001908 if (run_queue)
@@ -1606,6 +1914,7 @@
16061914
16071915 {
16081916 struct request *rq;
1917
+ enum hctx_type type = hctx->type;
16091918
16101919 /*
16111920 * preemption doesn't flush plug list, so it's possible ctx->cpu is
....@@ -1617,95 +1926,87 @@
16171926 }
16181927
16191928 spin_lock(&ctx->lock);
1620
- list_splice_tail_init(list, &ctx->rq_list);
1929
+ list_splice_tail_init(list, &ctx->rq_lists[type]);
16211930 blk_mq_hctx_mark_pending(hctx, ctx);
16221931 spin_unlock(&ctx->lock);
16231932 }
16241933
1625
-static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
1934
+static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
16261935 {
16271936 struct request *rqa = container_of(a, struct request, queuelist);
16281937 struct request *rqb = container_of(b, struct request, queuelist);
16291938
1630
- return !(rqa->mq_ctx < rqb->mq_ctx ||
1631
- (rqa->mq_ctx == rqb->mq_ctx &&
1632
- blk_rq_pos(rqa) < blk_rq_pos(rqb)));
1939
+ if (rqa->mq_ctx != rqb->mq_ctx)
1940
+ return rqa->mq_ctx > rqb->mq_ctx;
1941
+ if (rqa->mq_hctx != rqb->mq_hctx)
1942
+ return rqa->mq_hctx > rqb->mq_hctx;
1943
+
1944
+ return blk_rq_pos(rqa) > blk_rq_pos(rqb);
16331945 }
16341946
16351947 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
16361948 {
1637
- struct blk_mq_ctx *this_ctx;
1638
- struct request_queue *this_q;
1639
- struct request *rq;
16401949 LIST_HEAD(list);
1641
- LIST_HEAD(ctx_list);
1642
- unsigned int depth;
16431950
1951
+ if (list_empty(&plug->mq_list))
1952
+ return;
16441953 list_splice_init(&plug->mq_list, &list);
16451954
1646
- list_sort(NULL, &list, plug_ctx_cmp);
1955
+ if (plug->rq_count > 2 && plug->multiple_queues)
1956
+ list_sort(NULL, &list, plug_rq_cmp);
16471957
1648
- this_q = NULL;
1649
- this_ctx = NULL;
1650
- depth = 0;
1958
+ plug->rq_count = 0;
16511959
1652
- while (!list_empty(&list)) {
1653
- rq = list_entry_rq(list.next);
1654
- list_del_init(&rq->queuelist);
1655
- BUG_ON(!rq->q);
1656
- if (rq->mq_ctx != this_ctx) {
1657
- if (this_ctx) {
1658
- trace_block_unplug(this_q, depth, !from_schedule);
1659
- blk_mq_sched_insert_requests(this_q, this_ctx,
1660
- &ctx_list,
1661
- from_schedule);
1662
- }
1960
+ do {
1961
+ struct list_head rq_list;
1962
+ struct request *rq, *head_rq = list_entry_rq(list.next);
1963
+ struct list_head *pos = &head_rq->queuelist; /* skip first */
1964
+ struct blk_mq_hw_ctx *this_hctx = head_rq->mq_hctx;
1965
+ struct blk_mq_ctx *this_ctx = head_rq->mq_ctx;
1966
+ unsigned int depth = 1;
16631967
1664
- this_ctx = rq->mq_ctx;
1665
- this_q = rq->q;
1666
- depth = 0;
1968
+ list_for_each_continue(pos, &list) {
1969
+ rq = list_entry_rq(pos);
1970
+ BUG_ON(!rq->q);
1971
+ if (rq->mq_hctx != this_hctx || rq->mq_ctx != this_ctx)
1972
+ break;
1973
+ depth++;
16671974 }
16681975
1669
- depth++;
1670
- list_add_tail(&rq->queuelist, &ctx_list);
1671
- }
1672
-
1673
- /*
1674
- * If 'this_ctx' is set, we know we have entries to complete
1675
- * on 'ctx_list'. Do those.
1676
- */
1677
- if (this_ctx) {
1678
- trace_block_unplug(this_q, depth, !from_schedule);
1679
- blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list,
1976
+ list_cut_before(&rq_list, &list, pos);
1977
+ trace_block_unplug(head_rq->q, depth, !from_schedule);
1978
+ blk_mq_sched_insert_requests(this_hctx, this_ctx, &rq_list,
16801979 from_schedule);
1681
- }
1980
+ } while(!list_empty(&list));
16821981 }
16831982
1684
-static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
1983
+static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
1984
+ unsigned int nr_segs)
16851985 {
1686
- blk_init_request_from_bio(rq, bio);
1986
+ int err;
16871987
1688
- blk_rq_set_rl(rq, blk_get_rl(rq->q, bio));
1988
+ if (bio->bi_opf & REQ_RAHEAD)
1989
+ rq->cmd_flags |= REQ_FAILFAST_MASK;
16891990
1690
- blk_account_io_start(rq, true);
1691
-}
1991
+ rq->__sector = bio->bi_iter.bi_sector;
1992
+ rq->write_hint = bio->bi_write_hint;
1993
+ blk_rq_bio_prep(rq, bio, nr_segs);
16921994
1693
-static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq)
1694
-{
1695
- if (rq->tag != -1)
1696
- return blk_tag_to_qc_t(rq->tag, hctx->queue_num, false);
1995
+ /* This can't fail, since GFP_NOIO includes __GFP_DIRECT_RECLAIM. */
1996
+ err = blk_crypto_rq_bio_prep(rq, bio, GFP_NOIO);
1997
+ WARN_ON_ONCE(err);
16971998
1698
- return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true);
1999
+ blk_account_io_start(rq);
16992000 }
17002001
17012002 static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
17022003 struct request *rq,
1703
- blk_qc_t *cookie)
2004
+ blk_qc_t *cookie, bool last)
17042005 {
17052006 struct request_queue *q = rq->q;
17062007 struct blk_mq_queue_data bd = {
17072008 .rq = rq,
1708
- .last = true,
2009
+ .last = last,
17092010 };
17102011 blk_qc_t new_cookie;
17112012 blk_status_t ret;
....@@ -1740,7 +2041,7 @@
17402041 static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
17412042 struct request *rq,
17422043 blk_qc_t *cookie,
1743
- bool bypass_insert)
2044
+ bool bypass_insert, bool last)
17442045 {
17452046 struct request_queue *q = rq->q;
17462047 bool run_queue = true;
@@ -1761,23 +2062,35 @@
17612062 if (q->elevator && !bypass_insert)
17622063 goto insert;
17632064
1764
- if (!blk_mq_get_dispatch_budget(hctx))
2065
+ if (!blk_mq_get_dispatch_budget(q))
17652066 goto insert;
17662067
17672068 if (!blk_mq_get_driver_tag(rq)) {
1768
- blk_mq_put_dispatch_budget(hctx);
2069
+ blk_mq_put_dispatch_budget(q);
17692070 goto insert;
17702071 }
17712072
1772
- return __blk_mq_issue_directly(hctx, rq, cookie);
2073
+ return __blk_mq_issue_directly(hctx, rq, cookie, last);
17732074 insert:
17742075 if (bypass_insert)
17752076 return BLK_STS_RESOURCE;
17762077
1777
- blk_mq_request_bypass_insert(rq, run_queue);
2078
+ blk_mq_sched_insert_request(rq, false, run_queue, false);
2079
+
17782080 return BLK_STS_OK;
17792081 }
17802082
2083
+/**
2084
+ * blk_mq_try_issue_directly - Try to send a request directly to device driver.
2085
+ * @hctx: Pointer of the associated hardware queue.
2086
+ * @rq: Pointer to request to be sent.
2087
+ * @cookie: Request queue cookie.
2088
+ *
2089
+ * If the device has enough resources to accept a new request now, send the
2090
+ * request directly to the device driver. Else, insert it into the hctx->dispatch
2091
+ * queue, so we can try to send it again in the future. Requests inserted into
2092
+ * this queue have higher priority.
2093
+ */
17812094 static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
17822095 struct request *rq, blk_qc_t *cookie)
17832096 {
....@@ -1788,25 +2101,24 @@
17882101
17892102 hctx_lock(hctx, &srcu_idx);
17902103
1791
- ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false);
2104
+ ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false, true);
17922105 if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
1793
- blk_mq_request_bypass_insert(rq, true);
2106
+ blk_mq_request_bypass_insert(rq, false, true);
17942107 else if (ret != BLK_STS_OK)
17952108 blk_mq_end_request(rq, ret);
17962109
17972110 hctx_unlock(hctx, srcu_idx);
17982111 }
17992112
1800
-blk_status_t blk_mq_request_issue_directly(struct request *rq)
2113
+blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
18012114 {
18022115 blk_status_t ret;
18032116 int srcu_idx;
18042117 blk_qc_t unused_cookie;
1805
- struct blk_mq_ctx *ctx = rq->mq_ctx;
1806
- struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
2118
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
18072119
18082120 hctx_lock(hctx, &srcu_idx);
1809
- ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true);
2121
+ ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true, last);
18102122 hctx_unlock(hctx, srcu_idx);
18112123
18122124 return ret;
....@@ -1815,104 +2127,169 @@
18152127 void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
18162128 struct list_head *list)
18172129 {
2130
+ int queued = 0;
2131
+ int errors = 0;
2132
+
18182133 while (!list_empty(list)) {
18192134 blk_status_t ret;
18202135 struct request *rq = list_first_entry(list, struct request,
18212136 queuelist);
18222137
18232138 list_del_init(&rq->queuelist);
1824
- ret = blk_mq_request_issue_directly(rq);
2139
+ ret = blk_mq_request_issue_directly(rq, list_empty(list));
18252140 if (ret != BLK_STS_OK) {
2141
+ errors++;
18262142 if (ret == BLK_STS_RESOURCE ||
18272143 ret == BLK_STS_DEV_RESOURCE) {
1828
- blk_mq_request_bypass_insert(rq,
2144
+ blk_mq_request_bypass_insert(rq, false,
18292145 list_empty(list));
18302146 break;
18312147 }
18322148 blk_mq_end_request(rq, ret);
1833
- }
2149
+ } else
2150
+ queued++;
2151
+ }
2152
+
2153
+ /*
2154
+ * If we didn't flush the entire list, we could have told
2155
+ * the driver there was more coming, but that turned out to
2156
+ * be a lie.
2157
+ */
2158
+ if ((!list_empty(list) || errors) &&
2159
+ hctx->queue->mq_ops->commit_rqs && queued)
2160
+ hctx->queue->mq_ops->commit_rqs(hctx);
2161
+}
2162
+
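/*
 * Editor's illustrative sketch, not part of this patch: the ->commit_rqs()
 * contract referenced in the comment above is usually implemented by ringing
 * the hardware doorbell once for a whole batch of requests that were queued
 * with bd->last == false. Every name below (example_queue, sq_tail,
 * sq_doorbell, example_commit_rqs) is a hypothetical placeholder, not a real
 * kernel symbol.
 */
struct example_queue {				/* hypothetical per-hctx driver state */
	u32		sq_tail;
	void __iomem	*sq_doorbell;
};

static void example_commit_rqs(struct blk_mq_hw_ctx *hctx)
{
	struct example_queue *eq = hctx->driver_data;

	/* Kick the device once for everything queued since the last doorbell. */
	writel(eq->sq_tail, eq->sq_doorbell);
}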
2163
+static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
2164
+{
2165
+ list_add_tail(&rq->queuelist, &plug->mq_list);
2166
+ plug->rq_count++;
2167
+ if (!plug->multiple_queues && !list_is_singular(&plug->mq_list)) {
2168
+ struct request *tmp;
2169
+
2170
+ tmp = list_first_entry(&plug->mq_list, struct request,
2171
+ queuelist);
2172
+ if (tmp->q != rq->q)
2173
+ plug->multiple_queues = true;
18342174 }
18352175 }
18362176
1837
-static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
2177
+/*
2178
+ * Allow 2x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple
2179
+ * queues. This is important for md arrays to benefit from merging
2180
+ * requests.
2181
+ */
2182
+static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug)
18382183 {
2184
+ if (plug->multiple_queues)
2185
+ return BLK_MAX_REQUEST_COUNT * 2;
2186
+ return BLK_MAX_REQUEST_COUNT;
2187
+}
2188
+
2189
+/**
2190
+ * blk_mq_submit_bio - Create and send a request to block device.
2191
+ * @bio: Bio pointer.
2192
+ *
2193
+ * Builds up a request structure from @q and @bio and sends it to the device. The
2194
+ * request may not be queued directly to hardware if:
2195
+ * * This request can be merged with another one
2196
+ * * We want to place request at plug queue for possible future merging
2197
+ * * There is an IO scheduler active at this queue
2198
+ *
2199
+ * It will not queue the request if there is an error with the bio, or at the
2200
+ * request creation.
2201
+ *
2202
+ * Returns: Request queue cookie.
2203
+ */
2204
+blk_qc_t blk_mq_submit_bio(struct bio *bio)
2205
+{
2206
+ struct request_queue *q = bio->bi_disk->queue;
18392207 const int is_sync = op_is_sync(bio->bi_opf);
18402208 const int is_flush_fua = op_is_flush(bio->bi_opf);
1841
- struct blk_mq_alloc_data data = { .flags = 0 };
2209
+ struct blk_mq_alloc_data data = {
2210
+ .q = q,
2211
+ };
18422212 struct request *rq;
1843
- unsigned int request_count = 0;
18442213 struct blk_plug *plug;
18452214 struct request *same_queue_rq = NULL;
2215
+ unsigned int nr_segs;
18462216 blk_qc_t cookie;
2217
+ blk_status_t ret;
18472218
18482219 blk_queue_bounce(q, &bio);
1849
-
1850
- blk_queue_split(q, &bio);
2220
+ __blk_queue_split(&bio, &nr_segs);
18512221
18522222 if (!bio_integrity_prep(bio))
1853
- return BLK_QC_T_NONE;
2223
+ goto queue_exit;
18542224
18552225 if (!is_flush_fua && !blk_queue_nomerges(q) &&
1856
- blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
1857
- return BLK_QC_T_NONE;
2226
+ blk_attempt_plug_merge(q, bio, nr_segs, &same_queue_rq))
2227
+ goto queue_exit;
18582228
1859
- if (blk_mq_sched_bio_merge(q, bio))
1860
- return BLK_QC_T_NONE;
2229
+ if (blk_mq_sched_bio_merge(q, bio, nr_segs))
2230
+ goto queue_exit;
18612231
1862
- rq_qos_throttle(q, bio, NULL);
2232
+ rq_qos_throttle(q, bio);
18632233
1864
- trace_block_getrq(q, bio, bio->bi_opf);
1865
-
1866
- rq = blk_mq_get_request(q, bio, bio->bi_opf, &data);
2234
+ data.cmd_flags = bio->bi_opf;
2235
+ rq = __blk_mq_alloc_request(&data);
18672236 if (unlikely(!rq)) {
18682237 rq_qos_cleanup(q, bio);
18692238 if (bio->bi_opf & REQ_NOWAIT)
18702239 bio_wouldblock_error(bio);
1871
- return BLK_QC_T_NONE;
2240
+ goto queue_exit;
18722241 }
2242
+
2243
+ trace_block_getrq(q, bio, bio->bi_opf);
18732244
18742245 rq_qos_track(q, rq, bio);
18752246
18762247 cookie = request_to_qc_t(data.hctx, rq);
18772248
1878
- plug = current->plug;
1879
- if (unlikely(is_flush_fua)) {
1880
- blk_mq_put_ctx(data.ctx);
1881
- blk_mq_bio_to_request(rq, bio);
2249
+ blk_mq_bio_to_request(rq, bio, nr_segs);
18822250
1883
- /* bypass scheduler for flush rq */
2251
+ ret = blk_crypto_rq_get_keyslot(rq);
2252
+ if (ret != BLK_STS_OK) {
2253
+ bio->bi_status = ret;
2254
+ bio_endio(bio);
2255
+ blk_mq_free_request(rq);
2256
+ return BLK_QC_T_NONE;
2257
+ }
2258
+
2259
+ plug = blk_mq_plug(q, bio);
2260
+ if (unlikely(is_flush_fua)) {
2261
+ /* Bypass scheduler for flush requests */
18842262 blk_insert_flush(rq);
18852263 blk_mq_run_hw_queue(data.hctx, true);
1886
- } else if (plug && q->nr_hw_queues == 1) {
1887
- struct request *last = NULL;
1888
-
1889
- blk_mq_put_ctx(data.ctx);
1890
- blk_mq_bio_to_request(rq, bio);
1891
-
2264
+ } else if (plug && (q->nr_hw_queues == 1 ||
2265
+ blk_mq_is_sbitmap_shared(rq->mq_hctx->flags) ||
2266
+ q->mq_ops->commit_rqs || !blk_queue_nonrot(q))) {
18922267 /*
1893
- * @request_count may become stale because of schedule
1894
- * out, so check the list again.
2268
+ * Use plugging if we have a ->commit_rqs() hook as well, as
2269
+ * we know the driver uses bd->last in a smart fashion.
2270
+ *
2271
+ * Use normal plugging if this disk is a slow HDD, as sequential
2272
+ * IO may benefit a lot from plug merging.
18952273 */
1896
- if (list_empty(&plug->mq_list))
1897
- request_count = 0;
1898
- else if (blk_queue_nomerges(q))
1899
- request_count = blk_plug_queued_count(q);
2274
+ unsigned int request_count = plug->rq_count;
2275
+ struct request *last = NULL;
19002276
19012277 if (!request_count)
19022278 trace_block_plug(q);
19032279 else
19042280 last = list_entry_rq(plug->mq_list.prev);
19052281
1906
- if (request_count >= BLK_MAX_REQUEST_COUNT || (last &&
2282
+ if (request_count >= blk_plug_max_rq_count(plug) || (last &&
19072283 blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
19082284 blk_flush_plug_list(plug, false);
19092285 trace_block_plug(q);
19102286 }
19112287
1912
- list_add_tail(&rq->queuelist, &plug->mq_list);
2288
+ blk_add_rq_to_plug(plug, rq);
2289
+ } else if (q->elevator) {
2290
+ /* Insert the request at the IO scheduler queue */
2291
+ blk_mq_sched_insert_request(rq, false, true, true);
19132292 } else if (plug && !blk_queue_nomerges(q)) {
1914
- blk_mq_bio_to_request(rq, bio);
1915
-
19162293 /*
19172294 * We do limited plugging. If the bio can be merged, do that.
19182295 * Otherwise the existing request in the plug list will be
....@@ -1922,30 +2299,74 @@
19222299 */
19232300 if (list_empty(&plug->mq_list))
19242301 same_queue_rq = NULL;
1925
- if (same_queue_rq)
2302
+ if (same_queue_rq) {
19262303 list_del_init(&same_queue_rq->queuelist);
1927
- list_add_tail(&rq->queuelist, &plug->mq_list);
1928
-
1929
- blk_mq_put_ctx(data.ctx);
2304
+ plug->rq_count--;
2305
+ }
2306
+ blk_add_rq_to_plug(plug, rq);
2307
+ trace_block_plug(q);
19302308
19312309 if (same_queue_rq) {
1932
- data.hctx = blk_mq_map_queue(q,
1933
- same_queue_rq->mq_ctx->cpu);
2310
+ data.hctx = same_queue_rq->mq_hctx;
2311
+ trace_block_unplug(q, 1, true);
19342312 blk_mq_try_issue_directly(data.hctx, same_queue_rq,
19352313 &cookie);
19362314 }
1937
- } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator &&
1938
- !data.hctx->dispatch_busy)) {
1939
- blk_mq_put_ctx(data.ctx);
1940
- blk_mq_bio_to_request(rq, bio);
2315
+ } else if ((q->nr_hw_queues > 1 && is_sync) ||
2316
+ !data.hctx->dispatch_busy) {
2317
+ /*
2318
+ * There is no scheduler and we can try to send directly
2319
+ * to the hardware.
2320
+ */
19412321 blk_mq_try_issue_directly(data.hctx, rq, &cookie);
19422322 } else {
1943
- blk_mq_put_ctx(data.ctx);
1944
- blk_mq_bio_to_request(rq, bio);
2323
+ /* Default case. */
19452324 blk_mq_sched_insert_request(rq, false, true, true);
19462325 }
19472326
19482327 return cookie;
2328
+queue_exit:
2329
+ blk_queue_exit(q);
2330
+ return BLK_QC_T_NONE;
2331
+}
2332
+
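/*
 * Editor's illustrative sketch, not part of this patch: blk_mq_submit_bio() is
 * not called directly by filesystems or drivers; they build a bio and call
 * submit_bio(), which lands here for blk-mq based disks. example_end_io() and
 * example_read_page() are hypothetical names.
 */
static void example_end_io(struct bio *bio)
{
	/* Runs on completion; a real caller would check bio->bi_status here. */
	bio_put(bio);
}

static void example_read_page(struct block_device *bdev, struct page *page)
{
	struct bio *bio = bio_alloc(GFP_NOIO, 1);	/* room for one bio_vec */

	bio_set_dev(bio, bdev);
	bio->bi_opf = REQ_OP_READ;
	bio->bi_iter.bi_sector = 0;			/* start of the device */
	bio_add_page(bio, page, PAGE_SIZE, 0);
	bio->bi_end_io = example_end_io;

	submit_bio(bio);				/* ends up in blk_mq_submit_bio() */
}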
2333
+static size_t order_to_size(unsigned int order)
2334
+{
2335
+ return (size_t)PAGE_SIZE << order;
2336
+}
2337
+
2338
+/* called before freeing request pool in @tags */
2339
+static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set,
2340
+ struct blk_mq_tags *tags, unsigned int hctx_idx)
2341
+{
2342
+ struct blk_mq_tags *drv_tags = set->tags[hctx_idx];
2343
+ struct page *page;
2344
+ unsigned long flags;
2345
+
2346
+ list_for_each_entry(page, &tags->page_list, lru) {
2347
+ unsigned long start = (unsigned long)page_address(page);
2348
+ unsigned long end = start + order_to_size(page->private);
2349
+ int i;
2350
+
2351
+ for (i = 0; i < set->queue_depth; i++) {
2352
+ struct request *rq = drv_tags->rqs[i];
2353
+ unsigned long rq_addr = (unsigned long)rq;
2354
+
2355
+ if (rq_addr >= start && rq_addr < end) {
2356
+ WARN_ON_ONCE(refcount_read(&rq->ref) != 0);
2357
+ cmpxchg(&drv_tags->rqs[i], rq, NULL);
2358
+ }
2359
+ }
2360
+ }
2361
+
2362
+ /*
2363
+ * Wait until all pending iterations are done.
2364
+ *
2365
+ * Request reference is cleared and it is guaranteed to be observed
2366
+ * after the ->lock is released.
2367
+ */
2368
+ spin_lock_irqsave(&drv_tags->lock, flags);
2369
+ spin_unlock_irqrestore(&drv_tags->lock, flags);
19492370 }
19502371
19512372 void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
....@@ -1966,42 +2387,44 @@
19662387 }
19672388 }
19682389
2390
+ blk_mq_clear_rq_mapping(set, tags, hctx_idx);
2391
+
19692392 while (!list_empty(&tags->page_list)) {
19702393 page = list_first_entry(&tags->page_list, struct page, lru);
19712394 list_del_init(&page->lru);
19722395 /*
19732396 * Remove kmemleak object previously allocated in
1974
- * blk_mq_init_rq_map().
2397
+ * blk_mq_alloc_rqs().
19752398 */
19762399 kmemleak_free(page_address(page));
19772400 __free_pages(page, page->private);
19782401 }
19792402 }
19802403
1981
-void blk_mq_free_rq_map(struct blk_mq_tags *tags)
2404
+void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags)
19822405 {
19832406 kfree(tags->rqs);
19842407 tags->rqs = NULL;
19852408 kfree(tags->static_rqs);
19862409 tags->static_rqs = NULL;
19872410
1988
- blk_mq_free_tags(tags);
2411
+ blk_mq_free_tags(tags, flags);
19892412 }
19902413
19912414 struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
19922415 unsigned int hctx_idx,
19932416 unsigned int nr_tags,
1994
- unsigned int reserved_tags)
2417
+ unsigned int reserved_tags,
2418
+ unsigned int flags)
19952419 {
19962420 struct blk_mq_tags *tags;
19972421 int node;
19982422
1999
- node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx);
2423
+ node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
20002424 if (node == NUMA_NO_NODE)
20012425 node = set->numa_node;
20022426
2003
- tags = blk_mq_init_tags(nr_tags, reserved_tags, node,
2004
- BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
2427
+ tags = blk_mq_init_tags(nr_tags, reserved_tags, node, flags);
20052428 if (!tags)
20062429 return NULL;
20072430
....@@ -2009,7 +2432,7 @@
20092432 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
20102433 node);
20112434 if (!tags->rqs) {
2012
- blk_mq_free_tags(tags);
2435
+ blk_mq_free_tags(tags, flags);
20132436 return NULL;
20142437 }
20152438
....@@ -2018,16 +2441,11 @@
20182441 node);
20192442 if (!tags->static_rqs) {
20202443 kfree(tags->rqs);
2021
- blk_mq_free_tags(tags);
2444
+ blk_mq_free_tags(tags, flags);
20222445 return NULL;
20232446 }
20242447
20252448 return tags;
2026
-}
2027
-
2028
-static size_t order_to_size(unsigned int order)
2029
-{
2030
- return (size_t)PAGE_SIZE << order;
20312449 }
20322450
20332451 static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
....@@ -2052,7 +2470,7 @@
20522470 size_t rq_size, left;
20532471 int node;
20542472
2055
- node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx);
2473
+ node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
20562474 if (node == NUMA_NO_NODE)
20572475 node = set->numa_node;
20582476
....@@ -2064,6 +2482,7 @@
20642482 */
20652483 rq_size = round_up(sizeof(struct request) + set->cmd_size,
20662484 cache_line_size());
2485
+ trace_android_vh_blk_alloc_rqs(&rq_size, set, tags);
20672486 left = rq_size * depth;
20682487
20692488 for (i = 0; i < depth; ) {
....@@ -2122,6 +2541,86 @@
21222541 return -ENOMEM;
21232542 }
21242543
2544
+struct rq_iter_data {
2545
+ struct blk_mq_hw_ctx *hctx;
2546
+ bool has_rq;
2547
+};
2548
+
2549
+static bool blk_mq_has_request(struct request *rq, void *data, bool reserved)
2550
+{
2551
+ struct rq_iter_data *iter_data = data;
2552
+
2553
+ if (rq->mq_hctx != iter_data->hctx)
2554
+ return true;
2555
+ iter_data->has_rq = true;
2556
+ return false;
2557
+}
2558
+
2559
+static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx)
2560
+{
2561
+ struct blk_mq_tags *tags = hctx->sched_tags ?
2562
+ hctx->sched_tags : hctx->tags;
2563
+ struct rq_iter_data data = {
2564
+ .hctx = hctx,
2565
+ };
2566
+
2567
+ blk_mq_all_tag_iter(tags, blk_mq_has_request, &data);
2568
+ return data.has_rq;
2569
+}
2570
+
2571
+static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu,
2572
+ struct blk_mq_hw_ctx *hctx)
2573
+{
2574
+ if (cpumask_next_and(-1, hctx->cpumask, cpu_online_mask) != cpu)
2575
+ return false;
2576
+ if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids)
2577
+ return false;
2578
+ return true;
2579
+}
2580
+
2581
+static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
2582
+{
2583
+ struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
2584
+ struct blk_mq_hw_ctx, cpuhp_online);
2585
+
2586
+ if (!cpumask_test_cpu(cpu, hctx->cpumask) ||
2587
+ !blk_mq_last_cpu_in_hctx(cpu, hctx))
2588
+ return 0;
2589
+
2590
+ /*
2591
+ * Prevent new request from being allocated on the current hctx.
2592
+ *
2593
+ * The smp_mb__after_atomic() Pairs with the implied barrier in
2594
+ * test_and_set_bit_lock in sbitmap_get(). Ensures the inactive flag is
2595
+ * seen once we return from the tag allocator.
2596
+ */
2597
+ set_bit(BLK_MQ_S_INACTIVE, &hctx->state);
2598
+ smp_mb__after_atomic();
2599
+
2600
+ /*
2601
+ * Try to grab a reference to the queue and wait for any outstanding
2602
+ * requests. If we could not grab a reference the queue has been
2603
+ * frozen and there are no requests.
2604
+ */
2605
+ if (percpu_ref_tryget(&hctx->queue->q_usage_counter)) {
2606
+ while (blk_mq_hctx_has_requests(hctx))
2607
+ msleep(5);
2608
+ percpu_ref_put(&hctx->queue->q_usage_counter);
2609
+ }
2610
+
2611
+ return 0;
2612
+}
2613
+
2614
+static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
2615
+{
2616
+ struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
2617
+ struct blk_mq_hw_ctx, cpuhp_online);
2618
+
2619
+ if (cpumask_test_cpu(cpu, hctx->cpumask))
2620
+ clear_bit(BLK_MQ_S_INACTIVE, &hctx->state);
2621
+ return 0;
2622
+}
2623
+
21252624 /*
21262625 * 'cpu' is going away. splice any existing rq_list entries from this
21272626 * software queue to the hw queue dispatch list, and ensure that it
....@@ -2132,13 +2631,18 @@
21322631 struct blk_mq_hw_ctx *hctx;
21332632 struct blk_mq_ctx *ctx;
21342633 LIST_HEAD(tmp);
2634
+ enum hctx_type type;
21352635
21362636 hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
2637
+ if (!cpumask_test_cpu(cpu, hctx->cpumask))
2638
+ return 0;
2639
+
21372640 ctx = __blk_mq_get_ctx(hctx->queue, cpu);
2641
+ type = hctx->type;
21382642
21392643 spin_lock(&ctx->lock);
2140
- if (!list_empty(&ctx->rq_list)) {
2141
- list_splice_init(&ctx->rq_list, &tmp);
2644
+ if (!list_empty(&ctx->rq_lists[type])) {
2645
+ list_splice_init(&ctx->rq_lists[type], &tmp);
21422646 blk_mq_hctx_clear_pending(hctx, ctx);
21432647 }
21442648 spin_unlock(&ctx->lock);
....@@ -2156,8 +2660,40 @@
21562660
21572661 static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
21582662 {
2663
+ if (!(hctx->flags & BLK_MQ_F_STACKING))
2664
+ cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
2665
+ &hctx->cpuhp_online);
21592666 cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD,
21602667 &hctx->cpuhp_dead);
2668
+}
2669
+
2670
+/*
2671
+ * Before freeing hw queue, clearing the flush request reference in
2672
+ * tags->rqs[] for avoiding potential UAF.
2673
+ */
2674
+static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags,
2675
+ unsigned int queue_depth, struct request *flush_rq)
2676
+{
2677
+ int i;
2678
+ unsigned long flags;
2679
+
2680
+ /* The hw queue may not be mapped yet */
2681
+ if (!tags)
2682
+ return;
2683
+
2684
+ WARN_ON_ONCE(refcount_read(&flush_rq->ref) != 0);
2685
+
2686
+ for (i = 0; i < queue_depth; i++)
2687
+ cmpxchg(&tags->rqs[i], flush_rq, NULL);
2688
+
2689
+ /*
2690
+ * Wait until all pending iterations are done.
2691
+ *
2692
+ * Request reference is cleared and it is guaranteed to be observed
2693
+ * after the ->lock is released.
2694
+ */
2695
+ spin_lock_irqsave(&tags->lock, flags);
2696
+ spin_unlock_irqrestore(&tags->lock, flags);
21612697 }
21622698
21632699 /* hctx->ctxs will be freed in queue's release handler */
....@@ -2165,18 +2701,24 @@
21652701 struct blk_mq_tag_set *set,
21662702 struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
21672703 {
2168
- blk_mq_debugfs_unregister_hctx(hctx);
2704
+ struct request *flush_rq = hctx->fq->flush_rq;
21692705
21702706 if (blk_mq_hw_queue_mapped(hctx))
21712707 blk_mq_tag_idle(hctx);
21722708
2709
+ blk_mq_clear_flush_rq_mapping(set->tags[hctx_idx],
2710
+ set->queue_depth, flush_rq);
21732711 if (set->ops->exit_request)
2174
- set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx);
2712
+ set->ops->exit_request(set, flush_rq, hctx_idx);
21752713
21762714 if (set->ops->exit_hctx)
21772715 set->ops->exit_hctx(hctx, hctx_idx);
21782716
21792717 blk_mq_remove_cpuhp(hctx);
2718
+
2719
+ spin_lock(&q->unused_hctx_lock);
2720
+ list_add(&hctx->hctx_list, &q->unused_hctx_list);
2721
+ spin_unlock(&q->unused_hctx_lock);
21802722 }
21812723
21822724 static void blk_mq_exit_hw_queues(struct request_queue *q,
....@@ -2188,112 +2730,160 @@
21882730 queue_for_each_hw_ctx(q, hctx, i) {
21892731 if (i == nr_queue)
21902732 break;
2733
+ blk_mq_debugfs_unregister_hctx(hctx);
21912734 blk_mq_exit_hctx(q, set, hctx, i);
21922735 }
2736
+}
2737
+
2738
+static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
2739
+{
2740
+ int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
2741
+
2742
+ BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu),
2743
+ __alignof__(struct blk_mq_hw_ctx)) !=
2744
+ sizeof(struct blk_mq_hw_ctx));
2745
+
2746
+ if (tag_set->flags & BLK_MQ_F_BLOCKING)
2747
+ hw_ctx_size += sizeof(struct srcu_struct);
2748
+
2749
+ return hw_ctx_size;
21932750 }
21942751
21952752 static int blk_mq_init_hctx(struct request_queue *q,
21962753 struct blk_mq_tag_set *set,
21972754 struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
21982755 {
2199
- int node;
2756
+ hctx->queue_num = hctx_idx;
22002757
2201
- node = hctx->numa_node;
2758
+ if (!(hctx->flags & BLK_MQ_F_STACKING))
2759
+ cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
2760
+ &hctx->cpuhp_online);
2761
+ cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
2762
+
2763
+ hctx->tags = set->tags[hctx_idx];
2764
+
2765
+ if (set->ops->init_hctx &&
2766
+ set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
2767
+ goto unregister_cpu_notifier;
2768
+
2769
+ if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx,
2770
+ hctx->numa_node))
2771
+ goto exit_hctx;
2772
+ return 0;
2773
+
2774
+ exit_hctx:
2775
+ if (set->ops->exit_hctx)
2776
+ set->ops->exit_hctx(hctx, hctx_idx);
2777
+ unregister_cpu_notifier:
2778
+ blk_mq_remove_cpuhp(hctx);
2779
+ return -1;
2780
+}
2781
+
2782
+static struct blk_mq_hw_ctx *
2783
+blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
2784
+ int node)
2785
+{
2786
+ struct blk_mq_hw_ctx *hctx;
2787
+ gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;
2788
+
2789
+ hctx = kzalloc_node(blk_mq_hw_ctx_size(set), gfp, node);
2790
+ if (!hctx)
2791
+ goto fail_alloc_hctx;
2792
+
2793
+ if (!zalloc_cpumask_var_node(&hctx->cpumask, gfp, node))
2794
+ goto free_hctx;
2795
+
2796
+ atomic_set(&hctx->nr_active, 0);
22022797 if (node == NUMA_NO_NODE)
2203
- node = hctx->numa_node = set->numa_node;
2798
+ node = set->numa_node;
2799
+ hctx->numa_node = node;
22042800
22052801 INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
22062802 spin_lock_init(&hctx->lock);
22072803 INIT_LIST_HEAD(&hctx->dispatch);
22082804 hctx->queue = q;
2209
- hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED;
2805
+ hctx->flags = set->flags & ~BLK_MQ_F_TAG_QUEUE_SHARED;
22102806
2211
- cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
2212
-
2213
- hctx->tags = set->tags[hctx_idx];
2807
+ INIT_LIST_HEAD(&hctx->hctx_list);
22142808
22152809 /*
22162810 * Allocate space for all possible cpus to avoid allocation at
22172811 * runtime
22182812 */
22192813 hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *),
2220
- GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node);
2814
+ gfp, node);
22212815 if (!hctx->ctxs)
2222
- goto unregister_cpu_notifier;
2816
+ goto free_cpumask;
22232817
22242818 if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8),
2225
- GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node))
2819
+ gfp, node))
22262820 goto free_ctxs;
2227
-
22282821 hctx->nr_ctx = 0;
22292822
22302823 spin_lock_init(&hctx->dispatch_wait_lock);
22312824 init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
22322825 INIT_LIST_HEAD(&hctx->dispatch_wait.entry);
22332826
2234
- if (set->ops->init_hctx &&
2235
- set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
2236
- goto free_bitmap;
2237
-
2238
- hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size,
2239
- GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY);
2827
+ hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp);
22402828 if (!hctx->fq)
2241
- goto exit_hctx;
2242
-
2243
- if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, node))
2244
- goto free_fq;
2829
+ goto free_bitmap;
22452830
22462831 if (hctx->flags & BLK_MQ_F_BLOCKING)
22472832 init_srcu_struct(hctx->srcu);
2833
+ blk_mq_hctx_kobj_init(hctx);
22482834
2249
- blk_mq_debugfs_register_hctx(q, hctx);
2835
+ return hctx;
22502836
2251
- return 0;
2252
-
2253
- free_fq:
2254
- blk_free_flush_queue(hctx->fq);
2255
- exit_hctx:
2256
- if (set->ops->exit_hctx)
2257
- set->ops->exit_hctx(hctx, hctx_idx);
22582837 free_bitmap:
22592838 sbitmap_free(&hctx->ctx_map);
22602839 free_ctxs:
22612840 kfree(hctx->ctxs);
2262
- unregister_cpu_notifier:
2263
- blk_mq_remove_cpuhp(hctx);
2264
- return -1;
2841
+ free_cpumask:
2842
+ free_cpumask_var(hctx->cpumask);
2843
+ free_hctx:
2844
+ kfree(hctx);
2845
+ fail_alloc_hctx:
2846
+ return NULL;
22652847 }
22662848
22672849 static void blk_mq_init_cpu_queues(struct request_queue *q,
22682850 unsigned int nr_hw_queues)
22692851 {
2270
- unsigned int i;
2852
+ struct blk_mq_tag_set *set = q->tag_set;
2853
+ unsigned int i, j;
22712854
22722855 for_each_possible_cpu(i) {
22732856 struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
22742857 struct blk_mq_hw_ctx *hctx;
2858
+ int k;
22752859
22762860 __ctx->cpu = i;
22772861 spin_lock_init(&__ctx->lock);
2278
- INIT_LIST_HEAD(&__ctx->rq_list);
2862
+ for (k = HCTX_TYPE_DEFAULT; k < HCTX_MAX_TYPES; k++)
2863
+ INIT_LIST_HEAD(&__ctx->rq_lists[k]);
2864
+
22792865 __ctx->queue = q;
22802866
22812867 /*
22822868 * Set local node, IFF we have more than one hw queue. If
22832869 * not, we remain on the home node of the device
22842870 */
2285
- hctx = blk_mq_map_queue(q, i);
2286
- if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
2287
- hctx->numa_node = local_memory_node(cpu_to_node(i));
2871
+ for (j = 0; j < set->nr_maps; j++) {
2872
+ hctx = blk_mq_map_queue_type(q, j, i);
2873
+ if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
2874
+ hctx->numa_node = cpu_to_node(i);
2875
+ }
22882876 }
22892877 }
22902878
2291
-static bool __blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, int hctx_idx)
2879
+static bool __blk_mq_alloc_map_and_request(struct blk_mq_tag_set *set,
2880
+ int hctx_idx)
22922881 {
2882
+ unsigned int flags = set->flags;
22932883 int ret = 0;
22942884
22952885 set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx,
2296
- set->queue_depth, set->reserved_tags);
2886
+ set->queue_depth, set->reserved_tags, flags);
22972887 if (!set->tags[hctx_idx])
22982888 return false;
22992889
....@@ -2302,7 +2892,7 @@
23022892 if (!ret)
23032893 return true;
23042894
2305
- blk_mq_free_rq_map(set->tags[hctx_idx]);
2895
+ blk_mq_free_rq_map(set->tags[hctx_idx], flags);
23062896 set->tags[hctx_idx] = NULL;
23072897 return false;
23082898 }
....@@ -2310,16 +2900,18 @@
23102900 static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
23112901 unsigned int hctx_idx)
23122902 {
2313
- if (set->tags[hctx_idx]) {
2903
+ unsigned int flags = set->flags;
2904
+
2905
+ if (set->tags && set->tags[hctx_idx]) {
23142906 blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
2315
- blk_mq_free_rq_map(set->tags[hctx_idx]);
2907
+ blk_mq_free_rq_map(set->tags[hctx_idx], flags);
23162908 set->tags[hctx_idx] = NULL;
23172909 }
23182910 }
23192911
23202912 static void blk_mq_map_swqueue(struct request_queue *q)
23212913 {
2322
- unsigned int i, hctx_idx;
2914
+ unsigned int i, j, hctx_idx;
23232915 struct blk_mq_hw_ctx *hctx;
23242916 struct blk_mq_ctx *ctx;
23252917 struct blk_mq_tag_set *set = q->tag_set;
....@@ -2336,25 +2928,52 @@
23362928 * If the cpu isn't present, the cpu is mapped to first hctx.
23372929 */
23382930 for_each_possible_cpu(i) {
2339
- hctx_idx = q->mq_map[i];
2340
- /* unmapped hw queue can be remapped after CPU topo changed */
2341
- if (!set->tags[hctx_idx] &&
2342
- !__blk_mq_alloc_rq_map(set, hctx_idx)) {
2343
- /*
2344
- * If tags initialization fail for some hctx,
2345
- * that hctx won't be brought online. In this
2346
- * case, remap the current ctx to hctx[0] which
2347
- * is guaranteed to always have tags allocated
2348
- */
2349
- q->mq_map[i] = 0;
2350
- }
23512931
23522932 ctx = per_cpu_ptr(q->queue_ctx, i);
2353
- hctx = blk_mq_map_queue(q, i);
2933
+ for (j = 0; j < set->nr_maps; j++) {
2934
+ if (!set->map[j].nr_queues) {
2935
+ ctx->hctxs[j] = blk_mq_map_queue_type(q,
2936
+ HCTX_TYPE_DEFAULT, i);
2937
+ continue;
2938
+ }
2939
+ hctx_idx = set->map[j].mq_map[i];
2940
+ /* unmapped hw queue can be remapped after CPU topo changed */
2941
+ if (!set->tags[hctx_idx] &&
2942
+ !__blk_mq_alloc_map_and_request(set, hctx_idx)) {
2943
+ /*
2944
+ * If tags initialization fail for some hctx,
2945
+ * that hctx won't be brought online. In this
2946
+ * case, remap the current ctx to hctx[0] which
2947
+ * is guaranteed to always have tags allocated
2948
+ */
2949
+ set->map[j].mq_map[i] = 0;
2950
+ }
23542951
2355
- cpumask_set_cpu(i, hctx->cpumask);
2356
- ctx->index_hw = hctx->nr_ctx;
2357
- hctx->ctxs[hctx->nr_ctx++] = ctx;
2952
+ hctx = blk_mq_map_queue_type(q, j, i);
2953
+ ctx->hctxs[j] = hctx;
2954
+ /*
2955
+ * If the CPU is already set in the mask, then we've
2956
+ * mapped this one already. This can happen if
2957
+ * devices share queues across queue maps.
2958
+ */
2959
+ if (cpumask_test_cpu(i, hctx->cpumask))
2960
+ continue;
2961
+
2962
+ cpumask_set_cpu(i, hctx->cpumask);
2963
+ hctx->type = j;
2964
+ ctx->index_hw[hctx->type] = hctx->nr_ctx;
2965
+ hctx->ctxs[hctx->nr_ctx++] = ctx;
2966
+
2967
+ /*
2968
+ * If the nr_ctx type overflows, we have exceeded the
2969
+ * number of sw queues we can support.
2970
+ */
2971
+ BUG_ON(!hctx->nr_ctx);
2972
+ }
2973
+
2974
+ for (; j < HCTX_MAX_TYPES; j++)
2975
+ ctx->hctxs[j] = blk_mq_map_queue_type(q,
2976
+ HCTX_TYPE_DEFAULT, i);
23582977 }
23592978
23602979 queue_for_each_hw_ctx(q, hctx, i) {
....@@ -2403,14 +3022,14 @@
24033022
24043023 queue_for_each_hw_ctx(q, hctx, i) {
24053024 if (shared)
2406
- hctx->flags |= BLK_MQ_F_TAG_SHARED;
3025
+ hctx->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
24073026 else
2408
- hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
3027
+ hctx->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
24093028 }
24103029 }
24113030
2412
-static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set,
2413
- bool shared)
3031
+static void blk_mq_update_tag_set_shared(struct blk_mq_tag_set *set,
3032
+ bool shared)
24143033 {
24153034 struct request_queue *q;
24163035
....@@ -2428,12 +3047,12 @@
24283047 struct blk_mq_tag_set *set = q->tag_set;
24293048
24303049 mutex_lock(&set->tag_list_lock);
2431
- list_del_rcu(&q->tag_set_list);
3050
+ list_del(&q->tag_set_list);
24323051 if (list_is_singular(&set->tag_list)) {
24333052 /* just transitioned to unshared */
2434
- set->flags &= ~BLK_MQ_F_TAG_SHARED;
3053
+ set->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
24353054 /* update existing queue */
2436
- blk_mq_update_tag_set_depth(set, false);
3055
+ blk_mq_update_tag_set_shared(set, false);
24373056 }
24383057 mutex_unlock(&set->tag_list_lock);
24393058 INIT_LIST_HEAD(&q->tag_set_list);
....@@ -2442,24 +3061,50 @@
24423061 static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
24433062 struct request_queue *q)
24443063 {
2445
- q->tag_set = set;
2446
-
24473064 mutex_lock(&set->tag_list_lock);
24483065
24493066 /*
24503067 * Check to see if we're transitioning to shared (from 1 to 2 queues).
24513068 */
24523069 if (!list_empty(&set->tag_list) &&
2453
- !(set->flags & BLK_MQ_F_TAG_SHARED)) {
2454
- set->flags |= BLK_MQ_F_TAG_SHARED;
3070
+ !(set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
3071
+ set->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
24553072 /* update existing queue */
2456
- blk_mq_update_tag_set_depth(set, true);
3073
+ blk_mq_update_tag_set_shared(set, true);
24573074 }
2458
- if (set->flags & BLK_MQ_F_TAG_SHARED)
3075
+ if (set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
24593076 queue_set_hctx_shared(q, true);
2460
- list_add_tail_rcu(&q->tag_set_list, &set->tag_list);
3077
+ list_add_tail(&q->tag_set_list, &set->tag_list);
24613078
24623079 mutex_unlock(&set->tag_list_lock);
3080
+}
3081
+
3082
+/* All allocations will be freed in release handler of q->mq_kobj */
3083
+static int blk_mq_alloc_ctxs(struct request_queue *q)
3084
+{
3085
+ struct blk_mq_ctxs *ctxs;
3086
+ int cpu;
3087
+
3088
+ ctxs = kzalloc(sizeof(*ctxs), GFP_KERNEL);
3089
+ if (!ctxs)
3090
+ return -ENOMEM;
3091
+
3092
+ ctxs->queue_ctx = alloc_percpu(struct blk_mq_ctx);
3093
+ if (!ctxs->queue_ctx)
3094
+ goto fail;
3095
+
3096
+ for_each_possible_cpu(cpu) {
3097
+ struct blk_mq_ctx *ctx = per_cpu_ptr(ctxs->queue_ctx, cpu);
3098
+ ctx->ctxs = ctxs;
3099
+ }
3100
+
3101
+ q->mq_kobj = &ctxs->kobj;
3102
+ q->queue_ctx = ctxs->queue_ctx;
3103
+
3104
+ return 0;
3105
+ fail:
3106
+ kfree(ctxs);
3107
+ return -ENOMEM;
24633108 }
24643109
24653110 /*
....@@ -2470,17 +3115,17 @@
24703115 */
24713116 void blk_mq_release(struct request_queue *q)
24723117 {
2473
- struct blk_mq_hw_ctx *hctx;
2474
- unsigned int i;
3118
+ struct blk_mq_hw_ctx *hctx, *next;
3119
+ int i;
24753120
2476
- /* hctx kobj stays in hctx */
2477
- queue_for_each_hw_ctx(q, hctx, i) {
2478
- if (!hctx)
2479
- continue;
3121
+ queue_for_each_hw_ctx(q, hctx, i)
3122
+ WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list));
3123
+
3124
+ /* all hctx are in .unused_hctx_list now */
3125
+ list_for_each_entry_safe(hctx, next, &q->unused_hctx_list, hctx_list) {
3126
+ list_del_init(&hctx->hctx_list);
24803127 kobject_put(&hctx->kobj);
24813128 }
2482
-
2483
- q->mq_map = NULL;
24843129
24853130 kfree(q->queue_hw_ctx);
24863131
....@@ -2489,102 +3134,184 @@
24893134 * both share lifetime with request queue.
24903135 */
24913136 blk_mq_sysfs_deinit(q);
2492
-
2493
- free_percpu(q->queue_ctx);
24943137 }
24953138
2496
-struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
3139
+struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
3140
+ void *queuedata)
24973141 {
24983142 struct request_queue *uninit_q, *q;
24993143
2500
- uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node, NULL);
3144
+ uninit_q = blk_alloc_queue(set->numa_node);
25013145 if (!uninit_q)
25023146 return ERR_PTR(-ENOMEM);
3147
+ uninit_q->queuedata = queuedata;
25033148
2504
- q = blk_mq_init_allocated_queue(set, uninit_q);
3149
+ /*
3150
+ * Initialize the queue without an elevator. device_add_disk() will do
3151
+ * the initialization.
3152
+ */
3153
+ q = blk_mq_init_allocated_queue(set, uninit_q, false);
25053154 if (IS_ERR(q))
25063155 blk_cleanup_queue(uninit_q);
25073156
25083157 return q;
25093158 }
3159
+EXPORT_SYMBOL_GPL(blk_mq_init_queue_data);
3160
+
3161
+struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
3162
+{
3163
+ return blk_mq_init_queue_data(set, NULL);
3164
+}
25103165 EXPORT_SYMBOL(blk_mq_init_queue);
25113166
2512
-static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
3167
+/*
3168
+ * Helper for setting up a queue with mq ops, given queue depth, and
3169
+ * the passed in mq ops flags.
3170
+ */
3171
+struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set,
3172
+ const struct blk_mq_ops *ops,
3173
+ unsigned int queue_depth,
3174
+ unsigned int set_flags)
25133175 {
2514
- int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
3176
+ struct request_queue *q;
3177
+ int ret;
25153178
2516
- BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu),
2517
- __alignof__(struct blk_mq_hw_ctx)) !=
2518
- sizeof(struct blk_mq_hw_ctx));
3179
+ memset(set, 0, sizeof(*set));
3180
+ set->ops = ops;
3181
+ set->nr_hw_queues = 1;
3182
+ set->nr_maps = 1;
3183
+ set->queue_depth = queue_depth;
3184
+ set->numa_node = NUMA_NO_NODE;
3185
+ set->flags = set_flags;
25193186
2520
- if (tag_set->flags & BLK_MQ_F_BLOCKING)
2521
- hw_ctx_size += sizeof(struct srcu_struct);
3187
+ ret = blk_mq_alloc_tag_set(set);
3188
+ if (ret)
3189
+ return ERR_PTR(ret);
25223190
2523
- return hw_ctx_size;
3191
+ q = blk_mq_init_queue(set);
3192
+ if (IS_ERR(q)) {
3193
+ blk_mq_free_tag_set(set);
3194
+ return q;
3195
+ }
3196
+
3197
+ return q;
3198
+}
3199
+EXPORT_SYMBOL(blk_mq_init_sq_queue);
3200
+
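/*
 * Editor's illustrative sketch, not part of this patch: how a simple
 * single-queue driver might use blk_mq_init_sq_queue() above. my_tag_set,
 * my_mq_ops, my_queue_rq and my_create_queue are hypothetical driver symbols.
 */
static struct blk_mq_tag_set my_tag_set;

static blk_status_t my_queue_rq(struct blk_mq_hw_ctx *hctx,
				const struct blk_mq_queue_data *bd)
{
	/* A real driver would hand bd->rq to its hardware here. */
	blk_mq_start_request(bd->rq);
	return BLK_STS_OK;
}

static const struct blk_mq_ops my_mq_ops = {
	.queue_rq	= my_queue_rq,
};

static struct request_queue *my_create_queue(void)
{
	/* One hw queue, depth 128, allow bio merging; caller checks IS_ERR(). */
	return blk_mq_init_sq_queue(&my_tag_set, &my_mq_ops, 128,
				    BLK_MQ_F_SHOULD_MERGE);
}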
3201
+static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx(
3202
+ struct blk_mq_tag_set *set, struct request_queue *q,
3203
+ int hctx_idx, int node)
3204
+{
3205
+ struct blk_mq_hw_ctx *hctx = NULL, *tmp;
3206
+
3207
+ /* reuse dead hctx first */
3208
+ spin_lock(&q->unused_hctx_lock);
3209
+ list_for_each_entry(tmp, &q->unused_hctx_list, hctx_list) {
3210
+ if (tmp->numa_node == node) {
3211
+ hctx = tmp;
3212
+ break;
3213
+ }
3214
+ }
3215
+ if (hctx)
3216
+ list_del_init(&hctx->hctx_list);
3217
+ spin_unlock(&q->unused_hctx_lock);
3218
+
3219
+ if (!hctx)
3220
+ hctx = blk_mq_alloc_hctx(q, set, node);
3221
+ if (!hctx)
3222
+ goto fail;
3223
+
3224
+ if (blk_mq_init_hctx(q, set, hctx, hctx_idx))
3225
+ goto free_hctx;
3226
+
3227
+ return hctx;
3228
+
3229
+ free_hctx:
3230
+ kobject_put(&hctx->kobj);
3231
+ fail:
3232
+ return NULL;
25243233 }
25253234
25263235 static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
25273236 struct request_queue *q)
25283237 {
2529
- int i, j;
3238
+ int i, j, end;
25303239 struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;
25313240
2532
- blk_mq_sysfs_unregister(q);
3241
+ if (q->nr_hw_queues < set->nr_hw_queues) {
3242
+ struct blk_mq_hw_ctx **new_hctxs;
3243
+
3244
+ new_hctxs = kcalloc_node(set->nr_hw_queues,
3245
+ sizeof(*new_hctxs), GFP_KERNEL,
3246
+ set->numa_node);
3247
+ if (!new_hctxs)
3248
+ return;
3249
+ if (hctxs)
3250
+ memcpy(new_hctxs, hctxs, q->nr_hw_queues *
3251
+ sizeof(*hctxs));
3252
+ q->queue_hw_ctx = new_hctxs;
3253
+ kfree(hctxs);
3254
+ hctxs = new_hctxs;
3255
+ }
25333256
25343257 /* protect against switching io scheduler */
25353258 mutex_lock(&q->sysfs_lock);
25363259 for (i = 0; i < set->nr_hw_queues; i++) {
25373260 int node;
3261
+ struct blk_mq_hw_ctx *hctx;
25383262
2539
- if (hctxs[i])
3263
+ node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], i);
3264
+ /*
3265
+ * If the hw queue has been mapped to another numa node,
3266
+ * we need to realloc the hctx. If allocation fails, fall back
3267
+ * to the previous one.
3268
+ */
3269
+ if (hctxs[i] && (hctxs[i]->numa_node == node))
25403270 continue;
25413271
2542
- node = blk_mq_hw_queue_to_node(q->mq_map, i);
2543
- hctxs[i] = kzalloc_node(blk_mq_hw_ctx_size(set),
2544
- GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
2545
- node);
2546
- if (!hctxs[i])
2547
- break;
2548
-
2549
- if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask,
2550
- GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
2551
- node)) {
2552
- kfree(hctxs[i]);
2553
- hctxs[i] = NULL;
2554
- break;
3272
+ hctx = blk_mq_alloc_and_init_hctx(set, q, i, node);
3273
+ if (hctx) {
3274
+ if (hctxs[i])
3275
+ blk_mq_exit_hctx(q, set, hctxs[i], i);
3276
+ hctxs[i] = hctx;
3277
+ } else {
3278
+ if (hctxs[i])
3279
+ pr_warn("Allocate new hctx on node %d fails,\
3280
+ fallback to previous one on node %d\n",
3281
+ node, hctxs[i]->numa_node);
3282
+ else
3283
+ break;
25553284 }
2556
-
2557
- atomic_set(&hctxs[i]->nr_active, 0);
2558
- hctxs[i]->numa_node = node;
2559
- hctxs[i]->queue_num = i;
2560
-
2561
- if (blk_mq_init_hctx(q, set, hctxs[i], i)) {
2562
- free_cpumask_var(hctxs[i]->cpumask);
2563
- kfree(hctxs[i]);
2564
- hctxs[i] = NULL;
2565
- break;
2566
- }
2567
- blk_mq_hctx_kobj_init(hctxs[i]);
25683285 }
2569
- for (j = i; j < q->nr_hw_queues; j++) {
3286
+ /*
3287
+ * Increasing nr_hw_queues fails. Free the newly allocated
3288
+ * hctxs and keep the previous q->nr_hw_queues.
3289
+ */
3290
+ if (i != set->nr_hw_queues) {
3291
+ j = q->nr_hw_queues;
3292
+ end = i;
3293
+ } else {
3294
+ j = i;
3295
+ end = q->nr_hw_queues;
3296
+ q->nr_hw_queues = set->nr_hw_queues;
3297
+ }
3298
+
3299
+ for (; j < end; j++) {
25703300 struct blk_mq_hw_ctx *hctx = hctxs[j];
25713301
25723302 if (hctx) {
25733303 if (hctx->tags)
25743304 blk_mq_free_map_and_requests(set, j);
25753305 blk_mq_exit_hctx(q, set, hctx, j);
2576
- kobject_put(&hctx->kobj);
25773306 hctxs[j] = NULL;
2578
-
25793307 }
25803308 }
2581
- q->nr_hw_queues = i;
25823309 mutex_unlock(&q->sysfs_lock);
2583
- blk_mq_sysfs_register(q);
25843310 }
25853311
25863312 struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
2587
- struct request_queue *q)
3313
+ struct request_queue *q,
3314
+ bool elevator_init)
25883315 {
25893316 /* mark the queue as mq asap */
25903317 q->mq_ops = set->ops;
....@@ -2595,19 +3322,14 @@
25953322 if (!q->poll_cb)
25963323 goto err_exit;
25973324
2598
- q->queue_ctx = alloc_percpu(struct blk_mq_ctx);
2599
- if (!q->queue_ctx)
2600
- goto err_exit;
3325
+ if (blk_mq_alloc_ctxs(q))
3326
+ goto err_poll;
26013327
26023328 /* init q->mq_kobj and sw queues' kobjects */
26033329 blk_mq_sysfs_init(q);
26043330
2605
- q->queue_hw_ctx = kcalloc_node(nr_cpu_ids, sizeof(*(q->queue_hw_ctx)),
2606
- GFP_KERNEL, set->numa_node);
2607
- if (!q->queue_hw_ctx)
2608
- goto err_percpu;
2609
-
2610
- q->mq_map = set->mq_map;
3331
+ INIT_LIST_HEAD(&q->unused_hctx_list);
3332
+ spin_lock_init(&q->unused_hctx_lock);
26113333
26123334 blk_mq_realloc_hw_ctxs(set, q);
26133335 if (!q->nr_hw_queues)
....@@ -2616,12 +3338,12 @@
26163338 INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
26173339 blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
26183340
2619
- q->nr_queues = nr_cpu_ids;
3341
+ q->tag_set = set;
26203342
26213343 q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
2622
-
2623
- if (!(set->flags & BLK_MQ_F_SG_MERGE))
2624
- queue_flag_set_unlocked(QUEUE_FLAG_NO_SG_MERGE, q);
3344
+ if (set->nr_maps > HCTX_TYPE_POLL &&
3345
+ set->map[HCTX_TYPE_POLL].nr_queues)
3346
+ blk_queue_flag_set(QUEUE_FLAG_POLL, q);
26253347
26263348 q->sg_reserved_size = INT_MAX;
26273349
....@@ -2629,41 +3351,29 @@
26293351 INIT_LIST_HEAD(&q->requeue_list);
26303352 spin_lock_init(&q->requeue_lock);
26313353
2632
- blk_queue_make_request(q, blk_mq_make_request);
2633
- if (q->mq_ops->poll)
2634
- q->poll_fn = blk_mq_poll;
2635
-
2636
- /*
2637
- * Do this after blk_queue_make_request() overrides it...
2638
- */
26393354 q->nr_requests = set->queue_depth;
26403355
26413356 /*
26423357 * Default to classic polling
26433358 */
2644
- q->poll_nsec = -1;
2645
-
2646
- if (set->ops->complete)
2647
- blk_queue_softirq_done(q, set->ops->complete);
3359
+ q->poll_nsec = BLK_MQ_POLL_CLASSIC;
26483360
26493361 blk_mq_init_cpu_queues(q, set->nr_hw_queues);
26503362 blk_mq_add_queue_tag_set(set, q);
26513363 blk_mq_map_swqueue(q);
26523364
2653
- if (!(set->flags & BLK_MQ_F_NO_SCHED)) {
2654
- int ret;
2655
-
2656
- ret = elevator_init_mq(q);
2657
- if (ret)
2658
- return ERR_PTR(ret);
2659
- }
3365
+ if (elevator_init)
3366
+ elevator_init_mq(q);
26603367
26613368 return q;
26623369
26633370 err_hctxs:
26643371 kfree(q->queue_hw_ctx);
2665
-err_percpu:
2666
- free_percpu(q->queue_ctx);
3372
+ q->nr_hw_queues = 0;
3373
+ blk_mq_sysfs_deinit(q);
3374
+err_poll:
3375
+ blk_stat_free_callback(q->poll_cb);
3376
+ q->poll_cb = NULL;
26673377 err_exit:
26683378 q->mq_ops = NULL;
26693379 return ERR_PTR(-ENOMEM);
....@@ -2681,38 +3391,21 @@
26813391 blk_mq_del_queue_tag_set(q);
26823392 }
26833393
2684
-/* Basically redo blk_mq_init_queue with queue frozen */
2685
-static void blk_mq_queue_reinit(struct request_queue *q)
2686
-{
2687
- WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));
2688
-
2689
- blk_mq_debugfs_unregister_hctxs(q);
2690
- blk_mq_sysfs_unregister(q);
2691
-
2692
- /*
2693
- * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
2694
- * we should change hctx numa_node according to the new topology (this
2695
- * involves freeing and re-allocating memory, worth doing?)
2696
- */
2697
- blk_mq_map_swqueue(q);
2698
-
2699
- blk_mq_sysfs_register(q);
2700
- blk_mq_debugfs_register_hctxs(q);
2701
-}
2702
-
27033394 static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
27043395 {
27053396 int i;
27063397
2707
- for (i = 0; i < set->nr_hw_queues; i++)
2708
- if (!__blk_mq_alloc_rq_map(set, i))
3398
+ for (i = 0; i < set->nr_hw_queues; i++) {
3399
+ if (!__blk_mq_alloc_map_and_request(set, i))
27093400 goto out_unwind;
3401
+ cond_resched();
3402
+ }
27103403
27113404 return 0;
27123405
27133406 out_unwind:
27143407 while (--i >= 0)
2715
- blk_mq_free_rq_map(set->tags[i]);
3408
+ blk_mq_free_map_and_requests(set, i);
27163409
27173410 return -ENOMEM;
27183411 }
....@@ -2722,7 +3415,7 @@
27223415 * may reduce the depth asked for, if memory is tight. set->queue_depth
27233416 * will be updated to reflect the allocated depth.
27243417 */
2725
-static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
3418
+static int blk_mq_alloc_map_and_requests(struct blk_mq_tag_set *set)
27263419 {
27273420 unsigned int depth;
27283421 int err;
....@@ -2754,7 +3447,17 @@
27543447
27553448 static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
27563449 {
2757
- if (set->ops->map_queues) {
3450
+ /*
3451
+ * blk_mq_map_queues() and multiple .map_queues() implementations
3452
+ * expect that set->map[HCTX_TYPE_DEFAULT].nr_queues is set to the
3453
+ * number of hardware queues.
3454
+ */
3455
+ if (set->nr_maps == 1)
3456
+ set->map[HCTX_TYPE_DEFAULT].nr_queues = set->nr_hw_queues;
3457
+
3458
+ if (set->ops->map_queues && !is_kdump_kernel()) {
3459
+ int i;
3460
+
27583461 /*
27593462 * transport .map_queues is usually done in the following
27603463 * way:
....@@ -2762,18 +3465,44 @@
27623465 * for (queue = 0; queue < set->nr_hw_queues; queue++) {
27633466 * mask = get_cpu_mask(queue)
27643467 * for_each_cpu(cpu, mask)
2765
- * set->mq_map[cpu] = queue;
3468
+ * set->map[x].mq_map[cpu] = queue;
27663469 * }
27673470 *
27683471 * When we need to remap, the table has to be cleared for
27693472 * killing stale mapping since one CPU may not be mapped
27703473 * to any hw queue.
27713474 */
2772
- blk_mq_clear_mq_map(set);
3475
+ for (i = 0; i < set->nr_maps; i++)
3476
+ blk_mq_clear_mq_map(&set->map[i]);
27733477
27743478 return set->ops->map_queues(set);
2775
- } else
2776
- return blk_mq_map_queues(set);
3479
+ } else {
3480
+ BUG_ON(set->nr_maps > 1);
3481
+ return blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
3482
+ }
3483
+}
3484
+
3485
+static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set,
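/*
 * Editor's illustrative sketch, not part of this patch: a transport
 * ->map_queues() implementation following the pattern spelled out in the
 * comment above. example_get_queue_affinity() is a hypothetical helper that
 * returns the CPU mask a hardware queue's interrupt is affine to.
 */
static int example_map_queues(struct blk_mq_tag_set *set)
{
	struct blk_mq_queue_map *qmap = &set->map[HCTX_TYPE_DEFAULT];
	unsigned int queue, cpu;

	for (queue = 0; queue < set->nr_hw_queues; queue++) {
		const struct cpumask *mask = example_get_queue_affinity(queue);

		for_each_cpu(cpu, mask)
			qmap->mq_map[cpu] = queue;
	}
	return 0;
}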
3486
+ int cur_nr_hw_queues, int new_nr_hw_queues)
3487
+{
3488
+ struct blk_mq_tags **new_tags;
3489
+
3490
+ if (cur_nr_hw_queues >= new_nr_hw_queues)
3491
+ return 0;
3492
+
3493
+ new_tags = kcalloc_node(new_nr_hw_queues, sizeof(struct blk_mq_tags *),
3494
+ GFP_KERNEL, set->numa_node);
3495
+ if (!new_tags)
3496
+ return -ENOMEM;
3497
+
3498
+ if (set->tags)
3499
+ memcpy(new_tags, set->tags, cur_nr_hw_queues *
3500
+ sizeof(*set->tags));
3501
+ kfree(set->tags);
3502
+ set->tags = new_tags;
3503
+ set->nr_hw_queues = new_nr_hw_queues;
3504
+
3505
+ return 0;
27773506 }
27783507
27793508 /*
....@@ -2784,7 +3513,7 @@
27843513 */
27853514 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
27863515 {
2787
- int ret;
3516
+ int i, ret;
27883517
27893518 BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);
27903519
....@@ -2807,6 +3536,11 @@
28073536 set->queue_depth = BLK_MQ_MAX_DEPTH;
28083537 }
28093538
3539
+ if (!set->nr_maps)
3540
+ set->nr_maps = 1;
3541
+ else if (set->nr_maps > HCTX_MAX_TYPES)
3542
+ return -EINVAL;
3543
+
28103544 /*
28113545 * If a crashdump is active, then we are potentially in a very
28123546 * memory constrained environment. Limit us to 1 queue and
....@@ -2814,42 +3548,59 @@
28143548 */
28153549 if (is_kdump_kernel()) {
28163550 set->nr_hw_queues = 1;
3551
+ set->nr_maps = 1;
28173552 set->queue_depth = min(64U, set->queue_depth);
28183553 }
28193554 /*
2820
- * There is no use for more h/w queues than cpus.
3555
+ * There is no use for more h/w queues than cpus if we just have
3556
+ * a single map
28213557 */
2822
- if (set->nr_hw_queues > nr_cpu_ids)
3558
+ if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids)
28233559 set->nr_hw_queues = nr_cpu_ids;
28243560
2825
- set->tags = kcalloc_node(nr_cpu_ids, sizeof(struct blk_mq_tags *),
2826
- GFP_KERNEL, set->numa_node);
2827
- if (!set->tags)
3561
+ if (blk_mq_realloc_tag_set_tags(set, 0, set->nr_hw_queues) < 0)
28283562 return -ENOMEM;
28293563
28303564 ret = -ENOMEM;
2831
- set->mq_map = kcalloc_node(nr_cpu_ids, sizeof(*set->mq_map),
2832
- GFP_KERNEL, set->numa_node);
2833
- if (!set->mq_map)
2834
- goto out_free_tags;
3565
+ for (i = 0; i < set->nr_maps; i++) {
3566
+ set->map[i].mq_map = kcalloc_node(nr_cpu_ids,
3567
+ sizeof(set->map[i].mq_map[0]),
3568
+ GFP_KERNEL, set->numa_node);
3569
+ if (!set->map[i].mq_map)
3570
+ goto out_free_mq_map;
3571
+ set->map[i].nr_queues = is_kdump_kernel() ? 1 : set->nr_hw_queues;
3572
+ }
28353573
28363574 ret = blk_mq_update_queue_map(set);
28373575 if (ret)
28383576 goto out_free_mq_map;
28393577
2840
- ret = blk_mq_alloc_rq_maps(set);
3578
+ ret = blk_mq_alloc_map_and_requests(set);
28413579 if (ret)
28423580 goto out_free_mq_map;
3581
+
3582
+ if (blk_mq_is_sbitmap_shared(set->flags)) {
3583
+ atomic_set(&set->active_queues_shared_sbitmap, 0);
3584
+
3585
+ if (blk_mq_init_shared_sbitmap(set, set->flags)) {
3586
+ ret = -ENOMEM;
3587
+ goto out_free_mq_rq_maps;
3588
+ }
3589
+ }
28433590
28443591 mutex_init(&set->tag_list_lock);
28453592 INIT_LIST_HEAD(&set->tag_list);
28463593
28473594 return 0;
28483595
3596
+out_free_mq_rq_maps:
3597
+ for (i = 0; i < set->nr_hw_queues; i++)
3598
+ blk_mq_free_map_and_requests(set, i);
28493599 out_free_mq_map:
2850
- kfree(set->mq_map);
2851
- set->mq_map = NULL;
2852
-out_free_tags:
3600
+ for (i = 0; i < set->nr_maps; i++) {
3601
+ kfree(set->map[i].mq_map);
3602
+ set->map[i].mq_map = NULL;
3603
+ }
28533604 kfree(set->tags);
28543605 set->tags = NULL;
28553606 return ret;
....@@ -2858,13 +3609,18 @@
28583609
28593610 void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
28603611 {
2861
- int i;
3612
+ int i, j;
28623613
2863
- for (i = 0; i < nr_cpu_ids; i++)
3614
+ for (i = 0; i < set->nr_hw_queues; i++)
28643615 blk_mq_free_map_and_requests(set, i);
28653616
2866
- kfree(set->mq_map);
2867
- set->mq_map = NULL;
3617
+ if (blk_mq_is_sbitmap_shared(set->flags))
3618
+ blk_mq_exit_shared_sbitmap(set);
3619
+
3620
+ for (j = 0; j < set->nr_maps; j++) {
3621
+ kfree(set->map[j].mq_map);
3622
+ set->map[j].mq_map = NULL;
3623
+ }
28683624
28693625 kfree(set->tags);
28703626 set->tags = NULL;
....@@ -2880,6 +3636,9 @@
28803636 if (!set)
28813637 return -EINVAL;
28823638
3639
+ if (q->nr_requests == nr)
3640
+ return 0;
3641
+
28833642 blk_mq_freeze_queue(q);
28843643 blk_mq_quiesce_queue(q);
28853644
....@@ -2894,14 +3653,16 @@
28943653 if (!hctx->sched_tags) {
28953654 ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr,
28963655 false);
3656
+ if (!ret && blk_mq_is_sbitmap_shared(set->flags))
3657
+ blk_mq_tag_resize_shared_sbitmap(set, nr);
28973658 } else {
28983659 ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
28993660 nr, true);
29003661 }
29013662 if (ret)
29023663 break;
2903
- if (q->elevator && q->elevator->type->ops.mq.depth_updated)
2904
- q->elevator->type->ops.mq.depth_updated(hctx);
3664
+ if (q->elevator && q->elevator->type->ops.depth_updated)
3665
+ q->elevator->type->ops.depth_updated(hctx);
29053666 }
29063667
29073668 if (!ret)
....@@ -2988,20 +3749,19 @@
29883749 {
29893750 struct request_queue *q;
29903751 LIST_HEAD(head);
3752
+ int prev_nr_hw_queues;
29913753
29923754 lockdep_assert_held(&set->tag_list_lock);
29933755
2994
- if (nr_hw_queues > nr_cpu_ids)
3756
+ if (set->nr_maps == 1 && nr_hw_queues > nr_cpu_ids)
29953757 nr_hw_queues = nr_cpu_ids;
2996
- if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues)
3758
+ if (nr_hw_queues < 1)
3759
+ return;
3760
+ if (set->nr_maps == 1 && nr_hw_queues == set->nr_hw_queues)
29973761 return;
29983762
29993763 list_for_each_entry(q, &set->tag_list, tag_set_list)
30003764 blk_mq_freeze_queue(q);
3001
- /*
3002
- * Sync with blk_mq_queue_tag_busy_iter.
3003
- */
3004
- synchronize_rcu();
30053765 /*
30063766 * Switch IO scheduler to 'none', cleaning up the data associated
30073767 * with the previous scheduler. We will switch back once we are done
....@@ -3011,11 +3771,35 @@
30113771 if (!blk_mq_elv_switch_none(&head, q))
30123772 goto switch_back;
30133773
3774
+ list_for_each_entry(q, &set->tag_list, tag_set_list) {
3775
+ blk_mq_debugfs_unregister_hctxs(q);
3776
+ blk_mq_sysfs_unregister(q);
3777
+ }
3778
+
3779
+ prev_nr_hw_queues = set->nr_hw_queues;
3780
+ if (blk_mq_realloc_tag_set_tags(set, set->nr_hw_queues, nr_hw_queues) <
3781
+ 0)
3782
+ goto reregister;
3783
+
30143784 set->nr_hw_queues = nr_hw_queues;
3785
+fallback:
30153786 blk_mq_update_queue_map(set);
30163787 list_for_each_entry(q, &set->tag_list, tag_set_list) {
30173788 blk_mq_realloc_hw_ctxs(set, q);
3018
- blk_mq_queue_reinit(q);
3789
+ if (q->nr_hw_queues != set->nr_hw_queues) {
3790
+ pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n",
3791
+ nr_hw_queues, prev_nr_hw_queues);
3792
+ set->nr_hw_queues = prev_nr_hw_queues;
3793
+ blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
3794
+ goto fallback;
3795
+ }
3796
+ blk_mq_map_swqueue(q);
3797
+ }
3798
+
3799
+reregister:
3800
+ list_for_each_entry(q, &set->tag_list, tag_set_list) {
3801
+ blk_mq_sysfs_register(q);
3802
+ blk_mq_debugfs_register_hctxs(q);
30193803 }
30203804
30213805 switch_back:
....@@ -3069,7 +3853,6 @@
30693853 }
30703854
30713855 static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
3072
- struct blk_mq_hw_ctx *hctx,
30733856 struct request *rq)
30743857 {
30753858 unsigned long ret = 0;
....@@ -3102,7 +3885,6 @@
31023885 }
31033886
31043887 static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
3105
- struct blk_mq_hw_ctx *hctx,
31063888 struct request *rq)
31073889 {
31083890 struct hrtimer_sleeper hs;
....@@ -3114,18 +3896,15 @@
31143896 return false;
31153897
31163898 /*
3117
- * poll_nsec can be:
3899
+ * If we get here, hybrid polling is enabled. Hence poll_nsec can be:
31183900 *
3119
- * -1: don't ever hybrid sleep
31203901 * 0: use half of prev avg
31213902 * >0: use this specific value
31223903 */
3123
- if (q->poll_nsec == -1)
3124
- return false;
3125
- else if (q->poll_nsec > 0)
3904
+ if (q->poll_nsec > 0)
31263905 nsecs = q->poll_nsec;
31273906 else
3128
- nsecs = blk_mq_poll_nsecs(q, hctx, rq);
3907
+ nsecs = blk_mq_poll_nsecs(q, rq);
31293908
31303909 if (!nsecs)
31313910 return false;
....@@ -3139,15 +3918,14 @@
31393918 kt = nsecs;
31403919
31413920 mode = HRTIMER_MODE_REL;
3142
- hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, mode);
3921
+ hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, mode);
31433922 hrtimer_set_expires(&hs.timer, kt);
31443923
3145
- hrtimer_init_sleeper(&hs, current);
31463924 do {
31473925 if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE)
31483926 break;
31493927 set_current_state(TASK_UNINTERRUPTIBLE);
3150
- hrtimer_start_expires(&hs.timer, mode);
3928
+ hrtimer_sleeper_start_expires(&hs, mode);
31513929 if (hs.task)
31523930 io_schedule();
31533931 hrtimer_cancel(&hs.timer);
....@@ -3159,59 +3937,14 @@
31593937 return true;
31603938 }
31613939
3162
-static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq)
3940
+static bool blk_mq_poll_hybrid(struct request_queue *q,
3941
+ struct blk_mq_hw_ctx *hctx, blk_qc_t cookie)
31633942 {
3164
- struct request_queue *q = hctx->queue;
3165
- long state;
3166
-
3167
- /*
3168
- * If we sleep, have the caller restart the poll loop to reset
3169
- * the state. Like for the other success return cases, the
3170
- * caller is responsible for checking if the IO completed. If
3171
- * the IO isn't complete, we'll get called again and will go
3172
- * straight to the busy poll loop.
3173
- */
3174
- if (blk_mq_poll_hybrid_sleep(q, hctx, rq))
3175
- return true;
3176
-
3177
- hctx->poll_considered++;
3178
-
3179
- state = current->state;
3180
- while (!need_resched()) {
3181
- int ret;
3182
-
3183
- hctx->poll_invoked++;
3184
-
3185
- ret = q->mq_ops->poll(hctx, rq->tag);
3186
- if (ret > 0) {
3187
- hctx->poll_success++;
3188
- set_current_state(TASK_RUNNING);
3189
- return true;
3190
- }
3191
-
3192
- if (signal_pending_state(state, current))
3193
- set_current_state(TASK_RUNNING);
3194
-
3195
- if (current->state == TASK_RUNNING)
3196
- return true;
3197
- if (ret < 0)
3198
- break;
3199
- cpu_relax();
3200
- }
3201
-
3202
- __set_current_state(TASK_RUNNING);
3203
- return false;
3204
-}
3205
-
3206
-static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
3207
-{
3208
- struct blk_mq_hw_ctx *hctx;
32093943 struct request *rq;
32103944
3211
- if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
3945
+ if (q->poll_nsec == BLK_MQ_POLL_CLASSIC)
32123946 return false;
32133947
3214
- hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
32153948 if (!blk_qc_t_is_internal(cookie))
32163949 rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
32173950 else {
....@@ -3226,13 +3959,97 @@
32263959 return false;
32273960 }
32283961
3229
- return __blk_mq_poll(hctx, rq);
3962
+ return blk_mq_poll_hybrid_sleep(q, rq);
32303963 }
3964
+
3965
+/**
3966
+ * blk_poll - poll for IO completions
3967
+ * @q: the queue
3968
+ * @cookie: cookie passed back at IO submission time
3969
+ * @spin: whether to spin for completions
3970
+ *
3971
+ * Description:
3972
+ * Poll for completions on the passed in queue. Returns number of
3973
+ * completed entries found. If @spin is true, then blk_poll will continue
3974
+ * looping until at least one completion is found, unless the task is
3975
+ * otherwise marked running (or we need to reschedule).
3976
+ */
3977
+int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
3978
+{
3979
+ struct blk_mq_hw_ctx *hctx;
3980
+ long state;
3981
+
3982
+ if (!blk_qc_t_valid(cookie) ||
3983
+ !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
3984
+ return 0;
3985
+
3986
+ if (current->plug)
3987
+ blk_flush_plug_list(current->plug, false);
3988
+
3989
+ hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
3990
+
3991
+ /*
3992
+ * If we sleep, have the caller restart the poll loop to reset
3993
+ * the state. Like for the other success return cases, the
3994
+ * caller is responsible for checking if the IO completed. If
3995
+ * the IO isn't complete, we'll get called again and will go
3996
+ * straight to the busy poll loop.
3997
+ */
3998
+ if (blk_mq_poll_hybrid(q, hctx, cookie))
3999
+ return 1;
4000
+
4001
+ hctx->poll_considered++;
4002
+
4003
+ state = current->state;
4004
+ do {
4005
+ int ret;
4006
+
4007
+ hctx->poll_invoked++;
4008
+
4009
+ ret = q->mq_ops->poll(hctx);
4010
+ if (ret > 0) {
4011
+ hctx->poll_success++;
4012
+ __set_current_state(TASK_RUNNING);
4013
+ return ret;
4014
+ }
4015
+
4016
+ if (signal_pending_state(state, current))
4017
+ __set_current_state(TASK_RUNNING);
4018
+
4019
+ if (current->state == TASK_RUNNING)
4020
+ return 1;
4021
+ if (ret < 0 || !spin)
4022
+ break;
4023
+ cpu_relax();
4024
+ } while (!need_resched());
4025
+
4026
+ __set_current_state(TASK_RUNNING);
4027
+ return 0;
4028
+}
4029
+EXPORT_SYMBOL_GPL(blk_poll);
4030
+
4031
+unsigned int blk_mq_rq_cpu(struct request *rq)
4032
+{
4033
+ return rq->mq_ctx->cpu;
4034
+}
4035
+EXPORT_SYMBOL(blk_mq_rq_cpu);
32314036
32324037 static int __init blk_mq_init(void)
32334038 {
4039
+ int i;
4040
+
4041
+ for_each_possible_cpu(i)
4042
+ INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
4043
+ open_softirq(BLOCK_SOFTIRQ, blk_done_softirq);
4044
+
4045
+ cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD,
4046
+ "block/softirq:dead", NULL,
4047
+ blk_softirq_cpu_dead);
32344048 cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
32354049 blk_mq_hctx_notify_dead);
4050
+ cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online",
4051
+ blk_mq_hctx_notify_online,
4052
+ blk_mq_hctx_notify_offline);
32364053 return 0;
32374054 }
32384055 subsys_initcall(blk_mq_init);
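
For completeness, the classic (non-hybrid) path that blk_poll() now implements has this shape: keep driving the driver's ->poll() until it reports completions, the task is marked runnable, an error occurs, or a single pass finishes when @spin is false. The standalone model below captures only that control flow; driver_poll() and found_after are invented stand-ins for q->mq_ops->poll(), and the need_resched()/signal handling of the real loop is reduced to a comment.

/* Illustrative model only -- not kernel code. */
#include <stdbool.h>
#include <stdio.h>

/* Fake driver ->poll(): reports one completion on the third call. */
static int found_after = 3;

static int driver_poll(void)
{
	return --found_after <= 0 ? 1 : 0;
}

/*
 * Model of the blk_poll() loop: returns the number of completions found,
 * or 0 if the caller should retry (or gave up after one pass when spin
 * is false). A driver error also ends the loop.
 */
static int poll_for_completions(bool spin)
{
	do {
		int ret = driver_poll();

		if (ret > 0)
			return ret;	/* completions found */
		if (ret < 0 || !spin)
			break;		/* error, or single-pass mode */
		/* cpu_relax() in the kernel; nothing to do here. */
	} while (true);		/* the kernel also stops on need_resched() */

	return 0;
}

int main(void)
{
	printf("non-spinning poll: %d\n", poll_for_completions(false));
	printf("spinning poll:     %d\n", poll_for_completions(true));
	return 0;
}
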