hc
2024-02-19 1c055e55a242a33e574e48be530e06770a210dcd
kernel/block/bfq-iosched.c
....@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0-or-later
12 /*
23 * Budget Fair Queueing (BFQ) I/O scheduler.
34 *
....@@ -12,21 +13,11 @@
1213 *
1314 * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org>
1415 *
15
- * This program is free software; you can redistribute it and/or
16
- * modify it under the terms of the GNU General Public License as
17
- * published by the Free Software Foundation; either version 2 of the
18
- * License, or (at your option) any later version.
19
- *
20
- * This program is distributed in the hope that it will be useful,
21
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
22
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23
- * General Public License for more details.
24
- *
2516 * BFQ is a proportional-share I/O scheduler, with some extra
2617 * low-latency capabilities. BFQ also supports full hierarchical
2718 * scheduling through cgroups. Next paragraphs provide an introduction
2819 * on BFQ inner workings. Details on BFQ benefits, usage and
29
- * limitations can be found in Documentation/block/bfq-iosched.txt.
20
+ * limitations can be found in Documentation/block/bfq-iosched.rst.
3021 *
3122 * BFQ is a proportional-share storage-I/O scheduling algorithm based
3223 * on the slice-by-slice service scheme of CFQ. But BFQ assigns
....@@ -167,6 +158,7 @@
167158 BFQ_BFQQ_FNS(coop);
168159 BFQ_BFQQ_FNS(split_coop);
169160 BFQ_BFQQ_FNS(softrt_update);
161
+BFQ_BFQQ_FNS(has_waker);
170162 #undef BFQ_BFQQ_FNS \
171163
172164 /* Expiration time of sync (0) and async (1) requests, in ns. */
....@@ -190,7 +182,7 @@
190182 /*
191183 * When a sync request is dispatched, the queue that contains that
192184 * request, and all the ancestor entities of that queue, are charged
193
- * with the number of sectors of the request. In constrast, if the
185
+ * with the number of sectors of the request. In contrast, if the
194186 * request is async, then the queue and its ancestor entities are
195187 * charged with the number of sectors of the request, multiplied by
196188 * the factor below. This throttles the bandwidth for async I/O,
....@@ -218,7 +210,7 @@
218210 * queue merging.
219211 *
220212 * As can be deduced from the low time limit below, queue merging, if
221
- * successful, happens at the very beggining of the I/O of the involved
213
+ * successful, happens at the very beginning of the I/O of the involved
222214 * cooperating processes, as a consequence of the arrival of the very
223215 * first requests from each cooperator. After that, there is very
224216 * little chance to find cooperators.
....@@ -231,13 +223,26 @@
231223 #define BFQ_MIN_TT (2 * NSEC_PER_MSEC)
232224
233225 /* hw_tag detection: parallel requests threshold and min samples needed. */
234
-#define BFQ_HW_QUEUE_THRESHOLD 4
226
+#define BFQ_HW_QUEUE_THRESHOLD 3
235227 #define BFQ_HW_QUEUE_SAMPLES 32
236228
237229 #define BFQQ_SEEK_THR (sector_t)(8 * 100)
238230 #define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32)
231
+#define BFQ_RQ_SEEKY(bfqd, last_pos, rq) \
232
+ (get_sdist(last_pos, rq) > \
233
+ BFQQ_SEEK_THR && \
234
+ (!blk_queue_nonrot(bfqd->queue) || \
235
+ blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT))
239236 #define BFQQ_CLOSE_THR (sector_t)(8 * 1024)
240237 #define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 19)
238
+/*
239
+ * Sync random I/O is likely to be confused with soft real-time I/O,
240
+ * because it is characterized by limited throughput and apparently
241
+ * isochronous arrival pattern. To avoid false positives, queues
242
+ * containing only random (seeky) I/O are prevented from being tagged
243
+ * as soft real-time.
244
+ */
245
+#define BFQQ_TOTALLY_SEEKY(bfqq) (bfqq->seek_history == -1)
241246
242247 /* Min number of samples required to perform peak-rate update */
243248 #define BFQ_RATE_MIN_SAMPLES 32
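For illustration only (not part of the patch): the seek-detection macros above keep a 32-bit per-queue history in which each new request shifts in one "seeky" bit; BFQQ_SEEKY() then asks whether more than 19 of the last 32 requests were seeky, and BFQQ_TOTALLY_SEEKY() asks whether all of them were (seek_history == -1, all bits set), which is what blocks the soft real-time tagging described in the new comment. A minimal userspace model of that bookkeeping, with made-up names and __builtin_popcount() standing in for the kernel's hweight32():

#include <stdbool.h>
#include <stdint.h>

struct seek_state {
	uint32_t seek_history;		/* one bit per recently queued request */
};

/* Shift in one observation: 1 if the request looked seeky, 0 otherwise. */
static void record_request(struct seek_state *s, bool seeky)
{
	s->seek_history = (s->seek_history << 1) | (seeky ? 1u : 0u);
}

/* Mirrors BFQQ_SEEKY(): more than 19 of the last 32 requests were seeky. */
static bool queue_is_seeky(const struct seek_state *s)
{
	return __builtin_popcount(s->seek_history) > 19;
}

/* Mirrors BFQQ_TOTALLY_SEEKY(): every tracked request was seeky, so the
 * queue must not be tagged as soft real-time. */
static bool queue_is_totally_seeky(const struct seek_state *s)
{
	return s->seek_history == UINT32_MAX;	/* same test as "== -1" */
}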
....@@ -368,6 +373,12 @@
368373
369374 void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync)
370375 {
376
+ struct bfq_queue *old_bfqq = bic->bfqq[is_sync];
377
+
378
+ /* Clear bic pointer if bfqq is detached from this bic */
379
+ if (old_bfqq && old_bfqq->bic == bic)
380
+ old_bfqq->bic = NULL;
381
+
371382 bic->bfqq[is_sync] = bfqq;
372383 }
373384
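For illustration only (not part of the patch): the hunk above clears the detached queue's back-pointer before bic->bfqq is re-targeted, so the old queue can never be reached through a stale bic reference. A standalone sketch of that two-way-pointer discipline, with simplified types that merely stand in for struct bfq_io_cq and struct bfq_queue:

#include <stddef.h>

struct io_ctx;				/* stands in for struct bfq_io_cq */

struct queue {
	struct io_ctx *owner;		/* back-pointer, like bfqq->bic */
};

struct io_ctx {
	struct queue *q[2];		/* like bic->bfqq[is_sync] */
};

static void ctx_set_queue(struct io_ctx *ctx, struct queue *newq, int is_sync)
{
	struct queue *oldq = ctx->q[is_sync];

	/* Clear the back-pointer only if it really points at this context,
	 * mirroring "if (old_bfqq && old_bfqq->bic == bic)". */
	if (oldq && oldq->owner == ctx)
		oldq->owner = NULL;

	ctx->q[is_sync] = newq;
}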
....@@ -400,9 +411,9 @@
400411 unsigned long flags;
401412 struct bfq_io_cq *icq;
402413
403
- spin_lock_irqsave(q->queue_lock, flags);
414
+ spin_lock_irqsave(&q->queue_lock, flags);
404415 icq = icq_to_bic(ioc_lookup_icq(ioc, q));
405
- spin_unlock_irqrestore(q->queue_lock, flags);
416
+ spin_unlock_irqrestore(&q->queue_lock, flags);
406417
407418 return icq;
408419 }
....@@ -416,6 +427,8 @@
416427 */
417428 void bfq_schedule_dispatch(struct bfq_data *bfqd)
418429 {
430
+ lockdep_assert_held(&bfqd->lock);
431
+
419432 if (bfqd->queued != 0) {
420433 bfq_log(bfqd, "schedule dispatch");
421434 blk_mq_run_hw_queues(bfqd->queue, true);
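For illustration only (not part of the patch): the two hunks above take the queue lock through its address (&q->queue_lock) and add a lockdep_assert_held() so that, with CONFIG_PROVE_LOCKING, any caller entering bfq_schedule_dispatch() without bfqd->lock gets a splat. A hedged kernel-style fragment of the same pattern, with hypothetical foo_* names (it only makes sense inside the kernel tree):

struct foo_data {
	spinlock_t lock;
	int queued;
	struct work_struct work;
};

static void foo_kick(struct foo_data *fd)
{
	lockdep_assert_held(&fd->lock);		/* document and enforce the contract */

	if (fd->queued)
		schedule_work(&fd->work);
}

static void foo_kick_any_context(struct foo_data *fd)
{
	unsigned long flags;

	spin_lock_irqsave(&fd->lock, flags);	/* note the &, as with &q->queue_lock */
	foo_kick(fd);
	spin_unlock_irqrestore(&fd->lock, flags);
}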
....@@ -423,13 +436,12 @@
423436 }
424437
425438 #define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
426
-#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT)
427439
428440 #define bfq_sample_valid(samples) ((samples) > 80)
429441
430442 /*
431443 * Lifted from AS - choose which of rq1 and rq2 that is best served now.
432
- * We choose the request that is closesr to the head right now. Distance
444
+ * We choose the request that is closer to the head right now. Distance
433445 * behind the head is penalized and only allowed to a certain extent.
434446 */
435447 static struct request *bfq_choose_req(struct bfq_data *bfqd,
....@@ -591,7 +603,16 @@
591603 bfq_merge_time_limit);
592604 }
593605
594
-void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
606
+/*
607
+ * The following function is not marked as __cold because it is
608
+ * actually cold, but for the same performance goal described in the
609
+ * comments on the likely() at the beginning of
610
+ * bfq_setup_cooperator(). Unexpectedly, to reach an even lower
611
+ * execution time for the case where this function is not invoked, we
612
+ * had to add an unlikely() in each involved if().
613
+ */
614
+void __cold
615
+bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
595616 {
596617 struct rb_node **p, *parent;
597618 struct bfq_queue *__bfqq;
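For illustration only (not part of the patch): the new comment above pairs the __cold attribute on bfq_pos_tree_add_move() with unlikely() at its call sites, so the common "nothing to do" path stays short and hot in the instruction cache. A small standalone illustration of how the two hints cooperate (GCC/Clang builtins; the names are invented):

#define unlikely(x)	__builtin_expect(!!(x), 0)

/* __attribute__((cold)) lets the compiler move this body out of the hot
 * text section; unlikely() below biases branch layout at the call site. */
static void __attribute__((cold)) slow_reposition(int *tree_state)
{
	*tree_state += 1;		/* rarely executed bookkeeping */
}

void on_request(int needs_pos_tree, int *tree_state)
{
	/* Fast path: e.g. flash with internal queueing skips the tree. */
	if (unlikely(needs_pos_tree))
		slow_reposition(tree_state);
}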
....@@ -600,6 +621,10 @@
600621 rb_erase(&bfqq->pos_node, bfqq->pos_root);
601622 bfqq->pos_root = NULL;
602623 }
624
+
625
+ /* oom_bfqq does not participate in queue merging */
626
+ if (bfqq == &bfqd->oom_bfqq)
627
+ return;
603628
604629 /*
605630 * bfqq cannot be merged any longer (see comments in
....@@ -625,52 +650,68 @@
625650 }
626651
627652 /*
628
- * Tell whether there are active queues with different weights or
629
- * active groups.
630
- */
631
-static bool bfq_varied_queue_weights_or_active_groups(struct bfq_data *bfqd)
632
-{
633
- /*
634
- * For queue weights to differ, queue_weights_tree must contain
635
- * at least two nodes.
636
- */
637
- return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) &&
638
- (bfqd->queue_weights_tree.rb_node->rb_left ||
639
- bfqd->queue_weights_tree.rb_node->rb_right)
640
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
641
- ) ||
642
- (bfqd->num_groups_with_pending_reqs > 0
643
-#endif
644
- );
645
-}
646
-
647
-/*
648
- * The following function returns true if every queue must receive the
649
- * same share of the throughput (this condition is used when deciding
650
- * whether idling may be disabled, see the comments in the function
651
- * bfq_better_to_idle()).
653
+ * The following function returns false either if every active queue
654
+ * must receive the same share of the throughput (symmetric scenario),
655
+ * or, as a special case, if bfqq must receive a share of the
656
+ * throughput lower than or equal to the share that every other active
657
+ * queue must receive. If bfqq does sync I/O, then these are the only
658
+ * two cases where bfqq happens to be guaranteed its share of the
659
+ * throughput even if I/O dispatching is not plugged when bfqq remains
660
+ * temporarily empty (for more details, see the comments in the
661
+ * function bfq_better_to_idle()). For this reason, the return value
662
+ * of this function is used to check whether I/O-dispatch plugging can
663
+ * be avoided.
652664 *
653
- * Such a scenario occurs when:
665
+ * The above first case (symmetric scenario) occurs when:
654666 * 1) all active queues have the same weight,
655
- * 2) all active groups at the same level in the groups tree have the same
656
- * weight,
667
+ * 2) all active queues belong to the same I/O-priority class,
657668 * 3) all active groups at the same level in the groups tree have the same
669
+ * weight,
670
+ * 4) all active groups at the same level in the groups tree have the same
658671 * number of children.
659672 *
660673 * Unfortunately, keeping the necessary state for evaluating exactly
661674 * the last two symmetry sub-conditions above would be quite complex
662
- * and time consuming. Therefore this function evaluates, instead,
663
- * only the following stronger two sub-conditions, for which it is
675
+ * and time consuming. Therefore this function evaluates, instead,
676
+ * only the following stronger three sub-conditions, for which it is
664677 * much easier to maintain the needed state:
665678 * 1) all active queues have the same weight,
666
- * 2) there are no active groups.
679
+ * 2) all active queues belong to the same I/O-priority class,
680
+ * 3) there are no active groups.
667681 * In particular, the last condition is always true if hierarchical
668682 * support or the cgroups interface are not enabled, thus no state
669683 * needs to be maintained in this case.
670684 */
671
-static bool bfq_symmetric_scenario(struct bfq_data *bfqd)
685
+static bool bfq_asymmetric_scenario(struct bfq_data *bfqd,
686
+ struct bfq_queue *bfqq)
672687 {
673
- return !bfq_varied_queue_weights_or_active_groups(bfqd);
688
+ bool smallest_weight = bfqq &&
689
+ bfqq->weight_counter &&
690
+ bfqq->weight_counter ==
691
+ container_of(
692
+ rb_first_cached(&bfqd->queue_weights_tree),
693
+ struct bfq_weight_counter,
694
+ weights_node);
695
+
696
+ /*
697
+ * For queue weights to differ, queue_weights_tree must contain
698
+ * at least two nodes.
699
+ */
700
+ bool varied_queue_weights = !smallest_weight &&
701
+ !RB_EMPTY_ROOT(&bfqd->queue_weights_tree.rb_root) &&
702
+ (bfqd->queue_weights_tree.rb_root.rb_node->rb_left ||
703
+ bfqd->queue_weights_tree.rb_root.rb_node->rb_right);
704
+
705
+ bool multiple_classes_busy =
706
+ (bfqd->busy_queues[0] && bfqd->busy_queues[1]) ||
707
+ (bfqd->busy_queues[0] && bfqd->busy_queues[2]) ||
708
+ (bfqd->busy_queues[1] && bfqd->busy_queues[2]);
709
+
710
+ return varied_queue_weights || multiple_classes_busy
711
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
712
+ || bfqd->num_groups_with_pending_reqs > 0
713
+#endif
714
+ ;
674715 }
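For illustration only (not part of the patch): in the rewritten function above, busy_queues[0..2] appears to count busy queues per I/O-priority class, so the three pairwise && tests simply mean "at least two classes are busy at once", and the smallest_weight shortcut skips the weight-tree test when bfqq already holds the leftmost (smallest) weight counter. A standalone model of that decision, under those assumptions:

#include <stdbool.h>

struct sched_model {
	unsigned int busy_queues[3];	/* per-class busy-queue counts (assumed) */
	unsigned int distinct_weights;	/* how many different queue weights exist */
	unsigned int groups_with_reqs;	/* active groups with pending requests */
};

static bool multiple_classes_busy(const struct sched_model *m)
{
	int classes = 0;

	for (int i = 0; i < 3; i++)
		classes += m->busy_queues[i] > 0;
	return classes > 1;		/* equivalent to the three pairwise tests */
}

/* True when per-queue service guarantees may require I/O-dispatch plugging. */
static bool asymmetric_scenario(const struct sched_model *m,
				bool bfqq_has_smallest_weight)
{
	bool varied_weights = !bfqq_has_smallest_weight && m->distinct_weights > 1;

	return varied_weights || multiple_classes_busy(m) ||
	       m->groups_with_reqs > 0;
}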
675716
676717 /*
....@@ -687,10 +728,11 @@
687728 * should be low too.
688729 */
689730 void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,
690
- struct rb_root *root)
731
+ struct rb_root_cached *root)
691732 {
692733 struct bfq_entity *entity = &bfqq->entity;
693
- struct rb_node **new = &(root->rb_node), *parent = NULL;
734
+ struct rb_node **new = &(root->rb_root.rb_node), *parent = NULL;
735
+ bool leftmost = true;
694736
695737 /*
696738 * Do not insert if the queue is already associated with a
....@@ -719,8 +761,10 @@
719761 }
720762 if (entity->weight < __counter->weight)
721763 new = &((*new)->rb_left);
722
- else
764
+ else {
723765 new = &((*new)->rb_right);
766
+ leftmost = false;
767
+ }
724768 }
725769
726770 bfqq->weight_counter = kzalloc(sizeof(struct bfq_weight_counter),
....@@ -729,22 +773,22 @@
729773 /*
730774 * In the unlucky event of an allocation failure, we just
731775 * exit. This will cause the weight of queue to not be
732
- * considered in bfq_varied_queue_weights_or_active_groups,
733
- * which, in its turn, causes the scenario to be deemed
734
- * wrongly symmetric in case bfqq's weight would have been
735
- * the only weight making the scenario asymmetric. On the
736
- * bright side, no unbalance will however occur when bfqq
737
- * becomes inactive again (the invocation of this function
738
- * is triggered by an activation of queue). In fact,
739
- * bfq_weights_tree_remove does nothing if
740
- * !bfqq->weight_counter.
776
+ * considered in bfq_asymmetric_scenario, which, in its turn,
777
+ * causes the scenario to be deemed wrongly symmetric in case
778
+ * bfqq's weight would have been the only weight making the
779
+ * scenario asymmetric. On the bright side, no unbalance will
780
+ * however occur when bfqq becomes inactive again (the
781
+ * invocation of this function is triggered by an activation
782
+ * of queue). In fact, bfq_weights_tree_remove does nothing
783
+ * if !bfqq->weight_counter.
741784 */
742785 if (unlikely(!bfqq->weight_counter))
743786 return;
744787
745788 bfqq->weight_counter->weight = entity->weight;
746789 rb_link_node(&bfqq->weight_counter->weights_node, parent, new);
747
- rb_insert_color(&bfqq->weight_counter->weights_node, root);
790
+ rb_insert_color_cached(&bfqq->weight_counter->weights_node, root,
791
+ leftmost);
748792
749793 inc_counter:
750794 bfqq->weight_counter->num_active++;
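For illustration only (not part of the patch): switching the weights tree to rb_root_cached lets the scheduler read the smallest weight in O(1) through rb_first_cached(), at the cost of telling the insertion path whether the new node ended up leftmost. A hedged kernel-style sketch of that insert/lookup pattern (the weight_node type is invented; the rbtree calls are the regular <linux/rbtree.h> API):

struct weight_node {
	struct rb_node rb;
	unsigned int weight;
};

static void weight_tree_insert(struct rb_root_cached *root, struct weight_node *wn)
{
	struct rb_node **link = &root->rb_root.rb_node, *parent = NULL;
	bool leftmost = true;

	while (*link) {
		struct weight_node *cur = rb_entry(*link, struct weight_node, rb);

		parent = *link;
		if (wn->weight < cur->weight) {
			link = &(*link)->rb_left;
		} else {
			link = &(*link)->rb_right;
			leftmost = false;	/* not the smallest key any more */
		}
	}

	rb_link_node(&wn->rb, parent, link);
	rb_insert_color_cached(&wn->rb, root, leftmost);
}

static struct weight_node *weight_tree_min(struct rb_root_cached *root)
{
	struct rb_node *first = rb_first_cached(root);	/* cached leftmost node */

	return first ? rb_entry(first, struct weight_node, rb) : NULL;
}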
....@@ -759,7 +803,7 @@
759803 */
760804 void __bfq_weights_tree_remove(struct bfq_data *bfqd,
761805 struct bfq_queue *bfqq,
762
- struct rb_root *root)
806
+ struct rb_root_cached *root)
763807 {
764808 if (!bfqq->weight_counter)
765809 return;
....@@ -768,7 +812,7 @@
768812 if (bfqq->weight_counter->num_active > 0)
769813 goto reset_entity_pointer;
770814
771
- rb_erase(&bfqq->weight_counter->weights_node, root);
815
+ rb_erase_cached(&bfqq->weight_counter->weights_node, root);
772816 kfree(bfqq->weight_counter);
773817
774818 reset_entity_pointer:
....@@ -882,7 +926,8 @@
882926 static unsigned long bfq_serv_to_charge(struct request *rq,
883927 struct bfq_queue *bfqq)
884928 {
885
- if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1)
929
+ if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1 ||
930
+ bfq_asymmetric_scenario(bfqq->bfqd, bfqq))
886931 return blk_rq_sectors(rq);
887932
888933 return blk_rq_sectors(rq) * bfq_async_charge_factor;
....@@ -916,8 +961,10 @@
916961 */
917962 return;
918963
919
- new_budget = max_t(unsigned long, bfqq->max_budget,
920
- bfq_serv_to_charge(next_rq, bfqq));
964
+ new_budget = max_t(unsigned long,
965
+ max_t(unsigned long, bfqq->max_budget,
966
+ bfq_serv_to_charge(next_rq, bfqq)),
967
+ entity->service);
921968 if (entity->budget != new_budget) {
922969 entity->budget = new_budget;
923970 bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",
....@@ -946,7 +993,7 @@
946993 * of several files
947994 * mplayer took 23 seconds to start, if constantly weight-raised.
948995 *
949
- * As for higher values than that accomodating the above bad
996
+ * As for higher values than that accommodating the above bad
950997 * scenario, tests show that higher values would often yield
951998 * the opposite of the desired result, i.e., would worsen
952999 * responsiveness by allowing non-interactive applications to
....@@ -985,6 +1032,7 @@
9851032 else
9861033 bfq_clear_bfqq_IO_bound(bfqq);
9871034
1035
+ bfqq->entity.new_weight = bic->saved_weight;
9881036 bfqq->ttime = bic->saved_ttime;
9891037 bfqq->wr_coeff = bic->saved_wr_coeff;
9901038 bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt;
....@@ -1020,7 +1068,7 @@
10201068
10211069 static int bfqq_process_refs(struct bfq_queue *bfqq)
10221070 {
1023
- return bfqq->ref - bfqq->allocated - bfqq->entity.on_st -
1071
+ return bfqq->ref - bfqq->allocated - bfqq->entity.on_st_or_in_serv -
10241072 (bfqq->weight_counter != NULL);
10251073 }
10261074
....@@ -1032,8 +1080,18 @@
10321080
10331081 hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node)
10341082 hlist_del_init(&item->burst_list_node);
1035
- hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
1036
- bfqd->burst_size = 1;
1083
+
1084
+ /*
1085
+ * Start the creation of a new burst list only if there is no
1086
+ * active queue. See comments on the conditional invocation of
1087
+ * bfq_handle_burst().
1088
+ */
1089
+ if (bfq_tot_busy_queues(bfqd) == 0) {
1090
+ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
1091
+ bfqd->burst_size = 1;
1092
+ } else
1093
+ bfqd->burst_size = 0;
1094
+
10371095 bfqd->burst_parent_entity = bfqq->entity.parent;
10381096 }
10391097
....@@ -1089,7 +1147,8 @@
10891147 * many parallel threads/processes. Examples are systemd during boot,
10901148 * or git grep. To help these processes get their job done as soon as
10911149 * possible, it is usually better to not grant either weight-raising
1092
- * or device idling to their queues.
1150
+ * or device idling to their queues, unless these queues must be
1151
+ * protected from the I/O flowing through other active queues.
10931152 *
10941153 * In this comment we describe, firstly, the reasons why this fact
10951154 * holds, and, secondly, the next function, which implements the main
....@@ -1101,7 +1160,10 @@
11011160 * cumulatively served, the sooner the target job of these queues gets
11021161 * completed. As a consequence, weight-raising any of these queues,
11031162 * which also implies idling the device for it, is almost always
1104
- * counterproductive. In most cases it just lowers throughput.
1163
+ * counterproductive, unless there are other active queues to isolate
1164
+ * these new queues from. If there are no other active queues, then
+ * these new queues from. If there are no other active queues, then
1165
+ * weight-raising these new queues just lowers throughput in most
1166
+ * cases.
11051167 *
11061168 * On the other hand, a burst of queue creations may be caused also by
11071169 * the start of an application that does not consist of a lot of
....@@ -1135,14 +1197,16 @@
11351197 * are very rare. They typically occur if some service happens to
11361198 * start doing I/O exactly when the interactive task starts.
11371199 *
1138
- * Turning back to the next function, it implements all the steps
1139
- * needed to detect the occurrence of a large burst and to properly
1140
- * mark all the queues belonging to it (so that they can then be
1141
- * treated in a different way). This goal is achieved by maintaining a
1142
- * "burst list" that holds, temporarily, the queues that belong to the
1143
- * burst in progress. The list is then used to mark these queues as
1144
- * belonging to a large burst if the burst does become large. The main
1145
- * steps are the following.
1200
+ * Turning back to the next function, it is invoked only if there are
1201
+ * no active queues (apart from active queues that would belong to the
1202
+ * same, possible burst bfqq would belong to), and it implements all
1203
+ * the steps needed to detect the occurrence of a large burst and to
1204
+ * properly mark all the queues belonging to it (so that they can then
1205
+ * be treated in a different way). This goal is achieved by
1206
+ * maintaining a "burst list" that holds, temporarily, the queues that
1207
+ * belong to the burst in progress. The list is then used to mark
1208
+ * these queues as belonging to a large burst if the burst does become
1209
+ * large. The main steps are the following.
11461210 *
11471211 * . when the very first queue is created, the queue is inserted into the
11481212 * list (as it could be the first queue in a possible burst)
....@@ -1376,21 +1440,31 @@
13761440 * mechanism may be re-designed in such a way to make it possible to
13771441 * know whether preemption is needed without needing to update service
13781442 * trees). In addition, queue preemptions almost always cause random
1379
- * I/O, and thus loss of throughput. Because of these facts, the next
1380
- * function adopts the following simple scheme to avoid both costly
1381
- * operations and too frequent preemptions: it requests the expiration
1382
- * of the in-service queue (unconditionally) only for queues that need
1383
- * to recover a hole, or that either are weight-raised or deserve to
1384
- * be weight-raised.
1443
+ * I/O, which may in turn cause loss of throughput. Finally, there may
1444
+ * even be no in-service queue when the next function is invoked (so,
1445
+ * no queue to compare timestamps with). Because of these facts, the
1446
+ * next function adopts the following simple scheme to avoid costly
1447
+ * operations, too frequent preemptions and too many dependencies on
1448
+ * the state of the scheduler: it requests the expiration of the
1449
+ * in-service queue (unconditionally) only for queues that need to
1450
+ * recover a hole. Then it delegates to other parts of the code the
1451
+ * responsibility of handling the above case 2.
13851452 */
13861453 static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
13871454 struct bfq_queue *bfqq,
1388
- bool arrived_in_time,
1389
- bool wr_or_deserves_wr)
1455
+ bool arrived_in_time)
13901456 {
13911457 struct bfq_entity *entity = &bfqq->entity;
13921458
1393
- if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) {
1459
+ /*
1460
+ * In the next compound condition, we check also whether there
1461
+ * is some budget left, because otherwise there is no point in
1462
+ * trying to go on serving bfqq with this same budget: bfqq
1463
+ * would be expired immediately after being selected for
1464
+ * service. This would only cause useless overhead.
1465
+ */
1466
+ if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time &&
1467
+ bfq_bfqq_budget_left(bfqq) > 0) {
13941468 /*
13951469 * We do not clear the flag non_blocking_wait_rq here, as
13961470 * the latter is used in bfq_activate_bfqq to signal
....@@ -1433,7 +1507,7 @@
14331507 entity->budget = max_t(unsigned long, bfqq->max_budget,
14341508 bfq_serv_to_charge(bfqq->next_rq, bfqq));
14351509 bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
1436
- return wr_or_deserves_wr;
1510
+ return false;
14371511 }
14381512
14391513 /*
....@@ -1551,6 +1625,36 @@
15511625 bfqd->bfq_wr_min_idle_time);
15521626 }
15531627
1628
+
1629
+/*
1630
+ * Return true if bfqq is in a higher priority class, or has a higher
1631
+ * weight than the in-service queue.
1632
+ */
1633
+static bool bfq_bfqq_higher_class_or_weight(struct bfq_queue *bfqq,
1634
+ struct bfq_queue *in_serv_bfqq)
1635
+{
1636
+ int bfqq_weight, in_serv_weight;
1637
+
1638
+ if (bfqq->ioprio_class < in_serv_bfqq->ioprio_class)
1639
+ return true;
1640
+
1641
+ if (in_serv_bfqq->entity.parent == bfqq->entity.parent) {
1642
+ bfqq_weight = bfqq->entity.weight;
1643
+ in_serv_weight = in_serv_bfqq->entity.weight;
1644
+ } else {
1645
+ if (bfqq->entity.parent)
1646
+ bfqq_weight = bfqq->entity.parent->weight;
1647
+ else
1648
+ bfqq_weight = bfqq->entity.weight;
1649
+ if (in_serv_bfqq->entity.parent)
1650
+ in_serv_weight = in_serv_bfqq->entity.parent->weight;
1651
+ else
1652
+ in_serv_weight = in_serv_bfqq->entity.weight;
1653
+ }
1654
+
1655
+ return bfqq_weight > in_serv_weight;
1656
+}
1657
+
15541658 static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
15551659 struct bfq_queue *bfqq,
15561660 int old_wr_coeff,
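For illustration only (not part of the patch): bfq_bfqq_higher_class_or_weight() above orders queues first by I/O-priority class (a lower ioprio_class value is a higher class) and then by the effective weight, where queues sitting in different groups are compared through their parent groups' weights. A standalone model of that comparison, with simplified types:

#include <stdbool.h>

struct grp { unsigned int weight; };

struct q {
	unsigned int ioprio_class;	/* lower value = higher class (RT < BE < IDLE) */
	unsigned int weight;
	struct grp *parent;		/* NULL when directly under the root group */
};

static unsigned int effective_weight(const struct q *a, const struct q *b)
{
	/* Same parent: queue weights compete directly.
	 * Different parents: what competes is the parent group's weight. */
	if (a->parent == b->parent || !a->parent)
		return a->weight;
	return a->parent->weight;
}

static bool higher_class_or_weight(const struct q *q, const struct q *in_serv)
{
	if (q->ioprio_class < in_serv->ioprio_class)
		return true;
	return effective_weight(q, in_serv) > effective_weight(in_serv, q);
}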
....@@ -1579,6 +1683,7 @@
15791683 */
15801684 in_burst = bfq_bfqq_in_large_burst(bfqq);
15811685 soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&
1686
+ !BFQQ_TOTALLY_SEEKY(bfqq) &&
15821687 !in_burst &&
15831688 time_is_before_jiffies(bfqq->soft_rt_next_start) &&
15841689 bfqq->dispatched == 0;
....@@ -1594,8 +1699,7 @@
15941699 */
15951700 bfqq_wants_to_preempt =
15961701 bfq_bfqq_update_budg_for_activation(bfqd, bfqq,
1597
- arrived_in_time,
1598
- wr_or_deserves_wr);
1702
+ arrived_in_time);
15991703
16001704 /*
16011705 * If bfqq happened to be activated in a burst, but has been
....@@ -1660,19 +1764,109 @@
16601764
16611765 /*
16621766 * Expire in-service queue only if preemption may be needed
1663
- * for guarantees. In this respect, the function
1664
- * next_queue_may_preempt just checks a simple, necessary
1665
- * condition, and not a sufficient condition based on
1666
- * timestamps. In fact, for the latter condition to be
1667
- * evaluated, timestamps would need first to be updated, and
1668
- * this operation is quite costly (see the comments on the
1669
- * function bfq_bfqq_update_budg_for_activation).
1767
+ * for guarantees. In particular, we care only about two
1768
+ * cases. The first is that bfqq has to recover a service
1769
+ * hole, as explained in the comments on
1770
+ * bfq_bfqq_update_budg_for_activation(), i.e., that
1771
+ * bfqq_wants_to_preempt is true. However, if bfqq does not
1772
+ * carry time-critical I/O, then bfqq's bandwidth is less
1773
+ * important than that of queues that carry time-critical I/O.
1774
+ * So, as a further constraint, we consider this case only if
1775
+ * bfqq is at least as weight-raised, i.e., at least as time
1776
+ * critical, as the in-service queue.
1777
+ *
1778
+ * The second case is that bfqq is in a higher priority class,
1779
+ * or has a higher weight than the in-service queue. If this
1780
+ * condition does not hold, we don't care because, even if
1781
+ * bfqq does not start to be served immediately, the resulting
1782
+ * delay for bfqq's I/O is however lower or much lower than
1783
+ * the ideal completion time to be guaranteed to bfqq's I/O.
1784
+ *
1785
+ * In both cases, preemption is needed only if, according to
1786
+ * the timestamps of both bfqq and of the in-service queue,
1787
+ * bfqq actually is the next queue to serve. So, to reduce
1788
+ * useless preemptions, the return value of
1789
+ * next_queue_may_preempt() is considered in the next compound
1790
+ * condition too. Yet next_queue_may_preempt() just checks a
1791
+ * simple, necessary condition for bfqq to be the next queue
1792
+ * to serve. In fact, to evaluate a sufficient condition, the
1793
+ * timestamps of the in-service queue would need to be
1794
+ * updated, and this operation is quite costly (see the
1795
+ * comments on bfq_bfqq_update_budg_for_activation()).
16701796 */
1671
- if (bfqd->in_service_queue && bfqq_wants_to_preempt &&
1672
- bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff &&
1797
+ if (bfqd->in_service_queue &&
1798
+ ((bfqq_wants_to_preempt &&
1799
+ bfqq->wr_coeff >= bfqd->in_service_queue->wr_coeff) ||
1800
+ bfq_bfqq_higher_class_or_weight(bfqq, bfqd->in_service_queue)) &&
16731801 next_queue_may_preempt(bfqd))
16741802 bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
16751803 false, BFQQE_PREEMPTED);
1804
+}
1805
+
1806
+static void bfq_reset_inject_limit(struct bfq_data *bfqd,
1807
+ struct bfq_queue *bfqq)
1808
+{
1809
+ /* invalidate baseline total service time */
1810
+ bfqq->last_serv_time_ns = 0;
1811
+
1812
+ /*
1813
+ * Reset pointer in case we are waiting for
1814
+ * some request completion.
1815
+ */
1816
+ bfqd->waited_rq = NULL;
1817
+
1818
+ /*
1819
+ * If bfqq has a short think time, then start by setting the
1820
+ * inject limit to 0 prudentially, because the service time of
1821
+ * an injected I/O request may be higher than the think time
1822
+ * of bfqq, and therefore, if one request was injected when
1823
+ * bfqq remains empty, this injected request might delay the
1824
+ * service of the next I/O request for bfqq significantly. In
1825
+ * case bfqq can actually tolerate some injection, then the
1826
+ * adaptive update will however raise the limit soon. This
1827
+ * lucky circumstance holds exactly because bfqq has a short
1828
+ * think time, and thus, after remaining empty, is likely to
1829
+ * get new I/O enqueued---and then completed---before being
1830
+ * expired. This is the very pattern that gives the
1831
+ * limit-update algorithm the chance to measure the effect of
1832
+ * injection on request service times, and then to update the
1833
+ * limit accordingly.
1834
+ *
1835
+ * However, in the following special case, the inject limit is
1836
+ * left to 1 even if the think time is short: bfqq's I/O is
1837
+ * synchronized with that of some other queue, i.e., bfqq may
1838
+ * receive new I/O only after the I/O of the other queue is
1839
+ * completed. Keeping the inject limit to 1 allows the
1840
+ * blocking I/O to be served while bfqq is in service. And
1841
+ * this is very convenient both for bfqq and for overall
1842
+ * throughput, as explained in detail in the comments in
1843
+ * bfq_update_has_short_ttime().
1844
+ *
1845
+ * On the opposite end, if bfqq has a long think time, then
1846
+ * start directly by 1, because:
1847
+ * a) on the bright side, keeping at most one request in
1848
+ * service in the drive is unlikely to cause any harm to the
1849
+ * latency of bfqq's requests, as the service time of a single
1850
+ * request is likely to be lower than the think time of bfqq;
1851
+ * b) on the downside, after becoming empty, bfqq is likely to
1852
+ * expire before getting its next request. With this request
1853
+ * arrival pattern, it is very hard to sample total service
1854
+ * times and update the inject limit accordingly (see comments
1855
+ * on bfq_update_inject_limit()). So the limit is likely to be
1856
+ * never, or at least seldom, updated. As a consequence, by
1857
+ * setting the limit to 1, we avoid that no injection ever
1858
+ * occurs with bfqq. On the downside, this proactive step
1859
+ * further reduces chances to actually compute the baseline
1860
+ * total service time. Thus it reduces chances to execute the
1861
+ * limit-update algorithm and possibly raise the limit to more
1862
+ * than 1.
1863
+ */
1864
+ if (bfq_bfqq_has_short_ttime(bfqq))
1865
+ bfqq->inject_limit = 0;
1866
+ else
1867
+ bfqq->inject_limit = 1;
1868
+
1869
+ bfqq->decrease_time_jif = jiffies;
16761870 }
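For illustration only (not part of the patch): bfq_reset_inject_limit() above invalidates the baseline service-time sample and restarts the injection limit at 0 for queues with a short think time (prudent; the adaptive update will raise it) and at 1 otherwise (so some injection can happen at all), stamping decrease_time_jif so the periodic reset in bfq_add_request() fires at most about once per second. A standalone model of that decision, with illustrative names:

#include <stdbool.h>
#include <stdint.h>

struct inject_state {
	unsigned int inject_limit;	/* max foreign requests injected while empty */
	uint64_t last_serv_time_ns;	/* baseline total service time, 0 = invalid */
	uint64_t decrease_time_jif;	/* time of the last limit update */
};

static void reset_inject_limit(struct inject_state *st, bool short_think_time,
			       uint64_t now_jiffies)
{
	st->last_serv_time_ns = 0;	/* force a fresh baseline measurement */

	/* Short think time: start from 0 and let measurements raise the limit.
	 * Long think time: start from 1 so injection is not ruled out forever. */
	st->inject_limit = short_think_time ? 0 : 1;
	st->decrease_time_jif = now_jiffies;
}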
16771871
16781872 static void bfq_add_request(struct request *rq)
....@@ -1687,6 +1881,180 @@
16871881 bfqq->queued[rq_is_sync(rq)]++;
16881882 bfqd->queued++;
16891883
1884
+ if (RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_sync(bfqq)) {
1885
+ /*
1886
+ * Detect whether bfqq's I/O seems synchronized with
1887
+ * that of some other queue, i.e., whether bfqq, after
1888
+ * remaining empty, happens to receive new I/O only
1889
+ * right after some I/O request of the other queue has
1890
+ * been completed. We call waker queue the other
1891
+ * queue, and we assume, for simplicity, that bfqq may
1892
+ * have at most one waker queue.
1893
+ *
1894
+ * A remarkable throughput boost can be reached by
1895
+ * unconditionally injecting the I/O of the waker
1896
+ * queue, every time a new bfq_dispatch_request
1897
+ * happens to be invoked while I/O is being plugged
1898
+ * for bfqq. In addition to boosting throughput, this
1899
+ * unblocks bfqq's I/O, thereby improving bandwidth
1900
+ * and latency for bfqq. Note that these same results
1901
+ * may be achieved with the general injection
1902
+ * mechanism, but less effectively. For details on
1903
+ * this aspect, see the comments on the choice of the
1904
+ * queue for injection in bfq_select_queue().
1905
+ *
1906
+ * Turning back to the detection of a waker queue, a
1907
+ * queue Q is deemed as a waker queue for bfqq if, for
1908
+ * two consecutive times, bfqq happens to become non
1909
+ * empty right after a request of Q has been
1910
+ * completed. In particular, on the first time, Q is
1911
+ * tentatively set as a candidate waker queue, while
1912
+ * on the second time, the flag
1913
+ * bfq_bfqq_has_waker(bfqq) is set to confirm that Q
1914
+ * is a waker queue for bfqq. These detection steps
1915
+ * are performed only if bfqq has a long think time,
1916
+ * so as to make it more likely that bfqq's I/O is
1917
+ * actually being blocked by a synchronization. This
1918
+ * last filter, plus the above two-times requirement,
1919
+ * make false positives less likely.
1920
+ *
1921
+ * NOTE
1922
+ *
1923
+ * The sooner a waker queue is detected, the sooner
1924
+ * throughput can be boosted by injecting I/O from the
1925
+ * waker queue. Fortunately, detection is likely to be
1926
+ * actually fast, for the following reasons. While
1927
+ * blocked by synchronization, bfqq has a long think
1928
+ * time. This implies that bfqq's inject limit is at
1929
+ * least equal to 1 (see the comments in
1930
+ * bfq_update_inject_limit()). So, thanks to
1931
+ * injection, the waker queue is likely to be served
1932
+ * during the very first I/O-plugging time interval
1933
+ * for bfqq. This triggers the first step of the
1934
+ * detection mechanism. Thanks again to injection, the
1935
+ * candidate waker queue is then likely to be
1936
+ * confirmed no later than during the next
1937
+ * I/O-plugging interval for bfqq.
1938
+ */
1939
+ if (bfqd->last_completed_rq_bfqq &&
1940
+ !bfq_bfqq_has_short_ttime(bfqq) &&
1941
+ ktime_get_ns() - bfqd->last_completion <
1942
+ 200 * NSEC_PER_USEC) {
1943
+ if (bfqd->last_completed_rq_bfqq != bfqq &&
1944
+ bfqd->last_completed_rq_bfqq !=
1945
+ bfqq->waker_bfqq) {
1946
+ /*
1947
+ * First synchronization detected with
1948
+ * a candidate waker queue, or with a
1949
+ * different candidate waker queue
1950
+ * from the current one.
1951
+ */
1952
+ bfqq->waker_bfqq = bfqd->last_completed_rq_bfqq;
1953
+
1954
+ /*
1955
+ * If the waker queue disappears, then
1956
+ * bfqq->waker_bfqq must be reset. To
1957
+ * this goal, we maintain in each
1958
+ * waker queue a list, woken_list, of
1959
+ * all the queues that reference the
1960
+ * waker queue through their
1961
+ * waker_bfqq pointer. When the waker
1962
+ * queue exits, the waker_bfqq pointer
1963
+ * of all the queues in the woken_list
1964
+ * is reset.
1965
+ *
1966
+ * In addition, if bfqq is already in
1967
+ * the woken_list of a waker queue,
1968
+ * then, before being inserted into
1969
+ * the woken_list of a new waker
1970
+ * queue, bfqq must be removed from
1971
+ * the woken_list of the old waker
1972
+ * queue.
1973
+ */
1974
+ if (!hlist_unhashed(&bfqq->woken_list_node))
1975
+ hlist_del_init(&bfqq->woken_list_node);
1976
+ hlist_add_head(&bfqq->woken_list_node,
1977
+ &bfqd->last_completed_rq_bfqq->woken_list);
1978
+
1979
+ bfq_clear_bfqq_has_waker(bfqq);
1980
+ } else if (bfqd->last_completed_rq_bfqq ==
1981
+ bfqq->waker_bfqq &&
1982
+ !bfq_bfqq_has_waker(bfqq)) {
1983
+ /*
1984
+ * synchronization with waker_bfqq
1985
+ * seen for the second time
1986
+ */
1987
+ bfq_mark_bfqq_has_waker(bfqq);
1988
+ }
1989
+ }
1990
+
1991
+ /*
1992
+ * Periodically reset inject limit, to make sure that
1993
+ * the latter eventually drops in case workload
1994
+ * changes, see step (3) in the comments on
1995
+ * bfq_update_inject_limit().
1996
+ */
1997
+ if (time_is_before_eq_jiffies(bfqq->decrease_time_jif +
1998
+ msecs_to_jiffies(1000)))
1999
+ bfq_reset_inject_limit(bfqd, bfqq);
2000
+
2001
+ /*
2002
+ * The following conditions must hold to setup a new
2003
+ * sampling of total service time, and then a new
2004
+ * update of the inject limit:
2005
+ * - bfqq is in service, because the total service
2006
+ * time is evaluated only for the I/O requests of
2007
+ * the queues in service;
2008
+ * - this is the right occasion to compute or to
2009
+ * lower the baseline total service time, because
2010
+ * there are actually no requests in the drive,
2011
+ * or
2012
+ * the baseline total service time is available, and
2013
+ * this is the right occasion to compute the other
2014
+ * quantity needed to update the inject limit, i.e.,
2015
+ * the total service time caused by the amount of
2016
+ * injection allowed by the current value of the
2017
+ * limit. It is the right occasion because injection
2018
+ * has actually been performed during the service
2019
+ * hole, and there are still in-flight requests,
2020
+ * which are very likely to be exactly the injected
2021
+ * requests, or part of them;
2022
+ * - the minimum interval for sampling the total
2023
+ * service time and updating the inject limit has
2024
+ * elapsed.
2025
+ */
2026
+ if (bfqq == bfqd->in_service_queue &&
2027
+ (bfqd->rq_in_driver == 0 ||
2028
+ (bfqq->last_serv_time_ns > 0 &&
2029
+ bfqd->rqs_injected && bfqd->rq_in_driver > 0)) &&
2030
+ time_is_before_eq_jiffies(bfqq->decrease_time_jif +
2031
+ msecs_to_jiffies(10))) {
2032
+ bfqd->last_empty_occupied_ns = ktime_get_ns();
2033
+ /*
2034
+ * Start the state machine for measuring the
2035
+ * total service time of rq: setting
2036
+ * wait_dispatch will cause bfqd->waited_rq to
2037
+ * be set when rq will be dispatched.
2038
+ */
2039
+ bfqd->wait_dispatch = true;
2040
+ /*
2041
+ * If there is no I/O in service in the drive,
2042
+ * then possible injection occurred before the
2043
+ * arrival of rq will not affect the total
2044
+ * service time of rq. So the injection limit
2045
+ * must not be updated as a function of such
2046
+ * total service time, unless new injection
2047
+ * occurs before rq is completed. To have the
2048
+ * injection limit updated only in the latter
2049
+ * case, reset rqs_injected here (rqs_injected
2050
+ * will be set in case injection is performed
2051
+ * on bfqq before rq is completed).
2052
+ */
2053
+ if (bfqd->rq_in_driver == 0)
2054
+ bfqd->rqs_injected = false;
2055
+ }
2056
+ }
2057
+
16902058 elv_rb_add(&bfqq->sort_list, rq);
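For illustration only (not part of the patch): the waker-detection code added above is essentially a two-step state machine: a queue whose request completed within the 200 us window just before bfqq turned non-empty becomes the candidate waker, and the same queue observed a second time is confirmed (has_waker). A standalone model of those steps, with hypothetical names and the window taken from the hunk:

#include <stdbool.h>
#include <stdint.h>

#define WAKER_WINDOW_NS	(200u * 1000u)	/* 200 us, as in the hunk above */

struct waker_model {
	int waker_id;			/* candidate waker queue, -1 = none */
	bool has_waker;			/* candidate seen twice, hence confirmed */
};

/* Called when queue_id becomes non-empty; last_completer_id is the queue whose
 * request completed most recently, at last_completion_ns. */
static void update_waker(struct waker_model *m, int queue_id,
			 int last_completer_id, uint64_t now_ns,
			 uint64_t last_completion_ns, bool short_think_time)
{
	if (last_completer_id < 0 || short_think_time ||
	    now_ns - last_completion_ns >= WAKER_WINDOW_NS)
		return;			/* not plausibly woken by that completion */

	if (last_completer_id != queue_id && last_completer_id != m->waker_id) {
		m->waker_id = last_completer_id;	/* first sighting: candidate */
		m->has_waker = false;
	} else if (last_completer_id == m->waker_id && !m->has_waker) {
		m->has_waker = true;			/* second sighting: confirmed */
	}
}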
16912059
16922060 /*
....@@ -1698,8 +2066,9 @@
16982066
16992067 /*
17002068 * Adjust priority tree position, if next_rq changes.
2069
+ * See comments on bfq_pos_tree_add_move() for the unlikely().
17012070 */
1702
- if (prev != bfqq->next_rq)
2071
+ if (unlikely(!bfqd->nonrot_with_queueing && prev != bfqq->next_rq))
17032072 bfq_pos_tree_add_move(bfqd, bfqq);
17042073
17052074 if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... */
....@@ -1839,7 +2208,9 @@
18392208 bfqq->pos_root = NULL;
18402209 }
18412210 } else {
1842
- bfq_pos_tree_add_move(bfqd, bfqq);
2211
+ /* see comments on bfq_pos_tree_add_move() for the unlikely() */
2212
+ if (unlikely(!bfqd->nonrot_with_queueing))
2213
+ bfq_pos_tree_add_move(bfqd, bfqq);
18432214 }
18442215
18452216 if (rq->cmd_flags & REQ_META)
....@@ -1847,9 +2218,9 @@
18472218
18482219 }
18492220
1850
-static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
2221
+static bool bfq_bio_merge(struct request_queue *q, struct bio *bio,
2222
+ unsigned int nr_segs)
18512223 {
1852
- struct request_queue *q = hctx->queue;
18532224 struct bfq_data *bfqd = q->elevator->elevator_data;
18542225 struct request *free = NULL;
18552226 /*
....@@ -1864,13 +2235,20 @@
18642235
18652236 spin_lock_irq(&bfqd->lock);
18662237
1867
- if (bic)
2238
+ if (bic) {
2239
+ /*
2240
+ * Make sure cgroup info is uptodate for current process before
2241
+ * considering the merge.
2242
+ */
2243
+ bfq_bic_update_cgroup(bic, bio);
2244
+
18682245 bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf));
1869
- else
2246
+ } else {
18702247 bfqd->bio_bfqq = NULL;
2248
+ }
18712249 bfqd->bio_bic = bic;
18722250
1873
- ret = blk_mq_sched_try_merge(q, bio, &free);
2251
+ ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free);
18742252
18752253 if (free)
18762254 blk_mq_free_request(free);
....@@ -1888,13 +2266,14 @@
18882266 __rq = bfq_find_rq_fmerge(bfqd, bio, q);
18892267 if (__rq && elv_bio_merge_ok(__rq, bio)) {
18902268 *req = __rq;
2269
+
2270
+ if (blk_discard_mergable(__rq))
2271
+ return ELEVATOR_DISCARD_MERGE;
18912272 return ELEVATOR_FRONT_MERGE;
18922273 }
18932274
18942275 return ELEVATOR_NO_MERGE;
18952276 }
1896
-
1897
-static struct bfq_queue *bfq_init_rq(struct request *rq);
18982277
18992278 static void bfq_request_merged(struct request_queue *q, struct request *req,
19002279 enum elv_merge type)
....@@ -1904,7 +2283,7 @@
19042283 blk_rq_pos(req) <
19052284 blk_rq_pos(container_of(rb_prev(&req->rb_node),
19062285 struct request, rb_node))) {
1907
- struct bfq_queue *bfqq = bfq_init_rq(req);
2286
+ struct bfq_queue *bfqq = RQ_BFQQ(req);
19082287 struct bfq_data *bfqd;
19092288 struct request *prev, *next_rq;
19102289
....@@ -1929,7 +2308,12 @@
19292308 */
19302309 if (prev != bfqq->next_rq) {
19312310 bfq_updated_next_req(bfqd, bfqq);
1932
- bfq_pos_tree_add_move(bfqd, bfqq);
2311
+ /*
2312
+ * See comments on bfq_pos_tree_add_move() for
2313
+ * the unlikely().
2314
+ */
2315
+ if (unlikely(!bfqd->nonrot_with_queueing))
2316
+ bfq_pos_tree_add_move(bfqd, bfqq);
19332317 }
19342318 }
19352319 }
....@@ -1951,8 +2335,8 @@
19512335 static void bfq_requests_merged(struct request_queue *q, struct request *rq,
19522336 struct request *next)
19532337 {
1954
- struct bfq_queue *bfqq = bfq_init_rq(rq),
1955
- *next_bfqq = bfq_init_rq(next);
2338
+ struct bfq_queue *bfqq = RQ_BFQQ(rq),
2339
+ *next_bfqq = RQ_BFQQ(next);
19562340
19572341 if (!bfqq)
19582342 return;
....@@ -2131,6 +2515,14 @@
21312515 if (process_refs == 0 || new_process_refs == 0)
21322516 return NULL;
21332517
2518
+ /*
2519
+ * Make sure merged queues belong to the same parent. Parents could
2520
+ * have changed since the time we decided the two queues are suitable
2521
+ * for merging.
2522
+ */
2523
+ if (new_bfqq->entity.parent != bfqq->entity.parent)
2524
+ return NULL;
2525
+
21342526 bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
21352527 new_bfqq->pid);
21362528
....@@ -2155,6 +2547,15 @@
21552547 * are likely to increase the throughput.
21562548 */
21572549 bfqq->new_bfqq = new_bfqq;
2550
+ /*
2551
+ * The above assignment schedules the following redirections:
2552
+ * each time some I/O for bfqq arrives, the process that
2553
+ * generated that I/O is disassociated from bfqq and
2554
+ * associated with new_bfqq. Here we increase new_bfqq->ref
2555
+ * in advance, adding the number of processes that are
2556
+ * expected to be associated with new_bfqq as they happen to
2557
+ * issue I/O.
2558
+ */
21582559 new_bfqq->ref += process_refs;
21592560 return new_bfqq;
21602561 }
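For illustration only (not part of the patch): bfq_setup_merge() above works in "process references", i.e. the references not already accounted for by allocated requests, by the service trees or by the weight counter, and transfers that many references to new_bfqq up front because every future I/O from those processes will be redirected there. A standalone sketch of that accounting, with simplified fields:

struct mq {
	int ref;		/* total references */
	int allocated;		/* held by allocated/in-flight requests */
	int on_tree;		/* 1 while on the service trees or in service */
	int has_weight_ctr;	/* 1 while a weight counter pins the queue */
	struct mq *new_q;	/* scheduled merge target, like bfqq->new_bfqq */
};

static int process_refs(const struct mq *q)
{
	return q->ref - q->allocated - q->on_tree - q->has_weight_ctr;
}

/* Schedule a merge of q into target and hand target one reference per
 * process expected to follow the redirection. */
static int schedule_merge(struct mq *q, struct mq *target)
{
	int refs = process_refs(q);

	if (refs == 0 || process_refs(target) == 0)
		return -1;		/* nothing can be merged safely */

	q->new_q = target;
	target->ref += refs;
	return 0;
}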
....@@ -2214,6 +2615,50 @@
22142615 {
22152616 struct bfq_queue *in_service_bfqq, *new_bfqq;
22162617
2618
+ /* if a merge has already been setup, then proceed with that first */
2619
+ if (bfqq->new_bfqq)
2620
+ return bfqq->new_bfqq;
2621
+
2622
+ /*
2623
+ * Do not perform queue merging if the device is non
2624
+ * rotational and performs internal queueing. In fact, such a
2625
+ * device reaches a high speed through internal parallelism
2626
+ * and pipelining. This means that, to reach a high
2627
+ * throughput, it must have many requests enqueued at the same
2628
+ * time. But, in this configuration, the internal scheduling
2629
+ * algorithm of the device does exactly the job of queue
2630
+ * merging: it reorders requests so as to obtain as much as
2631
+ * possible a sequential I/O pattern. As a consequence, with
2632
+ * the workload generated by processes doing interleaved I/O,
2633
+ * the throughput reached by the device is likely to be the
2634
+ * same, with and without queue merging.
2635
+ *
2636
+ * Disabling merging also provides a remarkable benefit in
2637
+ * terms of throughput. Merging tends to make many workloads
2638
+ * artificially more uneven, because of shared queues
2639
+ * remaining non empty for incomparably more time than
2640
+ * non-merged queues. This may accentuate workload
2641
+ * asymmetries. For example, if one of the queues in a set of
2642
+ * merged queues has a higher weight than a normal queue, then
2643
+ * the shared queue may inherit such a high weight and, by
2644
+ * staying almost always active, may force BFQ to perform I/O
2645
+ * plugging most of the time. This evidently makes it harder
2646
+ * for BFQ to let the device reach a high throughput.
2647
+ *
2648
+ * Finally, the likely() macro below is not used because one
2649
+ * of the two branches is more likely than the other, but to
2650
+ * have the code path after the following if() executed as
2651
+ * fast as possible for the case of a non rotational device
2652
+ * with queueing. We want it because this is the fastest kind
2653
+ * of device. On the opposite end, the likely() may lengthen
2654
+ * the execution time of BFQ for the case of slower devices
2655
+ * (rotational or at least without queueing). But in this case
2656
+ * the execution time of BFQ matters very little, if not at
2657
+ * all.
2658
+ */
2659
+ if (likely(bfqd->nonrot_with_queueing))
2660
+ return NULL;
2661
+
22172662 /*
22182663 * Prevent bfqq from being merged if it has been created too
22192664 * long ago. The idea is that true cooperating processes, and
....@@ -2228,14 +2673,11 @@
22282673 if (bfq_too_late_for_merging(bfqq))
22292674 return NULL;
22302675
2231
- if (bfqq->new_bfqq)
2232
- return bfqq->new_bfqq;
2233
-
22342676 if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq))
22352677 return NULL;
22362678
22372679 /* If there is only one backlogged queue, don't search. */
2238
- if (bfqd->busy_queues == 1)
2680
+ if (bfq_tot_busy_queues(bfqd) == 1)
22392681 return NULL;
22402682
22412683 in_service_bfqq = bfqd->in_service_queue;
....@@ -2277,6 +2719,7 @@
22772719 if (!bic)
22782720 return;
22792721
2722
+ bic->saved_weight = bfqq->entity.orig_weight;
22802723 bic->saved_ttime = bfqq->ttime;
22812724 bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq);
22822725 bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);
....@@ -2295,6 +2738,7 @@
22952738 * to enjoy weight raising if split soon.
22962739 */
22972740 bic->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff;
2741
+ bic->saved_wr_start_at_switch_to_srt = bfq_smallest_from_now();
22982742 bic->saved_wr_cur_max_time = bfq_wr_duration(bfqq->bfqd);
22992743 bic->saved_last_wr_start_finish = jiffies;
23002744 } else {
....@@ -2304,6 +2748,26 @@
23042748 bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish;
23052749 bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time;
23062750 }
2751
+}
2752
+
2753
+void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq)
2754
+{
2755
+ /*
2756
+ * To prevent bfqq's service guarantees from being violated,
2757
+ * bfqq may be left busy, i.e., queued for service, even if
2758
+ * empty (see comments in __bfq_bfqq_expire() for
2759
+ * details). But, if no process will send requests to bfqq any
2760
+ * longer, then there is no point in keeping bfqq queued for
2761
+ * service. In addition, keeping bfqq queued for service, but
2762
+ * with no process ref any longer, may have caused bfqq to be
2763
+ * freed when dequeued from service. But this is assumed to
2764
+ * never happen.
2765
+ */
2766
+ if (bfq_bfqq_busy(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list) &&
2767
+ bfqq != bfqd->in_service_queue)
2768
+ bfq_del_bfqq_busy(bfqd, bfqq, false);
2769
+
2770
+ bfq_put_queue(bfqq);
23072771 }
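For illustration only (not part of the patch): bfq_release_process_ref() above captures the ordering "if the queue was kept busy only for guarantee reasons, is now empty, and is not in service, dequeue it before dropping the process reference", so the final put never tears down a queue that is still sitting in the busy list. A standalone model of that ordering, with stand-in helpers:

#include <stdbool.h>

struct pq {
	int refcount;
	bool busy;		/* still queued for service */
	bool has_pending;	/* still has queued requests */
	bool in_service;
};

static void release_process_ref(struct pq *q)
{
	/* Dequeue first: stands in for bfq_del_bfqq_busy(). */
	if (q->busy && !q->has_pending && !q->in_service)
		q->busy = false;

	/* Then drop the reference: stands in for bfq_put_queue(). */
	if (--q->refcount == 0) {
		/* last reference gone: the real code frees the queue here */
	}
}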
23082772
23092773 static void
....@@ -2352,7 +2816,7 @@
23522816 /*
23532817 * Merge queues (that is, let bic redirect its requests to new_bfqq)
23542818 */
2355
- bic_set_bfqq(bic, new_bfqq, 1);
2819
+ bic_set_bfqq(bic, new_bfqq, true);
23562820 bfq_mark_bfqq_coop(new_bfqq);
23572821 /*
23582822 * new_bfqq now belongs to at least two bics (it is a shared queue):
....@@ -2365,9 +2829,18 @@
23652829 * assignment causes no harm).
23662830 */
23672831 new_bfqq->bic = NULL;
2832
+ /*
2833
+ * If the queue is shared, the pid is the pid of one of the associated
2834
+ * processes. Which pid depends on the exact sequence of merge events
2835
+ * the queue underwent. So printing such a pid is useless and confusing
2836
+ * because it reports a random pid between those of the associated
2837
+ * processes.
2838
+ * We mark such a queue with a pid -1, and then print SHARED instead of
2839
+ * a pid in logging messages.
2840
+ */
2841
+ new_bfqq->pid = -1;
23682842 bfqq->bic = NULL;
2369
- /* release process reference to bfqq */
2370
- bfq_put_queue(bfqq);
2843
+ bfq_release_process_ref(bfqd, bfqq);
23712844 }
23722845
23732846 static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq,
....@@ -2399,8 +2872,8 @@
23992872 /*
24002873 * bic still points to bfqq, then it has not yet been
24012874 * redirected to some other bfq_queue, and a queue
2402
- * merge beween bfqq and new_bfqq can be safely
2403
- * fulfillled, i.e., bic can be redirected to new_bfqq
2875
+ * merge between bfqq and new_bfqq can be safely
2876
+ * fulfilled, i.e., bic can be redirected to new_bfqq
24042877 * and bfqq can be put.
24052878 */
24062879 bfq_merge_bfqqs(bfqd, bfqd->bio_bic, bfqq,
....@@ -2535,12 +3008,14 @@
25353008 * queue).
25363009 */
25373010 if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 &&
2538
- bfq_symmetric_scenario(bfqd))
3011
+ !bfq_asymmetric_scenario(bfqd, bfqq))
25393012 sl = min_t(u64, sl, BFQ_MIN_TT);
25403013 else if (bfqq->wr_coeff > 1)
25413014 sl = max_t(u32, sl, 20ULL * NSEC_PER_MSEC);
25423015
25433016 bfqd->last_idling_start = ktime_get();
3017
+ bfqd->last_idling_start_jiffies = jiffies;
3018
+
25443019 hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl),
25453020 HRTIMER_MODE_REL);
25463021 bfqg_stats_set_start_idle_time(bfqq_group(bfqq));
....@@ -2764,7 +3239,7 @@
27643239
27653240 if ((bfqd->rq_in_driver > 0 ||
27663241 now_ns - bfqd->last_completion < BFQ_MIN_TT)
2767
- && get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR)
3242
+ && !BFQ_RQ_SEEKY(bfqd, bfqd->last_position, rq))
27683243 bfqd->sequential_samples++;
27693244
27703245 bfqd->tot_sectors_dispatched += blk_rq_sectors(rq);
....@@ -2816,7 +3291,209 @@
28163291 bfq_remove_request(q, rq);
28173292 }
28183293
2819
-static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
3294
+/*
3295
+ * There is a case where idling does not have to be performed for
3296
+ * throughput concerns, but to preserve the throughput share of
3297
+ * the process associated with bfqq.
3298
+ *
3299
+ * To introduce this case, we can note that allowing the drive
3300
+ * to enqueue more than one request at a time, and hence
3301
+ * delegating de facto final scheduling decisions to the
3302
+ * drive's internal scheduler, entails loss of control on the
3303
+ * actual request service order. In particular, the critical
3304
+ * situation is when requests from different processes happen
3305
+ * to be present, at the same time, in the internal queue(s)
3306
+ * of the drive. In such a situation, the drive, by deciding
3307
+ * the service order of the internally-queued requests, does
3308
+ * determine also the actual throughput distribution among
3309
+ * these processes. But the drive typically has no notion or
3310
+ * concern about per-process throughput distribution, and
3311
+ * makes its decisions only on a per-request basis. Therefore,
3312
+ * the service distribution enforced by the drive's internal
3313
+ * scheduler is likely to coincide with the desired throughput
3314
+ * distribution only in a completely symmetric, or favorably
3315
+ * skewed scenario where:
3316
+ * (i-a) each of these processes must get the same throughput as
3317
+ * the others,
3318
+ * (i-b) in case (i-a) does not hold, it holds that the process
3319
+ * associated with bfqq must receive a lower or equal
3320
+ * throughput than any of the other processes;
3321
+ * (ii) the I/O of each process has the same properties, in
3322
+ * terms of locality (sequential or random), direction
3323
+ * (reads or writes), request sizes, greediness
3324
+ * (from I/O-bound to sporadic), and so on;
3325
+
3326
+ * In fact, in such a scenario, the drive tends to treat the requests
3327
+ * of each process in about the same way as the requests of the
3328
+ * others, and thus to provide each of these processes with about the
3329
+ * same throughput. This is exactly the desired throughput
3330
+ * distribution if (i-a) holds, or, if (i-b) holds instead, this is an
3331
+ * even more convenient distribution for (the process associated with)
3332
+ * bfqq.
3333
+ *
3334
+ * In contrast, in any asymmetric or unfavorable scenario, device
3335
+ * idling (I/O-dispatch plugging) is certainly needed to guarantee
3336
+ * that bfqq receives its assigned fraction of the device throughput
3337
+ * (see [1] for details).
3338
+ *
3339
+ * The problem is that idling may significantly reduce throughput with
3340
+ * certain combinations of types of I/O and devices. An important
3341
+ * example is sync random I/O on flash storage with command
3342
+ * queueing. So, unless bfqq falls in cases where idling also boosts
3343
+ * throughput, it is important to check conditions (i-a), i(-b) and
3344
+ * (ii) accurately, so as to avoid idling when not strictly needed for
3345
+ * service guarantees.
3346
+ *
3347
+ * Unfortunately, it is extremely difficult to thoroughly check
3348
+ * condition (ii). And, in case there are active groups, it becomes
3349
+ * very difficult to check conditions (i-a) and (i-b) too. In fact,
3350
+ * if there are active groups, then, for conditions (i-a) or (i-b) to
3351
+ * become false 'indirectly', it is enough that an active group
3352
+ * contains more active processes or sub-groups than some other active
3353
+ * group. More precisely, for conditions (i-a) or (i-b) to become
3354
+ * false because of such a group, it is not even necessary that the
3355
+ * group is (still) active: it is sufficient that, even if the group
3356
+ * has become inactive, some of its descendant processes still have
3357
+ * some request already dispatched but still waiting for
3358
+ * completion. In fact, requests have still to be guaranteed their
3359
+ * share of the throughput even after being dispatched. In this
3360
+ * respect, it is easy to show that, if a group frequently becomes
3361
+ * inactive while still having in-flight requests, and if, when this
3362
+ * happens, the group is not considered in the calculation of whether
3363
+ * the scenario is asymmetric, then the group may fail to be
3364
+ * guaranteed its fair share of the throughput (basically because
3365
+ * idling may not be performed for the descendant processes of the
3366
+ * group, but it had to be). We address this issue with the following
3367
+ * bi-modal behavior, implemented in the function
3368
+ * bfq_asymmetric_scenario().
3369
+ *
3370
+ * If there are groups with requests waiting for completion
3371
+ * (as commented above, some of these groups may even be
3372
+ * already inactive), then the scenario is tagged as
3373
+ * asymmetric, conservatively, without checking any of the
3374
+ * conditions (i-a), (i-b) or (ii). So the device is idled for bfqq.
3375
+ * This behavior matches also the fact that groups are created
3376
+ * exactly if controlling I/O is a primary concern (to
3377
+ * preserve bandwidth and latency guarantees).
3378
+ *
3379
+ * On the opposite end, if there are no groups with requests waiting
3380
+ * for completion, then only conditions (i-a) and (i-b) are actually
3381
+ * controlled, i.e., provided that conditions (i-a) or (i-b) holds,
3382
+ * idling is not performed, regardless of whether condition (ii)
3383
+ * holds. In other words, only if conditions (i-a) and (i-b) do not
3384
+ * hold, then idling is allowed, and the device tends to be prevented
3385
+ * from queueing many requests, possibly of several processes. Since
3386
+ * there are no groups with requests waiting for completion, then, to
3387
+ * control conditions (i-a) and (i-b) it is enough to check just
3388
+ * whether all the queues with requests waiting for completion also
3389
+ * have the same weight.
3390
+ *
3391
+ * Not checking condition (ii) evidently exposes bfqq to the
3392
+ * risk of getting less throughput than its fair share.
3393
+ * However, for queues with the same weight, a further
3394
+ * mechanism, preemption, mitigates or even eliminates this
3395
+ * problem. And it does so without consequences on overall
3396
+ * throughput. This mechanism and its benefits are explained
3397
+ * in the next three paragraphs.
3398
+ *
3399
+ * Even if a queue, say Q, is expired when it remains idle, Q
3400
+ * can still preempt the new in-service queue if the next
3401
+ * request of Q arrives soon (see the comments on
3402
+ * bfq_bfqq_update_budg_for_activation). If all queues and
3403
+ * groups have the same weight, this form of preemption,
3404
+ * combined with the hole-recovery heuristic described in the
3405
+ * comments on function bfq_bfqq_update_budg_for_activation,
3406
+ * are enough to preserve a correct bandwidth distribution in
3407
+ * the mid term, even without idling. In fact, even if not
3408
+ * idling allows the internal queues of the device to contain
3409
+ * many requests, and thus to reorder requests, we can rather
3410
+ * safely assume that the internal scheduler still preserves a
3411
+ * minimum of mid-term fairness.
3412
+ *
3413
+ * More precisely, this preemption-based, idleless approach
3414
+ * provides fairness in terms of IOPS, and not sectors per
3415
+ * second. This can be seen with a simple example. Suppose
3416
+ * that there are two queues with the same weight, but that
3417
+ * the first queue receives requests of 8 sectors, while the
3418
+ * second queue receives requests of 1024 sectors. In
3419
+ * addition, suppose that each of the two queues contains at
3420
+ * most one request at a time, which implies that each queue
3421
+ * always remains idle after it is served. Finally, after
3422
+ * remaining idle, each queue receives very quickly a new
3423
+ * request. It follows that the two queues are served
3424
+ * alternatively, preempting each other if needed. This
3425
+ * implies that, although both queues have the same weight,
3426
+ * the queue with large requests receives a service that is
3427
+ * 1024/8 times as high as the service received by the other
3428
+ * queue.
3429
+ *
3430
+ * The motivation for using preemption instead of idling (for
3431
+ * queues with the same weight) is that, by not idling,
3432
+ * service guarantees are preserved (completely or at least in
3433
+ * part) without sacrificing any throughput. And, if
3434
+ * there is no active group, then the primary expectation for
3435
+ * this device is probably a high throughput.
3436
+ *
3437
+ * We are now left only with explaining the two sub-conditions in the
3438
+ * additional compound condition that is checked below for deciding
3439
+ * whether the scenario is asymmetric. To explain the first
3440
+ * sub-condition, we need to add that the function
3441
+ * bfq_asymmetric_scenario checks the weights of only
3442
+ * non-weight-raised queues, for efficiency reasons (see comments on
3443
+ * bfq_weights_tree_add()). Then the fact that bfqq is weight-raised
3444
+ * is checked explicitly here. More precisely, the compound condition
3445
+ * below also takes into account the fact that, even if bfqq is being
3446
+ * weight-raised, the scenario is still symmetric if all queues with
3447
+ * requests waiting for completion happen to be
3448
+ * weight-raised. Actually, we should be even more precise here, and
3449
+ * differentiate between interactive weight raising and soft real-time
3450
+ * weight raising.
3451
+ *
3452
+ * The second sub-condition checked in the compound condition is
3453
+ * whether there is a fair amount of already in-flight I/O not
3454
+ * belonging to bfqq. If so, I/O dispatching is to be plugged, for the
3455
+ * following reason. The drive may decide to serve in-flight
3456
+ * non-bfqq's I/O requests before bfqq's ones, thereby delaying the
3457
+ * arrival of new I/O requests for bfqq (recall that bfqq is sync). If
3458
+ * I/O-dispatching is not plugged, then, while bfqq remains empty, a
3459
+ * basically uncontrolled amount of I/O from other queues may be
3460
+ * dispatched too, possibly causing the service of bfqq's I/O to be
3461
+ * delayed even longer in the drive. This problem gets more and more
3462
+ * serious as the speed and the queue depth of the drive grow,
3463
+ * because, as these two quantities grow, the probability to find no
3464
+ * queue busy but many requests in flight grows too. By contrast,
3465
+ * plugging I/O dispatching minimizes the delay induced by already
3466
+ * in-flight I/O, and enables bfqq to recover the bandwidth it may
3467
+ * lose because of this delay.
3468
+ *
3469
+ * As a side note, it is worth considering that the above
3470
+ * device-idling countermeasures may however fail in the following
3471
+ * unlucky scenario: if I/O-dispatch plugging is (correctly) disabled
3472
+ * in a time period during which all symmetry sub-conditions hold, and
3473
+ * therefore the device is allowed to enqueue many requests, but at
3474
+ * some later point in time some sub-condition ceases to hold, then it
3475
+ * may become impossible to make requests be served in the desired
3476
+ * order until all the requests already queued in the device have been
3477
+ * served. The last sub-condition commented above somewhat mitigates
3478
+ * this problem for weight-raised queues.
3479
+ */
3480
+static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd,
3481
+ struct bfq_queue *bfqq)
3482
+{
3483
+ /* No point in idling for bfqq if it won't get requests any longer */
3484
+ if (unlikely(!bfqq_process_refs(bfqq)))
3485
+ return false;
3486
+
3487
+ return (bfqq->wr_coeff > 1 &&
3488
+ (bfqd->wr_busy_queues <
3489
+ bfq_tot_busy_queues(bfqd) ||
3490
+ bfqd->rq_in_driver >=
3491
+ bfqq->dispatched + 4)) ||
3492
+ bfq_asymmetric_scenario(bfqd, bfqq);
3493
+}
3494
+
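/*
 * Editor's note: the block below is an illustrative, userspace-only
 * sketch added for this review, not part of the patch. It restates the
 * compound condition of idling_needed_for_service_guarantees() over a
 * hypothetical, simplified model, so the decision can be read (and
 * compiled) in isolation. All toy_* names are assumptions.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_bfqd {
	int wr_busy_queues;	/* busy weight-raised queues */
	int tot_busy_queues;	/* all busy queues */
	int rq_in_driver;	/* requests already dispatched to the drive */
};

struct toy_bfqq {
	int wr_coeff;		/* > 1 iff the queue is weight-raised */
	int dispatched;		/* this queue's own in-flight requests */
	int process_refs;	/* processes still attached to the queue */
	bool asymmetric;	/* outcome of the asymmetry check */
};

bool toy_idling_needed(const struct toy_bfqd *d, const struct toy_bfqq *q)
{
	if (!q->process_refs)	/* nobody will ever send new I/O: no idling */
		return false;

	/*
	 * Protect a weight-raised queue if some busy queue is not
	 * weight-raised, or if a fair amount of foreign I/O (at least
	 * four requests beyond its own) is already in flight.
	 */
	return (q->wr_coeff > 1 &&
		(d->wr_busy_queues < d->tot_busy_queues ||
		 d->rq_in_driver >= q->dispatched + 4)) ||
	       q->asymmetric;
}

int main(void)
{
	struct toy_bfqd d = { .wr_busy_queues = 1, .tot_busy_queues = 3,
			      .rq_in_driver = 2 };
	struct toy_bfqq q = { .wr_coeff = 30, .dispatched = 1,
			      .process_refs = 1, .asymmetric = false };

	/* a weight-raised queue among plain ones -> idling is needed */
	printf("idling needed: %d\n", toy_idling_needed(&d, &q));
	return 0;
}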
3495
+static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq,
3496
+ enum bfqq_expiration reason)
28203497 {
28213498 /*
28223499 * If this bfqq is shared between multiple processes, check
....@@ -2827,7 +3504,22 @@
28273504 if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
28283505 bfq_mark_bfqq_split_coop(bfqq);
28293506
2830
- if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
3507
+ /*
3508
+ * Consider queues with a higher finish virtual time than
3509
+ * bfqq. If idling_needed_for_service_guarantees(bfqq) returns
3510
+ * true, then bfqq's bandwidth would be violated if an
3511
+ * uncontrolled amount of I/O from these queues were
3512
+ * dispatched while bfqq is waiting for its new I/O to
3513
+ * arrive. This is exactly what may happen if this is a forced
3514
+ * expiration caused by a preemption attempt, and if bfqq is
3515
+ * not re-scheduled. To prevent this from happening, re-queue
3516
+ * bfqq if it needs I/O-dispatch plugging, even if it is
3517
+ * empty. By doing so, bfqq is guaranteed to be served before the
3518
+ * above queues (provided that bfqq is of course eligible).
3519
+ */
3520
+ if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
3521
+ !(reason == BFQQE_PREEMPTED &&
3522
+ idling_needed_for_service_guarantees(bfqd, bfqq))) {
28313523 if (bfqq->dispatched == 0)
28323524 /*
28333525 * Overloading budget_timeout field to store
....@@ -2842,8 +3534,11 @@
28423534 bfq_requeue_bfqq(bfqd, bfqq, true);
28433535 /*
28443536 * Resort priority tree of potential close cooperators.
3537
+ * See comments on bfq_pos_tree_add_move() for the unlikely().
28453538 */
2846
- bfq_pos_tree_add_move(bfqd, bfqq);
3539
+ if (unlikely(!bfqd->nonrot_with_queueing &&
3540
+ !RB_EMPTY_ROOT(&bfqq->sort_list)))
3541
+ bfq_pos_tree_add_move(bfqd, bfqq);
28473542 }
28483543
28493544 /*
....@@ -3217,13 +3912,6 @@
32173912 jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4);
32183913 }
32193914
3220
-static bool bfq_bfqq_injectable(struct bfq_queue *bfqq)
3221
-{
3222
- return BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 &&
3223
- blk_queue_nonrot(bfqq->bfqd->queue) &&
3224
- bfqq->bfqd->hw_tag;
3225
-}
3226
-
32273915 /**
32283916 * bfq_bfqq_expire - expire a queue.
32293917 * @bfqd: device owning the queue.
....@@ -3299,16 +3987,32 @@
32993987 * requests, then the request pattern is isochronous
33003988 * (see the comments on the function
33013989 * bfq_bfqq_softrt_next_start()). Thus we can compute
3302
- * soft_rt_next_start. If, instead, the queue still
3303
- * has outstanding requests, then we have to wait for
3304
- * the completion of all the outstanding requests to
3305
- * discover whether the request pattern is actually
3306
- * isochronous.
3990
+ * soft_rt_next_start. And we do it, unless bfqq is in
3991
+ * interactive weight raising. We do not do it in the
3992
+ * latter subcase, for the following reason. bfqq may
3993
+ * be conveying the I/O needed to load a soft
3994
+ * real-time application. Such an application will
3995
+ * actually exhibit a soft real-time I/O pattern after
3996
+ * it finally starts doing its job. But, if
3997
+ * soft_rt_next_start is computed here for an
3998
+ * interactive bfqq, and bfqq had received a lot of
3999
+ * service before remaining with no outstanding
4000
+ * request (likely to happen on a fast device), then
4001
+ * soft_rt_next_start would be assigned such a high
4002
+ * value that, for a very long time, bfqq would be
4003
+ * prevented from being possibly considered as soft
4004
+ * real time.
4005
+ *
4006
+ * If, instead, the queue still has outstanding
4007
+ * requests, then we have to wait for the completion
4008
+ * of all the outstanding requests to discover whether
4009
+ * the request pattern is actually isochronous.
33074010 */
3308
- if (bfqq->dispatched == 0)
4011
+ if (bfqq->dispatched == 0 &&
4012
+ bfqq->wr_coeff != bfqd->bfq_wr_coeff)
33094013 bfqq->soft_rt_next_start =
33104014 bfq_bfqq_softrt_next_start(bfqd, bfqq);
3311
- else {
4015
+ else if (bfqq->dispatched > 0) {
33124016 /*
33134017 * Schedule an update of soft_rt_next_start to when
33144018 * the task may be discovered to be isochronous.
....@@ -3322,15 +4026,21 @@
33224026 slow, bfqq->dispatched, bfq_bfqq_has_short_ttime(bfqq));
33234027
33244028 /*
4029
+ * bfqq expired, so no total service time needs to be computed
4030
+ * any longer: reset state machine for measuring total service
4031
+ * times.
4032
+ */
4033
+ bfqd->rqs_injected = bfqd->wait_dispatch = false;
4034
+ bfqd->waited_rq = NULL;
4035
+
4036
+ /*
33254037 * Increase, decrease or leave budget unchanged according to
33264038 * reason.
33274039 */
33284040 __bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
3329
- if (__bfq_bfqq_expire(bfqd, bfqq))
4041
+ if (__bfq_bfqq_expire(bfqd, bfqq, reason))
33304042 /* bfqq is gone, no more actions on it */
33314043 return;
3332
-
3333
- bfqq->injected_service = 0;
33344044
33354045 /* mark bfqq as waiting a request only if a bic still points to it */
33364046 if (!bfq_bfqq_busy(bfqq) &&
....@@ -3399,52 +4109,16 @@
33994109 bfq_bfqq_budget_timeout(bfqq);
34004110 }
34014111
3402
-/*
3403
- * For a queue that becomes empty, device idling is allowed only if
3404
- * this function returns true for the queue. As a consequence, since
3405
- * device idling plays a critical role in both throughput boosting and
3406
- * service guarantees, the return value of this function plays a
3407
- * critical role in both these aspects as well.
3408
- *
3409
- * In a nutshell, this function returns true only if idling is
3410
- * beneficial for throughput or, even if detrimental for throughput,
3411
- * idling is however necessary to preserve service guarantees (low
3412
- * latency, desired throughput distribution, ...). In particular, on
3413
- * NCQ-capable devices, this function tries to return false, so as to
3414
- * help keep the drives' internal queues full, whenever this helps the
3415
- * device boost the throughput without causing any service-guarantee
3416
- * issue.
3417
- *
3418
- * In more detail, the return value of this function is obtained by,
3419
- * first, computing a number of boolean variables that take into
3420
- * account throughput and service-guarantee issues, and, then,
3421
- * combining these variables in a logical expression. Most of the
3422
- * issues taken into account are not trivial. We discuss these issues
3423
- * individually while introducing the variables.
3424
- */
3425
-static bool bfq_better_to_idle(struct bfq_queue *bfqq)
4112
+static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd,
4113
+ struct bfq_queue *bfqq)
34264114 {
3427
- struct bfq_data *bfqd = bfqq->bfqd;
34284115 bool rot_without_queueing =
34294116 !blk_queue_nonrot(bfqd->queue) && !bfqd->hw_tag,
34304117 bfqq_sequential_and_IO_bound,
3431
- idling_boosts_thr, idling_boosts_thr_without_issues,
3432
- idling_needed_for_service_guarantees,
3433
- asymmetric_scenario;
4118
+ idling_boosts_thr;
34344119
3435
- if (bfqd->strict_guarantees)
3436
- return true;
3437
-
3438
- /*
3439
- * Idling is performed only if slice_idle > 0. In addition, we
3440
- * do not idle if
3441
- * (a) bfqq is async
3442
- * (b) bfqq is in the idle io prio class: in this case we do
3443
- * not idle because we want to minimize the bandwidth that
3444
- * queues in this class can steal to higher-priority queues
3445
- */
3446
- if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_sync(bfqq) ||
3447
- bfq_class_idle(bfqq))
4120
+ /* No point in idling for bfqq if it won't get requests any longer */
4121
+ if (unlikely(!bfqq_process_refs(bfqq)))
34484122 return false;
34494123
34504124 bfqq_sequential_and_IO_bound = !BFQQ_SEEKY(bfqq) &&
....@@ -3477,8 +4151,7 @@
34774151 bfqq_sequential_and_IO_bound);
34784152
34794153 /*
3480
- * The value of the next variable,
3481
- * idling_boosts_thr_without_issues, is equal to that of
4154
+ * The return value of this function is equal to that of
34824155 * idling_boosts_thr, unless a special case holds. In this
34834156 * special case, described below, idling may cause problems to
34844157 * weight-raised queues.
....@@ -3495,217 +4168,85 @@
34954168 * which enqueue several requests in advance, and further
34964169 * reorder internally-queued requests.
34974170 *
3498
- * For this reason, we force to false the value of
3499
- * idling_boosts_thr_without_issues if there are weight-raised
3500
- * busy queues. In this case, and if bfqq is not weight-raised,
3501
- * this guarantees that the device is not idled for bfqq (if,
3502
- * instead, bfqq is weight-raised, then idling will be
3503
- * guaranteed by another variable, see below). Combined with
3504
- * the timestamping rules of BFQ (see [1] for details), this
3505
- * behavior causes bfqq, and hence any sync non-weight-raised
3506
- * queue, to get a lower number of requests served, and thus
3507
- * to ask for a lower number of requests from the request
3508
- * pool, before the busy weight-raised queues get served
3509
- * again. This often mitigates starvation problems in the
3510
- * presence of heavy write workloads and NCQ, thereby
3511
- * guaranteeing a higher application and system responsiveness
3512
- * in these hostile scenarios.
4171
+ * For this reason, we force to false the return value if
4172
+ * there are weight-raised busy queues. In this case, and if
4173
+ * bfqq is not weight-raised, this guarantees that the device
4174
+ * is not idled for bfqq (if, instead, bfqq is weight-raised,
4175
+ * then idling will be guaranteed by another variable, see
4176
+ * below). Combined with the timestamping rules of BFQ (see
4177
+ * [1] for details), this behavior causes bfqq, and hence any
4178
+ * sync non-weight-raised queue, to get a lower number of
4179
+ * requests served, and thus to ask for a lower number of
4180
+ * requests from the request pool, before the busy
4181
+ * weight-raised queues get served again. This often mitigates
4182
+ * starvation problems in the presence of heavy write
4183
+ * workloads and NCQ, thereby guaranteeing a higher
4184
+ * application and system responsiveness in these hostile
4185
+ * scenarios.
35134186 */
3514
- idling_boosts_thr_without_issues = idling_boosts_thr &&
4187
+ return idling_boosts_thr &&
35154188 bfqd->wr_busy_queues == 0;
4189
+}
4190
+
4191
+/*
4192
+ * For a queue that becomes empty, device idling is allowed only if
4193
+ * this function returns true for that queue. As a consequence, since
4194
+ * device idling plays a critical role for both throughput boosting
4195
+ * and service guarantees, the return value of this function plays a
4196
+ * critical role as well.
4197
+ *
4198
+ * In a nutshell, this function returns true only if idling is
4199
+ * beneficial for throughput or, even if detrimental for throughput,
4200
+ * idling is however necessary to preserve service guarantees (low
4201
+ * latency, desired throughput distribution, ...). In particular, on
4202
+ * NCQ-capable devices, this function tries to return false, so as to
4203
+ * help keep the drives' internal queues full, whenever this helps the
4204
+ * device boost the throughput without causing any service-guarantee
4205
+ * issue.
4206
+ *
4207
+ * Most of the issues taken into account to get the return value of
4208
+ * this function are not trivial. We discuss these issues in the two
4209
+ * functions providing the main pieces of information needed by this
4210
+ * function.
4211
+ */
4212
+static bool bfq_better_to_idle(struct bfq_queue *bfqq)
4213
+{
4214
+ struct bfq_data *bfqd = bfqq->bfqd;
4215
+ bool idling_boosts_thr_with_no_issue, idling_needed_for_service_guar;
4216
+
4217
+ /* No point in idling for bfqq if it won't get requests any longer */
4218
+ if (unlikely(!bfqq_process_refs(bfqq)))
4219
+ return false;
4220
+
4221
+ if (unlikely(bfqd->strict_guarantees))
4222
+ return true;
35164223
35174224 /*
3518
- * There is then a case where idling must be performed not
3519
- * for throughput concerns, but to preserve service
3520
- * guarantees.
3521
- *
3522
- * To introduce this case, we can note that allowing the drive
3523
- * to enqueue more than one request at a time, and hence
3524
- * delegating de facto final scheduling decisions to the
3525
- * drive's internal scheduler, entails loss of control on the
3526
- * actual request service order. In particular, the critical
3527
- * situation is when requests from different processes happen
3528
- * to be present, at the same time, in the internal queue(s)
3529
- * of the drive. In such a situation, the drive, by deciding
3530
- * the service order of the internally-queued requests, does
3531
- * determine also the actual throughput distribution among
3532
- * these processes. But the drive typically has no notion or
3533
- * concern about per-process throughput distribution, and
3534
- * makes its decisions only on a per-request basis. Therefore,
3535
- * the service distribution enforced by the drive's internal
3536
- * scheduler is likely to coincide with the desired
3537
- * device-throughput distribution only in a completely
3538
- * symmetric scenario where:
3539
- * (i) each of these processes must get the same throughput as
3540
- * the others;
3541
- * (ii) the I/O of each process has the same properties, in
3542
- * terms of locality (sequential or random), direction
3543
- * (reads or writes), request sizes, greediness
3544
- * (from I/O-bound to sporadic), and so on.
3545
- * In fact, in such a scenario, the drive tends to treat
3546
- * the requests of each of these processes in about the same
3547
- * way as the requests of the others, and thus to provide
3548
- * each of these processes with about the same throughput
3549
- * (which is exactly the desired throughput distribution). In
3550
- * contrast, in any asymmetric scenario, device idling is
3551
- * certainly needed to guarantee that bfqq receives its
3552
- * assigned fraction of the device throughput (see [1] for
3553
- * details).
3554
- * The problem is that idling may significantly reduce
3555
- * throughput with certain combinations of types of I/O and
3556
- * devices. An important example is sync random I/O, on flash
3557
- * storage with command queueing. So, unless bfqq falls in the
3558
- * above cases where idling also boosts throughput, it would
3559
- * be important to check conditions (i) and (ii) accurately,
3560
- * so as to avoid idling when not strictly needed for service
3561
- * guarantees.
3562
- *
3563
- * Unfortunately, it is extremely difficult to thoroughly
3564
- * check condition (ii). And, in case there are active groups,
3565
- * it becomes very difficult to check condition (i) too. In
3566
- * fact, if there are active groups, then, for condition (i)
3567
- * to become false, it is enough that an active group contains
3568
- * more active processes or sub-groups than some other active
3569
- * group. More precisely, for condition (i) to hold because of
3570
- * such a group, it is not even necessary that the group is
3571
- * (still) active: it is sufficient that, even if the group
3572
- * has become inactive, some of its descendant processes still
3573
- * have some request already dispatched but still waiting for
3574
- * completion. In fact, requests have still to be guaranteed
3575
- * their share of the throughput even after being
3576
- * dispatched. In this respect, it is easy to show that, if a
3577
- * group frequently becomes inactive while still having
3578
- * in-flight requests, and if, when this happens, the group is
3579
- * not considered in the calculation of whether the scenario
3580
- * is asymmetric, then the group may fail to be guaranteed its
3581
- * fair share of the throughput (basically because idling may
3582
- * not be performed for the descendant processes of the group,
3583
- * but it had to be). We address this issue with the
3584
- * following bi-modal behavior, implemented in the function
3585
- * bfq_symmetric_scenario().
3586
- *
3587
- * If there are groups with requests waiting for completion
3588
- * (as commented above, some of these groups may even be
3589
- * already inactive), then the scenario is tagged as
3590
- * asymmetric, conservatively, without checking any of the
3591
- * conditions (i) and (ii). So the device is idled for bfqq.
3592
- * This behavior matches also the fact that groups are created
3593
- * exactly if controlling I/O is a primary concern (to
3594
- * preserve bandwidth and latency guarantees).
3595
- *
3596
- * On the opposite end, if there are no groups with requests
3597
- * waiting for completion, then only condition (i) is actually
3598
- * controlled, i.e., provided that condition (i) holds, idling
3599
- * is not performed, regardless of whether condition (ii)
3600
- * holds. In other words, only if condition (i) does not hold,
3601
- * then idling is allowed, and the device tends to be
3602
- * prevented from queueing many requests, possibly of several
3603
- * processes. Since there are no groups with requests waiting
3604
- * for completion, then, to control condition (i) it is enough
3605
- * to check just whether all the queues with requests waiting
3606
- * for completion also have the same weight.
3607
- *
3608
- * Not checking condition (ii) evidently exposes bfqq to the
3609
- * risk of getting less throughput than its fair share.
3610
- * However, for queues with the same weight, a further
3611
- * mechanism, preemption, mitigates or even eliminates this
3612
- * problem. And it does so without consequences on overall
3613
- * throughput. This mechanism and its benefits are explained
3614
- * in the next three paragraphs.
3615
- *
3616
- * Even if a queue, say Q, is expired when it remains idle, Q
3617
- * can still preempt the new in-service queue if the next
3618
- * request of Q arrives soon (see the comments on
3619
- * bfq_bfqq_update_budg_for_activation). If all queues and
3620
- * groups have the same weight, this form of preemption,
3621
- * combined with the hole-recovery heuristic described in the
3622
- * comments on function bfq_bfqq_update_budg_for_activation,
3623
- * are enough to preserve a correct bandwidth distribution in
3624
- * the mid term, even without idling. In fact, even if not
3625
- * idling allows the internal queues of the device to contain
3626
- * many requests, and thus to reorder requests, we can rather
3627
- * safely assume that the internal scheduler still preserves a
3628
- * minimum of mid-term fairness.
3629
- *
3630
- * More precisely, this preemption-based, idleless approach
3631
- * provides fairness in terms of IOPS, and not sectors per
3632
- * second. This can be seen with a simple example. Suppose
3633
- * that there are two queues with the same weight, but that
3634
- * the first queue receives requests of 8 sectors, while the
3635
- * second queue receives requests of 1024 sectors. In
3636
- * addition, suppose that each of the two queues contains at
3637
- * most one request at a time, which implies that each queue
3638
- * always remains idle after it is served. Finally, after
3639
- * remaining idle, each queue receives very quickly a new
3640
- * request. It follows that the two queues are served
3641
- * alternatively, preempting each other if needed. This
3642
- * implies that, although both queues have the same weight,
3643
- * the queue with large requests receives a service that is
3644
- * 1024/8 times as high as the service received by the other
3645
- * queue.
3646
- *
3647
- * The motivation for using preemption instead of idling (for
3648
- * queues with the same weight) is that, by not idling,
3649
- * service guarantees are preserved (completely or at least in
3650
- * part) without minimally sacrificing throughput. And, if
3651
- * there is no active group, then the primary expectation for
3652
- * this device is probably a high throughput.
3653
- *
3654
- * We are now left only with explaining the additional
3655
- * compound condition that is checked below for deciding
3656
- * whether the scenario is asymmetric. To explain this
3657
- * compound condition, we need to add that the function
3658
- * bfq_symmetric_scenario checks the weights of only
3659
- * non-weight-raised queues, for efficiency reasons (see
3660
- * comments on bfq_weights_tree_add()). Then the fact that
3661
- * bfqq is weight-raised is checked explicitly here. More
3662
- * precisely, the compound condition below takes into account
3663
- * also the fact that, even if bfqq is being weight-raised,
3664
- * the scenario is still symmetric if all queues with requests
3665
- * waiting for completion happen to be
3666
- * weight-raised. Actually, we should be even more precise
3667
- * here, and differentiate between interactive weight raising
3668
- * and soft real-time weight raising.
3669
- *
3670
- * As a side note, it is worth considering that the above
3671
- * device-idling countermeasures may however fail in the
3672
- * following unlucky scenario: if idling is (correctly)
3673
- * disabled in a time period during which all symmetry
3674
- * sub-conditions hold, and hence the device is allowed to
3675
- * enqueue many requests, but at some later point in time some
3676
- * sub-condition stops to hold, then it may become impossible
3677
- * to let requests be served in the desired order until all
3678
- * the requests already queued in the device have been served.
4225
+ * Idling is performed only if slice_idle > 0. In addition, we
4226
+ * do not idle if
4227
+ * (a) bfqq is async
4228
+ * (b) bfqq is in the idle io prio class: in this case we do
4229
+ * not idle because we want to minimize the bandwidth that
4230
+ * queues in this class can steal from higher-priority queues
36794231 */
3680
- asymmetric_scenario = (bfqq->wr_coeff > 1 &&
3681
- bfqd->wr_busy_queues < bfqd->busy_queues) ||
3682
- !bfq_symmetric_scenario(bfqd);
4232
+ if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_sync(bfqq) ||
4233
+ bfq_class_idle(bfqq))
4234
+ return false;
4235
+
4236
+ idling_boosts_thr_with_no_issue =
4237
+ idling_boosts_thr_without_issues(bfqd, bfqq);
4238
+
4239
+ idling_needed_for_service_guar =
4240
+ idling_needed_for_service_guarantees(bfqd, bfqq);
36834241
36844242 /*
3685
- * Finally, there is a case where maximizing throughput is the
3686
- * best choice even if it may cause unfairness toward
3687
- * bfqq. Such a case is when bfqq became active in a burst of
3688
- * queue activations. Queues that became active during a large
3689
- * burst benefit only from throughput, as discussed in the
3690
- * comments on bfq_handle_burst. Thus, if bfqq became active
3691
- * in a burst and not idling the device maximizes throughput,
3692
- * then the device must no be idled, because not idling the
3693
- * device provides bfqq and all other queues in the burst with
3694
- * maximum benefit. Combining this and the above case, we can
3695
- * now establish when idling is actually needed to preserve
3696
- * service guarantees.
3697
- */
3698
- idling_needed_for_service_guarantees =
3699
- asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq);
3700
-
3701
- /*
3702
- * We have now all the components we need to compute the
4243
+ * We have now the two components we need to compute the
37034244 * return value of the function, which is true only if idling
37044245 * either boosts the throughput (without issues), or is
37054246 * necessary to preserve service guarantees.
37064247 */
3707
- return idling_boosts_thr_without_issues ||
3708
- idling_needed_for_service_guarantees;
4248
+ return idling_boosts_thr_with_no_issue ||
4249
+ idling_needed_for_service_guar;
37094250 }
37104251
37114252 /*
....@@ -3724,26 +4265,98 @@
37244265 return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_better_to_idle(bfqq);
37254266 }
37264267
3727
-static struct bfq_queue *bfq_choose_bfqq_for_injection(struct bfq_data *bfqd)
4268
+/*
4269
+ * This function chooses the queue from which to pick the next extra
4270
+ * I/O request to inject, if it finds a compatible queue. See the
4271
+ * comments on bfq_update_inject_limit() for details on the injection
4272
+ * mechanism, and for the definitions of the quantities mentioned
4273
+ * below.
4274
+ */
4275
+static struct bfq_queue *
4276
+bfq_choose_bfqq_for_injection(struct bfq_data *bfqd)
37284277 {
3729
- struct bfq_queue *bfqq;
4278
+ struct bfq_queue *bfqq, *in_serv_bfqq = bfqd->in_service_queue;
4279
+ unsigned int limit = in_serv_bfqq->inject_limit;
4280
+ /*
4281
+ * If
4282
+ * - bfqq is not weight-raised and therefore does not carry
4283
+ * time-critical I/O,
4284
+ * or
4285
+ * - regardless of whether bfqq is weight-raised, bfqq has
4286
+ * however a long think time, during which it can absorb the
4287
+ * effect of an appropriate number of extra I/O requests
4288
+ * from other queues (see bfq_update_inject_limit for
4289
+ * details on the computation of this number);
4290
+ * then injection can be performed without restrictions.
4291
+ */
4292
+ bool in_serv_always_inject = in_serv_bfqq->wr_coeff == 1 ||
4293
+ !bfq_bfqq_has_short_ttime(in_serv_bfqq);
37304294
37314295 /*
3732
- * A linear search; but, with a high probability, very few
3733
- * steps are needed to find a candidate queue, i.e., a queue
3734
- * with enough budget left for its next request. In fact:
4296
+ * If
4297
+ * - the baseline total service time could not be sampled yet,
4298
+ * so the inject limit happens to be still 0, and
4299
+ * - a lot of time has elapsed since the plugging of I/O
4300
+ * dispatching started, so drive speed is being wasted
4301
+ * significantly;
4302
+ * then temporarily raise inject limit to one request.
4303
+ */
4304
+ if (limit == 0 && in_serv_bfqq->last_serv_time_ns == 0 &&
4305
+ bfq_bfqq_wait_request(in_serv_bfqq) &&
4306
+ time_is_before_eq_jiffies(bfqd->last_idling_start_jiffies +
4307
+ bfqd->bfq_slice_idle)
4308
+ )
4309
+ limit = 1;
4310
+
4311
+ if (bfqd->rq_in_driver >= limit)
4312
+ return NULL;
4313
+
4314
+ /*
4315
+ * Linear search of the source queue for injection; but, with
4316
+ * a high probability, very few steps are needed to find a
4317
+ * candidate queue, i.e., a queue with enough budget left for
4318
+ * its next request. In fact:
37354319 * - BFQ dynamically updates the budget of every queue so as
37364320 * to accommodate the expected backlog of the queue;
37374321 * - if a queue gets all its requests dispatched as injected
37384322 * service, then the queue is removed from the active list
3739
- * (and re-added only if it gets new requests, but with
3740
- * enough budget for its new backlog).
4323
+ * (and re-added only if it gets new requests, but then it
4324
+ * is assigned again enough budget for its new backlog).
37414325 */
37424326 list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
37434327 if (!RB_EMPTY_ROOT(&bfqq->sort_list) &&
4328
+ (in_serv_always_inject || bfqq->wr_coeff > 1) &&
37444329 bfq_serv_to_charge(bfqq->next_rq, bfqq) <=
3745
- bfq_bfqq_budget_left(bfqq))
3746
- return bfqq;
4330
+ bfq_bfqq_budget_left(bfqq)) {
4331
+ /*
4332
+ * Allow for only one large in-flight request
4333
+ * on non-rotational devices, for the
4334
+ * following reason. On non-rotational drives,
4335
+ * large requests take much longer than
4336
+ * smaller requests to be served. In addition,
4337
+ * the drive prefers to serve large requests
4338
+ * w.r.t. small ones, if it can choose. So,
4339
+ * having more than one large request queued
4340
+ * in the drive may easily make the next first
4341
+ * request of the in-service queue wait so
4342
+ * long as to break bfqq's service guarantees. On
4343
+ * the bright side, large requests let the
4344
+ * drive reach a very high throughput, even if
4345
+ * there is only one in-flight large request
4346
+ * at a time.
4347
+ */
4348
+ if (blk_queue_nonrot(bfqd->queue) &&
4349
+ blk_rq_sectors(bfqq->next_rq) >=
4350
+ BFQQ_SECT_THR_NONROT)
4351
+ limit = min_t(unsigned int, 1, limit);
4352
+ else
4353
+ limit = in_serv_bfqq->inject_limit;
4354
+
4355
+ if (bfqd->rq_in_driver < limit) {
4356
+ bfqd->rqs_injected = true;
4357
+ return bfqq;
4358
+ }
4359
+ }
37474360
37484361 return NULL;
37494362 }
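/*
 * Editor's note: illustrative, userspace-only sketch added for this
 * review, not part of the patch. It isolates how the effective
 * injection limit is derived above: a temporary bump to 1 while the
 * baseline service time is still unsampled and the drive is being
 * starved, and a cap to at most one in-flight large request on
 * non-rotational drives. All toy_* names are hypothetical.
 */
#include <stdbool.h>

struct toy_inject_state {
	unsigned int inject_limit;	/* per-queue limit computed so far */
	bool baseline_sampled;		/* last service time already measured? */
	bool waiting_and_stalled;	/* idling, and idle window long elapsed */
	bool nonrot_large_rq;		/* non-rotational drive, large request */
};

unsigned int toy_effective_limit(const struct toy_inject_state *s)
{
	unsigned int limit = s->inject_limit;

	/*
	 * No baseline yet, but drive speed is being wasted: allow one
	 * injected request so the baseline can eventually be sampled.
	 */
	if (limit == 0 && !s->baseline_sampled && s->waiting_and_stalled)
		limit = 1;

	/*
	 * On flash, keep at most one large request in flight: large
	 * requests take long to serve and tend to be preferred by the
	 * drive, so more of them could starve the in-service queue.
	 */
	if (s->nonrot_large_rq && limit > 1)
		limit = 1;

	return limit;
}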
....@@ -3830,14 +4443,105 @@
38304443 * for a new request, or has requests waiting for a completion and
38314444 * may idle after their completion, then keep it anyway.
38324445 *
3833
- * Yet, to boost throughput, inject service from other queues if
3834
- * possible.
4446
+ * Yet, inject service from other queues if it boosts
4447
+ * throughput and is possible.
38354448 */
38364449 if (bfq_bfqq_wait_request(bfqq) ||
38374450 (bfqq->dispatched != 0 && bfq_better_to_idle(bfqq))) {
3838
- if (bfq_bfqq_injectable(bfqq) &&
3839
- bfqq->injected_service * bfqq->inject_coeff <
3840
- bfqq->entity.service * 10)
4451
+ struct bfq_queue *async_bfqq =
4452
+ bfqq->bic && bfqq->bic->bfqq[0] &&
4453
+ bfq_bfqq_busy(bfqq->bic->bfqq[0]) &&
4454
+ bfqq->bic->bfqq[0]->next_rq ?
4455
+ bfqq->bic->bfqq[0] : NULL;
4456
+
4457
+ /*
4458
+ * The next three mutually-exclusive ifs decide
4459
+ * whether to try injection, and choose the queue to
4460
+ * pick an I/O request from.
4461
+ *
4462
+ * The first if checks whether the process associated
4463
+ * with bfqq has also async I/O pending. If so, it
4464
+ * injects such I/O unconditionally. Injecting async
4465
+ * I/O from the same process can cause no harm to the
4466
+ * process. On the contrary, it can only increase
4467
+ * bandwidth and reduce latency for the process.
4468
+ *
4469
+ * The second if checks whether there happens to be a
4470
+ * non-empty waker queue for bfqq, i.e., a queue whose
4471
+ * I/O needs to be completed for bfqq to receive new
4472
+ * I/O. This happens, e.g., if bfqq is associated with
4473
+ * a process that does some sync. A sync generates
4474
+ * extra blocking I/O, which must be completed before
4475
+ * the process associated with bfqq can go on with its
4476
+ * I/O. If the I/O of the waker queue is not served,
4477
+ * then bfqq remains empty, and no I/O is dispatched,
4478
+ * until the idle timeout fires for bfqq. This is
4479
+ * likely to result in lower bandwidth and higher
4480
+ * latencies for bfqq, and in a severe loss of total
4481
+ * throughput. The best action to take is therefore to
4482
+ * serve the waker queue as soon as possible. So do it
4483
+ * (without relying on the third alternative below for
4484
+ * eventually serving waker_bfqq's I/O; see the last
4485
+ * paragraph for further details). This systematic
4486
+ * injection of I/O from the waker queue does not
4487
+ * cause any delay to bfqq's I/O. On the contrary,
4488
+ * next bfqq's I/O is brought forward dramatically,
4489
+ * for it is not blocked for milliseconds.
4490
+ *
4491
+ * The third if checks whether bfqq is a queue for
4492
+ * which it is better to avoid injection. It is so if
4493
+ * bfqq delivers more throughput when served without
4494
+ * any further I/O from other queues in the middle, or
4495
+ * if the service times of bfqq's I/O requests both
4496
+ * matter more than overall throughput, and may be
4497
+ * easily increased by injection (this happens if bfqq
4498
+ * has a short think time). If none of these
4499
+ * conditions holds, then a candidate queue for
4500
+ * injection is looked for through
4501
+ * bfq_choose_bfqq_for_injection(). Note that the
4502
+ * latter may return NULL (for example if the inject
4503
+ * limit for bfqq is currently 0).
4504
+ *
4505
+ * NOTE: motivation for the second alternative
4506
+ *
4507
+ * Thanks to the way the inject limit is updated in
4508
+ * bfq_update_has_short_ttime(), it is rather likely
4509
+ * that, if I/O is being plugged for bfqq and the
4510
+ * waker queue has pending I/O requests that are
4511
+ * blocking bfqq's I/O, then the third alternative
4512
+ * above lets the waker queue get served before the
4513
+ * I/O-plugging timeout fires. So one may deem the
4514
+ * second alternative superfluous. It is not, because
4515
+ * the third alternative may be way less effective in
4516
+ * case of a synchronization. For two main
4517
+ * reasons. First, throughput may be low because the
4518
+ * inject limit may be too low to guarantee the same
4519
+ * amount of injected I/O, from the waker queue or
4520
+ * other queues, that the second alternative
4521
+ * guarantees (the second alternative unconditionally
4522
+ * injects a pending I/O request of the waker queue
4523
+ * for each bfq_dispatch_request()). Second, with the
4524
+ * third alternative, the duration of the plugging,
4525
+ * i.e., the time before bfqq finally receives new I/O,
4526
+ * may not be minimized, because the waker queue may
4527
+ * happen to be served only after other queues.
4528
+ */
4529
+ if (async_bfqq &&
4530
+ icq_to_bic(async_bfqq->next_rq->elv.icq) == bfqq->bic &&
4531
+ bfq_serv_to_charge(async_bfqq->next_rq, async_bfqq) <=
4532
+ bfq_bfqq_budget_left(async_bfqq))
4533
+ bfqq = bfqq->bic->bfqq[0];
4534
+ else if (bfq_bfqq_has_waker(bfqq) &&
4535
+ bfq_bfqq_busy(bfqq->waker_bfqq) &&
4536
+ bfqq->waker_bfqq->next_rq &&
4537
+ bfq_serv_to_charge(bfqq->waker_bfqq->next_rq,
4538
+ bfqq->waker_bfqq) <=
4539
+ bfq_bfqq_budget_left(bfqq->waker_bfqq)
4540
+ )
4541
+ bfqq = bfqq->waker_bfqq;
4542
+ else if (!idling_boosts_thr_without_issues(bfqd, bfqq) &&
4543
+ (bfqq->wr_coeff == 1 || bfqd->wr_busy_queues > 1 ||
4544
+ !bfq_bfqq_has_short_ttime(bfqq)))
38414545 bfqq = bfq_choose_bfqq_for_injection(bfqd);
38424546 else
38434547 bfqq = NULL;
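/*
 * Editor's note: illustrative, userspace-only sketch added for this
 * review, not part of the patch. It restates the priority among the
 * three injection sources chosen above (budget checks omitted):
 * same-process async I/O first, then the waker queue, then a generic
 * candidate, and the latter only when injection cannot hurt bfqq.
 * All toy_* names are hypothetical.
 */
#include <stdbool.h>
#include <stddef.h>

struct toy_queue;	/* opaque stand-in for struct bfq_queue */

struct toy_injection_choice {
	struct toy_queue *same_process_async;	/* async queue of bfqq's process */
	struct toy_queue *waker;		/* queue whose I/O unblocks bfqq */
	struct toy_queue *generic_candidate;	/* result of the linear search */
	bool injection_would_not_hurt;		/* throughput/latency check */
};

struct toy_queue *toy_pick_injection_source(const struct toy_injection_choice *c)
{
	if (c->same_process_async)	/* can only help the same process */
		return c->same_process_async;
	if (c->waker)			/* unblocks bfqq's own future I/O */
		return c->waker;
	if (c->injection_would_not_hurt)
		return c->generic_candidate;	/* may still be NULL */
	return NULL;
}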
....@@ -3929,15 +4633,15 @@
39294633
39304634 bfq_bfqq_served(bfqq, service_to_charge);
39314635
4636
+ if (bfqq == bfqd->in_service_queue && bfqd->wait_dispatch) {
4637
+ bfqd->wait_dispatch = false;
4638
+ bfqd->waited_rq = rq;
4639
+ }
4640
+
39324641 bfq_dispatch_remove(bfqd->queue, rq);
39334642
3934
- if (bfqq != bfqd->in_service_queue) {
3935
- if (likely(bfqd->in_service_queue))
3936
- bfqd->in_service_queue->injected_service +=
3937
- bfq_serv_to_charge(rq, bfqq);
3938
-
4643
+ if (bfqq != bfqd->in_service_queue)
39394644 goto return_rq;
3940
- }
39414645
39424646 /*
39434647 * If weight raising has to terminate for bfqq, then next
....@@ -3957,7 +4661,7 @@
39574661 * belongs to CLASS_IDLE and other queues are waiting for
39584662 * service.
39594663 */
3960
- if (!(bfqd->busy_queues > 1 && bfq_class_idle(bfqq)))
4664
+ if (!(bfq_tot_busy_queues(bfqd) > 1 && bfq_class_idle(bfqq)))
39614665 goto return_rq;
39624666
39634667 bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED);
....@@ -3975,7 +4679,7 @@
39754679 * most a call to dispatch for nothing
39764680 */
39774681 return !list_empty_careful(&bfqd->dispatch) ||
3978
- bfqd->busy_queues > 0;
4682
+ bfq_tot_busy_queues(bfqd) > 0;
39794683 }
39804684
39814685 static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
....@@ -4029,9 +4733,10 @@
40294733 goto start_rq;
40304734 }
40314735
4032
- bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
4736
+ bfq_log(bfqd, "dispatch requests: %d busy queues",
4737
+ bfq_tot_busy_queues(bfqd));
40334738
4034
- if (bfqd->busy_queues == 0)
4739
+ if (bfq_tot_busy_queues(bfqd) == 0)
40354740 goto exit;
40364741
40374742 /*
....@@ -4043,7 +4748,7 @@
40434748 * some unlucky request wait for as long as the device
40444749 * wishes.
40454750 *
4046
- * Of course, serving one request at at time may cause loss of
4751
+ * Of course, serving one request at a time may cause loss of
40474752 * throughput.
40484753 */
40494754 if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0)
....@@ -4065,7 +4770,7 @@
40654770 return rq;
40664771 }
40674772
4068
-#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
4773
+#ifdef CONFIG_BFQ_CGROUP_DEBUG
40694774 static void bfq_update_dispatch_stats(struct request_queue *q,
40704775 struct request *rq,
40714776 struct bfq_queue *in_serv_queue,
....@@ -4089,7 +4794,7 @@
40894794 * In addition, the following queue lock guarantees that
40904795 * bfqq_group(bfqq) exists as well.
40914796 */
4092
- spin_lock_irq(q->queue_lock);
4797
+ spin_lock_irq(&q->queue_lock);
40934798 if (idle_timer_disabled)
40944799 /*
40954800 * Since the idle timer has been disabled,
....@@ -4108,21 +4813,21 @@
41084813 bfqg_stats_set_start_empty_time(bfqg);
41094814 bfqg_stats_update_io_remove(bfqg, rq->cmd_flags);
41104815 }
4111
- spin_unlock_irq(q->queue_lock);
4816
+ spin_unlock_irq(&q->queue_lock);
41124817 }
41134818 #else
41144819 static inline void bfq_update_dispatch_stats(struct request_queue *q,
41154820 struct request *rq,
41164821 struct bfq_queue *in_serv_queue,
41174822 bool idle_timer_disabled) {}
4118
-#endif
4823
+#endif /* CONFIG_BFQ_CGROUP_DEBUG */
41194824
41204825 static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
41214826 {
41224827 struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
41234828 struct request *rq;
41244829 struct bfq_queue *in_serv_queue;
4125
- bool waiting_rq, idle_timer_disabled;
4830
+ bool waiting_rq, idle_timer_disabled = false;
41264831
41274832 spin_lock_irq(&bfqd->lock);
41284833
....@@ -4130,14 +4835,15 @@
41304835 waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue);
41314836
41324837 rq = __bfq_dispatch_request(hctx);
4133
-
4134
- idle_timer_disabled =
4135
- waiting_rq && !bfq_bfqq_wait_request(in_serv_queue);
4838
+ if (in_serv_queue == bfqd->in_service_queue) {
4839
+ idle_timer_disabled =
4840
+ waiting_rq && !bfq_bfqq_wait_request(in_serv_queue);
4841
+ }
41364842
41374843 spin_unlock_irq(&bfqd->lock);
4138
-
4139
- bfq_update_dispatch_stats(hctx->queue, rq, in_serv_queue,
4140
- idle_timer_disabled);
4844
+ bfq_update_dispatch_stats(hctx->queue, rq,
4845
+ idle_timer_disabled ? in_serv_queue : NULL,
4846
+ idle_timer_disabled);
41414847
41424848 return rq;
41434849 }
....@@ -4151,9 +4857,9 @@
41514857 */
41524858 void bfq_put_queue(struct bfq_queue *bfqq)
41534859 {
4154
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
4860
+ struct bfq_queue *item;
4861
+ struct hlist_node *n;
41554862 struct bfq_group *bfqg = bfqq_group(bfqq);
4156
-#endif
41574863
41584864 if (bfqq->bfqd)
41594865 bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d",
....@@ -4195,13 +4901,41 @@
41954901 bfqq->bfqd->burst_size--;
41964902 }
41974903
4904
+ /*
4905
+ * bfqq does not exist any longer, so it cannot be woken by
4906
+ * any other queue, and cannot wake any other queue. Then bfqq
4907
+ * must be removed from the woken list of its possible waker
4908
+ * queue, and all queues in the woken list of bfqq must stop
4909
+ * having a waker queue. Strictly speaking, these updates
4910
+ * should be performed when bfqq remains with no I/O source
4911
+ * attached to it, which happens before bfqq gets freed. In
4912
+ * particular, this happens when the last process associated
4913
+ * with bfqq exits or gets associated with a different
4914
+ * queue. However, both events lead to bfqq being freed soon,
4915
+ * and dangling references would come out only after bfqq gets
4916
+ * freed. So these updates are done here, as a simple and safe
4917
+ * way to handle all cases.
4918
+ */
4919
+ /* remove bfqq from woken list */
4920
+ if (!hlist_unhashed(&bfqq->woken_list_node))
4921
+ hlist_del_init(&bfqq->woken_list_node);
4922
+
4923
+ /* reset waker for all queues in woken list */
4924
+ hlist_for_each_entry_safe(item, n, &bfqq->woken_list,
4925
+ woken_list_node) {
4926
+ item->waker_bfqq = NULL;
4927
+ bfq_clear_bfqq_has_waker(item);
4928
+ hlist_del_init(&item->woken_list_node);
4929
+ }
4930
+
4931
+ if (bfqq->bfqd && bfqq->bfqd->last_completed_rq_bfqq == bfqq)
4932
+ bfqq->bfqd->last_completed_rq_bfqq = NULL;
4933
+
41984934 kmem_cache_free(bfq_pool, bfqq);
4199
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
42004935 bfqg_and_blkg_put(bfqg);
4201
-#endif
42024936 }
42034937
4204
-static void bfq_put_cooperator(struct bfq_queue *bfqq)
4938
+void bfq_put_cooperator(struct bfq_queue *bfqq)
42054939 {
42064940 struct bfq_queue *__bfqq, *next;
42074941
....@@ -4223,7 +4957,7 @@
42234957 static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
42244958 {
42254959 if (bfqq == bfqd->in_service_queue) {
4226
- __bfq_bfqq_expire(bfqd, bfqq);
4960
+ __bfq_bfqq_expire(bfqd, bfqq, BFQQE_BUDGET_TIMEOUT);
42274961 bfq_schedule_dispatch(bfqd);
42284962 }
42294963
....@@ -4231,7 +4965,7 @@
42314965
42324966 bfq_put_cooperator(bfqq);
42334967
4234
- bfq_put_queue(bfqq); /* release process reference */
4968
+ bfq_release_process_ref(bfqd, bfqq);
42354969 }
42364970
42374971 static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync)
....@@ -4246,9 +4980,8 @@
42464980 unsigned long flags;
42474981
42484982 spin_lock_irqsave(&bfqd->lock, flags);
4249
- bfqq->bic = NULL;
4250
- bfq_exit_bfqq(bfqd, bfqq);
42514983 bic_set_bfqq(bic, NULL, is_sync);
4984
+ bfq_exit_bfqq(bfqd, bfqq);
42524985 spin_unlock_irqrestore(&bfqd->lock, flags);
42534986 }
42544987 }
....@@ -4281,7 +5014,7 @@
42815014 pr_err("bdi %s: bfq: bad prio class %d\n",
42825015 bdi_dev_name(bfqq->bfqd->queue->backing_dev_info),
42835016 ioprio_class);
4284
- /* fall through */
5017
+ fallthrough;
42855018 case IOPRIO_CLASS_NONE:
42865019 /*
42875020 * No prio set, inherit CPU scheduling settings.
....@@ -4334,10 +5067,11 @@
43345067
43355068 bfqq = bic_to_bfqq(bic, false);
43365069 if (bfqq) {
4337
- /* release process reference on this queue */
4338
- bfq_put_queue(bfqq);
4339
- bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic);
5070
+ struct bfq_queue *old_bfqq = bfqq;
5071
+
5072
+ bfqq = bfq_get_queue(bfqd, bio, false, bic);
43405073 bic_set_bfqq(bic, bfqq, false);
5074
+ bfq_release_process_ref(bfqd, old_bfqq);
43415075 }
43425076
43435077 bfqq = bic_to_bfqq(bic, true);
....@@ -4351,6 +5085,8 @@
43515085 RB_CLEAR_NODE(&bfqq->entity.rb_node);
43525086 INIT_LIST_HEAD(&bfqq->fifo);
43535087 INIT_HLIST_NODE(&bfqq->burst_list_node);
5088
+ INIT_HLIST_NODE(&bfqq->woken_list_node);
5089
+ INIT_HLIST_HEAD(&bfqq->woken_list);
43545090
43555091 bfqq->ref = 0;
43565092 bfqq->bfqd = bfqd;
....@@ -4369,13 +5105,6 @@
43695105 bfq_mark_bfqq_has_short_ttime(bfqq);
43705106 bfq_mark_bfqq_sync(bfqq);
43715107 bfq_mark_bfqq_just_created(bfqq);
4372
- /*
4373
- * Aggressively inject a lot of service: up to 90%.
4374
- * This coefficient remains constant during bfqq life,
4375
- * but this behavior might be changed, after enough
4376
- * testing and tuning.
4377
- */
4378
- bfqq->inject_coeff = 1;
43795108 } else
43805109 bfq_clear_bfqq_sync(bfqq);
43815110
....@@ -4419,7 +5148,7 @@
44195148 return &bfqg->async_bfqq[0][ioprio];
44205149 case IOPRIO_CLASS_NONE:
44215150 ioprio = IOPRIO_NORM;
4422
- /* fall through */
5151
+ fallthrough;
44235152 case IOPRIO_CLASS_BE:
44245153 return &bfqg->async_bfqq[1][ioprio];
44255154 case IOPRIO_CLASS_IDLE:
....@@ -4439,14 +5168,7 @@
44395168 struct bfq_queue *bfqq;
44405169 struct bfq_group *bfqg;
44415170
4442
- rcu_read_lock();
4443
-
4444
- bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio));
4445
- if (!bfqg) {
4446
- bfqq = &bfqd->oom_bfqq;
4447
- goto out;
4448
- }
4449
-
5171
+ bfqg = bfq_bio_bfqg(bfqd, bio);
44505172 if (!is_sync) {
44515173 async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
44525174 ioprio);
....@@ -4490,7 +5212,6 @@
44905212 out:
44915213 bfqq->ref++; /* get a process reference to this queue */
44925214 bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref);
4493
- rcu_read_unlock();
44945215 return bfqq;
44955216 }
44965217
....@@ -4513,17 +5234,19 @@
45135234 struct request *rq)
45145235 {
45155236 bfqq->seek_history <<= 1;
4516
- bfqq->seek_history |=
4517
- get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR &&
4518
- (!blk_queue_nonrot(bfqd->queue) ||
4519
- blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT);
5237
+ bfqq->seek_history |= BFQ_RQ_SEEKY(bfqd, bfqq->last_request_pos, rq);
5238
+
5239
+ if (bfqq->wr_coeff > 1 &&
5240
+ bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time &&
5241
+ BFQQ_TOTALLY_SEEKY(bfqq))
5242
+ bfq_bfqq_end_wr(bfqq);
45205243 }
45215244
45225245 static void bfq_update_has_short_ttime(struct bfq_data *bfqd,
45235246 struct bfq_queue *bfqq,
45245247 struct bfq_io_cq *bic)
45255248 {
4526
- bool has_short_ttime = true;
5249
+ bool has_short_ttime = true, state_changed;
45275250
45285251 /*
45295252 * No need to update has_short_ttime if bfqq is async or in
....@@ -4548,13 +5271,102 @@
45485271 bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle))
45495272 has_short_ttime = false;
45505273
4551
- bfq_log_bfqq(bfqd, bfqq, "update_has_short_ttime: has_short_ttime %d",
4552
- has_short_ttime);
5274
+ state_changed = has_short_ttime != bfq_bfqq_has_short_ttime(bfqq);
45535275
45545276 if (has_short_ttime)
45555277 bfq_mark_bfqq_has_short_ttime(bfqq);
45565278 else
45575279 bfq_clear_bfqq_has_short_ttime(bfqq);
5280
+
5281
+ /*
5282
+ * Until the base value for the total service time gets
5283
+ * finally computed for bfqq, the inject limit does depend on
5284
+ * the think-time state (short|long). In particular, the limit
5285
+ * is 0 or 1 if the think time is deemed, respectively, as
5286
+ * short or long (details in the comments in
5287
+ * bfq_update_inject_limit()). Accordingly, the next
5288
+ * instructions reset the inject limit if the think-time state
5289
+ * has changed and the above base value is still to be
5290
+ * computed.
5291
+ *
5292
+ * However, the reset is performed only if more than 100 ms
5293
+ * have elapsed since the last update of the inject limit, or
5294
+ * (inclusive) if the change is from short to long think
5295
+ * time. The reason for this waiting is as follows.
5296
+ *
5297
+ * bfqq may have a long think time because of a
5298
+ * synchronization with some other queue, i.e., because the
5299
+ * I/O of some other queue may need to be completed for bfqq
5300
+ * to receive new I/O. Details in the comments on the choice
5301
+ * of the queue for injection in bfq_select_queue().
5302
+ *
5303
+ * As stressed in those comments, if such a synchronization is
5304
+ * actually in place, then, without injection on bfqq, the
5305
+ * blocking I/O cannot happen to served while bfqq is in
5306
+ * service. As a consequence, if bfqq is granted
5307
+ * I/O-dispatch-plugging, then bfqq remains empty, and no I/O
5308
+ * is dispatched, until the idle timeout fires. This is likely
5309
+ * to result in lower bandwidth and higher latencies for bfqq,
5310
+ * and in a severe loss of total throughput.
5311
+ *
5312
+ * On the opposite end, a non-zero inject limit may allow the
5313
+ * I/O that blocks bfqq to be executed soon, and therefore
5314
+ * bfqq to receive new I/O soon.
5315
+ *
5316
+ * But, if the blocking gets actually eliminated, then the
5317
+ * next think-time sample for bfqq may be very low. This in
5318
+ * turn may cause bfqq's think time to be deemed
5319
+ * short. Without the 100 ms barrier, this new state change
5320
+ * would cause the body of the next if to be executed
5321
+ * immediately. But this would set to 0 the inject
5322
+ * limit. Without injection, the blocking I/O would cause the
5323
+ * think time of bfqq to become long again, and therefore the
5324
+ * inject limit to be raised again, and so on. The only effect
5325
+ * of such a steady oscillation between the two think-time
5326
+ * states would be to prevent effective injection on bfqq.
5327
+ *
5328
+ * In contrast, if the inject limit is not reset during such a
5329
+ * long time interval as 100 ms, then the number of short
5330
+ * think time samples can grow significantly before the reset
5331
+ * is performed. As a consequence, the think time state can
5332
+ * become stable before the reset. Therefore there will be no
5333
+ * state change when the 100 ms elapse, and no reset of the
5334
+ * inject limit. The inject limit remains steadily equal to 1
5335
+ * both during and after the 100 ms. So injection can be
5336
+ * performed at all times, and throughput gets boosted.
5337
+ *
5338
+ * An inject limit equal to 1 is however in conflict, in
5339
+ * general, with the fact that the think time of bfqq is
5340
+ * short, because injection may be likely to delay bfqq's I/O
5341
+ * (as explained in the comments in
5342
+ * bfq_update_inject_limit()). But this does not happen in
5343
+ * this special case, because bfqq's low think time is due to
5344
+ * an effective handling of a synchronization, through
5345
+ * injection. In this special case, bfqq's I/O does not get
5346
+ * delayed by injection; on the contrary, bfqq's I/O is
5347
+ * brought forward, because it is not blocked for
5348
+ * milliseconds.
5349
+ *
5350
+ * In addition, serving the blocking I/O much sooner, and much
5351
+ * more frequently than once per I/O-plugging timeout, makes
5352
+ * it much quicker to detect a waker queue (the concept of
5353
+ * waker queue is defined in the comments in
5354
+ * bfq_add_request()). This makes it possible to start sooner
5355
+ * to boost throughput more effectively, by injecting the I/O
5356
+ * of the waker queue unconditionally on every
5357
+ * bfq_dispatch_request().
5358
+ *
5359
+ * One last, important benefit of not resetting the inject
5360
+ * limit before 100 ms is that, during this time interval, the
5361
+ * base value for the total service time is likely to get
5362
+ * finally computed for bfqq, freeing the inject limit from
5363
+ * its relation with the think time.
5364
+ */
5365
+ if (state_changed && bfqq->last_serv_time_ns == 0 &&
5366
+ (time_is_before_eq_jiffies(bfqq->decrease_time_jif +
5367
+ msecs_to_jiffies(100)) ||
5368
+ !has_short_ttime))
5369
+ bfq_reset_inject_limit(bfqd, bfqq);
45585370 }
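/*
 * Editor's note: illustrative, userspace-only sketch added for this
 * review, not part of the patch. It restates the reset rule above over
 * plain integers: reset the inject limit only while the baseline
 * service time is still unknown and only on a real think-time state
 * change; a switch back to "short" is further rate-limited to once per
 * 100 ms, while a switch to "long" always resets. All toy_* names are
 * hypothetical.
 */
#include <stdbool.h>
#include <stdint.h>

struct toy_ttime_state {
	bool baseline_sampled;		/* base service time already measured? */
	uint64_t last_limit_update_ms;	/* time of last inject-limit update */
};

bool toy_should_reset_limit(const struct toy_ttime_state *s,
			    bool state_changed, bool has_short_ttime,
			    uint64_t now_ms)
{
	if (!state_changed || s->baseline_sampled)
		return false;

	/* now deemed long: reset immediately, injection may unblock bfqq */
	if (!has_short_ttime)
		return true;

	/* now deemed short: honour the 100 ms barrier to avoid oscillation */
	return now_ms - s->last_limit_update_ms >= 100;
}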
45595371
45605372 /*
....@@ -4564,18 +5376,8 @@
45645376 static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
45655377 struct request *rq)
45665378 {
4567
- struct bfq_io_cq *bic = RQ_BIC(rq);
4568
-
45695379 if (rq->cmd_flags & REQ_META)
45705380 bfqq->meta_pending++;
4571
-
4572
- bfq_update_io_thinktime(bfqd, bfqq);
4573
- bfq_update_has_short_ttime(bfqd, bfqq, bic);
4574
- bfq_update_io_seektime(bfqd, bfqq, rq);
4575
-
4576
- bfq_log_bfqq(bfqd, bfqq,
4577
- "rq_enqueued: has_short_ttime=%d (seeky %d)",
4578
- bfq_bfqq_has_short_ttime(bfqq), BFQQ_SEEKY(bfqq));
45795381
45805382 bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
45815383
....@@ -4585,28 +5387,31 @@
45855387 bool budget_timeout = bfq_bfqq_budget_timeout(bfqq);
45865388
45875389 /*
4588
- * There is just this request queued: if the request
4589
- * is small and the queue is not to be expired, then
4590
- * just exit.
5390
+ * There is just this request queued: if
5391
+ * - the request is small, and
5392
+ * - we are idling to boost throughput, and
5393
+ * - the queue is not to be expired,
5394
+ * then just exit.
45915395 *
45925396 * In this way, if the device is being idled to wait
45935397 * for a new request from the in-service queue, we
45945398 * avoid unplugging the device and committing the
4595
- * device to serve just a small request. On the
4596
- * contrary, we wait for the block layer to decide
4597
- * when to unplug the device: hopefully, new requests
4598
- * will be merged to this one quickly, then the device
4599
- * will be unplugged and larger requests will be
4600
- * dispatched.
5399
+ * device to serve just a small request. In contrast
5400
+ * we wait for the block layer to decide when to
5401
+ * unplug the device: hopefully, new requests will be
5402
+ * merged to this one quickly, then the device will be
5403
+ * unplugged and larger requests will be dispatched.
46015404 */
4602
- if (small_req && !budget_timeout)
5405
+ if (small_req && idling_boosts_thr_without_issues(bfqd, bfqq) &&
5406
+ !budget_timeout)
46035407 return;
46045408
46055409 /*
4606
- * A large enough request arrived, or the queue is to
4607
- * be expired: in both cases disk idling is to be
4608
- * stopped, so clear wait_request flag and reset
4609
- * timer.
5410
+ * A large enough request arrived, or idling is being
5411
+ * performed to preserve service guarantees, or
5412
+ * finally the queue is to be expired: in all these
5413
+ * cases disk idling is to be stopped, so clear
5414
+ * wait_request flag and reset timer.
46105415 */
46115416 bfq_clear_bfqq_wait_request(bfqq);
46125417 hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
....@@ -4632,8 +5437,6 @@
46325437 bool waiting, idle_timer_disabled = false;
46335438
46345439 if (new_bfqq) {
4635
- if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
4636
- new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
46375440 /*
46385441 * Release the request's reference to the old bfqq
46395442 * and make sure one is taken to the shared queue.
....@@ -4663,6 +5466,10 @@
46635466 bfqq = new_bfqq;
46645467 }
46655468
5469
+ bfq_update_io_thinktime(bfqd, bfqq);
5470
+ bfq_update_has_short_ttime(bfqd, bfqq, RQ_BIC(rq));
5471
+ bfq_update_io_seektime(bfqd, bfqq, rq);
5472
+
46665473 waiting = bfqq && bfq_bfqq_wait_request(bfqq);
46675474 bfq_add_request(rq);
46685475 idle_timer_disabled = waiting && !bfq_bfqq_wait_request(bfqq);
....@@ -4675,7 +5482,7 @@
46755482 return idle_timer_disabled;
46765483 }
46775484
4678
-#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
5485
+#ifdef CONFIG_BFQ_CGROUP_DEBUG
46795486 static void bfq_update_insert_stats(struct request_queue *q,
46805487 struct bfq_queue *bfqq,
46815488 bool idle_timer_disabled,
....@@ -4694,18 +5501,20 @@
46945501 * In addition, the following queue lock guarantees that
46955502 * bfqq_group(bfqq) exists as well.
46965503 */
4697
- spin_lock_irq(q->queue_lock);
5504
+ spin_lock_irq(&q->queue_lock);
46985505 bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags);
46995506 if (idle_timer_disabled)
47005507 bfqg_stats_update_idle_time(bfqq_group(bfqq));
4701
- spin_unlock_irq(q->queue_lock);
5508
+ spin_unlock_irq(&q->queue_lock);
47025509 }
47035510 #else
47045511 static inline void bfq_update_insert_stats(struct request_queue *q,
47055512 struct bfq_queue *bfqq,
47065513 bool idle_timer_disabled,
47075514 unsigned int cmd_flags) {}
4708
-#endif
5515
+#endif /* CONFIG_BFQ_CGROUP_DEBUG */
5516
+
5517
+static struct bfq_queue *bfq_init_rq(struct request *rq);
47095518
47105519 static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
47115520 bool at_head)
....@@ -4716,18 +5525,19 @@
47165525 bool idle_timer_disabled = false;
47175526 unsigned int cmd_flags;
47185527
5528
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
5529
+ if (!cgroup_subsys_on_dfl(io_cgrp_subsys) && rq->bio)
5530
+ bfqg_stats_update_legacy_io(q, rq);
5531
+#endif
47195532 spin_lock_irq(&bfqd->lock);
5533
+ bfqq = bfq_init_rq(rq);
47205534 if (blk_mq_sched_try_insert_merge(q, rq)) {
47215535 spin_unlock_irq(&bfqd->lock);
47225536 return;
47235537 }
47245538
4725
- spin_unlock_irq(&bfqd->lock);
4726
-
47275539 blk_mq_sched_request_inserted(rq);
47285540
4729
- spin_lock_irq(&bfqd->lock);
4730
- bfqq = bfq_init_rq(rq);
47315541 if (!bfqq || at_head || blk_rq_is_passthrough(rq)) {
47325542 if (at_head)
47335543 list_add(&rq->queuelist, &bfqd->dispatch);
....@@ -4776,6 +5586,8 @@
47765586
47775587 static void bfq_update_hw_tag(struct bfq_data *bfqd)
47785588 {
5589
+ struct bfq_queue *bfqq = bfqd->in_service_queue;
5590
+
47795591 bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver,
47805592 bfqd->rq_in_driver);
47815593
....@@ -4788,7 +5600,18 @@
47885600 * sum is not exact, as it's not taking into account deactivated
47895601 * requests.
47905602 */
4791
- if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
5603
+ if (bfqd->rq_in_driver + bfqd->queued <= BFQ_HW_QUEUE_THRESHOLD)
5604
+ return;
5605
+
5606
+ /*
5607
+ * If the active queue doesn't have enough requests and can idle, bfq
5608
+ * might not dispatch enough requests to the hardware. Don't zero
5609
+ * hw_tag in this case.
5610
+ */
5611
+ if (bfqq && bfq_bfqq_has_short_ttime(bfqq) &&
5612
+ bfqq->dispatched + bfqq->queued[0] + bfqq->queued[1] <
5613
+ BFQ_HW_QUEUE_THRESHOLD &&
5614
+ bfqd->rq_in_driver < BFQ_HW_QUEUE_THRESHOLD)
47925615 return;
47935616
47945617 if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
....@@ -4797,6 +5620,9 @@
47975620 bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
47985621 bfqd->max_rq_in_driver = 0;
47995622 bfqd->hw_tag_samples = 0;
5623
+
5624
+ bfqd->nonrot_with_queueing =
5625
+ blk_queue_nonrot(bfqd->queue) && bfqd->hw_tag;
48005626 }
48015627
48025628 static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
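
The sampling scheme used by bfq_update_hw_tag() can be condensed into a short user-space C model. The threshold and sample count mirror the BFQ_HW_QUEUE_* macros defined earlier in the file; the struct and function names are hypothetical, and the new exception for an in-service queue with too few requests is left out for brevity.

#include <stdio.h>

#define HW_QUEUE_THRESHOLD 3	/* parallel-request threshold  */
#define HW_QUEUE_SAMPLES  32	/* samples per decision window */

/* Hypothetical per-device sampling state. */
struct hw_tag_state {
	int max_rq_in_driver;
	int samples;
	int hw_tag;	/* -1 = unknown, 0 = no internal queueing, 1 = queueing */
};

static void hw_tag_sample(struct hw_tag_state *s, int rq_in_driver, int queued)
{
	if (s->max_rq_in_driver < rq_in_driver)
		s->max_rq_in_driver = rq_in_driver;

	/* Too few requests around to infer anything about queueing. */
	if (rq_in_driver + queued <= HW_QUEUE_THRESHOLD)
		return;

	if (++s->samples < HW_QUEUE_SAMPLES)
		return;

	/* End of the window: decide, then restart sampling.
	 * (The real code also recomputes nonrot_with_queueing here.) */
	s->hw_tag = s->max_rq_in_driver > HW_QUEUE_THRESHOLD;
	s->max_rq_in_driver = 0;
	s->samples = 0;
}

int main(void)
{
	struct hw_tag_state s = { .hw_tag = -1 };

	for (int i = 0; i < 64; i++)
		hw_tag_sample(&s, 6, 2);	/* device clearly queueing */
	printf("hw_tag = %d\n", s.hw_tag);	/* prints 1 */
	return 0;
}
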
....@@ -4852,6 +5678,7 @@
48525678 1UL<<(BFQ_RATE_SHIFT - 10))
48535679 bfq_update_rate_reset(bfqd, NULL);
48545680 bfqd->last_completion = now_ns;
5681
+ bfqd->last_completed_rq_bfqq = bfqq;
48555682
48565683 /*
48575684 * If we are waiting to discover whether the request pattern
....@@ -4859,11 +5686,14 @@
48595686 * isochronous, and both requisites for this condition to hold
48605687 * are now satisfied, then compute soft_rt_next_start (see the
48615688 * comments on the function bfq_bfqq_softrt_next_start()). We
4862
- * schedule this delayed check when bfqq expires, if it still
4863
- * has in-flight requests.
5689
+ * do not compute soft_rt_next_start if bfqq is in interactive
5690
+ * weight raising (see the comments in bfq_bfqq_expire() for
5691
+ * an explanation). We schedule this delayed update when bfqq
5692
+ * expires, if it still has in-flight requests.
48645693 */
48655694 if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 &&
4866
- RB_EMPTY_ROOT(&bfqq->sort_list))
5695
+ RB_EMPTY_ROOT(&bfqq->sort_list) &&
5696
+ bfqq->wr_coeff != bfqd->bfq_wr_coeff)
48675697 bfqq->soft_rt_next_start =
48685698 bfq_bfqq_softrt_next_start(bfqd, bfqq);
48695699
....@@ -4921,6 +5751,167 @@
49215751 }
49225752
49235753 /*
5754
+ * The processes associated with bfqq may happen to generate their
5755
+ * cumulative I/O at a lower rate than the rate at which the device
5756
+ * could serve the same I/O. This is rather probable, e.g., if only
5757
+ * one process is associated with bfqq and the device is an SSD. It
5758
+ * results in bfqq becoming often empty while in service. In this
5759
+ * respect, if BFQ is allowed to switch to another queue when bfqq
5760
+ * remains empty, then the device goes on being fed with I/O requests,
5761
+ * and the throughput is not affected. In contrast, if BFQ is not
5762
+ * allowed to switch to another queue---because bfqq is sync and
5763
+ * I/O-dispatch needs to be plugged while bfqq is temporarily
5764
+ * empty---then, during the service of bfqq, there will be frequent
5765
+ * "service holes", i.e., time intervals during which bfqq gets empty
5766
+ * and the device can only consume the I/O already queued in its
5767
+ * hardware queues. During service holes, the device may even end up
5768
+ * remaining idle. In the end, during the service of bfqq, the device
5769
+ * is driven at a lower speed than the one it can reach with the kind
5770
+ * of I/O flowing through bfqq.
5771
+ *
5772
+ * To counter this loss of throughput, BFQ implements a "request
5773
+ * injection mechanism", which tries to fill the above service holes
5774
+ * with I/O requests taken from other queues. The hard part in this
5775
+ * mechanism is finding the right amount of I/O to inject, so as to
5776
+ * both boost throughput and not break bfqq's bandwidth and latency
5777
+ * guarantees. In this respect, the mechanism maintains a per-queue
5778
+ * inject limit, computed as below. While bfqq is empty, the injection
5779
+ * mechanism dispatches extra I/O requests only until the total number
5780
+ * of I/O requests in flight---i.e., already dispatched but not yet
5781
+ * completed---remains lower than this limit.
5782
+ *
5783
+ * A first definition comes in handy to introduce the algorithm by
5784
+ * which the inject limit is computed. We define as first request for
5785
+ * bfqq, an I/O request for bfqq that arrives while bfqq is in
5786
+ * service, and causes bfqq to switch from empty to non-empty. The
5787
+ * algorithm updates the limit as a function of the effect of
5788
+ * injection on the service times of only the first requests of
5789
+ * bfqq. The reason for this restriction is that these are the
5790
+ * requests whose service time is affected most, because they are the
5791
+ * first to arrive after injection possibly occurred.
5792
+ *
5793
+ * To evaluate the effect of injection, the algorithm measures the
5794
+ * "total service time" of first requests. We define as total service
5795
+ * time of an I/O request, the time that elapses since when the
5796
+ * request is enqueued into bfqq, to when it is completed. This
5797
+ * quantity allows the whole effect of injection to be measured. It is
5798
+ * easy to see why. Suppose that some requests of other queues are
5799
+ * actually injected while bfqq is empty, and that a new request R
5800
+ * then arrives for bfqq. If the device does start to serve all or
5801
+ * part of the injected requests during the service hole, then,
5802
+ * because of this extra service, it may delay the next invocation of
5803
+ * the dispatch hook of BFQ. Then, even after R gets eventually
5804
+ * dispatched, the device may delay the actual service of R if it is
5805
+ * still busy serving the extra requests, or if it decides to serve,
5806
+ * before R, some extra request still present in its queues. As a
5807
+ * conclusion, the cumulative extra delay caused by injection can be
5808
+ * easily evaluated by just comparing the total service time of first
5809
+ * requests with and without injection.
5810
+ *
5811
+ * The limit-update algorithm works as follows. On the arrival of a
5812
+ * first request of bfqq, the algorithm measures the total time of the
5813
+ * request only if one of the three cases below holds, and, for each
5814
+ * case, it updates the limit as described below:
5815
+ *
5816
+ * (1) If there is no in-flight request. This gives a baseline for the
5817
+ * total service time of the requests of bfqq. If the baseline has
5818
+ * not been computed yet, then, after computing it, the limit is
5819
+ * set to 1, to start boosting throughput, and to prepare the
5820
+ * ground for the next case. If the baseline has already been
5821
+ * computed, then it is updated, in case it turns out to be lower
5822
+ * than the previous value.
5823
+ *
5824
+ * (2) If the limit is higher than 0 and there are in-flight
5825
+ * requests. By comparing the total service time in this case with
5826
+ * the above baseline, it is possible to know at which extent the
5827
+ * current value of the limit is inflating the total service
5828
+ * time. If the inflation is below a certain threshold, then bfqq
5829
+ * is assumed to be suffering from no perceivable loss of its
5830
+ * service guarantees, and the limit is even tentatively
5831
+ * increased. If the inflation is above the threshold, then the
5832
+ * limit is decreased. Due to the lack of any hysteresis, this
5833
+ * logic makes the limit oscillate even in steady workload
5834
+ * conditions. Yet we opted for it, because it is fast in reaching
5835
+ * the best value for the limit, as a function of the current I/O
5836
+ * workload. To reduce oscillations, this step is disabled for a
5837
+ * short time interval after the limit happens to be decreased.
5838
+ *
5839
+ * (3) Periodically, after resetting the limit, to make sure that the
5840
+ * limit eventually drops in case the workload changes. This is
5841
+ * needed because, after the limit has gone safely up for a
5842
+ * certain workload, it is impossible to guess whether the
5843
+ * baseline total service time may have changed, without measuring
5844
+ * it again without injection. A more effective version of this
5845
+ * step might be to just sample the baseline, by interrupting
5846
+ * injection only once, and then to reset/lower the limit only if
5847
+ * the total service time with the current limit does happen to be
5848
+ * too large.
5849
+ *
5850
+ * More details on each step are provided in the comments on the
5851
+ * pieces of code that implement these steps: the branch handling the
5852
+ * transition from empty to non-empty in bfq_add_request(), the branch
5853
+ * handling injection in bfq_select_queue(), and the function
5854
+ * bfq_choose_bfqq_for_injection(). These comments also explain some
5855
+ * exceptions, made by the injection mechanism in some special cases.
5856
+ */
5857
+static void bfq_update_inject_limit(struct bfq_data *bfqd,
5858
+ struct bfq_queue *bfqq)
5859
+{
5860
+ u64 tot_time_ns = ktime_get_ns() - bfqd->last_empty_occupied_ns;
5861
+ unsigned int old_limit = bfqq->inject_limit;
5862
+
5863
+ if (bfqq->last_serv_time_ns > 0 && bfqd->rqs_injected) {
5864
+ u64 threshold = (bfqq->last_serv_time_ns * 3)>>1;
5865
+
5866
+ if (tot_time_ns >= threshold && old_limit > 0) {
5867
+ bfqq->inject_limit--;
5868
+ bfqq->decrease_time_jif = jiffies;
5869
+ } else if (tot_time_ns < threshold &&
5870
+ old_limit <= bfqd->max_rq_in_driver)
5871
+ bfqq->inject_limit++;
5872
+ }
5873
+
5874
+ /*
5875
+ * Either we still have to compute the base value for the
5876
+ * total service time, and there seem to be the right
5877
+ * conditions to do it, or we can lower the last base value
5878
+ * computed.
5879
+ *
5880
+ * NOTE: (bfqd->rq_in_driver == 1) means that there is no I/O
5881
+ * request in flight, because this function is in the code
5882
+ * path that handles the completion of a request of bfqq, and,
5883
+ * in particular, this function is executed before
5884
+ * bfqd->rq_in_driver is decremented in such a code path.
5885
+ */
5886
+ if ((bfqq->last_serv_time_ns == 0 && bfqd->rq_in_driver == 1) ||
5887
+ tot_time_ns < bfqq->last_serv_time_ns) {
5888
+ if (bfqq->last_serv_time_ns == 0) {
5889
+ /*
5890
+ * Now we certainly have a base value: make sure we
5891
+ * start trying injection.
5892
+ */
5893
+ bfqq->inject_limit = max_t(unsigned int, 1, old_limit);
5894
+ }
5895
+ bfqq->last_serv_time_ns = tot_time_ns;
5896
+ } else if (!bfqd->rqs_injected && bfqd->rq_in_driver == 1)
5897
+ /*
5898
+ * No I/O injected and no request still in service in
5899
+ * the drive: these are the exact conditions for
5900
+ * computing the base value of the total service time
5901
+ * for bfqq. So let's update this value, because it is
5902
+ * rather variable. For example, it varies if the size
5903
+ * or the spatial locality of the I/O requests in bfqq
5904
+ * change.
5905
+ */
5906
+ bfqq->last_serv_time_ns = tot_time_ns;
5907
+
5908
+
5909
+ /* update complete, not waiting for any request completion any longer */
5910
+ bfqd->waited_rq = NULL;
5911
+ bfqd->rqs_injected = false;
5912
+}
5913
+
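
For reference, the decision logic of bfq_update_inject_limit() condenses into the following user-space C model, using the same 3/2 inflation threshold as the code above. The struct and function names are hypothetical; the measured total service time is passed in directly instead of being derived from ktime and last_empty_occupied_ns, and the decrease_time_jif bookkeeping is omitted.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical per-queue state, mirroring the fields used above. */
struct inject_state {
	uint64_t last_serv_time_ns;	/* baseline; 0 = not known yet */
	unsigned int inject_limit;
	unsigned int max_rq_in_driver;
};

/* tot_time_ns: measured total service time of a "first request".
 * rqs_injected: whether extra I/O was injected while waiting for it.
 * rq_in_driver: in-flight requests at completion time (>= 1).        */
static void update_inject_limit(struct inject_state *s, uint64_t tot_time_ns,
				bool rqs_injected, int rq_in_driver)
{
	unsigned int old_limit = s->inject_limit;

	if (s->last_serv_time_ns > 0 && rqs_injected) {
		uint64_t threshold = (s->last_serv_time_ns * 3) >> 1;

		if (tot_time_ns >= threshold && old_limit > 0)
			s->inject_limit--;	/* injection hurt bfqq */
		else if (tot_time_ns < threshold &&
			 old_limit <= s->max_rq_in_driver)
			s->inject_limit++;	/* room to inject more */
	}

	/* Capture or lower the baseline when the sample is "clean". */
	if ((s->last_serv_time_ns == 0 && rq_in_driver == 1) ||
	    tot_time_ns < s->last_serv_time_ns) {
		if (s->last_serv_time_ns == 0)
			s->inject_limit = old_limit > 1 ? old_limit : 1;
		s->last_serv_time_ns = tot_time_ns;
	} else if (!rqs_injected && rq_in_driver == 1)
		s->last_serv_time_ns = tot_time_ns;	/* refresh baseline */
}

int main(void)
{
	struct inject_state s = { .max_rq_in_driver = 8 };

	update_inject_limit(&s, 100000, false, 1); /* baseline set, limit -> 1   */
	update_inject_limit(&s, 120000, true, 3);  /* below 150000: limit -> 2   */
	update_inject_limit(&s, 200000, true, 4);  /* at/above 150000: limit -> 1 */
	printf("limit=%u baseline=%llu ns\n", s.inject_limit,
	       (unsigned long long)s.last_serv_time_ns);
	return 0;
}
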
5914
+/*
49245915 * Handle either a requeue or a finish for rq. The things to do are
49255916 * the same in both cases: all references to rq are to be dropped. In
49265917 * particular, rq is considered completed from the point of view of
....@@ -4930,18 +5921,6 @@
49305921 {
49315922 struct bfq_queue *bfqq = RQ_BFQQ(rq);
49325923 struct bfq_data *bfqd;
4933
-
4934
- /*
4935
- * Requeue and finish hooks are invoked in blk-mq without
4936
- * checking whether the involved request is actually still
4937
- * referenced in the scheduler. To handle this fact, the
4938
- * following two checks make this function exit in case of
4939
- * spurious invocations, for which there is nothing to do.
4940
- *
4941
- * First, check whether rq has nothing to do with an elevator.
4942
- */
4943
- if (unlikely(!(rq->rq_flags & RQF_ELVPRIV)))
4944
- return;
49455924
49465925 /*
49475926 * rq either is not associated with any icq, or is an already
....@@ -4963,6 +5942,9 @@
49635942 unsigned long flags;
49645943
49655944 spin_lock_irqsave(&bfqd->lock, flags);
5945
+
5946
+ if (rq == bfqd->waited_rq)
5947
+ bfq_update_inject_limit(bfqd, bfqq);
49665948
49675949 bfq_completed_request(bfqq, bfqd);
49685950 bfq_finish_requeue_request_body(bfqq);
....@@ -5012,6 +5994,8 @@
50125994 }
50135995
50145996 /*
5997
+ * Removes the association between the current task and bfqq, assuming
5998
+ * that bic points to the bfq iocontext of the task.
50155999 * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
50166000 * was the last process referring to that bfqq.
50176001 */
....@@ -5027,11 +6011,11 @@
50276011 return bfqq;
50286012 }
50296013
5030
- bic_set_bfqq(bic, NULL, 1);
6014
+ bic_set_bfqq(bic, NULL, true);
50316015
50326016 bfq_put_cooperator(bfqq);
50336017
5034
- bfq_put_queue(bfqq);
6018
+ bfq_release_process_ref(bfqq->bfqd, bfqq);
50356019 return NULL;
50366020 }
50376021
....@@ -5104,7 +6088,7 @@
51046088 * comments on bfq_init_rq for the reason behind this delayed
51056089 * preparation.
51066090 */
5107
-static void bfq_prepare_request(struct request *rq, struct bio *bio)
6091
+static void bfq_prepare_request(struct request *rq)
51086092 {
51096093 /*
51106094 * Regardless of whether we have an icq attached, we have to
....@@ -5127,7 +6111,7 @@
51276111 * preparation is that, after the prepare_request hook is invoked for
51286112 * rq, rq may still be transformed into a request with no icq, i.e., a
51296113 * request not associated with any queue. No bfq hook is invoked to
5130
- * signal this tranformation. As a consequence, should these
6114
+ * signal this transformation. As a consequence, should these
51316115 * preparation operations be performed when the prepare_request hook
51326116 * is invoked, and should rq be transformed one moment later, bfq
51336117 * would end up in an inconsistent state, because it would have
....@@ -5218,7 +6202,29 @@
52186202 }
52196203 }
52206204
5221
- if (unlikely(bfq_bfqq_just_created(bfqq)))
6205
+ /*
6206
+ * Consider bfqq as possibly belonging to a burst of newly
6207
+ * created queues only if:
6208
+ * 1) A burst is actually happening (bfqd->burst_size > 0)
6209
+ * or
6210
+ * 2) There is no other active queue. In fact, if, in
6211
+ * contrast, there are active queues not belonging to the
6212
+ * possible burst bfqq may belong to, then there is no gain
6213
+ * in considering bfqq as belonging to a burst, and
6214
+ * therefore in not weight-raising bfqq. See comments on
6215
+ * bfq_handle_burst().
6216
+ *
6217
+ * This filtering also helps eliminating false positives,
6218
+ * occurring when bfqq does not belong to an actual large
6219
+ * burst, but some background task (e.g., a service) happens
6220
+ * to trigger the creation of new queues very close to when
6221
+ * bfqq and its possible companion queues are created. See
6222
+ * comments on bfq_handle_burst() for further details also on
6223
+ * this issue.
6224
+ */
6225
+ if (unlikely(bfq_bfqq_just_created(bfqq) &&
6226
+ (bfqd->burst_size > 0 ||
6227
+ bfq_tot_busy_queues(bfqd) == 0)))
52226228 bfq_handle_burst(bfqd, bfqq);
52236229
52246230 return bfqq;
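
The burst filter introduced above reduces to a one-line predicate. A minimal user-space sketch of that predicate (the function name is a hypothetical stand-in), with the three interesting cases:

#include <stdbool.h>
#include <stdio.h>

/* Should a just-created queue be examined by the burst-handling logic?
 * Only if a burst is already being tracked, or no other queue is busy;
 * otherwise weight-raising the queue normally is the better choice.    */
static bool consider_for_burst(bool just_created, int burst_size,
			       int tot_busy_queues)
{
	return just_created && (burst_size > 0 || tot_busy_queues == 0);
}

int main(void)
{
	printf("%d\n", consider_for_burst(true, 0, 0)); /* 1: no other queue busy   */
	printf("%d\n", consider_for_burst(true, 0, 3)); /* 0: likely false positive */
	printf("%d\n", consider_for_burst(true, 5, 3)); /* 1: burst in progress     */
	return 0;
}
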
....@@ -5267,8 +6273,8 @@
52676273 bfq_bfqq_expire(bfqd, bfqq, true, reason);
52686274
52696275 schedule_dispatch:
5270
- spin_unlock_irqrestore(&bfqd->lock, flags);
52716276 bfq_schedule_dispatch(bfqd);
6277
+ spin_unlock_irqrestore(&bfqd->lock, flags);
52726278 }
52736279
52746280 /*
....@@ -5381,8 +6387,8 @@
53816387 struct blk_mq_tags *tags = hctx->sched_tags;
53826388 unsigned int min_shallow;
53836389
5384
- min_shallow = bfq_update_depths(bfqd, &tags->bitmap_tags);
5385
- sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, min_shallow);
6390
+ min_shallow = bfq_update_depths(bfqd, tags->bitmap_tags);
6391
+ sbitmap_queue_min_shallow_depth(tags->bitmap_tags, min_shallow);
53866392 }
53876393
53886394 static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index)
....@@ -5405,10 +6411,10 @@
54056411
54066412 hrtimer_cancel(&bfqd->idle_slice_timer);
54076413
5408
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
54096414 /* release oom-queue reference to root group */
54106415 bfqg_and_blkg_put(bfqd->root_group);
54116416
6417
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
54126418 blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq);
54136419 #else
54146420 spin_lock_irq(&bfqd->lock);
....@@ -5454,9 +6460,9 @@
54546460 }
54556461 eq->elevator_data = bfqd;
54566462
5457
- spin_lock_irq(q->queue_lock);
6463
+ spin_lock_irq(&q->queue_lock);
54586464 q->elevator = eq;
5459
- spin_unlock_irq(q->queue_lock);
6465
+ spin_unlock_irq(&q->queue_lock);
54606466
54616467 /*
54626468 * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
....@@ -5488,7 +6494,7 @@
54886494 HRTIMER_MODE_REL);
54896495 bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
54906496
5491
- bfqd->queue_weights_tree = RB_ROOT;
6497
+ bfqd->queue_weights_tree = RB_ROOT_CACHED;
54926498 bfqd->num_groups_with_pending_reqs = 0;
54936499
54946500 INIT_LIST_HEAD(&bfqd->active_list);
....@@ -5496,6 +6502,7 @@
54966502 INIT_HLIST_HEAD(&bfqd->burst_list);
54976503
54986504 bfqd->hw_tag = -1;
6505
+ bfqd->nonrot_with_queueing = blk_queue_nonrot(bfqd->queue);
54996506
55006507 bfqd->bfq_max_budget = bfq_default_max_budget;
55016508
....@@ -5796,7 +6803,7 @@
57966803 };
57976804
57986805 static struct elevator_type iosched_bfq_mq = {
5799
- .ops.mq = {
6806
+ .ops = {
58006807 .limit_depth = bfq_limit_depth,
58016808 .prepare_request = bfq_prepare_request,
58026809 .requeue_request = bfq_finish_requeue_request,
....@@ -5818,7 +6825,6 @@
58186825 .exit_sched = bfq_exit_queue,
58196826 },
58206827
5821
- .uses_mq = true,
58226828 .icq_size = sizeof(struct bfq_io_cq),
58236829 .icq_align = __alignof__(struct bfq_io_cq),
58246830 .elevator_attrs = bfq_attrs,