hc
2023-12-06 08f87f769b595151be1afeff53e144f543faa614
kernel/block/bfq-iosched.c
....@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0-or-later
12 /*
23 * Budget Fair Queueing (BFQ) I/O scheduler.
34 *
....@@ -12,21 +13,11 @@
1213 *
1314 * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org>
1415 *
15
- * This program is free software; you can redistribute it and/or
16
- * modify it under the terms of the GNU General Public License as
17
- * published by the Free Software Foundation; either version 2 of the
18
- * License, or (at your option) any later version.
19
- *
20
- * This program is distributed in the hope that it will be useful,
21
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
22
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23
- * General Public License for more details.
24
- *
2516 * BFQ is a proportional-share I/O scheduler, with some extra
2617 * low-latency capabilities. BFQ also supports full hierarchical
2718 * scheduling through cgroups. Next paragraphs provide an introduction
2819 * on BFQ inner workings. Details on BFQ benefits, usage and
29
- * limitations can be found in Documentation/block/bfq-iosched.txt.
20
+ * limitations can be found in Documentation/block/bfq-iosched.rst.
3021 *
3122 * BFQ is a proportional-share storage-I/O scheduling algorithm based
3223 * on the slice-by-slice service scheme of CFQ. But BFQ assigns
....@@ -167,6 +158,7 @@
167158 BFQ_BFQQ_FNS(coop);
168159 BFQ_BFQQ_FNS(split_coop);
169160 BFQ_BFQQ_FNS(softrt_update);
161
+BFQ_BFQQ_FNS(has_waker);
170162 #undef BFQ_BFQQ_FNS \
171163
172164 /* Expiration time of sync (0) and async (1) requests, in ns. */
....@@ -190,7 +182,7 @@
190182 /*
191183 * When a sync request is dispatched, the queue that contains that
192184 * request, and all the ancestor entities of that queue, are charged
193
- * with the number of sectors of the request. In constrast, if the
185
+ * with the number of sectors of the request. In contrast, if the
194186 * request is async, then the queue and its ancestor entities are
195187 * charged with the number of sectors of the request, multiplied by
196188 * the factor below. This throttles the bandwidth for async I/O,
....@@ -218,7 +210,7 @@
218210 * queue merging.
219211 *
220212 * As can be deduced from the low time limit below, queue merging, if
221
- * successful, happens at the very beggining of the I/O of the involved
213
+ * successful, happens at the very beginning of the I/O of the involved
222214 * cooperating processes, as a consequence of the arrival of the very
223215 * first requests from each cooperator. After that, there is very
224216 * little chance to find cooperators.
....@@ -231,13 +223,26 @@
231223 #define BFQ_MIN_TT (2 * NSEC_PER_MSEC)
232224
233225 /* hw_tag detection: parallel requests threshold and min samples needed. */
234
-#define BFQ_HW_QUEUE_THRESHOLD 4
226
+#define BFQ_HW_QUEUE_THRESHOLD 3
235227 #define BFQ_HW_QUEUE_SAMPLES 32
236228
237229 #define BFQQ_SEEK_THR (sector_t)(8 * 100)
238230 #define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32)
231
+#define BFQ_RQ_SEEKY(bfqd, last_pos, rq) \
232
+ (get_sdist(last_pos, rq) > \
233
+ BFQQ_SEEK_THR && \
234
+ (!blk_queue_nonrot(bfqd->queue) || \
235
+ blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT))
239236 #define BFQQ_CLOSE_THR (sector_t)(8 * 1024)
240237 #define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 19)
238
+/*
239
+ * Sync random I/O is likely to be confused with soft real-time I/O,
240
+ * because it is characterized by limited throughput and apparently
241
+ * isochronous arrival pattern. To avoid false positives, queues
242
+ * containing only random (seeky) I/O are prevented from being tagged
243
+ * as soft real-time.
244
+ */
245
+#define BFQQ_TOTALLY_SEEKY(bfqq) (bfqq->seek_history == -1)
241246
242247 /* Min number of samples required to perform peak-rate update */
243248 #define BFQ_RATE_MIN_SAMPLES 32
....@@ -400,9 +405,9 @@
400405 unsigned long flags;
401406 struct bfq_io_cq *icq;
402407
403
- spin_lock_irqsave(q->queue_lock, flags);
408
+ spin_lock_irqsave(&q->queue_lock, flags);
404409 icq = icq_to_bic(ioc_lookup_icq(ioc, q));
405
- spin_unlock_irqrestore(q->queue_lock, flags);
410
+ spin_unlock_irqrestore(&q->queue_lock, flags);
406411
407412 return icq;
408413 }
....@@ -416,6 +421,8 @@
416421 */
417422 void bfq_schedule_dispatch(struct bfq_data *bfqd)
418423 {
424
+ lockdep_assert_held(&bfqd->lock);
425
+
419426 if (bfqd->queued != 0) {
420427 bfq_log(bfqd, "schedule dispatch");
421428 blk_mq_run_hw_queues(bfqd->queue, true);
....@@ -423,13 +430,12 @@
423430 }
424431
425432 #define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
426
-#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT)
427433
428434 #define bfq_sample_valid(samples) ((samples) > 80)
429435
430436 /*
431437 * Lifted from AS - choose which of rq1 and rq2 that is best served now.
432
- * We choose the request that is closesr to the head right now. Distance
438
+ * We choose the request that is closer to the head right now. Distance
433439 * behind the head is penalized and only allowed to a certain extent.
434440 */
435441 static struct request *bfq_choose_req(struct bfq_data *bfqd,
....@@ -591,7 +597,16 @@
591597 bfq_merge_time_limit);
592598 }
593599
594
-void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
600
+/*
601
+ * The following function is not marked as __cold because it is
602
+ * actually cold, but for the same performance goal described in the
603
+ * comments on the likely() at the beginning of
604
+ * bfq_setup_cooperator(). Unexpectedly, to reach an even lower
605
+ * execution time for the case where this function is not invoked, we
606
+ * had to add an unlikely() in each involved if().
607
+ */
608
+void __cold
609
+bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
595610 {
596611 struct rb_node **p, *parent;
597612 struct bfq_queue *__bfqq;
....@@ -600,6 +615,10 @@
600615 rb_erase(&bfqq->pos_node, bfqq->pos_root);
601616 bfqq->pos_root = NULL;
602617 }
618
+
619
+ /* oom_bfqq does not participate in queue merging */
620
+ if (bfqq == &bfqd->oom_bfqq)
621
+ return;
603622
604623 /*
605624 * bfqq cannot be merged any longer (see comments in
....@@ -625,52 +644,68 @@
625644 }
626645
627646 /*
628
- * Tell whether there are active queues with different weights or
629
- * active groups.
630
- */
631
-static bool bfq_varied_queue_weights_or_active_groups(struct bfq_data *bfqd)
632
-{
633
- /*
634
- * For queue weights to differ, queue_weights_tree must contain
635
- * at least two nodes.
636
- */
637
- return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) &&
638
- (bfqd->queue_weights_tree.rb_node->rb_left ||
639
- bfqd->queue_weights_tree.rb_node->rb_right)
640
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
641
- ) ||
642
- (bfqd->num_groups_with_pending_reqs > 0
643
-#endif
644
- );
645
-}
646
-
647
-/*
648
- * The following function returns true if every queue must receive the
649
- * same share of the throughput (this condition is used when deciding
650
- * whether idling may be disabled, see the comments in the function
651
- * bfq_better_to_idle()).
647
+ * The following function returns false either if every active queue
648
+ * must receive the same share of the throughput (symmetric scenario),
649
+ * or, as a special case, if bfqq must receive a share of the
650
+ * throughput lower than or equal to the share that every other active
651
+ * queue must receive. If bfqq does sync I/O, then these are the only
652
+ * two cases where bfqq happens to be guaranteed its share of the
653
+ * throughput even if I/O dispatching is not plugged when bfqq remains
654
+ * temporarily empty (for more details, see the comments in the
655
+ * function bfq_better_to_idle()). For this reason, the return value
656
+ * of this function is used to check whether I/O-dispatch plugging can
657
+ * be avoided.
652658 *
653
- * Such a scenario occurs when:
659
+ * The above first case (symmetric scenario) occurs when:
654660 * 1) all active queues have the same weight,
655
- * 2) all active groups at the same level in the groups tree have the same
656
- * weight,
661
+ * 2) all active queues belong to the same I/O-priority class,
657662 * 3) all active groups at the same level in the groups tree have the same
663
+ * weight,
664
+ * 4) all active groups at the same level in the groups tree have the same
658665 * number of children.
659666 *
660667 * Unfortunately, keeping the necessary state for evaluating exactly
661668 * the last two symmetry sub-conditions above would be quite complex
662
- * and time consuming. Therefore this function evaluates, instead,
663
- * only the following stronger two sub-conditions, for which it is
669
+ * and time consuming. Therefore this function evaluates, instead,
670
+ * only the following stronger three sub-conditions, for which it is
664671 * much easier to maintain the needed state:
665672 * 1) all active queues have the same weight,
666
- * 2) there are no active groups.
673
+ * 2) all active queues belong to the same I/O-priority class,
674
+ * 3) there are no active groups.
667675 * In particular, the last condition is always true if hierarchical
668676 * support or the cgroups interface are not enabled, thus no state
669677 * needs to be maintained in this case.
670678 */
671
-static bool bfq_symmetric_scenario(struct bfq_data *bfqd)
679
+static bool bfq_asymmetric_scenario(struct bfq_data *bfqd,
680
+ struct bfq_queue *bfqq)
672681 {
673
- return !bfq_varied_queue_weights_or_active_groups(bfqd);
682
+ bool smallest_weight = bfqq &&
683
+ bfqq->weight_counter &&
684
+ bfqq->weight_counter ==
685
+ container_of(
686
+ rb_first_cached(&bfqd->queue_weights_tree),
687
+ struct bfq_weight_counter,
688
+ weights_node);
689
+
690
+ /*
691
+ * For queue weights to differ, queue_weights_tree must contain
692
+ * at least two nodes.
693
+ */
694
+ bool varied_queue_weights = !smallest_weight &&
695
+ !RB_EMPTY_ROOT(&bfqd->queue_weights_tree.rb_root) &&
696
+ (bfqd->queue_weights_tree.rb_root.rb_node->rb_left ||
697
+ bfqd->queue_weights_tree.rb_root.rb_node->rb_right);
698
+
699
+ bool multiple_classes_busy =
700
+ (bfqd->busy_queues[0] && bfqd->busy_queues[1]) ||
701
+ (bfqd->busy_queues[0] && bfqd->busy_queues[2]) ||
702
+ (bfqd->busy_queues[1] && bfqd->busy_queues[2]);
703
+
704
+ return varied_queue_weights || multiple_classes_busy
705
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
706
+ || bfqd->num_groups_with_pending_reqs > 0
707
+#endif
708
+ ;
674709 }
675710
676711 /*
....@@ -687,10 +722,11 @@
687722 * should be low too.
688723 */
689724 void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,
690
- struct rb_root *root)
725
+ struct rb_root_cached *root)
691726 {
692727 struct bfq_entity *entity = &bfqq->entity;
693
- struct rb_node **new = &(root->rb_node), *parent = NULL;
728
+ struct rb_node **new = &(root->rb_root.rb_node), *parent = NULL;
729
+ bool leftmost = true;
694730
695731 /*
696732 * Do not insert if the queue is already associated with a
....@@ -719,8 +755,10 @@
719755 }
720756 if (entity->weight < __counter->weight)
721757 new = &((*new)->rb_left);
722
- else
758
+ else {
723759 new = &((*new)->rb_right);
760
+ leftmost = false;
761
+ }
724762 }
725763
726764 bfqq->weight_counter = kzalloc(sizeof(struct bfq_weight_counter),
....@@ -729,22 +767,22 @@
729767 /*
730768 * In the unlucky event of an allocation failure, we just
731769 * exit. This will cause the weight of queue to not be
732
- * considered in bfq_varied_queue_weights_or_active_groups,
733
- * which, in its turn, causes the scenario to be deemed
734
- * wrongly symmetric in case bfqq's weight would have been
735
- * the only weight making the scenario asymmetric. On the
736
- * bright side, no unbalance will however occur when bfqq
737
- * becomes inactive again (the invocation of this function
738
- * is triggered by an activation of queue). In fact,
739
- * bfq_weights_tree_remove does nothing if
740
- * !bfqq->weight_counter.
770
+ * considered in bfq_asymmetric_scenario, which, in its turn,
771
+ * causes the scenario to be deemed wrongly symmetric in case
772
+ * bfqq's weight would have been the only weight making the
773
+ * scenario asymmetric. On the bright side, no unbalance will
774
+ * however occur when bfqq becomes inactive again (the
775
+ * invocation of this function is triggered by an activation
776
+ * of queue). In fact, bfq_weights_tree_remove does nothing
777
+ * if !bfqq->weight_counter.
741778 */
742779 if (unlikely(!bfqq->weight_counter))
743780 return;
744781
745782 bfqq->weight_counter->weight = entity->weight;
746783 rb_link_node(&bfqq->weight_counter->weights_node, parent, new);
747
- rb_insert_color(&bfqq->weight_counter->weights_node, root);
784
+ rb_insert_color_cached(&bfqq->weight_counter->weights_node, root,
785
+ leftmost);
748786
749787 inc_counter:
750788 bfqq->weight_counter->num_active++;
....@@ -759,7 +797,7 @@
759797 */
760798 void __bfq_weights_tree_remove(struct bfq_data *bfqd,
761799 struct bfq_queue *bfqq,
762
- struct rb_root *root)
800
+ struct rb_root_cached *root)
763801 {
764802 if (!bfqq->weight_counter)
765803 return;
....@@ -768,7 +806,7 @@
768806 if (bfqq->weight_counter->num_active > 0)
769807 goto reset_entity_pointer;
770808
771
- rb_erase(&bfqq->weight_counter->weights_node, root);
809
+ rb_erase_cached(&bfqq->weight_counter->weights_node, root);
772810 kfree(bfqq->weight_counter);
773811
774812 reset_entity_pointer:
....@@ -882,7 +920,8 @@
882920 static unsigned long bfq_serv_to_charge(struct request *rq,
883921 struct bfq_queue *bfqq)
884922 {
885
- if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1)
923
+ if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1 ||
924
+ bfq_asymmetric_scenario(bfqq->bfqd, bfqq))
886925 return blk_rq_sectors(rq);
887926
888927 return blk_rq_sectors(rq) * bfq_async_charge_factor;
....@@ -916,8 +955,10 @@
916955 */
917956 return;
918957
919
- new_budget = max_t(unsigned long, bfqq->max_budget,
920
- bfq_serv_to_charge(next_rq, bfqq));
958
+ new_budget = max_t(unsigned long,
959
+ max_t(unsigned long, bfqq->max_budget,
960
+ bfq_serv_to_charge(next_rq, bfqq)),
961
+ entity->service);
921962 if (entity->budget != new_budget) {
922963 entity->budget = new_budget;
923964 bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",
....@@ -946,7 +987,7 @@
946987 * of several files
947988 * mplayer took 23 seconds to start, if constantly weight-raised.
948989 *
949
- * As for higher values than that accomodating the above bad
990
+ * As for higher values than that accommodating the above bad
950991 * scenario, tests show that higher values would often yield
951992 * the opposite of the desired result, i.e., would worsen
952993 * responsiveness by allowing non-interactive applications to
....@@ -985,6 +1026,7 @@
9851026 else
9861027 bfq_clear_bfqq_IO_bound(bfqq);
9871028
1029
+ bfqq->entity.new_weight = bic->saved_weight;
9881030 bfqq->ttime = bic->saved_ttime;
9891031 bfqq->wr_coeff = bic->saved_wr_coeff;
9901032 bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt;
....@@ -1020,7 +1062,7 @@
10201062
10211063 static int bfqq_process_refs(struct bfq_queue *bfqq)
10221064 {
1023
- return bfqq->ref - bfqq->allocated - bfqq->entity.on_st -
1065
+ return bfqq->ref - bfqq->allocated - bfqq->entity.on_st_or_in_serv -
10241066 (bfqq->weight_counter != NULL);
10251067 }
10261068
....@@ -1032,8 +1074,18 @@
10321074
10331075 hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node)
10341076 hlist_del_init(&item->burst_list_node);
1035
- hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
1036
- bfqd->burst_size = 1;
1077
+
1078
+ /*
1079
+ * Start the creation of a new burst list only if there is no
1080
+ * active queue. See comments on the conditional invocation of
1081
+ * bfq_handle_burst().
1082
+ */
1083
+ if (bfq_tot_busy_queues(bfqd) == 0) {
1084
+ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
1085
+ bfqd->burst_size = 1;
1086
+ } else
1087
+ bfqd->burst_size = 0;
1088
+
10371089 bfqd->burst_parent_entity = bfqq->entity.parent;
10381090 }
10391091
....@@ -1089,7 +1141,8 @@
10891141 * many parallel threads/processes. Examples are systemd during boot,
10901142 * or git grep. To help these processes get their job done as soon as
10911143 * possible, it is usually better to not grant either weight-raising
1092
- * or device idling to their queues.
1144
+ * or device idling to their queues, unless these queues must be
1145
+ * protected from the I/O flowing through other active queues.
10931146 *
10941147 * In this comment we describe, firstly, the reasons why this fact
10951148 * holds, and, secondly, the next function, which implements the main
....@@ -1101,7 +1154,10 @@
11011154 * cumulatively served, the sooner the target job of these queues gets
11021155 * completed. As a consequence, weight-raising any of these queues,
11031156 * which also implies idling the device for it, is almost always
1104
- * counterproductive. In most cases it just lowers throughput.
1157
+ * counterproductive, unless there are other active queues to isolate
1158
+ * these new queues from. If there no other active queues, then
1159
+ * weight-raising these new queues just lowers throughput in most
1160
+ * cases.
11051161 *
11061162 * On the other hand, a burst of queue creations may be caused also by
11071163 * the start of an application that does not consist of a lot of
....@@ -1135,14 +1191,16 @@
11351191 * are very rare. They typically occur if some service happens to
11361192 * start doing I/O exactly when the interactive task starts.
11371193 *
1138
- * Turning back to the next function, it implements all the steps
1139
- * needed to detect the occurrence of a large burst and to properly
1140
- * mark all the queues belonging to it (so that they can then be
1141
- * treated in a different way). This goal is achieved by maintaining a
1142
- * "burst list" that holds, temporarily, the queues that belong to the
1143
- * burst in progress. The list is then used to mark these queues as
1144
- * belonging to a large burst if the burst does become large. The main
1145
- * steps are the following.
1194
+ * Turning back to the next function, it is invoked only if there are
1195
+ * no active queues (apart from active queues that would belong to the
1196
+ * same, possible burst bfqq would belong to), and it implements all
1197
+ * the steps needed to detect the occurrence of a large burst and to
1198
+ * properly mark all the queues belonging to it (so that they can then
1199
+ * be treated in a different way). This goal is achieved by
1200
+ * maintaining a "burst list" that holds, temporarily, the queues that
1201
+ * belong to the burst in progress. The list is then used to mark
1202
+ * these queues as belonging to a large burst if the burst does become
1203
+ * large. The main steps are the following.
11461204 *
11471205 * . when the very first queue is created, the queue is inserted into the
11481206 * list (as it could be the first queue in a possible burst)
....@@ -1376,21 +1434,31 @@
13761434 * mechanism may be re-designed in such a way to make it possible to
13771435 * know whether preemption is needed without needing to update service
13781436 * trees). In addition, queue preemptions almost always cause random
1379
- * I/O, and thus loss of throughput. Because of these facts, the next
1380
- * function adopts the following simple scheme to avoid both costly
1381
- * operations and too frequent preemptions: it requests the expiration
1382
- * of the in-service queue (unconditionally) only for queues that need
1383
- * to recover a hole, or that either are weight-raised or deserve to
1384
- * be weight-raised.
1437
+ * I/O, which may in turn cause loss of throughput. Finally, there may
1438
+ * even be no in-service queue when the next function is invoked (so,
1439
+ * no queue to compare timestamps with). Because of these facts, the
1440
+ * next function adopts the following simple scheme to avoid costly
1441
+ * operations, too frequent preemptions and too many dependencies on
1442
+ * the state of the scheduler: it requests the expiration of the
1443
+ * in-service queue (unconditionally) only for queues that need to
1444
+ * recover a hole. Then it delegates to other parts of the code the
1445
+ * responsibility of handling the above case 2.
13851446 */
13861447 static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
13871448 struct bfq_queue *bfqq,
1388
- bool arrived_in_time,
1389
- bool wr_or_deserves_wr)
1449
+ bool arrived_in_time)
13901450 {
13911451 struct bfq_entity *entity = &bfqq->entity;
13921452
1393
- if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) {
1453
+ /*
1454
+ * In the next compound condition, we check also whether there
1455
+ * is some budget left, because otherwise there is no point in
1456
+ * trying to go on serving bfqq with this same budget: bfqq
1457
+ * would be expired immediately after being selected for
1458
+ * service. This would only cause useless overhead.
1459
+ */
1460
+ if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time &&
1461
+ bfq_bfqq_budget_left(bfqq) > 0) {
13941462 /*
13951463 * We do not clear the flag non_blocking_wait_rq here, as
13961464 * the latter is used in bfq_activate_bfqq to signal
....@@ -1433,7 +1501,7 @@
14331501 entity->budget = max_t(unsigned long, bfqq->max_budget,
14341502 bfq_serv_to_charge(bfqq->next_rq, bfqq));
14351503 bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
1436
- return wr_or_deserves_wr;
1504
+ return false;
14371505 }
14381506
14391507 /*
....@@ -1551,6 +1619,36 @@
15511619 bfqd->bfq_wr_min_idle_time);
15521620 }
15531621
1622
+
1623
+/*
1624
+ * Return true if bfqq is in a higher priority class, or has a higher
1625
+ * weight than the in-service queue.
1626
+ */
1627
+static bool bfq_bfqq_higher_class_or_weight(struct bfq_queue *bfqq,
1628
+ struct bfq_queue *in_serv_bfqq)
1629
+{
1630
+ int bfqq_weight, in_serv_weight;
1631
+
1632
+ if (bfqq->ioprio_class < in_serv_bfqq->ioprio_class)
1633
+ return true;
1634
+
1635
+ if (in_serv_bfqq->entity.parent == bfqq->entity.parent) {
1636
+ bfqq_weight = bfqq->entity.weight;
1637
+ in_serv_weight = in_serv_bfqq->entity.weight;
1638
+ } else {
1639
+ if (bfqq->entity.parent)
1640
+ bfqq_weight = bfqq->entity.parent->weight;
1641
+ else
1642
+ bfqq_weight = bfqq->entity.weight;
1643
+ if (in_serv_bfqq->entity.parent)
1644
+ in_serv_weight = in_serv_bfqq->entity.parent->weight;
1645
+ else
1646
+ in_serv_weight = in_serv_bfqq->entity.weight;
1647
+ }
1648
+
1649
+ return bfqq_weight > in_serv_weight;
1650
+}
1651
+
15541652 static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
15551653 struct bfq_queue *bfqq,
15561654 int old_wr_coeff,
....@@ -1579,6 +1677,7 @@
15791677 */
15801678 in_burst = bfq_bfqq_in_large_burst(bfqq);
15811679 soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&
1680
+ !BFQQ_TOTALLY_SEEKY(bfqq) &&
15821681 !in_burst &&
15831682 time_is_before_jiffies(bfqq->soft_rt_next_start) &&
15841683 bfqq->dispatched == 0;
....@@ -1594,8 +1693,7 @@
15941693 */
15951694 bfqq_wants_to_preempt =
15961695 bfq_bfqq_update_budg_for_activation(bfqd, bfqq,
1597
- arrived_in_time,
1598
- wr_or_deserves_wr);
1696
+ arrived_in_time);
15991697
16001698 /*
16011699 * If bfqq happened to be activated in a burst, but has been
....@@ -1660,19 +1758,109 @@
16601758
16611759 /*
16621760 * Expire in-service queue only if preemption may be needed
1663
- * for guarantees. In this respect, the function
1664
- * next_queue_may_preempt just checks a simple, necessary
1665
- * condition, and not a sufficient condition based on
1666
- * timestamps. In fact, for the latter condition to be
1667
- * evaluated, timestamps would need first to be updated, and
1668
- * this operation is quite costly (see the comments on the
1669
- * function bfq_bfqq_update_budg_for_activation).
1761
+ * for guarantees. In particular, we care only about two
1762
+ * cases. The first is that bfqq has to recover a service
1763
+ * hole, as explained in the comments on
1764
+ * bfq_bfqq_update_budg_for_activation(), i.e., that
1765
+ * bfqq_wants_to_preempt is true. However, if bfqq does not
1766
+ * carry time-critical I/O, then bfqq's bandwidth is less
1767
+ * important than that of queues that carry time-critical I/O.
1768
+ * So, as a further constraint, we consider this case only if
1769
+ * bfqq is at least as weight-raised, i.e., at least as time
1770
+ * critical, as the in-service queue.
1771
+ *
1772
+ * The second case is that bfqq is in a higher priority class,
1773
+ * or has a higher weight than the in-service queue. If this
1774
+ * condition does not hold, we don't care because, even if
1775
+ * bfqq does not start to be served immediately, the resulting
1776
+ * delay for bfqq's I/O is however lower or much lower than
1777
+ * the ideal completion time to be guaranteed to bfqq's I/O.
1778
+ *
1779
+ * In both cases, preemption is needed only if, according to
1780
+ * the timestamps of both bfqq and of the in-service queue,
1781
+ * bfqq actually is the next queue to serve. So, to reduce
1782
+ * useless preemptions, the return value of
1783
+ * next_queue_may_preempt() is considered in the next compound
1784
+ * condition too. Yet next_queue_may_preempt() just checks a
1785
+ * simple, necessary condition for bfqq to be the next queue
1786
+ * to serve. In fact, to evaluate a sufficient condition, the
1787
+ * timestamps of the in-service queue would need to be
1788
+ * updated, and this operation is quite costly (see the
1789
+ * comments on bfq_bfqq_update_budg_for_activation()).
16701790 */
1671
- if (bfqd->in_service_queue && bfqq_wants_to_preempt &&
1672
- bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff &&
1791
+ if (bfqd->in_service_queue &&
1792
+ ((bfqq_wants_to_preempt &&
1793
+ bfqq->wr_coeff >= bfqd->in_service_queue->wr_coeff) ||
1794
+ bfq_bfqq_higher_class_or_weight(bfqq, bfqd->in_service_queue)) &&
16731795 next_queue_may_preempt(bfqd))
16741796 bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
16751797 false, BFQQE_PREEMPTED);
1798
+}
1799
+
1800
+static void bfq_reset_inject_limit(struct bfq_data *bfqd,
1801
+ struct bfq_queue *bfqq)
1802
+{
1803
+ /* invalidate baseline total service time */
1804
+ bfqq->last_serv_time_ns = 0;
1805
+
1806
+ /*
1807
+ * Reset pointer in case we are waiting for
1808
+ * some request completion.
1809
+ */
1810
+ bfqd->waited_rq = NULL;
1811
+
1812
+ /*
1813
+ * If bfqq has a short think time, then start by setting the
1814
+ * inject limit to 0 prudentially, because the service time of
1815
+ * an injected I/O request may be higher than the think time
1816
+ * of bfqq, and therefore, if one request was injected when
1817
+ * bfqq remains empty, this injected request might delay the
1818
+ * service of the next I/O request for bfqq significantly. In
1819
+ * case bfqq can actually tolerate some injection, then the
1820
+ * adaptive update will however raise the limit soon. This
1821
+ * lucky circumstance holds exactly because bfqq has a short
1822
+ * think time, and thus, after remaining empty, is likely to
1823
+ * get new I/O enqueued---and then completed---before being
1824
+ * expired. This is the very pattern that gives the
1825
+ * limit-update algorithm the chance to measure the effect of
1826
+ * injection on request service times, and then to update the
1827
+ * limit accordingly.
1828
+ *
1829
+ * However, in the following special case, the inject limit is
1830
+ * left to 1 even if the think time is short: bfqq's I/O is
1831
+ * synchronized with that of some other queue, i.e., bfqq may
1832
+ * receive new I/O only after the I/O of the other queue is
1833
+ * completed. Keeping the inject limit to 1 allows the
1834
+ * blocking I/O to be served while bfqq is in service. And
1835
+ * this is very convenient both for bfqq and for overall
1836
+ * throughput, as explained in detail in the comments in
1837
+ * bfq_update_has_short_ttime().
1838
+ *
1839
+ * On the opposite end, if bfqq has a long think time, then
1840
+ * start directly by 1, because:
1841
+ * a) on the bright side, keeping at most one request in
1842
+ * service in the drive is unlikely to cause any harm to the
1843
+ * latency of bfqq's requests, as the service time of a single
1844
+ * request is likely to be lower than the think time of bfqq;
1845
+ * b) on the downside, after becoming empty, bfqq is likely to
1846
+ * expire before getting its next request. With this request
1847
+ * arrival pattern, it is very hard to sample total service
1848
+ * times and update the inject limit accordingly (see comments
1849
+ * on bfq_update_inject_limit()). So the limit is likely to be
1850
+ * never, or at least seldom, updated. As a consequence, by
1851
+ * setting the limit to 1, we avoid that no injection ever
1852
+ * occurs with bfqq. On the downside, this proactive step
1853
+ * further reduces chances to actually compute the baseline
1854
+ * total service time. Thus it reduces chances to execute the
1855
+ * limit-update algorithm and possibly raise the limit to more
1856
+ * than 1.
1857
+ */
1858
+ if (bfq_bfqq_has_short_ttime(bfqq))
1859
+ bfqq->inject_limit = 0;
1860
+ else
1861
+ bfqq->inject_limit = 1;
1862
+
1863
+ bfqq->decrease_time_jif = jiffies;
16761864 }
16771865
16781866 static void bfq_add_request(struct request *rq)
....@@ -1687,6 +1875,180 @@
16871875 bfqq->queued[rq_is_sync(rq)]++;
16881876 bfqd->queued++;
16891877
1878
+ if (RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_sync(bfqq)) {
1879
+ /*
1880
+ * Detect whether bfqq's I/O seems synchronized with
1881
+ * that of some other queue, i.e., whether bfqq, after
1882
+ * remaining empty, happens to receive new I/O only
1883
+ * right after some I/O request of the other queue has
1884
+ * been completed. We call waker queue the other
1885
+ * queue, and we assume, for simplicity, that bfqq may
1886
+ * have at most one waker queue.
1887
+ *
1888
+ * A remarkable throughput boost can be reached by
1889
+ * unconditionally injecting the I/O of the waker
1890
+ * queue, every time a new bfq_dispatch_request
1891
+ * happens to be invoked while I/O is being plugged
1892
+ * for bfqq. In addition to boosting throughput, this
1893
+ * unblocks bfqq's I/O, thereby improving bandwidth
1894
+ * and latency for bfqq. Note that these same results
1895
+ * may be achieved with the general injection
1896
+ * mechanism, but less effectively. For details on
1897
+ * this aspect, see the comments on the choice of the
1898
+ * queue for injection in bfq_select_queue().
1899
+ *
1900
+ * Turning back to the detection of a waker queue, a
1901
+ * queue Q is deemed as a waker queue for bfqq if, for
1902
+ * two consecutive times, bfqq happens to become non
1903
+ * empty right after a request of Q has been
1904
+ * completed. In particular, on the first time, Q is
1905
+ * tentatively set as a candidate waker queue, while
1906
+ * on the second time, the flag
1907
+ * bfq_bfqq_has_waker(bfqq) is set to confirm that Q
1908
+ * is a waker queue for bfqq. These detection steps
1909
+ * are performed only if bfqq has a long think time,
1910
+ * so as to make it more likely that bfqq's I/O is
1911
+ * actually being blocked by a synchronization. This
1912
+ * last filter, plus the above two-times requirement,
1913
+ * make false positives less likely.
1914
+ *
1915
+ * NOTE
1916
+ *
1917
+ * The sooner a waker queue is detected, the sooner
1918
+ * throughput can be boosted by injecting I/O from the
1919
+ * waker queue. Fortunately, detection is likely to be
1920
+ * actually fast, for the following reasons. While
1921
+ * blocked by synchronization, bfqq has a long think
1922
+ * time. This implies that bfqq's inject limit is at
1923
+ * least equal to 1 (see the comments in
1924
+ * bfq_update_inject_limit()). So, thanks to
1925
+ * injection, the waker queue is likely to be served
1926
+ * during the very first I/O-plugging time interval
1927
+ * for bfqq. This triggers the first step of the
1928
+ * detection mechanism. Thanks again to injection, the
1929
+ * candidate waker queue is then likely to be
1930
+ * confirmed no later than during the next
1931
+ * I/O-plugging interval for bfqq.
1932
+ */
1933
+ if (bfqd->last_completed_rq_bfqq &&
1934
+ !bfq_bfqq_has_short_ttime(bfqq) &&
1935
+ ktime_get_ns() - bfqd->last_completion <
1936
+ 200 * NSEC_PER_USEC) {
1937
+ if (bfqd->last_completed_rq_bfqq != bfqq &&
1938
+ bfqd->last_completed_rq_bfqq !=
1939
+ bfqq->waker_bfqq) {
1940
+ /*
1941
+ * First synchronization detected with
1942
+ * a candidate waker queue, or with a
1943
+ * different candidate waker queue
1944
+ * from the current one.
1945
+ */
1946
+ bfqq->waker_bfqq = bfqd->last_completed_rq_bfqq;
1947
+
1948
+ /*
1949
+ * If the waker queue disappears, then
1950
+ * bfqq->waker_bfqq must be reset. To
1951
+ * this goal, we maintain in each
1952
+ * waker queue a list, woken_list, of
1953
+ * all the queues that reference the
1954
+ * waker queue through their
1955
+ * waker_bfqq pointer. When the waker
1956
+ * queue exits, the waker_bfqq pointer
1957
+ * of all the queues in the woken_list
1958
+ * is reset.
1959
+ *
1960
+ * In addition, if bfqq is already in
1961
+ * the woken_list of a waker queue,
1962
+ * then, before being inserted into
1963
+ * the woken_list of a new waker
1964
+ * queue, bfqq must be removed from
1965
+ * the woken_list of the old waker
1966
+ * queue.
1967
+ */
1968
+ if (!hlist_unhashed(&bfqq->woken_list_node))
1969
+ hlist_del_init(&bfqq->woken_list_node);
1970
+ hlist_add_head(&bfqq->woken_list_node,
1971
+ &bfqd->last_completed_rq_bfqq->woken_list);
1972
+
1973
+ bfq_clear_bfqq_has_waker(bfqq);
1974
+ } else if (bfqd->last_completed_rq_bfqq ==
1975
+ bfqq->waker_bfqq &&
1976
+ !bfq_bfqq_has_waker(bfqq)) {
1977
+ /*
1978
+ * synchronization with waker_bfqq
1979
+ * seen for the second time
1980
+ */
1981
+ bfq_mark_bfqq_has_waker(bfqq);
1982
+ }
1983
+ }
1984
+
1985
+ /*
1986
+ * Periodically reset inject limit, to make sure that
1987
+ * the latter eventually drops in case workload
1988
+ * changes, see step (3) in the comments on
1989
+ * bfq_update_inject_limit().
1990
+ */
1991
+ if (time_is_before_eq_jiffies(bfqq->decrease_time_jif +
1992
+ msecs_to_jiffies(1000)))
1993
+ bfq_reset_inject_limit(bfqd, bfqq);
1994
+
1995
+ /*
1996
+ * The following conditions must hold to setup a new
1997
+ * sampling of total service time, and then a new
1998
+ * update of the inject limit:
1999
+ * - bfqq is in service, because the total service
2000
+ * time is evaluated only for the I/O requests of
2001
+ * the queues in service;
2002
+ * - this is the right occasion to compute or to
2003
+ * lower the baseline total service time, because
2004
+ * there are actually no requests in the drive,
2005
+ * or
2006
+ * the baseline total service time is available, and
2007
+ * this is the right occasion to compute the other
2008
+ * quantity needed to update the inject limit, i.e.,
2009
+ * the total service time caused by the amount of
2010
+ * injection allowed by the current value of the
2011
+ * limit. It is the right occasion because injection
2012
+ * has actually been performed during the service
2013
+ * hole, and there are still in-flight requests,
2014
+ * which are very likely to be exactly the injected
2015
+ * requests, or part of them;
2016
+ * - the minimum interval for sampling the total
2017
+ * service time and updating the inject limit has
2018
+ * elapsed.
2019
+ */
2020
+ if (bfqq == bfqd->in_service_queue &&
2021
+ (bfqd->rq_in_driver == 0 ||
2022
+ (bfqq->last_serv_time_ns > 0 &&
2023
+ bfqd->rqs_injected && bfqd->rq_in_driver > 0)) &&
2024
+ time_is_before_eq_jiffies(bfqq->decrease_time_jif +
2025
+ msecs_to_jiffies(10))) {
2026
+ bfqd->last_empty_occupied_ns = ktime_get_ns();
2027
+ /*
2028
+ * Start the state machine for measuring the
2029
+ * total service time of rq: setting
2030
+ * wait_dispatch will cause bfqd->waited_rq to
2031
+ * be set when rq will be dispatched.
2032
+ */
2033
+ bfqd->wait_dispatch = true;
2034
+ /*
2035
+ * If there is no I/O in service in the drive,
2036
+ * then possible injection occurred before the
2037
+ * arrival of rq will not affect the total
2038
+ * service time of rq. So the injection limit
2039
+ * must not be updated as a function of such
2040
+ * total service time, unless new injection
2041
+ * occurs before rq is completed. To have the
2042
+ * injection limit updated only in the latter
2043
+ * case, reset rqs_injected here (rqs_injected
2044
+ * will be set in case injection is performed
2045
+ * on bfqq before rq is completed).
2046
+ */
2047
+ if (bfqd->rq_in_driver == 0)
2048
+ bfqd->rqs_injected = false;
2049
+ }
2050
+ }
2051
+
16902052 elv_rb_add(&bfqq->sort_list, rq);
16912053
16922054 /*
....@@ -1698,8 +2060,9 @@
16982060
16992061 /*
17002062 * Adjust priority tree position, if next_rq changes.
2063
+ * See comments on bfq_pos_tree_add_move() for the unlikely().
17012064 */
1702
- if (prev != bfqq->next_rq)
2065
+ if (unlikely(!bfqd->nonrot_with_queueing && prev != bfqq->next_rq))
17032066 bfq_pos_tree_add_move(bfqd, bfqq);
17042067
17052068 if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... */
....@@ -1839,7 +2202,9 @@
18392202 bfqq->pos_root = NULL;
18402203 }
18412204 } else {
1842
- bfq_pos_tree_add_move(bfqd, bfqq);
2205
+ /* see comments on bfq_pos_tree_add_move() for the unlikely() */
2206
+ if (unlikely(!bfqd->nonrot_with_queueing))
2207
+ bfq_pos_tree_add_move(bfqd, bfqq);
18432208 }
18442209
18452210 if (rq->cmd_flags & REQ_META)
....@@ -1847,9 +2212,9 @@
18472212
18482213 }
18492214
1850
-static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
2215
+static bool bfq_bio_merge(struct request_queue *q, struct bio *bio,
2216
+ unsigned int nr_segs)
18512217 {
1852
- struct request_queue *q = hctx->queue;
18532218 struct bfq_data *bfqd = q->elevator->elevator_data;
18542219 struct request *free = NULL;
18552220 /*
....@@ -1864,13 +2229,20 @@
18642229
18652230 spin_lock_irq(&bfqd->lock);
18662231
1867
- if (bic)
2232
+ if (bic) {
2233
+ /*
2234
+ * Make sure cgroup info is uptodate for current process before
2235
+ * considering the merge.
2236
+ */
2237
+ bfq_bic_update_cgroup(bic, bio);
2238
+
18682239 bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf));
1869
- else
2240
+ } else {
18702241 bfqd->bio_bfqq = NULL;
2242
+ }
18712243 bfqd->bio_bic = bic;
18722244
1873
- ret = blk_mq_sched_try_merge(q, bio, &free);
2245
+ ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free);
18742246
18752247 if (free)
18762248 blk_mq_free_request(free);
....@@ -1888,13 +2260,14 @@
18882260 __rq = bfq_find_rq_fmerge(bfqd, bio, q);
18892261 if (__rq && elv_bio_merge_ok(__rq, bio)) {
18902262 *req = __rq;
2263
+
2264
+ if (blk_discard_mergable(__rq))
2265
+ return ELEVATOR_DISCARD_MERGE;
18912266 return ELEVATOR_FRONT_MERGE;
18922267 }
18932268
18942269 return ELEVATOR_NO_MERGE;
18952270 }
1896
-
1897
-static struct bfq_queue *bfq_init_rq(struct request *rq);
18982271
18992272 static void bfq_request_merged(struct request_queue *q, struct request *req,
19002273 enum elv_merge type)
....@@ -1904,7 +2277,7 @@
19042277 blk_rq_pos(req) <
19052278 blk_rq_pos(container_of(rb_prev(&req->rb_node),
19062279 struct request, rb_node))) {
1907
- struct bfq_queue *bfqq = bfq_init_rq(req);
2280
+ struct bfq_queue *bfqq = RQ_BFQQ(req);
19082281 struct bfq_data *bfqd;
19092282 struct request *prev, *next_rq;
19102283
....@@ -1929,7 +2302,12 @@
19292302 */
19302303 if (prev != bfqq->next_rq) {
19312304 bfq_updated_next_req(bfqd, bfqq);
1932
- bfq_pos_tree_add_move(bfqd, bfqq);
2305
+ /*
2306
+ * See comments on bfq_pos_tree_add_move() for
2307
+ * the unlikely().
2308
+ */
2309
+ if (unlikely(!bfqd->nonrot_with_queueing))
2310
+ bfq_pos_tree_add_move(bfqd, bfqq);
19332311 }
19342312 }
19352313 }
....@@ -1951,8 +2329,8 @@
19512329 static void bfq_requests_merged(struct request_queue *q, struct request *rq,
19522330 struct request *next)
19532331 {
1954
- struct bfq_queue *bfqq = bfq_init_rq(rq),
1955
- *next_bfqq = bfq_init_rq(next);
2332
+ struct bfq_queue *bfqq = RQ_BFQQ(rq),
2333
+ *next_bfqq = RQ_BFQQ(next);
19562334
19572335 if (!bfqq)
19582336 return;
....@@ -2131,6 +2509,14 @@
21312509 if (process_refs == 0 || new_process_refs == 0)
21322510 return NULL;
21332511
2512
+ /*
2513
+ * Make sure merged queues belong to the same parent. Parents could
2514
+ * have changed since the time we decided the two queues are suitable
2515
+ * for merging.
2516
+ */
2517
+ if (new_bfqq->entity.parent != bfqq->entity.parent)
2518
+ return NULL;
2519
+
21342520 bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
21352521 new_bfqq->pid);
21362522
....@@ -2155,6 +2541,15 @@
21552541 * are likely to increase the throughput.
21562542 */
21572543 bfqq->new_bfqq = new_bfqq;
2544
+ /*
2545
+ * The above assignment schedules the following redirections:
2546
+ * each time some I/O for bfqq arrives, the process that
2547
+ * generated that I/O is disassociated from bfqq and
2548
+ * associated with new_bfqq. Here we increases new_bfqq->ref
2549
+ * in advance, adding the number of processes that are
2550
+ * expected to be associated with new_bfqq as they happen to
2551
+ * issue I/O.
2552
+ */
21582553 new_bfqq->ref += process_refs;
21592554 return new_bfqq;
21602555 }
....@@ -2214,6 +2609,50 @@
22142609 {
22152610 struct bfq_queue *in_service_bfqq, *new_bfqq;
22162611
2612
+ /* if a merge has already been setup, then proceed with that first */
2613
+ if (bfqq->new_bfqq)
2614
+ return bfqq->new_bfqq;
2615
+
2616
+ /*
2617
+ * Do not perform queue merging if the device is non
2618
+ * rotational and performs internal queueing. In fact, such a
2619
+ * device reaches a high speed through internal parallelism
2620
+ * and pipelining. This means that, to reach a high
2621
+ * throughput, it must have many requests enqueued at the same
2622
+ * time. But, in this configuration, the internal scheduling
2623
+ * algorithm of the device does exactly the job of queue
2624
+ * merging: it reorders requests so as to obtain as much as
2625
+ * possible a sequential I/O pattern. As a consequence, with
2626
+ * the workload generated by processes doing interleaved I/O,
2627
+ * the throughput reached by the device is likely to be the
2628
+ * same, with and without queue merging.
2629
+ *
2630
+ * Disabling merging also provides a remarkable benefit in
2631
+ * terms of throughput. Merging tends to make many workloads
2632
+ * artificially more uneven, because of shared queues
2633
+ * remaining non empty for incomparably more time than
2634
+ * non-merged queues. This may accentuate workload
2635
+ * asymmetries. For example, if one of the queues in a set of
2636
+ * merged queues has a higher weight than a normal queue, then
2637
+ * the shared queue may inherit such a high weight and, by
2638
+ * staying almost always active, may force BFQ to perform I/O
2639
+ * plugging most of the time. This evidently makes it harder
2640
+ * for BFQ to let the device reach a high throughput.
2641
+ *
2642
+ * Finally, the likely() macro below is not used because one
2643
+ * of the two branches is more likely than the other, but to
2644
+ * have the code path after the following if() executed as
2645
+ * fast as possible for the case of a non rotational device
2646
+ * with queueing. We want it because this is the fastest kind
2647
+ * of device. On the opposite end, the likely() may lengthen
2648
+ * the execution time of BFQ for the case of slower devices
2649
+ * (rotational or at least without queueing). But in this case
2650
+ * the execution time of BFQ matters very little, if not at
2651
+ * all.
2652
+ */
2653
+ if (likely(bfqd->nonrot_with_queueing))
2654
+ return NULL;
2655
+
22172656 /*
22182657 * Prevent bfqq from being merged if it has been created too
22192658 * long ago. The idea is that true cooperating processes, and
....@@ -2228,14 +2667,11 @@
22282667 if (bfq_too_late_for_merging(bfqq))
22292668 return NULL;
22302669
2231
- if (bfqq->new_bfqq)
2232
- return bfqq->new_bfqq;
2233
-
22342670 if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq))
22352671 return NULL;
22362672
22372673 /* If there is only one backlogged queue, don't search. */
2238
- if (bfqd->busy_queues == 1)
2674
+ if (bfq_tot_busy_queues(bfqd) == 1)
22392675 return NULL;
22402676
22412677 in_service_bfqq = bfqd->in_service_queue;
....@@ -2277,6 +2713,7 @@
22772713 if (!bic)
22782714 return;
22792715
2716
+ bic->saved_weight = bfqq->entity.orig_weight;
22802717 bic->saved_ttime = bfqq->ttime;
22812718 bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq);
22822719 bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);
....@@ -2295,6 +2732,7 @@
22952732 * to enjoy weight raising if split soon.
22962733 */
22972734 bic->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff;
2735
+ bic->saved_wr_start_at_switch_to_srt = bfq_smallest_from_now();
22982736 bic->saved_wr_cur_max_time = bfq_wr_duration(bfqq->bfqd);
22992737 bic->saved_last_wr_start_finish = jiffies;
23002738 } else {
....@@ -2304,6 +2742,26 @@
23042742 bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish;
23052743 bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time;
23062744 }
2745
+}
2746
+
2747
+void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq)
2748
+{
2749
+ /*
2750
+ * To prevent bfqq's service guarantees from being violated,
2751
+ * bfqq may be left busy, i.e., queued for service, even if
2752
+ * empty (see comments in __bfq_bfqq_expire() for
2753
+ * details). But, if no process will send requests to bfqq any
2754
+ * longer, then there is no point in keeping bfqq queued for
2755
+ * service. In addition, keeping bfqq queued for service, but
2756
+ * with no process ref any longer, may have caused bfqq to be
2757
+ * freed when dequeued from service. But this is assumed to
2758
+ * never happen.
2759
+ */
2760
+ if (bfq_bfqq_busy(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list) &&
2761
+ bfqq != bfqd->in_service_queue)
2762
+ bfq_del_bfqq_busy(bfqd, bfqq, false);
2763
+
2764
+ bfq_put_queue(bfqq);
23072765 }
23082766
23092767 static void
....@@ -2365,9 +2823,18 @@
23652823 * assignment causes no harm).
23662824 */
23672825 new_bfqq->bic = NULL;
2826
+ /*
2827
+ * If the queue is shared, the pid is the pid of one of the associated
2828
+ * processes. Which pid depends on the exact sequence of merge events
2829
+ * the queue underwent. So printing such a pid is useless and confusing
2830
+ * because it reports a random pid between those of the associated
2831
+ * processes.
2832
+ * We mark such a queue with a pid -1, and then print SHARED instead of
2833
+ * a pid in logging messages.
2834
+ */
2835
+ new_bfqq->pid = -1;
23682836 bfqq->bic = NULL;
2369
- /* release process reference to bfqq */
2370
- bfq_put_queue(bfqq);
2837
+ bfq_release_process_ref(bfqd, bfqq);
23712838 }
23722839
23732840 static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq,
....@@ -2399,8 +2866,8 @@
23992866 /*
24002867 * bic still points to bfqq, then it has not yet been
24012868 * redirected to some other bfq_queue, and a queue
2402
- * merge beween bfqq and new_bfqq can be safely
2403
- * fulfillled, i.e., bic can be redirected to new_bfqq
2869
+ * merge between bfqq and new_bfqq can be safely
2870
+ * fulfilled, i.e., bic can be redirected to new_bfqq
24042871 * and bfqq can be put.
24052872 */
24062873 bfq_merge_bfqqs(bfqd, bfqd->bio_bic, bfqq,
....@@ -2535,12 +3002,14 @@
25353002 * queue).
25363003 */
25373004 if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 &&
2538
- bfq_symmetric_scenario(bfqd))
3005
+ !bfq_asymmetric_scenario(bfqd, bfqq))
25393006 sl = min_t(u64, sl, BFQ_MIN_TT);
25403007 else if (bfqq->wr_coeff > 1)
25413008 sl = max_t(u32, sl, 20ULL * NSEC_PER_MSEC);
25423009
25433010 bfqd->last_idling_start = ktime_get();
3011
+ bfqd->last_idling_start_jiffies = jiffies;
3012
+
25443013 hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl),
25453014 HRTIMER_MODE_REL);
25463015 bfqg_stats_set_start_idle_time(bfqq_group(bfqq));
....@@ -2764,7 +3233,7 @@
27643233
27653234 if ((bfqd->rq_in_driver > 0 ||
27663235 now_ns - bfqd->last_completion < BFQ_MIN_TT)
2767
- && get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR)
3236
+ && !BFQ_RQ_SEEKY(bfqd, bfqd->last_position, rq))
27683237 bfqd->sequential_samples++;
27693238
27703239 bfqd->tot_sectors_dispatched += blk_rq_sectors(rq);
....@@ -2816,7 +3285,209 @@
28163285 bfq_remove_request(q, rq);
28173286 }
28183287
2819
-static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
3288
+/*
3289
+ * There is a case where idling does not have to be performed for
3290
+ * throughput concerns, but to preserve the throughput share of
3291
+ * the process associated with bfqq.
3292
+ *
3293
+ * To introduce this case, we can note that allowing the drive
3294
+ * to enqueue more than one request at a time, and hence
3295
+ * delegating de facto final scheduling decisions to the
3296
+ * drive's internal scheduler, entails loss of control on the
3297
+ * actual request service order. In particular, the critical
3298
+ * situation is when requests from different processes happen
3299
+ * to be present, at the same time, in the internal queue(s)
3300
+ * of the drive. In such a situation, the drive, by deciding
3301
+ * the service order of the internally-queued requests, does
3302
+ * determine also the actual throughput distribution among
3303
+ * these processes. But the drive typically has no notion or
3304
+ * concern about per-process throughput distribution, and
3305
+ * makes its decisions only on a per-request basis. Therefore,
3306
+ * the service distribution enforced by the drive's internal
3307
+ * scheduler is likely to coincide with the desired throughput
3308
+ * distribution only in a completely symmetric, or favorably
3309
+ * skewed scenario where:
3310
+ * (i-a) each of these processes must get the same throughput as
3311
+ * the others,
3312
+ * (i-b) in case (i-a) does not hold, it holds that the process
3313
+ * associated with bfqq must receive a lower or equal
3314
+ * throughput than any of the other processes;
3315
+ * (ii) the I/O of each process has the same properties, in
3316
+ * terms of locality (sequential or random), direction
3317
+ * (reads or writes), request sizes, greediness
3318
+ * (from I/O-bound to sporadic), and so on;
3319
+
3320
+ * In fact, in such a scenario, the drive tends to treat the requests
3321
+ * of each process in about the same way as the requests of the
3322
+ * others, and thus to provide each of these processes with about the
3323
+ * same throughput. This is exactly the desired throughput
3324
+ * distribution if (i-a) holds, or, if (i-b) holds instead, this is an
3325
+ * even more convenient distribution for (the process associated with)
3326
+ * bfqq.
3327
+ *
3328
+ * In contrast, in any asymmetric or unfavorable scenario, device
3329
+ * idling (I/O-dispatch plugging) is certainly needed to guarantee
3330
+ * that bfqq receives its assigned fraction of the device throughput
3331
+ * (see [1] for details).
3332
+ *
3333
+ * The problem is that idling may significantly reduce throughput with
3334
+ * certain combinations of types of I/O and devices. An important
3335
+ * example is sync random I/O on flash storage with command
3336
+ * queueing. So, unless bfqq falls in cases where idling also boosts
3337
+ * throughput, it is important to check conditions (i-a), i(-b) and
3338
+ * (ii) accurately, so as to avoid idling when not strictly needed for
3339
+ * service guarantees.
3340
+ *
3341
+ * Unfortunately, it is extremely difficult to thoroughly check
3342
+ * condition (ii). And, in case there are active groups, it becomes
3343
+ * very difficult to check conditions (i-a) and (i-b) too. In fact,
3344
+ * if there are active groups, then, for conditions (i-a) or (i-b) to
3345
+ * become false 'indirectly', it is enough that an active group
3346
+ * contains more active processes or sub-groups than some other active
3347
+ * group. More precisely, for conditions (i-a) or (i-b) to become
3348
+ * false because of such a group, it is not even necessary that the
3349
+ * group is (still) active: it is sufficient that, even if the group
3350
+ * has become inactive, some of its descendant processes still have
3351
+ * some request already dispatched but still waiting for
3352
+ * completion. In fact, requests have still to be guaranteed their
3353
+ * share of the throughput even after being dispatched. In this
3354
+ * respect, it is easy to show that, if a group frequently becomes
3355
+ * inactive while still having in-flight requests, and if, when this
3356
+ * happens, the group is not considered in the calculation of whether
3357
+ * the scenario is asymmetric, then the group may fail to be
3358
+ * guaranteed its fair share of the throughput (basically because
3359
+ * idling may not be performed for the descendant processes of the
3360
+ * group, but it had to be). We address this issue with the following
3361
+ * bi-modal behavior, implemented in the function
3362
+ * bfq_asymmetric_scenario().
3363
+ *
3364
+ * If there are groups with requests waiting for completion
3365
+ * (as commented above, some of these groups may even be
3366
+ * already inactive), then the scenario is tagged as
3367
+ * asymmetric, conservatively, without checking any of the
3368
+ * conditions (i-a), (i-b) or (ii). So the device is idled for bfqq.
3369
+ * This behavior matches also the fact that groups are created
3370
+ * exactly if controlling I/O is a primary concern (to
3371
+ * preserve bandwidth and latency guarantees).
3372
+ *
3373
+ * On the opposite end, if there are no groups with requests waiting
3374
+ * for completion, then only conditions (i-a) and (i-b) are actually
3375
+ * controlled, i.e., provided that conditions (i-a) or (i-b) holds,
3376
+ * idling is not performed, regardless of whether condition (ii)
3377
+ * holds. In other words, only if conditions (i-a) and (i-b) do not
3378
+ * hold, then idling is allowed, and the device tends to be prevented
3379
+ * from queueing many requests, possibly of several processes. Since
3380
+ * there are no groups with requests waiting for completion, then, to
3381
+ * control conditions (i-a) and (i-b) it is enough to check just
3382
+ * whether all the queues with requests waiting for completion also
3383
+ * have the same weight.
3384
+ *
3385
+ * Not checking condition (ii) evidently exposes bfqq to the
3386
+ * risk of getting less throughput than its fair share.
3387
+ * However, for queues with the same weight, a further
3388
+ * mechanism, preemption, mitigates or even eliminates this
3389
+ * problem. And it does so without consequences on overall
3390
+ * throughput. This mechanism and its benefits are explained
3391
+ * in the next three paragraphs.
3392
+ *
3393
+ * Even if a queue, say Q, is expired when it remains idle, Q
3394
+ * can still preempt the new in-service queue if the next
3395
+ * request of Q arrives soon (see the comments on
3396
+ * bfq_bfqq_update_budg_for_activation). If all queues and
3397
+ * groups have the same weight, this form of preemption,
3398
+ * combined with the hole-recovery heuristic described in the
3399
+ * comments on function bfq_bfqq_update_budg_for_activation,
3400
+ * are enough to preserve a correct bandwidth distribution in
3401
+ * the mid term, even without idling. In fact, even if not
3402
+ * idling allows the internal queues of the device to contain
3403
+ * many requests, and thus to reorder requests, we can rather
3404
+ * safely assume that the internal scheduler still preserves a
3405
+ * minimum of mid-term fairness.
3406
+ *
3407
+ * More precisely, this preemption-based, idleless approach
3408
+ * provides fairness in terms of IOPS, and not sectors per
3409
+ * second. This can be seen with a simple example. Suppose
3410
+ * that there are two queues with the same weight, but that
3411
+ * the first queue receives requests of 8 sectors, while the
3412
+ * second queue receives requests of 1024 sectors. In
3413
+ * addition, suppose that each of the two queues contains at
3414
+ * most one request at a time, which implies that each queue
3415
+ * always remains idle after it is served. Finally, after
3416
+ * remaining idle, each queue receives very quickly a new
3417
+ * request. It follows that the two queues are served
3418
+ * alternatively, preempting each other if needed. This
3419
+ * implies that, although both queues have the same weight,
3420
+ * the queue with large requests receives a service that is
3421
+ * 1024/8 times as high as the service received by the other
3422
+ * queue.
3423
+ *
3424
+ * The motivation for using preemption instead of idling (for
3425
+ * queues with the same weight) is that, by not idling,
3426
+ * service guarantees are preserved (completely or at least in
3427
+ * part) without sacrificing throughput at all. And, if
3428
+ * there is no active group, then the primary expectation for
3429
+ * this device is probably a high throughput.
3430
+ *
3431
+ * We are now left only with explaining the two sub-conditions in the
3432
+ * additional compound condition that is checked below for deciding
3433
+ * whether the scenario is asymmetric. To explain the first
3434
+ * sub-condition, we need to add that the function
3435
+ * bfq_asymmetric_scenario checks the weights of only
3436
+ * non-weight-raised queues, for efficiency reasons (see comments on
3437
+ * bfq_weights_tree_add()). Then the fact that bfqq is weight-raised
3438
+ * is checked explicitly here. More precisely, the compound condition
3439
+ * below takes into account also the fact that, even if bfqq is being
3440
+ * weight-raised, the scenario is still symmetric if all queues with
3441
+ * requests waiting for completion happen to be
3442
+ * weight-raised. Actually, we should be even more precise here, and
3443
+ * differentiate between interactive weight raising and soft real-time
3444
+ * weight raising.
3445
+ *
3446
+ * The second sub-condition checked in the compound condition is
3447
+ * whether there is a fair amount of already in-flight I/O not
3448
+ * belonging to bfqq. If so, I/O dispatching is to be plugged, for the
3449
+ * following reason. The drive may decide to serve in-flight
3450
+ * non-bfqq's I/O requests before bfqq's ones, thereby delaying the
3451
+ * arrival of new I/O requests for bfqq (recall that bfqq is sync). If
3452
+ * I/O-dispatching is not plugged, then, while bfqq remains empty, a
3453
+ * basically uncontrolled amount of I/O from other queues may be
3454
+ * dispatched too, possibly causing the service of bfqq's I/O to be
3455
+ * delayed even longer in the drive. This problem gets more and more
3456
+ * serious as the speed and the queue depth of the drive grow,
3457
+ * because, as these two quantities grow, the probability of finding no
3458
+ * queue busy but many requests in flight grows too. By contrast,
3459
+ * plugging I/O dispatching minimizes the delay induced by already
3460
+ * in-flight I/O, and enables bfqq to recover the bandwidth it may
3461
+ * lose because of this delay.
3462
+ *
3463
+ * As a side note, it is worth considering that the above
3464
+ * device-idling countermeasures may however fail in the following
3465
+ * unlucky scenario: if I/O-dispatch plugging is (correctly) disabled
3466
+ * in a time period during which all symmetry sub-conditions hold, and
3467
+ * therefore the device is allowed to enqueue many requests, but at
3468
+ * some later point in time some sub-condition ceases to hold, then it
3469
+ * may become impossible to make requests be served in the desired
3470
+ * order until all the requests already queued in the device have been
3471
+ * served. The last sub-condition commented above somewhat mitigates
3472
+ * this problem for weight-raised queues.
3473
+ */
3474
+static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd,
3475
+ struct bfq_queue *bfqq)
3476
+{
3477
+ /* No point in idling for bfqq if it won't get requests any longer */
3478
+ if (unlikely(!bfqq_process_refs(bfqq)))
3479
+ return false;
3480
+
3481
+ return (bfqq->wr_coeff > 1 &&
3482
+ (bfqd->wr_busy_queues <
3483
+ bfq_tot_busy_queues(bfqd) ||
3484
+ bfqd->rq_in_driver >=
3485
+ bfqq->dispatched + 4)) ||
3486
+ bfq_asymmetric_scenario(bfqd, bfqq);
3487
+}
3488
+
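/*
 * A minimal userspace sketch of the IOPS-fairness example discussed in
 * the comment above: two equal-weight queues, one issuing 8-sector and
 * one issuing 1024-sector requests, each keeping at most one request
 * pending, are served alternately and thus get the same number of
 * requests per round; the sector-bandwidth ratio then equals the
 * request-size ratio. All names and numbers here are illustrative only.
 */
#include <stdio.h>

int main(void)
{
	const unsigned long long small_rq = 8;    /* sectors per request, queue A */
	const unsigned long long large_rq = 1024; /* sectors per request, queue B */
	const unsigned long long rounds = 1000;   /* alternating service rounds */

	/* One request per queue per round: identical IOPS for A and B. */
	unsigned long long sectors_a = rounds * small_rq;
	unsigned long long sectors_b = rounds * large_rq;

	printf("queue A: %llu sectors, queue B: %llu sectors\n",
	       sectors_a, sectors_b);
	printf("sector-bandwidth ratio B/A: %llu\n", sectors_b / sectors_a);
	return 0;
}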
3489
+static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq,
3490
+ enum bfqq_expiration reason)
28203491 {
28213492 /*
28223493 * If this bfqq is shared between multiple processes, check
....@@ -2827,7 +3498,22 @@
28273498 if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
28283499 bfq_mark_bfqq_split_coop(bfqq);
28293500
2830
- if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
3501
+ /*
3502
+ * Consider queues with a higher finish virtual time than
3503
+ * bfqq. If idling_needed_for_service_guarantees(bfqq) returns
3504
+ * true, then bfqq's bandwidth would be violated if an
3505
+ * uncontrolled amount of I/O from these queues were
3506
+ * dispatched while bfqq is waiting for its new I/O to
3507
+ * arrive. This is exactly what may happen if this is a forced
3508
+ * expiration caused by a preemption attempt, and if bfqq is
3509
+ * not re-scheduled. To prevent this from happening, re-queue
3510
+ * bfqq if it needs I/O-dispatch plugging, even if it is
3511
+ * empty. By doing so, bfqq is guaranteed to be served before the
3512
+ * above queues (provided that bfqq is of course eligible).
3513
+ */
3514
+ if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
3515
+ !(reason == BFQQE_PREEMPTED &&
3516
+ idling_needed_for_service_guarantees(bfqd, bfqq))) {
28313517 if (bfqq->dispatched == 0)
28323518 /*
28333519 * Overloading budget_timeout field to store
....@@ -2842,8 +3528,11 @@
28423528 bfq_requeue_bfqq(bfqd, bfqq, true);
28433529 /*
28443530 * Resort priority tree of potential close cooperators.
3531
+ * See comments on bfq_pos_tree_add_move() for the unlikely().
28453532 */
2846
- bfq_pos_tree_add_move(bfqd, bfqq);
3533
+ if (unlikely(!bfqd->nonrot_with_queueing &&
3534
+ !RB_EMPTY_ROOT(&bfqq->sort_list)))
3535
+ bfq_pos_tree_add_move(bfqd, bfqq);
28473536 }
28483537
28493538 /*
....@@ -3217,13 +3906,6 @@
32173906 jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4);
32183907 }
32193908
3220
-static bool bfq_bfqq_injectable(struct bfq_queue *bfqq)
3221
-{
3222
- return BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 &&
3223
- blk_queue_nonrot(bfqq->bfqd->queue) &&
3224
- bfqq->bfqd->hw_tag;
3225
-}
3226
-
32273909 /**
32283910 * bfq_bfqq_expire - expire a queue.
32293911 * @bfqd: device owning the queue.
....@@ -3299,16 +3981,32 @@
32993981 * requests, then the request pattern is isochronous
33003982 * (see the comments on the function
33013983 * bfq_bfqq_softrt_next_start()). Thus we can compute
3302
- * soft_rt_next_start. If, instead, the queue still
3303
- * has outstanding requests, then we have to wait for
3304
- * the completion of all the outstanding requests to
3305
- * discover whether the request pattern is actually
3306
- * isochronous.
3984
+ * soft_rt_next_start. And we do it, unless bfqq is in
3985
+ * interactive weight raising. We do not do it in the
3986
+ * latter subcase, for the following reason. bfqq may
3987
+ * be conveying the I/O needed to load a soft
3988
+ * real-time application. Such an application will
3989
+ * actually exhibit a soft real-time I/O pattern after
3990
+ * it finally starts doing its job. But, if
3991
+ * soft_rt_next_start is computed here for an
3992
+ * interactive bfqq, and bfqq had received a lot of
3993
+ * service before remaining with no outstanding
3994
+ * request (likely to happen on a fast device), then
3995
+ * soft_rt_next_start would be assigned such a high
3996
+ * value that, for a very long time, bfqq would be
3997
+ * prevented from being possibly considered as soft
3998
+ * real time.
3999
+ *
4000
+ * If, instead, the queue still has outstanding
4001
+ * requests, then we have to wait for the completion
4002
+ * of all the outstanding requests to discover whether
4003
+ * the request pattern is actually isochronous.
33074004 */
3308
- if (bfqq->dispatched == 0)
4005
+ if (bfqq->dispatched == 0 &&
4006
+ bfqq->wr_coeff != bfqd->bfq_wr_coeff)
33094007 bfqq->soft_rt_next_start =
33104008 bfq_bfqq_softrt_next_start(bfqd, bfqq);
3311
- else {
4009
+ else if (bfqq->dispatched > 0) {
33124010 /*
33134011 * Schedule an update of soft_rt_next_start to when
33144012 * the task may be discovered to be isochronous.
....@@ -3322,15 +4020,21 @@
33224020 slow, bfqq->dispatched, bfq_bfqq_has_short_ttime(bfqq));
33234021
33244022 /*
4023
+ * bfqq expired, so no total service time needs to be computed
4024
+ * any longer: reset state machine for measuring total service
4025
+ * times.
4026
+ */
4027
+ bfqd->rqs_injected = bfqd->wait_dispatch = false;
4028
+ bfqd->waited_rq = NULL;
4029
+
4030
+ /*
33254031 * Increase, decrease or leave budget unchanged according to
33264032 * reason.
33274033 */
33284034 __bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
3329
- if (__bfq_bfqq_expire(bfqd, bfqq))
4035
+ if (__bfq_bfqq_expire(bfqd, bfqq, reason))
33304036 /* bfqq is gone, no more actions on it */
33314037 return;
3332
-
3333
- bfqq->injected_service = 0;
33344038
33354039 /* mark bfqq as waiting a request only if a bic still points to it */
33364040 if (!bfq_bfqq_busy(bfqq) &&
....@@ -3399,52 +4103,16 @@
33994103 bfq_bfqq_budget_timeout(bfqq);
34004104 }
34014105
3402
-/*
3403
- * For a queue that becomes empty, device idling is allowed only if
3404
- * this function returns true for the queue. As a consequence, since
3405
- * device idling plays a critical role in both throughput boosting and
3406
- * service guarantees, the return value of this function plays a
3407
- * critical role in both these aspects as well.
3408
- *
3409
- * In a nutshell, this function returns true only if idling is
3410
- * beneficial for throughput or, even if detrimental for throughput,
3411
- * idling is however necessary to preserve service guarantees (low
3412
- * latency, desired throughput distribution, ...). In particular, on
3413
- * NCQ-capable devices, this function tries to return false, so as to
3414
- * help keep the drives' internal queues full, whenever this helps the
3415
- * device boost the throughput without causing any service-guarantee
3416
- * issue.
3417
- *
3418
- * In more detail, the return value of this function is obtained by,
3419
- * first, computing a number of boolean variables that take into
3420
- * account throughput and service-guarantee issues, and, then,
3421
- * combining these variables in a logical expression. Most of the
3422
- * issues taken into account are not trivial. We discuss these issues
3423
- * individually while introducing the variables.
3424
- */
3425
-static bool bfq_better_to_idle(struct bfq_queue *bfqq)
4106
+static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd,
4107
+ struct bfq_queue *bfqq)
34264108 {
3427
- struct bfq_data *bfqd = bfqq->bfqd;
34284109 bool rot_without_queueing =
34294110 !blk_queue_nonrot(bfqd->queue) && !bfqd->hw_tag,
34304111 bfqq_sequential_and_IO_bound,
3431
- idling_boosts_thr, idling_boosts_thr_without_issues,
3432
- idling_needed_for_service_guarantees,
3433
- asymmetric_scenario;
4112
+ idling_boosts_thr;
34344113
3435
- if (bfqd->strict_guarantees)
3436
- return true;
3437
-
3438
- /*
3439
- * Idling is performed only if slice_idle > 0. In addition, we
3440
- * do not idle if
3441
- * (a) bfqq is async
3442
- * (b) bfqq is in the idle io prio class: in this case we do
3443
- * not idle because we want to minimize the bandwidth that
3444
- * queues in this class can steal to higher-priority queues
3445
- */
3446
- if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_sync(bfqq) ||
3447
- bfq_class_idle(bfqq))
4114
+ /* No point in idling for bfqq if it won't get requests any longer */
4115
+ if (unlikely(!bfqq_process_refs(bfqq)))
34484116 return false;
34494117
34504118 bfqq_sequential_and_IO_bound = !BFQQ_SEEKY(bfqq) &&
....@@ -3477,8 +4145,7 @@
34774145 bfqq_sequential_and_IO_bound);
34784146
34794147 /*
3480
- * The value of the next variable,
3481
- * idling_boosts_thr_without_issues, is equal to that of
4148
+ * The return value of this function is equal to that of
34824149 * idling_boosts_thr, unless a special case holds. In this
34834150 * special case, described below, idling may cause problems to
34844151 * weight-raised queues.
....@@ -3495,217 +4162,85 @@
34954162 * which enqueue several requests in advance, and further
34964163 * reorder internally-queued requests.
34974164 *
3498
- * For this reason, we force to false the value of
3499
- * idling_boosts_thr_without_issues if there are weight-raised
3500
- * busy queues. In this case, and if bfqq is not weight-raised,
3501
- * this guarantees that the device is not idled for bfqq (if,
3502
- * instead, bfqq is weight-raised, then idling will be
3503
- * guaranteed by another variable, see below). Combined with
3504
- * the timestamping rules of BFQ (see [1] for details), this
3505
- * behavior causes bfqq, and hence any sync non-weight-raised
3506
- * queue, to get a lower number of requests served, and thus
3507
- * to ask for a lower number of requests from the request
3508
- * pool, before the busy weight-raised queues get served
3509
- * again. This often mitigates starvation problems in the
3510
- * presence of heavy write workloads and NCQ, thereby
3511
- * guaranteeing a higher application and system responsiveness
3512
- * in these hostile scenarios.
4165
+ * For this reason, we force to false the return value if
4166
+ * there are weight-raised busy queues. In this case, and if
4167
+ * bfqq is not weight-raised, this guarantees that the device
4168
+ * is not idled for bfqq (if, instead, bfqq is weight-raised,
4169
+ * then idling will be guaranteed by another variable, see
4170
+ * below). Combined with the timestamping rules of BFQ (see
4171
+ * [1] for details), this behavior causes bfqq, and hence any
4172
+ * sync non-weight-raised queue, to get a lower number of
4173
+ * requests served, and thus to ask for a lower number of
4174
+ * requests from the request pool, before the busy
4175
+ * weight-raised queues get served again. This often mitigates
4176
+ * starvation problems in the presence of heavy write
4177
+ * workloads and NCQ, thereby guaranteeing a higher
4178
+ * application and system responsiveness in these hostile
4179
+ * scenarios.
35134180 */
3514
- idling_boosts_thr_without_issues = idling_boosts_thr &&
4181
+ return idling_boosts_thr &&
35154182 bfqd->wr_busy_queues == 0;
4183
+}
4184
+
4185
+/*
4186
+ * For a queue that becomes empty, device idling is allowed only if
4187
+ * this function returns true for that queue. As a consequence, since
4188
+ * device idling plays a critical role for both throughput boosting
4189
+ * and service guarantees, the return value of this function plays a
4190
+ * critical role as well.
4191
+ *
4192
+ * In a nutshell, this function returns true only if idling is
4193
+ * beneficial for throughput or, even if detrimental for throughput,
4194
+ * idling is however necessary to preserve service guarantees (low
4195
+ * latency, desired throughput distribution, ...). In particular, on
4196
+ * NCQ-capable devices, this function tries to return false, so as to
4197
+ * help keep the drives' internal queues full, whenever this helps the
4198
+ * device boost the throughput without causing any service-guarantee
4199
+ * issue.
4200
+ *
4201
+ * Most of the issues taken into account to get the return value of
4202
+ * this function are not trivial. We discuss these issues in the two
4203
+ * functions providing the main pieces of information needed by this
4204
+ * function.
4205
+ */
4206
+static bool bfq_better_to_idle(struct bfq_queue *bfqq)
4207
+{
4208
+ struct bfq_data *bfqd = bfqq->bfqd;
4209
+ bool idling_boosts_thr_with_no_issue, idling_needed_for_service_guar;
4210
+
4211
+ /* No point in idling for bfqq if it won't get requests any longer */
4212
+ if (unlikely(!bfqq_process_refs(bfqq)))
4213
+ return false;
4214
+
4215
+ if (unlikely(bfqd->strict_guarantees))
4216
+ return true;
35164217
35174218 /*
3518
- * There is then a case where idling must be performed not
3519
- * for throughput concerns, but to preserve service
3520
- * guarantees.
3521
- *
3522
- * To introduce this case, we can note that allowing the drive
3523
- * to enqueue more than one request at a time, and hence
3524
- * delegating de facto final scheduling decisions to the
3525
- * drive's internal scheduler, entails loss of control on the
3526
- * actual request service order. In particular, the critical
3527
- * situation is when requests from different processes happen
3528
- * to be present, at the same time, in the internal queue(s)
3529
- * of the drive. In such a situation, the drive, by deciding
3530
- * the service order of the internally-queued requests, does
3531
- * determine also the actual throughput distribution among
3532
- * these processes. But the drive typically has no notion or
3533
- * concern about per-process throughput distribution, and
3534
- * makes its decisions only on a per-request basis. Therefore,
3535
- * the service distribution enforced by the drive's internal
3536
- * scheduler is likely to coincide with the desired
3537
- * device-throughput distribution only in a completely
3538
- * symmetric scenario where:
3539
- * (i) each of these processes must get the same throughput as
3540
- * the others;
3541
- * (ii) the I/O of each process has the same properties, in
3542
- * terms of locality (sequential or random), direction
3543
- * (reads or writes), request sizes, greediness
3544
- * (from I/O-bound to sporadic), and so on.
3545
- * In fact, in such a scenario, the drive tends to treat
3546
- * the requests of each of these processes in about the same
3547
- * way as the requests of the others, and thus to provide
3548
- * each of these processes with about the same throughput
3549
- * (which is exactly the desired throughput distribution). In
3550
- * contrast, in any asymmetric scenario, device idling is
3551
- * certainly needed to guarantee that bfqq receives its
3552
- * assigned fraction of the device throughput (see [1] for
3553
- * details).
3554
- * The problem is that idling may significantly reduce
3555
- * throughput with certain combinations of types of I/O and
3556
- * devices. An important example is sync random I/O, on flash
3557
- * storage with command queueing. So, unless bfqq falls in the
3558
- * above cases where idling also boosts throughput, it would
3559
- * be important to check conditions (i) and (ii) accurately,
3560
- * so as to avoid idling when not strictly needed for service
3561
- * guarantees.
3562
- *
3563
- * Unfortunately, it is extremely difficult to thoroughly
3564
- * check condition (ii). And, in case there are active groups,
3565
- * it becomes very difficult to check condition (i) too. In
3566
- * fact, if there are active groups, then, for condition (i)
3567
- * to become false, it is enough that an active group contains
3568
- * more active processes or sub-groups than some other active
3569
- * group. More precisely, for condition (i) to hold because of
3570
- * such a group, it is not even necessary that the group is
3571
- * (still) active: it is sufficient that, even if the group
3572
- * has become inactive, some of its descendant processes still
3573
- * have some request already dispatched but still waiting for
3574
- * completion. In fact, requests have still to be guaranteed
3575
- * their share of the throughput even after being
3576
- * dispatched. In this respect, it is easy to show that, if a
3577
- * group frequently becomes inactive while still having
3578
- * in-flight requests, and if, when this happens, the group is
3579
- * not considered in the calculation of whether the scenario
3580
- * is asymmetric, then the group may fail to be guaranteed its
3581
- * fair share of the throughput (basically because idling may
3582
- * not be performed for the descendant processes of the group,
3583
- * but it had to be). We address this issue with the
3584
- * following bi-modal behavior, implemented in the function
3585
- * bfq_symmetric_scenario().
3586
- *
3587
- * If there are groups with requests waiting for completion
3588
- * (as commented above, some of these groups may even be
3589
- * already inactive), then the scenario is tagged as
3590
- * asymmetric, conservatively, without checking any of the
3591
- * conditions (i) and (ii). So the device is idled for bfqq.
3592
- * This behavior matches also the fact that groups are created
3593
- * exactly if controlling I/O is a primary concern (to
3594
- * preserve bandwidth and latency guarantees).
3595
- *
3596
- * On the opposite end, if there are no groups with requests
3597
- * waiting for completion, then only condition (i) is actually
3598
- * controlled, i.e., provided that condition (i) holds, idling
3599
- * is not performed, regardless of whether condition (ii)
3600
- * holds. In other words, only if condition (i) does not hold,
3601
- * then idling is allowed, and the device tends to be
3602
- * prevented from queueing many requests, possibly of several
3603
- * processes. Since there are no groups with requests waiting
3604
- * for completion, then, to control condition (i) it is enough
3605
- * to check just whether all the queues with requests waiting
3606
- * for completion also have the same weight.
3607
- *
3608
- * Not checking condition (ii) evidently exposes bfqq to the
3609
- * risk of getting less throughput than its fair share.
3610
- * However, for queues with the same weight, a further
3611
- * mechanism, preemption, mitigates or even eliminates this
3612
- * problem. And it does so without consequences on overall
3613
- * throughput. This mechanism and its benefits are explained
3614
- * in the next three paragraphs.
3615
- *
3616
- * Even if a queue, say Q, is expired when it remains idle, Q
3617
- * can still preempt the new in-service queue if the next
3618
- * request of Q arrives soon (see the comments on
3619
- * bfq_bfqq_update_budg_for_activation). If all queues and
3620
- * groups have the same weight, this form of preemption,
3621
- * combined with the hole-recovery heuristic described in the
3622
- * comments on function bfq_bfqq_update_budg_for_activation,
3623
- * are enough to preserve a correct bandwidth distribution in
3624
- * the mid term, even without idling. In fact, even if not
3625
- * idling allows the internal queues of the device to contain
3626
- * many requests, and thus to reorder requests, we can rather
3627
- * safely assume that the internal scheduler still preserves a
3628
- * minimum of mid-term fairness.
3629
- *
3630
- * More precisely, this preemption-based, idleless approach
3631
- * provides fairness in terms of IOPS, and not sectors per
3632
- * second. This can be seen with a simple example. Suppose
3633
- * that there are two queues with the same weight, but that
3634
- * the first queue receives requests of 8 sectors, while the
3635
- * second queue receives requests of 1024 sectors. In
3636
- * addition, suppose that each of the two queues contains at
3637
- * most one request at a time, which implies that each queue
3638
- * always remains idle after it is served. Finally, after
3639
- * remaining idle, each queue receives very quickly a new
3640
- * request. It follows that the two queues are served
3641
- * alternatively, preempting each other if needed. This
3642
- * implies that, although both queues have the same weight,
3643
- * the queue with large requests receives a service that is
3644
- * 1024/8 times as high as the service received by the other
3645
- * queue.
3646
- *
3647
- * The motivation for using preemption instead of idling (for
3648
- * queues with the same weight) is that, by not idling,
3649
- * service guarantees are preserved (completely or at least in
3650
- * part) without minimally sacrificing throughput. And, if
3651
- * there is no active group, then the primary expectation for
3652
- * this device is probably a high throughput.
3653
- *
3654
- * We are now left only with explaining the additional
3655
- * compound condition that is checked below for deciding
3656
- * whether the scenario is asymmetric. To explain this
3657
- * compound condition, we need to add that the function
3658
- * bfq_symmetric_scenario checks the weights of only
3659
- * non-weight-raised queues, for efficiency reasons (see
3660
- * comments on bfq_weights_tree_add()). Then the fact that
3661
- * bfqq is weight-raised is checked explicitly here. More
3662
- * precisely, the compound condition below takes into account
3663
- * also the fact that, even if bfqq is being weight-raised,
3664
- * the scenario is still symmetric if all queues with requests
3665
- * waiting for completion happen to be
3666
- * weight-raised. Actually, we should be even more precise
3667
- * here, and differentiate between interactive weight raising
3668
- * and soft real-time weight raising.
3669
- *
3670
- * As a side note, it is worth considering that the above
3671
- * device-idling countermeasures may however fail in the
3672
- * following unlucky scenario: if idling is (correctly)
3673
- * disabled in a time period during which all symmetry
3674
- * sub-conditions hold, and hence the device is allowed to
3675
- * enqueue many requests, but at some later point in time some
3676
- * sub-condition stops to hold, then it may become impossible
3677
- * to let requests be served in the desired order until all
3678
- * the requests already queued in the device have been served.
4219
+ * Idling is performed only if slice_idle > 0. In addition, we
4220
+ * do not idle if
4221
+ * (a) bfqq is async
4222
+ * (b) bfqq is in the idle io prio class: in this case we do
4223
+ * not idle because we want to minimize the bandwidth that
4224
+ * queues in this class can steal to higher-priority queues
36794225 */
3680
- asymmetric_scenario = (bfqq->wr_coeff > 1 &&
3681
- bfqd->wr_busy_queues < bfqd->busy_queues) ||
3682
- !bfq_symmetric_scenario(bfqd);
4226
+ if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_sync(bfqq) ||
4227
+ bfq_class_idle(bfqq))
4228
+ return false;
4229
+
4230
+ idling_boosts_thr_with_no_issue =
4231
+ idling_boosts_thr_without_issues(bfqd, bfqq);
4232
+
4233
+ idling_needed_for_service_guar =
4234
+ idling_needed_for_service_guarantees(bfqd, bfqq);
36834235
36844236 /*
3685
- * Finally, there is a case where maximizing throughput is the
3686
- * best choice even if it may cause unfairness toward
3687
- * bfqq. Such a case is when bfqq became active in a burst of
3688
- * queue activations. Queues that became active during a large
3689
- * burst benefit only from throughput, as discussed in the
3690
- * comments on bfq_handle_burst. Thus, if bfqq became active
3691
- * in a burst and not idling the device maximizes throughput,
3692
- * then the device must no be idled, because not idling the
3693
- * device provides bfqq and all other queues in the burst with
3694
- * maximum benefit. Combining this and the above case, we can
3695
- * now establish when idling is actually needed to preserve
3696
- * service guarantees.
3697
- */
3698
- idling_needed_for_service_guarantees =
3699
- asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq);
3700
-
3701
- /*
3702
- * We have now all the components we need to compute the
4237
+ * We have now the two components we need to compute the
37034238 * return value of the function, which is true only if idling
37044239 * either boosts the throughput (without issues), or is
37054240 * necessary to preserve service guarantees.
37064241 */
3707
- return idling_boosts_thr_without_issues ||
3708
- idling_needed_for_service_guarantees;
4242
+ return idling_boosts_thr_with_no_issue ||
4243
+ idling_needed_for_service_guar;
37094244 }
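/*
 * A minimal userspace-style sketch of the decision flow implemented by
 * bfq_better_to_idle() above, with the results of the two helper
 * functions flattened into booleans. The struct and all names below are
 * illustrative stand-ins, not BFQ's actual data structures.
 */
#include <stdbool.h>

struct idle_decision {
	unsigned int process_refs;	/* processes still attached to the queue */
	bool strict_guarantees;		/* user requested strict fairness */
	unsigned int slice_idle;	/* 0 disables idling altogether */
	bool sync;			/* queue carries sync I/O */
	bool idle_class;		/* queue is in the idle I/O-priority class */
	bool boosts_thr_without_issues;	/* throughput-side helper result */
	bool needed_for_service_guar;	/* service-guarantee helper result */
};

static bool better_to_idle_sketch(const struct idle_decision *d)
{
	if (d->process_refs == 0)	/* nobody will send new I/O */
		return false;
	if (d->strict_guarantees)	/* always idle when asked to */
		return true;
	/* Never idle for async or idle-class queues, or with idling off. */
	if (d->slice_idle == 0 || !d->sync || d->idle_class)
		return false;
	return d->boosts_thr_without_issues || d->needed_for_service_guar;
}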
37104245
37114246 /*
....@@ -3724,26 +4259,98 @@
37244259 return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_better_to_idle(bfqq);
37254260 }
37264261
3727
-static struct bfq_queue *bfq_choose_bfqq_for_injection(struct bfq_data *bfqd)
4262
+/*
4263
+ * This function chooses the queue from which to pick the next extra
4264
+ * I/O request to inject, if it finds a compatible queue. See the
4265
+ * comments on bfq_update_inject_limit() for details on the injection
4266
+ * mechanism, and for the definitions of the quantities mentioned
4267
+ * below.
4268
+ */
4269
+static struct bfq_queue *
4270
+bfq_choose_bfqq_for_injection(struct bfq_data *bfqd)
37284271 {
3729
- struct bfq_queue *bfqq;
4272
+ struct bfq_queue *bfqq, *in_serv_bfqq = bfqd->in_service_queue;
4273
+ unsigned int limit = in_serv_bfqq->inject_limit;
4274
+ /*
4275
+ * If
4276
+ * - bfqq is not weight-raised and therefore does not carry
4277
+ * time-critical I/O,
4278
+ * or
4279
+ * - regardless of whether bfqq is weight-raised, bfqq has
4280
+ * however a long think time, during which it can absorb the
4281
+ * effect of an appropriate number of extra I/O requests
4282
+ * from other queues (see bfq_update_inject_limit for
4283
+ * details on the computation of this number);
4284
+ * then injection can be performed without restrictions.
4285
+ */
4286
+ bool in_serv_always_inject = in_serv_bfqq->wr_coeff == 1 ||
4287
+ !bfq_bfqq_has_short_ttime(in_serv_bfqq);
37304288
37314289 /*
3732
- * A linear search; but, with a high probability, very few
3733
- * steps are needed to find a candidate queue, i.e., a queue
3734
- * with enough budget left for its next request. In fact:
4290
+ * If
4291
+ * - the baseline total service time could not be sampled yet,
4292
+ * so the inject limit happens to be still 0, and
4293
+ * - a lot of time has elapsed since the plugging of I/O
4294
+ * dispatching started, so drive speed is being wasted
4295
+ * significantly;
4296
+ * then temporarily raise inject limit to one request.
4297
+ */
4298
+ if (limit == 0 && in_serv_bfqq->last_serv_time_ns == 0 &&
4299
+ bfq_bfqq_wait_request(in_serv_bfqq) &&
4300
+ time_is_before_eq_jiffies(bfqd->last_idling_start_jiffies +
4301
+ bfqd->bfq_slice_idle)
4302
+ )
4303
+ limit = 1;
4304
+
4305
+ if (bfqd->rq_in_driver >= limit)
4306
+ return NULL;
4307
+
4308
+ /*
4309
+ * Linear search of the source queue for injection; but, with
4310
+ * a high probability, very few steps are needed to find a
4311
+ * candidate queue, i.e., a queue with enough budget left for
4312
+ * its next request. In fact:
37354313 * - BFQ dynamically updates the budget of every queue so as
37364314 * to accommodate the expected backlog of the queue;
37374315 * - if a queue gets all its requests dispatched as injected
37384316 * service, then the queue is removed from the active list
3739
- * (and re-added only if it gets new requests, but with
3740
- * enough budget for its new backlog).
4317
+ * (and re-added only if it gets new requests, but then it
4318
+ * is assigned again enough budget for its new backlog).
37414319 */
37424320 list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
37434321 if (!RB_EMPTY_ROOT(&bfqq->sort_list) &&
4322
+ (in_serv_always_inject || bfqq->wr_coeff > 1) &&
37444323 bfq_serv_to_charge(bfqq->next_rq, bfqq) <=
3745
- bfq_bfqq_budget_left(bfqq))
3746
- return bfqq;
4324
+ bfq_bfqq_budget_left(bfqq)) {
4325
+ /*
4326
+ * Allow for only one large in-flight request
4327
+ * on non-rotational devices, for the
4328
+ * following reason. On non-rotationl drives,
4329
+ * large requests take much longer than
4330
+ * smaller requests to be served. In addition,
4331
+ * the drive prefers to serve large requests
4332
+ * over small ones, if it can choose. So,
4333
+ * having more than one large request queued
4334
+ * in the drive may easily make the next
4335
+ * request of the in-service queue wait so
4336
+ * long as to break bfqq's service guarantees. On
4337
+ * the bright side, large requests let the
4338
+ * drive reach a very high throughput, even if
4339
+ * there is only one in-flight large request
4340
+ * at a time.
4341
+ */
4342
+ if (blk_queue_nonrot(bfqd->queue) &&
4343
+ blk_rq_sectors(bfqq->next_rq) >=
4344
+ BFQQ_SECT_THR_NONROT)
4345
+ limit = min_t(unsigned int, 1, limit);
4346
+ else
4347
+ limit = in_serv_bfqq->inject_limit;
4348
+
4349
+ if (bfqd->rq_in_driver < limit) {
4350
+ bfqd->rqs_injected = true;
4351
+ return bfqq;
4352
+ }
4353
+ }
37474354
37484355 return NULL;
37494356 }
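/*
 * A condensed, userspace-style sketch of the in-flight limit applied by
 * bfq_choose_bfqq_for_injection() above: start from the in-service
 * queue's inject_limit, temporarily allow one request if the baseline
 * service time is still unsampled and idling has lasted too long, and
 * allow at most one large in-flight request on non-rotational drives.
 * The field names are simplified stand-ins for the bfqd/bfqq state.
 */
#include <stdbool.h>

struct injection_state {
	unsigned int inject_limit;	/* per-queue learned limit */
	bool baseline_unknown;		/* last_serv_time_ns == 0 */
	bool waited_too_long;		/* idling lasted longer than slice_idle */
	bool nonrot;			/* non-rotational device */
	bool candidate_rq_is_large;	/* next request exceeds the sector threshold */
	unsigned int rq_in_driver;	/* requests already dispatched to the drive */
};

static bool may_inject_now(const struct injection_state *s)
{
	unsigned int limit = s->inject_limit;

	/* Do not let a still-unsampled limit waste drive speed forever. */
	if (limit == 0 && s->baseline_unknown && s->waited_too_long)
		limit = 1;

	/* On flash, a single large in-flight request is already enough. */
	if (s->nonrot && s->candidate_rq_is_large && limit > 1)
		limit = 1;

	return s->rq_in_driver < limit;
}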
....@@ -3830,14 +4437,105 @@
38304437 * for a new request, or has requests waiting for a completion and
38314438 * may idle after their completion, then keep it anyway.
38324439 *
3833
- * Yet, to boost throughput, inject service from other queues if
3834
- * possible.
4440
+ * Yet, inject service from other queues if it boosts
4441
+ * throughput and is possible.
38354442 */
38364443 if (bfq_bfqq_wait_request(bfqq) ||
38374444 (bfqq->dispatched != 0 && bfq_better_to_idle(bfqq))) {
3838
- if (bfq_bfqq_injectable(bfqq) &&
3839
- bfqq->injected_service * bfqq->inject_coeff <
3840
- bfqq->entity.service * 10)
4445
+ struct bfq_queue *async_bfqq =
4446
+ bfqq->bic && bfqq->bic->bfqq[0] &&
4447
+ bfq_bfqq_busy(bfqq->bic->bfqq[0]) &&
4448
+ bfqq->bic->bfqq[0]->next_rq ?
4449
+ bfqq->bic->bfqq[0] : NULL;
4450
+
4451
+ /*
4452
+ * The next three mutually-exclusive ifs decide
4453
+ * whether to try injection, and choose the queue to
4454
+ * pick an I/O request from.
4455
+ *
4456
+ * The first if checks whether the process associated
4457
+ * with bfqq has also async I/O pending. If so, it
4458
+ * injects such I/O unconditionally. Injecting async
4459
+ * I/O from the same process can cause no harm to the
4460
+ * process. On the contrary, it can only increase
4461
+ * bandwidth and reduce latency for the process.
4462
+ *
4463
+ * The second if checks whether there happens to be a
4464
+ * non-empty waker queue for bfqq, i.e., a queue whose
4465
+ * I/O needs to be completed for bfqq to receive new
4466
+ * I/O. This happens, e.g., if bfqq is associated with
4467
+ * a process that does some sync. A sync generates
4468
+ * extra blocking I/O, which must be completed before
4469
+ * the process associated with bfqq can go on with its
4470
+ * I/O. If the I/O of the waker queue is not served,
4471
+ * then bfqq remains empty, and no I/O is dispatched,
4472
+ * until the idle timeout fires for bfqq. This is
4473
+ * likely to result in lower bandwidth and higher
4474
+ * latencies for bfqq, and in a severe loss of total
4475
+ * throughput. The best action to take is therefore to
4476
+ * serve the waker queue as soon as possible. So do it
4477
+ * (without relying on the third alternative below for
4478
+ * eventually serving waker_bfqq's I/O; see the last
4479
+ * paragraph for further details). This systematic
4480
+ * injection of I/O from the waker queue does not
4481
+ * cause any delay to bfqq's I/O. On the contrary,
4482
+ * bfqq's next I/O is brought forward dramatically,
4483
+ * for it is not blocked for milliseconds.
4484
+ *
4485
+ * The third if checks whether bfqq is a queue for
4486
+ * which it is better to avoid injection. It is so if
4487
+ * bfqq delivers more throughput when served without
4488
+ * any further I/O from other queues in the middle, or
4489
+ * if the service times of bfqq's I/O requests both
4490
+ * count more than overall throughput, and may be
4491
+ * easily increased by injection (this happens if bfqq
4492
+ * has a short think time). If none of these
4493
+ * conditions holds, then a candidate queue for
4494
+ * injection is looked for through
4495
+ * bfq_choose_bfqq_for_injection(). Note that the
4496
+ * latter may return NULL (for example if the inject
4497
+ * limit for bfqq is currently 0).
4498
+ *
4499
+ * NOTE: motivation for the second alternative
4500
+ *
4501
+ * Thanks to the way the inject limit is updated in
4502
+ * bfq_update_has_short_ttime(), it is rather likely
4503
+ * that, if I/O is being plugged for bfqq and the
4504
+ * waker queue has pending I/O requests that are
4505
+ * blocking bfqq's I/O, then the third alternative
4506
+ * above lets the waker queue get served before the
4507
+ * I/O-plugging timeout fires. So one may deem the
4508
+ * second alternative superfluous. It is not, because
4509
+ * the third alternative may be way less effective in
4510
+ * case of a synchronization. For two main
4511
+ * reasons. First, throughput may be low because the
4512
+ * inject limit may be too low to guarantee the same
4513
+ * amount of injected I/O, from the waker queue or
4514
+ * other queues, that the second alternative
4515
+ * guarantees (the second alternative unconditionally
4516
+ * injects a pending I/O request of the waker queue
4517
+ * for each bfq_dispatch_request()). Second, with the
4518
+ * third alternative, the duration of the plugging,
4519
+ * i.e., the time before bfqq finally receives new I/O,
4520
+ * may not be minimized, because the waker queue may
4521
+ * happen to be served only after other queues.
4522
+ */
4523
+ if (async_bfqq &&
4524
+ icq_to_bic(async_bfqq->next_rq->elv.icq) == bfqq->bic &&
4525
+ bfq_serv_to_charge(async_bfqq->next_rq, async_bfqq) <=
4526
+ bfq_bfqq_budget_left(async_bfqq))
4527
+ bfqq = bfqq->bic->bfqq[0];
4528
+ else if (bfq_bfqq_has_waker(bfqq) &&
4529
+ bfq_bfqq_busy(bfqq->waker_bfqq) &&
4530
+ bfqq->waker_bfqq->next_rq &&
4531
+ bfq_serv_to_charge(bfqq->waker_bfqq->next_rq,
4532
+ bfqq->waker_bfqq) <=
4533
+ bfq_bfqq_budget_left(bfqq->waker_bfqq)
4534
+ )
4535
+ bfqq = bfqq->waker_bfqq;
4536
+ else if (!idling_boosts_thr_without_issues(bfqd, bfqq) &&
4537
+ (bfqq->wr_coeff == 1 || bfqd->wr_busy_queues > 1 ||
4538
+ !bfq_bfqq_has_short_ttime(bfqq)))
38414539 bfqq = bfq_choose_bfqq_for_injection(bfqd);
38424540 else
38434541 bfqq = NULL;
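/*
 * A schematic sketch of the three-way choice made by the if chain
 * above when extra I/O may be injected: prefer the process's own async
 * queue, then a busy waker queue whose I/O fits its budget, then a
 * generic candidate found by bfq_choose_bfqq_for_injection(). The types
 * and helpers below are illustrative placeholders, not the real
 * bfq_queue machinery.
 */
#include <stddef.h>
#include <stdbool.h>

struct sketch_queue;

struct injection_choice {
	struct sketch_queue *own_async;	  /* same process, async, within budget */
	struct sketch_queue *busy_waker;  /* waker queue with fitting I/O */
	struct sketch_queue *generic;	  /* result of the generic search */
	bool injection_may_help;	  /* idling does not boost throughput */
};

static struct sketch_queue *pick_injection_source(const struct injection_choice *c)
{
	if (c->own_async)		/* cannot harm the owning process */
		return c->own_async;
	if (c->busy_waker)		/* unblocks the plugged queue's I/O */
		return c->busy_waker;
	if (c->injection_may_help)	/* fall back to the generic search */
		return c->generic;
	return NULL;			/* better to keep waiting */
}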
....@@ -3929,15 +4627,15 @@
39294627
39304628 bfq_bfqq_served(bfqq, service_to_charge);
39314629
4630
+ if (bfqq == bfqd->in_service_queue && bfqd->wait_dispatch) {
4631
+ bfqd->wait_dispatch = false;
4632
+ bfqd->waited_rq = rq;
4633
+ }
4634
+
39324635 bfq_dispatch_remove(bfqd->queue, rq);
39334636
3934
- if (bfqq != bfqd->in_service_queue) {
3935
- if (likely(bfqd->in_service_queue))
3936
- bfqd->in_service_queue->injected_service +=
3937
- bfq_serv_to_charge(rq, bfqq);
3938
-
4637
+ if (bfqq != bfqd->in_service_queue)
39394638 goto return_rq;
3940
- }
39414639
39424640 /*
39434641 * If weight raising has to terminate for bfqq, then next
....@@ -3957,7 +4655,7 @@
39574655 * belongs to CLASS_IDLE and other queues are waiting for
39584656 * service.
39594657 */
3960
- if (!(bfqd->busy_queues > 1 && bfq_class_idle(bfqq)))
4658
+ if (!(bfq_tot_busy_queues(bfqd) > 1 && bfq_class_idle(bfqq)))
39614659 goto return_rq;
39624660
39634661 bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED);
....@@ -3975,7 +4673,7 @@
39754673 * most a call to dispatch for nothing
39764674 */
39774675 return !list_empty_careful(&bfqd->dispatch) ||
3978
- bfqd->busy_queues > 0;
4676
+ bfq_tot_busy_queues(bfqd) > 0;
39794677 }
39804678
39814679 static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
....@@ -4029,9 +4727,10 @@
40294727 goto start_rq;
40304728 }
40314729
4032
- bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
4730
+ bfq_log(bfqd, "dispatch requests: %d busy queues",
4731
+ bfq_tot_busy_queues(bfqd));
40334732
4034
- if (bfqd->busy_queues == 0)
4733
+ if (bfq_tot_busy_queues(bfqd) == 0)
40354734 goto exit;
40364735
40374736 /*
....@@ -4043,7 +4742,7 @@
40434742 * some unlucky request wait for as long as the device
40444743 * wishes.
40454744 *
4046
- * Of course, serving one request at at time may cause loss of
4745
+ * Of course, serving one request at a time may cause loss of
40474746 * throughput.
40484747 */
40494748 if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0)
....@@ -4065,7 +4764,7 @@
40654764 return rq;
40664765 }
40674766
4068
-#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
4767
+#ifdef CONFIG_BFQ_CGROUP_DEBUG
40694768 static void bfq_update_dispatch_stats(struct request_queue *q,
40704769 struct request *rq,
40714770 struct bfq_queue *in_serv_queue,
....@@ -4089,7 +4788,7 @@
40894788 * In addition, the following queue lock guarantees that
40904789 * bfqq_group(bfqq) exists as well.
40914790 */
4092
- spin_lock_irq(q->queue_lock);
4791
+ spin_lock_irq(&q->queue_lock);
40934792 if (idle_timer_disabled)
40944793 /*
40954794 * Since the idle timer has been disabled,
....@@ -4108,21 +4807,21 @@
41084807 bfqg_stats_set_start_empty_time(bfqg);
41094808 bfqg_stats_update_io_remove(bfqg, rq->cmd_flags);
41104809 }
4111
- spin_unlock_irq(q->queue_lock);
4810
+ spin_unlock_irq(&q->queue_lock);
41124811 }
41134812 #else
41144813 static inline void bfq_update_dispatch_stats(struct request_queue *q,
41154814 struct request *rq,
41164815 struct bfq_queue *in_serv_queue,
41174816 bool idle_timer_disabled) {}
4118
-#endif
4817
+#endif /* CONFIG_BFQ_CGROUP_DEBUG */
41194818
41204819 static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
41214820 {
41224821 struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
41234822 struct request *rq;
41244823 struct bfq_queue *in_serv_queue;
4125
- bool waiting_rq, idle_timer_disabled;
4824
+ bool waiting_rq, idle_timer_disabled = false;
41264825
41274826 spin_lock_irq(&bfqd->lock);
41284827
....@@ -4130,14 +4829,15 @@
41304829 waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue);
41314830
41324831 rq = __bfq_dispatch_request(hctx);
4133
-
4134
- idle_timer_disabled =
4135
- waiting_rq && !bfq_bfqq_wait_request(in_serv_queue);
4832
+ if (in_serv_queue == bfqd->in_service_queue) {
4833
+ idle_timer_disabled =
4834
+ waiting_rq && !bfq_bfqq_wait_request(in_serv_queue);
4835
+ }
41364836
41374837 spin_unlock_irq(&bfqd->lock);
4138
-
4139
- bfq_update_dispatch_stats(hctx->queue, rq, in_serv_queue,
4140
- idle_timer_disabled);
4838
+ bfq_update_dispatch_stats(hctx->queue, rq,
4839
+ idle_timer_disabled ? in_serv_queue : NULL,
4840
+ idle_timer_disabled);
41414841
41424842 return rq;
41434843 }
....@@ -4151,9 +4851,9 @@
41514851 */
41524852 void bfq_put_queue(struct bfq_queue *bfqq)
41534853 {
4154
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
4854
+ struct bfq_queue *item;
4855
+ struct hlist_node *n;
41554856 struct bfq_group *bfqg = bfqq_group(bfqq);
4156
-#endif
41574857
41584858 if (bfqq->bfqd)
41594859 bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d",
....@@ -4195,13 +4895,41 @@
41954895 bfqq->bfqd->burst_size--;
41964896 }
41974897
4898
+ /*
4899
+ * bfqq does not exist any longer, so it cannot be woken by
4900
+ * any other queue, and cannot wake any other queue. Then bfqq
4901
+ * must be removed from the woken list of its possible waker
4902
+ * queue, and all queues in the woken list of bfqq must stop
4903
+ * having a waker queue. Strictly speaking, these updates
4904
+ * should be performed when bfqq remains with no I/O source
4905
+ * attached to it, which happens before bfqq gets freed. In
4906
+ * particular, this happens when the last process associated
4907
+ * with bfqq exits or gets associated with a different
4908
+ * queue. However, both events lead to bfqq being freed soon,
4909
+ * and dangling references would come out only after bfqq gets
4910
+ * freed. So these updates are done here, as a simple and safe
4911
+ * way to handle all cases.
4912
+ */
4913
+ /* remove bfqq from woken list */
4914
+ if (!hlist_unhashed(&bfqq->woken_list_node))
4915
+ hlist_del_init(&bfqq->woken_list_node);
4916
+
4917
+ /* reset waker for all queues in woken list */
4918
+ hlist_for_each_entry_safe(item, n, &bfqq->woken_list,
4919
+ woken_list_node) {
4920
+ item->waker_bfqq = NULL;
4921
+ bfq_clear_bfqq_has_waker(item);
4922
+ hlist_del_init(&item->woken_list_node);
4923
+ }
4924
+
4925
+ if (bfqq->bfqd && bfqq->bfqd->last_completed_rq_bfqq == bfqq)
4926
+ bfqq->bfqd->last_completed_rq_bfqq = NULL;
4927
+
41984928 kmem_cache_free(bfq_pool, bfqq);
4199
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
42004929 bfqg_and_blkg_put(bfqg);
4201
-#endif
42024930 }
42034931
4204
-static void bfq_put_cooperator(struct bfq_queue *bfqq)
4932
+void bfq_put_cooperator(struct bfq_queue *bfqq)
42054933 {
42064934 struct bfq_queue *__bfqq, *next;
42074935
....@@ -4223,7 +4951,7 @@
42234951 static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
42244952 {
42254953 if (bfqq == bfqd->in_service_queue) {
4226
- __bfq_bfqq_expire(bfqd, bfqq);
4954
+ __bfq_bfqq_expire(bfqd, bfqq, BFQQE_BUDGET_TIMEOUT);
42274955 bfq_schedule_dispatch(bfqd);
42284956 }
42294957
....@@ -4231,7 +4959,7 @@
42314959
42324960 bfq_put_cooperator(bfqq);
42334961
4234
- bfq_put_queue(bfqq); /* release process reference */
4962
+ bfq_release_process_ref(bfqd, bfqq);
42354963 }
42364964
42374965 static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync)
....@@ -4281,7 +5009,7 @@
42815009 pr_err("bdi %s: bfq: bad prio class %d\n",
42825010 bdi_dev_name(bfqq->bfqd->queue->backing_dev_info),
42835011 ioprio_class);
4284
- /* fall through */
5012
+ fallthrough;
42855013 case IOPRIO_CLASS_NONE:
42865014 /*
42875015 * No prio set, inherit CPU scheduling settings.
....@@ -4334,8 +5062,7 @@
43345062
43355063 bfqq = bic_to_bfqq(bic, false);
43365064 if (bfqq) {
4337
- /* release process reference on this queue */
4338
- bfq_put_queue(bfqq);
5065
+ bfq_release_process_ref(bfqd, bfqq);
43395066 bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic);
43405067 bic_set_bfqq(bic, bfqq, false);
43415068 }
....@@ -4351,6 +5078,8 @@
43515078 RB_CLEAR_NODE(&bfqq->entity.rb_node);
43525079 INIT_LIST_HEAD(&bfqq->fifo);
43535080 INIT_HLIST_NODE(&bfqq->burst_list_node);
5081
+ INIT_HLIST_NODE(&bfqq->woken_list_node);
5082
+ INIT_HLIST_HEAD(&bfqq->woken_list);
43545083
43555084 bfqq->ref = 0;
43565085 bfqq->bfqd = bfqd;
....@@ -4369,13 +5098,6 @@
43695098 bfq_mark_bfqq_has_short_ttime(bfqq);
43705099 bfq_mark_bfqq_sync(bfqq);
43715100 bfq_mark_bfqq_just_created(bfqq);
4372
- /*
4373
- * Aggressively inject a lot of service: up to 90%.
4374
- * This coefficient remains constant during bfqq life,
4375
- * but this behavior might be changed, after enough
4376
- * testing and tuning.
4377
- */
4378
- bfqq->inject_coeff = 1;
43795101 } else
43805102 bfq_clear_bfqq_sync(bfqq);
43815103
....@@ -4419,7 +5141,7 @@
44195141 return &bfqg->async_bfqq[0][ioprio];
44205142 case IOPRIO_CLASS_NONE:
44215143 ioprio = IOPRIO_NORM;
4422
- /* fall through */
5144
+ fallthrough;
44235145 case IOPRIO_CLASS_BE:
44245146 return &bfqg->async_bfqq[1][ioprio];
44255147 case IOPRIO_CLASS_IDLE:
....@@ -4439,14 +5161,7 @@
44395161 struct bfq_queue *bfqq;
44405162 struct bfq_group *bfqg;
44415163
4442
- rcu_read_lock();
4443
-
4444
- bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio));
4445
- if (!bfqg) {
4446
- bfqq = &bfqd->oom_bfqq;
4447
- goto out;
4448
- }
4449
-
5164
+ bfqg = bfq_bio_bfqg(bfqd, bio);
44505165 if (!is_sync) {
44515166 async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
44525167 ioprio);
....@@ -4490,7 +5205,6 @@
44905205 out:
44915206 bfqq->ref++; /* get a process reference to this queue */
44925207 bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref);
4493
- rcu_read_unlock();
44945208 return bfqq;
44955209 }
44965210
....@@ -4513,17 +5227,19 @@
45135227 struct request *rq)
45145228 {
45155229 bfqq->seek_history <<= 1;
4516
- bfqq->seek_history |=
4517
- get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR &&
4518
- (!blk_queue_nonrot(bfqd->queue) ||
4519
- blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT);
5230
+ bfqq->seek_history |= BFQ_RQ_SEEKY(bfqd, bfqq->last_request_pos, rq);
5231
+
5232
+ if (bfqq->wr_coeff > 1 &&
5233
+ bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time &&
5234
+ BFQQ_TOTALLY_SEEKY(bfqq))
5235
+ bfq_bfqq_end_wr(bfqq);
45205236 }
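/*
 * A standalone sketch of the seek-history bookkeeping performed above:
 * a 32-bit shift register records, for each recent request, whether it
 * looked seeky, and the queue is deemed seeky when enough bits are set.
 * The popcount test and the threshold parameter here are illustrative
 * assumptions, not necessarily the exact check BFQ applies.
 */
#include <stdbool.h>
#include <stdint.h>

static uint32_t update_seek_history(uint32_t history, bool rq_was_seeky)
{
	return (history << 1) | (rq_was_seeky ? 1 : 0);
}

static bool queue_is_seeky(uint32_t history, unsigned int threshold)
{
	return (unsigned int)__builtin_popcount(history) > threshold;
}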
45215237
45225238 static void bfq_update_has_short_ttime(struct bfq_data *bfqd,
45235239 struct bfq_queue *bfqq,
45245240 struct bfq_io_cq *bic)
45255241 {
4526
- bool has_short_ttime = true;
5242
+ bool has_short_ttime = true, state_changed;
45275243
45285244 /*
45295245 * No need to update has_short_ttime if bfqq is async or in
....@@ -4548,13 +5264,102 @@
45485264 bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle))
45495265 has_short_ttime = false;
45505266
4551
- bfq_log_bfqq(bfqd, bfqq, "update_has_short_ttime: has_short_ttime %d",
4552
- has_short_ttime);
5267
+ state_changed = has_short_ttime != bfq_bfqq_has_short_ttime(bfqq);
45535268
45545269 if (has_short_ttime)
45555270 bfq_mark_bfqq_has_short_ttime(bfqq);
45565271 else
45575272 bfq_clear_bfqq_has_short_ttime(bfqq);
5273
+
5274
+ /*
5275
+ * Until the base value for the total service time gets
5276
+ * finally computed for bfqq, the inject limit does depend on
5277
+ * the think-time state (short|long). In particular, the limit
5278
+ * is 0 or 1 if the think time is deemed, respectively, as
5279
+ * short or long (details in the comments in
5280
+ * bfq_update_inject_limit()). Accordingly, the next
5281
+ * instructions reset the inject limit if the think-time state
5282
+ * has changed and the above base value is still to be
5283
+ * computed.
5284
+ *
5285
+ * However, the reset is performed only if more than 100 ms
5286
+ * have elapsed since the last update of the inject limit, or
5287
+ * (inclusive) if the change is from short to long think
5288
+ * time. The reason for this waiting is as follows.
5289
+ *
5290
+ * bfqq may have a long think time because of a
5291
+ * synchronization with some other queue, i.e., because the
5292
+ * I/O of some other queue may need to be completed for bfqq
5293
+ * to receive new I/O. Details in the comments on the choice
5294
+ * of the queue for injection in bfq_select_queue().
5295
+ *
5296
+ * As stressed in those comments, if such a synchronization is
5297
+ * actually in place, then, without injection on bfqq, the
5298
+ * blocking I/O cannot happen to be served while bfqq is in
5299
+ * service. As a consequence, if bfqq is granted
5300
+ * I/O-dispatch-plugging, then bfqq remains empty, and no I/O
5301
+ * is dispatched, until the idle timeout fires. This is likely
5302
+ * to result in lower bandwidth and higher latencies for bfqq,
5303
+ * and in a severe loss of total throughput.
5304
+ *
5305
+ * On the opposite end, a non-zero inject limit may allow the
5306
+ * I/O that blocks bfqq to be executed soon, and therefore
5307
+ * bfqq to receive new I/O soon.
5308
+ *
5309
+ * But, if the blocking gets actually eliminated, then the
5310
+ * next think-time sample for bfqq may be very low. This in
5311
+ * turn may cause bfqq's think time to be deemed
5312
+ * short. Without the 100 ms barrier, this new state change
5313
+ * would cause the body of the next if to be executed
5314
+ * immediately. But this would set to 0 the inject
5315
+ * limit. Without injection, the blocking I/O would cause the
5316
+ * think time of bfqq to become long again, and therefore the
5317
+ * inject limit to be raised again, and so on. The only effect
5318
+ * of such a steady oscillation between the two think-time
5319
+ * states would be to prevent effective injection on bfqq.
5320
+ *
5321
+ * In contrast, if the inject limit is not reset during such a
5322
+ * long time interval as 100 ms, then the number of short
5323
+ * think time samples can grow significantly before the reset
5324
+ * is performed. As a consequence, the think time state can
5325
+ * become stable before the reset. Therefore there will be no
5326
+ * state change when the 100 ms elapse, and no reset of the
5327
+ * inject limit. The inject limit remains steadily equal to 1
5328
+ * both during and after the 100 ms. So injection can be
5329
+ * performed at all times, and throughput gets boosted.
5330
+ *
5331
+ * An inject limit equal to 1 is however in conflict, in
5332
+ * general, with the fact that the think time of bfqq is
5333
+ * short, because injection may be likely to delay bfqq's I/O
5334
+ * (as explained in the comments in
5335
+ * bfq_update_inject_limit()). But this does not happen in
5336
+ * this special case, because bfqq's low think time is due to
5337
+ * an effective handling of a synchronization, through
5338
+ * injection. In this special case, bfqq's I/O does not get
5339
+ * delayed by injection; on the contrary, bfqq's I/O is
5340
+ * brought forward, because it is not blocked for
5341
+ * milliseconds.
5342
+ *
5343
+ * In addition, serving the blocking I/O much sooner, and much
5344
+ * more frequently than once per I/O-plugging timeout, makes
5345
+ * it much quicker to detect a waker queue (the concept of
5346
+ * waker queue is defined in the comments in
5347
+ * bfq_add_request()). This makes it possible to start sooner
5348
+ * to boost throughput more effectively, by injecting the I/O
5349
+ * of the waker queue unconditionally on every
5350
+ * bfq_dispatch_request().
5351
+ *
5352
+ * One last, important benefit of not resetting the inject
5353
+ * limit before 100 ms is that, during this time interval, the
5354
+ * base value for the total service time is likely to get
5355
+ * finally computed for bfqq, freeing the inject limit from
5356
+ * its relation with the think time.
5357
+ */
5358
+ if (state_changed && bfqq->last_serv_time_ns == 0 &&
5359
+ (time_is_before_eq_jiffies(bfqq->decrease_time_jif +
5360
+ msecs_to_jiffies(100)) ||
5361
+ !has_short_ttime))
5362
+ bfq_reset_inject_limit(bfqd, bfqq);
45585363 }
45595364
45605365 /*
....@@ -4564,18 +5369,8 @@
45645369 static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
45655370 struct request *rq)
45665371 {
4567
- struct bfq_io_cq *bic = RQ_BIC(rq);
4568
-
45695372 if (rq->cmd_flags & REQ_META)
45705373 bfqq->meta_pending++;
4571
-
4572
- bfq_update_io_thinktime(bfqd, bfqq);
4573
- bfq_update_has_short_ttime(bfqd, bfqq, bic);
4574
- bfq_update_io_seektime(bfqd, bfqq, rq);
4575
-
4576
- bfq_log_bfqq(bfqd, bfqq,
4577
- "rq_enqueued: has_short_ttime=%d (seeky %d)",
4578
- bfq_bfqq_has_short_ttime(bfqq), BFQQ_SEEKY(bfqq));
45795374
45805375 bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
45815376
....@@ -4585,28 +5380,31 @@
45855380 bool budget_timeout = bfq_bfqq_budget_timeout(bfqq);
45865381
45875382 /*
4588
- * There is just this request queued: if the request
4589
- * is small and the queue is not to be expired, then
4590
- * just exit.
5383
+ * There is just this request queued: if
5384
+ * - the request is small, and
5385
+ * - we are idling to boost throughput, and
5386
+ * - the queue is not to be expired,
5387
+ * then just exit.
45915388 *
45925389 * In this way, if the device is being idled to wait
45935390 * for a new request from the in-service queue, we
45945391 * avoid unplugging the device and committing the
4595
- * device to serve just a small request. On the
4596
- * contrary, we wait for the block layer to decide
4597
- * when to unplug the device: hopefully, new requests
4598
- * will be merged to this one quickly, then the device
4599
- * will be unplugged and larger requests will be
4600
- * dispatched.
5392
+ * device to serve just a small request. In contrast
5393
+ * we wait for the block layer to decide when to
5394
+ * unplug the device: hopefully, new requests will be
5395
+ * merged to this one quickly, then the device will be
5396
+ * unplugged and larger requests will be dispatched.
46015397 */
4602
- if (small_req && !budget_timeout)
5398
+ if (small_req && idling_boosts_thr_without_issues(bfqd, bfqq) &&
5399
+ !budget_timeout)
46035400 return;
46045401
46055402 /*
4606
- * A large enough request arrived, or the queue is to
4607
- * be expired: in both cases disk idling is to be
4608
- * stopped, so clear wait_request flag and reset
4609
- * timer.
5403
+ * A large enough request arrived, or idling is being
5404
+ * performed to preserve service guarantees, or
5405
+ * finally the queue is to be expired: in all these
5406
+ * cases disk idling is to be stopped, so clear
5407
+ * wait_request flag and reset timer.
46105408 */
46115409 bfq_clear_bfqq_wait_request(bfqq);
46125410 hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
....@@ -4632,8 +5430,6 @@
46325430 bool waiting, idle_timer_disabled = false;
46335431
46345432 if (new_bfqq) {
4635
- if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
4636
- new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
46375433 /*
46385434 * Release the request's reference to the old bfqq
46395435 * and make sure one is taken to the shared queue.
....@@ -4663,6 +5459,10 @@
46635459 bfqq = new_bfqq;
46645460 }
46655461
5462
+ bfq_update_io_thinktime(bfqd, bfqq);
5463
+ bfq_update_has_short_ttime(bfqd, bfqq, RQ_BIC(rq));
5464
+ bfq_update_io_seektime(bfqd, bfqq, rq);
5465
+
46665466 waiting = bfqq && bfq_bfqq_wait_request(bfqq);
46675467 bfq_add_request(rq);
46685468 idle_timer_disabled = waiting && !bfq_bfqq_wait_request(bfqq);
....@@ -4675,7 +5475,7 @@
46755475 return idle_timer_disabled;
46765476 }
46775477
4678
-#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
5478
+#ifdef CONFIG_BFQ_CGROUP_DEBUG
46795479 static void bfq_update_insert_stats(struct request_queue *q,
46805480 struct bfq_queue *bfqq,
46815481 bool idle_timer_disabled,
....@@ -4694,18 +5494,20 @@
46945494 * In addition, the following queue lock guarantees that
46955495 * bfqq_group(bfqq) exists as well.
46965496 */
4697
- spin_lock_irq(q->queue_lock);
5497
+ spin_lock_irq(&q->queue_lock);
46985498 bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags);
46995499 if (idle_timer_disabled)
47005500 bfqg_stats_update_idle_time(bfqq_group(bfqq));
4701
- spin_unlock_irq(q->queue_lock);
5501
+ spin_unlock_irq(&q->queue_lock);
47025502 }
47035503 #else
47045504 static inline void bfq_update_insert_stats(struct request_queue *q,
47055505 struct bfq_queue *bfqq,
47065506 bool idle_timer_disabled,
47075507 unsigned int cmd_flags) {}
4708
-#endif
5508
+#endif /* CONFIG_BFQ_CGROUP_DEBUG */
5509
+
5510
+static struct bfq_queue *bfq_init_rq(struct request *rq);
47095511
47105512 static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
47115513 bool at_head)
....@@ -4716,18 +5518,19 @@
47165518 bool idle_timer_disabled = false;
47175519 unsigned int cmd_flags;
47185520
5521
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
5522
+ if (!cgroup_subsys_on_dfl(io_cgrp_subsys) && rq->bio)
5523
+ bfqg_stats_update_legacy_io(q, rq);
5524
+#endif
47195525 spin_lock_irq(&bfqd->lock);
5526
+ bfqq = bfq_init_rq(rq);
47205527 if (blk_mq_sched_try_insert_merge(q, rq)) {
47215528 spin_unlock_irq(&bfqd->lock);
47225529 return;
47235530 }
47245531
4725
- spin_unlock_irq(&bfqd->lock);
4726
-
47275532 blk_mq_sched_request_inserted(rq);
47285533
4729
- spin_lock_irq(&bfqd->lock);
4730
- bfqq = bfq_init_rq(rq);
47315534 if (!bfqq || at_head || blk_rq_is_passthrough(rq)) {
47325535 if (at_head)
47335536 list_add(&rq->queuelist, &bfqd->dispatch);
....@@ -4776,6 +5579,8 @@
47765579
47775580 static void bfq_update_hw_tag(struct bfq_data *bfqd)
47785581 {
5582
+ struct bfq_queue *bfqq = bfqd->in_service_queue;
5583
+
47795584 bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver,
47805585 bfqd->rq_in_driver);
47815586
....@@ -4788,7 +5593,18 @@
47885593 * sum is not exact, as it's not taking into account deactivated
47895594 * requests.
47905595 */
4791
- if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
5596
+ if (bfqd->rq_in_driver + bfqd->queued <= BFQ_HW_QUEUE_THRESHOLD)
5597
+ return;
5598
+
5599
+ /*
5600
+ * If the active queue does not have enough requests and can idle, bfq
5601
+ * might not dispatch sufficient requests to hardware. Don't zero
5602
+ * hw_tag in this case.
5603
+ */
5604
+ if (bfqq && bfq_bfqq_has_short_ttime(bfqq) &&
5605
+ bfqq->dispatched + bfqq->queued[0] + bfqq->queued[1] <
5606
+ BFQ_HW_QUEUE_THRESHOLD &&
5607
+ bfqd->rq_in_driver < BFQ_HW_QUEUE_THRESHOLD)
47925608 return;
47935609
47945610 if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
....@@ -4797,6 +5613,9 @@
47975613 bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
47985614 bfqd->max_rq_in_driver = 0;
47995615 bfqd->hw_tag_samples = 0;
5616
+
5617
+ bfqd->nonrot_with_queueing =
5618
+ blk_queue_nonrot(bfqd->queue) && bfqd->hw_tag;
48005619 }
48015620
48025621 static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
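The hw_tag estimation adjusted by the hunks above can be modelled in userspace roughly as follows. This is a simplified sketch: it keeps only the peak-queue-depth and sample-count logic, omits the short-think-time exception and the nonrot_with_queueing update, and its struct and function names are invented for illustration.

#include <stdio.h>

#define THRESHOLD 3	/* plays the role of BFQ_HW_QUEUE_THRESHOLD */
#define SAMPLES	  32	/* plays the role of BFQ_HW_QUEUE_SAMPLES */

struct hw_tag_state {
	int max_in_driver;	/* peak number of requests seen in the drive */
	int samples;		/* valid samples collected so far */
	int hw_tag;		/* -1 unknown, 0 no queueing, 1 queueing */
};

/* Called once per request completion with the current depth and backlog. */
static void update_hw_tag(struct hw_tag_state *s, int in_driver, int queued)
{
	if (in_driver > s->max_in_driver)
		s->max_in_driver = in_driver;

	/* The sample is meaningful only if enough I/O was available to push. */
	if (in_driver + queued <= THRESHOLD)
		return;

	if (s->samples++ < SAMPLES)
		return;

	s->hw_tag = s->max_in_driver > THRESHOLD;
	s->max_in_driver = 0;
	s->samples = 0;
}

int main(void)
{
	struct hw_tag_state s = { 0, 0, -1 };
	int i;

	/* Simulate completions with 8 requests in the drive and 4 queued. */
	for (i = 0; i < 40; i++)
		update_hw_tag(&s, 8, 4);

	printf("hw_tag = %d\n", s.hw_tag);	/* prints 1 */
	return 0;
}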
....@@ -4852,6 +5671,7 @@
48525671 1UL<<(BFQ_RATE_SHIFT - 10))
48535672 bfq_update_rate_reset(bfqd, NULL);
48545673 bfqd->last_completion = now_ns;
5674
+ bfqd->last_completed_rq_bfqq = bfqq;
48555675
48565676 /*
48575677 * If we are waiting to discover whether the request pattern
....@@ -4859,11 +5679,14 @@
48595679 * isochronous, and both requisites for this condition to hold
48605680 * are now satisfied, then compute soft_rt_next_start (see the
48615681 * comments on the function bfq_bfqq_softrt_next_start()). We
4862
- * schedule this delayed check when bfqq expires, if it still
4863
- * has in-flight requests.
5682
+ * do not compute soft_rt_next_start if bfqq is in interactive
5683
+ * weight raising (see the comments in bfq_bfqq_expire() for
5684
+ * an explanation). We schedule this delayed update when bfqq
5685
+ * expires, if it still has in-flight requests.
48645686 */
48655687 if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 &&
4866
- RB_EMPTY_ROOT(&bfqq->sort_list))
5688
+ RB_EMPTY_ROOT(&bfqq->sort_list) &&
5689
+ bfqq->wr_coeff != bfqd->bfq_wr_coeff)
48675690 bfqq->soft_rt_next_start =
48685691 bfq_bfqq_softrt_next_start(bfqd, bfqq);
48695692
....@@ -4921,6 +5744,167 @@
49215744 }
49225745
49235746 /*
5747
+ * The processes associated with bfqq may happen to generate their
5748
+ * cumulative I/O at a lower rate than the rate at which the device
5749
+ * could serve the same I/O. This is rather probable, e.g., if only
5750
+ * one process is associated with bfqq and the device is an SSD. It
5751
+ * results in bfqq becoming often empty while in service. In this
5752
+ * respect, if BFQ is allowed to switch to another queue when bfqq
5753
+ * remains empty, then the device goes on being fed with I/O requests,
5754
+ * and the throughput is not affected. In contrast, if BFQ is not
5755
+ * allowed to switch to another queue---because bfqq is sync and
5756
+ * I/O-dispatch needs to be plugged while bfqq is temporarily
5757
+ * empty---then, during the service of bfqq, there will be frequent
5758
+ * "service holes", i.e., time intervals during which bfqq gets empty
5759
+ * and the device can only consume the I/O already queued in its
5760
+ * hardware queues. During service holes, the device may even end up
5761
+ * remaining idle. In the end, during the service of bfqq, the device
5762
+ * is driven at a lower speed than the one it can reach with the kind
5763
+ * of I/O flowing through bfqq.
5764
+ *
5765
+ * To counter this loss of throughput, BFQ implements a "request
5766
+ * injection mechanism", which tries to fill the above service holes
5767
+ * with I/O requests taken from other queues. The hard part in this
5768
+ * mechanism is finding the right amount of I/O to inject, so as to
5769
+ * both boost throughput and not break bfqq's bandwidth and latency
5770
+ * guarantees. In this respect, the mechanism maintains a per-queue
5771
+ * inject limit, computed as below. While bfqq is empty, the injection
5772
+ * mechanism dispatches extra I/O requests only until the total number
5773
+ * of I/O requests in flight---i.e., already dispatched but not yet
5774
+ * completed---remains lower than this limit.
5775
+ *
5776
+ * A first definition comes in handy to introduce the algorithm by
5777
+ * which the inject limit is computed. We define as first request for
5778
+ * bfqq, an I/O request for bfqq that arrives while bfqq is in
5779
+ * service, and causes bfqq to switch from empty to non-empty. The
5780
+ * algorithm updates the limit as a function of the effect of
5781
+ * injection on the service times of only the first requests of
5782
+ * bfqq. The reason for this restriction is that these are the
5783
+ * requests whose service time is affected most, because they are the
5784
+ * first to arrive after injection possibly occurred.
5785
+ *
5786
+ * To evaluate the effect of injection, the algorithm measures the
5787
+ * "total service time" of first requests. We define as total service
5788
+ * time of an I/O request, the time that elapses since when the
5789
+ * request is enqueued into bfqq, to when it is completed. This
5790
+ * quantity allows the whole effect of injection to be measured. It is
5791
+ * easy to see why. Suppose that some requests of other queues are
5792
+ * actually injected while bfqq is empty, and that a new request R
5793
+ * then arrives for bfqq. If the device does start to serve all or
5794
+ * part of the injected requests during the service hole, then,
5795
+ * because of this extra service, it may delay the next invocation of
5796
+ * the dispatch hook of BFQ. Then, even after R gets eventually
5797
+ * dispatched, the device may delay the actual service of R if it is
5798
+ * still busy serving the extra requests, or if it decides to serve,
5799
+ * before R, some extra request still present in its queues. As a
5800
+ * conclusion, the cumulative extra delay caused by injection can be
5801
+ * easily evaluated by just comparing the total service time of first
5802
+ * requests with and without injection.
5803
+ *
5804
+ * The limit-update algorithm works as follows. On the arrival of a
5805
+ * first request of bfqq, the algorithm measures the total service time of the
5806
+ * request only if one of the three cases below holds, and, for each
5807
+ * case, it updates the limit as described below:
5808
+ *
5809
+ * (1) If there is no in-flight request. This gives a baseline for the
5810
+ * total service time of the requests of bfqq. If the baseline has
5811
+ * not been computed yet, then, after computing it, the limit is
5812
+ * set to 1, to start boosting throughput, and to prepare the
5813
+ * ground for the next case. If the baseline has already been
5814
+ * computed, then it is updated, in case it turns out to be lower
5815
+ * than the previous value.
5816
+ *
5817
+ * (2) If the limit is higher than 0 and there are in-flight
5818
+ * requests. By comparing the total service time in this case with
5819
+ * the above baseline, it is possible to know to what extent the
5820
+ * current value of the limit is inflating the total service
5821
+ * time. If the inflation is below a certain threshold, then bfqq
5822
+ * is assumed to be suffering from no perceivable loss of its
5823
+ * service guarantees, and the limit is even tentatively
5824
+ * increased. If the inflation is above the threshold, then the
5825
+ * limit is decreased. Due to the lack of any hysteresis, this
5826
+ * logic makes the limit oscillate even in steady workload
5827
+ * conditions. Yet we opted for it, because it is fast in reaching
5828
+ * the best value for the limit, as a function of the current I/O
5829
+ * workload. To reduce oscillations, this step is disabled for a
5830
+ * short time interval after the limit happens to be decreased.
5831
+ *
5832
+ * (3) Periodically, after resetting the limit, to make sure that the
5833
+ * limit eventually drops in case the workload changes. This is
5834
+ * needed because, after the limit has gone safely up for a
5835
+ * certain workload, it is impossible to guess whether the
5836
+ * baseline total service time may have changed, without measuring
5837
+ * it again without injection. A more effective version of this
5838
+ * step might be to just sample the baseline, by interrupting
5839
+ * injection only once, and then to reset/lower the limit only if
5840
+ * the total service time with the current limit does happen to be
5841
+ * too large.
5842
+ *
5843
+ * More details on each step are provided in the comments on the
5844
+ * pieces of code that implement these steps: the branch handling the
5845
+ * transition from empty to non empty in bfq_add_request(), the branch
5846
+ * handling injection in bfq_select_queue(), and the function
5847
+ * bfq_choose_bfqq_for_injection(). These comments also explain some
5848
+ * exceptions, made by the injection mechanism in some special cases.
5849
+ */
5850
+static void bfq_update_inject_limit(struct bfq_data *bfqd,
5851
+ struct bfq_queue *bfqq)
5852
+{
5853
+ u64 tot_time_ns = ktime_get_ns() - bfqd->last_empty_occupied_ns;
5854
+ unsigned int old_limit = bfqq->inject_limit;
5855
+
5856
+ if (bfqq->last_serv_time_ns > 0 && bfqd->rqs_injected) {
5857
+ u64 threshold = (bfqq->last_serv_time_ns * 3)>>1;
5858
+
5859
+ if (tot_time_ns >= threshold && old_limit > 0) {
5860
+ bfqq->inject_limit--;
5861
+ bfqq->decrease_time_jif = jiffies;
5862
+ } else if (tot_time_ns < threshold &&
5863
+ old_limit <= bfqd->max_rq_in_driver)
5864
+ bfqq->inject_limit++;
5865
+ }
5866
+
5867
+ /*
5868
+ * Either we still have to compute the base value for the
5869
+ * total service time, and there seem to be the right
5870
+ * conditions to do it, or we can lower the last base value
5871
+ * computed.
5872
+ *
5873
+ * NOTE: (bfqd->rq_in_driver == 1) means that there is no I/O
5874
+ * request in flight, because this function is in the code
5875
+ * path that handles the completion of a request of bfqq, and,
5876
+ * in particular, this function is executed before
5877
+ * bfqd->rq_in_driver is decremented in such a code path.
5878
+ */
5879
+ if ((bfqq->last_serv_time_ns == 0 && bfqd->rq_in_driver == 1) ||
5880
+ tot_time_ns < bfqq->last_serv_time_ns) {
5881
+ if (bfqq->last_serv_time_ns == 0) {
5882
+ /*
5883
+ * Now we certainly have a base value: make sure we
5884
+ * start trying injection.
5885
+ */
5886
+ bfqq->inject_limit = max_t(unsigned int, 1, old_limit);
5887
+ }
5888
+ bfqq->last_serv_time_ns = tot_time_ns;
5889
+ } else if (!bfqd->rqs_injected && bfqd->rq_in_driver == 1)
5890
+ /*
5891
+ * No I/O injected and no request still in service in
5892
+ * the drive: these are the exact conditions for
5893
+ * computing the base value of the total service time
5894
+ * for bfqq. So let's update this value, because it is
5895
+ * rather variable. For example, it varies if the size
5896
+ * or the spatial locality of the I/O requests in bfqq
5897
+ * change.
5898
+ */
5899
+ bfqq->last_serv_time_ns = tot_time_ns;
5900
+
5901
+
5902
+ /* update complete, not waiting for any request completion any longer */
5903
+ bfqd->waited_rq = NULL;
5904
+ bfqd->rqs_injected = false;
5905
+}
5906
+
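The limit-update rules described in the long comment, and implemented by bfq_update_inject_limit() above, can be exercised in isolation with a small userspace model such as the one below. Times are in arbitrary units, and the struct, function and parameter names belong to this sketch, not to BFQ.

#include <stdio.h>

struct inject_state {
	unsigned long long baseline;	/* analogue of last_serv_time_ns */
	unsigned int limit;		/* analogue of inject_limit */
};

/*
 * tot_time:      measured total service time of a "first request";
 * injected:      whether extra I/O was injected while bfqq was empty;
 * in_flight:     requests still in the drive at completion time (>= 1);
 * max_in_driver: peak number of requests observed in the drive.
 */
static void update_inject_limit(struct inject_state *s,
				unsigned long long tot_time, int injected,
				int in_flight, unsigned int max_in_driver)
{
	unsigned int old_limit = s->limit;

	if (s->baseline > 0 && injected) {
		unsigned long long threshold = (s->baseline * 3) / 2;

		if (tot_time >= threshold && old_limit > 0)
			s->limit--;			/* injection hurt bfqq */
		else if (tot_time < threshold && old_limit <= max_in_driver)
			s->limit++;			/* room to inject more */
	}

	/* (Re)compute or lower the baseline when it is safe to do so. */
	if ((s->baseline == 0 && in_flight == 1) || tot_time < s->baseline) {
		if (s->baseline == 0)
			s->limit = old_limit > 1 ? old_limit : 1;
		s->baseline = tot_time;
	} else if (!injected && in_flight == 1) {
		s->baseline = tot_time;
	}
}

int main(void)
{
	struct inject_state s = { 0, 0 };

	update_inject_limit(&s, 100, 0, 1, 1);	/* baseline = 100, limit -> 1 */
	update_inject_limit(&s, 120, 1, 3, 4);	/* below 150: limit -> 2 */
	update_inject_limit(&s, 200, 1, 3, 4);	/* above 150: limit -> 1 */
	printf("baseline=%llu limit=%u\n", s.baseline, s.limit);
	return 0;
}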
5907
+/*
49245908 * Handle either a requeue or a finish for rq. The things to do are
49255909 * the same in both cases: all references to rq are to be dropped. In
49265910 * particular, rq is considered completed from the point of view of
....@@ -4930,18 +5914,6 @@
49305914 {
49315915 struct bfq_queue *bfqq = RQ_BFQQ(rq);
49325916 struct bfq_data *bfqd;
4933
-
4934
- /*
4935
- * Requeue and finish hooks are invoked in blk-mq without
4936
- * checking whether the involved request is actually still
4937
- * referenced in the scheduler. To handle this fact, the
4938
- * following two checks make this function exit in case of
4939
- * spurious invocations, for which there is nothing to do.
4940
- *
4941
- * First, check whether rq has nothing to do with an elevator.
4942
- */
4943
- if (unlikely(!(rq->rq_flags & RQF_ELVPRIV)))
4944
- return;
49455917
49465918 /*
49475919 * rq either is not associated with any icq, or is an already
....@@ -4963,6 +5935,9 @@
49635935 unsigned long flags;
49645936
49655937 spin_lock_irqsave(&bfqd->lock, flags);
5938
+
5939
+ if (rq == bfqd->waited_rq)
5940
+ bfq_update_inject_limit(bfqd, bfqq);
49665941
49675942 bfq_completed_request(bfqq, bfqd);
49685943 bfq_finish_requeue_request_body(bfqq);
....@@ -5012,6 +5987,8 @@
50125987 }
50135988
50145989 /*
5990
+ * Removes the association between the current task and bfqq, assuming
5991
+ * that bic points to the bfq iocontext of the task.
50155992 * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
50165993 * was the last process referring to that bfqq.
50175994 */
....@@ -5031,7 +6008,7 @@
50316008
50326009 bfq_put_cooperator(bfqq);
50336010
5034
- bfq_put_queue(bfqq);
6011
+ bfq_release_process_ref(bfqq->bfqd, bfqq);
50356012 return NULL;
50366013 }
50376014
....@@ -5104,7 +6081,7 @@
51046081 * comments on bfq_init_rq for the reason behind this delayed
51056082 * preparation.
51066083 */
5107
-static void bfq_prepare_request(struct request *rq, struct bio *bio)
6084
+static void bfq_prepare_request(struct request *rq)
51086085 {
51096086 /*
51106087 * Regardless of whether we have an icq attached, we have to
....@@ -5127,7 +6104,7 @@
51276104 * preparation is that, after the prepare_request hook is invoked for
51286105 * rq, rq may still be transformed into a request with no icq, i.e., a
51296106 * request not associated with any queue. No bfq hook is invoked to
5130
- * signal this tranformation. As a consequence, should these
6107
+ * signal this transformation. As a consequence, should these
51316108 * preparation operations be performed when the prepare_request hook
51326109 * is invoked, and should rq be transformed one moment later, bfq
51336110 * would end up in an inconsistent state, because it would have
....@@ -5218,7 +6195,29 @@
52186195 }
52196196 }
52206197
5221
- if (unlikely(bfq_bfqq_just_created(bfqq)))
6198
+ /*
6199
+ * Consider bfqq as possibly belonging to a burst of newly
6200
+ * created queues only if:
6201
+ * 1) A burst is actually happening (bfqd->burst_size > 0)
6202
+ * or
6203
+ * 2) There is no other active queue. In fact, if, in
6204
+ * contrast, there are active queues not belonging to the
6205
+ * possible burst bfqq may belong to, then there is no gain
6206
+ * in considering bfqq as belonging to a burst, and
6207
+ * therefore in not weight-raising bfqq. See comments on
6208
+ * bfq_handle_burst().
6209
+ *
6210
+ * This filtering also helps eliminating false positives,
6211
+ * occurring when bfqq does not belong to an actual large
6212
+ * burst, but some background task (e.g., a service) happens
6213
+ * to trigger the creation of new queues very close to when
6214
+ * bfqq and its possible companion queues are created. See
6215
+ * comments on bfq_handle_burst() for further details also on
6216
+ * this issue.
6217
+ */
6218
+ if (unlikely(bfq_bfqq_just_created(bfqq) &&
6219
+ (bfqd->burst_size > 0 ||
6220
+ bfq_tot_busy_queues(bfqd) == 0)))
52226221 bfq_handle_burst(bfqd, bfqq);
52236222
52246223 return bfqq;
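The new filter amounts to a pure predicate over the three quantities named in the comment; the sketch below (illustrative names only) shows the two typical outcomes.

#include <stdbool.h>
#include <stdio.h>

/* Mirrors the condition guarding the call to bfq_handle_burst() above. */
static bool burst_candidate(bool just_created, unsigned int burst_size,
			    unsigned int tot_busy_queues)
{
	return just_created && (burst_size > 0 || tot_busy_queues == 0);
}

int main(void)
{
	/* No burst tracked and unrelated queues are busy: filtered out. */
	printf("%d\n", burst_candidate(true, 0, 3));	/* 0 */
	/* Created while a burst is being tracked: handed to the burst logic. */
	printf("%d\n", burst_candidate(true, 2, 5));	/* 1 */
	return 0;
}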
....@@ -5267,8 +6266,8 @@
52676266 bfq_bfqq_expire(bfqd, bfqq, true, reason);
52686267
52696268 schedule_dispatch:
5270
- spin_unlock_irqrestore(&bfqd->lock, flags);
52716269 bfq_schedule_dispatch(bfqd);
6270
+ spin_unlock_irqrestore(&bfqd->lock, flags);
52726271 }
52736272
52746273 /*
....@@ -5381,8 +6380,8 @@
53816380 struct blk_mq_tags *tags = hctx->sched_tags;
53826381 unsigned int min_shallow;
53836382
5384
- min_shallow = bfq_update_depths(bfqd, &tags->bitmap_tags);
5385
- sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, min_shallow);
6383
+ min_shallow = bfq_update_depths(bfqd, tags->bitmap_tags);
6384
+ sbitmap_queue_min_shallow_depth(tags->bitmap_tags, min_shallow);
53866385 }
53876386
53886387 static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index)
....@@ -5405,10 +6404,10 @@
54056404
54066405 hrtimer_cancel(&bfqd->idle_slice_timer);
54076406
5408
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
54096407 /* release oom-queue reference to root group */
54106408 bfqg_and_blkg_put(bfqd->root_group);
54116409
6410
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
54126411 blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq);
54136412 #else
54146413 spin_lock_irq(&bfqd->lock);
....@@ -5454,9 +6453,9 @@
54546453 }
54556454 eq->elevator_data = bfqd;
54566455
5457
- spin_lock_irq(q->queue_lock);
6456
+ spin_lock_irq(&q->queue_lock);
54586457 q->elevator = eq;
5459
- spin_unlock_irq(q->queue_lock);
6458
+ spin_unlock_irq(&q->queue_lock);
54606459
54616460 /*
54626461 * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
....@@ -5488,7 +6487,7 @@
54886487 HRTIMER_MODE_REL);
54896488 bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
54906489
5491
- bfqd->queue_weights_tree = RB_ROOT;
6490
+ bfqd->queue_weights_tree = RB_ROOT_CACHED;
54926491 bfqd->num_groups_with_pending_reqs = 0;
54936492
54946493 INIT_LIST_HEAD(&bfqd->active_list);
....@@ -5496,6 +6495,7 @@
54966495 INIT_HLIST_HEAD(&bfqd->burst_list);
54976496
54986497 bfqd->hw_tag = -1;
6498
+ bfqd->nonrot_with_queueing = blk_queue_nonrot(bfqd->queue);
54996499
55006500 bfqd->bfq_max_budget = bfq_default_max_budget;
55016501
....@@ -5796,7 +6796,7 @@
57966796 };
57976797
57986798 static struct elevator_type iosched_bfq_mq = {
5799
- .ops.mq = {
6799
+ .ops = {
58006800 .limit_depth = bfq_limit_depth,
58016801 .prepare_request = bfq_prepare_request,
58026802 .requeue_request = bfq_finish_requeue_request,
....@@ -5818,7 +6818,6 @@
58186818 .exit_sched = bfq_exit_queue,
58196819 },
58206820
5821
- .uses_mq = true,
58226821 .icq_size = sizeof(struct bfq_io_cq),
58236822 .icq_align = __alignof__(struct bfq_io_cq),
58246823 .elevator_attrs = bfq_attrs,