hc
2024-02-19 1c055e55a242a33e574e48be530e06770a210dcd
kernel/block/bfq-iosched.c
....@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0-or-later
12 /*
23 * Budget Fair Queueing (BFQ) I/O scheduler.
34 *
....@@ -12,21 +13,11 @@
1213 *
1314 * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org>
1415 *
15
- * This program is free software; you can redistribute it and/or
16
- * modify it under the terms of the GNU General Public License as
17
- * published by the Free Software Foundation; either version 2 of the
18
- * License, or (at your option) any later version.
19
- *
20
- * This program is distributed in the hope that it will be useful,
21
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
22
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23
- * General Public License for more details.
24
- *
2516 * BFQ is a proportional-share I/O scheduler, with some extra
2617 * low-latency capabilities. BFQ also supports full hierarchical
2718 * scheduling through cgroups. Next paragraphs provide an introduction
2819 * on BFQ inner workings. Details on BFQ benefits, usage and
29
- * limitations can be found in Documentation/block/bfq-iosched.txt.
20
+ * limitations can be found in Documentation/block/bfq-iosched.rst.
3021 *
3122 * BFQ is a proportional-share storage-I/O scheduling algorithm based
3223 * on the slice-by-slice service scheme of CFQ. But BFQ assigns
....@@ -167,6 +158,7 @@
167158 BFQ_BFQQ_FNS(coop);
168159 BFQ_BFQQ_FNS(split_coop);
169160 BFQ_BFQQ_FNS(softrt_update);
161
+BFQ_BFQQ_FNS(has_waker);
170162 #undef BFQ_BFQQ_FNS \
171163
172164 /* Expiration time of sync (0) and async (1) requests, in ns. */
....@@ -190,7 +182,7 @@
190182 /*
191183 * When a sync request is dispatched, the queue that contains that
192184 * request, and all the ancestor entities of that queue, are charged
193
- * with the number of sectors of the request. In constrast, if the
185
+ * with the number of sectors of the request. In contrast, if the
194186 * request is async, then the queue and its ancestor entities are
195187 * charged with the number of sectors of the request, multiplied by
196188 * the factor below. This throttles the bandwidth for async I/O,
....@@ -218,7 +210,7 @@
218210 * queue merging.
219211 *
220212 * As can be deduced from the low time limit below, queue merging, if
221
- * successful, happens at the very beggining of the I/O of the involved
213
+ * successful, happens at the very beginning of the I/O of the involved
222214 * cooperating processes, as a consequence of the arrival of the very
223215 * first requests from each cooperator. After that, there is very
224216 * little chance to find cooperators.
....@@ -231,13 +223,26 @@
231223 #define BFQ_MIN_TT (2 * NSEC_PER_MSEC)
232224
233225 /* hw_tag detection: parallel requests threshold and min samples needed. */
234
-#define BFQ_HW_QUEUE_THRESHOLD 4
226
+#define BFQ_HW_QUEUE_THRESHOLD 3
235227 #define BFQ_HW_QUEUE_SAMPLES 32
236228
237229 #define BFQQ_SEEK_THR (sector_t)(8 * 100)
238230 #define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32)
231
+#define BFQ_RQ_SEEKY(bfqd, last_pos, rq) \
232
+ (get_sdist(last_pos, rq) > \
233
+ BFQQ_SEEK_THR && \
234
+ (!blk_queue_nonrot(bfqd->queue) || \
235
+ blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT))
239236 #define BFQQ_CLOSE_THR (sector_t)(8 * 1024)
240237 #define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 19)
238
+/*
239
+ * Sync random I/O is likely to be confused with soft real-time I/O,
240
+ * because it is characterized by limited throughput and apparently
241
+ * isochronous arrival pattern. To avoid false positives, queues
242
+ * containing only random (seeky) I/O are prevented from being tagged
243
+ * as soft real-time.
244
+ */
245
+#define BFQQ_TOTALLY_SEEKY(bfqq) (bfqq->seek_history == -1)
241246
242247 /* Min number of samples required to perform peak-rate update */
243248 #define BFQ_RATE_MIN_SAMPLES 32
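For illustration only (not part of the patch): the seek-detection macros above keep a 32-bit per-queue history in which each new request shifts in one "seeky" bit; BFQQ_SEEKY() then asks whether more than 19 of the last 32 requests were seeky, and BFQQ_TOTALLY_SEEKY() asks whether all of them were (seek_history == -1, all bits set), which is what blocks the soft real-time tagging described in the new comment. A minimal userspace model of that bookkeeping, with made-up names and __builtin_popcount() standing in for the kernel's hweight32():

#include <stdbool.h>
#include <stdint.h>

struct seek_state {
	uint32_t seek_history;		/* one bit per recently queued request */
};

/* Shift in one observation: 1 if the request looked seeky, 0 otherwise. */
static void record_request(struct seek_state *s, bool seeky)
{
	s->seek_history = (s->seek_history << 1) | (seeky ? 1u : 0u);
}

/* Mirrors BFQQ_SEEKY(): more than 19 of the last 32 requests were seeky. */
static bool queue_is_seeky(const struct seek_state *s)
{
	return __builtin_popcount(s->seek_history) > 19;
}

/* Mirrors BFQQ_TOTALLY_SEEKY(): every tracked request was seeky, so the
 * queue must not be tagged as soft real-time. */
static bool queue_is_totally_seeky(const struct seek_state *s)
{
	return s->seek_history == UINT32_MAX;	/* same test as "== -1" */
}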
....@@ -368,6 +373,12 @@
368373
369374 void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync)
370375 {
376
+ struct bfq_queue *old_bfqq = bic->bfqq[is_sync];
377
+
378
+ /* Clear bic pointer if bfqq is detached from this bic */
379
+ if (old_bfqq && old_bfqq->bic == bic)
380
+ old_bfqq->bic = NULL;
381
+
371382 bic->bfqq[is_sync] = bfqq;
372383 }
373384
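For illustration only (not part of the patch): the hunk above clears the detached queue's back-pointer before bic->bfqq is re-targeted, so the old queue can never be reached through a stale bic reference. A standalone sketch of that two-way-pointer discipline, with simplified types that merely stand in for struct bfq_io_cq and struct bfq_queue:

#include <stddef.h>

struct io_ctx;				/* stands in for struct bfq_io_cq */

struct queue {
	struct io_ctx *owner;		/* back-pointer, like bfqq->bic */
};

struct io_ctx {
	struct queue *q[2];		/* like bic->bfqq[is_sync] */
};

static void ctx_set_queue(struct io_ctx *ctx, struct queue *newq, int is_sync)
{
	struct queue *oldq = ctx->q[is_sync];

	/* Clear the back-pointer only if it really points at this context,
	 * mirroring "if (old_bfqq && old_bfqq->bic == bic)". */
	if (oldq && oldq->owner == ctx)
		oldq->owner = NULL;

	ctx->q[is_sync] = newq;
}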
....@@ -400,9 +411,9 @@
400411 unsigned long flags;
401412 struct bfq_io_cq *icq;
402413
403
- spin_lock_irqsave(q->queue_lock, flags);
414
+ spin_lock_irqsave(&q->queue_lock, flags);
404415 icq = icq_to_bic(ioc_lookup_icq(ioc, q));
405
- spin_unlock_irqrestore(q->queue_lock, flags);
416
+ spin_unlock_irqrestore(&q->queue_lock, flags);
406417
407418 return icq;
408419 }
....@@ -416,6 +427,8 @@
416427 */
417428 void bfq_schedule_dispatch(struct bfq_data *bfqd)
418429 {
430
+ lockdep_assert_held(&bfqd->lock);
431
+
419432 if (bfqd->queued != 0) {
420433 bfq_log(bfqd, "schedule dispatch");
421434 blk_mq_run_hw_queues(bfqd->queue, true);
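For illustration only (not part of the patch): the two hunks above take the queue lock through its address (&q->queue_lock) and add a lockdep_assert_held() so that, with CONFIG_PROVE_LOCKING, any caller entering bfq_schedule_dispatch() without bfqd->lock gets a splat. A hedged kernel-style fragment of the same pattern, with hypothetical foo_* names (it only makes sense inside the kernel tree):

struct foo_data {
	spinlock_t lock;
	int queued;
	struct work_struct work;
};

static void foo_kick(struct foo_data *fd)
{
	lockdep_assert_held(&fd->lock);		/* document and enforce the contract */

	if (fd->queued)
		schedule_work(&fd->work);
}

static void foo_kick_any_context(struct foo_data *fd)
{
	unsigned long flags;

	spin_lock_irqsave(&fd->lock, flags);	/* note the &, as with &q->queue_lock */
	foo_kick(fd);
	spin_unlock_irqrestore(&fd->lock, flags);
}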
....@@ -423,13 +436,12 @@
423436 }
424437
425438 #define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
426
-#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT)
427439
428440 #define bfq_sample_valid(samples) ((samples) > 80)
429441
430442 /*
431443 * Lifted from AS - choose which of rq1 and rq2 that is best served now.
432
- * We choose the request that is closesr to the head right now. Distance
444
+ * We choose the request that is closer to the head right now. Distance
433445 * behind the head is penalized and only allowed to a certain extent.
434446 */
435447 static struct request *bfq_choose_req(struct bfq_data *bfqd,
....@@ -591,7 +603,16 @@
591603 bfq_merge_time_limit);
592604 }
593605
594
-void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
606
+/*
607
+ * The following function is not marked as __cold because it is
608
+ * actually cold, but for the same performance goal described in the
609
+ * comments on the likely() at the beginning of
610
+ * bfq_setup_cooperator(). Unexpectedly, to reach an even lower
611
+ * execution time for the case where this function is not invoked, we
612
+ * had to add an unlikely() in each involved if().
613
+ */
614
+void __cold
615
+bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
595616 {
596617 struct rb_node **p, *parent;
597618 struct bfq_queue *__bfqq;
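For illustration only (not part of the patch): the new comment above pairs the __cold attribute on bfq_pos_tree_add_move() with unlikely() at its call sites, so the common "nothing to do" path stays short and hot in the instruction cache. A small standalone illustration of how the two hints cooperate (GCC/Clang builtins; the names are invented):

#define unlikely(x)	__builtin_expect(!!(x), 0)

/* __attribute__((cold)) lets the compiler move this body out of the hot
 * text section; unlikely() below biases branch layout at the call site. */
static void __attribute__((cold)) slow_reposition(int *tree_state)
{
	*tree_state += 1;		/* rarely executed bookkeeping */
}

void on_request(int needs_pos_tree, int *tree_state)
{
	/* Fast path: e.g. flash with internal queueing skips the tree. */
	if (unlikely(needs_pos_tree))
		slow_reposition(tree_state);
}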
....@@ -600,6 +621,10 @@
600621 rb_erase(&bfqq->pos_node, bfqq->pos_root);
601622 bfqq->pos_root = NULL;
602623 }
624
+
625
+ /* oom_bfqq does not participate in queue merging */
626
+ if (bfqq == &bfqd->oom_bfqq)
627
+ return;
603628
604629 /*
605630 * bfqq cannot be merged any longer (see comments in
....@@ -625,52 +650,68 @@
625650 }
626651
627652 /*
628
- * Tell whether there are active queues with different weights or
629
- * active groups.
630
- */
631
-static bool bfq_varied_queue_weights_or_active_groups(struct bfq_data *bfqd)
632
-{
633
- /*
634
- * For queue weights to differ, queue_weights_tree must contain
635
- * at least two nodes.
636
- */
637
- return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) &&
638
- (bfqd->queue_weights_tree.rb_node->rb_left ||
639
- bfqd->queue_weights_tree.rb_node->rb_right)
640
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
641
- ) ||
642
- (bfqd->num_groups_with_pending_reqs > 0
643
-#endif
644
- );
645
-}
646
-
647
-/*
648
- * The following function returns true if every queue must receive the
649
- * same share of the throughput (this condition is used when deciding
650
- * whether idling may be disabled, see the comments in the function
651
- * bfq_better_to_idle()).
653
+ * The following function returns false either if every active queue
654
+ * must receive the same share of the throughput (symmetric scenario),
655
+ * or, as a special case, if bfqq must receive a share of the
656
+ * throughput lower than or equal to the share that every other active
657
+ * queue must receive. If bfqq does sync I/O, then these are the only
658
+ * two cases where bfqq happens to be guaranteed its share of the
659
+ * throughput even if I/O dispatching is not plugged when bfqq remains
660
+ * temporarily empty (for more details, see the comments in the
661
+ * function bfq_better_to_idle()). For this reason, the return value
662
+ * of this function is used to check whether I/O-dispatch plugging can
663
+ * be avoided.
652664 *
653
- * Such a scenario occurs when:
665
+ * The above first case (symmetric scenario) occurs when:
654666 * 1) all active queues have the same weight,
655
- * 2) all active groups at the same level in the groups tree have the same
656
- * weight,
667
+ * 2) all active queues belong to the same I/O-priority class,
657668 * 3) all active groups at the same level in the groups tree have the same
669
+ * weight,
670
+ * 4) all active groups at the same level in the groups tree have the same
658671 * number of children.
659672 *
660673 * Unfortunately, keeping the necessary state for evaluating exactly
661674 * the last two symmetry sub-conditions above would be quite complex
662
- * and time consuming. Therefore this function evaluates, instead,
663
- * only the following stronger two sub-conditions, for which it is
675
+ * and time consuming. Therefore this function evaluates, instead,
676
+ * only the following stronger three sub-conditions, for which it is
664677 * much easier to maintain the needed state:
665678 * 1) all active queues have the same weight,
666
- * 2) there are no active groups.
679
+ * 2) all active queues belong to the same I/O-priority class,
680
+ * 3) there are no active groups.
667681 * In particular, the last condition is always true if hierarchical
668682 * support or the cgroups interface are not enabled, thus no state
669683 * needs to be maintained in this case.
670684 */
671
-static bool bfq_symmetric_scenario(struct bfq_data *bfqd)
685
+static bool bfq_asymmetric_scenario(struct bfq_data *bfqd,
686
+ struct bfq_queue *bfqq)
672687 {
673
- return !bfq_varied_queue_weights_or_active_groups(bfqd);
688
+ bool smallest_weight = bfqq &&
689
+ bfqq->weight_counter &&
690
+ bfqq->weight_counter ==
691
+ container_of(
692
+ rb_first_cached(&bfqd->queue_weights_tree),
693
+ struct bfq_weight_counter,
694
+ weights_node);
695
+
696
+ /*
697
+ * For queue weights to differ, queue_weights_tree must contain
698
+ * at least two nodes.
699
+ */
700
+ bool varied_queue_weights = !smallest_weight &&
701
+ !RB_EMPTY_ROOT(&bfqd->queue_weights_tree.rb_root) &&
702
+ (bfqd->queue_weights_tree.rb_root.rb_node->rb_left ||
703
+ bfqd->queue_weights_tree.rb_root.rb_node->rb_right);
704
+
705
+ bool multiple_classes_busy =
706
+ (bfqd->busy_queues[0] && bfqd->busy_queues[1]) ||
707
+ (bfqd->busy_queues[0] && bfqd->busy_queues[2]) ||
708
+ (bfqd->busy_queues[1] && bfqd->busy_queues[2]);
709
+
710
+ return varied_queue_weights || multiple_classes_busy
711
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
712
+ || bfqd->num_groups_with_pending_reqs > 0
713
+#endif
714
+ ;
674715 }
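For illustration only (not part of the patch): in the rewritten function above, busy_queues[0..2] appears to count busy queues per I/O-priority class, so the three pairwise && tests simply mean "at least two classes are busy at once", and the smallest_weight shortcut skips the weight-tree test when bfqq already holds the leftmost (smallest) weight counter. A standalone model of that decision, under those assumptions:

#include <stdbool.h>

struct sched_model {
	unsigned int busy_queues[3];	/* per-class busy-queue counts (assumed) */
	unsigned int distinct_weights;	/* how many different queue weights exist */
	unsigned int groups_with_reqs;	/* active groups with pending requests */
};

static bool multiple_classes_busy(const struct sched_model *m)
{
	int classes = 0;

	for (int i = 0; i < 3; i++)
		classes += m->busy_queues[i] > 0;
	return classes > 1;		/* equivalent to the three pairwise tests */
}

/* True when per-queue service guarantees may require I/O-dispatch plugging. */
static bool asymmetric_scenario(const struct sched_model *m,
				bool bfqq_has_smallest_weight)
{
	bool varied_weights = !bfqq_has_smallest_weight && m->distinct_weights > 1;

	return varied_weights || multiple_classes_busy(m) ||
	       m->groups_with_reqs > 0;
}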
675716
676717 /*
....@@ -687,10 +728,11 @@
687728 * should be low too.
688729 */
689730 void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,
690
- struct rb_root *root)
731
+ struct rb_root_cached *root)
691732 {
692733 struct bfq_entity *entity = &bfqq->entity;
693
- struct rb_node **new = &(root->rb_node), *parent = NULL;
734
+ struct rb_node **new = &(root->rb_root.rb_node), *parent = NULL;
735
+ bool leftmost = true;
694736
695737 /*
696738 * Do not insert if the queue is already associated with a
....@@ -719,8 +761,10 @@
719761 }
720762 if (entity->weight < __counter->weight)
721763 new = &((*new)->rb_left);
722
- else
764
+ else {
723765 new = &((*new)->rb_right);
766
+ leftmost = false;
767
+ }
724768 }
725769
726770 bfqq->weight_counter = kzalloc(sizeof(struct bfq_weight_counter),
....@@ -729,22 +773,22 @@
729773 /*
730774 * In the unlucky event of an allocation failure, we just
731775 * exit. This will cause the weight of queue to not be
732
- * considered in bfq_varied_queue_weights_or_active_groups,
733
- * which, in its turn, causes the scenario to be deemed
734
- * wrongly symmetric in case bfqq's weight would have been
735
- * the only weight making the scenario asymmetric. On the
736
- * bright side, no unbalance will however occur when bfqq
737
- * becomes inactive again (the invocation of this function
738
- * is triggered by an activation of queue). In fact,
739
- * bfq_weights_tree_remove does nothing if
740
- * !bfqq->weight_counter.
776
+ * considered in bfq_asymmetric_scenario, which, in its turn,
777
+ * causes the scenario to be deemed wrongly symmetric in case
778
+ * bfqq's weight would have been the only weight making the
779
+ * scenario asymmetric. On the bright side, no unbalance will
780
+ * however occur when bfqq becomes inactive again (the
781
+ * invocation of this function is triggered by an activation
782
+ * of queue). In fact, bfq_weights_tree_remove does nothing
783
+ * if !bfqq->weight_counter.
741784 */
742785 if (unlikely(!bfqq->weight_counter))
743786 return;
744787
745788 bfqq->weight_counter->weight = entity->weight;
746789 rb_link_node(&bfqq->weight_counter->weights_node, parent, new);
747
- rb_insert_color(&bfqq->weight_counter->weights_node, root);
790
+ rb_insert_color_cached(&bfqq->weight_counter->weights_node, root,
791
+ leftmost);
748792
749793 inc_counter:
750794 bfqq->weight_counter->num_active++;
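For illustration only (not part of the patch): switching the weights tree to rb_root_cached lets the scheduler read the smallest weight in O(1) through rb_first_cached(), at the cost of telling the insertion path whether the new node ended up leftmost. A hedged kernel-style sketch of that insert/lookup pattern (the weight_node type is invented; the rbtree calls are the regular <linux/rbtree.h> API):

struct weight_node {
	struct rb_node rb;
	unsigned int weight;
};

static void weight_tree_insert(struct rb_root_cached *root, struct weight_node *wn)
{
	struct rb_node **link = &root->rb_root.rb_node, *parent = NULL;
	bool leftmost = true;

	while (*link) {
		struct weight_node *cur = rb_entry(*link, struct weight_node, rb);

		parent = *link;
		if (wn->weight < cur->weight) {
			link = &(*link)->rb_left;
		} else {
			link = &(*link)->rb_right;
			leftmost = false;	/* not the smallest key any more */
		}
	}

	rb_link_node(&wn->rb, parent, link);
	rb_insert_color_cached(&wn->rb, root, leftmost);
}

static struct weight_node *weight_tree_min(struct rb_root_cached *root)
{
	struct rb_node *first = rb_first_cached(root);	/* cached leftmost node */

	return first ? rb_entry(first, struct weight_node, rb) : NULL;
}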
....@@ -759,7 +803,7 @@
759803 */
760804 void __bfq_weights_tree_remove(struct bfq_data *bfqd,
761805 struct bfq_queue *bfqq,
762
- struct rb_root *root)
806
+ struct rb_root_cached *root)
763807 {
764808 if (!bfqq->weight_counter)
765809 return;
....@@ -768,7 +812,7 @@
768812 if (bfqq->weight_counter->num_active > 0)
769813 goto reset_entity_pointer;
770814
771
- rb_erase(&bfqq->weight_counter->weights_node, root);
815
+ rb_erase_cached(&bfqq->weight_counter->weights_node, root);
772816 kfree(bfqq->weight_counter);
773817
774818 reset_entity_pointer:
....@@ -882,7 +926,8 @@
882926 static unsigned long bfq_serv_to_charge(struct request *rq,
883927 struct bfq_queue *bfqq)
884928 {
885
- if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1)
929
+ if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1 ||
930
+ bfq_asymmetric_scenario(bfqq->bfqd, bfqq))
886931 return blk_rq_sectors(rq);
887932
888933 return blk_rq_sectors(rq) * bfq_async_charge_factor;
....@@ -916,8 +961,10 @@
916961 */
917962 return;
918963
919
- new_budget = max_t(unsigned long, bfqq->max_budget,
920
- bfq_serv_to_charge(next_rq, bfqq));
964
+ new_budget = max_t(unsigned long,
965
+ max_t(unsigned long, bfqq->max_budget,
966
+ bfq_serv_to_charge(next_rq, bfqq)),
967
+ entity->service);
921968 if (entity->budget != new_budget) {
922969 entity->budget = new_budget;
923970 bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",
....@@ -946,7 +993,7 @@
946993 * of several files
947994 * mplayer took 23 seconds to start, if constantly weight-raised.
948995 *
949
- * As for higher values than that accomodating the above bad
996
+ * As for higher values than that accommodating the above bad
950997 * scenario, tests show that higher values would often yield
951998 * the opposite of the desired result, i.e., would worsen
952999 * responsiveness by allowing non-interactive applications to
....@@ -985,6 +1032,7 @@
9851032 else
9861033 bfq_clear_bfqq_IO_bound(bfqq);
9871034
1035
+ bfqq->entity.new_weight = bic->saved_weight;
9881036 bfqq->ttime = bic->saved_ttime;
9891037 bfqq->wr_coeff = bic->saved_wr_coeff;
9901038 bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt;
....@@ -1020,7 +1068,7 @@
10201068
10211069 static int bfqq_process_refs(struct bfq_queue *bfqq)
10221070 {
1023
- return bfqq->ref - bfqq->allocated - bfqq->entity.on_st -
1071
+ return bfqq->ref - bfqq->allocated - bfqq->entity.on_st_or_in_serv -
10241072 (bfqq->weight_counter != NULL);
10251073 }
10261074
....@@ -1032,8 +1080,18 @@
10321080
10331081 hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node)
10341082 hlist_del_init(&item->burst_list_node);
1035
- hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
1036
- bfqd->burst_size = 1;
1083
+
1084
+ /*
1085
+ * Start the creation of a new burst list only if there is no
1086
+ * active queue. See comments on the conditional invocation of
1087
+ * bfq_handle_burst().
1088
+ */
1089
+ if (bfq_tot_busy_queues(bfqd) == 0) {
1090
+ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
1091
+ bfqd->burst_size = 1;
1092
+ } else
1093
+ bfqd->burst_size = 0;
1094
+
10371095 bfqd->burst_parent_entity = bfqq->entity.parent;
10381096 }
10391097
....@@ -1089,7 +1147,8 @@
10891147 * many parallel threads/processes. Examples are systemd during boot,
10901148 * or git grep. To help these processes get their job done as soon as
10911149 * possible, it is usually better to not grant either weight-raising
1092
- * or device idling to their queues.
1150
+ * or device idling to their queues, unless these queues must be
1151
+ * protected from the I/O flowing through other active queues.
10931152 *
10941153 * In this comment we describe, firstly, the reasons why this fact
10951154 * holds, and, secondly, the next function, which implements the main
....@@ -1101,7 +1160,10 @@
11011160 * cumulatively served, the sooner the target job of these queues gets
11021161 * completed. As a consequence, weight-raising any of these queues,
11031162 * which also implies idling the device for it, is almost always
1104
- * counterproductive. In most cases it just lowers throughput.
1163
+ * counterproductive, unless there are other active queues to isolate
1164
+ * these new queues from. If there are no other active queues, then
+ * these new queues from. If there are no other active queues, then
1165
+ * weight-raising these new queues just lowers throughput in most
1166
+ * cases.
11051167 *
11061168 * On the other hand, a burst of queue creations may be caused also by
11071169 * the start of an application that does not consist of a lot of
....@@ -1135,14 +1197,16 @@
11351197 * are very rare. They typically occur if some service happens to
11361198 * start doing I/O exactly when the interactive task starts.
11371199 *
1138
- * Turning back to the next function, it implements all the steps
1139
- * needed to detect the occurrence of a large burst and to properly
1140
- * mark all the queues belonging to it (so that they can then be
1141
- * treated in a different way). This goal is achieved by maintaining a
1142
- * "burst list" that holds, temporarily, the queues that belong to the
1143
- * burst in progress. The list is then used to mark these queues as
1144
- * belonging to a large burst if the burst does become large. The main
1145
- * steps are the following.
1200
+ * Turning back to the next function, it is invoked only if there are
1201
+ * no active queues (apart from active queues that would belong to the
1202
+ * same, possible burst bfqq would belong to), and it implements all
1203
+ * the steps needed to detect the occurrence of a large burst and to
1204
+ * properly mark all the queues belonging to it (so that they can then
1205
+ * be treated in a different way). This goal is achieved by
1206
+ * maintaining a "burst list" that holds, temporarily, the queues that
1207
+ * belong to the burst in progress. The list is then used to mark
1208
+ * these queues as belonging to a large burst if the burst does become
1209
+ * large. The main steps are the following.
11461210 *
11471211 * . when the very first queue is created, the queue is inserted into the
11481212 * list (as it could be the first queue in a possible burst)
....@@ -1376,21 +1440,31 @@
13761440 * mechanism may be re-designed in such a way to make it possible to
13771441 * know whether preemption is needed without needing to update service
13781442 * trees). In addition, queue preemptions almost always cause random
1379
- * I/O, and thus loss of throughput. Because of these facts, the next
1380
- * function adopts the following simple scheme to avoid both costly
1381
- * operations and too frequent preemptions: it requests the expiration
1382
- * of the in-service queue (unconditionally) only for queues that need
1383
- * to recover a hole, or that either are weight-raised or deserve to
1384
- * be weight-raised.
1443
+ * I/O, which may in turn cause loss of throughput. Finally, there may
1444
+ * even be no in-service queue when the next function is invoked (so,
1445
+ * no queue to compare timestamps with). Because of these facts, the
1446
+ * next function adopts the following simple scheme to avoid costly
1447
+ * operations, too frequent preemptions and too many dependencies on
1448
+ * the state of the scheduler: it requests the expiration of the
1449
+ * in-service queue (unconditionally) only for queues that need to
1450
+ * recover a hole. Then it delegates to other parts of the code the
1451
+ * responsibility of handling the above case 2.
13851452 */
13861453 static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
13871454 struct bfq_queue *bfqq,
1388
- bool arrived_in_time,
1389
- bool wr_or_deserves_wr)
1455
+ bool arrived_in_time)
13901456 {
13911457 struct bfq_entity *entity = &bfqq->entity;
13921458
1393
- if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) {
1459
+ /*
1460
+ * In the next compound condition, we check also whether there
1461
+ * is some budget left, because otherwise there is no point in
1462
+ * trying to go on serving bfqq with this same budget: bfqq
1463
+ * would be expired immediately after being selected for
1464
+ * service. This would only cause useless overhead.
1465
+ */
1466
+ if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time &&
1467
+ bfq_bfqq_budget_left(bfqq) > 0) {
13941468 /*
13951469 * We do not clear the flag non_blocking_wait_rq here, as
13961470 * the latter is used in bfq_activate_bfqq to signal
....@@ -1433,7 +1507,7 @@
14331507 entity->budget = max_t(unsigned long, bfqq->max_budget,
14341508 bfq_serv_to_charge(bfqq->next_rq, bfqq));
14351509 bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
1436
- return wr_or_deserves_wr;
1510
+ return false;
14371511 }
14381512
14391513 /*
....@@ -1551,6 +1625,36 @@
15511625 bfqd->bfq_wr_min_idle_time);
15521626 }
15531627
1628
+
1629
+/*
1630
+ * Return true if bfqq is in a higher priority class, or has a higher
1631
+ * weight than the in-service queue.
1632
+ */
1633
+static bool bfq_bfqq_higher_class_or_weight(struct bfq_queue *bfqq,
1634
+ struct bfq_queue *in_serv_bfqq)
1635
+{
1636
+ int bfqq_weight, in_serv_weight;
1637
+
1638
+ if (bfqq->ioprio_class < in_serv_bfqq->ioprio_class)
1639
+ return true;
1640
+
1641
+ if (in_serv_bfqq->entity.parent == bfqq->entity.parent) {
1642
+ bfqq_weight = bfqq->entity.weight;
1643
+ in_serv_weight = in_serv_bfqq->entity.weight;
1644
+ } else {
1645
+ if (bfqq->entity.parent)
1646
+ bfqq_weight = bfqq->entity.parent->weight;
1647
+ else
1648
+ bfqq_weight = bfqq->entity.weight;
1649
+ if (in_serv_bfqq->entity.parent)
1650
+ in_serv_weight = in_serv_bfqq->entity.parent->weight;
1651
+ else
1652
+ in_serv_weight = in_serv_bfqq->entity.weight;
1653
+ }
1654
+
1655
+ return bfqq_weight > in_serv_weight;
1656
+}
1657
+
15541658 static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
15551659 struct bfq_queue *bfqq,
15561660 int old_wr_coeff,
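For illustration only (not part of the patch): bfq_bfqq_higher_class_or_weight() above orders queues first by I/O-priority class (a lower ioprio_class value is a higher class) and then by the effective weight, where queues sitting in different groups are compared through their parent groups' weights. A standalone model of that comparison, with simplified types:

#include <stdbool.h>

struct grp { unsigned int weight; };

struct q {
	unsigned int ioprio_class;	/* lower value = higher class (RT < BE < IDLE) */
	unsigned int weight;
	struct grp *parent;		/* NULL when directly under the root group */
};

static unsigned int effective_weight(const struct q *a, const struct q *b)
{
	/* Same parent: queue weights compete directly.
	 * Different parents: what competes is the parent group's weight. */
	if (a->parent == b->parent || !a->parent)
		return a->weight;
	return a->parent->weight;
}

static bool higher_class_or_weight(const struct q *q, const struct q *in_serv)
{
	if (q->ioprio_class < in_serv->ioprio_class)
		return true;
	return effective_weight(q, in_serv) > effective_weight(in_serv, q);
}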
....@@ -1579,6 +1683,7 @@
15791683 */
15801684 in_burst = bfq_bfqq_in_large_burst(bfqq);
15811685 soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&
1686
+ !BFQQ_TOTALLY_SEEKY(bfqq) &&
15821687 !in_burst &&
15831688 time_is_before_jiffies(bfqq->soft_rt_next_start) &&
15841689 bfqq->dispatched == 0;
....@@ -1594,8 +1699,7 @@
15941699 */
15951700 bfqq_wants_to_preempt =
15961701 bfq_bfqq_update_budg_for_activation(bfqd, bfqq,
1597
- arrived_in_time,
1598
- wr_or_deserves_wr);
1702
+ arrived_in_time);
15991703
16001704 /*
16011705 * If bfqq happened to be activated in a burst, but has been
....@@ -1660,19 +1764,109 @@
16601764
16611765 /*
16621766 * Expire in-service queue only if preemption may be needed
1663
- * for guarantees. In this respect, the function
1664
- * next_queue_may_preempt just checks a simple, necessary
1665
- * condition, and not a sufficient condition based on
1666
- * timestamps. In fact, for the latter condition to be
1667
- * evaluated, timestamps would need first to be updated, and
1668
- * this operation is quite costly (see the comments on the
1669
- * function bfq_bfqq_update_budg_for_activation).
1767
+ * for guarantees. In particular, we care only about two
1768
+ * cases. The first is that bfqq has to recover a service
1769
+ * hole, as explained in the comments on
1770
+ * bfq_bfqq_update_budg_for_activation(), i.e., that
1771
+ * bfqq_wants_to_preempt is true. However, if bfqq does not
1772
+ * carry time-critical I/O, then bfqq's bandwidth is less
1773
+ * important than that of queues that carry time-critical I/O.
1774
+ * So, as a further constraint, we consider this case only if
1775
+ * bfqq is at least as weight-raised, i.e., at least as time
1776
+ * critical, as the in-service queue.
1777
+ *
1778
+ * The second case is that bfqq is in a higher priority class,
1779
+ * or has a higher weight than the in-service queue. If this
1780
+ * condition does not hold, we don't care because, even if
1781
+ * bfqq does not start to be served immediately, the resulting
1782
+ * delay for bfqq's I/O is however lower or much lower than
1783
+ * the ideal completion time to be guaranteed to bfqq's I/O.
1784
+ *
1785
+ * In both cases, preemption is needed only if, according to
1786
+ * the timestamps of both bfqq and of the in-service queue,
1787
+ * bfqq actually is the next queue to serve. So, to reduce
1788
+ * useless preemptions, the return value of
1789
+ * next_queue_may_preempt() is considered in the next compound
1790
+ * condition too. Yet next_queue_may_preempt() just checks a
1791
+ * simple, necessary condition for bfqq to be the next queue
1792
+ * to serve. In fact, to evaluate a sufficient condition, the
1793
+ * timestamps of the in-service queue would need to be
1794
+ * updated, and this operation is quite costly (see the
1795
+ * comments on bfq_bfqq_update_budg_for_activation()).
16701796 */
1671
- if (bfqd->in_service_queue && bfqq_wants_to_preempt &&
1672
- bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff &&
1797
+ if (bfqd->in_service_queue &&
1798
+ ((bfqq_wants_to_preempt &&
1799
+ bfqq->wr_coeff >= bfqd->in_service_queue->wr_coeff) ||
1800
+ bfq_bfqq_higher_class_or_weight(bfqq, bfqd->in_service_queue)) &&
16731801 next_queue_may_preempt(bfqd))
16741802 bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
16751803 false, BFQQE_PREEMPTED);
1804
+}
1805
+
1806
+static void bfq_reset_inject_limit(struct bfq_data *bfqd,
1807
+ struct bfq_queue *bfqq)
1808
+{
1809
+ /* invalidate baseline total service time */
1810
+ bfqq->last_serv_time_ns = 0;
1811
+
1812
+ /*
1813
+ * Reset pointer in case we are waiting for
1814
+ * some request completion.
1815
+ */
1816
+ bfqd->waited_rq = NULL;
1817
+
1818
+ /*
1819
+ * If bfqq has a short think time, then start by setting the
1820
+ * inject limit to 0 prudentially, because the service time of
1821
+ * an injected I/O request may be higher than the think time
1822
+ * of bfqq, and therefore, if one request was injected when
1823
+ * bfqq remains empty, this injected request might delay the
1824
+ * service of the next I/O request for bfqq significantly. In
1825
+ * case bfqq can actually tolerate some injection, then the
1826
+ * adaptive update will however raise the limit soon. This
1827
+ * lucky circumstance holds exactly because bfqq has a short
1828
+ * think time, and thus, after remaining empty, is likely to
1829
+ * get new I/O enqueued---and then completed---before being
1830
+ * expired. This is the very pattern that gives the
1831
+ * limit-update algorithm the chance to measure the effect of
1832
+ * injection on request service times, and then to update the
1833
+ * limit accordingly.
1834
+ *
1835
+ * However, in the following special case, the inject limit is
1836
+ * left to 1 even if the think time is short: bfqq's I/O is
1837
+ * synchronized with that of some other queue, i.e., bfqq may
1838
+ * receive new I/O only after the I/O of the other queue is
1839
+ * completed. Keeping the inject limit to 1 allows the
1840
+ * blocking I/O to be served while bfqq is in service. And
1841
+ * this is very convenient both for bfqq and for overall
1842
+ * throughput, as explained in detail in the comments in
1843
+ * bfq_update_has_short_ttime().
1844
+ *
1845
+ * On the opposite end, if bfqq has a long think time, then
1846
+ * start directly by 1, because:
1847
+ * a) on the bright side, keeping at most one request in
1848
+ * service in the drive is unlikely to cause any harm to the
1849
+ * latency of bfqq's requests, as the service time of a single
1850
+ * request is likely to be lower than the think time of bfqq;
1851
+ * b) on the downside, after becoming empty, bfqq is likely to
1852
+ * expire before getting its next request. With this request
1853
+ * arrival pattern, it is very hard to sample total service
1854
+ * times and update the inject limit accordingly (see comments
1855
+ * on bfq_update_inject_limit()). So the limit is likely to be
1856
+ * never, or at least seldom, updated. As a consequence, by
1857
+ * setting the limit to 1, we avoid that no injection ever
1858
+ * occurs with bfqq. On the downside, this proactive step
1859
+ * further reduces chances to actually compute the baseline
1860
+ * total service time. Thus it reduces chances to execute the
1861
+ * limit-update algorithm and possibly raise the limit to more
1862
+ * than 1.
1863
+ */
1864
+ if (bfq_bfqq_has_short_ttime(bfqq))
1865
+ bfqq->inject_limit = 0;
1866
+ else
1867
+ bfqq->inject_limit = 1;
1868
+
1869
+ bfqq->decrease_time_jif = jiffies;
16761870 }
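For illustration only (not part of the patch): bfq_reset_inject_limit() above invalidates the baseline service-time sample and restarts the injection limit at 0 for queues with a short think time (prudent; the adaptive update will raise it) and at 1 otherwise (so some injection can happen at all), stamping decrease_time_jif so the periodic reset in bfq_add_request() fires at most about once per second. A standalone model of that decision, with illustrative names:

#include <stdbool.h>
#include <stdint.h>

struct inject_state {
	unsigned int inject_limit;	/* max foreign requests injected while empty */
	uint64_t last_serv_time_ns;	/* baseline total service time, 0 = invalid */
	uint64_t decrease_time_jif;	/* time of the last limit update */
};

static void reset_inject_limit(struct inject_state *st, bool short_think_time,
			       uint64_t now_jiffies)
{
	st->last_serv_time_ns = 0;	/* force a fresh baseline measurement */

	/* Short think time: start from 0 and let measurements raise the limit.
	 * Long think time: start from 1 so injection is not ruled out forever. */
	st->inject_limit = short_think_time ? 0 : 1;
	st->decrease_time_jif = now_jiffies;
}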
16771871
16781872 static void bfq_add_request(struct request *rq)
....@@ -1687,6 +1881,180 @@
16871881 bfqq->queued[rq_is_sync(rq)]++;
16881882 bfqd->queued++;
16891883
1884
+ if (RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_sync(bfqq)) {
1885
+ /*
1886
+ * Detect whether bfqq's I/O seems synchronized with
1887
+ * that of some other queue, i.e., whether bfqq, after
1888
+ * remaining empty, happens to receive new I/O only
1889
+ * right after some I/O request of the other queue has
1890
+ * been completed. We call waker queue the other
1891
+ * queue, and we assume, for simplicity, that bfqq may
1892
+ * have at most one waker queue.
1893
+ *
1894
+ * A remarkable throughput boost can be reached by
1895
+ * unconditionally injecting the I/O of the waker
1896
+ * queue, every time a new bfq_dispatch_request
1897
+ * happens to be invoked while I/O is being plugged
1898
+ * for bfqq. In addition to boosting throughput, this
1899
+ * unblocks bfqq's I/O, thereby improving bandwidth
1900
+ * and latency for bfqq. Note that these same results
1901
+ * may be achieved with the general injection
1902
+ * mechanism, but less effectively. For details on
1903
+ * this aspect, see the comments on the choice of the
1904
+ * queue for injection in bfq_select_queue().
1905
+ *
1906
+ * Turning back to the detection of a waker queue, a
1907
+ * queue Q is deemed as a waker queue for bfqq if, for
1908
+ * two consecutive times, bfqq happens to become non
1909
+ * empty right after a request of Q has been
1910
+ * completed. In particular, on the first time, Q is
1911
+ * tentatively set as a candidate waker queue, while
1912
+ * on the second time, the flag
1913
+ * bfq_bfqq_has_waker(bfqq) is set to confirm that Q
1914
+ * is a waker queue for bfqq. These detection steps
1915
+ * are performed only if bfqq has a long think time,
1916
+ * so as to make it more likely that bfqq's I/O is
1917
+ * actually being blocked by a synchronization. This
1918
+ * last filter, plus the above two-times requirement,
1919
+ * make false positives less likely.
1920
+ *
1921
+ * NOTE
1922
+ *
1923
+ * The sooner a waker queue is detected, the sooner
1924
+ * throughput can be boosted by injecting I/O from the
1925
+ * waker queue. Fortunately, detection is likely to be
1926
+ * actually fast, for the following reasons. While
1927
+ * blocked by synchronization, bfqq has a long think
1928
+ * time. This implies that bfqq's inject limit is at
1929
+ * least equal to 1 (see the comments in
1930
+ * bfq_update_inject_limit()). So, thanks to
1931
+ * injection, the waker queue is likely to be served
1932
+ * during the very first I/O-plugging time interval
1933
+ * for bfqq. This triggers the first step of the
1934
+ * detection mechanism. Thanks again to injection, the
1935
+ * candidate waker queue is then likely to be
1936
+ * confirmed no later than during the next
1937
+ * I/O-plugging interval for bfqq.
1938
+ */
1939
+ if (bfqd->last_completed_rq_bfqq &&
1940
+ !bfq_bfqq_has_short_ttime(bfqq) &&
1941
+ ktime_get_ns() - bfqd->last_completion <
1942
+ 200 * NSEC_PER_USEC) {
1943
+ if (bfqd->last_completed_rq_bfqq != bfqq &&
1944
+ bfqd->last_completed_rq_bfqq !=
1945
+ bfqq->waker_bfqq) {
1946
+ /*
1947
+ * First synchronization detected with
1948
+ * a candidate waker queue, or with a
1949
+ * different candidate waker queue
1950
+ * from the current one.
1951
+ */
1952
+ bfqq->waker_bfqq = bfqd->last_completed_rq_bfqq;
1953
+
1954
+ /*
1955
+ * If the waker queue disappears, then
1956
+ * bfqq->waker_bfqq must be reset. To
1957
+ * this goal, we maintain in each
1958
+ * waker queue a list, woken_list, of
1959
+ * all the queues that reference the
1960
+ * waker queue through their
1961
+ * waker_bfqq pointer. When the waker
1962
+ * queue exits, the waker_bfqq pointer
1963
+ * of all the queues in the woken_list
1964
+ * is reset.
1965
+ *
1966
+ * In addition, if bfqq is already in
1967
+ * the woken_list of a waker queue,
1968
+ * then, before being inserted into
1969
+ * the woken_list of a new waker
1970
+ * queue, bfqq must be removed from
1971
+ * the woken_list of the old waker
1972
+ * queue.
1973
+ */
1974
+ if (!hlist_unhashed(&bfqq->woken_list_node))
1975
+ hlist_del_init(&bfqq->woken_list_node);
1976
+ hlist_add_head(&bfqq->woken_list_node,
1977
+ &bfqd->last_completed_rq_bfqq->woken_list);
1978
+
1979
+ bfq_clear_bfqq_has_waker(bfqq);
1980
+ } else if (bfqd->last_completed_rq_bfqq ==
1981
+ bfqq->waker_bfqq &&
1982
+ !bfq_bfqq_has_waker(bfqq)) {
1983
+ /*
1984
+ * synchronization with waker_bfqq
1985
+ * seen for the second time
1986
+ */
1987
+ bfq_mark_bfqq_has_waker(bfqq);
1988
+ }
1989
+ }
1990
+
1991
+ /*
1992
+ * Periodically reset inject limit, to make sure that
1993
+ * the latter eventually drops in case workload
1994
+ * changes, see step (3) in the comments on
1995
+ * bfq_update_inject_limit().
1996
+ */
1997
+ if (time_is_before_eq_jiffies(bfqq->decrease_time_jif +
1998
+ msecs_to_jiffies(1000)))
1999
+ bfq_reset_inject_limit(bfqd, bfqq);
2000
+
2001
+ /*
2002
+ * The following conditions must hold to setup a new
2003
+ * sampling of total service time, and then a new
2004
+ * update of the inject limit:
2005
+ * - bfqq is in service, because the total service
2006
+ * time is evaluated only for the I/O requests of
2007
+ * the queues in service;
2008
+ * - this is the right occasion to compute or to
2009
+ * lower the baseline total service time, because
2010
+ * there are actually no requests in the drive,
2011
+ * or
2012
+ * the baseline total service time is available, and
2013
+ * this is the right occasion to compute the other
2014
+ * quantity needed to update the inject limit, i.e.,
2015
+ * the total service time caused by the amount of
2016
+ * injection allowed by the current value of the
2017
+ * limit. It is the right occasion because injection
2018
+ * has actually been performed during the service
2019
+ * hole, and there are still in-flight requests,
2020
+ * which are very likely to be exactly the injected
2021
+ * requests, or part of them;
2022
+ * - the minimum interval for sampling the total
2023
+ * service time and updating the inject limit has
2024
+ * elapsed.
2025
+ */
2026
+ if (bfqq == bfqd->in_service_queue &&
2027
+ (bfqd->rq_in_driver == 0 ||
2028
+ (bfqq->last_serv_time_ns > 0 &&
2029
+ bfqd->rqs_injected && bfqd->rq_in_driver > 0)) &&
2030
+ time_is_before_eq_jiffies(bfqq->decrease_time_jif +
2031
+ msecs_to_jiffies(10))) {
2032
+ bfqd->last_empty_occupied_ns = ktime_get_ns();
2033
+ /*
2034
+ * Start the state machine for measuring the
2035
+ * total service time of rq: setting
2036
+ * wait_dispatch will cause bfqd->waited_rq to
2037
+ * be set when rq will be dispatched.
2038
+ */
2039
+ bfqd->wait_dispatch = true;
2040
+ /*
2041
+ * If there is no I/O in service in the drive,
2042
+ * then possible injection occurred before the
2043
+ * arrival of rq will not affect the total
2044
+ * service time of rq. So the injection limit
2045
+ * must not be updated as a function of such
2046
+ * total service time, unless new injection
2047
+ * occurs before rq is completed. To have the
2048
+ * injection limit updated only in the latter
2049
+ * case, reset rqs_injected here (rqs_injected
2050
+ * will be set in case injection is performed
2051
+ * on bfqq before rq is completed).
2052
+ */
2053
+ if (bfqd->rq_in_driver == 0)
2054
+ bfqd->rqs_injected = false;
2055
+ }
2056
+ }
2057
+
16902058 elv_rb_add(&bfqq->sort_list, rq);
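For illustration only (not part of the patch): the waker-detection code added above is essentially a two-step state machine: a queue whose request completed within the 200 us window just before bfqq turned non-empty becomes the candidate waker, and the same queue observed a second time is confirmed (has_waker). A standalone model of those steps, with hypothetical names and the window taken from the hunk:

#include <stdbool.h>
#include <stdint.h>

#define WAKER_WINDOW_NS	(200u * 1000u)	/* 200 us, as in the hunk above */

struct waker_model {
	int waker_id;			/* candidate waker queue, -1 = none */
	bool has_waker;			/* candidate seen twice, hence confirmed */
};

/* Called when queue_id becomes non-empty; last_completer_id is the queue whose
 * request completed most recently, at last_completion_ns. */
static void update_waker(struct waker_model *m, int queue_id,
			 int last_completer_id, uint64_t now_ns,
			 uint64_t last_completion_ns, bool short_think_time)
{
	if (last_completer_id < 0 || short_think_time ||
	    now_ns - last_completion_ns >= WAKER_WINDOW_NS)
		return;			/* not plausibly woken by that completion */

	if (last_completer_id != queue_id && last_completer_id != m->waker_id) {
		m->waker_id = last_completer_id;	/* first sighting: candidate */
		m->has_waker = false;
	} else if (last_completer_id == m->waker_id && !m->has_waker) {
		m->has_waker = true;			/* second sighting: confirmed */
	}
}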
16912059
16922060 /*
....@@ -1698,8 +2066,9 @@
16982066
16992067 /*
17002068 * Adjust priority tree position, if next_rq changes.
2069
+ * See comments on bfq_pos_tree_add_move() for the unlikely().
17012070 */
1702
- if (prev != bfqq->next_rq)
2071
+ if (unlikely(!bfqd->nonrot_with_queueing && prev != bfqq->next_rq))
17032072 bfq_pos_tree_add_move(bfqd, bfqq);
17042073
17052074 if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... */
....@@ -1839,7 +2208,9 @@
18392208 bfqq->pos_root = NULL;
18402209 }
18412210 } else {
1842
- bfq_pos_tree_add_move(bfqd, bfqq);
2211
+ /* see comments on bfq_pos_tree_add_move() for the unlikely() */
2212
+ if (unlikely(!bfqd->nonrot_with_queueing))
2213
+ bfq_pos_tree_add_move(bfqd, bfqq);
18432214 }
18442215
18452216 if (rq->cmd_flags & REQ_META)
....@@ -1847,9 +2218,9 @@
18472218
18482219 }
18492220
1850
-static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
2221
+static bool bfq_bio_merge(struct request_queue *q, struct bio *bio,
2222
+ unsigned int nr_segs)
18512223 {
1852
- struct request_queue *q = hctx->queue;
18532224 struct bfq_data *bfqd = q->elevator->elevator_data;
18542225 struct request *free = NULL;
18552226 /*
....@@ -1864,13 +2235,20 @@
18642235
18652236 spin_lock_irq(&bfqd->lock);
18662237
1867
- if (bic)
2238
+ if (bic) {
2239
+ /*
2240
+ * Make sure cgroup info is uptodate for current process before
2241
+ * considering the merge.
2242
+ */
2243
+ bfq_bic_update_cgroup(bic, bio);
2244
+
18682245 bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf));
1869
- else
2246
+ } else {
18702247 bfqd->bio_bfqq = NULL;
2248
+ }
18712249 bfqd->bio_bic = bic;
18722250
1873
- ret = blk_mq_sched_try_merge(q, bio, &free);
2251
+ ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free);
18742252
18752253 if (free)
18762254 blk_mq_free_request(free);
....@@ -1888,13 +2266,14 @@
18882266 __rq = bfq_find_rq_fmerge(bfqd, bio, q);
18892267 if (__rq && elv_bio_merge_ok(__rq, bio)) {
18902268 *req = __rq;
2269
+
2270
+ if (blk_discard_mergable(__rq))
2271
+ return ELEVATOR_DISCARD_MERGE;
18912272 return ELEVATOR_FRONT_MERGE;
18922273 }
18932274
18942275 return ELEVATOR_NO_MERGE;
18952276 }
1896
-
1897
-static struct bfq_queue *bfq_init_rq(struct request *rq);
18982277
18992278 static void bfq_request_merged(struct request_queue *q, struct request *req,
19002279 enum elv_merge type)
....@@ -1904,7 +2283,7 @@
19042283 blk_rq_pos(req) <
19052284 blk_rq_pos(container_of(rb_prev(&req->rb_node),
19062285 struct request, rb_node))) {
1907
- struct bfq_queue *bfqq = bfq_init_rq(req);
2286
+ struct bfq_queue *bfqq = RQ_BFQQ(req);
19082287 struct bfq_data *bfqd;
19092288 struct request *prev, *next_rq;
19102289
....@@ -1929,7 +2308,12 @@
19292308 */
19302309 if (prev != bfqq->next_rq) {
19312310 bfq_updated_next_req(bfqd, bfqq);
1932
- bfq_pos_tree_add_move(bfqd, bfqq);
2311
+ /*
2312
+ * See comments on bfq_pos_tree_add_move() for
2313
+ * the unlikely().
2314
+ */
2315
+ if (unlikely(!bfqd->nonrot_with_queueing))
2316
+ bfq_pos_tree_add_move(bfqd, bfqq);
19332317 }
19342318 }
19352319 }
....@@ -1951,8 +2335,8 @@
19512335 static void bfq_requests_merged(struct request_queue *q, struct request *rq,
19522336 struct request *next)
19532337 {
1954
- struct bfq_queue *bfqq = bfq_init_rq(rq),
1955
- *next_bfqq = bfq_init_rq(next);
2338
+ struct bfq_queue *bfqq = RQ_BFQQ(rq),
2339
+ *next_bfqq = RQ_BFQQ(next);
19562340
19572341 if (!bfqq)
19582342 return;
....@@ -2131,6 +2515,14 @@
21312515 if (process_refs == 0 || new_process_refs == 0)
21322516 return NULL;
21332517
2518
+ /*
2519
+ * Make sure merged queues belong to the same parent. Parents could
2520
+ * have changed since the time we decided the two queues are suitable
2521
+ * for merging.
2522
+ */
2523
+ if (new_bfqq->entity.parent != bfqq->entity.parent)
2524
+ return NULL;
2525
+
21342526 bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
21352527 new_bfqq->pid);
21362528
....@@ -2155,6 +2547,15 @@
21552547 * are likely to increase the throughput.
21562548 */
21572549 bfqq->new_bfqq = new_bfqq;
2550
+ /*
2551
+ * The above assignment schedules the following redirections:
2552
+ * each time some I/O for bfqq arrives, the process that
2553
+ * generated that I/O is disassociated from bfqq and
2554
+ * associated with new_bfqq. Here we increase new_bfqq->ref
2555
+ * in advance, adding the number of processes that are
2556
+ * expected to be associated with new_bfqq as they happen to
2557
+ * issue I/O.
2558
+ */
21582559 new_bfqq->ref += process_refs;
21592560 return new_bfqq;
21602561 }
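For illustration only (not part of the patch): bfq_setup_merge() above works in "process references", i.e. the references not already accounted for by allocated requests, by the service trees or by the weight counter, and transfers that many references to new_bfqq up front because every future I/O from those processes will be redirected there. A standalone sketch of that accounting, with simplified fields:

struct mq {
	int ref;		/* total references */
	int allocated;		/* held by allocated/in-flight requests */
	int on_tree;		/* 1 while on the service trees or in service */
	int has_weight_ctr;	/* 1 while a weight counter pins the queue */
	struct mq *new_q;	/* scheduled merge target, like bfqq->new_bfqq */
};

static int process_refs(const struct mq *q)
{
	return q->ref - q->allocated - q->on_tree - q->has_weight_ctr;
}

/* Schedule a merge of q into target and hand target one reference per
 * process expected to follow the redirection. */
static int schedule_merge(struct mq *q, struct mq *target)
{
	int refs = process_refs(q);

	if (refs == 0 || process_refs(target) == 0)
		return -1;		/* nothing can be merged safely */

	q->new_q = target;
	target->ref += refs;
	return 0;
}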
....@@ -2214,6 +2615,50 @@
22142615 {
22152616 struct bfq_queue *in_service_bfqq, *new_bfqq;
22162617
2618
+ /* if a merge has already been setup, then proceed with that first */
2619
+ if (bfqq->new_bfqq)
2620
+ return bfqq->new_bfqq;
2621
+
2622
+ /*
2623
+ * Do not perform queue merging if the device is non
2624
+ * rotational and performs internal queueing. In fact, such a
2625
+ * device reaches a high speed through internal parallelism
2626
+ * and pipelining. This means that, to reach a high
2627
+ * throughput, it must have many requests enqueued at the same
2628
+ * time. But, in this configuration, the internal scheduling
2629
+ * algorithm of the device does exactly the job of queue
2630
+ * merging: it reorders requests so as to obtain as much as
2631
+ * possible a sequential I/O pattern. As a consequence, with
2632
+ * the workload generated by processes doing interleaved I/O,
2633
+ * the throughput reached by the device is likely to be the
2634
+ * same, with and without queue merging.
2635
+ *
2636
+ * Disabling merging also provides a remarkable benefit in
2637
+ * terms of throughput. Merging tends to make many workloads
2638
+ * artificially more uneven, because of shared queues
2639
+ * remaining non empty for incomparably more time than
2640
+ * non-merged queues. This may accentuate workload
2641
+ * asymmetries. For example, if one of the queues in a set of
2642
+ * merged queues has a higher weight than a normal queue, then
2643
+ * the shared queue may inherit such a high weight and, by
2644
+ * staying almost always active, may force BFQ to perform I/O
2645
+ * plugging most of the time. This evidently makes it harder
2646
+ * for BFQ to let the device reach a high throughput.
2647
+ *
2648
+ * Finally, the likely() macro below is not used because one
2649
+ * of the two branches is more likely than the other, but to
2650
+ * have the code path after the following if() executed as
2651
+ * fast as possible for the case of a non rotational device
2652
+ * with queueing. We want it because this is the fastest kind
2653
+ * of device. On the opposite end, the likely() may lengthen
2654
+ * the execution time of BFQ for the case of slower devices
2655
+ * (rotational or at least without queueing). But in this case
2656
+ * the execution time of BFQ matters very little, if not at
2657
+ * all.
2658
+ */
2659
+ if (likely(bfqd->nonrot_with_queueing))
2660
+ return NULL;
2661
+
22172662 /*
22182663 * Prevent bfqq from being merged if it has been created too
22192664 * long ago. The idea is that true cooperating processes, and
....@@ -2228,14 +2673,11 @@
22282673 if (bfq_too_late_for_merging(bfqq))
22292674 return NULL;
22302675
2231
- if (bfqq->new_bfqq)
2232
- return bfqq->new_bfqq;
2233
-
22342676 if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq))
22352677 return NULL;
22362678
22372679 /* If there is only one backlogged queue, don't search. */
2238
- if (bfqd->busy_queues == 1)
2680
+ if (bfq_tot_busy_queues(bfqd) == 1)
22392681 return NULL;
22402682
22412683 in_service_bfqq = bfqd->in_service_queue;
....@@ -2277,6 +2719,7 @@
22772719 if (!bic)
22782720 return;
22792721
2722
+ bic->saved_weight = bfqq->entity.orig_weight;
22802723 bic->saved_ttime = bfqq->ttime;
22812724 bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq);
22822725 bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);
....@@ -2295,6 +2738,7 @@
22952738 * to enjoy weight raising if split soon.
22962739 */
22972740 bic->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff;
2741
+ bic->saved_wr_start_at_switch_to_srt = bfq_smallest_from_now();
22982742 bic->saved_wr_cur_max_time = bfq_wr_duration(bfqq->bfqd);
22992743 bic->saved_last_wr_start_finish = jiffies;
23002744 } else {
....@@ -2304,6 +2748,26 @@
23042748 bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish;
23052749 bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time;
23062750 }
2751
+}
2752
+
2753
+void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq)
2754
+{
2755
+ /*
2756
+ * To prevent bfqq's service guarantees from being violated,
2757
+ * bfqq may be left busy, i.e., queued for service, even if
2758
+ * empty (see comments in __bfq_bfqq_expire() for
2759
+ * details). But, if no process will send requests to bfqq any
2760
+ * longer, then there is no point in keeping bfqq queued for
2761
+ * service. In addition, keeping bfqq queued for service, but
2762
+ * with no process ref any longer, may have caused bfqq to be
2763
+ * freed when dequeued from service. But this is assumed to
2764
+ * never happen.
2765
+ */
2766
+ if (bfq_bfqq_busy(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list) &&
2767
+ bfqq != bfqd->in_service_queue)
2768
+ bfq_del_bfqq_busy(bfqd, bfqq, false);
2769
+
2770
+ bfq_put_queue(bfqq);
23072771 }
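For illustration only (not part of the patch): bfq_release_process_ref() above captures the ordering "if the queue was kept busy only for guarantee reasons, is now empty, and is not in service, dequeue it before dropping the process reference", so the final put never tears down a queue that is still sitting in the busy list. A standalone model of that ordering, with stand-in helpers:

#include <stdbool.h>

struct pq {
	int refcount;
	bool busy;		/* still queued for service */
	bool has_pending;	/* still has queued requests */
	bool in_service;
};

static void release_process_ref(struct pq *q)
{
	/* Dequeue first: stands in for bfq_del_bfqq_busy(). */
	if (q->busy && !q->has_pending && !q->in_service)
		q->busy = false;

	/* Then drop the reference: stands in for bfq_put_queue(). */
	if (--q->refcount == 0) {
		/* last reference gone: the real code frees the queue here */
	}
}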
23082772
23092773 static void
....@@ -2352,7 +2816,7 @@
23522816 /*
23532817 * Merge queues (that is, let bic redirect its requests to new_bfqq)
23542818 */
2355
- bic_set_bfqq(bic, new_bfqq, 1);
2819
+ bic_set_bfqq(bic, new_bfqq, true);
23562820 bfq_mark_bfqq_coop(new_bfqq);
23572821 /*
23582822 * new_bfqq now belongs to at least two bics (it is a shared queue):
....@@ -2365,9 +2829,18 @@
23652829 * assignment causes no harm).
23662830 */
23672831 new_bfqq->bic = NULL;
2832
+ /*
2833
+ * If the queue is shared, the pid is the pid of one of the associated
2834
+ * processes. Which pid depends on the exact sequence of merge events
2835
+ * the queue underwent. So printing such a pid is useless and confusing
2836
+ * because it reports a random pid between those of the associated
2837
+ * processes.
2838
+ * We mark such a queue with a pid -1, and then print SHARED instead of
2839
+ * a pid in logging messages.
2840
+ */
2841
+ new_bfqq->pid = -1;
23682842 bfqq->bic = NULL;
2369
- /* release process reference to bfqq */
2370
- bfq_put_queue(bfqq);
2843
+ bfq_release_process_ref(bfqd, bfqq);
23712844 }
23722845
23732846 static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq,
....@@ -2399,8 +2872,8 @@
23992872 /*
24002873 * bic still points to bfqq, then it has not yet been
24012874 * redirected to some other bfq_queue, and a queue
2402
- * merge beween bfqq and new_bfqq can be safely
2403
- * fulfillled, i.e., bic can be redirected to new_bfqq
2875
+ * merge between bfqq and new_bfqq can be safely
2876
+ * fulfilled, i.e., bic can be redirected to new_bfqq
24042877 * and bfqq can be put.
24052878 */
24062879 bfq_merge_bfqqs(bfqd, bfqd->bio_bic, bfqq,
....@@ -2535,12 +3008,14 @@
25353008 * queue).
25363009 */
25373010 if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 &&
2538
- bfq_symmetric_scenario(bfqd))
3011
+ !bfq_asymmetric_scenario(bfqd, bfqq))
25393012 sl = min_t(u64, sl, BFQ_MIN_TT);
25403013 else if (bfqq->wr_coeff > 1)
25413014 sl = max_t(u32, sl, 20ULL * NSEC_PER_MSEC);
25423015
25433016 bfqd->last_idling_start = ktime_get();
3017
+ bfqd->last_idling_start_jiffies = jiffies;
3018
+
25443019 hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl),
25453020 HRTIMER_MODE_REL);
25463021 bfqg_stats_set_start_idle_time(bfqq_group(bfqq));
....@@ -2764,7 +3239,7 @@
27643239
27653240 if ((bfqd->rq_in_driver > 0 ||
27663241 now_ns - bfqd->last_completion < BFQ_MIN_TT)
2767
- && get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR)
3242
+ && !BFQ_RQ_SEEKY(bfqd, bfqd->last_position, rq))
27683243 bfqd->sequential_samples++;
27693244
27703245 bfqd->tot_sectors_dispatched += blk_rq_sectors(rq);
....@@ -2816,7 +3291,209 @@
28163291 bfq_remove_request(q, rq);
28173292 }
28183293
2819
-static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
3294
+/*
3295
+ * There is a case where idling does not have to be performed for
3296
+ * throughput concerns, but to preserve the throughput share of
3297
+ * the process associated with bfqq.
3298
+ *
3299
+ * To introduce this case, we can note that allowing the drive
3300
+ * to enqueue more than one request at a time, and hence
3301
+ * delegating de facto final scheduling decisions to the
3302
+ * drive's internal scheduler, entails loss of control on the
3303
+ * actual request service order. In particular, the critical
3304
+ * situation is when requests from different processes happen
3305
+ * to be present, at the same time, in the internal queue(s)
3306
+ * of the drive. In such a situation, the drive, by deciding
3307
+ * the service order of the internally-queued requests, does
3308
+ * determine also the actual throughput distribution among
3309
+ * these processes. But the drive typically has no notion or
3310
+ * concern about per-process throughput distribution, and
3311
+ * makes its decisions only on a per-request basis. Therefore,
3312
+ * the service distribution enforced by the drive's internal
3313
+ * scheduler is likely to coincide with the desired throughput
3314
+ * distribution only in a completely symmetric, or favorably
3315
+ * skewed scenario where:
3316
+ * (i-a) each of these processes must get the same throughput as
3317
+ * the others,
3318
+ * (i-b) in case (i-a) does not hold, it holds that the process
3319
+ * associated with bfqq must receive a lower or equal
3320
+ * throughput than any of the other processes;
3321
+ * (ii) the I/O of each process has the same properties, in
3322
+ * terms of locality (sequential or random), direction
3323
+ * (reads or writes), request sizes, greediness
3324
+ * (from I/O-bound to sporadic), and so on;
3325
+
3326
+ * In fact, in such a scenario, the drive tends to treat the requests
3327
+ * of each process in about the same way as the requests of the
3328
+ * others, and thus to provide each of these processes with about the
3329
+ * same throughput. This is exactly the desired throughput
3330
+ * distribution if (i-a) holds, or, if (i-b) holds instead, this is an
3331
+ * even more convenient distribution for (the process associated with)
3332
+ * bfqq.
3333
+ *
3334
+ * In contrast, in any asymmetric or unfavorable scenario, device
3335
+ * idling (I/O-dispatch plugging) is certainly needed to guarantee
3336
+ * that bfqq receives its assigned fraction of the device throughput
3337
+ * (see [1] for details).
3338
+ *
3339
+ * The problem is that idling may significantly reduce throughput with
3340
+ * certain combinations of types of I/O and devices. An important
3341
+ * example is sync random I/O on flash storage with command
3342
+ * queueing. So, unless bfqq falls in cases where idling also boosts
3343
+ * throughput, it is important to check conditions (i-a), i(-b) and
3344
+ * (ii) accurately, so as to avoid idling when not strictly needed for
3345
+ * service guarantees.
3346
+ *
3347
+ * Unfortunately, it is extremely difficult to thoroughly check
3348
+ * condition (ii). And, in case there are active groups, it becomes
3349
+ * very difficult to check conditions (i-a) and (i-b) too. In fact,
3350
+ * if there are active groups, then, for conditions (i-a) or (i-b) to
3351
+ * become false 'indirectly', it is enough that an active group
3352
+ * contains more active processes or sub-groups than some other active
3353
+ * group. More precisely, for conditions (i-a) or (i-b) to become
3354
+ * false because of such a group, it is not even necessary that the
3355
+ * group is (still) active: it is sufficient that, even if the group
3356
+ * has become inactive, some of its descendant processes still have
3357
+ * some request already dispatched but still waiting for
3358
+ * completion. In fact, requests have still to be guaranteed their
3359
+ * share of the throughput even after being dispatched. In this
3360
+ * respect, it is easy to show that, if a group frequently becomes
3361
+ * inactive while still having in-flight requests, and if, when this
3362
+ * happens, the group is not considered in the calculation of whether
3363
+ * the scenario is asymmetric, then the group may fail to be
3364
+ * guaranteed its fair share of the throughput (basically because
3365
+ * idling may not be performed for the descendant processes of the
3366
+ * group, but it had to be). We address this issue with the following
3367
+ * bi-modal behavior, implemented in the function
3368
+ * bfq_asymmetric_scenario().
3369
+ *
3370
+ * If there are groups with requests waiting for completion
3371
+ * (as commented above, some of these groups may even be
3372
+ * already inactive), then the scenario is tagged as
3373
+ * asymmetric, conservatively, without checking any of the
3374
+ * conditions (i-a), (i-b) or (ii). So the device is idled for bfqq.
3375
+ * This behavior matches also the fact that groups are created
3376
+ * exactly if controlling I/O is a primary concern (to
3377
+ * preserve bandwidth and latency guarantees).
3378
+ *
3379
+ * On the opposite end, if there are no groups with requests waiting
3380
+ * for completion, then only conditions (i-a) and (i-b) are actually
3381
+ * controlled, i.e., provided that conditions (i-a) or (i-b) holds,
3382
+ * idling is not performed, regardless of whether condition (ii)
3383
+ * holds. In other words, only if conditions (i-a) and (i-b) do not
3384
+ * hold, then idling is allowed, and the device tends to be prevented
3385
+ * from queueing many requests, possibly of several processes. Since
3386
+ * there are no groups with requests waiting for completion, then, to
3387
+ * control conditions (i-a) and (i-b) it is enough to check just
3388
+ * whether all the queues with requests waiting for completion also
3389
+ * have the same weight.
3390
+ *
3391
+ * Not checking condition (ii) evidently exposes bfqq to the
3392
+ * risk of getting less throughput than its fair share.
3393
+ * However, for queues with the same weight, a further
3394
+ * mechanism, preemption, mitigates or even eliminates this
3395
+ * problem. And it does so without consequences on overall
3396
+ * throughput. This mechanism and its benefits are explained
3397
+ * in the next three paragraphs.
3398
+ *
3399
+ * Even if a queue, say Q, is expired when it remains idle, Q
3400
+ * can still preempt the new in-service queue if the next
3401
+ * request of Q arrives soon (see the comments on
3402
+ * bfq_bfqq_update_budg_for_activation). If all queues and
3403
+ * groups have the same weight, this form of preemption,
3404
+ * combined with the hole-recovery heuristic described in the
3405
+ * comments on function bfq_bfqq_update_budg_for_activation,
3406
+ * are enough to preserve a correct bandwidth distribution in
3407
+ * the mid term, even without idling. In fact, even if not
3408
+ * idling allows the internal queues of the device to contain
3409
+ * many requests, and thus to reorder requests, we can rather
3410
+ * safely assume that the internal scheduler still preserves a
3411
+ * minimum of mid-term fairness.
3412
+ *
3413
+ * More precisely, this preemption-based, idleless approach
3414
+ * provides fairness in terms of IOPS, and not sectors per
3415
+ * second. This can be seen with a simple example. Suppose
3416
+ * that there are two queues with the same weight, but that
3417
+ * the first queue receives requests of 8 sectors, while the
3418
+ * second queue receives requests of 1024 sectors. In
3419
+ * addition, suppose that each of the two queues contains at
3420
+ * most one request at a time, which implies that each queue
3421
+ * always remains idle after it is served. Finally, after
3422
+ * remaining idle, each queue receives very quickly a new
3423
+ * request. It follows that the two queues are served
3424
+ * alternatively, preempting each other if needed. This
3425
+ * implies that, although both queues have the same weight,
3426
+ * the queue with large requests receives a service that is
3427
+ * 1024/8 times as high as the service received by the other
3428
+ * queue.
3429
+ *
3430
+ * The motivation for using preemption instead of idling (for
3431
+ * queues with the same weight) is that, by not idling,
3432
+ * service guarantees are preserved (completely or at least in
3433
+ * part) without sacrificing any throughput. And, if
3434
+ * there is no active group, then the primary expectation for
3435
+ * this device is probably a high throughput.
3436
+ *
3437
+ * We are now left only with explaining the two sub-conditions in the
3438
+ * additional compound condition that is checked below for deciding
3439
+ * whether the scenario is asymmetric. To explain the first
3440
+ * sub-condition, we need to add that the function
3441
+ * bfq_asymmetric_scenario checks the weights of only
3442
+ * non-weight-raised queues, for efficiency reasons (see comments on
3443
+ * bfq_weights_tree_add()). Then the fact that bfqq is weight-raised
3444
+ * is checked explicitly here. More precisely, the compound condition
3445
+ * below also takes into account the fact that, even if bfqq is being
3446
+ * weight-raised, the scenario is still symmetric if all queues with
3447
+ * requests waiting for completion happen to be
3448
+ * weight-raised. Actually, we should be even more precise here, and
3449
+ * differentiate between interactive weight raising and soft real-time
3450
+ * weight raising.
3451
+ *
3452
+ * The second sub-condition checked in the compound condition is
3453
+ * whether there is a fair amount of already in-flight I/O not
3454
+ * belonging to bfqq. If so, I/O dispatching is to be plugged, for the
3455
+ * following reason. The drive may decide to serve in-flight
3456
+ * non-bfqq's I/O requests before bfqq's ones, thereby delaying the
3457
+ * arrival of new I/O requests for bfqq (recall that bfqq is sync). If
3458
+ * I/O-dispatching is not plugged, then, while bfqq remains empty, a
3459
+ * basically uncontrolled amount of I/O from other queues may be
3460
+ * dispatched too, possibly causing the service of bfqq's I/O to be
3461
+ * delayed even longer in the drive. This problem gets more and more
3462
+ * serious as the speed and the queue depth of the drive grow,
3463
+ * because, as these two quantities grow, the probability to find no
3464
+ * queue busy but many requests in flight grows too. By contrast,
3465
+ * plugging I/O dispatching minimizes the delay induced by already
3466
+ * in-flight I/O, and enables bfqq to recover the bandwidth it may
3467
+ * lose because of this delay.
3468
+ *
3469
+ * As a side note, it is worth considering that the above
3470
+ * device-idling countermeasures may however fail in the following
3471
+ * unlucky scenario: if I/O-dispatch plugging is (correctly) disabled
3472
+ * in a time period during which all symmetry sub-conditions hold, and
3473
+ * therefore the device is allowed to enqueue many requests, but at
3474
+ * some later point in time some sub-condition ceases to hold, then it
3475
+ * may become impossible to make requests be served in the desired
3476
+ * order until all the requests already queued in the device have been
3477
+ * served. The last sub-condition commented above somewhat mitigates
3478
+ * this problem for weight-raised queues.
3479
+ */
3480
+static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd,
3481
+ struct bfq_queue *bfqq)
3482
+{
3483
+ /* No point in idling for bfqq if it won't get requests any longer */
3484
+ if (unlikely(!bfqq_process_refs(bfqq)))
3485
+ return false;
3486
+
3487
+ return (bfqq->wr_coeff > 1 &&
3488
+ (bfqd->wr_busy_queues <
3489
+ bfq_tot_busy_queues(bfqd) ||
3490
+ bfqd->rq_in_driver >=
3491
+ bfqq->dispatched + 4)) ||
3492
+ bfq_asymmetric_scenario(bfqd, bfqq);
3493
+}
3494
+
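/*
 * Editor's note: the block below is an illustrative, userspace-only
 * sketch added for this review, not part of the patch. It restates the
 * compound condition of idling_needed_for_service_guarantees() over a
 * hypothetical, simplified model, so the decision can be read (and
 * compiled) in isolation. All toy_* names are assumptions.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_bfqd {
	int wr_busy_queues;	/* busy weight-raised queues */
	int tot_busy_queues;	/* all busy queues */
	int rq_in_driver;	/* requests already dispatched to the drive */
};

struct toy_bfqq {
	int wr_coeff;		/* > 1 iff the queue is weight-raised */
	int dispatched;		/* this queue's own in-flight requests */
	int process_refs;	/* processes still attached to the queue */
	bool asymmetric;	/* outcome of the asymmetry check */
};

bool toy_idling_needed(const struct toy_bfqd *d, const struct toy_bfqq *q)
{
	if (!q->process_refs)	/* nobody will ever send new I/O: no idling */
		return false;

	/*
	 * Protect a weight-raised queue if some busy queue is not
	 * weight-raised, or if a fair amount of foreign I/O (at least
	 * four requests beyond its own) is already in flight.
	 */
	return (q->wr_coeff > 1 &&
		(d->wr_busy_queues < d->tot_busy_queues ||
		 d->rq_in_driver >= q->dispatched + 4)) ||
	       q->asymmetric;
}

int main(void)
{
	struct toy_bfqd d = { .wr_busy_queues = 1, .tot_busy_queues = 3,
			      .rq_in_driver = 2 };
	struct toy_bfqq q = { .wr_coeff = 30, .dispatched = 1,
			      .process_refs = 1, .asymmetric = false };

	/* a weight-raised queue among plain ones -> idling is needed */
	printf("idling needed: %d\n", toy_idling_needed(&d, &q));
	return 0;
}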
3495
+static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq,
3496
+ enum bfqq_expiration reason)
28203497 {
28213498 /*
28223499 * If this bfqq is shared between multiple processes, check
....@@ -2827,7 +3504,22 @@
28273504 if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
28283505 bfq_mark_bfqq_split_coop(bfqq);
28293506
2830
- if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
3507
+ /*
3508
+ * Consider queues with a higher finish virtual time than
3509
+ * bfqq. If idling_needed_for_service_guarantees(bfqq) returns
3510
+ * true, then bfqq's bandwidth would be violated if an
3511
+ * uncontrolled amount of I/O from these queues were
3512
+ * dispatched while bfqq is waiting for its new I/O to
3513
+ * arrive. This is exactly what may happen if this is a forced
3514
+ * expiration caused by a preemption attempt, and if bfqq is
3515
+ * not re-scheduled. To prevent this from happening, re-queue
3516
+ * bfqq if it needs I/O-dispatch plugging, even if it is
3517
+ * empty. By doing so, bfqq is guaranteed to be served before the
3518
+ * above queues (provided that bfqq is of course eligible).
3519
+ */
3520
+ if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
3521
+ !(reason == BFQQE_PREEMPTED &&
3522
+ idling_needed_for_service_guarantees(bfqd, bfqq))) {
28313523 if (bfqq->dispatched == 0)
28323524 /*
28333525 * Overloading budget_timeout field to store
....@@ -2842,8 +3534,11 @@
28423534 bfq_requeue_bfqq(bfqd, bfqq, true);
28433535 /*
28443536 * Resort priority tree of potential close cooperators.
3537
+ * See comments on bfq_pos_tree_add_move() for the unlikely().
28453538 */
2846
- bfq_pos_tree_add_move(bfqd, bfqq);
3539
+ if (unlikely(!bfqd->nonrot_with_queueing &&
3540
+ !RB_EMPTY_ROOT(&bfqq->sort_list)))
3541
+ bfq_pos_tree_add_move(bfqd, bfqq);
28473542 }
28483543
28493544 /*
....@@ -3217,13 +3912,6 @@
32173912 jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4);
32183913 }
32193914
3220
-static bool bfq_bfqq_injectable(struct bfq_queue *bfqq)
3221
-{
3222
- return BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 &&
3223
- blk_queue_nonrot(bfqq->bfqd->queue) &&
3224
- bfqq->bfqd->hw_tag;
3225
-}
3226
-
32273915 /**
32283916 * bfq_bfqq_expire - expire a queue.
32293917 * @bfqd: device owning the queue.
....@@ -3299,16 +3987,32 @@
32993987 * requests, then the request pattern is isochronous
33003988 * (see the comments on the function
33013989 * bfq_bfqq_softrt_next_start()). Thus we can compute
3302
- * soft_rt_next_start. If, instead, the queue still
3303
- * has outstanding requests, then we have to wait for
3304
- * the completion of all the outstanding requests to
3305
- * discover whether the request pattern is actually
3306
- * isochronous.
3990
+ * soft_rt_next_start. And we do it, unless bfqq is in
3991
+ * interactive weight raising. We do not do it in the
3992
+ * latter subcase, for the following reason. bfqq may
3993
+ * be conveying the I/O needed to load a soft
3994
+ * real-time application. Such an application will
3995
+ * actually exhibit a soft real-time I/O pattern after
3996
+ * it finally starts doing its job. But, if
3997
+ * soft_rt_next_start is computed here for an
3998
+ * interactive bfqq, and bfqq had received a lot of
3999
+ * service before remaining with no outstanding
4000
+ * request (likely to happen on a fast device), then
4001
+ * soft_rt_next_start would be assigned such a high
4002
+ * value that, for a very long time, bfqq would be
4003
+ * prevented from being possibly considered as soft
4004
+ * real time.
4005
+ *
4006
+ * If, instead, the queue still has outstanding
4007
+ * requests, then we have to wait for the completion
4008
+ * of all the outstanding requests to discover whether
4009
+ * the request pattern is actually isochronous.
33074010 */
3308
- if (bfqq->dispatched == 0)
4011
+ if (bfqq->dispatched == 0 &&
4012
+ bfqq->wr_coeff != bfqd->bfq_wr_coeff)
33094013 bfqq->soft_rt_next_start =
33104014 bfq_bfqq_softrt_next_start(bfqd, bfqq);
3311
- else {
4015
+ else if (bfqq->dispatched > 0) {
33124016 /*
33134017 * Schedule an update of soft_rt_next_start to when
33144018 * the task may be discovered to be isochronous.
....@@ -3322,15 +4026,21 @@
33224026 slow, bfqq->dispatched, bfq_bfqq_has_short_ttime(bfqq));
33234027
33244028 /*
4029
+ * bfqq expired, so no total service time needs to be computed
4030
+ * any longer: reset state machine for measuring total service
4031
+ * times.
4032
+ */
4033
+ bfqd->rqs_injected = bfqd->wait_dispatch = false;
4034
+ bfqd->waited_rq = NULL;
4035
+
4036
+ /*
33254037 * Increase, decrease or leave budget unchanged according to
33264038 * reason.
33274039 */
33284040 __bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
3329
- if (__bfq_bfqq_expire(bfqd, bfqq))
4041
+ if (__bfq_bfqq_expire(bfqd, bfqq, reason))
33304042 /* bfqq is gone, no more actions on it */
33314043 return;
3332
-
3333
- bfqq->injected_service = 0;
33344044
33354045 /* mark bfqq as waiting a request only if a bic still points to it */
33364046 if (!bfq_bfqq_busy(bfqq) &&
....@@ -3399,52 +4109,16 @@
33994109 bfq_bfqq_budget_timeout(bfqq);
34004110 }
34014111
3402
-/*
3403
- * For a queue that becomes empty, device idling is allowed only if
3404
- * this function returns true for the queue. As a consequence, since
3405
- * device idling plays a critical role in both throughput boosting and
3406
- * service guarantees, the return value of this function plays a
3407
- * critical role in both these aspects as well.
3408
- *
3409
- * In a nutshell, this function returns true only if idling is
3410
- * beneficial for throughput or, even if detrimental for throughput,
3411
- * idling is however necessary to preserve service guarantees (low
3412
- * latency, desired throughput distribution, ...). In particular, on
3413
- * NCQ-capable devices, this function tries to return false, so as to
3414
- * help keep the drives' internal queues full, whenever this helps the
3415
- * device boost the throughput without causing any service-guarantee
3416
- * issue.
3417
- *
3418
- * In more detail, the return value of this function is obtained by,
3419
- * first, computing a number of boolean variables that take into
3420
- * account throughput and service-guarantee issues, and, then,
3421
- * combining these variables in a logical expression. Most of the
3422
- * issues taken into account are not trivial. We discuss these issues
3423
- * individually while introducing the variables.
3424
- */
3425
-static bool bfq_better_to_idle(struct bfq_queue *bfqq)
4112
+static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd,
4113
+ struct bfq_queue *bfqq)
34264114 {
3427
- struct bfq_data *bfqd = bfqq->bfqd;
34284115 bool rot_without_queueing =
34294116 !blk_queue_nonrot(bfqd->queue) && !bfqd->hw_tag,
34304117 bfqq_sequential_and_IO_bound,
3431
- idling_boosts_thr, idling_boosts_thr_without_issues,
3432
- idling_needed_for_service_guarantees,
3433
- asymmetric_scenario;
4118
+ idling_boosts_thr;
34344119
3435
- if (bfqd->strict_guarantees)
3436
- return true;
3437
-
3438
- /*
3439
- * Idling is performed only if slice_idle > 0. In addition, we
3440
- * do not idle if
3441
- * (a) bfqq is async
3442
- * (b) bfqq is in the idle io prio class: in this case we do
3443
- * not idle because we want to minimize the bandwidth that
3444
- * queues in this class can steal to higher-priority queues
3445
- */
3446
- if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_sync(bfqq) ||
3447
- bfq_class_idle(bfqq))
4120
+ /* No point in idling for bfqq if it won't get requests any longer */
4121
+ if (unlikely(!bfqq_process_refs(bfqq)))
34484122 return false;
34494123
34504124 bfqq_sequential_and_IO_bound = !BFQQ_SEEKY(bfqq) &&
....@@ -3477,8 +4151,7 @@
34774151 bfqq_sequential_and_IO_bound);
34784152
34794153 /*
3480
- * The value of the next variable,
3481
- * idling_boosts_thr_without_issues, is equal to that of
4154
+ * The return value of this function is equal to that of
34824155 * idling_boosts_thr, unless a special case holds. In this
34834156 * special case, described below, idling may cause problems to
34844157 * weight-raised queues.
....@@ -3495,217 +4168,85 @@
34954168 * which enqueue several requests in advance, and further
34964169 * reorder internally-queued requests.
34974170 *
3498
- * For this reason, we force to false the value of
3499
- * idling_boosts_thr_without_issues if there are weight-raised
3500
- * busy queues. In this case, and if bfqq is not weight-raised,
3501
- * this guarantees that the device is not idled for bfqq (if,
3502
- * instead, bfqq is weight-raised, then idling will be
3503
- * guaranteed by another variable, see below). Combined with
3504
- * the timestamping rules of BFQ (see [1] for details), this
3505
- * behavior causes bfqq, and hence any sync non-weight-raised
3506
- * queue, to get a lower number of requests served, and thus
3507
- * to ask for a lower number of requests from the request
3508
- * pool, before the busy weight-raised queues get served
3509
- * again. This often mitigates starvation problems in the
3510
- * presence of heavy write workloads and NCQ, thereby
3511
- * guaranteeing a higher application and system responsiveness
3512
- * in these hostile scenarios.
4171
+ * For this reason, we force to false the return value if
4172
+ * there are weight-raised busy queues. In this case, and if
4173
+ * bfqq is not weight-raised, this guarantees that the device
4174
+ * is not idled for bfqq (if, instead, bfqq is weight-raised,
4175
+ * then idling will be guaranteed by another variable, see
4176
+ * below). Combined with the timestamping rules of BFQ (see
4177
+ * [1] for details), this behavior causes bfqq, and hence any
4178
+ * sync non-weight-raised queue, to get a lower number of
4179
+ * requests served, and thus to ask for a lower number of
4180
+ * requests from the request pool, before the busy
4181
+ * weight-raised queues get served again. This often mitigates
4182
+ * starvation problems in the presence of heavy write
4183
+ * workloads and NCQ, thereby guaranteeing a higher
4184
+ * application and system responsiveness in these hostile
4185
+ * scenarios.
35134186 */
3514
- idling_boosts_thr_without_issues = idling_boosts_thr &&
4187
+ return idling_boosts_thr &&
35154188 bfqd->wr_busy_queues == 0;
4189
+}
4190
+
4191
+/*
4192
+ * For a queue that becomes empty, device idling is allowed only if
4193
+ * this function returns true for that queue. As a consequence, since
4194
+ * device idling plays a critical role for both throughput boosting
4195
+ * and service guarantees, the return value of this function plays a
4196
+ * critical role as well.
4197
+ *
4198
+ * In a nutshell, this function returns true only if idling is
4199
+ * beneficial for throughput or, even if detrimental for throughput,
4200
+ * idling is however necessary to preserve service guarantees (low
4201
+ * latency, desired throughput distribution, ...). In particular, on
4202
+ * NCQ-capable devices, this function tries to return false, so as to
4203
+ * help keep the drives' internal queues full, whenever this helps the
4204
+ * device boost the throughput without causing any service-guarantee
4205
+ * issue.
4206
+ *
4207
+ * Most of the issues taken into account to get the return value of
4208
+ * this function are not trivial. We discuss these issues in the two
4209
+ * functions providing the main pieces of information needed by this
4210
+ * function.
4211
+ */
4212
+static bool bfq_better_to_idle(struct bfq_queue *bfqq)
4213
+{
4214
+ struct bfq_data *bfqd = bfqq->bfqd;
4215
+ bool idling_boosts_thr_with_no_issue, idling_needed_for_service_guar;
4216
+
4217
+ /* No point in idling for bfqq if it won't get requests any longer */
4218
+ if (unlikely(!bfqq_process_refs(bfqq)))
4219
+ return false;
4220
+
4221
+ if (unlikely(bfqd->strict_guarantees))
4222
+ return true;
35164223
35174224 /*
3518
- * There is then a case where idling must be performed not
3519
- * for throughput concerns, but to preserve service
3520
- * guarantees.
3521
- *
3522
- * To introduce this case, we can note that allowing the drive
3523
- * to enqueue more than one request at a time, and hence
3524
- * delegating de facto final scheduling decisions to the
3525
- * drive's internal scheduler, entails loss of control on the
3526
- * actual request service order. In particular, the critical
3527
- * situation is when requests from different processes happen
3528
- * to be present, at the same time, in the internal queue(s)
3529
- * of the drive. In such a situation, the drive, by deciding
3530
- * the service order of the internally-queued requests, does
3531
- * determine also the actual throughput distribution among
3532
- * these processes. But the drive typically has no notion or
3533
- * concern about per-process throughput distribution, and
3534
- * makes its decisions only on a per-request basis. Therefore,
3535
- * the service distribution enforced by the drive's internal
3536
- * scheduler is likely to coincide with the desired
3537
- * device-throughput distribution only in a completely
3538
- * symmetric scenario where:
3539
- * (i) each of these processes must get the same throughput as
3540
- * the others;
3541
- * (ii) the I/O of each process has the same properties, in
3542
- * terms of locality (sequential or random), direction
3543
- * (reads or writes), request sizes, greediness
3544
- * (from I/O-bound to sporadic), and so on.
3545
- * In fact, in such a scenario, the drive tends to treat
3546
- * the requests of each of these processes in about the same
3547
- * way as the requests of the others, and thus to provide
3548
- * each of these processes with about the same throughput
3549
- * (which is exactly the desired throughput distribution). In
3550
- * contrast, in any asymmetric scenario, device idling is
3551
- * certainly needed to guarantee that bfqq receives its
3552
- * assigned fraction of the device throughput (see [1] for
3553
- * details).
3554
- * The problem is that idling may significantly reduce
3555
- * throughput with certain combinations of types of I/O and
3556
- * devices. An important example is sync random I/O, on flash
3557
- * storage with command queueing. So, unless bfqq falls in the
3558
- * above cases where idling also boosts throughput, it would
3559
- * be important to check conditions (i) and (ii) accurately,
3560
- * so as to avoid idling when not strictly needed for service
3561
- * guarantees.
3562
- *
3563
- * Unfortunately, it is extremely difficult to thoroughly
3564
- * check condition (ii). And, in case there are active groups,
3565
- * it becomes very difficult to check condition (i) too. In
3566
- * fact, if there are active groups, then, for condition (i)
3567
- * to become false, it is enough that an active group contains
3568
- * more active processes or sub-groups than some other active
3569
- * group. More precisely, for condition (i) to hold because of
3570
- * such a group, it is not even necessary that the group is
3571
- * (still) active: it is sufficient that, even if the group
3572
- * has become inactive, some of its descendant processes still
3573
- * have some request already dispatched but still waiting for
3574
- * completion. In fact, requests have still to be guaranteed
3575
- * their share of the throughput even after being
3576
- * dispatched. In this respect, it is easy to show that, if a
3577
- * group frequently becomes inactive while still having
3578
- * in-flight requests, and if, when this happens, the group is
3579
- * not considered in the calculation of whether the scenario
3580
- * is asymmetric, then the group may fail to be guaranteed its
3581
- * fair share of the throughput (basically because idling may
3582
- * not be performed for the descendant processes of the group,
3583
- * but it had to be). We address this issue with the
3584
- * following bi-modal behavior, implemented in the function
3585
- * bfq_symmetric_scenario().
3586
- *
3587
- * If there are groups with requests waiting for completion
3588
- * (as commented above, some of these groups may even be
3589
- * already inactive), then the scenario is tagged as
3590
- * asymmetric, conservatively, without checking any of the
3591
- * conditions (i) and (ii). So the device is idled for bfqq.
3592
- * This behavior matches also the fact that groups are created
3593
- * exactly if controlling I/O is a primary concern (to
3594
- * preserve bandwidth and latency guarantees).
3595
- *
3596
- * On the opposite end, if there are no groups with requests
3597
- * waiting for completion, then only condition (i) is actually
3598
- * controlled, i.e., provided that condition (i) holds, idling
3599
- * is not performed, regardless of whether condition (ii)
3600
- * holds. In other words, only if condition (i) does not hold,
3601
- * then idling is allowed, and the device tends to be
3602
- * prevented from queueing many requests, possibly of several
3603
- * processes. Since there are no groups with requests waiting
3604
- * for completion, then, to control condition (i) it is enough
3605
- * to check just whether all the queues with requests waiting
3606
- * for completion also have the same weight.
3607
- *
3608
- * Not checking condition (ii) evidently exposes bfqq to the
3609
- * risk of getting less throughput than its fair share.
3610
- * However, for queues with the same weight, a further
3611
- * mechanism, preemption, mitigates or even eliminates this
3612
- * problem. And it does so without consequences on overall
3613
- * throughput. This mechanism and its benefits are explained
3614
- * in the next three paragraphs.
3615
- *
3616
- * Even if a queue, say Q, is expired when it remains idle, Q
3617
- * can still preempt the new in-service queue if the next
3618
- * request of Q arrives soon (see the comments on
3619
- * bfq_bfqq_update_budg_for_activation). If all queues and
3620
- * groups have the same weight, this form of preemption,
3621
- * combined with the hole-recovery heuristic described in the
3622
- * comments on function bfq_bfqq_update_budg_for_activation,
3623
- * are enough to preserve a correct bandwidth distribution in
3624
- * the mid term, even without idling. In fact, even if not
3625
- * idling allows the internal queues of the device to contain
3626
- * many requests, and thus to reorder requests, we can rather
3627
- * safely assume that the internal scheduler still preserves a
3628
- * minimum of mid-term fairness.
3629
- *
3630
- * More precisely, this preemption-based, idleless approach
3631
- * provides fairness in terms of IOPS, and not sectors per
3632
- * second. This can be seen with a simple example. Suppose
3633
- * that there are two queues with the same weight, but that
3634
- * the first queue receives requests of 8 sectors, while the
3635
- * second queue receives requests of 1024 sectors. In
3636
- * addition, suppose that each of the two queues contains at
3637
- * most one request at a time, which implies that each queue
3638
- * always remains idle after it is served. Finally, after
3639
- * remaining idle, each queue receives very quickly a new
3640
- * request. It follows that the two queues are served
3641
- * alternatively, preempting each other if needed. This
3642
- * implies that, although both queues have the same weight,
3643
- * the queue with large requests receives a service that is
3644
- * 1024/8 times as high as the service received by the other
3645
- * queue.
3646
- *
3647
- * The motivation for using preemption instead of idling (for
3648
- * queues with the same weight) is that, by not idling,
3649
- * service guarantees are preserved (completely or at least in
3650
- * part) without minimally sacrificing throughput. And, if
3651
- * there is no active group, then the primary expectation for
3652
- * this device is probably a high throughput.
3653
- *
3654
- * We are now left only with explaining the additional
3655
- * compound condition that is checked below for deciding
3656
- * whether the scenario is asymmetric. To explain this
3657
- * compound condition, we need to add that the function
3658
- * bfq_symmetric_scenario checks the weights of only
3659
- * non-weight-raised queues, for efficiency reasons (see
3660
- * comments on bfq_weights_tree_add()). Then the fact that
3661
- * bfqq is weight-raised is checked explicitly here. More
3662
- * precisely, the compound condition below takes into account
3663
- * also the fact that, even if bfqq is being weight-raised,
3664
- * the scenario is still symmetric if all queues with requests
3665
- * waiting for completion happen to be
3666
- * weight-raised. Actually, we should be even more precise
3667
- * here, and differentiate between interactive weight raising
3668
- * and soft real-time weight raising.
3669
- *
3670
- * As a side note, it is worth considering that the above
3671
- * device-idling countermeasures may however fail in the
3672
- * following unlucky scenario: if idling is (correctly)
3673
- * disabled in a time period during which all symmetry
3674
- * sub-conditions hold, and hence the device is allowed to
3675
- * enqueue many requests, but at some later point in time some
3676
- * sub-condition stops to hold, then it may become impossible
3677
- * to let requests be served in the desired order until all
3678
- * the requests already queued in the device have been served.
4225
+ * Idling is performed only if slice_idle > 0. In addition, we
4226
+ * do not idle if
4227
+ * (a) bfqq is async
4228
+ * (b) bfqq is in the idle io prio class: in this case we do
4229
+ * not idle because we want to minimize the bandwidth that
4230
+ * queues in this class can steal from higher-priority queues
36794231 */
3680
- asymmetric_scenario = (bfqq->wr_coeff > 1 &&
3681
- bfqd->wr_busy_queues < bfqd->busy_queues) ||
3682
- !bfq_symmetric_scenario(bfqd);
4232
+ if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_sync(bfqq) ||
4233
+ bfq_class_idle(bfqq))
4234
+ return false;
4235
+
4236
+ idling_boosts_thr_with_no_issue =
4237
+ idling_boosts_thr_without_issues(bfqd, bfqq);
4238
+
4239
+ idling_needed_for_service_guar =
4240
+ idling_needed_for_service_guarantees(bfqd, bfqq);
36834241
36844242 /*
3685
- * Finally, there is a case where maximizing throughput is the
3686
- * best choice even if it may cause unfairness toward
3687
- * bfqq. Such a case is when bfqq became active in a burst of
3688
- * queue activations. Queues that became active during a large
3689
- * burst benefit only from throughput, as discussed in the
3690
- * comments on bfq_handle_burst. Thus, if bfqq became active
3691
- * in a burst and not idling the device maximizes throughput,
3692
- * then the device must no be idled, because not idling the
3693
- * device provides bfqq and all other queues in the burst with
3694
- * maximum benefit. Combining this and the above case, we can
3695
- * now establish when idling is actually needed to preserve
3696
- * service guarantees.
3697
- */
3698
- idling_needed_for_service_guarantees =
3699
- asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq);
3700
-
3701
- /*
3702
- * We have now all the components we need to compute the
4243
+ * We have now the two components we need to compute the
37034244 * return value of the function, which is true only if idling
37044245 * either boosts the throughput (without issues), or is
37054246 * necessary to preserve service guarantees.
37064247 */
3707
- return idling_boosts_thr_without_issues ||
3708
- idling_needed_for_service_guarantees;
4248
+ return idling_boosts_thr_with_no_issue ||
4249
+ idling_needed_for_service_guar;
37094250 }
37104251
37114252 /*
....@@ -3724,26 +4265,98 @@
37244265 return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_better_to_idle(bfqq);
37254266 }
37264267
3727
-static struct bfq_queue *bfq_choose_bfqq_for_injection(struct bfq_data *bfqd)
4268
+/*
4269
+ * This function chooses the queue from which to pick the next extra
4270
+ * I/O request to inject, if it finds a compatible queue. See the
4271
+ * comments on bfq_update_inject_limit() for details on the injection
4272
+ * mechanism, and for the definitions of the quantities mentioned
4273
+ * below.
4274
+ */
4275
+static struct bfq_queue *
4276
+bfq_choose_bfqq_for_injection(struct bfq_data *bfqd)
37284277 {
3729
- struct bfq_queue *bfqq;
4278
+ struct bfq_queue *bfqq, *in_serv_bfqq = bfqd->in_service_queue;
4279
+ unsigned int limit = in_serv_bfqq->inject_limit;
4280
+ /*
4281
+ * If
4282
+ * - bfqq is not weight-raised and therefore does not carry
4283
+ * time-critical I/O,
4284
+ * or
4285
+ * - regardless of whether bfqq is weight-raised, bfqq has
4286
+ * however a long think time, during which it can absorb the
4287
+ * effect of an appropriate number of extra I/O requests
4288
+ * from other queues (see bfq_update_inject_limit for
4289
+ * details on the computation of this number);
4290
+ * then injection can be performed without restrictions.
4291
+ */
4292
+ bool in_serv_always_inject = in_serv_bfqq->wr_coeff == 1 ||
4293
+ !bfq_bfqq_has_short_ttime(in_serv_bfqq);
37304294
37314295 /*
3732
- * A linear search; but, with a high probability, very few
3733
- * steps are needed to find a candidate queue, i.e., a queue
3734
- * with enough budget left for its next request. In fact:
4296
+ * If
4297
+ * - the baseline total service time could not be sampled yet,
4298
+ * so the inject limit happens to be still 0, and
4299
+ * - a lot of time has elapsed since the plugging of I/O
4300
+ * dispatching started, so drive speed is being wasted
4301
+ * significantly;
4302
+ * then temporarily raise inject limit to one request.
4303
+ */
4304
+ if (limit == 0 && in_serv_bfqq->last_serv_time_ns == 0 &&
4305
+ bfq_bfqq_wait_request(in_serv_bfqq) &&
4306
+ time_is_before_eq_jiffies(bfqd->last_idling_start_jiffies +
4307
+ bfqd->bfq_slice_idle)
4308
+ )
4309
+ limit = 1;
4310
+
4311
+ if (bfqd->rq_in_driver >= limit)
4312
+ return NULL;
4313
+
4314
+ /*
4315
+ * Linear search of the source queue for injection; but, with
4316
+ * a high probability, very few steps are needed to find a
4317
+ * candidate queue, i.e., a queue with enough budget left for
4318
+ * its next request. In fact:
37354319 * - BFQ dynamically updates the budget of every queue so as
37364320 * to accommodate the expected backlog of the queue;
37374321 * - if a queue gets all its requests dispatched as injected
37384322 * service, then the queue is removed from the active list
3739
- * (and re-added only if it gets new requests, but with
3740
- * enough budget for its new backlog).
4323
+ * (and re-added only if it gets new requests, but then it
4324
+ * is assigned again enough budget for its new backlog).
37414325 */
37424326 list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
37434327 if (!RB_EMPTY_ROOT(&bfqq->sort_list) &&
4328
+ (in_serv_always_inject || bfqq->wr_coeff > 1) &&
37444329 bfq_serv_to_charge(bfqq->next_rq, bfqq) <=
3745
- bfq_bfqq_budget_left(bfqq))
3746
- return bfqq;
4330
+ bfq_bfqq_budget_left(bfqq)) {
4331
+ /*
4332
+ * Allow for only one large in-flight request
4333
+ * on non-rotational devices, for the
4334
+ * following reason. On non-rotational drives,
4335
+ * large requests take much longer than
4336
+ * smaller requests to be served. In addition,
4337
+ * the drive prefers to serve large requests
4338
+ * w.r.t. small ones, if it can choose. So,
4339
+ * having more than one large request queued
4340
+ * in the drive may easily make the next first
4341
+ * request of the in-service queue wait so
4342
+ * long as to break bfqq's service guarantees. On
4343
+ * the bright side, large requests let the
4344
+ * drive reach a very high throughput, even if
4345
+ * there is only one in-flight large request
4346
+ * at a time.
4347
+ */
4348
+ if (blk_queue_nonrot(bfqd->queue) &&
4349
+ blk_rq_sectors(bfqq->next_rq) >=
4350
+ BFQQ_SECT_THR_NONROT)
4351
+ limit = min_t(unsigned int, 1, limit);
4352
+ else
4353
+ limit = in_serv_bfqq->inject_limit;
4354
+
4355
+ if (bfqd->rq_in_driver < limit) {
4356
+ bfqd->rqs_injected = true;
4357
+ return bfqq;
4358
+ }
4359
+ }
37474360
37484361 return NULL;
37494362 }
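/*
 * Editor's note: illustrative, userspace-only sketch added for this
 * review, not part of the patch. It isolates how the effective
 * injection limit is derived above: a temporary bump to 1 while the
 * baseline service time is still unsampled and the drive is being
 * starved, and a cap to at most one in-flight large request on
 * non-rotational drives. All toy_* names are hypothetical.
 */
#include <stdbool.h>

struct toy_inject_state {
	unsigned int inject_limit;	/* per-queue limit computed so far */
	bool baseline_sampled;		/* last service time already measured? */
	bool waiting_and_stalled;	/* idling, and idle window long elapsed */
	bool nonrot_large_rq;		/* non-rotational drive, large request */
};

unsigned int toy_effective_limit(const struct toy_inject_state *s)
{
	unsigned int limit = s->inject_limit;

	/*
	 * No baseline yet, but drive speed is being wasted: allow one
	 * injected request so the baseline can eventually be sampled.
	 */
	if (limit == 0 && !s->baseline_sampled && s->waiting_and_stalled)
		limit = 1;

	/*
	 * On flash, keep at most one large request in flight: large
	 * requests take long to serve and tend to be preferred by the
	 * drive, so more of them could starve the in-service queue.
	 */
	if (s->nonrot_large_rq && limit > 1)
		limit = 1;

	return limit;
}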
....@@ -3830,14 +4443,105 @@
38304443 * for a new request, or has requests waiting for a completion and
38314444 * may idle after their completion, then keep it anyway.
38324445 *
3833
- * Yet, to boost throughput, inject service from other queues if
3834
- * possible.
4446
+ * Yet, inject service from other queues if it boosts
4447
+ * throughput and is possible.
38354448 */
38364449 if (bfq_bfqq_wait_request(bfqq) ||
38374450 (bfqq->dispatched != 0 && bfq_better_to_idle(bfqq))) {
3838
- if (bfq_bfqq_injectable(bfqq) &&
3839
- bfqq->injected_service * bfqq->inject_coeff <
3840
- bfqq->entity.service * 10)
4451
+ struct bfq_queue *async_bfqq =
4452
+ bfqq->bic && bfqq->bic->bfqq[0] &&
4453
+ bfq_bfqq_busy(bfqq->bic->bfqq[0]) &&
4454
+ bfqq->bic->bfqq[0]->next_rq ?
4455
+ bfqq->bic->bfqq[0] : NULL;
4456
+
4457
+ /*
4458
+ * The next three mutually-exclusive ifs decide
4459
+ * whether to try injection, and choose the queue to
4460
+ * pick an I/O request from.
4461
+ *
4462
+ * The first if checks whether the process associated
4463
+ * with bfqq has also async I/O pending. If so, it
4464
+ * injects such I/O unconditionally. Injecting async
4465
+ * I/O from the same process can cause no harm to the
4466
+ * process. On the contrary, it can only increase
4467
+ * bandwidth and reduce latency for the process.
4468
+ *
4469
+ * The second if checks whether there happens to be a
4470
+ * non-empty waker queue for bfqq, i.e., a queue whose
4471
+ * I/O needs to be completed for bfqq to receive new
4472
+ * I/O. This happens, e.g., if bfqq is associated with
4473
+ * a process that does some sync. A sync generates
4474
+ * extra blocking I/O, which must be completed before
4475
+ * the process associated with bfqq can go on with its
4476
+ * I/O. If the I/O of the waker queue is not served,
4477
+ * then bfqq remains empty, and no I/O is dispatched,
4478
+ * until the idle timeout fires for bfqq. This is
4479
+ * likely to result in lower bandwidth and higher
4480
+ * latencies for bfqq, and in a severe loss of total
4481
+ * throughput. The best action to take is therefore to
4482
+ * serve the waker queue as soon as possible. So do it
4483
+ * (without relying on the third alternative below for
4484
+ * eventually serving waker_bfqq's I/O; see the last
4485
+ * paragraph for further details). This systematic
4486
+ * injection of I/O from the waker queue does not
4487
+ * cause any delay to bfqq's I/O. On the contrary,
4488
+ * next bfqq's I/O is brought forward dramatically,
4489
+ * for it is not blocked for milliseconds.
4490
+ *
4491
+ * The third if checks whether bfqq is a queue for
4492
+ * which it is better to avoid injection. It is so if
4493
+ * bfqq delivers more throughput when served without
4494
+ * any further I/O from other queues in the middle, or
4495
+ * if the service times of bfqq's I/O requests both
4496
+ * matter more than overall throughput, and may be
4497
+ * easily increased by injection (this happens if bfqq
4498
+ * has a short think time). If none of these
4499
+ * conditions holds, then a candidate queue for
4500
+ * injection is looked for through
4501
+ * bfq_choose_bfqq_for_injection(). Note that the
4502
+ * latter may return NULL (for example if the inject
4503
+ * limit for bfqq is currently 0).
4504
+ *
4505
+ * NOTE: motivation for the second alternative
4506
+ *
4507
+ * Thanks to the way the inject limit is updated in
4508
+ * bfq_update_has_short_ttime(), it is rather likely
4509
+ * that, if I/O is being plugged for bfqq and the
4510
+ * waker queue has pending I/O requests that are
4511
+ * blocking bfqq's I/O, then the third alternative
4512
+ * above lets the waker queue get served before the
4513
+ * I/O-plugging timeout fires. So one may deem the
4514
+ * second alternative superfluous. It is not, because
4515
+ * the third alternative may be way less effective in
4516
+ * case of a synchronization. For two main
4517
+ * reasons. First, throughput may be low because the
4518
+ * inject limit may be too low to guarantee the same
4519
+ * amount of injected I/O, from the waker queue or
4520
+ * other queues, that the second alternative
4521
+ * guarantees (the second alternative unconditionally
4522
+ * injects a pending I/O request of the waker queue
4523
+ * for each bfq_dispatch_request()). Second, with the
4524
+ * third alternative, the duration of the plugging,
4525
+ * i.e., the time before bfqq finally receives new I/O,
4526
+ * may not be minimized, because the waker queue may
4527
+ * happen to be served only after other queues.
4528
+ */
4529
+ if (async_bfqq &&
4530
+ icq_to_bic(async_bfqq->next_rq->elv.icq) == bfqq->bic &&
4531
+ bfq_serv_to_charge(async_bfqq->next_rq, async_bfqq) <=
4532
+ bfq_bfqq_budget_left(async_bfqq))
4533
+ bfqq = bfqq->bic->bfqq[0];
4534
+ else if (bfq_bfqq_has_waker(bfqq) &&
4535
+ bfq_bfqq_busy(bfqq->waker_bfqq) &&
4536
+ bfqq->waker_bfqq->next_rq &&
4537
+ bfq_serv_to_charge(bfqq->waker_bfqq->next_rq,
4538
+ bfqq->waker_bfqq) <=
4539
+ bfq_bfqq_budget_left(bfqq->waker_bfqq)
4540
+ )
4541
+ bfqq = bfqq->waker_bfqq;
4542
+ else if (!idling_boosts_thr_without_issues(bfqd, bfqq) &&
4543
+ (bfqq->wr_coeff == 1 || bfqd->wr_busy_queues > 1 ||
4544
+ !bfq_bfqq_has_short_ttime(bfqq)))
38414545 bfqq = bfq_choose_bfqq_for_injection(bfqd);
38424546 else
38434547 bfqq = NULL;
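/*
 * Editor's note: illustrative, userspace-only sketch added for this
 * review, not part of the patch. It restates the priority among the
 * three injection sources chosen above (budget checks omitted):
 * same-process async I/O first, then the waker queue, then a generic
 * candidate, and the latter only when injection cannot hurt bfqq.
 * All toy_* names are hypothetical.
 */
#include <stdbool.h>
#include <stddef.h>

struct toy_queue;	/* opaque stand-in for struct bfq_queue */

struct toy_injection_choice {
	struct toy_queue *same_process_async;	/* async queue of bfqq's process */
	struct toy_queue *waker;		/* queue whose I/O unblocks bfqq */
	struct toy_queue *generic_candidate;	/* result of the linear search */
	bool injection_would_not_hurt;		/* throughput/latency check */
};

struct toy_queue *toy_pick_injection_source(const struct toy_injection_choice *c)
{
	if (c->same_process_async)	/* can only help the same process */
		return c->same_process_async;
	if (c->waker)			/* unblocks bfqq's own future I/O */
		return c->waker;
	if (c->injection_would_not_hurt)
		return c->generic_candidate;	/* may still be NULL */
	return NULL;
}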
....@@ -3929,15 +4633,15 @@
39294633
39304634 bfq_bfqq_served(bfqq, service_to_charge);
39314635
4636
+ if (bfqq == bfqd->in_service_queue && bfqd->wait_dispatch) {
4637
+ bfqd->wait_dispatch = false;
4638
+ bfqd->waited_rq = rq;
4639
+ }
4640
+
39324641 bfq_dispatch_remove(bfqd->queue, rq);
39334642
3934
- if (bfqq != bfqd->in_service_queue) {
3935
- if (likely(bfqd->in_service_queue))
3936
- bfqd->in_service_queue->injected_service +=
3937
- bfq_serv_to_charge(rq, bfqq);
3938
-
4643
+ if (bfqq != bfqd->in_service_queue)
39394644 goto return_rq;
3940
- }
39414645
39424646 /*
39434647 * If weight raising has to terminate for bfqq, then next
....@@ -3957,7 +4661,7 @@
39574661 * belongs to CLASS_IDLE and other queues are waiting for
39584662 * service.
39594663 */
3960
- if (!(bfqd->busy_queues > 1 && bfq_class_idle(bfqq)))
4664
+ if (!(bfq_tot_busy_queues(bfqd) > 1 && bfq_class_idle(bfqq)))
39614665 goto return_rq;
39624666
39634667 bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED);
....@@ -3975,7 +4679,7 @@
39754679 * most a call to dispatch for nothing
39764680 */
39774681 return !list_empty_careful(&bfqd->dispatch) ||
3978
- bfqd->busy_queues > 0;
4682
+ bfq_tot_busy_queues(bfqd) > 0;
39794683 }
39804684
39814685 static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
....@@ -4029,9 +4733,10 @@
40294733 goto start_rq;
40304734 }
40314735
4032
- bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
4736
+ bfq_log(bfqd, "dispatch requests: %d busy queues",
4737
+ bfq_tot_busy_queues(bfqd));
40334738
4034
- if (bfqd->busy_queues == 0)
4739
+ if (bfq_tot_busy_queues(bfqd) == 0)
40354740 goto exit;
40364741
40374742 /*
....@@ -4043,7 +4748,7 @@
40434748 * some unlucky request wait for as long as the device
40444749 * wishes.
40454750 *
4046
- * Of course, serving one request at at time may cause loss of
4751
+ * Of course, serving one request at a time may cause loss of
40474752 * throughput.
40484753 */
40494754 if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0)
....@@ -4065,7 +4770,7 @@
40654770 return rq;
40664771 }
40674772
4068
-#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
4773
+#ifdef CONFIG_BFQ_CGROUP_DEBUG
40694774 static void bfq_update_dispatch_stats(struct request_queue *q,
40704775 struct request *rq,
40714776 struct bfq_queue *in_serv_queue,
....@@ -4089,7 +4794,7 @@
40894794 * In addition, the following queue lock guarantees that
40904795 * bfqq_group(bfqq) exists as well.
40914796 */
4092
- spin_lock_irq(q->queue_lock);
4797
+ spin_lock_irq(&q->queue_lock);
40934798 if (idle_timer_disabled)
40944799 /*
40954800 * Since the idle timer has been disabled,
....@@ -4108,21 +4813,21 @@
41084813 bfqg_stats_set_start_empty_time(bfqg);
41094814 bfqg_stats_update_io_remove(bfqg, rq->cmd_flags);
41104815 }
4111
- spin_unlock_irq(q->queue_lock);
4816
+ spin_unlock_irq(&q->queue_lock);
41124817 }
41134818 #else
41144819 static inline void bfq_update_dispatch_stats(struct request_queue *q,
41154820 struct request *rq,
41164821 struct bfq_queue *in_serv_queue,
41174822 bool idle_timer_disabled) {}
4118
-#endif
4823
+#endif /* CONFIG_BFQ_CGROUP_DEBUG */
41194824
41204825 static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
41214826 {
41224827 struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
41234828 struct request *rq;
41244829 struct bfq_queue *in_serv_queue;
4125
- bool waiting_rq, idle_timer_disabled;
4830
+ bool waiting_rq, idle_timer_disabled = false;
41264831
41274832 spin_lock_irq(&bfqd->lock);
41284833
....@@ -4130,14 +4835,15 @@
41304835 waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue);
41314836
41324837 rq = __bfq_dispatch_request(hctx);
4133
-
4134
- idle_timer_disabled =
4135
- waiting_rq && !bfq_bfqq_wait_request(in_serv_queue);
4838
+ if (in_serv_queue == bfqd->in_service_queue) {
4839
+ idle_timer_disabled =
4840
+ waiting_rq && !bfq_bfqq_wait_request(in_serv_queue);
4841
+ }
41364842
41374843 spin_unlock_irq(&bfqd->lock);
4138
-
4139
- bfq_update_dispatch_stats(hctx->queue, rq, in_serv_queue,
4140
- idle_timer_disabled);
4844
+ bfq_update_dispatch_stats(hctx->queue, rq,
4845
+ idle_timer_disabled ? in_serv_queue : NULL,
4846
+ idle_timer_disabled);
41414847
41424848 return rq;
41434849 }
....@@ -4151,9 +4857,9 @@
41514857 */
41524858 void bfq_put_queue(struct bfq_queue *bfqq)
41534859 {
4154
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
4860
+ struct bfq_queue *item;
4861
+ struct hlist_node *n;
41554862 struct bfq_group *bfqg = bfqq_group(bfqq);
4156
-#endif
41574863
41584864 if (bfqq->bfqd)
41594865 bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d",
....@@ -4195,13 +4901,41 @@
41954901 bfqq->bfqd->burst_size--;
41964902 }
41974903
4904
+ /*
4905
+ * bfqq does not exist any longer, so it cannot be woken by
4906
+ * any other queue, and cannot wake any other queue. Then bfqq
4907
+ * must be removed from the woken list of its possible waker
4908
+ * queue, and all queues in the woken list of bfqq must stop
4909
+ * having a waker queue. Strictly speaking, these updates
4910
+ * should be performed when bfqq remains with no I/O source
4911
+ * attached to it, which happens before bfqq gets freed. In
4912
+ * particular, this happens when the last process associated
4913
+ * with bfqq exits or gets associated with a different
4914
+ * queue. However, both events lead to bfqq being freed soon,
4915
+ * and dangling references would come out only after bfqq gets
4916
+ * freed. So these updates are done here, as a simple and safe
4917
+ * way to handle all cases.
4918
+ */
4919
+ /* remove bfqq from woken list */
4920
+ if (!hlist_unhashed(&bfqq->woken_list_node))
4921
+ hlist_del_init(&bfqq->woken_list_node);
4922
+
4923
+ /* reset waker for all queues in woken list */
4924
+ hlist_for_each_entry_safe(item, n, &bfqq->woken_list,
4925
+ woken_list_node) {
4926
+ item->waker_bfqq = NULL;
4927
+ bfq_clear_bfqq_has_waker(item);
4928
+ hlist_del_init(&item->woken_list_node);
4929
+ }
4930
+
4931
+ if (bfqq->bfqd && bfqq->bfqd->last_completed_rq_bfqq == bfqq)
4932
+ bfqq->bfqd->last_completed_rq_bfqq = NULL;
4933
+
41984934 kmem_cache_free(bfq_pool, bfqq);
4199
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
42004935 bfqg_and_blkg_put(bfqg);
4201
-#endif
42024936 }
42034937
4204
-static void bfq_put_cooperator(struct bfq_queue *bfqq)
4938
+void bfq_put_cooperator(struct bfq_queue *bfqq)
42054939 {
42064940 struct bfq_queue *__bfqq, *next;
42074941
....@@ -4223,7 +4957,7 @@
42234957 static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
42244958 {
42254959 if (bfqq == bfqd->in_service_queue) {
4226
- __bfq_bfqq_expire(bfqd, bfqq);
4960
+ __bfq_bfqq_expire(bfqd, bfqq, BFQQE_BUDGET_TIMEOUT);
42274961 bfq_schedule_dispatch(bfqd);
42284962 }
42294963
....@@ -4231,7 +4965,7 @@
42314965
42324966 bfq_put_cooperator(bfqq);
42334967
4234
- bfq_put_queue(bfqq); /* release process reference */
4968
+ bfq_release_process_ref(bfqd, bfqq);
42354969 }
42364970
42374971 static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync)
....@@ -4246,9 +4980,8 @@
42464980 unsigned long flags;
42474981
42484982 spin_lock_irqsave(&bfqd->lock, flags);
4249
- bfqq->bic = NULL;
4250
- bfq_exit_bfqq(bfqd, bfqq);
42514983 bic_set_bfqq(bic, NULL, is_sync);
4984
+ bfq_exit_bfqq(bfqd, bfqq);
42524985 spin_unlock_irqrestore(&bfqd->lock, flags);
42534986 }
42544987 }
....@@ -4281,7 +5014,7 @@
42815014 pr_err("bdi %s: bfq: bad prio class %d\n",
42825015 bdi_dev_name(bfqq->bfqd->queue->backing_dev_info),
42835016 ioprio_class);
4284
- /* fall through */
5017
+ fallthrough;
42855018 case IOPRIO_CLASS_NONE:
42865019 /*
42875020 * No prio set, inherit CPU scheduling settings.
....@@ -4334,10 +5067,11 @@
43345067
43355068 bfqq = bic_to_bfqq(bic, false);
43365069 if (bfqq) {
4337
- /* release process reference on this queue */
4338
- bfq_put_queue(bfqq);
4339
- bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic);
5070
+ struct bfq_queue *old_bfqq = bfqq;
5071
+
5072
+ bfqq = bfq_get_queue(bfqd, bio, false, bic);
43405073 bic_set_bfqq(bic, bfqq, false);
5074
+ bfq_release_process_ref(bfqd, old_bfqq);
43415075 }
43425076
43435077 bfqq = bic_to_bfqq(bic, true);
....@@ -4351,6 +5085,8 @@
43515085 RB_CLEAR_NODE(&bfqq->entity.rb_node);
43525086 INIT_LIST_HEAD(&bfqq->fifo);
43535087 INIT_HLIST_NODE(&bfqq->burst_list_node);
5088
+ INIT_HLIST_NODE(&bfqq->woken_list_node);
5089
+ INIT_HLIST_HEAD(&bfqq->woken_list);
43545090
43555091 bfqq->ref = 0;
43565092 bfqq->bfqd = bfqd;
....@@ -4369,13 +5105,6 @@
43695105 bfq_mark_bfqq_has_short_ttime(bfqq);
43705106 bfq_mark_bfqq_sync(bfqq);
43715107 bfq_mark_bfqq_just_created(bfqq);
4372
- /*
4373
- * Aggressively inject a lot of service: up to 90%.
4374
- * This coefficient remains constant during bfqq life,
4375
- * but this behavior might be changed, after enough
4376
- * testing and tuning.
4377
- */
4378
- bfqq->inject_coeff = 1;
43795108 } else
43805109 bfq_clear_bfqq_sync(bfqq);
43815110
....@@ -4419,7 +5148,7 @@
44195148 return &bfqg->async_bfqq[0][ioprio];
44205149 case IOPRIO_CLASS_NONE:
44215150 ioprio = IOPRIO_NORM;
4422
- /* fall through */
5151
+ fallthrough;
44235152 case IOPRIO_CLASS_BE:
44245153 return &bfqg->async_bfqq[1][ioprio];
44255154 case IOPRIO_CLASS_IDLE:
....@@ -4439,14 +5168,7 @@
44395168 struct bfq_queue *bfqq;
44405169 struct bfq_group *bfqg;
44415170
4442
- rcu_read_lock();
4443
-
4444
- bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio));
4445
- if (!bfqg) {
4446
- bfqq = &bfqd->oom_bfqq;
4447
- goto out;
4448
- }
4449
-
5171
+ bfqg = bfq_bio_bfqg(bfqd, bio);
44505172 if (!is_sync) {
44515173 async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
44525174 ioprio);
....@@ -4490,7 +5212,6 @@
44905212 out:
44915213 bfqq->ref++; /* get a process reference to this queue */
44925214 bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref);
4493
- rcu_read_unlock();
44945215 return bfqq;
44955216 }
44965217
....@@ -4513,17 +5234,19 @@
45135234 struct request *rq)
45145235 {
45155236 bfqq->seek_history <<= 1;
4516
- bfqq->seek_history |=
4517
- get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR &&
4518
- (!blk_queue_nonrot(bfqd->queue) ||
4519
- blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT);
5237
+ bfqq->seek_history |= BFQ_RQ_SEEKY(bfqd, bfqq->last_request_pos, rq);
5238
+
5239
+ if (bfqq->wr_coeff > 1 &&
5240
+ bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time &&
5241
+ BFQQ_TOTALLY_SEEKY(bfqq))
5242
+ bfq_bfqq_end_wr(bfqq);
45205243 }
45215244
45225245 static void bfq_update_has_short_ttime(struct bfq_data *bfqd,
45235246 struct bfq_queue *bfqq,
45245247 struct bfq_io_cq *bic)
45255248 {
4526
- bool has_short_ttime = true;
5249
+ bool has_short_ttime = true, state_changed;
45275250
45285251 /*
45295252 * No need to update has_short_ttime if bfqq is async or in
....@@ -4548,13 +5271,102 @@
45485271 bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle))
45495272 has_short_ttime = false;
45505273
4551
- bfq_log_bfqq(bfqd, bfqq, "update_has_short_ttime: has_short_ttime %d",
4552
- has_short_ttime);
5274
+ state_changed = has_short_ttime != bfq_bfqq_has_short_ttime(bfqq);
45535275
45545276 if (has_short_ttime)
45555277 bfq_mark_bfqq_has_short_ttime(bfqq);
45565278 else
45575279 bfq_clear_bfqq_has_short_ttime(bfqq);
5280
+
5281
+ /*
5282
+ * Until the base value for the total service time gets
5283
+ * finally computed for bfqq, the inject limit does depend on
5284
+ * the think-time state (short|long). In particular, the limit
5285
+ * is 0 or 1 if the think time is deemed, respectively, as
5286
+ * short or long (details in the comments in
5287
+ * bfq_update_inject_limit()). Accordingly, the next
5288
+ * instructions reset the inject limit if the think-time state
5289
+ * has changed and the above base value is still to be
5290
+ * computed.
5291
+ *
5292
+ * However, the reset is performed only if more than 100 ms
5293
+ * have elapsed since the last update of the inject limit, or
5294
+ * (inclusive) if the change is from short to long think
5295
+ * time. The reason for this waiting is as follows.
5296
+ *
5297
+ * bfqq may have a long think time because of a
5298
+ * synchronization with some other queue, i.e., because the
5299
+ * I/O of some other queue may need to be completed for bfqq
5300
+ * to receive new I/O. Details in the comments on the choice
5301
+ * of the queue for injection in bfq_select_queue().
5302
+ *
5303
+ * As stressed in those comments, if such a synchronization is
5304
+ * actually in place, then, without injection on bfqq, the
5305
+ * blocking I/O cannot happen to served while bfqq is in
5306
+ * service. As a consequence, if bfqq is granted
5307
+ * I/O-dispatch-plugging, then bfqq remains empty, and no I/O
5308
+ * is dispatched, until the idle timeout fires. This is likely
5309
+ * to result in lower bandwidth and higher latencies for bfqq,
5310
+ * and in a severe loss of total throughput.
5311
+ *
5312
+ * On the opposite end, a non-zero inject limit may allow the
5313
+ * I/O that blocks bfqq to be executed soon, and therefore
5314
+ * bfqq to receive new I/O soon.
5315
+ *
5316
+ * But, if the blocking gets actually eliminated, then the
5317
+ * next think-time sample for bfqq may be very low. This in
5318
+ * turn may cause bfqq's think time to be deemed
5319
+ * short. Without the 100 ms barrier, this new state change
5320
+ * would cause the body of the next if to be executed
5321
+ * immediately. But this would set to 0 the inject
5322
+ * limit. Without injection, the blocking I/O would cause the
5323
+ * think time of bfqq to become long again, and therefore the
5324
+ * inject limit to be raised again, and so on. The only effect
5325
+ * of such a steady oscillation between the two think-time
5326
+ * states would be to prevent effective injection on bfqq.
5327
+ *
5328
+ * In contrast, if the inject limit is not reset during such a
5329
+ * long time interval as 100 ms, then the number of short
5330
+ * think time samples can grow significantly before the reset
5331
+ * is performed. As a consequence, the think time state can
5332
+ * become stable before the reset. Therefore there will be no
5333
+ * state change when the 100 ms elapse, and no reset of the
5334
+ * inject limit. The inject limit remains steadily equal to 1
5335
+ * both during and after the 100 ms. So injection can be
5336
+ * performed at all times, and throughput gets boosted.
5337
+ *
5338
+ * An inject limit equal to 1 is however in conflict, in
5339
+ * general, with the fact that the think time of bfqq is
5340
+ * short, because injection may be likely to delay bfqq's I/O
5341
+ * (as explained in the comments in
5342
+ * bfq_update_inject_limit()). But this does not happen in
5343
+ * this special case, because bfqq's low think time is due to
5344
+ * an effective handling of a synchronization, through
5345
+ * injection. In this special case, bfqq's I/O does not get
5346
+ * delayed by injection; on the contrary, bfqq's I/O is
5347
+ * brought forward, because it is not blocked for
5348
+ * milliseconds.
5349
+ *
5350
+ * In addition, serving the blocking I/O much sooner, and much
5351
+ * more frequently than once per I/O-plugging timeout, makes
5352
+ * it much quicker to detect a waker queue (the concept of
5353
+ * waker queue is defined in the comments in
5354
+ * bfq_add_request()). This makes it possible to start sooner
5355
+ * to boost throughput more effectively, by injecting the I/O
5356
+ * of the waker queue unconditionally on every
5357
+ * bfq_dispatch_request().
5358
+ *
5359
+ * One last, important benefit of not resetting the inject
5360
+ * limit before 100 ms is that, during this time interval, the
5361
+ * base value for the total service time is likely to get
5362
+ * finally computed for bfqq, freeing the inject limit from
5363
+ * its relation with the think time.
5364
+ */
5365
+ if (state_changed && bfqq->last_serv_time_ns == 0 &&
5366
+ (time_is_before_eq_jiffies(bfqq->decrease_time_jif +
5367
+ msecs_to_jiffies(100)) ||
5368
+ !has_short_ttime))
5369
+ bfq_reset_inject_limit(bfqd, bfqq);
45585370 }
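/*
 * Editor's note: illustrative, userspace-only sketch added for this
 * review, not part of the patch. It restates the reset rule above over
 * plain integers: reset the inject limit only while the baseline
 * service time is still unknown and only on a real think-time state
 * change; a switch back to "short" is further rate-limited to once per
 * 100 ms, while a switch to "long" always resets. All toy_* names are
 * hypothetical.
 */
#include <stdbool.h>
#include <stdint.h>

struct toy_ttime_state {
	bool baseline_sampled;		/* base service time already measured? */
	uint64_t last_limit_update_ms;	/* time of last inject-limit update */
};

bool toy_should_reset_limit(const struct toy_ttime_state *s,
			    bool state_changed, bool has_short_ttime,
			    uint64_t now_ms)
{
	if (!state_changed || s->baseline_sampled)
		return false;

	/* now deemed long: reset immediately, injection may unblock bfqq */
	if (!has_short_ttime)
		return true;

	/* now deemed short: honour the 100 ms barrier to avoid oscillation */
	return now_ms - s->last_limit_update_ms >= 100;
}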
45595371
45605372 /*
....@@ -4564,18 +5376,8 @@
45645376 static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
45655377 struct request *rq)
45665378 {
4567
- struct bfq_io_cq *bic = RQ_BIC(rq);
4568
-
45695379 if (rq->cmd_flags & REQ_META)
45705380 bfqq->meta_pending++;
4571
-
4572
- bfq_update_io_thinktime(bfqd, bfqq);
4573
- bfq_update_has_short_ttime(bfqd, bfqq, bic);
4574
- bfq_update_io_seektime(bfqd, bfqq, rq);
4575
-
4576
- bfq_log_bfqq(bfqd, bfqq,
4577
- "rq_enqueued: has_short_ttime=%d (seeky %d)",
4578
- bfq_bfqq_has_short_ttime(bfqq), BFQQ_SEEKY(bfqq));
45795381
45805382 bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
45815383
....@@ -4585,28 +5387,31 @@
45855387 bool budget_timeout = bfq_bfqq_budget_timeout(bfqq);
45865388
45875389 /*
4588
- * There is just this request queued: if the request
4589
- * is small and the queue is not to be expired, then
4590
- * just exit.
5390
+ * There is just this request queued: if
5391
+ * - the request is small, and
5392
+ * - we are idling to boost throughput, and
5393
+ * - the queue is not to be expired,
5394
+ * then just exit.
45915395 *
45925396 * In this way, if the device is being idled to wait
45935397 * for a new request from the in-service queue, we
45945398 * avoid unplugging the device and committing the
4595
- * device to serve just a small request. On the
4596
- * contrary, we wait for the block layer to decide
4597
- * when to unplug the device: hopefully, new requests
4598
- * will be merged to this one quickly, then the device
4599
- * will be unplugged and larger requests will be
4600
- * dispatched.
5399
+ * device to serve just a small request. In contrast
5400
+ * we wait for the block layer to decide when to
5401
+ * unplug the device: hopefully, new requests will be
5402
+ * merged to this one quickly, then the device will be
5403
+ * unplugged and larger requests will be dispatched.
46015404 */
4602
- if (small_req && !budget_timeout)
5405
+ if (small_req && idling_boosts_thr_without_issues(bfqd, bfqq) &&
5406
+ !budget_timeout)
46035407 return;
46045408
46055409 /*
4606
- * A large enough request arrived, or the queue is to
4607
- * be expired: in both cases disk idling is to be
4608
- * stopped, so clear wait_request flag and reset
4609
- * timer.
5410
+ * A large enough request arrived, or idling is being
5411
+ * performed to preserve service guarantees, or
5412
+ * finally the queue is to be expired: in all these
5413
+ * cases disk idling is to be stopped, so clear
5414
+ * wait_request flag and reset timer.
46105415 */
46115416 bfq_clear_bfqq_wait_request(bfqq);
46125417 hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
....@@ -4632,8 +5437,6 @@
46325437 bool waiting, idle_timer_disabled = false;
46335438
46345439 if (new_bfqq) {
4635
- if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
4636
- new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
46375440 /*
46385441 * Release the request's reference to the old bfqq
46395442 * and make sure one is taken to the shared queue.
....@@ -4663,6 +5466,10 @@
46635466 bfqq = new_bfqq;
46645467 }
46655468
5469
+ bfq_update_io_thinktime(bfqd, bfqq);
5470
+ bfq_update_has_short_ttime(bfqd, bfqq, RQ_BIC(rq));
5471
+ bfq_update_io_seektime(bfqd, bfqq, rq);
5472
+
46665473 waiting = bfqq && bfq_bfqq_wait_request(bfqq);
46675474 bfq_add_request(rq);
46685475 idle_timer_disabled = waiting && !bfq_bfqq_wait_request(bfqq);
....@@ -4675,7 +5482,7 @@
46755482 return idle_timer_disabled;
46765483 }
46775484
4678
-#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
5485
+#ifdef CONFIG_BFQ_CGROUP_DEBUG
46795486 static void bfq_update_insert_stats(struct request_queue *q,
46805487 struct bfq_queue *bfqq,
46815488 bool idle_timer_disabled,
....@@ -4694,18 +5501,20 @@
46945501 * In addition, the following queue lock guarantees that
46955502 * bfqq_group(bfqq) exists as well.
46965503 */
4697
- spin_lock_irq(q->queue_lock);
5504
+ spin_lock_irq(&q->queue_lock);
46985505 bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags);
46995506 if (idle_timer_disabled)
47005507 bfqg_stats_update_idle_time(bfqq_group(bfqq));
4701
- spin_unlock_irq(q->queue_lock);
5508
+ spin_unlock_irq(&q->queue_lock);
47025509 }
47035510 #else
47045511 static inline void bfq_update_insert_stats(struct request_queue *q,
47055512 struct bfq_queue *bfqq,
47065513 bool idle_timer_disabled,
47075514 unsigned int cmd_flags) {}
4708
-#endif
5515
+#endif /* CONFIG_BFQ_CGROUP_DEBUG */
5516
+
5517
+static struct bfq_queue *bfq_init_rq(struct request *rq);
47095518
47105519 static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
47115520 bool at_head)
....@@ -4716,18 +5525,19 @@
47165525 bool idle_timer_disabled = false;
47175526 unsigned int cmd_flags;
47185527
5528
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
5529
+ if (!cgroup_subsys_on_dfl(io_cgrp_subsys) && rq->bio)
5530
+ bfqg_stats_update_legacy_io(q, rq);
5531
+#endif
47195532 spin_lock_irq(&bfqd->lock);
5533
+ bfqq = bfq_init_rq(rq);
47205534 if (blk_mq_sched_try_insert_merge(q, rq)) {
47215535 spin_unlock_irq(&bfqd->lock);
47225536 return;
47235537 }
47245538
4725
- spin_unlock_irq(&bfqd->lock);
4726
-
47275539 blk_mq_sched_request_inserted(rq);
47285540
4729
- spin_lock_irq(&bfqd->lock);
4730
- bfqq = bfq_init_rq(rq);
47315541 if (!bfqq || at_head || blk_rq_is_passthrough(rq)) {
47325542 if (at_head)
47335543 list_add(&rq->queuelist, &bfqd->dispatch);
....@@ -4776,6 +5586,8 @@
47765586
47775587 static void bfq_update_hw_tag(struct bfq_data *bfqd)
47785588 {
5589
+ struct bfq_queue *bfqq = bfqd->in_service_queue;
5590
+
47795591 bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver,
47805592 bfqd->rq_in_driver);
47815593
....@@ -4788,7 +5600,18 @@
47885600 * sum is not exact, as it's not taking into account deactivated
47895601 * requests.
47905602 */
4791
- if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
5603
+ if (bfqd->rq_in_driver + bfqd->queued <= BFQ_HW_QUEUE_THRESHOLD)
5604
+ return;
5605
+
5606
+ /*
5607
+ * If the active queue doesn't have enough requests and can idle, bfq
5608
+ * might not dispatch enough requests to the hardware. Don't zero
5609
+ * hw_tag in this case.
5610
+ */
5611
+ if (bfqq && bfq_bfqq_has_short_ttime(bfqq) &&
5612
+ bfqq->dispatched + bfqq->queued[0] + bfqq->queued[1] <
5613
+ BFQ_HW_QUEUE_THRESHOLD &&
5614
+ bfqd->rq_in_driver < BFQ_HW_QUEUE_THRESHOLD)
47925615 return;
47935616
47945617 if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
....@@ -4797,6 +5620,9 @@
47975620 bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
47985621 bfqd->max_rq_in_driver = 0;
47995622 bfqd->hw_tag_samples = 0;
5623
+
5624
+ bfqd->nonrot_with_queueing =
5625
+ blk_queue_nonrot(bfqd->queue) && bfqd->hw_tag;
48005626 }
48015627
48025628 static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
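
The sampling scheme used by bfq_update_hw_tag() can be condensed into a short user-space C model. The threshold and sample count mirror the BFQ_HW_QUEUE_* macros defined earlier in the file; the struct and function names are hypothetical, and the new exception for an in-service queue with too few requests is left out for brevity.

#include <stdio.h>

#define HW_QUEUE_THRESHOLD 3	/* parallel-request threshold  */
#define HW_QUEUE_SAMPLES  32	/* samples per decision window */

/* Hypothetical per-device sampling state. */
struct hw_tag_state {
	int max_rq_in_driver;
	int samples;
	int hw_tag;	/* -1 = unknown, 0 = no internal queueing, 1 = queueing */
};

static void hw_tag_sample(struct hw_tag_state *s, int rq_in_driver, int queued)
{
	if (s->max_rq_in_driver < rq_in_driver)
		s->max_rq_in_driver = rq_in_driver;

	/* Too few requests around to infer anything about queueing. */
	if (rq_in_driver + queued <= HW_QUEUE_THRESHOLD)
		return;

	if (++s->samples < HW_QUEUE_SAMPLES)
		return;

	/* End of the window: decide, then restart sampling.
	 * (The real code also recomputes nonrot_with_queueing here.) */
	s->hw_tag = s->max_rq_in_driver > HW_QUEUE_THRESHOLD;
	s->max_rq_in_driver = 0;
	s->samples = 0;
}

int main(void)
{
	struct hw_tag_state s = { .hw_tag = -1 };

	for (int i = 0; i < 64; i++)
		hw_tag_sample(&s, 6, 2);	/* device clearly queueing */
	printf("hw_tag = %d\n", s.hw_tag);	/* prints 1 */
	return 0;
}
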
....@@ -4852,6 +5678,7 @@
48525678 1UL<<(BFQ_RATE_SHIFT - 10))
48535679 bfq_update_rate_reset(bfqd, NULL);
48545680 bfqd->last_completion = now_ns;
5681
+ bfqd->last_completed_rq_bfqq = bfqq;
48555682
48565683 /*
48575684 * If we are waiting to discover whether the request pattern
....@@ -4859,11 +5686,14 @@
48595686 * isochronous, and both requisites for this condition to hold
48605687 * are now satisfied, then compute soft_rt_next_start (see the
48615688 * comments on the function bfq_bfqq_softrt_next_start()). We
4862
- * schedule this delayed check when bfqq expires, if it still
4863
- * has in-flight requests.
5689
+ * do not compute soft_rt_next_start if bfqq is in interactive
5690
+ * weight raising (see the comments in bfq_bfqq_expire() for
5691
+ * an explanation). We schedule this delayed update when bfqq
5692
+ * expires, if it still has in-flight requests.
48645693 */
48655694 if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 &&
4866
- RB_EMPTY_ROOT(&bfqq->sort_list))
5695
+ RB_EMPTY_ROOT(&bfqq->sort_list) &&
5696
+ bfqq->wr_coeff != bfqd->bfq_wr_coeff)
48675697 bfqq->soft_rt_next_start =
48685698 bfq_bfqq_softrt_next_start(bfqd, bfqq);
48695699
....@@ -4921,6 +5751,167 @@
49215751 }
49225752
49235753 /*
5754
+ * The processes associated with bfqq may happen to generate their
5755
+ * cumulative I/O at a lower rate than the rate at which the device
5756
+ * could serve the same I/O. This is rather probable, e.g., if only
5757
+ * one process is associated with bfqq and the device is an SSD. It
5758
+ * results in bfqq becoming often empty while in service. In this
5759
+ * respect, if BFQ is allowed to switch to another queue when bfqq
5760
+ * remains empty, then the device goes on being fed with I/O requests,
5761
+ * and the throughput is not affected. In contrast, if BFQ is not
5762
+ * allowed to switch to another queue---because bfqq is sync and
5763
+ * I/O-dispatch needs to be plugged while bfqq is temporarily
5764
+ * empty---then, during the service of bfqq, there will be frequent
5765
+ * "service holes", i.e., time intervals during which bfqq gets empty
5766
+ * and the device can only consume the I/O already queued in its
5767
+ * hardware queues. During service holes, the device may even end up
5768
+ * remaining idle. In the end, during the service of bfqq, the device
5769
+ * is driven at a lower speed than the one it can reach with the kind
5770
+ * of I/O flowing through bfqq.
5771
+ *
5772
+ * To counter this loss of throughput, BFQ implements a "request
5773
+ * injection mechanism", which tries to fill the above service holes
5774
+ * with I/O requests taken from other queues. The hard part in this
5775
+ * mechanism is finding the right amount of I/O to inject, so as to
5776
+ * both boost throughput and not break bfqq's bandwidth and latency
5777
+ * guarantees. In this respect, the mechanism maintains a per-queue
5778
+ * inject limit, computed as below. While bfqq is empty, the injection
5779
+ * mechanism dispatches extra I/O requests only until the total number
5780
+ * of I/O requests in flight---i.e., already dispatched but not yet
5781
+ * completed---remains lower than this limit.
5782
+ *
5783
+ * A first definition comes in handy to introduce the algorithm by
5784
+ * which the inject limit is computed. We define as first request for
5785
+ * bfqq, an I/O request for bfqq that arrives while bfqq is in
5786
+ * service, and causes bfqq to switch from empty to non-empty. The
5787
+ * algorithm updates the limit as a function of the effect of
5788
+ * injection on the service times of only the first requests of
5789
+ * bfqq. The reason for this restriction is that these are the
5790
+ * requests whose service time is affected most, because they are the
5791
+ * first to arrive after injection possibly occurred.
5792
+ *
5793
+ * To evaluate the effect of injection, the algorithm measures the
5794
+ * "total service time" of first requests. We define as total service
5795
+ * time of an I/O request, the time that elapses since when the
5796
+ * request is enqueued into bfqq, to when it is completed. This
5797
+ * quantity allows the whole effect of injection to be measured. It is
5798
+ * easy to see why. Suppose that some requests of other queues are
5799
+ * actually injected while bfqq is empty, and that a new request R
5800
+ * then arrives for bfqq. If the device does start to serve all or
5801
+ * part of the injected requests during the service hole, then,
5802
+ * because of this extra service, it may delay the next invocation of
5803
+ * the dispatch hook of BFQ. Then, even after R gets eventually
5804
+ * dispatched, the device may delay the actual service of R if it is
5805
+ * still busy serving the extra requests, or if it decides to serve,
5806
+ * before R, some extra request still present in its queues. As a
5807
+ * conclusion, the cumulative extra delay caused by injection can be
5808
+ * easily evaluated by just comparing the total service time of first
5809
+ * requests with and without injection.
5810
+ *
5811
+ * The limit-update algorithm works as follows. On the arrival of a
5812
+ * first request of bfqq, the algorithm measures the total time of the
5813
+ * request only if one of the three cases below holds, and, for each
5814
+ * case, it updates the limit as described below:
5815
+ *
5816
+ * (1) If there is no in-flight request. This gives a baseline for the
5817
+ * total service time of the requests of bfqq. If the baseline has
5818
+ * not been computed yet, then, after computing it, the limit is
5819
+ * set to 1, to start boosting throughput, and to prepare the
5820
+ * ground for the next case. If the baseline has already been
5821
+ * computed, then it is updated, in case it turns out to be lower
5822
+ * than the previous value.
5823
+ *
5824
+ * (2) If the limit is higher than 0 and there are in-flight
5825
+ * requests. By comparing the total service time in this case with
5826
+ * the above baseline, it is possible to know at which extent the
5827
+ * current value of the limit is inflating the total service
5828
+ * time. If the inflation is below a certain threshold, then bfqq
5829
+ * is assumed to be suffering from no perceivable loss of its
5830
+ * service guarantees, and the limit is even tentatively
5831
+ * increased. If the inflation is above the threshold, then the
5832
+ * limit is decreased. Due to the lack of any hysteresis, this
5833
+ * logic makes the limit oscillate even in steady workload
5834
+ * conditions. Yet we opted for it, because it is fast in reaching
5835
+ * the best value for the limit, as a function of the current I/O
5836
+ * workload. To reduce oscillations, this step is disabled for a
5837
+ * short time interval after the limit happens to be decreased.
5838
+ *
5839
+ * (3) Periodically, after resetting the limit, to make sure that the
5840
+ * limit eventually drops in case the workload changes. This is
5841
+ * needed because, after the limit has gone safely up for a
5842
+ * certain workload, it is impossible to guess whether the
5843
+ * baseline total service time may have changed, without measuring
5844
+ * it again without injection. A more effective version of this
5845
+ * step might be to just sample the baseline, by interrupting
5846
+ * injection only once, and then to reset/lower the limit only if
5847
+ * the total service time with the current limit does happen to be
5848
+ * too large.
5849
+ *
5850
+ * More details on each step are provided in the comments on the
5851
+ * pieces of code that implement these steps: the branch handling the
5852
+ * transition from empty to non-empty in bfq_add_request(), the branch
5853
+ * handling injection in bfq_select_queue(), and the function
5854
+ * bfq_choose_bfqq_for_injection(). These comments also explain some
5855
+ * exceptions, made by the injection mechanism in some special cases.
5856
+ */
5857
+static void bfq_update_inject_limit(struct bfq_data *bfqd,
5858
+ struct bfq_queue *bfqq)
5859
+{
5860
+ u64 tot_time_ns = ktime_get_ns() - bfqd->last_empty_occupied_ns;
5861
+ unsigned int old_limit = bfqq->inject_limit;
5862
+
5863
+ if (bfqq->last_serv_time_ns > 0 && bfqd->rqs_injected) {
5864
+ u64 threshold = (bfqq->last_serv_time_ns * 3)>>1;
5865
+
5866
+ if (tot_time_ns >= threshold && old_limit > 0) {
5867
+ bfqq->inject_limit--;
5868
+ bfqq->decrease_time_jif = jiffies;
5869
+ } else if (tot_time_ns < threshold &&
5870
+ old_limit <= bfqd->max_rq_in_driver)
5871
+ bfqq->inject_limit++;
5872
+ }
5873
+
5874
+ /*
5875
+ * Either we still have to compute the base value for the
5876
+ * total service time, and there seem to be the right
5877
+ * conditions to do it, or we can lower the last base value
5878
+ * computed.
5879
+ *
5880
+ * NOTE: (bfqd->rq_in_driver == 1) means that there is no I/O
5881
+ * request in flight, because this function is in the code
5882
+ * path that handles the completion of a request of bfqq, and,
5883
+ * in particular, this function is executed before
5884
+ * bfqd->rq_in_driver is decremented in such a code path.
5885
+ */
5886
+ if ((bfqq->last_serv_time_ns == 0 && bfqd->rq_in_driver == 1) ||
5887
+ tot_time_ns < bfqq->last_serv_time_ns) {
5888
+ if (bfqq->last_serv_time_ns == 0) {
5889
+ /*
5890
+ * Now we certainly have a base value: make sure we
5891
+ * start trying injection.
5892
+ */
5893
+ bfqq->inject_limit = max_t(unsigned int, 1, old_limit);
5894
+ }
5895
+ bfqq->last_serv_time_ns = tot_time_ns;
5896
+ } else if (!bfqd->rqs_injected && bfqd->rq_in_driver == 1)
5897
+ /*
5898
+ * No I/O injected and no request still in service in
5899
+ * the drive: these are the exact conditions for
5900
+ * computing the base value of the total service time
5901
+ * for bfqq. So let's update this value, because it is
5902
+ * rather variable. For example, it varies if the size
5903
+ * or the spatial locality of the I/O requests in bfqq
5904
+ * change.
5905
+ */
5906
+ bfqq->last_serv_time_ns = tot_time_ns;
5907
+
5908
+
5909
+ /* update complete, not waiting for any request completion any longer */
5910
+ bfqd->waited_rq = NULL;
5911
+ bfqd->rqs_injected = false;
5912
+}
5913
+
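
For reference, the decision logic of bfq_update_inject_limit() condenses into the following user-space C model, using the same 3/2 inflation threshold as the code above. The struct and function names are hypothetical; the measured total service time is passed in directly instead of being derived from ktime and last_empty_occupied_ns, and the decrease_time_jif bookkeeping is omitted.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical per-queue state, mirroring the fields used above. */
struct inject_state {
	uint64_t last_serv_time_ns;	/* baseline; 0 = not known yet */
	unsigned int inject_limit;
	unsigned int max_rq_in_driver;
};

/* tot_time_ns: measured total service time of a "first request".
 * rqs_injected: whether extra I/O was injected while waiting for it.
 * rq_in_driver: in-flight requests at completion time (>= 1).        */
static void update_inject_limit(struct inject_state *s, uint64_t tot_time_ns,
				bool rqs_injected, int rq_in_driver)
{
	unsigned int old_limit = s->inject_limit;

	if (s->last_serv_time_ns > 0 && rqs_injected) {
		uint64_t threshold = (s->last_serv_time_ns * 3) >> 1;

		if (tot_time_ns >= threshold && old_limit > 0)
			s->inject_limit--;	/* injection hurt bfqq */
		else if (tot_time_ns < threshold &&
			 old_limit <= s->max_rq_in_driver)
			s->inject_limit++;	/* room to inject more */
	}

	/* Capture or lower the baseline when the sample is "clean". */
	if ((s->last_serv_time_ns == 0 && rq_in_driver == 1) ||
	    tot_time_ns < s->last_serv_time_ns) {
		if (s->last_serv_time_ns == 0)
			s->inject_limit = old_limit > 1 ? old_limit : 1;
		s->last_serv_time_ns = tot_time_ns;
	} else if (!rqs_injected && rq_in_driver == 1)
		s->last_serv_time_ns = tot_time_ns;	/* refresh baseline */
}

int main(void)
{
	struct inject_state s = { .max_rq_in_driver = 8 };

	update_inject_limit(&s, 100000, false, 1); /* baseline set, limit -> 1   */
	update_inject_limit(&s, 120000, true, 3);  /* below 150000: limit -> 2   */
	update_inject_limit(&s, 200000, true, 4);  /* at/above 150000: limit -> 1 */
	printf("limit=%u baseline=%llu ns\n", s.inject_limit,
	       (unsigned long long)s.last_serv_time_ns);
	return 0;
}
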
5914
+/*
49245915 * Handle either a requeue or a finish for rq. The things to do are
49255916 * the same in both cases: all references to rq are to be dropped. In
49265917 * particular, rq is considered completed from the point of view of
....@@ -4930,18 +5921,6 @@
49305921 {
49315922 struct bfq_queue *bfqq = RQ_BFQQ(rq);
49325923 struct bfq_data *bfqd;
4933
-
4934
- /*
4935
- * Requeue and finish hooks are invoked in blk-mq without
4936
- * checking whether the involved request is actually still
4937
- * referenced in the scheduler. To handle this fact, the
4938
- * following two checks make this function exit in case of
4939
- * spurious invocations, for which there is nothing to do.
4940
- *
4941
- * First, check whether rq has nothing to do with an elevator.
4942
- */
4943
- if (unlikely(!(rq->rq_flags & RQF_ELVPRIV)))
4944
- return;
49455924
49465925 /*
49475926 * rq either is not associated with any icq, or is an already
....@@ -4963,6 +5942,9 @@
49635942 unsigned long flags;
49645943
49655944 spin_lock_irqsave(&bfqd->lock, flags);
5945
+
5946
+ if (rq == bfqd->waited_rq)
5947
+ bfq_update_inject_limit(bfqd, bfqq);
49665948
49675949 bfq_completed_request(bfqq, bfqd);
49685950 bfq_finish_requeue_request_body(bfqq);
....@@ -5012,6 +5994,8 @@
50125994 }
50135995
50145996 /*
5997
+ * Removes the association between the current task and bfqq, assuming
5998
+ * that bic points to the bfq iocontext of the task.
50155999 * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
50166000 * was the last process referring to that bfqq.
50176001 */
....@@ -5027,11 +6011,11 @@
50276011 return bfqq;
50286012 }
50296013
5030
- bic_set_bfqq(bic, NULL, 1);
6014
+ bic_set_bfqq(bic, NULL, true);
50316015
50326016 bfq_put_cooperator(bfqq);
50336017
5034
- bfq_put_queue(bfqq);
6018
+ bfq_release_process_ref(bfqq->bfqd, bfqq);
50356019 return NULL;
50366020 }
50376021
....@@ -5104,7 +6088,7 @@
51046088 * comments on bfq_init_rq for the reason behind this delayed
51056089 * preparation.
51066090 */
5107
-static void bfq_prepare_request(struct request *rq, struct bio *bio)
6091
+static void bfq_prepare_request(struct request *rq)
51086092 {
51096093 /*
51106094 * Regardless of whether we have an icq attached, we have to
....@@ -5127,7 +6111,7 @@
51276111 * preparation is that, after the prepare_request hook is invoked for
51286112 * rq, rq may still be transformed into a request with no icq, i.e., a
51296113 * request not associated with any queue. No bfq hook is invoked to
5130
- * signal this tranformation. As a consequence, should these
6114
+ * signal this transformation. As a consequence, should these
51316115 * preparation operations be performed when the prepare_request hook
51326116 * is invoked, and should rq be transformed one moment later, bfq
51336117 * would end up in an inconsistent state, because it would have
....@@ -5218,7 +6202,29 @@
52186202 }
52196203 }
52206204
5221
- if (unlikely(bfq_bfqq_just_created(bfqq)))
6205
+ /*
6206
+ * Consider bfqq as possibly belonging to a burst of newly
6207
+ * created queues only if:
6208
+ * 1) A burst is actually happening (bfqd->burst_size > 0)
6209
+ * or
6210
+ * 2) There is no other active queue. In fact, if, in
6211
+ * contrast, there are active queues not belonging to the
6212
+ * possible burst bfqq may belong to, then there is no gain
6213
+ * in considering bfqq as belonging to a burst, and
6214
+ * therefore in not weight-raising bfqq. See comments on
6215
+ * bfq_handle_burst().
6216
+ *
6217
+ * This filtering also helps eliminating false positives,
6218
+ * occurring when bfqq does not belong to an actual large
6219
+ * burst, but some background task (e.g., a service) happens
6220
+ * to trigger the creation of new queues very close to when
6221
+ * bfqq and its possible companion queues are created. See
6222
+ * comments on bfq_handle_burst() for further details also on
6223
+ * this issue.
6224
+ */
6225
+ if (unlikely(bfq_bfqq_just_created(bfqq) &&
6226
+ (bfqd->burst_size > 0 ||
6227
+ bfq_tot_busy_queues(bfqd) == 0)))
52226228 bfq_handle_burst(bfqd, bfqq);
52236229
52246230 return bfqq;
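
The burst filter introduced above reduces to a one-line predicate. A minimal user-space sketch of that predicate (the function name is a hypothetical stand-in), with the three interesting cases:

#include <stdbool.h>
#include <stdio.h>

/* Should a just-created queue be examined by the burst-handling logic?
 * Only if a burst is already being tracked, or no other queue is busy;
 * otherwise weight-raising the queue normally is the better choice.    */
static bool consider_for_burst(bool just_created, int burst_size,
			       int tot_busy_queues)
{
	return just_created && (burst_size > 0 || tot_busy_queues == 0);
}

int main(void)
{
	printf("%d\n", consider_for_burst(true, 0, 0)); /* 1: no other queue busy   */
	printf("%d\n", consider_for_burst(true, 0, 3)); /* 0: likely false positive */
	printf("%d\n", consider_for_burst(true, 5, 3)); /* 1: burst in progress     */
	return 0;
}
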
....@@ -5267,8 +6273,8 @@
52676273 bfq_bfqq_expire(bfqd, bfqq, true, reason);
52686274
52696275 schedule_dispatch:
5270
- spin_unlock_irqrestore(&bfqd->lock, flags);
52716276 bfq_schedule_dispatch(bfqd);
6277
+ spin_unlock_irqrestore(&bfqd->lock, flags);
52726278 }
52736279
52746280 /*
....@@ -5381,8 +6387,8 @@
53816387 struct blk_mq_tags *tags = hctx->sched_tags;
53826388 unsigned int min_shallow;
53836389
5384
- min_shallow = bfq_update_depths(bfqd, &tags->bitmap_tags);
5385
- sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, min_shallow);
6390
+ min_shallow = bfq_update_depths(bfqd, tags->bitmap_tags);
6391
+ sbitmap_queue_min_shallow_depth(tags->bitmap_tags, min_shallow);
53866392 }
53876393
53886394 static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index)
....@@ -5405,10 +6411,10 @@
54056411
54066412 hrtimer_cancel(&bfqd->idle_slice_timer);
54076413
5408
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
54096414 /* release oom-queue reference to root group */
54106415 bfqg_and_blkg_put(bfqd->root_group);
54116416
6417
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
54126418 blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq);
54136419 #else
54146420 spin_lock_irq(&bfqd->lock);
....@@ -5454,9 +6460,9 @@
54546460 }
54556461 eq->elevator_data = bfqd;
54566462
5457
- spin_lock_irq(q->queue_lock);
6463
+ spin_lock_irq(&q->queue_lock);
54586464 q->elevator = eq;
5459
- spin_unlock_irq(q->queue_lock);
6465
+ spin_unlock_irq(&q->queue_lock);
54606466
54616467 /*
54626468 * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
....@@ -5488,7 +6494,7 @@
54886494 HRTIMER_MODE_REL);
54896495 bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
54906496
5491
- bfqd->queue_weights_tree = RB_ROOT;
6497
+ bfqd->queue_weights_tree = RB_ROOT_CACHED;
54926498 bfqd->num_groups_with_pending_reqs = 0;
54936499
54946500 INIT_LIST_HEAD(&bfqd->active_list);
....@@ -5496,6 +6502,7 @@
54966502 INIT_HLIST_HEAD(&bfqd->burst_list);
54976503
54986504 bfqd->hw_tag = -1;
6505
+ bfqd->nonrot_with_queueing = blk_queue_nonrot(bfqd->queue);
54996506
55006507 bfqd->bfq_max_budget = bfq_default_max_budget;
55016508
....@@ -5796,7 +6803,7 @@
57966803 };
57976804
57986805 static struct elevator_type iosched_bfq_mq = {
5799
- .ops.mq = {
6806
+ .ops = {
58006807 .limit_depth = bfq_limit_depth,
58016808 .prepare_request = bfq_prepare_request,
58026809 .requeue_request = bfq_finish_requeue_request,
....@@ -5818,7 +6825,6 @@
58186825 .exit_sched = bfq_exit_queue,
58196826 },
58206827
5821
- .uses_mq = true,
58226828 .icq_size = sizeof(struct bfq_io_cq),
58236829 .icq_align = __alignof__(struct bfq_io_cq),
58246830 .elevator_attrs = bfq_attrs,