hc
2023-12-06 08f87f769b595151be1afeff53e144f543faa614
kernel/block/bfq-iosched.c
....@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0-or-later
12 /*
23 * Budget Fair Queueing (BFQ) I/O scheduler.
34 *
....@@ -12,21 +13,11 @@
1213 *
1314 * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org>
1415 *
15
- * This program is free software; you can redistribute it and/or
16
- * modify it under the terms of the GNU General Public License as
17
- * published by the Free Software Foundation; either version 2 of the
18
- * License, or (at your option) any later version.
19
- *
20
- * This program is distributed in the hope that it will be useful,
21
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
22
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23
- * General Public License for more details.
24
- *
2516 * BFQ is a proportional-share I/O scheduler, with some extra
2617 * low-latency capabilities. BFQ also supports full hierarchical
2718 * scheduling through cgroups. Next paragraphs provide an introduction
2819 * on BFQ inner workings. Details on BFQ benefits, usage and
29
- * limitations can be found in Documentation/block/bfq-iosched.txt.
20
+ * limitations can be found in Documentation/block/bfq-iosched.rst.
3021 *
3122 * BFQ is a proportional-share storage-I/O scheduling algorithm based
3223 * on the slice-by-slice service scheme of CFQ. But BFQ assigns
....@@ -167,6 +158,7 @@
167158 BFQ_BFQQ_FNS(coop);
168159 BFQ_BFQQ_FNS(split_coop);
169160 BFQ_BFQQ_FNS(softrt_update);
161
+BFQ_BFQQ_FNS(has_waker);
170162 #undef BFQ_BFQQ_FNS \
171163
172164 /* Expiration time of sync (0) and async (1) requests, in ns. */
....@@ -190,7 +182,7 @@
190182 /*
191183 * When a sync request is dispatched, the queue that contains that
192184 * request, and all the ancestor entities of that queue, are charged
193
- * with the number of sectors of the request. In constrast, if the
185
+ * with the number of sectors of the request. In contrast, if the
194186 * request is async, then the queue and its ancestor entities are
195187 * charged with the number of sectors of the request, multiplied by
196188 * the factor below. This throttles the bandwidth for async I/O,
....@@ -218,7 +210,7 @@
218210 * queue merging.
219211 *
220212 * As can be deduced from the low time limit below, queue merging, if
221
- * successful, happens at the very beggining of the I/O of the involved
213
+ * successful, happens at the very beginning of the I/O of the involved
222214 * cooperating processes, as a consequence of the arrival of the very
223215 * first requests from each cooperator. After that, there is very
224216 * little chance to find cooperators.
....@@ -231,13 +223,26 @@
231223 #define BFQ_MIN_TT (2 * NSEC_PER_MSEC)
232224
233225 /* hw_tag detection: parallel requests threshold and min samples needed. */
234
-#define BFQ_HW_QUEUE_THRESHOLD 4
226
+#define BFQ_HW_QUEUE_THRESHOLD 3
235227 #define BFQ_HW_QUEUE_SAMPLES 32
236228
237229 #define BFQQ_SEEK_THR (sector_t)(8 * 100)
238230 #define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32)
231
+#define BFQ_RQ_SEEKY(bfqd, last_pos, rq) \
232
+ (get_sdist(last_pos, rq) > \
233
+ BFQQ_SEEK_THR && \
234
+ (!blk_queue_nonrot(bfqd->queue) || \
235
+ blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT))
239236 #define BFQQ_CLOSE_THR (sector_t)(8 * 1024)
240237 #define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 19)
238
+/*
239
+ * Sync random I/O is likely to be confused with soft real-time I/O,
240
+ * because it is characterized by limited throughput and apparently
241
+ * isochronous arrival pattern. To avoid false positives, queues
242
+ * containing only random (seeky) I/O are prevented from being tagged
243
+ * as soft real-time.
244
+ */
245
+#define BFQQ_TOTALLY_SEEKY(bfqq) (bfqq->seek_history == -1)
241246
242247 /* Min number of samples required to perform peak-rate update */
243248 #define BFQ_RATE_MIN_SAMPLES 32
....@@ -400,9 +405,9 @@
400405 unsigned long flags;
401406 struct bfq_io_cq *icq;
402407
403
- spin_lock_irqsave(q->queue_lock, flags);
408
+ spin_lock_irqsave(&q->queue_lock, flags);
404409 icq = icq_to_bic(ioc_lookup_icq(ioc, q));
405
- spin_unlock_irqrestore(q->queue_lock, flags);
410
+ spin_unlock_irqrestore(&q->queue_lock, flags);
406411
407412 return icq;
408413 }
....@@ -416,6 +421,8 @@
416421 */
417422 void bfq_schedule_dispatch(struct bfq_data *bfqd)
418423 {
424
+ lockdep_assert_held(&bfqd->lock);
425
+
419426 if (bfqd->queued != 0) {
420427 bfq_log(bfqd, "schedule dispatch");
421428 blk_mq_run_hw_queues(bfqd->queue, true);
....@@ -423,13 +430,12 @@
423430 }
424431
425432 #define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
426
-#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT)
427433
428434 #define bfq_sample_valid(samples) ((samples) > 80)
429435
430436 /*
431437 * Lifted from AS - choose which of rq1 and rq2 that is best served now.
432
- * We choose the request that is closesr to the head right now. Distance
438
+ * We choose the request that is closer to the head right now. Distance
433439 * behind the head is penalized and only allowed to a certain extent.
434440 */
435441 static struct request *bfq_choose_req(struct bfq_data *bfqd,
....@@ -591,7 +597,16 @@
591597 bfq_merge_time_limit);
592598 }
593599
594
-void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
600
+/*
601
+ * The following function is not marked as __cold because it is
602
+ * actually cold, but for the same performance goal described in the
603
+ * comments on the likely() at the beginning of
604
+ * bfq_setup_cooperator(). Unexpectedly, to reach an even lower
605
+ * execution time for the case where this function is not invoked, we
606
+ * had to add an unlikely() in each involved if().
607
+ */
608
+void __cold
609
+bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
595610 {
596611 struct rb_node **p, *parent;
597612 struct bfq_queue *__bfqq;
....@@ -600,6 +615,10 @@
600615 rb_erase(&bfqq->pos_node, bfqq->pos_root);
601616 bfqq->pos_root = NULL;
602617 }
618
+
619
+ /* oom_bfqq does not participate in queue merging */
620
+ if (bfqq == &bfqd->oom_bfqq)
621
+ return;
603622
604623 /*
605624 * bfqq cannot be merged any longer (see comments in
....@@ -625,52 +644,68 @@
625644 }
626645
627646 /*
628
- * Tell whether there are active queues with different weights or
629
- * active groups.
630
- */
631
-static bool bfq_varied_queue_weights_or_active_groups(struct bfq_data *bfqd)
632
-{
633
- /*
634
- * For queue weights to differ, queue_weights_tree must contain
635
- * at least two nodes.
636
- */
637
- return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) &&
638
- (bfqd->queue_weights_tree.rb_node->rb_left ||
639
- bfqd->queue_weights_tree.rb_node->rb_right)
640
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
641
- ) ||
642
- (bfqd->num_groups_with_pending_reqs > 0
643
-#endif
644
- );
645
-}
646
-
647
-/*
648
- * The following function returns true if every queue must receive the
649
- * same share of the throughput (this condition is used when deciding
650
- * whether idling may be disabled, see the comments in the function
651
- * bfq_better_to_idle()).
647
+ * The following function returns false either if every active queue
648
+ * must receive the same share of the throughput (symmetric scenario),
649
+ * or, as a special case, if bfqq must receive a share of the
650
+ * throughput lower than or equal to the share that every other active
651
+ * queue must receive. If bfqq does sync I/O, then these are the only
652
+ * two cases where bfqq happens to be guaranteed its share of the
653
+ * throughput even if I/O dispatching is not plugged when bfqq remains
654
+ * temporarily empty (for more details, see the comments in the
655
+ * function bfq_better_to_idle()). For this reason, the return value
656
+ * of this function is used to check whether I/O-dispatch plugging can
657
+ * be avoided.
652658 *
653
- * Such a scenario occurs when:
659
+ * The above first case (symmetric scenario) occurs when:
654660 * 1) all active queues have the same weight,
655
- * 2) all active groups at the same level in the groups tree have the same
656
- * weight,
661
+ * 2) all active queues belong to the same I/O-priority class,
657662 * 3) all active groups at the same level in the groups tree have the same
663
+ * weight,
664
+ * 4) all active groups at the same level in the groups tree have the same
658665 * number of children.
659666 *
660667 * Unfortunately, keeping the necessary state for evaluating exactly
661668 * the last two symmetry sub-conditions above would be quite complex
662
- * and time consuming. Therefore this function evaluates, instead,
663
- * only the following stronger two sub-conditions, for which it is
669
+ * and time consuming. Therefore this function evaluates, instead,
670
+ * only the following stronger three sub-conditions, for which it is
664671 * much easier to maintain the needed state:
665672 * 1) all active queues have the same weight,
666
- * 2) there are no active groups.
673
+ * 2) all active queues belong to the same I/O-priority class,
674
+ * 3) there are no active groups.
667675 * In particular, the last condition is always true if hierarchical
668676 * support or the cgroups interface are not enabled, thus no state
669677 * needs to be maintained in this case.
670678 */
671
-static bool bfq_symmetric_scenario(struct bfq_data *bfqd)
679
+static bool bfq_asymmetric_scenario(struct bfq_data *bfqd,
680
+ struct bfq_queue *bfqq)
672681 {
673
- return !bfq_varied_queue_weights_or_active_groups(bfqd);
682
+ bool smallest_weight = bfqq &&
683
+ bfqq->weight_counter &&
684
+ bfqq->weight_counter ==
685
+ container_of(
686
+ rb_first_cached(&bfqd->queue_weights_tree),
687
+ struct bfq_weight_counter,
688
+ weights_node);
689
+
690
+ /*
691
+ * For queue weights to differ, queue_weights_tree must contain
692
+ * at least two nodes.
693
+ */
694
+ bool varied_queue_weights = !smallest_weight &&
695
+ !RB_EMPTY_ROOT(&bfqd->queue_weights_tree.rb_root) &&
696
+ (bfqd->queue_weights_tree.rb_root.rb_node->rb_left ||
697
+ bfqd->queue_weights_tree.rb_root.rb_node->rb_right);
698
+
699
+ bool multiple_classes_busy =
700
+ (bfqd->busy_queues[0] && bfqd->busy_queues[1]) ||
701
+ (bfqd->busy_queues[0] && bfqd->busy_queues[2]) ||
702
+ (bfqd->busy_queues[1] && bfqd->busy_queues[2]);
703
+
704
+ return varied_queue_weights || multiple_classes_busy
705
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
706
+ || bfqd->num_groups_with_pending_reqs > 0
707
+#endif
708
+ ;
674709 }
675710
676711 /*
....@@ -687,10 +722,11 @@
687722 * should be low too.
688723 */
689724 void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,
690
- struct rb_root *root)
725
+ struct rb_root_cached *root)
691726 {
692727 struct bfq_entity *entity = &bfqq->entity;
693
- struct rb_node **new = &(root->rb_node), *parent = NULL;
728
+ struct rb_node **new = &(root->rb_root.rb_node), *parent = NULL;
729
+ bool leftmost = true;
694730
695731 /*
696732 * Do not insert if the queue is already associated with a
....@@ -719,8 +755,10 @@
719755 }
720756 if (entity->weight < __counter->weight)
721757 new = &((*new)->rb_left);
722
- else
758
+ else {
723759 new = &((*new)->rb_right);
760
+ leftmost = false;
761
+ }
724762 }
725763
726764 bfqq->weight_counter = kzalloc(sizeof(struct bfq_weight_counter),
....@@ -729,22 +767,22 @@
729767 /*
730768 * In the unlucky event of an allocation failure, we just
731769 * exit. This will cause the weight of queue to not be
732
- * considered in bfq_varied_queue_weights_or_active_groups,
733
- * which, in its turn, causes the scenario to be deemed
734
- * wrongly symmetric in case bfqq's weight would have been
735
- * the only weight making the scenario asymmetric. On the
736
- * bright side, no unbalance will however occur when bfqq
737
- * becomes inactive again (the invocation of this function
738
- * is triggered by an activation of queue). In fact,
739
- * bfq_weights_tree_remove does nothing if
740
- * !bfqq->weight_counter.
770
+ * considered in bfq_asymmetric_scenario, which, in its turn,
771
+ * causes the scenario to be deemed wrongly symmetric in case
772
+ * bfqq's weight would have been the only weight making the
773
+ * scenario asymmetric. On the bright side, no unbalance will
774
+ * however occur when bfqq becomes inactive again (the
775
+ * invocation of this function is triggered by an activation
776
+ * of queue). In fact, bfq_weights_tree_remove does nothing
777
+ * if !bfqq->weight_counter.
741778 */
742779 if (unlikely(!bfqq->weight_counter))
743780 return;
744781
745782 bfqq->weight_counter->weight = entity->weight;
746783 rb_link_node(&bfqq->weight_counter->weights_node, parent, new);
747
- rb_insert_color(&bfqq->weight_counter->weights_node, root);
784
+ rb_insert_color_cached(&bfqq->weight_counter->weights_node, root,
785
+ leftmost);
748786
749787 inc_counter:
750788 bfqq->weight_counter->num_active++;
....@@ -759,7 +797,7 @@
759797 */
760798 void __bfq_weights_tree_remove(struct bfq_data *bfqd,
761799 struct bfq_queue *bfqq,
762
- struct rb_root *root)
800
+ struct rb_root_cached *root)
763801 {
764802 if (!bfqq->weight_counter)
765803 return;
....@@ -768,7 +806,7 @@
768806 if (bfqq->weight_counter->num_active > 0)
769807 goto reset_entity_pointer;
770808
771
- rb_erase(&bfqq->weight_counter->weights_node, root);
809
+ rb_erase_cached(&bfqq->weight_counter->weights_node, root);
772810 kfree(bfqq->weight_counter);
773811
774812 reset_entity_pointer:
....@@ -882,7 +920,8 @@
882920 static unsigned long bfq_serv_to_charge(struct request *rq,
883921 struct bfq_queue *bfqq)
884922 {
885
- if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1)
923
+ if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1 ||
924
+ bfq_asymmetric_scenario(bfqq->bfqd, bfqq))
886925 return blk_rq_sectors(rq);
887926
888927 return blk_rq_sectors(rq) * bfq_async_charge_factor;
....@@ -916,8 +955,10 @@
916955 */
917956 return;
918957
919
- new_budget = max_t(unsigned long, bfqq->max_budget,
920
- bfq_serv_to_charge(next_rq, bfqq));
958
+ new_budget = max_t(unsigned long,
959
+ max_t(unsigned long, bfqq->max_budget,
960
+ bfq_serv_to_charge(next_rq, bfqq)),
961
+ entity->service);
921962 if (entity->budget != new_budget) {
922963 entity->budget = new_budget;
923964 bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",
....@@ -946,7 +987,7 @@
946987 * of several files
947988 * mplayer took 23 seconds to start, if constantly weight-raised.
948989 *
949
- * As for higher values than that accomodating the above bad
990
+ * As for higher values than that accommodating the above bad
950991 * scenario, tests show that higher values would often yield
951992 * the opposite of the desired result, i.e., would worsen
952993 * responsiveness by allowing non-interactive applications to
....@@ -985,6 +1026,7 @@
9851026 else
9861027 bfq_clear_bfqq_IO_bound(bfqq);
9871028
1029
+ bfqq->entity.new_weight = bic->saved_weight;
9881030 bfqq->ttime = bic->saved_ttime;
9891031 bfqq->wr_coeff = bic->saved_wr_coeff;
9901032 bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt;
....@@ -1020,7 +1062,7 @@
10201062
10211063 static int bfqq_process_refs(struct bfq_queue *bfqq)
10221064 {
1023
- return bfqq->ref - bfqq->allocated - bfqq->entity.on_st -
1065
+ return bfqq->ref - bfqq->allocated - bfqq->entity.on_st_or_in_serv -
10241066 (bfqq->weight_counter != NULL);
10251067 }
10261068
....@@ -1032,8 +1074,18 @@
10321074
10331075 hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node)
10341076 hlist_del_init(&item->burst_list_node);
1035
- hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
1036
- bfqd->burst_size = 1;
1077
+
1078
+ /*
1079
+ * Start the creation of a new burst list only if there is no
1080
+ * active queue. See comments on the conditional invocation of
1081
+ * bfq_handle_burst().
1082
+ */
1083
+ if (bfq_tot_busy_queues(bfqd) == 0) {
1084
+ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
1085
+ bfqd->burst_size = 1;
1086
+ } else
1087
+ bfqd->burst_size = 0;
1088
+
10371089 bfqd->burst_parent_entity = bfqq->entity.parent;
10381090 }
10391091
....@@ -1089,7 +1141,8 @@
10891141 * many parallel threads/processes. Examples are systemd during boot,
10901142 * or git grep. To help these processes get their job done as soon as
10911143 * possible, it is usually better to not grant either weight-raising
1092
- * or device idling to their queues.
1144
+ * or device idling to their queues, unless these queues must be
1145
+ * protected from the I/O flowing through other active queues.
10931146 *
10941147 * In this comment we describe, firstly, the reasons why this fact
10951148 * holds, and, secondly, the next function, which implements the main
....@@ -1101,7 +1154,10 @@
11011154 * cumulatively served, the sooner the target job of these queues gets
11021155 * completed. As a consequence, weight-raising any of these queues,
11031156 * which also implies idling the device for it, is almost always
1104
- * counterproductive. In most cases it just lowers throughput.
1157
+ * counterproductive, unless there are other active queues to isolate
1158
+ * these new queues from. If there no other active queues, then
1159
+ * weight-raising these new queues just lowers throughput in most
1160
+ * cases.
11051161 *
11061162 * On the other hand, a burst of queue creations may be caused also by
11071163 * the start of an application that does not consist of a lot of
....@@ -1135,14 +1191,16 @@
11351191 * are very rare. They typically occur if some service happens to
11361192 * start doing I/O exactly when the interactive task starts.
11371193 *
1138
- * Turning back to the next function, it implements all the steps
1139
- * needed to detect the occurrence of a large burst and to properly
1140
- * mark all the queues belonging to it (so that they can then be
1141
- * treated in a different way). This goal is achieved by maintaining a
1142
- * "burst list" that holds, temporarily, the queues that belong to the
1143
- * burst in progress. The list is then used to mark these queues as
1144
- * belonging to a large burst if the burst does become large. The main
1145
- * steps are the following.
1194
+ * Turning back to the next function, it is invoked only if there are
1195
+ * no active queues (apart from active queues that would belong to the
1196
+ * same, possible burst bfqq would belong to), and it implements all
1197
+ * the steps needed to detect the occurrence of a large burst and to
1198
+ * properly mark all the queues belonging to it (so that they can then
1199
+ * be treated in a different way). This goal is achieved by
1200
+ * maintaining a "burst list" that holds, temporarily, the queues that
1201
+ * belong to the burst in progress. The list is then used to mark
1202
+ * these queues as belonging to a large burst if the burst does become
1203
+ * large. The main steps are the following.
11461204 *
11471205 * . when the very first queue is created, the queue is inserted into the
11481206 * list (as it could be the first queue in a possible burst)
....@@ -1376,21 +1434,31 @@
13761434 * mechanism may be re-designed in such a way to make it possible to
13771435 * know whether preemption is needed without needing to update service
13781436 * trees). In addition, queue preemptions almost always cause random
1379
- * I/O, and thus loss of throughput. Because of these facts, the next
1380
- * function adopts the following simple scheme to avoid both costly
1381
- * operations and too frequent preemptions: it requests the expiration
1382
- * of the in-service queue (unconditionally) only for queues that need
1383
- * to recover a hole, or that either are weight-raised or deserve to
1384
- * be weight-raised.
1437
+ * I/O, which may in turn cause loss of throughput. Finally, there may
1438
+ * even be no in-service queue when the next function is invoked (so,
1439
+ * no queue to compare timestamps with). Because of these facts, the
1440
+ * next function adopts the following simple scheme to avoid costly
1441
+ * operations, too frequent preemptions and too many dependencies on
1442
+ * the state of the scheduler: it requests the expiration of the
1443
+ * in-service queue (unconditionally) only for queues that need to
1444
+ * recover a hole. Then it delegates to other parts of the code the
1445
+ * responsibility of handling the above case 2.
13851446 */
13861447 static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
13871448 struct bfq_queue *bfqq,
1388
- bool arrived_in_time,
1389
- bool wr_or_deserves_wr)
1449
+ bool arrived_in_time)
13901450 {
13911451 struct bfq_entity *entity = &bfqq->entity;
13921452
1393
- if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) {
1453
+ /*
1454
+ * In the next compound condition, we check also whether there
1455
+ * is some budget left, because otherwise there is no point in
1456
+ * trying to go on serving bfqq with this same budget: bfqq
1457
+ * would be expired immediately after being selected for
1458
+ * service. This would only cause useless overhead.
1459
+ */
1460
+ if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time &&
1461
+ bfq_bfqq_budget_left(bfqq) > 0) {
13941462 /*
13951463 * We do not clear the flag non_blocking_wait_rq here, as
13961464 * the latter is used in bfq_activate_bfqq to signal
....@@ -1433,7 +1501,7 @@
14331501 entity->budget = max_t(unsigned long, bfqq->max_budget,
14341502 bfq_serv_to_charge(bfqq->next_rq, bfqq));
14351503 bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
1436
- return wr_or_deserves_wr;
1504
+ return false;
14371505 }
14381506
14391507 /*
....@@ -1551,6 +1619,36 @@
15511619 bfqd->bfq_wr_min_idle_time);
15521620 }
15531621
1622
+
1623
+/*
1624
+ * Return true if bfqq is in a higher priority class, or has a higher
1625
+ * weight than the in-service queue.
1626
+ */
1627
+static bool bfq_bfqq_higher_class_or_weight(struct bfq_queue *bfqq,
1628
+ struct bfq_queue *in_serv_bfqq)
1629
+{
1630
+ int bfqq_weight, in_serv_weight;
1631
+
1632
+ if (bfqq->ioprio_class < in_serv_bfqq->ioprio_class)
1633
+ return true;
1634
+
1635
+ if (in_serv_bfqq->entity.parent == bfqq->entity.parent) {
1636
+ bfqq_weight = bfqq->entity.weight;
1637
+ in_serv_weight = in_serv_bfqq->entity.weight;
1638
+ } else {
1639
+ if (bfqq->entity.parent)
1640
+ bfqq_weight = bfqq->entity.parent->weight;
1641
+ else
1642
+ bfqq_weight = bfqq->entity.weight;
1643
+ if (in_serv_bfqq->entity.parent)
1644
+ in_serv_weight = in_serv_bfqq->entity.parent->weight;
1645
+ else
1646
+ in_serv_weight = in_serv_bfqq->entity.weight;
1647
+ }
1648
+
1649
+ return bfqq_weight > in_serv_weight;
1650
+}
1651
+
15541652 static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
15551653 struct bfq_queue *bfqq,
15561654 int old_wr_coeff,
....@@ -1579,6 +1677,7 @@
15791677 */
15801678 in_burst = bfq_bfqq_in_large_burst(bfqq);
15811679 soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&
1680
+ !BFQQ_TOTALLY_SEEKY(bfqq) &&
15821681 !in_burst &&
15831682 time_is_before_jiffies(bfqq->soft_rt_next_start) &&
15841683 bfqq->dispatched == 0;
....@@ -1594,8 +1693,7 @@
15941693 */
15951694 bfqq_wants_to_preempt =
15961695 bfq_bfqq_update_budg_for_activation(bfqd, bfqq,
1597
- arrived_in_time,
1598
- wr_or_deserves_wr);
1696
+ arrived_in_time);
15991697
16001698 /*
16011699 * If bfqq happened to be activated in a burst, but has been
....@@ -1660,19 +1758,109 @@
16601758
16611759 /*
16621760 * Expire in-service queue only if preemption may be needed
1663
- * for guarantees. In this respect, the function
1664
- * next_queue_may_preempt just checks a simple, necessary
1665
- * condition, and not a sufficient condition based on
1666
- * timestamps. In fact, for the latter condition to be
1667
- * evaluated, timestamps would need first to be updated, and
1668
- * this operation is quite costly (see the comments on the
1669
- * function bfq_bfqq_update_budg_for_activation).
1761
+ * for guarantees. In particular, we care only about two
1762
+ * cases. The first is that bfqq has to recover a service
1763
+ * hole, as explained in the comments on
1764
+ * bfq_bfqq_update_budg_for_activation(), i.e., that
1765
+ * bfqq_wants_to_preempt is true. However, if bfqq does not
1766
+ * carry time-critical I/O, then bfqq's bandwidth is less
1767
+ * important than that of queues that carry time-critical I/O.
1768
+ * So, as a further constraint, we consider this case only if
1769
+ * bfqq is at least as weight-raised, i.e., at least as time
1770
+ * critical, as the in-service queue.
1771
+ *
1772
+ * The second case is that bfqq is in a higher priority class,
1773
+ * or has a higher weight than the in-service queue. If this
1774
+ * condition does not hold, we don't care because, even if
1775
+ * bfqq does not start to be served immediately, the resulting
1776
+ * delay for bfqq's I/O is however lower or much lower than
1777
+ * the ideal completion time to be guaranteed to bfqq's I/O.
1778
+ *
1779
+ * In both cases, preemption is needed only if, according to
1780
+ * the timestamps of both bfqq and of the in-service queue,
1781
+ * bfqq actually is the next queue to serve. So, to reduce
1782
+ * useless preemptions, the return value of
1783
+ * next_queue_may_preempt() is considered in the next compound
1784
+ * condition too. Yet next_queue_may_preempt() just checks a
1785
+ * simple, necessary condition for bfqq to be the next queue
1786
+ * to serve. In fact, to evaluate a sufficient condition, the
1787
+ * timestamps of the in-service queue would need to be
1788
+ * updated, and this operation is quite costly (see the
1789
+ * comments on bfq_bfqq_update_budg_for_activation()).
16701790 */
1671
- if (bfqd->in_service_queue && bfqq_wants_to_preempt &&
1672
- bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff &&
1791
+ if (bfqd->in_service_queue &&
1792
+ ((bfqq_wants_to_preempt &&
1793
+ bfqq->wr_coeff >= bfqd->in_service_queue->wr_coeff) ||
1794
+ bfq_bfqq_higher_class_or_weight(bfqq, bfqd->in_service_queue)) &&
16731795 next_queue_may_preempt(bfqd))
16741796 bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
16751797 false, BFQQE_PREEMPTED);
1798
+}
1799
+
1800
+static void bfq_reset_inject_limit(struct bfq_data *bfqd,
1801
+ struct bfq_queue *bfqq)
1802
+{
1803
+ /* invalidate baseline total service time */
1804
+ bfqq->last_serv_time_ns = 0;
1805
+
1806
+ /*
1807
+ * Reset pointer in case we are waiting for
1808
+ * some request completion.
1809
+ */
1810
+ bfqd->waited_rq = NULL;
1811
+
1812
+ /*
1813
+ * If bfqq has a short think time, then start by setting the
1814
+ * inject limit to 0 prudentially, because the service time of
1815
+ * an injected I/O request may be higher than the think time
1816
+ * of bfqq, and therefore, if one request was injected when
1817
+ * bfqq remains empty, this injected request might delay the
1818
+ * service of the next I/O request for bfqq significantly. In
1819
+ * case bfqq can actually tolerate some injection, then the
1820
+ * adaptive update will however raise the limit soon. This
1821
+ * lucky circumstance holds exactly because bfqq has a short
1822
+ * think time, and thus, after remaining empty, is likely to
1823
+ * get new I/O enqueued---and then completed---before being
1824
+ * expired. This is the very pattern that gives the
1825
+ * limit-update algorithm the chance to measure the effect of
1826
+ * injection on request service times, and then to update the
1827
+ * limit accordingly.
1828
+ *
1829
+ * However, in the following special case, the inject limit is
1830
+ * left to 1 even if the think time is short: bfqq's I/O is
1831
+ * synchronized with that of some other queue, i.e., bfqq may
1832
+ * receive new I/O only after the I/O of the other queue is
1833
+ * completed. Keeping the inject limit to 1 allows the
1834
+ * blocking I/O to be served while bfqq is in service. And
1835
+ * this is very convenient both for bfqq and for overall
1836
+ * throughput, as explained in detail in the comments in
1837
+ * bfq_update_has_short_ttime().
1838
+ *
1839
+ * On the opposite end, if bfqq has a long think time, then
1840
+ * start directly by 1, because:
1841
+ * a) on the bright side, keeping at most one request in
1842
+ * service in the drive is unlikely to cause any harm to the
1843
+ * latency of bfqq's requests, as the service time of a single
1844
+ * request is likely to be lower than the think time of bfqq;
1845
+ * b) on the downside, after becoming empty, bfqq is likely to
1846
+ * expire before getting its next request. With this request
1847
+ * arrival pattern, it is very hard to sample total service
1848
+ * times and update the inject limit accordingly (see comments
1849
+ * on bfq_update_inject_limit()). So the limit is likely to be
1850
+ * never, or at least seldom, updated. As a consequence, by
1851
+ * setting the limit to 1, we avoid that no injection ever
1852
+ * occurs with bfqq. On the downside, this proactive step
1853
+ * further reduces chances to actually compute the baseline
1854
+ * total service time. Thus it reduces chances to execute the
1855
+ * limit-update algorithm and possibly raise the limit to more
1856
+ * than 1.
1857
+ */
1858
+ if (bfq_bfqq_has_short_ttime(bfqq))
1859
+ bfqq->inject_limit = 0;
1860
+ else
1861
+ bfqq->inject_limit = 1;
1862
+
1863
+ bfqq->decrease_time_jif = jiffies;
16761864 }
16771865
16781866 static void bfq_add_request(struct request *rq)
....@@ -1687,6 +1875,180 @@
16871875 bfqq->queued[rq_is_sync(rq)]++;
16881876 bfqd->queued++;
16891877
1878
+ if (RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_sync(bfqq)) {
1879
+ /*
1880
+ * Detect whether bfqq's I/O seems synchronized with
1881
+ * that of some other queue, i.e., whether bfqq, after
1882
+ * remaining empty, happens to receive new I/O only
1883
+ * right after some I/O request of the other queue has
1884
+ * been completed. We call waker queue the other
1885
+ * queue, and we assume, for simplicity, that bfqq may
1886
+ * have at most one waker queue.
1887
+ *
1888
+ * A remarkable throughput boost can be reached by
1889
+ * unconditionally injecting the I/O of the waker
1890
+ * queue, every time a new bfq_dispatch_request
1891
+ * happens to be invoked while I/O is being plugged
1892
+ * for bfqq. In addition to boosting throughput, this
1893
+ * unblocks bfqq's I/O, thereby improving bandwidth
1894
+ * and latency for bfqq. Note that these same results
1895
+ * may be achieved with the general injection
1896
+ * mechanism, but less effectively. For details on
1897
+ * this aspect, see the comments on the choice of the
1898
+ * queue for injection in bfq_select_queue().
1899
+ *
1900
+ * Turning back to the detection of a waker queue, a
1901
+ * queue Q is deemed as a waker queue for bfqq if, for
1902
+ * two consecutive times, bfqq happens to become non
1903
+ * empty right after a request of Q has been
1904
+ * completed. In particular, on the first time, Q is
1905
+ * tentatively set as a candidate waker queue, while
1906
+ * on the second time, the flag
1907
+ * bfq_bfqq_has_waker(bfqq) is set to confirm that Q
1908
+ * is a waker queue for bfqq. These detection steps
1909
+ * are performed only if bfqq has a long think time,
1910
+ * so as to make it more likely that bfqq's I/O is
1911
+ * actually being blocked by a synchronization. This
1912
+ * last filter, plus the above two-times requirement,
1913
+ * make false positives less likely.
1914
+ *
1915
+ * NOTE
1916
+ *
1917
+ * The sooner a waker queue is detected, the sooner
1918
+ * throughput can be boosted by injecting I/O from the
1919
+ * waker queue. Fortunately, detection is likely to be
1920
+ * actually fast, for the following reasons. While
1921
+ * blocked by synchronization, bfqq has a long think
1922
+ * time. This implies that bfqq's inject limit is at
1923
+ * least equal to 1 (see the comments in
1924
+ * bfq_update_inject_limit()). So, thanks to
1925
+ * injection, the waker queue is likely to be served
1926
+ * during the very first I/O-plugging time interval
1927
+ * for bfqq. This triggers the first step of the
1928
+ * detection mechanism. Thanks again to injection, the
1929
+ * candidate waker queue is then likely to be
1930
+ * confirmed no later than during the next
1931
+ * I/O-plugging interval for bfqq.
1932
+ */
1933
+ if (bfqd->last_completed_rq_bfqq &&
1934
+ !bfq_bfqq_has_short_ttime(bfqq) &&
1935
+ ktime_get_ns() - bfqd->last_completion <
1936
+ 200 * NSEC_PER_USEC) {
1937
+ if (bfqd->last_completed_rq_bfqq != bfqq &&
1938
+ bfqd->last_completed_rq_bfqq !=
1939
+ bfqq->waker_bfqq) {
1940
+ /*
1941
+ * First synchronization detected with
1942
+ * a candidate waker queue, or with a
1943
+ * different candidate waker queue
1944
+ * from the current one.
1945
+ */
1946
+ bfqq->waker_bfqq = bfqd->last_completed_rq_bfqq;
1947
+
1948
+ /*
1949
+ * If the waker queue disappears, then
1950
+ * bfqq->waker_bfqq must be reset. To
1951
+ * this goal, we maintain in each
1952
+ * waker queue a list, woken_list, of
1953
+ * all the queues that reference the
1954
+ * waker queue through their
1955
+ * waker_bfqq pointer. When the waker
1956
+ * queue exits, the waker_bfqq pointer
1957
+ * of all the queues in the woken_list
1958
+ * is reset.
1959
+ *
1960
+ * In addition, if bfqq is already in
1961
+ * the woken_list of a waker queue,
1962
+ * then, before being inserted into
1963
+ * the woken_list of a new waker
1964
+ * queue, bfqq must be removed from
1965
+ * the woken_list of the old waker
1966
+ * queue.
1967
+ */
1968
+ if (!hlist_unhashed(&bfqq->woken_list_node))
1969
+ hlist_del_init(&bfqq->woken_list_node);
1970
+ hlist_add_head(&bfqq->woken_list_node,
1971
+ &bfqd->last_completed_rq_bfqq->woken_list);
1972
+
1973
+ bfq_clear_bfqq_has_waker(bfqq);
1974
+ } else if (bfqd->last_completed_rq_bfqq ==
1975
+ bfqq->waker_bfqq &&
1976
+ !bfq_bfqq_has_waker(bfqq)) {
1977
+ /*
1978
+ * synchronization with waker_bfqq
1979
+ * seen for the second time
1980
+ */
1981
+ bfq_mark_bfqq_has_waker(bfqq);
1982
+ }
1983
+ }
1984
+
1985
+ /*
1986
+ * Periodically reset inject limit, to make sure that
1987
+ * the latter eventually drops in case workload
1988
+ * changes, see step (3) in the comments on
1989
+ * bfq_update_inject_limit().
1990
+ */
1991
+ if (time_is_before_eq_jiffies(bfqq->decrease_time_jif +
1992
+ msecs_to_jiffies(1000)))
1993
+ bfq_reset_inject_limit(bfqd, bfqq);
1994
+
1995
+ /*
1996
+ * The following conditions must hold to setup a new
1997
+ * sampling of total service time, and then a new
1998
+ * update of the inject limit:
1999
+ * - bfqq is in service, because the total service
2000
+ * time is evaluated only for the I/O requests of
2001
+ * the queues in service;
2002
+ * - this is the right occasion to compute or to
2003
+ * lower the baseline total service time, because
2004
+ * there are actually no requests in the drive,
2005
+ * or
2006
+ * the baseline total service time is available, and
2007
+ * this is the right occasion to compute the other
2008
+ * quantity needed to update the inject limit, i.e.,
2009
+ * the total service time caused by the amount of
2010
+ * injection allowed by the current value of the
2011
+ * limit. It is the right occasion because injection
2012
+ * has actually been performed during the service
2013
+ * hole, and there are still in-flight requests,
2014
+ * which are very likely to be exactly the injected
2015
+ * requests, or part of them;
2016
+ * - the minimum interval for sampling the total
2017
+ * service time and updating the inject limit has
2018
+ * elapsed.
2019
+ */
2020
+ if (bfqq == bfqd->in_service_queue &&
2021
+ (bfqd->rq_in_driver == 0 ||
2022
+ (bfqq->last_serv_time_ns > 0 &&
2023
+ bfqd->rqs_injected && bfqd->rq_in_driver > 0)) &&
2024
+ time_is_before_eq_jiffies(bfqq->decrease_time_jif +
2025
+ msecs_to_jiffies(10))) {
2026
+ bfqd->last_empty_occupied_ns = ktime_get_ns();
2027
+ /*
2028
+ * Start the state machine for measuring the
2029
+ * total service time of rq: setting
2030
+ * wait_dispatch will cause bfqd->waited_rq to
2031
+ * be set when rq will be dispatched.
2032
+ */
2033
+ bfqd->wait_dispatch = true;
2034
+ /*
2035
+ * If there is no I/O in service in the drive,
2036
+ * then possible injection occurred before the
2037
+ * arrival of rq will not affect the total
2038
+ * service time of rq. So the injection limit
2039
+ * must not be updated as a function of such
2040
+ * total service time, unless new injection
2041
+ * occurs before rq is completed. To have the
2042
+ * injection limit updated only in the latter
2043
+ * case, reset rqs_injected here (rqs_injected
2044
+ * will be set in case injection is performed
2045
+ * on bfqq before rq is completed).
2046
+ */
2047
+ if (bfqd->rq_in_driver == 0)
2048
+ bfqd->rqs_injected = false;
2049
+ }
2050
+ }
2051
+
16902052 elv_rb_add(&bfqq->sort_list, rq);
16912053
16922054 /*
....@@ -1698,8 +2060,9 @@
16982060
16992061 /*
17002062 * Adjust priority tree position, if next_rq changes.
2063
+ * See comments on bfq_pos_tree_add_move() for the unlikely().
17012064 */
1702
- if (prev != bfqq->next_rq)
2065
+ if (unlikely(!bfqd->nonrot_with_queueing && prev != bfqq->next_rq))
17032066 bfq_pos_tree_add_move(bfqd, bfqq);
17042067
17052068 if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... */
....@@ -1839,7 +2202,9 @@
18392202 bfqq->pos_root = NULL;
18402203 }
18412204 } else {
1842
- bfq_pos_tree_add_move(bfqd, bfqq);
2205
+ /* see comments on bfq_pos_tree_add_move() for the unlikely() */
2206
+ if (unlikely(!bfqd->nonrot_with_queueing))
2207
+ bfq_pos_tree_add_move(bfqd, bfqq);
18432208 }
18442209
18452210 if (rq->cmd_flags & REQ_META)
....@@ -1847,9 +2212,9 @@
18472212
18482213 }
18492214
1850
-static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
2215
+static bool bfq_bio_merge(struct request_queue *q, struct bio *bio,
2216
+ unsigned int nr_segs)
18512217 {
1852
- struct request_queue *q = hctx->queue;
18532218 struct bfq_data *bfqd = q->elevator->elevator_data;
18542219 struct request *free = NULL;
18552220 /*
....@@ -1864,13 +2229,20 @@
18642229
18652230 spin_lock_irq(&bfqd->lock);
18662231
1867
- if (bic)
2232
+ if (bic) {
2233
+ /*
2234
+ * Make sure cgroup info is uptodate for current process before
2235
+ * considering the merge.
2236
+ */
2237
+ bfq_bic_update_cgroup(bic, bio);
2238
+
18682239 bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf));
1869
- else
2240
+ } else {
18702241 bfqd->bio_bfqq = NULL;
2242
+ }
18712243 bfqd->bio_bic = bic;
18722244
1873
- ret = blk_mq_sched_try_merge(q, bio, &free);
2245
+ ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free);
18742246
18752247 if (free)
18762248 blk_mq_free_request(free);
....@@ -1888,13 +2260,14 @@
18882260 __rq = bfq_find_rq_fmerge(bfqd, bio, q);
18892261 if (__rq && elv_bio_merge_ok(__rq, bio)) {
18902262 *req = __rq;
2263
+
2264
+ if (blk_discard_mergable(__rq))
2265
+ return ELEVATOR_DISCARD_MERGE;
18912266 return ELEVATOR_FRONT_MERGE;
18922267 }
18932268
18942269 return ELEVATOR_NO_MERGE;
18952270 }
1896
-
1897
-static struct bfq_queue *bfq_init_rq(struct request *rq);
18982271
18992272 static void bfq_request_merged(struct request_queue *q, struct request *req,
19002273 enum elv_merge type)
....@@ -1904,7 +2277,7 @@
19042277 blk_rq_pos(req) <
19052278 blk_rq_pos(container_of(rb_prev(&req->rb_node),
19062279 struct request, rb_node))) {
1907
- struct bfq_queue *bfqq = bfq_init_rq(req);
2280
+ struct bfq_queue *bfqq = RQ_BFQQ(req);
19082281 struct bfq_data *bfqd;
19092282 struct request *prev, *next_rq;
19102283
....@@ -1929,7 +2302,12 @@
19292302 */
19302303 if (prev != bfqq->next_rq) {
19312304 bfq_updated_next_req(bfqd, bfqq);
1932
- bfq_pos_tree_add_move(bfqd, bfqq);
2305
+ /*
2306
+ * See comments on bfq_pos_tree_add_move() for
2307
+ * the unlikely().
2308
+ */
2309
+ if (unlikely(!bfqd->nonrot_with_queueing))
2310
+ bfq_pos_tree_add_move(bfqd, bfqq);
19332311 }
19342312 }
19352313 }
....@@ -1951,8 +2329,8 @@
19512329 static void bfq_requests_merged(struct request_queue *q, struct request *rq,
19522330 struct request *next)
19532331 {
1954
- struct bfq_queue *bfqq = bfq_init_rq(rq),
1955
- *next_bfqq = bfq_init_rq(next);
2332
+ struct bfq_queue *bfqq = RQ_BFQQ(rq),
2333
+ *next_bfqq = RQ_BFQQ(next);
19562334
19572335 if (!bfqq)
19582336 return;
....@@ -2131,6 +2509,14 @@
21312509 if (process_refs == 0 || new_process_refs == 0)
21322510 return NULL;
21332511
2512
+ /*
2513
+ * Make sure merged queues belong to the same parent. Parents could
2514
+ * have changed since the time we decided the two queues are suitable
2515
+ * for merging.
2516
+ */
2517
+ if (new_bfqq->entity.parent != bfqq->entity.parent)
2518
+ return NULL;
2519
+
21342520 bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
21352521 new_bfqq->pid);
21362522
....@@ -2155,6 +2541,15 @@
21552541 * are likely to increase the throughput.
21562542 */
21572543 bfqq->new_bfqq = new_bfqq;
2544
+ /*
2545
+ * The above assignment schedules the following redirections:
2546
+ * each time some I/O for bfqq arrives, the process that
2547
+ * generated that I/O is disassociated from bfqq and
2548
+ * associated with new_bfqq. Here we increases new_bfqq->ref
2549
+ * in advance, adding the number of processes that are
2550
+ * expected to be associated with new_bfqq as they happen to
2551
+ * issue I/O.
2552
+ */
21582553 new_bfqq->ref += process_refs;
21592554 return new_bfqq;
21602555 }
....@@ -2214,6 +2609,50 @@
22142609 {
22152610 struct bfq_queue *in_service_bfqq, *new_bfqq;
22162611
2612
+ /* if a merge has already been setup, then proceed with that first */
2613
+ if (bfqq->new_bfqq)
2614
+ return bfqq->new_bfqq;
2615
+
2616
+ /*
2617
+ * Do not perform queue merging if the device is non
2618
+ * rotational and performs internal queueing. In fact, such a
2619
+ * device reaches a high speed through internal parallelism
2620
+ * and pipelining. This means that, to reach a high
2621
+ * throughput, it must have many requests enqueued at the same
2622
+ * time. But, in this configuration, the internal scheduling
2623
+ * algorithm of the device does exactly the job of queue
2624
+ * merging: it reorders requests so as to obtain as much as
2625
+ * possible a sequential I/O pattern. As a consequence, with
2626
+ * the workload generated by processes doing interleaved I/O,
2627
+ * the throughput reached by the device is likely to be the
2628
+ * same, with and without queue merging.
2629
+ *
2630
+ * Disabling merging also provides a remarkable benefit in
2631
+ * terms of throughput. Merging tends to make many workloads
2632
+ * artificially more uneven, because of shared queues
2633
+ * remaining non empty for incomparably more time than
2634
+ * non-merged queues. This may accentuate workload
2635
+ * asymmetries. For example, if one of the queues in a set of
2636
+ * merged queues has a higher weight than a normal queue, then
2637
+ * the shared queue may inherit such a high weight and, by
2638
+ * staying almost always active, may force BFQ to perform I/O
2639
+ * plugging most of the time. This evidently makes it harder
2640
+ * for BFQ to let the device reach a high throughput.
2641
+ *
2642
+ * Finally, the likely() macro below is not used because one
2643
+ * of the two branches is more likely than the other, but to
2644
+ * have the code path after the following if() executed as
2645
+ * fast as possible for the case of a non rotational device
2646
+ * with queueing. We want it because this is the fastest kind
2647
+ * of device. On the opposite end, the likely() may lengthen
2648
+ * the execution time of BFQ for the case of slower devices
2649
+ * (rotational or at least without queueing). But in this case
2650
+ * the execution time of BFQ matters very little, if not at
2651
+ * all.
2652
+ */
2653
+ if (likely(bfqd->nonrot_with_queueing))
2654
+ return NULL;
2655
+
22172656 /*
22182657 * Prevent bfqq from being merged if it has been created too
22192658 * long ago. The idea is that true cooperating processes, and
....@@ -2228,14 +2667,11 @@
22282667 if (bfq_too_late_for_merging(bfqq))
22292668 return NULL;
22302669
2231
- if (bfqq->new_bfqq)
2232
- return bfqq->new_bfqq;
2233
-
22342670 if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq))
22352671 return NULL;
22362672
22372673 /* If there is only one backlogged queue, don't search. */
2238
- if (bfqd->busy_queues == 1)
2674
+ if (bfq_tot_busy_queues(bfqd) == 1)
22392675 return NULL;
22402676
22412677 in_service_bfqq = bfqd->in_service_queue;
....@@ -2277,6 +2713,7 @@
22772713 if (!bic)
22782714 return;
22792715
2716
+ bic->saved_weight = bfqq->entity.orig_weight;
22802717 bic->saved_ttime = bfqq->ttime;
22812718 bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq);
22822719 bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);
....@@ -2295,6 +2732,7 @@
22952732 * to enjoy weight raising if split soon.
22962733 */
22972734 bic->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff;
2735
+ bic->saved_wr_start_at_switch_to_srt = bfq_smallest_from_now();
22982736 bic->saved_wr_cur_max_time = bfq_wr_duration(bfqq->bfqd);
22992737 bic->saved_last_wr_start_finish = jiffies;
23002738 } else {
....@@ -2304,6 +2742,26 @@
23042742 bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish;
23052743 bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time;
23062744 }
2745
+}
2746
+
2747
+void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq)
2748
+{
2749
+ /*
2750
+ * To prevent bfqq's service guarantees from being violated,
2751
+ * bfqq may be left busy, i.e., queued for service, even if
2752
+ * empty (see comments in __bfq_bfqq_expire() for
2753
+ * details). But, if no process will send requests to bfqq any
2754
+ * longer, then there is no point in keeping bfqq queued for
2755
+ * service. In addition, keeping bfqq queued for service, but
2756
+ * with no process ref any longer, may have caused bfqq to be
2757
+ * freed when dequeued from service. But this is assumed to
2758
+ * never happen.
2759
+ */
2760
+ if (bfq_bfqq_busy(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list) &&
2761
+ bfqq != bfqd->in_service_queue)
2762
+ bfq_del_bfqq_busy(bfqd, bfqq, false);
2763
+
2764
+ bfq_put_queue(bfqq);
23072765 }
23082766
23092767 static void
....@@ -2365,9 +2823,18 @@
23652823 * assignment causes no harm).
23662824 */
23672825 new_bfqq->bic = NULL;
2826
+ /*
2827
+ * If the queue is shared, the pid is the pid of one of the associated
2828
+ * processes. Which pid depends on the exact sequence of merge events
2829
+ * the queue underwent. So printing such a pid is useless and confusing
2830
+ * because it reports a random pid between those of the associated
2831
+ * processes.
2832
+ * We mark such a queue with a pid -1, and then print SHARED instead of
2833
+ * a pid in logging messages.
2834
+ */
2835
+ new_bfqq->pid = -1;
23682836 bfqq->bic = NULL;
2369
- /* release process reference to bfqq */
2370
- bfq_put_queue(bfqq);
2837
+ bfq_release_process_ref(bfqd, bfqq);
23712838 }
23722839
23732840 static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq,
....@@ -2399,8 +2866,8 @@
23992866 /*
24002867 * bic still points to bfqq, then it has not yet been
24012868 * redirected to some other bfq_queue, and a queue
2402
- * merge beween bfqq and new_bfqq can be safely
2403
- * fulfillled, i.e., bic can be redirected to new_bfqq
2869
+ * merge between bfqq and new_bfqq can be safely
2870
+ * fulfilled, i.e., bic can be redirected to new_bfqq
24042871 * and bfqq can be put.
24052872 */
24062873 bfq_merge_bfqqs(bfqd, bfqd->bio_bic, bfqq,
....@@ -2535,12 +3002,14 @@
25353002 * queue).
25363003 */
25373004 if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 &&
2538
- bfq_symmetric_scenario(bfqd))
3005
+ !bfq_asymmetric_scenario(bfqd, bfqq))
25393006 sl = min_t(u64, sl, BFQ_MIN_TT);
25403007 else if (bfqq->wr_coeff > 1)
25413008 sl = max_t(u32, sl, 20ULL * NSEC_PER_MSEC);
25423009
25433010 bfqd->last_idling_start = ktime_get();
3011
+ bfqd->last_idling_start_jiffies = jiffies;
3012
+
25443013 hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl),
25453014 HRTIMER_MODE_REL);
25463015 bfqg_stats_set_start_idle_time(bfqq_group(bfqq));
....@@ -2764,7 +3233,7 @@
27643233
27653234 if ((bfqd->rq_in_driver > 0 ||
27663235 now_ns - bfqd->last_completion < BFQ_MIN_TT)
2767
- && get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR)
3236
+ && !BFQ_RQ_SEEKY(bfqd, bfqd->last_position, rq))
27683237 bfqd->sequential_samples++;
27693238
27703239 bfqd->tot_sectors_dispatched += blk_rq_sectors(rq);
....@@ -2816,7 +3285,209 @@
28163285 bfq_remove_request(q, rq);
28173286 }
28183287
2819
-static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
3288
+/*
3289
+ * There is a case where idling does not have to be performed for
3290
+ * throughput concerns, but to preserve the throughput share of
3291
+ * the process associated with bfqq.
3292
+ *
3293
+ * To introduce this case, we can note that allowing the drive
3294
+ * to enqueue more than one request at a time, and hence
3295
+ * delegating de facto final scheduling decisions to the
3296
+ * drive's internal scheduler, entails loss of control on the
3297
+ * actual request service order. In particular, the critical
3298
+ * situation is when requests from different processes happen
3299
+ * to be present, at the same time, in the internal queue(s)
3300
+ * of the drive. In such a situation, the drive, by deciding
3301
+ * the service order of the internally-queued requests, does
3302
+ * determine also the actual throughput distribution among
3303
+ * these processes. But the drive typically has no notion or
3304
+ * concern about per-process throughput distribution, and
3305
+ * makes its decisions only on a per-request basis. Therefore,
3306
+ * the service distribution enforced by the drive's internal
3307
+ * scheduler is likely to coincide with the desired throughput
3308
+ * distribution only in a completely symmetric, or favorably
3309
+ * skewed scenario where:
3310
+ * (i-a) each of these processes must get the same throughput as
3311
+ * the others,
3312
+ * (i-b) in case (i-a) does not hold, it holds that the process
3313
+ * associated with bfqq must receive a lower or equal
3314
+ * throughput than any of the other processes;
3315
+ * (ii) the I/O of each process has the same properties, in
3316
+ * terms of locality (sequential or random), direction
3317
+ * (reads or writes), request sizes, greediness
3318
+ * (from I/O-bound to sporadic), and so on;
3319
+
3320
+ * In fact, in such a scenario, the drive tends to treat the requests
3321
+ * of each process in about the same way as the requests of the
3322
+ * others, and thus to provide each of these processes with about the
3323
+ * same throughput. This is exactly the desired throughput
3324
+ * distribution if (i-a) holds, or, if (i-b) holds instead, this is an
3325
+ * even more convenient distribution for (the process associated with)
3326
+ * bfqq.
3327
+ *
3328
+ * In contrast, in any asymmetric or unfavorable scenario, device
3329
+ * idling (I/O-dispatch plugging) is certainly needed to guarantee
3330
+ * that bfqq receives its assigned fraction of the device throughput
3331
+ * (see [1] for details).
3332
+ *
3333
+ * The problem is that idling may significantly reduce throughput with
3334
+ * certain combinations of types of I/O and devices. An important
3335
+ * example is sync random I/O on flash storage with command
3336
+ * queueing. So, unless bfqq falls in cases where idling also boosts
3337
+ * throughput, it is important to check conditions (i-a), i(-b) and
3338
+ * (ii) accurately, so as to avoid idling when not strictly needed for
3339
+ * service guarantees.
3340
+ *
3341
+ * Unfortunately, it is extremely difficult to thoroughly check
3342
+ * condition (ii). And, in case there are active groups, it becomes
3343
+ * very difficult to check conditions (i-a) and (i-b) too. In fact,
3344
+ * if there are active groups, then, for conditions (i-a) or (i-b) to
3345
+ * become false 'indirectly', it is enough that an active group
3346
+ * contains more active processes or sub-groups than some other active
3347
+ * group. More precisely, for conditions (i-a) or (i-b) to become
3348
+ * false because of such a group, it is not even necessary that the
3349
+ * group is (still) active: it is sufficient that, even if the group
3350
+ * has become inactive, some of its descendant processes still have
3351
+ * some request already dispatched but still waiting for
3352
+ * completion. In fact, requests have still to be guaranteed their
3353
+ * share of the throughput even after being dispatched. In this
3354
+ * respect, it is easy to show that, if a group frequently becomes
3355
+ * inactive while still having in-flight requests, and if, when this
3356
+ * happens, the group is not considered in the calculation of whether
3357
+ * the scenario is asymmetric, then the group may fail to be
3358
+ * guaranteed its fair share of the throughput (basically because
3359
+ * idling may not be performed for the descendant processes of the
3360
+ * group, but it had to be). We address this issue with the following
3361
+ * bi-modal behavior, implemented in the function
3362
+ * bfq_asymmetric_scenario().
3363
+ *
3364
+ * If there are groups with requests waiting for completion
3365
+ * (as commented above, some of these groups may even be
3366
+ * already inactive), then the scenario is tagged as
3367
+ * asymmetric, conservatively, without checking any of the
3368
+ * conditions (i-a), (i-b) or (ii). So the device is idled for bfqq.
3369
+ * This behavior matches also the fact that groups are created
3370
+ * exactly if controlling I/O is a primary concern (to
3371
+ * preserve bandwidth and latency guarantees).
3372
+ *
3373
+ * On the opposite end, if there are no groups with requests waiting
3374
+ * for completion, then only conditions (i-a) and (i-b) are actually
3375
+ * controlled, i.e., provided that conditions (i-a) or (i-b) holds,
3376
+ * idling is not performed, regardless of whether condition (ii)
3377
+ * holds. In other words, only if conditions (i-a) and (i-b) do not
3378
+ * hold, then idling is allowed, and the device tends to be prevented
3379
+ * from queueing many requests, possibly of several processes. Since
3380
+ * there are no groups with requests waiting for completion, then, to
3381
+ * control conditions (i-a) and (i-b) it is enough to check just
3382
+ * whether all the queues with requests waiting for completion also
3383
+ * have the same weight.
3384
+ *
3385
+ * Not checking condition (ii) evidently exposes bfqq to the
3386
+ * risk of getting less throughput than its fair share.
3387
+ * However, for queues with the same weight, a further
3388
+ * mechanism, preemption, mitigates or even eliminates this
3389
+ * problem. And it does so without consequences on overall
3390
+ * throughput. This mechanism and its benefits are explained
3391
+ * in the next three paragraphs.
3392
+ *
3393
+ * Even if a queue, say Q, is expired when it remains idle, Q
3394
+ * can still preempt the new in-service queue if the next
3395
+ * request of Q arrives soon (see the comments on
3396
+ * bfq_bfqq_update_budg_for_activation). If all queues and
3397
+ * groups have the same weight, this form of preemption,
3398
+ * combined with the hole-recovery heuristic described in the
3399
+ * comments on function bfq_bfqq_update_budg_for_activation,
3400
+ * are enough to preserve a correct bandwidth distribution in
3401
+ * the mid term, even without idling. In fact, even if not
3402
+ * idling allows the internal queues of the device to contain
3403
+ * many requests, and thus to reorder requests, we can rather
3404
+ * safely assume that the internal scheduler still preserves a
3405
+ * minimum of mid-term fairness.
3406
+ *
3407
+ * More precisely, this preemption-based, idleless approach
3408
+ * provides fairness in terms of IOPS, and not sectors per
3409
+ * second. This can be seen with a simple example. Suppose
3410
+ * that there are two queues with the same weight, but that
3411
+ * the first queue receives requests of 8 sectors, while the
3412
+ * second queue receives requests of 1024 sectors. In
3413
+ * addition, suppose that each of the two queues contains at
3414
+ * most one request at a time, which implies that each queue
3415
+ * always remains idle after it is served. Finally, after
3416
+ * remaining idle, each queue receives very quickly a new
3417
+ * request. It follows that the two queues are served
3418
+ * alternatively, preempting each other if needed. This
3419
+ * implies that, although both queues have the same weight,
3420
+ * the queue with large requests receives a service that is
3421
+ * 1024/8 times as high as the service received by the other
3422
+ * queue.
3423
+ *
3424
+ * The motivation for using preemption instead of idling (for
3425
+ * queues with the same weight) is that, by not idling,
3426
+ * service guarantees are preserved (completely or at least in
3427
+ * part) without sacrificing throughput at all. And, if
3428
+ * there is no active group, then the primary expectation for
3429
+ * this device is probably a high throughput.
3430
+ *
3431
+ * We are now left only with explaining the two sub-conditions in the
3432
+ * additional compound condition that is checked below for deciding
3433
+ * whether the scenario is asymmetric. To explain the first
3434
+ * sub-condition, we need to add that the function
3435
+ * bfq_asymmetric_scenario checks the weights of only
3436
+ * non-weight-raised queues, for efficiency reasons (see comments on
3437
+ * bfq_weights_tree_add()). Then the fact that bfqq is weight-raised
3438
+ * is checked explicitly here. More precisely, the compound condition
3439
+ * below takes into account also the fact that, even if bfqq is being
3440
+ * weight-raised, the scenario is still symmetric if all queues with
3441
+ * requests waiting for completion happen to be
3442
+ * weight-raised. Actually, we should be even more precise here, and
3443
+ * differentiate between interactive weight raising and soft real-time
3444
+ * weight raising.
3445
+ *
3446
+ * The second sub-condition checked in the compound condition is
3447
+ * whether there is a fair amount of already in-flight I/O not
3448
+ * belonging to bfqq. If so, I/O dispatching is to be plugged, for the
3449
+ * following reason. The drive may decide to serve in-flight
3450
+ * non-bfqq's I/O requests before bfqq's ones, thereby delaying the
3451
+ * arrival of new I/O requests for bfqq (recall that bfqq is sync). If
3452
+ * I/O-dispatching is not plugged, then, while bfqq remains empty, a
3453
+ * basically uncontrolled amount of I/O from other queues may be
3454
+ * dispatched too, possibly causing the service of bfqq's I/O to be
3455
+ * delayed even longer in the drive. This problem gets more and more
3456
+ * serious as the speed and the queue depth of the drive grow,
3457
+ * because, as these two quantities grow, the probability of finding no
3458
+ * queue busy but many requests in flight grows too. By contrast,
3459
+ * plugging I/O dispatching minimizes the delay induced by already
3460
+ * in-flight I/O, and enables bfqq to recover the bandwidth it may
3461
+ * lose because of this delay.
3462
+ *
3463
+ * As a side note, it is worth considering that the above
3464
+ * device-idling countermeasures may however fail in the following
3465
+ * unlucky scenario: if I/O-dispatch plugging is (correctly) disabled
3466
+ * in a time period during which all symmetry sub-conditions hold, and
3467
+ * therefore the device is allowed to enqueue many requests, but at
3468
+ * some later point in time some sub-condition ceases to hold, then it
3469
+ * may become impossible to make requests be served in the desired
3470
+ * order until all the requests already queued in the device have been
3471
+ * served. The last sub-condition commented above somewhat mitigates
3472
+ * this problem for weight-raised queues.
3473
+ */
3474
+static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd,
3475
+ struct bfq_queue *bfqq)
3476
+{
3477
+ /* No point in idling for bfqq if it won't get requests any longer */
3478
+ if (unlikely(!bfqq_process_refs(bfqq)))
3479
+ return false;
3480
+
3481
+ return (bfqq->wr_coeff > 1 &&
3482
+ (bfqd->wr_busy_queues <
3483
+ bfq_tot_busy_queues(bfqd) ||
3484
+ bfqd->rq_in_driver >=
3485
+ bfqq->dispatched + 4)) ||
3486
+ bfq_asymmetric_scenario(bfqd, bfqq);
3487
+}
3488
+
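/*
 * A minimal userspace sketch of the IOPS-fairness example discussed in
 * the comment above: two equal-weight queues, one issuing 8-sector and
 * one issuing 1024-sector requests, each keeping at most one request
 * pending, are served alternately and thus get the same number of
 * requests per round; the sector-bandwidth ratio then equals the
 * request-size ratio. All names and numbers here are illustrative only.
 */
#include <stdio.h>

int main(void)
{
	const unsigned long long small_rq = 8;    /* sectors per request, queue A */
	const unsigned long long large_rq = 1024; /* sectors per request, queue B */
	const unsigned long long rounds = 1000;   /* alternating service rounds */

	/* One request per queue per round: identical IOPS for A and B. */
	unsigned long long sectors_a = rounds * small_rq;
	unsigned long long sectors_b = rounds * large_rq;

	printf("queue A: %llu sectors, queue B: %llu sectors\n",
	       sectors_a, sectors_b);
	printf("sector-bandwidth ratio B/A: %llu\n", sectors_b / sectors_a);
	return 0;
}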
3489
+static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq,
3490
+ enum bfqq_expiration reason)
28203491 {
28213492 /*
28223493 * If this bfqq is shared between multiple processes, check
....@@ -2827,7 +3498,22 @@
28273498 if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
28283499 bfq_mark_bfqq_split_coop(bfqq);
28293500
2830
- if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
3501
+ /*
3502
+ * Consider queues with a higher finish virtual time than
3503
+ * bfqq. If idling_needed_for_service_guarantees(bfqq) returns
3504
+ * true, then bfqq's bandwidth would be violated if an
3505
+ * uncontrolled amount of I/O from these queues were
3506
+ * dispatched while bfqq is waiting for its new I/O to
3507
+ * arrive. This is exactly what may happen if this is a forced
3508
+ * expiration caused by a preemption attempt, and if bfqq is
3509
+ * not re-scheduled. To prevent this from happening, re-queue
3510
+ * bfqq if it needs I/O-dispatch plugging, even if it is
3511
+ * empty. By doing so, bfqq is guaranteed to be served before the
3512
+ * above queues (provided that bfqq is of course eligible).
3513
+ */
3514
+ if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
3515
+ !(reason == BFQQE_PREEMPTED &&
3516
+ idling_needed_for_service_guarantees(bfqd, bfqq))) {
28313517 if (bfqq->dispatched == 0)
28323518 /*
28333519 * Overloading budget_timeout field to store
....@@ -2842,8 +3528,11 @@
28423528 bfq_requeue_bfqq(bfqd, bfqq, true);
28433529 /*
28443530 * Resort priority tree of potential close cooperators.
3531
+ * See comments on bfq_pos_tree_add_move() for the unlikely().
28453532 */
2846
- bfq_pos_tree_add_move(bfqd, bfqq);
3533
+ if (unlikely(!bfqd->nonrot_with_queueing &&
3534
+ !RB_EMPTY_ROOT(&bfqq->sort_list)))
3535
+ bfq_pos_tree_add_move(bfqd, bfqq);
28473536 }
28483537
28493538 /*
....@@ -3217,13 +3906,6 @@
32173906 jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4);
32183907 }
32193908
3220
-static bool bfq_bfqq_injectable(struct bfq_queue *bfqq)
3221
-{
3222
- return BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 &&
3223
- blk_queue_nonrot(bfqq->bfqd->queue) &&
3224
- bfqq->bfqd->hw_tag;
3225
-}
3226
-
32273909 /**
32283910 * bfq_bfqq_expire - expire a queue.
32293911 * @bfqd: device owning the queue.
....@@ -3299,16 +3981,32 @@
32993981 * requests, then the request pattern is isochronous
33003982 * (see the comments on the function
33013983 * bfq_bfqq_softrt_next_start()). Thus we can compute
3302
- * soft_rt_next_start. If, instead, the queue still
3303
- * has outstanding requests, then we have to wait for
3304
- * the completion of all the outstanding requests to
3305
- * discover whether the request pattern is actually
3306
- * isochronous.
3984
+ * soft_rt_next_start. And we do it, unless bfqq is in
3985
+ * interactive weight raising. We do not do it in the
3986
+ * latter subcase, for the following reason. bfqq may
3987
+ * be conveying the I/O needed to load a soft
3988
+ * real-time application. Such an application will
3989
+ * actually exhibit a soft real-time I/O pattern after
3990
+ * it finally starts doing its job. But, if
3991
+ * soft_rt_next_start is computed here for an
3992
+ * interactive bfqq, and bfqq had received a lot of
3993
+ * service before remaining with no outstanding
3994
+ * request (likely to happen on a fast device), then
3995
+ * soft_rt_next_start would be assigned such a high
3996
+ * value that, for a very long time, bfqq would be
3997
+ * prevented from being possibly considered as soft
3998
+ * real time.
3999
+ *
4000
+ * If, instead, the queue still has outstanding
4001
+ * requests, then we have to wait for the completion
4002
+ * of all the outstanding requests to discover whether
4003
+ * the request pattern is actually isochronous.
33074004 */
3308
- if (bfqq->dispatched == 0)
4005
+ if (bfqq->dispatched == 0 &&
4006
+ bfqq->wr_coeff != bfqd->bfq_wr_coeff)
33094007 bfqq->soft_rt_next_start =
33104008 bfq_bfqq_softrt_next_start(bfqd, bfqq);
3311
- else {
4009
+ else if (bfqq->dispatched > 0) {
33124010 /*
33134011 * Schedule an update of soft_rt_next_start to when
33144012 * the task may be discovered to be isochronous.
....@@ -3322,15 +4020,21 @@
33224020 slow, bfqq->dispatched, bfq_bfqq_has_short_ttime(bfqq));
33234021
33244022 /*
4023
+ * bfqq expired, so no total service time needs to be computed
4024
+ * any longer: reset state machine for measuring total service
4025
+ * times.
4026
+ */
4027
+ bfqd->rqs_injected = bfqd->wait_dispatch = false;
4028
+ bfqd->waited_rq = NULL;
4029
+
4030
+ /*
33254031 * Increase, decrease or leave budget unchanged according to
33264032 * reason.
33274033 */
33284034 __bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
3329
- if (__bfq_bfqq_expire(bfqd, bfqq))
4035
+ if (__bfq_bfqq_expire(bfqd, bfqq, reason))
33304036 /* bfqq is gone, no more actions on it */
33314037 return;
3332
-
3333
- bfqq->injected_service = 0;
33344038
33354039 /* mark bfqq as waiting a request only if a bic still points to it */
33364040 if (!bfq_bfqq_busy(bfqq) &&
....@@ -3399,52 +4103,16 @@
33994103 bfq_bfqq_budget_timeout(bfqq);
34004104 }
34014105
3402
-/*
3403
- * For a queue that becomes empty, device idling is allowed only if
3404
- * this function returns true for the queue. As a consequence, since
3405
- * device idling plays a critical role in both throughput boosting and
3406
- * service guarantees, the return value of this function plays a
3407
- * critical role in both these aspects as well.
3408
- *
3409
- * In a nutshell, this function returns true only if idling is
3410
- * beneficial for throughput or, even if detrimental for throughput,
3411
- * idling is however necessary to preserve service guarantees (low
3412
- * latency, desired throughput distribution, ...). In particular, on
3413
- * NCQ-capable devices, this function tries to return false, so as to
3414
- * help keep the drives' internal queues full, whenever this helps the
3415
- * device boost the throughput without causing any service-guarantee
3416
- * issue.
3417
- *
3418
- * In more detail, the return value of this function is obtained by,
3419
- * first, computing a number of boolean variables that take into
3420
- * account throughput and service-guarantee issues, and, then,
3421
- * combining these variables in a logical expression. Most of the
3422
- * issues taken into account are not trivial. We discuss these issues
3423
- * individually while introducing the variables.
3424
- */
3425
-static bool bfq_better_to_idle(struct bfq_queue *bfqq)
4106
+static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd,
4107
+ struct bfq_queue *bfqq)
34264108 {
3427
- struct bfq_data *bfqd = bfqq->bfqd;
34284109 bool rot_without_queueing =
34294110 !blk_queue_nonrot(bfqd->queue) && !bfqd->hw_tag,
34304111 bfqq_sequential_and_IO_bound,
3431
- idling_boosts_thr, idling_boosts_thr_without_issues,
3432
- idling_needed_for_service_guarantees,
3433
- asymmetric_scenario;
4112
+ idling_boosts_thr;
34344113
3435
- if (bfqd->strict_guarantees)
3436
- return true;
3437
-
3438
- /*
3439
- * Idling is performed only if slice_idle > 0. In addition, we
3440
- * do not idle if
3441
- * (a) bfqq is async
3442
- * (b) bfqq is in the idle io prio class: in this case we do
3443
- * not idle because we want to minimize the bandwidth that
3444
- * queues in this class can steal to higher-priority queues
3445
- */
3446
- if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_sync(bfqq) ||
3447
- bfq_class_idle(bfqq))
4114
+ /* No point in idling for bfqq if it won't get requests any longer */
4115
+ if (unlikely(!bfqq_process_refs(bfqq)))
34484116 return false;
34494117
34504118 bfqq_sequential_and_IO_bound = !BFQQ_SEEKY(bfqq) &&
....@@ -3477,8 +4145,7 @@
34774145 bfqq_sequential_and_IO_bound);
34784146
34794147 /*
3480
- * The value of the next variable,
3481
- * idling_boosts_thr_without_issues, is equal to that of
4148
+ * The return value of this function is equal to that of
34824149 * idling_boosts_thr, unless a special case holds. In this
34834150 * special case, described below, idling may cause problems to
34844151 * weight-raised queues.
....@@ -3495,217 +4162,85 @@
34954162 * which enqueue several requests in advance, and further
34964163 * reorder internally-queued requests.
34974164 *
3498
- * For this reason, we force to false the value of
3499
- * idling_boosts_thr_without_issues if there are weight-raised
3500
- * busy queues. In this case, and if bfqq is not weight-raised,
3501
- * this guarantees that the device is not idled for bfqq (if,
3502
- * instead, bfqq is weight-raised, then idling will be
3503
- * guaranteed by another variable, see below). Combined with
3504
- * the timestamping rules of BFQ (see [1] for details), this
3505
- * behavior causes bfqq, and hence any sync non-weight-raised
3506
- * queue, to get a lower number of requests served, and thus
3507
- * to ask for a lower number of requests from the request
3508
- * pool, before the busy weight-raised queues get served
3509
- * again. This often mitigates starvation problems in the
3510
- * presence of heavy write workloads and NCQ, thereby
3511
- * guaranteeing a higher application and system responsiveness
3512
- * in these hostile scenarios.
4165
+ * For this reason, we force to false the return value if
4166
+ * there are weight-raised busy queues. In this case, and if
4167
+ * bfqq is not weight-raised, this guarantees that the device
4168
+ * is not idled for bfqq (if, instead, bfqq is weight-raised,
4169
+ * then idling will be guaranteed by another variable, see
4170
+ * below). Combined with the timestamping rules of BFQ (see
4171
+ * [1] for details), this behavior causes bfqq, and hence any
4172
+ * sync non-weight-raised queue, to get a lower number of
4173
+ * requests served, and thus to ask for a lower number of
4174
+ * requests from the request pool, before the busy
4175
+ * weight-raised queues get served again. This often mitigates
4176
+ * starvation problems in the presence of heavy write
4177
+ * workloads and NCQ, thereby guaranteeing a higher
4178
+ * application and system responsiveness in these hostile
4179
+ * scenarios.
35134180 */
3514
- idling_boosts_thr_without_issues = idling_boosts_thr &&
4181
+ return idling_boosts_thr &&
35154182 bfqd->wr_busy_queues == 0;
4183
+}
4184
+
4185
+/*
4186
+ * For a queue that becomes empty, device idling is allowed only if
4187
+ * this function returns true for that queue. As a consequence, since
4188
+ * device idling plays a critical role for both throughput boosting
4189
+ * and service guarantees, the return value of this function plays a
4190
+ * critical role as well.
4191
+ *
4192
+ * In a nutshell, this function returns true only if idling is
4193
+ * beneficial for throughput or, even if detrimental for throughput,
4194
+ * idling is however necessary to preserve service guarantees (low
4195
+ * latency, desired throughput distribution, ...). In particular, on
4196
+ * NCQ-capable devices, this function tries to return false, so as to
4197
+ * help keep the drives' internal queues full, whenever this helps the
4198
+ * device boost the throughput without causing any service-guarantee
4199
+ * issue.
4200
+ *
4201
+ * Most of the issues taken into account to get the return value of
4202
+ * this function are not trivial. We discuss these issues in the two
4203
+ * functions providing the main pieces of information needed by this
4204
+ * function.
4205
+ */
4206
+static bool bfq_better_to_idle(struct bfq_queue *bfqq)
4207
+{
4208
+ struct bfq_data *bfqd = bfqq->bfqd;
4209
+ bool idling_boosts_thr_with_no_issue, idling_needed_for_service_guar;
4210
+
4211
+ /* No point in idling for bfqq if it won't get requests any longer */
4212
+ if (unlikely(!bfqq_process_refs(bfqq)))
4213
+ return false;
4214
+
4215
+ if (unlikely(bfqd->strict_guarantees))
4216
+ return true;
35164217
35174218 /*
3518
- * There is then a case where idling must be performed not
3519
- * for throughput concerns, but to preserve service
3520
- * guarantees.
3521
- *
3522
- * To introduce this case, we can note that allowing the drive
3523
- * to enqueue more than one request at a time, and hence
3524
- * delegating de facto final scheduling decisions to the
3525
- * drive's internal scheduler, entails loss of control on the
3526
- * actual request service order. In particular, the critical
3527
- * situation is when requests from different processes happen
3528
- * to be present, at the same time, in the internal queue(s)
3529
- * of the drive. In such a situation, the drive, by deciding
3530
- * the service order of the internally-queued requests, does
3531
- * determine also the actual throughput distribution among
3532
- * these processes. But the drive typically has no notion or
3533
- * concern about per-process throughput distribution, and
3534
- * makes its decisions only on a per-request basis. Therefore,
3535
- * the service distribution enforced by the drive's internal
3536
- * scheduler is likely to coincide with the desired
3537
- * device-throughput distribution only in a completely
3538
- * symmetric scenario where:
3539
- * (i) each of these processes must get the same throughput as
3540
- * the others;
3541
- * (ii) the I/O of each process has the same properties, in
3542
- * terms of locality (sequential or random), direction
3543
- * (reads or writes), request sizes, greediness
3544
- * (from I/O-bound to sporadic), and so on.
3545
- * In fact, in such a scenario, the drive tends to treat
3546
- * the requests of each of these processes in about the same
3547
- * way as the requests of the others, and thus to provide
3548
- * each of these processes with about the same throughput
3549
- * (which is exactly the desired throughput distribution). In
3550
- * contrast, in any asymmetric scenario, device idling is
3551
- * certainly needed to guarantee that bfqq receives its
3552
- * assigned fraction of the device throughput (see [1] for
3553
- * details).
3554
- * The problem is that idling may significantly reduce
3555
- * throughput with certain combinations of types of I/O and
3556
- * devices. An important example is sync random I/O, on flash
3557
- * storage with command queueing. So, unless bfqq falls in the
3558
- * above cases where idling also boosts throughput, it would
3559
- * be important to check conditions (i) and (ii) accurately,
3560
- * so as to avoid idling when not strictly needed for service
3561
- * guarantees.
3562
- *
3563
- * Unfortunately, it is extremely difficult to thoroughly
3564
- * check condition (ii). And, in case there are active groups,
3565
- * it becomes very difficult to check condition (i) too. In
3566
- * fact, if there are active groups, then, for condition (i)
3567
- * to become false, it is enough that an active group contains
3568
- * more active processes or sub-groups than some other active
3569
- * group. More precisely, for condition (i) to hold because of
3570
- * such a group, it is not even necessary that the group is
3571
- * (still) active: it is sufficient that, even if the group
3572
- * has become inactive, some of its descendant processes still
3573
- * have some request already dispatched but still waiting for
3574
- * completion. In fact, requests have still to be guaranteed
3575
- * their share of the throughput even after being
3576
- * dispatched. In this respect, it is easy to show that, if a
3577
- * group frequently becomes inactive while still having
3578
- * in-flight requests, and if, when this happens, the group is
3579
- * not considered in the calculation of whether the scenario
3580
- * is asymmetric, then the group may fail to be guaranteed its
3581
- * fair share of the throughput (basically because idling may
3582
- * not be performed for the descendant processes of the group,
3583
- * but it had to be). We address this issue with the
3584
- * following bi-modal behavior, implemented in the function
3585
- * bfq_symmetric_scenario().
3586
- *
3587
- * If there are groups with requests waiting for completion
3588
- * (as commented above, some of these groups may even be
3589
- * already inactive), then the scenario is tagged as
3590
- * asymmetric, conservatively, without checking any of the
3591
- * conditions (i) and (ii). So the device is idled for bfqq.
3592
- * This behavior matches also the fact that groups are created
3593
- * exactly if controlling I/O is a primary concern (to
3594
- * preserve bandwidth and latency guarantees).
3595
- *
3596
- * On the opposite end, if there are no groups with requests
3597
- * waiting for completion, then only condition (i) is actually
3598
- * controlled, i.e., provided that condition (i) holds, idling
3599
- * is not performed, regardless of whether condition (ii)
3600
- * holds. In other words, only if condition (i) does not hold,
3601
- * then idling is allowed, and the device tends to be
3602
- * prevented from queueing many requests, possibly of several
3603
- * processes. Since there are no groups with requests waiting
3604
- * for completion, then, to control condition (i) it is enough
3605
- * to check just whether all the queues with requests waiting
3606
- * for completion also have the same weight.
3607
- *
3608
- * Not checking condition (ii) evidently exposes bfqq to the
3609
- * risk of getting less throughput than its fair share.
3610
- * However, for queues with the same weight, a further
3611
- * mechanism, preemption, mitigates or even eliminates this
3612
- * problem. And it does so without consequences on overall
3613
- * throughput. This mechanism and its benefits are explained
3614
- * in the next three paragraphs.
3615
- *
3616
- * Even if a queue, say Q, is expired when it remains idle, Q
3617
- * can still preempt the new in-service queue if the next
3618
- * request of Q arrives soon (see the comments on
3619
- * bfq_bfqq_update_budg_for_activation). If all queues and
3620
- * groups have the same weight, this form of preemption,
3621
- * combined with the hole-recovery heuristic described in the
3622
- * comments on function bfq_bfqq_update_budg_for_activation,
3623
- * are enough to preserve a correct bandwidth distribution in
3624
- * the mid term, even without idling. In fact, even if not
3625
- * idling allows the internal queues of the device to contain
3626
- * many requests, and thus to reorder requests, we can rather
3627
- * safely assume that the internal scheduler still preserves a
3628
- * minimum of mid-term fairness.
3629
- *
3630
- * More precisely, this preemption-based, idleless approach
3631
- * provides fairness in terms of IOPS, and not sectors per
3632
- * second. This can be seen with a simple example. Suppose
3633
- * that there are two queues with the same weight, but that
3634
- * the first queue receives requests of 8 sectors, while the
3635
- * second queue receives requests of 1024 sectors. In
3636
- * addition, suppose that each of the two queues contains at
3637
- * most one request at a time, which implies that each queue
3638
- * always remains idle after it is served. Finally, after
3639
- * remaining idle, each queue receives very quickly a new
3640
- * request. It follows that the two queues are served
3641
- * alternatively, preempting each other if needed. This
3642
- * implies that, although both queues have the same weight,
3643
- * the queue with large requests receives a service that is
3644
- * 1024/8 times as high as the service received by the other
3645
- * queue.
3646
- *
3647
- * The motivation for using preemption instead of idling (for
3648
- * queues with the same weight) is that, by not idling,
3649
- * service guarantees are preserved (completely or at least in
3650
- * part) without minimally sacrificing throughput. And, if
3651
- * there is no active group, then the primary expectation for
3652
- * this device is probably a high throughput.
3653
- *
3654
- * We are now left only with explaining the additional
3655
- * compound condition that is checked below for deciding
3656
- * whether the scenario is asymmetric. To explain this
3657
- * compound condition, we need to add that the function
3658
- * bfq_symmetric_scenario checks the weights of only
3659
- * non-weight-raised queues, for efficiency reasons (see
3660
- * comments on bfq_weights_tree_add()). Then the fact that
3661
- * bfqq is weight-raised is checked explicitly here. More
3662
- * precisely, the compound condition below takes into account
3663
- * also the fact that, even if bfqq is being weight-raised,
3664
- * the scenario is still symmetric if all queues with requests
3665
- * waiting for completion happen to be
3666
- * weight-raised. Actually, we should be even more precise
3667
- * here, and differentiate between interactive weight raising
3668
- * and soft real-time weight raising.
3669
- *
3670
- * As a side note, it is worth considering that the above
3671
- * device-idling countermeasures may however fail in the
3672
- * following unlucky scenario: if idling is (correctly)
3673
- * disabled in a time period during which all symmetry
3674
- * sub-conditions hold, and hence the device is allowed to
3675
- * enqueue many requests, but at some later point in time some
3676
- * sub-condition stops to hold, then it may become impossible
3677
- * to let requests be served in the desired order until all
3678
- * the requests already queued in the device have been served.
4219
+ * Idling is performed only if slice_idle > 0. In addition, we
4220
+ * do not idle if
4221
+ * (a) bfqq is async
4222
+ * (b) bfqq is in the idle io prio class: in this case we do
4223
+ * not idle because we want to minimize the bandwidth that
4224
+ * queues in this class can steal to higher-priority queues
36794225 */
3680
- asymmetric_scenario = (bfqq->wr_coeff > 1 &&
3681
- bfqd->wr_busy_queues < bfqd->busy_queues) ||
3682
- !bfq_symmetric_scenario(bfqd);
4226
+ if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_sync(bfqq) ||
4227
+ bfq_class_idle(bfqq))
4228
+ return false;
4229
+
4230
+ idling_boosts_thr_with_no_issue =
4231
+ idling_boosts_thr_without_issues(bfqd, bfqq);
4232
+
4233
+ idling_needed_for_service_guar =
4234
+ idling_needed_for_service_guarantees(bfqd, bfqq);
36834235
36844236 /*
3685
- * Finally, there is a case where maximizing throughput is the
3686
- * best choice even if it may cause unfairness toward
3687
- * bfqq. Such a case is when bfqq became active in a burst of
3688
- * queue activations. Queues that became active during a large
3689
- * burst benefit only from throughput, as discussed in the
3690
- * comments on bfq_handle_burst. Thus, if bfqq became active
3691
- * in a burst and not idling the device maximizes throughput,
3692
- * then the device must no be idled, because not idling the
3693
- * device provides bfqq and all other queues in the burst with
3694
- * maximum benefit. Combining this and the above case, we can
3695
- * now establish when idling is actually needed to preserve
3696
- * service guarantees.
3697
- */
3698
- idling_needed_for_service_guarantees =
3699
- asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq);
3700
-
3701
- /*
3702
- * We have now all the components we need to compute the
4237
+ * We have now the two components we need to compute the
37034238 * return value of the function, which is true only if idling
37044239 * either boosts the throughput (without issues), or is
37054240 * necessary to preserve service guarantees.
37064241 */
3707
- return idling_boosts_thr_without_issues ||
3708
- idling_needed_for_service_guarantees;
4242
+ return idling_boosts_thr_with_no_issue ||
4243
+ idling_needed_for_service_guar;
37094244 }
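/*
 * A minimal userspace-style sketch of the decision flow implemented by
 * bfq_better_to_idle() above, with the results of the two helper
 * functions flattened into booleans. The struct and all names below are
 * illustrative stand-ins, not BFQ's actual data structures.
 */
#include <stdbool.h>

struct idle_decision {
	unsigned int process_refs;	/* processes still attached to the queue */
	bool strict_guarantees;		/* user requested strict fairness */
	unsigned int slice_idle;	/* 0 disables idling altogether */
	bool sync;			/* queue carries sync I/O */
	bool idle_class;		/* queue is in the idle I/O-priority class */
	bool boosts_thr_without_issues;	/* throughput-side helper result */
	bool needed_for_service_guar;	/* service-guarantee helper result */
};

static bool better_to_idle_sketch(const struct idle_decision *d)
{
	if (d->process_refs == 0)	/* nobody will send new I/O */
		return false;
	if (d->strict_guarantees)	/* always idle when asked to */
		return true;
	/* Never idle for async or idle-class queues, or with idling off. */
	if (d->slice_idle == 0 || !d->sync || d->idle_class)
		return false;
	return d->boosts_thr_without_issues || d->needed_for_service_guar;
}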
37104245
37114246 /*
....@@ -3724,26 +4259,98 @@
37244259 return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_better_to_idle(bfqq);
37254260 }
37264261
3727
-static struct bfq_queue *bfq_choose_bfqq_for_injection(struct bfq_data *bfqd)
4262
+/*
4263
+ * This function chooses the queue from which to pick the next extra
4264
+ * I/O request to inject, if it finds a compatible queue. See the
4265
+ * comments on bfq_update_inject_limit() for details on the injection
4266
+ * mechanism, and for the definitions of the quantities mentioned
4267
+ * below.
4268
+ */
4269
+static struct bfq_queue *
4270
+bfq_choose_bfqq_for_injection(struct bfq_data *bfqd)
37284271 {
3729
- struct bfq_queue *bfqq;
4272
+ struct bfq_queue *bfqq, *in_serv_bfqq = bfqd->in_service_queue;
4273
+ unsigned int limit = in_serv_bfqq->inject_limit;
4274
+ /*
4275
+ * If
4276
+ * - bfqq is not weight-raised and therefore does not carry
4277
+ * time-critical I/O,
4278
+ * or
4279
+ * - regardless of whether bfqq is weight-raised, bfqq has
4280
+ * however a long think time, during which it can absorb the
4281
+ * effect of an appropriate number of extra I/O requests
4282
+ * from other queues (see bfq_update_inject_limit for
4283
+ * details on the computation of this number);
4284
+ * then injection can be performed without restrictions.
4285
+ */
4286
+ bool in_serv_always_inject = in_serv_bfqq->wr_coeff == 1 ||
4287
+ !bfq_bfqq_has_short_ttime(in_serv_bfqq);
37304288
37314289 /*
3732
- * A linear search; but, with a high probability, very few
3733
- * steps are needed to find a candidate queue, i.e., a queue
3734
- * with enough budget left for its next request. In fact:
4290
+ * If
4291
+ * - the baseline total service time could not be sampled yet,
4292
+ * so the inject limit happens to be still 0, and
4293
+ * - a lot of time has elapsed since the plugging of I/O
4294
+ * dispatching started, so drive speed is being wasted
4295
+ * significantly;
4296
+ * then temporarily raise inject limit to one request.
4297
+ */
4298
+ if (limit == 0 && in_serv_bfqq->last_serv_time_ns == 0 &&
4299
+ bfq_bfqq_wait_request(in_serv_bfqq) &&
4300
+ time_is_before_eq_jiffies(bfqd->last_idling_start_jiffies +
4301
+ bfqd->bfq_slice_idle)
4302
+ )
4303
+ limit = 1;
4304
+
4305
+ if (bfqd->rq_in_driver >= limit)
4306
+ return NULL;
4307
+
4308
+ /*
4309
+ * Linear search of the source queue for injection; but, with
4310
+ * a high probability, very few steps are needed to find a
4311
+ * candidate queue, i.e., a queue with enough budget left for
4312
+ * its next request. In fact:
37354313 * - BFQ dynamically updates the budget of every queue so as
37364314 * to accommodate the expected backlog of the queue;
37374315 * - if a queue gets all its requests dispatched as injected
37384316 * service, then the queue is removed from the active list
3739
- * (and re-added only if it gets new requests, but with
3740
- * enough budget for its new backlog).
4317
+ * (and re-added only if it gets new requests, but then it
4318
+ * is assigned again enough budget for its new backlog).
37414319 */
37424320 list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
37434321 if (!RB_EMPTY_ROOT(&bfqq->sort_list) &&
4322
+ (in_serv_always_inject || bfqq->wr_coeff > 1) &&
37444323 bfq_serv_to_charge(bfqq->next_rq, bfqq) <=
3745
- bfq_bfqq_budget_left(bfqq))
3746
- return bfqq;
4324
+ bfq_bfqq_budget_left(bfqq)) {
4325
+ /*
4326
+ * Allow for only one large in-flight request
4327
+ * on non-rotational devices, for the
4328
+ * following reason. On non-rotationl drives,
4329
+ * large requests take much longer than
4330
+ * smaller requests to be served. In addition,
4331
+ * the drive prefers to serve large requests
4332
+ * over small ones, if it can choose. So,
4333
+ * having more than one large request queued
4334
+ * in the drive may easily make the next
4335
+ * request of the in-service queue wait so
4336
+ * long as to break bfqq's service guarantees. On
4337
+ * the bright side, large requests let the
4338
+ * drive reach a very high throughput, even if
4339
+ * there is only one in-flight large request
4340
+ * at a time.
4341
+ */
4342
+ if (blk_queue_nonrot(bfqd->queue) &&
4343
+ blk_rq_sectors(bfqq->next_rq) >=
4344
+ BFQQ_SECT_THR_NONROT)
4345
+ limit = min_t(unsigned int, 1, limit);
4346
+ else
4347
+ limit = in_serv_bfqq->inject_limit;
4348
+
4349
+ if (bfqd->rq_in_driver < limit) {
4350
+ bfqd->rqs_injected = true;
4351
+ return bfqq;
4352
+ }
4353
+ }
37474354
37484355 return NULL;
37494356 }
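/*
 * A condensed, userspace-style sketch of the in-flight limit applied by
 * bfq_choose_bfqq_for_injection() above: start from the in-service
 * queue's inject_limit, temporarily allow one request if the baseline
 * service time is still unsampled and idling has lasted too long, and
 * allow at most one large in-flight request on non-rotational drives.
 * The field names are simplified stand-ins for the bfqd/bfqq state.
 */
#include <stdbool.h>

struct injection_state {
	unsigned int inject_limit;	/* per-queue learned limit */
	bool baseline_unknown;		/* last_serv_time_ns == 0 */
	bool waited_too_long;		/* idling lasted longer than slice_idle */
	bool nonrot;			/* non-rotational device */
	bool candidate_rq_is_large;	/* next request exceeds the sector threshold */
	unsigned int rq_in_driver;	/* requests already dispatched to the drive */
};

static bool may_inject_now(const struct injection_state *s)
{
	unsigned int limit = s->inject_limit;

	/* Do not let a still-unsampled limit waste drive speed forever. */
	if (limit == 0 && s->baseline_unknown && s->waited_too_long)
		limit = 1;

	/* On flash, a single large in-flight request is already enough. */
	if (s->nonrot && s->candidate_rq_is_large && limit > 1)
		limit = 1;

	return s->rq_in_driver < limit;
}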
....@@ -3830,14 +4437,105 @@
38304437 * for a new request, or has requests waiting for a completion and
38314438 * may idle after their completion, then keep it anyway.
38324439 *
3833
- * Yet, to boost throughput, inject service from other queues if
3834
- * possible.
4440
+ * Yet, inject service from other queues if it boosts
4441
+ * throughput and is possible.
38354442 */
38364443 if (bfq_bfqq_wait_request(bfqq) ||
38374444 (bfqq->dispatched != 0 && bfq_better_to_idle(bfqq))) {
3838
- if (bfq_bfqq_injectable(bfqq) &&
3839
- bfqq->injected_service * bfqq->inject_coeff <
3840
- bfqq->entity.service * 10)
4445
+ struct bfq_queue *async_bfqq =
4446
+ bfqq->bic && bfqq->bic->bfqq[0] &&
4447
+ bfq_bfqq_busy(bfqq->bic->bfqq[0]) &&
4448
+ bfqq->bic->bfqq[0]->next_rq ?
4449
+ bfqq->bic->bfqq[0] : NULL;
4450
+
4451
+ /*
4452
+ * The next three mutually-exclusive ifs decide
4453
+ * whether to try injection, and choose the queue to
4454
+ * pick an I/O request from.
4455
+ *
4456
+ * The first if checks whether the process associated
4457
+ * with bfqq has also async I/O pending. If so, it
4458
+ * injects such I/O unconditionally. Injecting async
4459
+ * I/O from the same process can cause no harm to the
4460
+ * process. On the contrary, it can only increase
4461
+ * bandwidth and reduce latency for the process.
4462
+ *
4463
+ * The second if checks whether there happens to be a
4464
+ * non-empty waker queue for bfqq, i.e., a queue whose
4465
+ * I/O needs to be completed for bfqq to receive new
4466
+ * I/O. This happens, e.g., if bfqq is associated with
4467
+ * a process that does some sync. A sync generates
4468
+ * extra blocking I/O, which must be completed before
4469
+ * the process associated with bfqq can go on with its
4470
+ * I/O. If the I/O of the waker queue is not served,
4471
+ * then bfqq remains empty, and no I/O is dispatched,
4472
+ * until the idle timeout fires for bfqq. This is
4473
+ * likely to result in lower bandwidth and higher
4474
+ * latencies for bfqq, and in a severe loss of total
4475
+ * throughput. The best action to take is therefore to
4476
+ * serve the waker queue as soon as possible. So do it
4477
+ * (without relying on the third alternative below for
4478
+ * eventually serving waker_bfqq's I/O; see the last
4479
+ * paragraph for further details). This systematic
4480
+ * injection of I/O from the waker queue does not
4481
+ * cause any delay to bfqq's I/O. On the contrary,
4482
+ * bfqq's next I/O is brought forward dramatically,
4483
+ * for it is not blocked for milliseconds.
4484
+ *
4485
+ * The third if checks whether bfqq is a queue for
4486
+ * which it is better to avoid injection. It is so if
4487
+ * bfqq delivers more throughput when served without
4488
+ * any further I/O from other queues in the middle, or
4489
+ * if the service times of bfqq's I/O requests both
4490
+ * count more than overall throughput, and may be
4491
+ * easily increased by injection (this happens if bfqq
4492
+ * has a short think time). If none of these
4493
+ * conditions holds, then a candidate queue for
4494
+ * injection is looked for through
4495
+ * bfq_choose_bfqq_for_injection(). Note that the
4496
+ * latter may return NULL (for example if the inject
4497
+ * limit for bfqq is currently 0).
4498
+ *
4499
+ * NOTE: motivation for the second alternative
4500
+ *
4501
+ * Thanks to the way the inject limit is updated in
4502
+ * bfq_update_has_short_ttime(), it is rather likely
4503
+ * that, if I/O is being plugged for bfqq and the
4504
+ * waker queue has pending I/O requests that are
4505
+ * blocking bfqq's I/O, then the third alternative
4506
+ * above lets the waker queue get served before the
4507
+ * I/O-plugging timeout fires. So one may deem the
4508
+ * second alternative superfluous. It is not, because
4509
+ * the third alternative may be way less effective in
4510
+ * case of a synchronization. For two main
4511
+ * reasons. First, throughput may be low because the
4512
+ * inject limit may be too low to guarantee the same
4513
+ * amount of injected I/O, from the waker queue or
4514
+ * other queues, that the second alternative
4515
+ * guarantees (the second alternative unconditionally
4516
+ * injects a pending I/O request of the waker queue
4517
+ * for each bfq_dispatch_request()). Second, with the
4518
+ * third alternative, the duration of the plugging,
4519
+ * i.e., the time before bfqq finally receives new I/O,
4520
+ * may not be minimized, because the waker queue may
4521
+ * happen to be served only after other queues.
4522
+ */
4523
+ if (async_bfqq &&
4524
+ icq_to_bic(async_bfqq->next_rq->elv.icq) == bfqq->bic &&
4525
+ bfq_serv_to_charge(async_bfqq->next_rq, async_bfqq) <=
4526
+ bfq_bfqq_budget_left(async_bfqq))
4527
+ bfqq = bfqq->bic->bfqq[0];
4528
+ else if (bfq_bfqq_has_waker(bfqq) &&
4529
+ bfq_bfqq_busy(bfqq->waker_bfqq) &&
4530
+ bfqq->waker_bfqq->next_rq &&
4531
+ bfq_serv_to_charge(bfqq->waker_bfqq->next_rq,
4532
+ bfqq->waker_bfqq) <=
4533
+ bfq_bfqq_budget_left(bfqq->waker_bfqq)
4534
+ )
4535
+ bfqq = bfqq->waker_bfqq;
4536
+ else if (!idling_boosts_thr_without_issues(bfqd, bfqq) &&
4537
+ (bfqq->wr_coeff == 1 || bfqd->wr_busy_queues > 1 ||
4538
+ !bfq_bfqq_has_short_ttime(bfqq)))
38414539 bfqq = bfq_choose_bfqq_for_injection(bfqd);
38424540 else
38434541 bfqq = NULL;
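/*
 * A schematic sketch of the three-way choice made by the if chain
 * above when extra I/O may be injected: prefer the process's own async
 * queue, then a busy waker queue whose I/O fits its budget, then a
 * generic candidate found by bfq_choose_bfqq_for_injection(). The types
 * and helpers below are illustrative placeholders, not the real
 * bfq_queue machinery.
 */
#include <stddef.h>
#include <stdbool.h>

struct sketch_queue;

struct injection_choice {
	struct sketch_queue *own_async;	  /* same process, async, within budget */
	struct sketch_queue *busy_waker;  /* waker queue with fitting I/O */
	struct sketch_queue *generic;	  /* result of the generic search */
	bool injection_may_help;	  /* idling does not boost throughput */
};

static struct sketch_queue *pick_injection_source(const struct injection_choice *c)
{
	if (c->own_async)		/* cannot harm the owning process */
		return c->own_async;
	if (c->busy_waker)		/* unblocks the plugged queue's I/O */
		return c->busy_waker;
	if (c->injection_may_help)	/* fall back to the generic search */
		return c->generic;
	return NULL;			/* better to keep waiting */
}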
....@@ -3929,15 +4627,15 @@
39294627
39304628 bfq_bfqq_served(bfqq, service_to_charge);
39314629
4630
+ if (bfqq == bfqd->in_service_queue && bfqd->wait_dispatch) {
4631
+ bfqd->wait_dispatch = false;
4632
+ bfqd->waited_rq = rq;
4633
+ }
4634
+
39324635 bfq_dispatch_remove(bfqd->queue, rq);
39334636
3934
- if (bfqq != bfqd->in_service_queue) {
3935
- if (likely(bfqd->in_service_queue))
3936
- bfqd->in_service_queue->injected_service +=
3937
- bfq_serv_to_charge(rq, bfqq);
3938
-
4637
+ if (bfqq != bfqd->in_service_queue)
39394638 goto return_rq;
3940
- }
39414639
39424640 /*
39434641 * If weight raising has to terminate for bfqq, then next
....@@ -3957,7 +4655,7 @@
39574655 * belongs to CLASS_IDLE and other queues are waiting for
39584656 * service.
39594657 */
3960
- if (!(bfqd->busy_queues > 1 && bfq_class_idle(bfqq)))
4658
+ if (!(bfq_tot_busy_queues(bfqd) > 1 && bfq_class_idle(bfqq)))
39614659 goto return_rq;
39624660
39634661 bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED);
....@@ -3975,7 +4673,7 @@
39754673 * most a call to dispatch for nothing
39764674 */
39774675 return !list_empty_careful(&bfqd->dispatch) ||
3978
- bfqd->busy_queues > 0;
4676
+ bfq_tot_busy_queues(bfqd) > 0;
39794677 }
39804678
39814679 static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
....@@ -4029,9 +4727,10 @@
40294727 goto start_rq;
40304728 }
40314729
4032
- bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
4730
+ bfq_log(bfqd, "dispatch requests: %d busy queues",
4731
+ bfq_tot_busy_queues(bfqd));
40334732
4034
- if (bfqd->busy_queues == 0)
4733
+ if (bfq_tot_busy_queues(bfqd) == 0)
40354734 goto exit;
40364735
40374736 /*
....@@ -4043,7 +4742,7 @@
40434742 * some unlucky request wait for as long as the device
40444743 * wishes.
40454744 *
4046
- * Of course, serving one request at at time may cause loss of
4745
+ * Of course, serving one request at a time may cause loss of
40474746 * throughput.
40484747 */
40494748 if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0)
....@@ -4065,7 +4764,7 @@
40654764 return rq;
40664765 }
40674766
4068
-#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
4767
+#ifdef CONFIG_BFQ_CGROUP_DEBUG
40694768 static void bfq_update_dispatch_stats(struct request_queue *q,
40704769 struct request *rq,
40714770 struct bfq_queue *in_serv_queue,
....@@ -4089,7 +4788,7 @@
40894788 * In addition, the following queue lock guarantees that
40904789 * bfqq_group(bfqq) exists as well.
40914790 */
4092
- spin_lock_irq(q->queue_lock);
4791
+ spin_lock_irq(&q->queue_lock);
40934792 if (idle_timer_disabled)
40944793 /*
40954794 * Since the idle timer has been disabled,
....@@ -4108,21 +4807,21 @@
41084807 bfqg_stats_set_start_empty_time(bfqg);
41094808 bfqg_stats_update_io_remove(bfqg, rq->cmd_flags);
41104809 }
4111
- spin_unlock_irq(q->queue_lock);
4810
+ spin_unlock_irq(&q->queue_lock);
41124811 }
41134812 #else
41144813 static inline void bfq_update_dispatch_stats(struct request_queue *q,
41154814 struct request *rq,
41164815 struct bfq_queue *in_serv_queue,
41174816 bool idle_timer_disabled) {}
4118
-#endif
4817
+#endif /* CONFIG_BFQ_CGROUP_DEBUG */
41194818
41204819 static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
41214820 {
41224821 struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
41234822 struct request *rq;
41244823 struct bfq_queue *in_serv_queue;
4125
- bool waiting_rq, idle_timer_disabled;
4824
+ bool waiting_rq, idle_timer_disabled = false;
41264825
41274826 spin_lock_irq(&bfqd->lock);
41284827
....@@ -4130,14 +4829,15 @@
41304829 waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue);
41314830
41324831 rq = __bfq_dispatch_request(hctx);
4133
-
4134
- idle_timer_disabled =
4135
- waiting_rq && !bfq_bfqq_wait_request(in_serv_queue);
4832
+ if (in_serv_queue == bfqd->in_service_queue) {
4833
+ idle_timer_disabled =
4834
+ waiting_rq && !bfq_bfqq_wait_request(in_serv_queue);
4835
+ }
41364836
41374837 spin_unlock_irq(&bfqd->lock);
4138
-
4139
- bfq_update_dispatch_stats(hctx->queue, rq, in_serv_queue,
4140
- idle_timer_disabled);
4838
+ bfq_update_dispatch_stats(hctx->queue, rq,
4839
+ idle_timer_disabled ? in_serv_queue : NULL,
4840
+ idle_timer_disabled);
41414841
41424842 return rq;
41434843 }
....@@ -4151,9 +4851,9 @@
41514851 */
41524852 void bfq_put_queue(struct bfq_queue *bfqq)
41534853 {
4154
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
4854
+ struct bfq_queue *item;
4855
+ struct hlist_node *n;
41554856 struct bfq_group *bfqg = bfqq_group(bfqq);
4156
-#endif
41574857
41584858 if (bfqq->bfqd)
41594859 bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d",
....@@ -4195,13 +4895,41 @@
41954895 bfqq->bfqd->burst_size--;
41964896 }
41974897
4898
+ /*
4899
+ * bfqq does not exist any longer, so it cannot be woken by
4900
+ * any other queue, and cannot wake any other queue. Then bfqq
4901
+ * must be removed from the woken list of its possible waker
4902
+ * queue, and all queues in the woken list of bfqq must stop
4903
+ * having a waker queue. Strictly speaking, these updates
4904
+ * should be performed when bfqq remains with no I/O source
4905
+ * attached to it, which happens before bfqq gets freed. In
4906
+ * particular, this happens when the last process associated
4907
+ * with bfqq exits or gets associated with a different
4908
+ * queue. However, both events lead to bfqq being freed soon,
4909
+ * and dangling references would come out only after bfqq gets
4910
+ * freed. So these updates are done here, as a simple and safe
4911
+ * way to handle all cases.
4912
+ */
4913
+ /* remove bfqq from woken list */
4914
+ if (!hlist_unhashed(&bfqq->woken_list_node))
4915
+ hlist_del_init(&bfqq->woken_list_node);
4916
+
4917
+ /* reset waker for all queues in woken list */
4918
+ hlist_for_each_entry_safe(item, n, &bfqq->woken_list,
4919
+ woken_list_node) {
4920
+ item->waker_bfqq = NULL;
4921
+ bfq_clear_bfqq_has_waker(item);
4922
+ hlist_del_init(&item->woken_list_node);
4923
+ }
4924
+
4925
+ if (bfqq->bfqd && bfqq->bfqd->last_completed_rq_bfqq == bfqq)
4926
+ bfqq->bfqd->last_completed_rq_bfqq = NULL;
4927
+
41984928 kmem_cache_free(bfq_pool, bfqq);
4199
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
42004929 bfqg_and_blkg_put(bfqg);
4201
-#endif
42024930 }
42034931
4204
-static void bfq_put_cooperator(struct bfq_queue *bfqq)
4932
+void bfq_put_cooperator(struct bfq_queue *bfqq)
42054933 {
42064934 struct bfq_queue *__bfqq, *next;
42074935
....@@ -4223,7 +4951,7 @@
42234951 static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
42244952 {
42254953 if (bfqq == bfqd->in_service_queue) {
4226
- __bfq_bfqq_expire(bfqd, bfqq);
4954
+ __bfq_bfqq_expire(bfqd, bfqq, BFQQE_BUDGET_TIMEOUT);
42274955 bfq_schedule_dispatch(bfqd);
42284956 }
42294957
....@@ -4231,7 +4959,7 @@
42314959
42324960 bfq_put_cooperator(bfqq);
42334961
4234
- bfq_put_queue(bfqq); /* release process reference */
4962
+ bfq_release_process_ref(bfqd, bfqq);
42354963 }
42364964
42374965 static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync)
....@@ -4281,7 +5009,7 @@
42815009 pr_err("bdi %s: bfq: bad prio class %d\n",
42825010 bdi_dev_name(bfqq->bfqd->queue->backing_dev_info),
42835011 ioprio_class);
4284
- /* fall through */
5012
+ fallthrough;
42855013 case IOPRIO_CLASS_NONE:
42865014 /*
42875015 * No prio set, inherit CPU scheduling settings.
....@@ -4334,8 +5062,7 @@
43345062
43355063 bfqq = bic_to_bfqq(bic, false);
43365064 if (bfqq) {
4337
- /* release process reference on this queue */
4338
- bfq_put_queue(bfqq);
5065
+ bfq_release_process_ref(bfqd, bfqq);
43395066 bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic);
43405067 bic_set_bfqq(bic, bfqq, false);
43415068 }
....@@ -4351,6 +5078,8 @@
43515078 RB_CLEAR_NODE(&bfqq->entity.rb_node);
43525079 INIT_LIST_HEAD(&bfqq->fifo);
43535080 INIT_HLIST_NODE(&bfqq->burst_list_node);
5081
+ INIT_HLIST_NODE(&bfqq->woken_list_node);
5082
+ INIT_HLIST_HEAD(&bfqq->woken_list);
43545083
43555084 bfqq->ref = 0;
43565085 bfqq->bfqd = bfqd;
....@@ -4369,13 +5098,6 @@
43695098 bfq_mark_bfqq_has_short_ttime(bfqq);
43705099 bfq_mark_bfqq_sync(bfqq);
43715100 bfq_mark_bfqq_just_created(bfqq);
4372
- /*
4373
- * Aggressively inject a lot of service: up to 90%.
4374
- * This coefficient remains constant during bfqq life,
4375
- * but this behavior might be changed, after enough
4376
- * testing and tuning.
4377
- */
4378
- bfqq->inject_coeff = 1;
43795101 } else
43805102 bfq_clear_bfqq_sync(bfqq);
43815103
....@@ -4419,7 +5141,7 @@
44195141 return &bfqg->async_bfqq[0][ioprio];
44205142 case IOPRIO_CLASS_NONE:
44215143 ioprio = IOPRIO_NORM;
4422
- /* fall through */
5144
+ fallthrough;
44235145 case IOPRIO_CLASS_BE:
44245146 return &bfqg->async_bfqq[1][ioprio];
44255147 case IOPRIO_CLASS_IDLE:
....@@ -4439,14 +5161,7 @@
44395161 struct bfq_queue *bfqq;
44405162 struct bfq_group *bfqg;
44415163
4442
- rcu_read_lock();
4443
-
4444
- bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio));
4445
- if (!bfqg) {
4446
- bfqq = &bfqd->oom_bfqq;
4447
- goto out;
4448
- }
4449
-
5164
+ bfqg = bfq_bio_bfqg(bfqd, bio);
44505165 if (!is_sync) {
44515166 async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
44525167 ioprio);
....@@ -4490,7 +5205,6 @@
44905205 out:
44915206 bfqq->ref++; /* get a process reference to this queue */
44925207 bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref);
4493
- rcu_read_unlock();
44945208 return bfqq;
44955209 }
44965210
....@@ -4513,17 +5227,19 @@
45135227 struct request *rq)
45145228 {
45155229 bfqq->seek_history <<= 1;
4516
- bfqq->seek_history |=
4517
- get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR &&
4518
- (!blk_queue_nonrot(bfqd->queue) ||
4519
- blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT);
5230
+ bfqq->seek_history |= BFQ_RQ_SEEKY(bfqd, bfqq->last_request_pos, rq);
5231
+
5232
+ if (bfqq->wr_coeff > 1 &&
5233
+ bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time &&
5234
+ BFQQ_TOTALLY_SEEKY(bfqq))
5235
+ bfq_bfqq_end_wr(bfqq);
45205236 }
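/*
 * A standalone sketch of the seek-history bookkeeping performed above:
 * a 32-bit shift register records, for each recent request, whether it
 * looked seeky, and the queue is deemed seeky when enough bits are set.
 * The popcount test and the threshold parameter here are illustrative
 * assumptions, not necessarily the exact check BFQ applies.
 */
#include <stdbool.h>
#include <stdint.h>

static uint32_t update_seek_history(uint32_t history, bool rq_was_seeky)
{
	return (history << 1) | (rq_was_seeky ? 1 : 0);
}

static bool queue_is_seeky(uint32_t history, unsigned int threshold)
{
	return (unsigned int)__builtin_popcount(history) > threshold;
}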
45215237
45225238 static void bfq_update_has_short_ttime(struct bfq_data *bfqd,
45235239 struct bfq_queue *bfqq,
45245240 struct bfq_io_cq *bic)
45255241 {
4526
- bool has_short_ttime = true;
5242
+ bool has_short_ttime = true, state_changed;
45275243
45285244 /*
45295245 * No need to update has_short_ttime if bfqq is async or in
....@@ -4548,13 +5264,102 @@
45485264 bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle))
45495265 has_short_ttime = false;
45505266
4551
- bfq_log_bfqq(bfqd, bfqq, "update_has_short_ttime: has_short_ttime %d",
4552
- has_short_ttime);
5267
+ state_changed = has_short_ttime != bfq_bfqq_has_short_ttime(bfqq);
45535268
45545269 if (has_short_ttime)
45555270 bfq_mark_bfqq_has_short_ttime(bfqq);
45565271 else
45575272 bfq_clear_bfqq_has_short_ttime(bfqq);
5273
+
5274
+ /*
5275
+ * Until the base value for the total service time gets
5276
+ * finally computed for bfqq, the inject limit does depend on
5277
+ * the think-time state (short|long). In particular, the limit
5278
+ * is 0 or 1 if the think time is deemed, respectively, as
5279
+ * short or long (details in the comments in
5280
+ * bfq_update_inject_limit()). Accordingly, the next
5281
+ * instructions reset the inject limit if the think-time state
5282
+ * has changed and the above base value is still to be
5283
+ * computed.
5284
+ *
5285
+ * However, the reset is performed only if more than 100 ms
5286
+ * have elapsed since the last update of the inject limit, or
5287
+ * (inclusive) if the change is from short to long think
5288
+ * time. The reason for this waiting is as follows.
5289
+ *
5290
+ * bfqq may have a long think time because of a
5291
+ * synchronization with some other queue, i.e., because the
5292
+ * I/O of some other queue may need to be completed for bfqq
5293
+ * to receive new I/O. Details in the comments on the choice
5294
+ * of the queue for injection in bfq_select_queue().
5295
+ *
5296
+ * As stressed in those comments, if such a synchronization is
5297
+ * actually in place, then, without injection on bfqq, the
5298
+ * blocking I/O cannot happen to be served while bfqq is in
5299
+ * service. As a consequence, if bfqq is granted
5300
+ * I/O-dispatch-plugging, then bfqq remains empty, and no I/O
5301
+ * is dispatched, until the idle timeout fires. This is likely
5302
+ * to result in lower bandwidth and higher latencies for bfqq,
5303
+ * and in a severe loss of total throughput.
5304
+ *
5305
+ * On the opposite end, a non-zero inject limit may allow the
5306
+ * I/O that blocks bfqq to be executed soon, and therefore
5307
+ * bfqq to receive new I/O soon.
5308
+ *
5309
+ * But, if the blocking gets actually eliminated, then the
5310
+ * next think-time sample for bfqq may be very low. This in
5311
+ * turn may cause bfqq's think time to be deemed
5312
+ * short. Without the 100 ms barrier, this new state change
5313
+ * would cause the body of the next if to be executed
5314
+ * immediately. But this would set to 0 the inject
5315
+ * limit. Without injection, the blocking I/O would cause the
5316
+ * think time of bfqq to become long again, and therefore the
5317
+ * inject limit to be raised again, and so on. The only effect
5318
+ * of such a steady oscillation between the two think-time
5319
+ * states would be to prevent effective injection on bfqq.
5320
+ *
5321
+ * In contrast, if the inject limit is not reset during such a
5322
+ * long time interval as 100 ms, then the number of short
5323
+ * think time samples can grow significantly before the reset
5324
+ * is performed. As a consequence, the think time state can
5325
+ * become stable before the reset. Therefore there will be no
5326
+ * state change when the 100 ms elapse, and no reset of the
5327
+ * inject limit. The inject limit remains steadily equal to 1
5328
+ * both during and after the 100 ms. So injection can be
5329
+ * performed at all times, and throughput gets boosted.
5330
+ *
5331
+ * An inject limit equal to 1 is however in conflict, in
5332
+ * general, with the fact that the think time of bfqq is
5333
+ * short, because injection may be likely to delay bfqq's I/O
5334
+ * (as explained in the comments in
5335
+ * bfq_update_inject_limit()). But this does not happen in
5336
+ * this special case, because bfqq's low think time is due to
5337
+ * an effective handling of a synchronization, through
5338
+ * injection. In this special case, bfqq's I/O does not get
5339
+ * delayed by injection; on the contrary, bfqq's I/O is
5340
+ * brought forward, because it is not blocked for
5341
+ * milliseconds.
5342
+ *
5343
+ * In addition, serving the blocking I/O much sooner, and much
5344
+ * more frequently than once per I/O-plugging timeout, makes
5345
+ * it much quicker to detect a waker queue (the concept of
5346
+ * waker queue is defined in the comments in
5347
+ * bfq_add_request()). This makes it possible to start sooner
5348
+ * to boost throughput more effectively, by injecting the I/O
5349
+ * of the waker queue unconditionally on every
5350
+ * bfq_dispatch_request().
5351
+ *
5352
+ * One last, important benefit of not resetting the inject
5353
+ * limit before 100 ms is that, during this time interval, the
5354
+ * base value for the total service time is likely to get
5355
+ * finally computed for bfqq, freeing the inject limit from
5356
+ * its relation with the think time.
5357
+ */
5358
+ if (state_changed && bfqq->last_serv_time_ns == 0 &&
5359
+ (time_is_before_eq_jiffies(bfqq->decrease_time_jif +
5360
+ msecs_to_jiffies(100)) ||
5361
+ !has_short_ttime))
5362
+ bfq_reset_inject_limit(bfqd, bfqq);
45585363 }
45595364
45605365 /*
....@@ -4564,18 +5369,8 @@
45645369 static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
45655370 struct request *rq)
45665371 {
4567
- struct bfq_io_cq *bic = RQ_BIC(rq);
4568
-
45695372 if (rq->cmd_flags & REQ_META)
45705373 bfqq->meta_pending++;
4571
-
4572
- bfq_update_io_thinktime(bfqd, bfqq);
4573
- bfq_update_has_short_ttime(bfqd, bfqq, bic);
4574
- bfq_update_io_seektime(bfqd, bfqq, rq);
4575
-
4576
- bfq_log_bfqq(bfqd, bfqq,
4577
- "rq_enqueued: has_short_ttime=%d (seeky %d)",
4578
- bfq_bfqq_has_short_ttime(bfqq), BFQQ_SEEKY(bfqq));
45795374
45805375 bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
45815376
....@@ -4585,28 +5380,31 @@
45855380 bool budget_timeout = bfq_bfqq_budget_timeout(bfqq);
45865381
45875382 /*
4588
- * There is just this request queued: if the request
4589
- * is small and the queue is not to be expired, then
4590
- * just exit.
5383
+ * There is just this request queued: if
5384
+ * - the request is small, and
5385
+ * - we are idling to boost throughput, and
5386
+ * - the queue is not to be expired,
5387
+ * then just exit.
45915388 *
45925389 * In this way, if the device is being idled to wait
45935390 * for a new request from the in-service queue, we
45945391 * avoid unplugging the device and committing the
4595
- * device to serve just a small request. On the
4596
- * contrary, we wait for the block layer to decide
4597
- * when to unplug the device: hopefully, new requests
4598
- * will be merged to this one quickly, then the device
4599
- * will be unplugged and larger requests will be
4600
- * dispatched.
5392
+ * device to serve just a small request. In contrast
5393
+ * we wait for the block layer to decide when to
5394
+ * unplug the device: hopefully, new requests will be
5395
+ * merged to this one quickly, then the device will be
5396
+ * unplugged and larger requests will be dispatched.
46015397 */
4602
- if (small_req && !budget_timeout)
5398
+ if (small_req && idling_boosts_thr_without_issues(bfqd, bfqq) &&
5399
+ !budget_timeout)
46035400 return;
46045401
46055402 /*
4606
- * A large enough request arrived, or the queue is to
4607
- * be expired: in both cases disk idling is to be
4608
- * stopped, so clear wait_request flag and reset
4609
- * timer.
5403
+ * A large enough request arrived, or idling is being
5404
+ * performed to preserve service guarantees, or
5405
+ * finally the queue is to be expired: in all these
5406
+ * cases disk idling is to be stopped, so clear
5407
+ * wait_request flag and reset timer.
46105408 */
46115409 bfq_clear_bfqq_wait_request(bfqq);
46125410 hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
....@@ -4632,8 +5430,6 @@
46325430 bool waiting, idle_timer_disabled = false;
46335431
46345432 if (new_bfqq) {
4635
- if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
4636
- new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
46375433 /*
46385434 * Release the request's reference to the old bfqq
46395435 * and make sure one is taken to the shared queue.
....@@ -4663,6 +5459,10 @@
46635459 bfqq = new_bfqq;
46645460 }
46655461
5462
+ bfq_update_io_thinktime(bfqd, bfqq);
5463
+ bfq_update_has_short_ttime(bfqd, bfqq, RQ_BIC(rq));
5464
+ bfq_update_io_seektime(bfqd, bfqq, rq);
5465
+
46665466 waiting = bfqq && bfq_bfqq_wait_request(bfqq);
46675467 bfq_add_request(rq);
46685468 idle_timer_disabled = waiting && !bfq_bfqq_wait_request(bfqq);
....@@ -4675,7 +5475,7 @@
46755475 return idle_timer_disabled;
46765476 }
46775477
4678
-#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
5478
+#ifdef CONFIG_BFQ_CGROUP_DEBUG
46795479 static void bfq_update_insert_stats(struct request_queue *q,
46805480 struct bfq_queue *bfqq,
46815481 bool idle_timer_disabled,
....@@ -4694,18 +5494,20 @@
46945494 * In addition, the following queue lock guarantees that
46955495 * bfqq_group(bfqq) exists as well.
46965496 */
4697
- spin_lock_irq(q->queue_lock);
5497
+ spin_lock_irq(&q->queue_lock);
46985498 bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags);
46995499 if (idle_timer_disabled)
47005500 bfqg_stats_update_idle_time(bfqq_group(bfqq));
4701
- spin_unlock_irq(q->queue_lock);
5501
+ spin_unlock_irq(&q->queue_lock);
47025502 }
47035503 #else
47045504 static inline void bfq_update_insert_stats(struct request_queue *q,
47055505 struct bfq_queue *bfqq,
47065506 bool idle_timer_disabled,
47075507 unsigned int cmd_flags) {}
4708
-#endif
5508
+#endif /* CONFIG_BFQ_CGROUP_DEBUG */
5509
+
5510
+static struct bfq_queue *bfq_init_rq(struct request *rq);
47095511
47105512 static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
47115513 bool at_head)
....@@ -4716,18 +5518,19 @@
47165518 bool idle_timer_disabled = false;
47175519 unsigned int cmd_flags;
47185520
5521
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
5522
+ if (!cgroup_subsys_on_dfl(io_cgrp_subsys) && rq->bio)
5523
+ bfqg_stats_update_legacy_io(q, rq);
5524
+#endif
47195525 spin_lock_irq(&bfqd->lock);
5526
+ bfqq = bfq_init_rq(rq);
47205527 if (blk_mq_sched_try_insert_merge(q, rq)) {
47215528 spin_unlock_irq(&bfqd->lock);
47225529 return;
47235530 }
47245531
4725
- spin_unlock_irq(&bfqd->lock);
4726
-
47275532 blk_mq_sched_request_inserted(rq);
47285533
4729
- spin_lock_irq(&bfqd->lock);
4730
- bfqq = bfq_init_rq(rq);
47315534 if (!bfqq || at_head || blk_rq_is_passthrough(rq)) {
47325535 if (at_head)
47335536 list_add(&rq->queuelist, &bfqd->dispatch);
....@@ -4776,6 +5579,8 @@
47765579
47775580 static void bfq_update_hw_tag(struct bfq_data *bfqd)
47785581 {
5582
+ struct bfq_queue *bfqq = bfqd->in_service_queue;
5583
+
47795584 bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver,
47805585 bfqd->rq_in_driver);
47815586
....@@ -4788,7 +5593,18 @@
47885593 * sum is not exact, as it's not taking into account deactivated
47895594 * requests.
47905595 */
4791
- if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
5596
+ if (bfqd->rq_in_driver + bfqd->queued <= BFQ_HW_QUEUE_THRESHOLD)
5597
+ return;
5598
+
5599
+ /*
5600
+ * If the active queue does not have enough requests and can idle, bfq
5601
+ * might not dispatch sufficient requests to hardware. Don't zero
5602
+ * hw_tag in this case.
5603
+ */
5604
+ if (bfqq && bfq_bfqq_has_short_ttime(bfqq) &&
5605
+ bfqq->dispatched + bfqq->queued[0] + bfqq->queued[1] <
5606
+ BFQ_HW_QUEUE_THRESHOLD &&
5607
+ bfqd->rq_in_driver < BFQ_HW_QUEUE_THRESHOLD)
47925608 return;
47935609
47945610 if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
....@@ -4797,6 +5613,9 @@
47975613 bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
47985614 bfqd->max_rq_in_driver = 0;
47995615 bfqd->hw_tag_samples = 0;
5616
+
5617
+ bfqd->nonrot_with_queueing =
5618
+ blk_queue_nonrot(bfqd->queue) && bfqd->hw_tag;
48005619 }
48015620
48025621 static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
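The hw_tag estimation adjusted by the hunks above can be modelled in userspace roughly as follows. This is a simplified sketch: it keeps only the peak-queue-depth and sample-count logic, omits the short-think-time exception and the nonrot_with_queueing update, and its struct and function names are invented for illustration.

#include <stdio.h>

#define THRESHOLD 3	/* plays the role of BFQ_HW_QUEUE_THRESHOLD */
#define SAMPLES	  32	/* plays the role of BFQ_HW_QUEUE_SAMPLES */

struct hw_tag_state {
	int max_in_driver;	/* peak number of requests seen in the drive */
	int samples;		/* valid samples collected so far */
	int hw_tag;		/* -1 unknown, 0 no queueing, 1 queueing */
};

/* Called once per request completion with the current depth and backlog. */
static void update_hw_tag(struct hw_tag_state *s, int in_driver, int queued)
{
	if (in_driver > s->max_in_driver)
		s->max_in_driver = in_driver;

	/* The sample is meaningful only if enough I/O was available to push. */
	if (in_driver + queued <= THRESHOLD)
		return;

	if (s->samples++ < SAMPLES)
		return;

	s->hw_tag = s->max_in_driver > THRESHOLD;
	s->max_in_driver = 0;
	s->samples = 0;
}

int main(void)
{
	struct hw_tag_state s = { 0, 0, -1 };
	int i;

	/* Simulate completions with 8 requests in the drive and 4 queued. */
	for (i = 0; i < 40; i++)
		update_hw_tag(&s, 8, 4);

	printf("hw_tag = %d\n", s.hw_tag);	/* prints 1 */
	return 0;
}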
....@@ -4852,6 +5671,7 @@
48525671 1UL<<(BFQ_RATE_SHIFT - 10))
48535672 bfq_update_rate_reset(bfqd, NULL);
48545673 bfqd->last_completion = now_ns;
5674
+ bfqd->last_completed_rq_bfqq = bfqq;
48555675
48565676 /*
48575677 * If we are waiting to discover whether the request pattern
....@@ -4859,11 +5679,14 @@
48595679 * isochronous, and both requisites for this condition to hold
48605680 * are now satisfied, then compute soft_rt_next_start (see the
48615681 * comments on the function bfq_bfqq_softrt_next_start()). We
4862
- * schedule this delayed check when bfqq expires, if it still
4863
- * has in-flight requests.
5682
+ * do not compute soft_rt_next_start if bfqq is in interactive
5683
+ * weight raising (see the comments in bfq_bfqq_expire() for
5684
+ * an explanation). We schedule this delayed update when bfqq
5685
+ * expires, if it still has in-flight requests.
48645686 */
48655687 if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 &&
4866
- RB_EMPTY_ROOT(&bfqq->sort_list))
5688
+ RB_EMPTY_ROOT(&bfqq->sort_list) &&
5689
+ bfqq->wr_coeff != bfqd->bfq_wr_coeff)
48675690 bfqq->soft_rt_next_start =
48685691 bfq_bfqq_softrt_next_start(bfqd, bfqq);
48695692
....@@ -4921,6 +5744,167 @@
49215744 }
49225745
49235746 /*
5747
+ * The processes associated with bfqq may happen to generate their
5748
+ * cumulative I/O at a lower rate than the rate at which the device
5749
+ * could serve the same I/O. This is rather probable, e.g., if only
5750
+ * one process is associated with bfqq and the device is an SSD. It
5751
+ * results in bfqq becoming often empty while in service. In this
5752
+ * respect, if BFQ is allowed to switch to another queue when bfqq
5753
+ * remains empty, then the device goes on being fed with I/O requests,
5754
+ * and the throughput is not affected. In contrast, if BFQ is not
5755
+ * allowed to switch to another queue---because bfqq is sync and
5756
+ * I/O-dispatch needs to be plugged while bfqq is temporarily
5757
+ * empty---then, during the service of bfqq, there will be frequent
5758
+ * "service holes", i.e., time intervals during which bfqq gets empty
5759
+ * and the device can only consume the I/O already queued in its
5760
+ * hardware queues. During service holes, the device may even end up
5761
+ * remaining idle. In the end, during the service of bfqq, the device
5762
+ * is driven at a lower speed than the one it can reach with the kind
5763
+ * of I/O flowing through bfqq.
5764
+ *
5765
+ * To counter this loss of throughput, BFQ implements a "request
5766
+ * injection mechanism", which tries to fill the above service holes
5767
+ * with I/O requests taken from other queues. The hard part in this
5768
+ * mechanism is finding the right amount of I/O to inject, so as to
5769
+ * both boost throughput and not break bfqq's bandwidth and latency
5770
+ * guarantees. In this respect, the mechanism maintains a per-queue
5771
+ * inject limit, computed as below. While bfqq is empty, the injection
5772
+ * mechanism dispatches extra I/O requests only until the total number
5773
+ * of I/O requests in flight---i.e., already dispatched but not yet
5774
+ * completed---remains lower than this limit.
5775
+ *
5776
+ * A first definition comes in handy to introduce the algorithm by
5777
+ * which the inject limit is computed. We define as first request for
5778
+ * bfqq, an I/O request for bfqq that arrives while bfqq is in
5779
+ * service, and causes bfqq to switch from empty to non-empty. The
5780
+ * algorithm updates the limit as a function of the effect of
5781
+ * injection on the service times of only the first requests of
5782
+ * bfqq. The reason for this restriction is that these are the
5783
+ * requests whose service time is affected most, because they are the
5784
+ * first to arrive after injection possibly occurred.
5785
+ *
5786
+ * To evaluate the effect of injection, the algorithm measures the
5787
+ * "total service time" of first requests. We define as total service
5788
+ * time of an I/O request, the time that elapses since when the
5789
+ * request is enqueued into bfqq, to when it is completed. This
5790
+ * quantity allows the whole effect of injection to be measured. It is
5791
+ * easy to see why. Suppose that some requests of other queues are
5792
+ * actually injected while bfqq is empty, and that a new request R
5793
+ * then arrives for bfqq. If the device does start to serve all or
5794
+ * part of the injected requests during the service hole, then,
5795
+ * because of this extra service, it may delay the next invocation of
5796
+ * the dispatch hook of BFQ. Then, even after R gets eventually
5797
+ * dispatched, the device may delay the actual service of R if it is
5798
+ * still busy serving the extra requests, or if it decides to serve,
5799
+ * before R, some extra request still present in its queues. As a
5800
+ * conclusion, the cumulative extra delay caused by injection can be
5801
+ * easily evaluated by just comparing the total service time of first
5802
+ * requests with and without injection.
5803
+ *
5804
+ * The limit-update algorithm works as follows. On the arrival of a
5805
+ * first request of bfqq, the algorithm measures the total service time of the
5806
+ * request only if one of the three cases below holds, and, for each
5807
+ * case, it updates the limit as described below:
5808
+ *
5809
+ * (1) If there is no in-flight request. This gives a baseline for the
5810
+ * total service time of the requests of bfqq. If the baseline has
5811
+ * not been computed yet, then, after computing it, the limit is
5812
+ * set to 1, to start boosting throughput, and to prepare the
5813
+ * ground for the next case. If the baseline has already been
5814
+ * computed, then it is updated, in case it turns out to be lower
5815
+ * than the previous value.
5816
+ *
5817
+ * (2) If the limit is higher than 0 and there are in-flight
5818
+ * requests. By comparing the total service time in this case with
5819
+ * the above baseline, it is possible to know to what extent the
5820
+ * current value of the limit is inflating the total service
5821
+ * time. If the inflation is below a certain threshold, then bfqq
5822
+ * is assumed to be suffering from no perceivable loss of its
5823
+ * service guarantees, and the limit is even tentatively
5824
+ * increased. If the inflation is above the threshold, then the
5825
+ * limit is decreased. Due to the lack of any hysteresis, this
5826
+ * logic makes the limit oscillate even in steady workload
5827
+ * conditions. Yet we opted for it, because it is fast in reaching
5828
+ * the best value for the limit, as a function of the current I/O
5829
+ * workload. To reduce oscillations, this step is disabled for a
5830
+ * short time interval after the limit happens to be decreased.
5831
+ *
5832
+ * (3) Periodically, after resetting the limit, to make sure that the
5833
+ * limit eventually drops in case the workload changes. This is
5834
+ * needed because, after the limit has gone safely up for a
5835
+ * certain workload, it is impossible to guess whether the
5836
+ * baseline total service time may have changed, without measuring
5837
+ * it again without injection. A more effective version of this
5838
+ * step might be to just sample the baseline, by interrupting
5839
+ * injection only once, and then to reset/lower the limit only if
5840
+ * the total service time with the current limit does happen to be
5841
+ * too large.
5842
+ *
5843
+ * More details on each step are provided in the comments on the
5844
+ * pieces of code that implement these steps: the branch handling the
5845
+ * transition from empty to non empty in bfq_add_request(), the branch
5846
+ * handling injection in bfq_select_queue(), and the function
5847
+ * bfq_choose_bfqq_for_injection(). These comments also explain some
5848
+ * exceptions, made by the injection mechanism in some special cases.
5849
+ */
5850
+static void bfq_update_inject_limit(struct bfq_data *bfqd,
5851
+ struct bfq_queue *bfqq)
5852
+{
5853
+ u64 tot_time_ns = ktime_get_ns() - bfqd->last_empty_occupied_ns;
5854
+ unsigned int old_limit = bfqq->inject_limit;
5855
+
5856
+ if (bfqq->last_serv_time_ns > 0 && bfqd->rqs_injected) {
5857
+ u64 threshold = (bfqq->last_serv_time_ns * 3)>>1;
5858
+
5859
+ if (tot_time_ns >= threshold && old_limit > 0) {
5860
+ bfqq->inject_limit--;
5861
+ bfqq->decrease_time_jif = jiffies;
5862
+ } else if (tot_time_ns < threshold &&
5863
+ old_limit <= bfqd->max_rq_in_driver)
5864
+ bfqq->inject_limit++;
5865
+ }
5866
+
5867
+ /*
5868
+ * Either we still have to compute the base value for the
5869
+ * total service time, and there seem to be the right
5870
+ * conditions to do it, or we can lower the last base value
5871
+ * computed.
5872
+ *
5873
+ * NOTE: (bfqd->rq_in_driver == 1) means that there is no I/O
5874
+ * request in flight, because this function is in the code
5875
+ * path that handles the completion of a request of bfqq, and,
5876
+ * in particular, this function is executed before
5877
+ * bfqd->rq_in_driver is decremented in such a code path.
5878
+ */
5879
+ if ((bfqq->last_serv_time_ns == 0 && bfqd->rq_in_driver == 1) ||
5880
+ tot_time_ns < bfqq->last_serv_time_ns) {
5881
+ if (bfqq->last_serv_time_ns == 0) {
5882
+ /*
5883
+ * Now we certainly have a base value: make sure we
5884
+ * start trying injection.
5885
+ */
5886
+ bfqq->inject_limit = max_t(unsigned int, 1, old_limit);
5887
+ }
5888
+ bfqq->last_serv_time_ns = tot_time_ns;
5889
+ } else if (!bfqd->rqs_injected && bfqd->rq_in_driver == 1)
5890
+ /*
5891
+ * No I/O injected and no request still in service in
5892
+ * the drive: these are the exact conditions for
5893
+ * computing the base value of the total service time
5894
+ * for bfqq. So let's update this value, because it is
5895
+ * rather variable. For example, it varies if the size
5896
+ * or the spatial locality of the I/O requests in bfqq
5897
+ * change.
5898
+ */
5899
+ bfqq->last_serv_time_ns = tot_time_ns;
5900
+
5901
+
5902
+ /* update complete, not waiting for any request completion any longer */
5903
+ bfqd->waited_rq = NULL;
5904
+ bfqd->rqs_injected = false;
5905
+}
5906
+
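The limit-update rules described in the long comment, and implemented by bfq_update_inject_limit() above, can be exercised in isolation with a small userspace model such as the one below. Times are in arbitrary units, and the struct, function and parameter names belong to this sketch, not to BFQ.

#include <stdio.h>

struct inject_state {
	unsigned long long baseline;	/* analogue of last_serv_time_ns */
	unsigned int limit;		/* analogue of inject_limit */
};

/*
 * tot_time:      measured total service time of a "first request";
 * injected:      whether extra I/O was injected while bfqq was empty;
 * in_flight:     requests still in the drive at completion time (>= 1);
 * max_in_driver: peak number of requests observed in the drive.
 */
static void update_inject_limit(struct inject_state *s,
				unsigned long long tot_time, int injected,
				int in_flight, unsigned int max_in_driver)
{
	unsigned int old_limit = s->limit;

	if (s->baseline > 0 && injected) {
		unsigned long long threshold = (s->baseline * 3) / 2;

		if (tot_time >= threshold && old_limit > 0)
			s->limit--;			/* injection hurt bfqq */
		else if (tot_time < threshold && old_limit <= max_in_driver)
			s->limit++;			/* room to inject more */
	}

	/* (Re)compute or lower the baseline when it is safe to do so. */
	if ((s->baseline == 0 && in_flight == 1) || tot_time < s->baseline) {
		if (s->baseline == 0)
			s->limit = old_limit > 1 ? old_limit : 1;
		s->baseline = tot_time;
	} else if (!injected && in_flight == 1) {
		s->baseline = tot_time;
	}
}

int main(void)
{
	struct inject_state s = { 0, 0 };

	update_inject_limit(&s, 100, 0, 1, 1);	/* baseline = 100, limit -> 1 */
	update_inject_limit(&s, 120, 1, 3, 4);	/* below 150: limit -> 2 */
	update_inject_limit(&s, 200, 1, 3, 4);	/* above 150: limit -> 1 */
	printf("baseline=%llu limit=%u\n", s.baseline, s.limit);
	return 0;
}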
5907
+/*
49245908 * Handle either a requeue or a finish for rq. The things to do are
49255909 * the same in both cases: all references to rq are to be dropped. In
49265910 * particular, rq is considered completed from the point of view of
....@@ -4930,18 +5914,6 @@
49305914 {
49315915 struct bfq_queue *bfqq = RQ_BFQQ(rq);
49325916 struct bfq_data *bfqd;
4933
-
4934
- /*
4935
- * Requeue and finish hooks are invoked in blk-mq without
4936
- * checking whether the involved request is actually still
4937
- * referenced in the scheduler. To handle this fact, the
4938
- * following two checks make this function exit in case of
4939
- * spurious invocations, for which there is nothing to do.
4940
- *
4941
- * First, check whether rq has nothing to do with an elevator.
4942
- */
4943
- if (unlikely(!(rq->rq_flags & RQF_ELVPRIV)))
4944
- return;
49455917
49465918 /*
49475919 * rq either is not associated with any icq, or is an already
....@@ -4963,6 +5935,9 @@
49635935 unsigned long flags;
49645936
49655937 spin_lock_irqsave(&bfqd->lock, flags);
5938
+
5939
+ if (rq == bfqd->waited_rq)
5940
+ bfq_update_inject_limit(bfqd, bfqq);
49665941
49675942 bfq_completed_request(bfqq, bfqd);
49685943 bfq_finish_requeue_request_body(bfqq);
....@@ -5012,6 +5987,8 @@
50125987 }
50135988
50145989 /*
5990
+ * Removes the association between the current task and bfqq, assuming
5991
+ * that bic points to the bfq iocontext of the task.
50155992 * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
50165993 * was the last process referring to that bfqq.
50175994 */
....@@ -5031,7 +6008,7 @@
50316008
50326009 bfq_put_cooperator(bfqq);
50336010
5034
- bfq_put_queue(bfqq);
6011
+ bfq_release_process_ref(bfqq->bfqd, bfqq);
50356012 return NULL;
50366013 }
50376014
....@@ -5104,7 +6081,7 @@
51046081 * comments on bfq_init_rq for the reason behind this delayed
51056082 * preparation.
51066083 */
5107
-static void bfq_prepare_request(struct request *rq, struct bio *bio)
6084
+static void bfq_prepare_request(struct request *rq)
51086085 {
51096086 /*
51106087 * Regardless of whether we have an icq attached, we have to
....@@ -5127,7 +6104,7 @@
51276104 * preparation is that, after the prepare_request hook is invoked for
51286105 * rq, rq may still be transformed into a request with no icq, i.e., a
51296106 * request not associated with any queue. No bfq hook is invoked to
5130
- * signal this tranformation. As a consequence, should these
6107
+ * signal this transformation. As a consequence, should these
51316108 * preparation operations be performed when the prepare_request hook
51326109 * is invoked, and should rq be transformed one moment later, bfq
51336110 * would end up in an inconsistent state, because it would have
....@@ -5218,7 +6195,29 @@
52186195 }
52196196 }
52206197
5221
- if (unlikely(bfq_bfqq_just_created(bfqq)))
6198
+ /*
6199
+ * Consider bfqq as possibly belonging to a burst of newly
6200
+ * created queues only if:
6201
+ * 1) A burst is actually happening (bfqd->burst_size > 0)
6202
+ * or
6203
+ * 2) There is no other active queue. In fact, if, in
6204
+ * contrast, there are active queues not belonging to the
6205
+ * possible burst bfqq may belong to, then there is no gain
6206
+ * in considering bfqq as belonging to a burst, and
6207
+ * therefore in not weight-raising bfqq. See comments on
6208
+ * bfq_handle_burst().
6209
+ *
6210
+ * This filtering also helps eliminating false positives,
6211
+ * occurring when bfqq does not belong to an actual large
6212
+ * burst, but some background task (e.g., a service) happens
6213
+ * to trigger the creation of new queues very close to when
6214
+ * bfqq and its possible companion queues are created. See
6215
+ * comments on bfq_handle_burst() for further details also on
6216
+ * this issue.
6217
+ */
6218
+ if (unlikely(bfq_bfqq_just_created(bfqq) &&
6219
+ (bfqd->burst_size > 0 ||
6220
+ bfq_tot_busy_queues(bfqd) == 0)))
52226221 bfq_handle_burst(bfqd, bfqq);
52236222
52246223 return bfqq;
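The new filter amounts to a pure predicate over the three quantities named in the comment; the sketch below (illustrative names only) shows the two typical outcomes.

#include <stdbool.h>
#include <stdio.h>

/* Mirrors the condition guarding the call to bfq_handle_burst() above. */
static bool burst_candidate(bool just_created, unsigned int burst_size,
			    unsigned int tot_busy_queues)
{
	return just_created && (burst_size > 0 || tot_busy_queues == 0);
}

int main(void)
{
	/* No burst tracked and unrelated queues are busy: filtered out. */
	printf("%d\n", burst_candidate(true, 0, 3));	/* 0 */
	/* Created while a burst is being tracked: handed to the burst logic. */
	printf("%d\n", burst_candidate(true, 2, 5));	/* 1 */
	return 0;
}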
....@@ -5267,8 +6266,8 @@
52676266 bfq_bfqq_expire(bfqd, bfqq, true, reason);
52686267
52696268 schedule_dispatch:
5270
- spin_unlock_irqrestore(&bfqd->lock, flags);
52716269 bfq_schedule_dispatch(bfqd);
6270
+ spin_unlock_irqrestore(&bfqd->lock, flags);
52726271 }
52736272
52746273 /*
....@@ -5381,8 +6380,8 @@
53816380 struct blk_mq_tags *tags = hctx->sched_tags;
53826381 unsigned int min_shallow;
53836382
5384
- min_shallow = bfq_update_depths(bfqd, &tags->bitmap_tags);
5385
- sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, min_shallow);
6383
+ min_shallow = bfq_update_depths(bfqd, tags->bitmap_tags);
6384
+ sbitmap_queue_min_shallow_depth(tags->bitmap_tags, min_shallow);
53866385 }
53876386
53886387 static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index)
....@@ -5405,10 +6404,10 @@
54056404
54066405 hrtimer_cancel(&bfqd->idle_slice_timer);
54076406
5408
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
54096407 /* release oom-queue reference to root group */
54106408 bfqg_and_blkg_put(bfqd->root_group);
54116409
6410
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
54126411 blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq);
54136412 #else
54146413 spin_lock_irq(&bfqd->lock);
....@@ -5454,9 +6453,9 @@
54546453 }
54556454 eq->elevator_data = bfqd;
54566455
5457
- spin_lock_irq(q->queue_lock);
6456
+ spin_lock_irq(&q->queue_lock);
54586457 q->elevator = eq;
5459
- spin_unlock_irq(q->queue_lock);
6458
+ spin_unlock_irq(&q->queue_lock);
54606459
54616460 /*
54626461 * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
....@@ -5488,7 +6487,7 @@
54886487 HRTIMER_MODE_REL);
54896488 bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
54906489
5491
- bfqd->queue_weights_tree = RB_ROOT;
6490
+ bfqd->queue_weights_tree = RB_ROOT_CACHED;
54926491 bfqd->num_groups_with_pending_reqs = 0;
54936492
54946493 INIT_LIST_HEAD(&bfqd->active_list);
....@@ -5496,6 +6495,7 @@
54966495 INIT_HLIST_HEAD(&bfqd->burst_list);
54976496
54986497 bfqd->hw_tag = -1;
6498
+ bfqd->nonrot_with_queueing = blk_queue_nonrot(bfqd->queue);
54996499
55006500 bfqd->bfq_max_budget = bfq_default_max_budget;
55016501
....@@ -5796,7 +6796,7 @@
57966796 };
57976797
57986798 static struct elevator_type iosched_bfq_mq = {
5799
- .ops.mq = {
6799
+ .ops = {
58006800 .limit_depth = bfq_limit_depth,
58016801 .prepare_request = bfq_prepare_request,
58026802 .requeue_request = bfq_finish_requeue_request,
....@@ -5818,7 +6818,6 @@
58186818 .exit_sched = bfq_exit_queue,
58196819 },
58206820
5821
- .uses_mq = true,
58226821 .icq_size = sizeof(struct bfq_io_cq),
58236822 .icq_align = __alignof__(struct bfq_io_cq),
58246823 .elevator_attrs = bfq_attrs,