.. | .. |
---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-or-later |
---|
1 | 2 | /* |
---|
2 | 3 | * Budget Fair Queueing (BFQ) I/O scheduler. |
---|
3 | 4 | * |
---|
.. | .. |
---|
12 | 13 | * |
---|
13 | 14 | * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org> |
---|
14 | 15 | * |
---|
15 | | - * This program is free software; you can redistribute it and/or |
---|
16 | | - * modify it under the terms of the GNU General Public License as |
---|
17 | | - * published by the Free Software Foundation; either version 2 of the |
---|
18 | | - * License, or (at your option) any later version. |
---|
19 | | - * |
---|
20 | | - * This program is distributed in the hope that it will be useful, |
---|
21 | | - * but WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
22 | | - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
---|
23 | | - * General Public License for more details. |
---|
24 | | - * |
---|
25 | 16 | * BFQ is a proportional-share I/O scheduler, with some extra |
---|
26 | 17 | * low-latency capabilities. BFQ also supports full hierarchical |
---|
27 | 18 | * scheduling through cgroups. Next paragraphs provide an introduction |
---|
28 | 19 | * on BFQ inner workings. Details on BFQ benefits, usage and |
---|
29 | | - * limitations can be found in Documentation/block/bfq-iosched.txt. |
---|
| 20 | + * limitations can be found in Documentation/block/bfq-iosched.rst. |
---|
30 | 21 | * |
---|
31 | 22 | * BFQ is a proportional-share storage-I/O scheduling algorithm based |
---|
32 | 23 | * on the slice-by-slice service scheme of CFQ. But BFQ assigns |
---|
.. | .. |
---|
167 | 158 | BFQ_BFQQ_FNS(coop); |
---|
168 | 159 | BFQ_BFQQ_FNS(split_coop); |
---|
169 | 160 | BFQ_BFQQ_FNS(softrt_update); |
---|
| 161 | +BFQ_BFQQ_FNS(has_waker); |
---|
170 | 162 | #undef BFQ_BFQQ_FNS \ |
---|
171 | 163 | |
---|
172 | 164 | /* Expiration time of sync (0) and async (1) requests, in ns. */ |
---|
.. | .. |
---|
190 | 182 | /* |
---|
191 | 183 | * When a sync request is dispatched, the queue that contains that |
---|
192 | 184 | * request, and all the ancestor entities of that queue, are charged |
---|
193 | | - * with the number of sectors of the request. In constrast, if the |
---|
| 185 | + * with the number of sectors of the request. In contrast, if the |
---|
194 | 186 | * request is async, then the queue and its ancestor entities are |
---|
195 | 187 | * charged with the number of sectors of the request, multiplied by |
---|
196 | 188 | * the factor below. This throttles the bandwidth for async I/O, |
---|
.. | .. |
---|
218 | 210 | * queue merging. |
---|
219 | 211 | * |
---|
220 | 212 | * As can be deduced from the low time limit below, queue merging, if |
---|
221 | | - * successful, happens at the very beggining of the I/O of the involved |
---|
| 213 | + * successful, happens at the very beginning of the I/O of the involved |
---|
222 | 214 | * cooperating processes, as a consequence of the arrival of the very |
---|
223 | 215 | * first requests from each cooperator. After that, there is very |
---|
224 | 216 | * little chance to find cooperators. |
---|
.. | .. |
---|
231 | 223 | #define BFQ_MIN_TT (2 * NSEC_PER_MSEC) |
---|
232 | 224 | |
---|
233 | 225 | /* hw_tag detection: parallel requests threshold and min samples needed. */ |
---|
234 | | -#define BFQ_HW_QUEUE_THRESHOLD 4 |
---|
| 226 | +#define BFQ_HW_QUEUE_THRESHOLD 3 |
---|
235 | 227 | #define BFQ_HW_QUEUE_SAMPLES 32 |
---|
236 | 228 | |
---|
237 | 229 | #define BFQQ_SEEK_THR (sector_t)(8 * 100) |
---|
238 | 230 | #define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32) |
---|
| 231 | +#define BFQ_RQ_SEEKY(bfqd, last_pos, rq) \ |
---|
| 232 | + (get_sdist(last_pos, rq) > \ |
---|
| 233 | + BFQQ_SEEK_THR && \ |
---|
| 234 | + (!blk_queue_nonrot(bfqd->queue) || \ |
---|
| 235 | + blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT)) |
---|
239 | 236 | #define BFQQ_CLOSE_THR (sector_t)(8 * 1024) |
---|
240 | 237 | #define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 19) |
---|
| 238 | +/* |
---|
| 239 | + * Sync random I/O is likely to be confused with soft real-time I/O, |
---|
| 240 | + * because it is characterized by limited throughput and an apparently |
---|
| 241 | + * isochronous arrival pattern. To avoid false positives, queues |
---|
| 242 | + * containing only random (seeky) I/O are prevented from being tagged |
---|
| 243 | + * as soft real-time. |
---|
| 244 | + */ |
---|
| 245 | +#define BFQQ_TOTALLY_SEEKY(bfqq) (bfqq->seek_history == -1) |
---|
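
As an aside on how the seek-history bitmap behaves, the sketch below is a small userspace model (not kernel code) of the 32-bit history that BFQQ_SEEKY() and BFQQ_TOTALLY_SEEKY() inspect. It assumes, as the macros above suggest, that each sampled request shifts one bit into the history, with 1 meaning the request looked seeky; names such as `toy_queue` and `record_request` are invented for illustration.

```c
/* Minimal userspace model of BFQ's 32-bit seek history (a sketch,
 * not kernel code): each sampled request shifts in one bit, 1 when
 * the request looked seeky. */
#include <stdint.h>
#include <stdio.h>

#define SEEKY_BIT_THRESHOLD 19   /* same threshold as BFQQ_SEEKY() */

struct toy_queue {
	uint32_t seek_history;
};

static void record_request(struct toy_queue *q, int seeky)
{
	q->seek_history = (q->seek_history << 1) | (seeky ? 1 : 0);
}

static int queue_is_seeky(const struct toy_queue *q)
{
	return __builtin_popcount(q->seek_history) > SEEKY_BIT_THRESHOLD;
}

static int queue_is_totally_seeky(const struct toy_queue *q)
{
	/* all 32 samples were seeky: the bitmap is all ones */
	return q->seek_history == UINT32_MAX;
}

int main(void)
{
	struct toy_queue q = { 0 };
	int i;

	for (i = 0; i < 32; i++)
		record_request(&q, 1);      /* purely random I/O */
	printf("seeky=%d totally_seeky=%d\n",
	       queue_is_seeky(&q), queue_is_totally_seeky(&q));

	record_request(&q, 0);              /* one sequential sample */
	printf("seeky=%d totally_seeky=%d\n",
	       queue_is_seeky(&q), queue_is_totally_seeky(&q));
	return 0;
}
```
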
241 | 246 | |
---|
242 | 247 | /* Min number of samples required to perform peak-rate update */ |
---|
243 | 248 | #define BFQ_RATE_MIN_SAMPLES 32 |
---|
.. | .. |
---|
368 | 373 | |
---|
369 | 374 | void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync) |
---|
370 | 375 | { |
---|
| 376 | + struct bfq_queue *old_bfqq = bic->bfqq[is_sync]; |
---|
| 377 | + |
---|
| 378 | + /* Clear bic pointer if bfqq is detached from this bic */ |
---|
| 379 | + if (old_bfqq && old_bfqq->bic == bic) |
---|
| 380 | + old_bfqq->bic = NULL; |
---|
| 381 | + |
---|
371 | 382 | bic->bfqq[is_sync] = bfqq; |
---|
372 | 383 | } |
---|
373 | 384 | |
---|
.. | .. |
---|
400 | 411 | unsigned long flags; |
---|
401 | 412 | struct bfq_io_cq *icq; |
---|
402 | 413 | |
---|
403 | | - spin_lock_irqsave(q->queue_lock, flags); |
---|
| 414 | + spin_lock_irqsave(&q->queue_lock, flags); |
---|
404 | 415 | icq = icq_to_bic(ioc_lookup_icq(ioc, q)); |
---|
405 | | - spin_unlock_irqrestore(q->queue_lock, flags); |
---|
| 416 | + spin_unlock_irqrestore(&q->queue_lock, flags); |
---|
406 | 417 | |
---|
407 | 418 | return icq; |
---|
408 | 419 | } |
---|
.. | .. |
---|
416 | 427 | */ |
---|
417 | 428 | void bfq_schedule_dispatch(struct bfq_data *bfqd) |
---|
418 | 429 | { |
---|
| 430 | + lockdep_assert_held(&bfqd->lock); |
---|
| 431 | + |
---|
419 | 432 | if (bfqd->queued != 0) { |
---|
420 | 433 | bfq_log(bfqd, "schedule dispatch"); |
---|
421 | 434 | blk_mq_run_hw_queues(bfqd->queue, true); |
---|
.. | .. |
---|
423 | 436 | } |
---|
424 | 437 | |
---|
425 | 438 | #define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE) |
---|
426 | | -#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT) |
---|
427 | 439 | |
---|
428 | 440 | #define bfq_sample_valid(samples) ((samples) > 80) |
---|
429 | 441 | |
---|
430 | 442 | /* |
---|
431 | 443 | * Lifted from AS - choose which of rq1 and rq2 that is best served now. |
---|
432 | | - * We choose the request that is closesr to the head right now. Distance |
---|
| 444 | + * We choose the request that is closer to the head right now. Distance |
---|
433 | 445 | * behind the head is penalized and only allowed to a certain extent. |
---|
434 | 446 | */ |
---|
435 | 447 | static struct request *bfq_choose_req(struct bfq_data *bfqd, |
---|
.. | .. |
---|
591 | 603 | bfq_merge_time_limit); |
---|
592 | 604 | } |
---|
593 | 605 | |
---|
594 | | -void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
---|
| 606 | +/* |
---|
| 607 | + * The following function is marked as __cold not just because it is |
---|
| 608 | + * actually cold, but for the same performance goal described in the |
---|
| 609 | + * comments on the likely() at the beginning of |
---|
| 610 | + * bfq_setup_cooperator(). Unexpectedly, to reach an even lower |
---|
| 611 | + * execution time for the case where this function is not invoked, we |
---|
| 612 | + * had to add an unlikely() in each involved if(). |
---|
| 613 | + */ |
---|
| 614 | +void __cold |
---|
| 615 | +bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
---|
595 | 616 | { |
---|
596 | 617 | struct rb_node **p, *parent; |
---|
597 | 618 | struct bfq_queue *__bfqq; |
---|
.. | .. |
---|
600 | 621 | rb_erase(&bfqq->pos_node, bfqq->pos_root); |
---|
601 | 622 | bfqq->pos_root = NULL; |
---|
602 | 623 | } |
---|
| 624 | + |
---|
| 625 | + /* oom_bfqq does not participate in queue merging */ |
---|
| 626 | + if (bfqq == &bfqd->oom_bfqq) |
---|
| 627 | + return; |
---|
603 | 628 | |
---|
604 | 629 | /* |
---|
605 | 630 | * bfqq cannot be merged any longer (see comments in |
---|
.. | .. |
---|
625 | 650 | } |
---|
626 | 651 | |
---|
627 | 652 | /* |
---|
628 | | - * Tell whether there are active queues with different weights or |
---|
629 | | - * active groups. |
---|
630 | | - */ |
---|
631 | | -static bool bfq_varied_queue_weights_or_active_groups(struct bfq_data *bfqd) |
---|
632 | | -{ |
---|
633 | | - /* |
---|
634 | | - * For queue weights to differ, queue_weights_tree must contain |
---|
635 | | - * at least two nodes. |
---|
636 | | - */ |
---|
637 | | - return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) && |
---|
638 | | - (bfqd->queue_weights_tree.rb_node->rb_left || |
---|
639 | | - bfqd->queue_weights_tree.rb_node->rb_right) |
---|
640 | | -#ifdef CONFIG_BFQ_GROUP_IOSCHED |
---|
641 | | - ) || |
---|
642 | | - (bfqd->num_groups_with_pending_reqs > 0 |
---|
643 | | -#endif |
---|
644 | | - ); |
---|
645 | | -} |
---|
646 | | - |
---|
647 | | -/* |
---|
648 | | - * The following function returns true if every queue must receive the |
---|
649 | | - * same share of the throughput (this condition is used when deciding |
---|
650 | | - * whether idling may be disabled, see the comments in the function |
---|
651 | | - * bfq_better_to_idle()). |
---|
| 653 | + * The following function returns false either if every active queue |
---|
| 654 | + * must receive the same share of the throughput (symmetric scenario), |
---|
| 655 | + * or, as a special case, if bfqq must receive a share of the |
---|
| 656 | + * throughput lower than or equal to the share that every other active |
---|
| 657 | + * queue must receive. If bfqq does sync I/O, then these are the only |
---|
| 658 | + * two cases where bfqq happens to be guaranteed its share of the |
---|
| 659 | + * throughput even if I/O dispatching is not plugged when bfqq remains |
---|
| 660 | + * temporarily empty (for more details, see the comments in the |
---|
| 661 | + * function bfq_better_to_idle()). For this reason, the return value |
---|
| 662 | + * of this function is used to check whether I/O-dispatch plugging can |
---|
| 663 | + * be avoided. |
---|
652 | 664 | * |
---|
653 | | - * Such a scenario occurs when: |
---|
| 665 | + * The above first case (symmetric scenario) occurs when: |
---|
654 | 666 | * 1) all active queues have the same weight, |
---|
655 | | - * 2) all active groups at the same level in the groups tree have the same |
---|
656 | | - * weight, |
---|
| 667 | + * 2) all active queues belong to the same I/O-priority class, |
---|
657 | 668 | * 3) all active groups at the same level in the groups tree have the same |
---|
| 669 | + * weight, |
---|
| 670 | + * 4) all active groups at the same level in the groups tree have the same |
---|
658 | 671 | * number of children. |
---|
659 | 672 | * |
---|
660 | 673 | * Unfortunately, keeping the necessary state for evaluating exactly |
---|
661 | 674 | * the last two symmetry sub-conditions above would be quite complex |
---|
662 | | - * and time consuming. Therefore this function evaluates, instead, |
---|
663 | | - * only the following stronger two sub-conditions, for which it is |
---|
| 675 | + * and time consuming. Therefore this function evaluates, instead, |
---|
| 676 | + * only the following stronger three sub-conditions, for which it is |
---|
664 | 677 | * much easier to maintain the needed state: |
---|
665 | 678 | * 1) all active queues have the same weight, |
---|
666 | | - * 2) there are no active groups. |
---|
| 679 | + * 2) all active queues belong to the same I/O-priority class, |
---|
| 680 | + * 3) there are no active groups. |
---|
667 | 681 | * In particular, the last condition is always true if hierarchical |
---|
668 | 682 | * support or the cgroups interface are not enabled, thus no state |
---|
669 | 683 | * needs to be maintained in this case. |
---|
670 | 684 | */ |
---|
671 | | -static bool bfq_symmetric_scenario(struct bfq_data *bfqd) |
---|
| 685 | +static bool bfq_asymmetric_scenario(struct bfq_data *bfqd, |
---|
| 686 | + struct bfq_queue *bfqq) |
---|
672 | 687 | { |
---|
673 | | - return !bfq_varied_queue_weights_or_active_groups(bfqd); |
---|
| 688 | + bool smallest_weight = bfqq && |
---|
| 689 | + bfqq->weight_counter && |
---|
| 690 | + bfqq->weight_counter == |
---|
| 691 | + container_of( |
---|
| 692 | + rb_first_cached(&bfqd->queue_weights_tree), |
---|
| 693 | + struct bfq_weight_counter, |
---|
| 694 | + weights_node); |
---|
| 695 | + |
---|
| 696 | + /* |
---|
| 697 | + * For queue weights to differ, queue_weights_tree must contain |
---|
| 698 | + * at least two nodes. |
---|
| 699 | + */ |
---|
| 700 | + bool varied_queue_weights = !smallest_weight && |
---|
| 701 | + !RB_EMPTY_ROOT(&bfqd->queue_weights_tree.rb_root) && |
---|
| 702 | + (bfqd->queue_weights_tree.rb_root.rb_node->rb_left || |
---|
| 703 | + bfqd->queue_weights_tree.rb_root.rb_node->rb_right); |
---|
| 704 | + |
---|
| 705 | + bool multiple_classes_busy = |
---|
| 706 | + (bfqd->busy_queues[0] && bfqd->busy_queues[1]) || |
---|
| 707 | + (bfqd->busy_queues[0] && bfqd->busy_queues[2]) || |
---|
| 708 | + (bfqd->busy_queues[1] && bfqd->busy_queues[2]); |
---|
| 709 | + |
---|
| 710 | + return varied_queue_weights || multiple_classes_busy |
---|
| 711 | +#ifdef CONFIG_BFQ_GROUP_IOSCHED |
---|
| 712 | + || bfqd->num_groups_with_pending_reqs > 0 |
---|
| 713 | +#endif |
---|
| 714 | + ; |
---|
674 | 715 | } |
---|
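
To make the "multiple busy I/O-priority classes" sub-condition above concrete, here is a minimal userspace sketch of that test, assuming the usual ordering of bfqd->busy_queues[] by class (RT, BE, IDLE). It is an illustration only, not the kernel code path.

```c
/* Userspace sketch of the multiple_classes_busy check used in
 * bfq_asymmetric_scenario(); busy[] mirrors bfqd->busy_queues[],
 * assumed indexed as 0=RT, 1=BE, 2=IDLE. */
#include <stdbool.h>
#include <stdio.h>

static bool multiple_classes_busy(const unsigned int busy[3])
{
	return (busy[0] && busy[1]) ||
	       (busy[0] && busy[2]) ||
	       (busy[1] && busy[2]);
}

int main(void)
{
	unsigned int only_be[3] = { 0, 4, 0 };   /* symmetric: one class busy */
	unsigned int rt_and_be[3] = { 1, 3, 0 }; /* asymmetric: two classes busy */

	printf("%d %d\n", multiple_classes_busy(only_be),
	       multiple_classes_busy(rt_and_be));
	return 0;
}
```
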
675 | 716 | |
---|
676 | 717 | /* |
---|
.. | .. |
---|
687 | 728 | * should be low too. |
---|
688 | 729 | */ |
---|
689 | 730 | void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
---|
690 | | - struct rb_root *root) |
---|
| 731 | + struct rb_root_cached *root) |
---|
691 | 732 | { |
---|
692 | 733 | struct bfq_entity *entity = &bfqq->entity; |
---|
693 | | - struct rb_node **new = &(root->rb_node), *parent = NULL; |
---|
| 734 | + struct rb_node **new = &(root->rb_root.rb_node), *parent = NULL; |
---|
| 735 | + bool leftmost = true; |
---|
694 | 736 | |
---|
695 | 737 | /* |
---|
696 | 738 | * Do not insert if the queue is already associated with a |
---|
.. | .. |
---|
719 | 761 | } |
---|
720 | 762 | if (entity->weight < __counter->weight) |
---|
721 | 763 | new = &((*new)->rb_left); |
---|
722 | | - else |
---|
| 764 | + else { |
---|
723 | 765 | new = &((*new)->rb_right); |
---|
| 766 | + leftmost = false; |
---|
| 767 | + } |
---|
724 | 768 | } |
---|
725 | 769 | |
---|
726 | 770 | bfqq->weight_counter = kzalloc(sizeof(struct bfq_weight_counter), |
---|
.. | .. |
---|
729 | 773 | /* |
---|
730 | 774 | * In the unlucky event of an allocation failure, we just |
---|
731 | 775 | * exit. This will cause the weight of queue to not be |
---|
732 | | - * considered in bfq_varied_queue_weights_or_active_groups, |
---|
733 | | - * which, in its turn, causes the scenario to be deemed |
---|
734 | | - * wrongly symmetric in case bfqq's weight would have been |
---|
735 | | - * the only weight making the scenario asymmetric. On the |
---|
736 | | - * bright side, no unbalance will however occur when bfqq |
---|
737 | | - * becomes inactive again (the invocation of this function |
---|
738 | | - * is triggered by an activation of queue). In fact, |
---|
739 | | - * bfq_weights_tree_remove does nothing if |
---|
740 | | - * !bfqq->weight_counter. |
---|
| 776 | + * considered in bfq_asymmetric_scenario, which, in its turn, |
---|
| 777 | + * causes the scenario to be deemed wrongly symmetric in case |
---|
| 778 | + * bfqq's weight would have been the only weight making the |
---|
| 779 | + * scenario asymmetric. On the bright side, no unbalance will |
---|
| 780 | + * however occur when bfqq becomes inactive again (the |
---|
| 781 | + * invocation of this function is triggered by an activation |
---|
| 782 | + * of queue). In fact, bfq_weights_tree_remove does nothing |
---|
| 783 | + * if !bfqq->weight_counter. |
---|
741 | 784 | */ |
---|
742 | 785 | if (unlikely(!bfqq->weight_counter)) |
---|
743 | 786 | return; |
---|
744 | 787 | |
---|
745 | 788 | bfqq->weight_counter->weight = entity->weight; |
---|
746 | 789 | rb_link_node(&bfqq->weight_counter->weights_node, parent, new); |
---|
747 | | - rb_insert_color(&bfqq->weight_counter->weights_node, root); |
---|
| 790 | + rb_insert_color_cached(&bfqq->weight_counter->weights_node, root, |
---|
| 791 | + leftmost); |
---|
748 | 792 | |
---|
749 | 793 | inc_counter: |
---|
750 | 794 | bfqq->weight_counter->num_active++; |
---|
.. | .. |
---|
759 | 803 | */ |
---|
760 | 804 | void __bfq_weights_tree_remove(struct bfq_data *bfqd, |
---|
761 | 805 | struct bfq_queue *bfqq, |
---|
762 | | - struct rb_root *root) |
---|
| 806 | + struct rb_root_cached *root) |
---|
763 | 807 | { |
---|
764 | 808 | if (!bfqq->weight_counter) |
---|
765 | 809 | return; |
---|
.. | .. |
---|
768 | 812 | if (bfqq->weight_counter->num_active > 0) |
---|
769 | 813 | goto reset_entity_pointer; |
---|
770 | 814 | |
---|
771 | | - rb_erase(&bfqq->weight_counter->weights_node, root); |
---|
| 815 | + rb_erase_cached(&bfqq->weight_counter->weights_node, root); |
---|
772 | 816 | kfree(bfqq->weight_counter); |
---|
773 | 817 | |
---|
774 | 818 | reset_entity_pointer: |
---|
.. | .. |
---|
882 | 926 | static unsigned long bfq_serv_to_charge(struct request *rq, |
---|
883 | 927 | struct bfq_queue *bfqq) |
---|
884 | 928 | { |
---|
885 | | - if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1) |
---|
| 929 | + if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1 || |
---|
| 930 | + bfq_asymmetric_scenario(bfqq->bfqd, bfqq)) |
---|
886 | 931 | return blk_rq_sectors(rq); |
---|
887 | 932 | |
---|
888 | 933 | return blk_rq_sectors(rq) * bfq_async_charge_factor; |
---|
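
A quick worked example of the charging rule above, as a userspace sketch: sync I/O (or I/O that must be protected because the scenario is asymmetric) is charged its real size in sectors, while async I/O is charged a multiple of it, which is what throttles async bandwidth. The factor value used below is illustrative only.

```c
#include <stdio.h>

#define ASYNC_CHARGE_FACTOR 3   /* illustrative; see bfq_async_charge_factor */

static unsigned long serv_to_charge(unsigned long sectors, int sync_or_protected)
{
	return sync_or_protected ? sectors : sectors * ASYNC_CHARGE_FACTOR;
}

int main(void)
{
	/* a 256-sector async request drains the budget faster than the
	 * same request would if it were sync */
	printf("sync=%lu async=%lu\n",
	       serv_to_charge(256, 1), serv_to_charge(256, 0));
	return 0;
}
```
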
.. | .. |
---|
916 | 961 | */ |
---|
917 | 962 | return; |
---|
918 | 963 | |
---|
919 | | - new_budget = max_t(unsigned long, bfqq->max_budget, |
---|
920 | | - bfq_serv_to_charge(next_rq, bfqq)); |
---|
| 964 | + new_budget = max_t(unsigned long, |
---|
| 965 | + max_t(unsigned long, bfqq->max_budget, |
---|
| 966 | + bfq_serv_to_charge(next_rq, bfqq)), |
---|
| 967 | + entity->service); |
---|
921 | 968 | if (entity->budget != new_budget) { |
---|
922 | 969 | entity->budget = new_budget; |
---|
923 | 970 | bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", |
---|
.. | .. |
---|
946 | 993 | * of several files |
---|
947 | 994 | * mplayer took 23 seconds to start, if constantly weight-raised. |
---|
948 | 995 | * |
---|
949 | | - * As for higher values than that accomodating the above bad |
---|
| 996 | + * As for higher values than that accommodating the above bad |
---|
950 | 997 | * scenario, tests show that higher values would often yield |
---|
951 | 998 | * the opposite of the desired result, i.e., would worsen |
---|
952 | 999 | * responsiveness by allowing non-interactive applications to |
---|
.. | .. |
---|
985 | 1032 | else |
---|
986 | 1033 | bfq_clear_bfqq_IO_bound(bfqq); |
---|
987 | 1034 | |
---|
| 1035 | + bfqq->entity.new_weight = bic->saved_weight; |
---|
988 | 1036 | bfqq->ttime = bic->saved_ttime; |
---|
989 | 1037 | bfqq->wr_coeff = bic->saved_wr_coeff; |
---|
990 | 1038 | bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt; |
---|
.. | .. |
---|
1020 | 1068 | |
---|
1021 | 1069 | static int bfqq_process_refs(struct bfq_queue *bfqq) |
---|
1022 | 1070 | { |
---|
1023 | | - return bfqq->ref - bfqq->allocated - bfqq->entity.on_st - |
---|
| 1071 | + return bfqq->ref - bfqq->allocated - bfqq->entity.on_st_or_in_serv - |
---|
1024 | 1072 | (bfqq->weight_counter != NULL); |
---|
1025 | 1073 | } |
---|
1026 | 1074 | |
---|
.. | .. |
---|
1032 | 1080 | |
---|
1033 | 1081 | hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node) |
---|
1034 | 1082 | hlist_del_init(&item->burst_list_node); |
---|
1035 | | - hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); |
---|
1036 | | - bfqd->burst_size = 1; |
---|
| 1083 | + |
---|
| 1084 | + /* |
---|
| 1085 | + * Start the creation of a new burst list only if there is no |
---|
| 1086 | + * active queue. See comments on the conditional invocation of |
---|
| 1087 | + * bfq_handle_burst(). |
---|
| 1088 | + */ |
---|
| 1089 | + if (bfq_tot_busy_queues(bfqd) == 0) { |
---|
| 1090 | + hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); |
---|
| 1091 | + bfqd->burst_size = 1; |
---|
| 1092 | + } else |
---|
| 1093 | + bfqd->burst_size = 0; |
---|
| 1094 | + |
---|
1037 | 1095 | bfqd->burst_parent_entity = bfqq->entity.parent; |
---|
1038 | 1096 | } |
---|
1039 | 1097 | |
---|
.. | .. |
---|
1089 | 1147 | * many parallel threads/processes. Examples are systemd during boot, |
---|
1090 | 1148 | * or git grep. To help these processes get their job done as soon as |
---|
1091 | 1149 | * possible, it is usually better to not grant either weight-raising |
---|
1092 | | - * or device idling to their queues. |
---|
| 1150 | + * or device idling to their queues, unless these queues must be |
---|
| 1151 | + * protected from the I/O flowing through other active queues. |
---|
1093 | 1152 | * |
---|
1094 | 1153 | * In this comment we describe, firstly, the reasons why this fact |
---|
1095 | 1154 | * holds, and, secondly, the next function, which implements the main |
---|
.. | .. |
---|
1101 | 1160 | * cumulatively served, the sooner the target job of these queues gets |
---|
1102 | 1161 | * completed. As a consequence, weight-raising any of these queues, |
---|
1103 | 1162 | * which also implies idling the device for it, is almost always |
---|
1104 | | - * counterproductive. In most cases it just lowers throughput. |
---|
| 1163 | + * counterproductive, unless there are other active queues to isolate |
---|
| 1164 | + * these new queues from. If there are no other active queues, then |
---|
| 1165 | + * weight-raising these new queues just lowers throughput in most |
---|
| 1166 | + * cases. |
---|
1105 | 1167 | * |
---|
1106 | 1168 | * On the other hand, a burst of queue creations may be caused also by |
---|
1107 | 1169 | * the start of an application that does not consist of a lot of |
---|
.. | .. |
---|
1135 | 1197 | * are very rare. They typically occur if some service happens to |
---|
1136 | 1198 | * start doing I/O exactly when the interactive task starts. |
---|
1137 | 1199 | * |
---|
1138 | | - * Turning back to the next function, it implements all the steps |
---|
1139 | | - * needed to detect the occurrence of a large burst and to properly |
---|
1140 | | - * mark all the queues belonging to it (so that they can then be |
---|
1141 | | - * treated in a different way). This goal is achieved by maintaining a |
---|
1142 | | - * "burst list" that holds, temporarily, the queues that belong to the |
---|
1143 | | - * burst in progress. The list is then used to mark these queues as |
---|
1144 | | - * belonging to a large burst if the burst does become large. The main |
---|
1145 | | - * steps are the following. |
---|
| 1200 | + * Turning back to the next function, it is invoked only if there are |
---|
| 1201 | + * no active queues (apart from active queues that would belong to the |
---|
| 1202 | + * same, possible burst bfqq would belong to), and it implements all |
---|
| 1203 | + * the steps needed to detect the occurrence of a large burst and to |
---|
| 1204 | + * properly mark all the queues belonging to it (so that they can then |
---|
| 1205 | + * be treated in a different way). This goal is achieved by |
---|
| 1206 | + * maintaining a "burst list" that holds, temporarily, the queues that |
---|
| 1207 | + * belong to the burst in progress. The list is then used to mark |
---|
| 1208 | + * these queues as belonging to a large burst if the burst does become |
---|
| 1209 | + * large. The main steps are the following. |
---|
1146 | 1210 | * |
---|
1147 | 1211 | * . when the very first queue is created, the queue is inserted into the |
---|
1148 | 1212 | * list (as it could be the first queue in a possible burst) |
---|
.. | .. |
---|
1376 | 1440 | * mechanism may be re-designed in such a way to make it possible to |
---|
1377 | 1441 | * know whether preemption is needed without needing to update service |
---|
1378 | 1442 | * trees). In addition, queue preemptions almost always cause random |
---|
1379 | | - * I/O, and thus loss of throughput. Because of these facts, the next |
---|
1380 | | - * function adopts the following simple scheme to avoid both costly |
---|
1381 | | - * operations and too frequent preemptions: it requests the expiration |
---|
1382 | | - * of the in-service queue (unconditionally) only for queues that need |
---|
1383 | | - * to recover a hole, or that either are weight-raised or deserve to |
---|
1384 | | - * be weight-raised. |
---|
| 1443 | + * I/O, which may in turn cause loss of throughput. Finally, there may |
---|
| 1444 | + * even be no in-service queue when the next function is invoked (so, |
---|
| 1445 | + * no queue to compare timestamps with). Because of these facts, the |
---|
| 1446 | + * next function adopts the following simple scheme to avoid costly |
---|
| 1447 | + * operations, too frequent preemptions and too many dependencies on |
---|
| 1448 | + * the state of the scheduler: it requests the expiration of the |
---|
| 1449 | + * in-service queue (unconditionally) only for queues that need to |
---|
| 1450 | + * recover a hole. Then it delegates to other parts of the code the |
---|
| 1451 | + * responsibility of handling the above case 2. |
---|
1385 | 1452 | */ |
---|
1386 | 1453 | static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, |
---|
1387 | 1454 | struct bfq_queue *bfqq, |
---|
1388 | | - bool arrived_in_time, |
---|
1389 | | - bool wr_or_deserves_wr) |
---|
| 1455 | + bool arrived_in_time) |
---|
1390 | 1456 | { |
---|
1391 | 1457 | struct bfq_entity *entity = &bfqq->entity; |
---|
1392 | 1458 | |
---|
1393 | | - if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) { |
---|
| 1459 | + /* |
---|
| 1460 | + * In the next compound condition, we check also whether there |
---|
| 1461 | + * is some budget left, because otherwise there is no point in |
---|
| 1462 | + * trying to go on serving bfqq with this same budget: bfqq |
---|
| 1463 | + * would be expired immediately after being selected for |
---|
| 1464 | + * service. This would only cause useless overhead. |
---|
| 1465 | + */ |
---|
| 1466 | + if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time && |
---|
| 1467 | + bfq_bfqq_budget_left(bfqq) > 0) { |
---|
1394 | 1468 | /* |
---|
1395 | 1469 | * We do not clear the flag non_blocking_wait_rq here, as |
---|
1396 | 1470 | * the latter is used in bfq_activate_bfqq to signal |
---|
.. | .. |
---|
1433 | 1507 | entity->budget = max_t(unsigned long, bfqq->max_budget, |
---|
1434 | 1508 | bfq_serv_to_charge(bfqq->next_rq, bfqq)); |
---|
1435 | 1509 | bfq_clear_bfqq_non_blocking_wait_rq(bfqq); |
---|
1436 | | - return wr_or_deserves_wr; |
---|
| 1510 | + return false; |
---|
1437 | 1511 | } |
---|
1438 | 1512 | |
---|
1439 | 1513 | /* |
---|
.. | .. |
---|
1551 | 1625 | bfqd->bfq_wr_min_idle_time); |
---|
1552 | 1626 | } |
---|
1553 | 1627 | |
---|
| 1628 | + |
---|
| 1629 | +/* |
---|
| 1630 | + * Return true if bfqq is in a higher priority class, or has a higher |
---|
| 1631 | + * weight than the in-service queue. |
---|
| 1632 | + */ |
---|
| 1633 | +static bool bfq_bfqq_higher_class_or_weight(struct bfq_queue *bfqq, |
---|
| 1634 | + struct bfq_queue *in_serv_bfqq) |
---|
| 1635 | +{ |
---|
| 1636 | + int bfqq_weight, in_serv_weight; |
---|
| 1637 | + |
---|
| 1638 | + if (bfqq->ioprio_class < in_serv_bfqq->ioprio_class) |
---|
| 1639 | + return true; |
---|
| 1640 | + |
---|
| 1641 | + if (in_serv_bfqq->entity.parent == bfqq->entity.parent) { |
---|
| 1642 | + bfqq_weight = bfqq->entity.weight; |
---|
| 1643 | + in_serv_weight = in_serv_bfqq->entity.weight; |
---|
| 1644 | + } else { |
---|
| 1645 | + if (bfqq->entity.parent) |
---|
| 1646 | + bfqq_weight = bfqq->entity.parent->weight; |
---|
| 1647 | + else |
---|
| 1648 | + bfqq_weight = bfqq->entity.weight; |
---|
| 1649 | + if (in_serv_bfqq->entity.parent) |
---|
| 1650 | + in_serv_weight = in_serv_bfqq->entity.parent->weight; |
---|
| 1651 | + else |
---|
| 1652 | + in_serv_weight = in_serv_bfqq->entity.weight; |
---|
| 1653 | + } |
---|
| 1654 | + |
---|
| 1655 | + return bfqq_weight > in_serv_weight; |
---|
| 1656 | +} |
---|
| 1657 | + |
---|
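
The following userspace sketch mirrors the comparison just defined, with invented toy types, to show why the parent group's weight is what decides when the two queues belong to different groups.

```c
/* Toy model of bfq_bfqq_higher_class_or_weight(): priority class wins
 * first; on a tie, weights are compared, using the parent group's
 * weight when the two queues live in different groups. Sketch only. */
#include <stdbool.h>
#include <stdio.h>

struct toy_entity { int weight; const struct toy_entity *parent; };
struct toy_queue { int ioprio_class; struct toy_entity entity; };

static bool higher_class_or_weight(const struct toy_queue *q,
				   const struct toy_queue *in_serv)
{
	int qw, sw;

	if (q->ioprio_class < in_serv->ioprio_class)
		return true;

	if (q->entity.parent == in_serv->entity.parent) {
		qw = q->entity.weight;
		sw = in_serv->entity.weight;
	} else {
		qw = q->entity.parent ? q->entity.parent->weight
				      : q->entity.weight;
		sw = in_serv->entity.parent ? in_serv->entity.parent->weight
					    : in_serv->entity.weight;
	}
	return qw > sw;
}

int main(void)
{
	struct toy_entity grp_a = { 500, NULL }, grp_b = { 100, NULL };
	struct toy_queue newq = { 2, { 100, &grp_a } };    /* BE, heavy group */
	struct toy_queue in_serv = { 2, { 800, &grp_b } }; /* BE, light group */

	/* group weights decide, because the queues are in different groups */
	printf("%d\n", higher_class_or_weight(&newq, &in_serv)); /* prints 1 */
	return 0;
}
```
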
1554 | 1658 | static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, |
---|
1555 | 1659 | struct bfq_queue *bfqq, |
---|
1556 | 1660 | int old_wr_coeff, |
---|
.. | .. |
---|
1579 | 1683 | */ |
---|
1580 | 1684 | in_burst = bfq_bfqq_in_large_burst(bfqq); |
---|
1581 | 1685 | soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && |
---|
| 1686 | + !BFQQ_TOTALLY_SEEKY(bfqq) && |
---|
1582 | 1687 | !in_burst && |
---|
1583 | 1688 | time_is_before_jiffies(bfqq->soft_rt_next_start) && |
---|
1584 | 1689 | bfqq->dispatched == 0; |
---|
.. | .. |
---|
1594 | 1699 | */ |
---|
1595 | 1700 | bfqq_wants_to_preempt = |
---|
1596 | 1701 | bfq_bfqq_update_budg_for_activation(bfqd, bfqq, |
---|
1597 | | - arrived_in_time, |
---|
1598 | | - wr_or_deserves_wr); |
---|
| 1702 | + arrived_in_time); |
---|
1599 | 1703 | |
---|
1600 | 1704 | /* |
---|
1601 | 1705 | * If bfqq happened to be activated in a burst, but has been |
---|
.. | .. |
---|
1660 | 1764 | |
---|
1661 | 1765 | /* |
---|
1662 | 1766 | * Expire in-service queue only if preemption may be needed |
---|
1663 | | - * for guarantees. In this respect, the function |
---|
1664 | | - * next_queue_may_preempt just checks a simple, necessary |
---|
1665 | | - * condition, and not a sufficient condition based on |
---|
1666 | | - * timestamps. In fact, for the latter condition to be |
---|
1667 | | - * evaluated, timestamps would need first to be updated, and |
---|
1668 | | - * this operation is quite costly (see the comments on the |
---|
1669 | | - * function bfq_bfqq_update_budg_for_activation). |
---|
| 1767 | + * for guarantees. In particular, we care only about two |
---|
| 1768 | + * cases. The first is that bfqq has to recover a service |
---|
| 1769 | + * hole, as explained in the comments on |
---|
| 1770 | + * bfq_bfqq_update_budg_for_activation(), i.e., that |
---|
| 1771 | + * bfqq_wants_to_preempt is true. However, if bfqq does not |
---|
| 1772 | + * carry time-critical I/O, then bfqq's bandwidth is less |
---|
| 1773 | + * important than that of queues that carry time-critical I/O. |
---|
| 1774 | + * So, as a further constraint, we consider this case only if |
---|
| 1775 | + * bfqq is at least as weight-raised, i.e., at least as time |
---|
| 1776 | + * critical, as the in-service queue. |
---|
| 1777 | + * |
---|
| 1778 | + * The second case is that bfqq is in a higher priority class, |
---|
| 1779 | + * or has a higher weight than the in-service queue. If this |
---|
| 1780 | + * condition does not hold, we don't care because, even if |
---|
| 1781 | + * bfqq does not start to be served immediately, the resulting |
---|
| 1782 | + * delay for bfqq's I/O is however lower or much lower than |
---|
| 1783 | + * the ideal completion time to be guaranteed to bfqq's I/O. |
---|
| 1784 | + * |
---|
| 1785 | + * In both cases, preemption is needed only if, according to |
---|
| 1786 | + * the timestamps of both bfqq and of the in-service queue, |
---|
| 1787 | + * bfqq actually is the next queue to serve. So, to reduce |
---|
| 1788 | + * useless preemptions, the return value of |
---|
| 1789 | + * next_queue_may_preempt() is considered in the next compound |
---|
| 1790 | + * condition too. Yet next_queue_may_preempt() just checks a |
---|
| 1791 | + * simple, necessary condition for bfqq to be the next queue |
---|
| 1792 | + * to serve. In fact, to evaluate a sufficient condition, the |
---|
| 1793 | + * timestamps of the in-service queue would need to be |
---|
| 1794 | + * updated, and this operation is quite costly (see the |
---|
| 1795 | + * comments on bfq_bfqq_update_budg_for_activation()). |
---|
1670 | 1796 | */ |
---|
1671 | | - if (bfqd->in_service_queue && bfqq_wants_to_preempt && |
---|
1672 | | - bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff && |
---|
| 1797 | + if (bfqd->in_service_queue && |
---|
| 1798 | + ((bfqq_wants_to_preempt && |
---|
| 1799 | + bfqq->wr_coeff >= bfqd->in_service_queue->wr_coeff) || |
---|
| 1800 | + bfq_bfqq_higher_class_or_weight(bfqq, bfqd->in_service_queue)) && |
---|
1673 | 1801 | next_queue_may_preempt(bfqd)) |
---|
1674 | 1802 | bfq_bfqq_expire(bfqd, bfqd->in_service_queue, |
---|
1675 | 1803 | false, BFQQE_PREEMPTED); |
---|
| 1804 | +} |
---|
| 1805 | + |
---|
| 1806 | +static void bfq_reset_inject_limit(struct bfq_data *bfqd, |
---|
| 1807 | + struct bfq_queue *bfqq) |
---|
| 1808 | +{ |
---|
| 1809 | + /* invalidate baseline total service time */ |
---|
| 1810 | + bfqq->last_serv_time_ns = 0; |
---|
| 1811 | + |
---|
| 1812 | + /* |
---|
| 1813 | + * Reset pointer in case we are waiting for |
---|
| 1814 | + * some request completion. |
---|
| 1815 | + */ |
---|
| 1816 | + bfqd->waited_rq = NULL; |
---|
| 1817 | + |
---|
| 1818 | + /* |
---|
| 1819 | + * If bfqq has a short think time, then start by setting the |
---|
| 1820 | + * inject limit to 0 prudentially, because the service time of |
---|
| 1821 | + * an injected I/O request may be higher than the think time |
---|
| 1822 | + * of bfqq, and therefore, if one request was injected when |
---|
| 1823 | + * bfqq remains empty, this injected request might delay the |
---|
| 1824 | + * service of the next I/O request for bfqq significantly. In |
---|
| 1825 | + * case bfqq can actually tolerate some injection, then the |
---|
| 1826 | + * adaptive update will however raise the limit soon. This |
---|
| 1827 | + * lucky circumstance holds exactly because bfqq has a short |
---|
| 1828 | + * think time, and thus, after remaining empty, is likely to |
---|
| 1829 | + * get new I/O enqueued---and then completed---before being |
---|
| 1830 | + * expired. This is the very pattern that gives the |
---|
| 1831 | + * limit-update algorithm the chance to measure the effect of |
---|
| 1832 | + * injection on request service times, and then to update the |
---|
| 1833 | + * limit accordingly. |
---|
| 1834 | + * |
---|
| 1835 | + * However, in the following special case, the inject limit is |
---|
| 1836 | + * left to 1 even if the think time is short: bfqq's I/O is |
---|
| 1837 | + * synchronized with that of some other queue, i.e., bfqq may |
---|
| 1838 | + * receive new I/O only after the I/O of the other queue is |
---|
| 1839 | + * completed. Keeping the inject limit to 1 allows the |
---|
| 1840 | + * blocking I/O to be served while bfqq is in service. And |
---|
| 1841 | + * this is very convenient both for bfqq and for overall |
---|
| 1842 | + * throughput, as explained in detail in the comments in |
---|
| 1843 | + * bfq_update_has_short_ttime(). |
---|
| 1844 | + * |
---|
| 1845 | + * On the opposite end, if bfqq has a long think time, then |
---|
| 1846 | + * start directly by 1, because: |
---|
| 1847 | + * a) on the bright side, keeping at most one request in |
---|
| 1848 | + * service in the drive is unlikely to cause any harm to the |
---|
| 1849 | + * latency of bfqq's requests, as the service time of a single |
---|
| 1850 | + * request is likely to be lower than the think time of bfqq; |
---|
| 1851 | + * b) on the downside, after becoming empty, bfqq is likely to |
---|
| 1852 | + * expire before getting its next request. With this request |
---|
| 1853 | + * arrival pattern, it is very hard to sample total service |
---|
| 1854 | + * times and update the inject limit accordingly (see comments |
---|
| 1855 | + * on bfq_update_inject_limit()). So the limit is likely to be |
---|
| 1856 | + * never, or at least seldom, updated. As a consequence, by |
---|
| 1857 | + * setting the limit to 1, we avoid that no injection ever |
---|
| 1858 | + * occurs with bfqq. On the downside, this proactive step |
---|
| 1859 | + * further reduces chances to actually compute the baseline |
---|
| 1860 | + * total service time. Thus it reduces chances to execute the |
---|
| 1861 | + * limit-update algorithm and possibly raise the limit to more |
---|
| 1862 | + * than 1. |
---|
| 1863 | + */ |
---|
| 1864 | + if (bfq_bfqq_has_short_ttime(bfqq)) |
---|
| 1865 | + bfqq->inject_limit = 0; |
---|
| 1866 | + else |
---|
| 1867 | + bfqq->inject_limit = 1; |
---|
| 1868 | + |
---|
| 1869 | + bfqq->decrease_time_jif = jiffies; |
---|
1676 | 1870 | } |
---|
1677 | 1871 | |
---|
1678 | 1872 | static void bfq_add_request(struct request *rq) |
---|
.. | .. |
---|
1687 | 1881 | bfqq->queued[rq_is_sync(rq)]++; |
---|
1688 | 1882 | bfqd->queued++; |
---|
1689 | 1883 | |
---|
| 1884 | + if (RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_sync(bfqq)) { |
---|
| 1885 | + /* |
---|
| 1886 | + * Detect whether bfqq's I/O seems synchronized with |
---|
| 1887 | + * that of some other queue, i.e., whether bfqq, after |
---|
| 1888 | + * remaining empty, happens to receive new I/O only |
---|
| 1889 | + * right after some I/O request of the other queue has |
---|
| 1890 | + * been completed. We call waker queue the other |
---|
| 1891 | + * queue, and we assume, for simplicity, that bfqq may |
---|
| 1892 | + * have at most one waker queue. |
---|
| 1893 | + * |
---|
| 1894 | + * A remarkable throughput boost can be reached by |
---|
| 1895 | + * unconditionally injecting the I/O of the waker |
---|
| 1896 | + * queue, every time a new bfq_dispatch_request |
---|
| 1897 | + * happens to be invoked while I/O is being plugged |
---|
| 1898 | + * for bfqq. In addition to boosting throughput, this |
---|
| 1899 | + * unblocks bfqq's I/O, thereby improving bandwidth |
---|
| 1900 | + * and latency for bfqq. Note that these same results |
---|
| 1901 | + * may be achieved with the general injection |
---|
| 1902 | + * mechanism, but less effectively. For details on |
---|
| 1903 | + * this aspect, see the comments on the choice of the |
---|
| 1904 | + * queue for injection in bfq_select_queue(). |
---|
| 1905 | + * |
---|
| 1906 | + * Turning back to the detection of a waker queue, a |
---|
| 1907 | + * queue Q is deemed as a waker queue for bfqq if, for |
---|
| 1908 | + * two consecutive times, bfqq happens to become non |
---|
| 1909 | + * empty right after a request of Q has been |
---|
| 1910 | + * completed. In particular, on the first time, Q is |
---|
| 1911 | + * tentatively set as a candidate waker queue, while |
---|
| 1912 | + * on the second time, the flag |
---|
| 1913 | + * bfq_bfqq_has_waker(bfqq) is set to confirm that Q |
---|
| 1914 | + * is a waker queue for bfqq. These detection steps |
---|
| 1915 | + * are performed only if bfqq has a long think time, |
---|
| 1916 | + * so as to make it more likely that bfqq's I/O is |
---|
| 1917 | + * actually being blocked by a synchronization. This |
---|
| 1918 | + * last filter, plus the above two-times requirement, |
---|
| 1919 | + * make false positives less likely. |
---|
| 1920 | + * |
---|
| 1921 | + * NOTE |
---|
| 1922 | + * |
---|
| 1923 | + * The sooner a waker queue is detected, the sooner |
---|
| 1924 | + * throughput can be boosted by injecting I/O from the |
---|
| 1925 | + * waker queue. Fortunately, detection is likely to be |
---|
| 1926 | + * actually fast, for the following reasons. While |
---|
| 1927 | + * blocked by synchronization, bfqq has a long think |
---|
| 1928 | + * time. This implies that bfqq's inject limit is at |
---|
| 1929 | + * least equal to 1 (see the comments in |
---|
| 1930 | + * bfq_update_inject_limit()). So, thanks to |
---|
| 1931 | + * injection, the waker queue is likely to be served |
---|
| 1932 | + * during the very first I/O-plugging time interval |
---|
| 1933 | + * for bfqq. This triggers the first step of the |
---|
| 1934 | + * detection mechanism. Thanks again to injection, the |
---|
| 1935 | + * candidate waker queue is then likely to be |
---|
| 1936 | + * confirmed no later than during the next |
---|
| 1937 | + * I/O-plugging interval for bfqq. |
---|
| 1938 | + */ |
---|
| 1939 | + if (bfqd->last_completed_rq_bfqq && |
---|
| 1940 | + !bfq_bfqq_has_short_ttime(bfqq) && |
---|
| 1941 | + ktime_get_ns() - bfqd->last_completion < |
---|
| 1942 | + 200 * NSEC_PER_USEC) { |
---|
| 1943 | + if (bfqd->last_completed_rq_bfqq != bfqq && |
---|
| 1944 | + bfqd->last_completed_rq_bfqq != |
---|
| 1945 | + bfqq->waker_bfqq) { |
---|
| 1946 | + /* |
---|
| 1947 | + * First synchronization detected with |
---|
| 1948 | + * a candidate waker queue, or with a |
---|
| 1949 | + * different candidate waker queue |
---|
| 1950 | + * from the current one. |
---|
| 1951 | + */ |
---|
| 1952 | + bfqq->waker_bfqq = bfqd->last_completed_rq_bfqq; |
---|
| 1953 | + |
---|
| 1954 | + /* |
---|
| 1955 | + * If the waker queue disappears, then |
---|
| 1956 | + * bfqq->waker_bfqq must be reset. To |
---|
| 1957 | + * this goal, we maintain in each |
---|
| 1958 | + * waker queue a list, woken_list, of |
---|
| 1959 | + * all the queues that reference the |
---|
| 1960 | + * waker queue through their |
---|
| 1961 | + * waker_bfqq pointer. When the waker |
---|
| 1962 | + * queue exits, the waker_bfqq pointer |
---|
| 1963 | + * of all the queues in the woken_list |
---|
| 1964 | + * is reset. |
---|
| 1965 | + * |
---|
| 1966 | + * In addition, if bfqq is already in |
---|
| 1967 | + * the woken_list of a waker queue, |
---|
| 1968 | + * then, before being inserted into |
---|
| 1969 | + * the woken_list of a new waker |
---|
| 1970 | + * queue, bfqq must be removed from |
---|
| 1971 | + * the woken_list of the old waker |
---|
| 1972 | + * queue. |
---|
| 1973 | + */ |
---|
| 1974 | + if (!hlist_unhashed(&bfqq->woken_list_node)) |
---|
| 1975 | + hlist_del_init(&bfqq->woken_list_node); |
---|
| 1976 | + hlist_add_head(&bfqq->woken_list_node, |
---|
| 1977 | + &bfqd->last_completed_rq_bfqq->woken_list); |
---|
| 1978 | + |
---|
| 1979 | + bfq_clear_bfqq_has_waker(bfqq); |
---|
| 1980 | + } else if (bfqd->last_completed_rq_bfqq == |
---|
| 1981 | + bfqq->waker_bfqq && |
---|
| 1982 | + !bfq_bfqq_has_waker(bfqq)) { |
---|
| 1983 | + /* |
---|
| 1984 | + * synchronization with waker_bfqq |
---|
| 1985 | + * seen for the second time |
---|
| 1986 | + */ |
---|
| 1987 | + bfq_mark_bfqq_has_waker(bfqq); |
---|
| 1988 | + } |
---|
| 1989 | + } |
---|
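
The sketch below is a simplified userspace model of the two-step detection described in the comment above: a first synchronization makes a queue the candidate waker, a second one confirms it. The think-time filter and the woken_list bookkeeping are omitted; types and names are invented for illustration.

```c
/* Userspace sketch of BFQ's waker detection: a queue becomes a
 * candidate waker the first time bfqq turns non-empty right after one
 * of its completions, and is confirmed the second time. Not kernel code. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SYNC_WINDOW_NS (200 * 1000ULL)   /* 200 us, as in the patch */

struct toy_bfqq {
	int id;
	const struct toy_bfqq *waker;    /* candidate waker, if any */
	bool has_waker;                  /* candidate confirmed */
};

/* Called when 'bfqq' turns from empty to non-empty at time 'now_ns';
 * 'last_bfqq'/'last_ns' describe the most recent request completion. */
static void detect_waker(struct toy_bfqq *bfqq,
			 const struct toy_bfqq *last_bfqq, uint64_t last_ns,
			 uint64_t now_ns)
{
	if (!last_bfqq || now_ns - last_ns >= SYNC_WINDOW_NS)
		return;
	if (last_bfqq != bfqq && last_bfqq != bfqq->waker) {
		bfqq->waker = last_bfqq;     /* first hit: candidate */
		bfqq->has_waker = false;
	} else if (last_bfqq == bfqq->waker && !bfqq->has_waker) {
		bfqq->has_waker = true;      /* second hit: confirmed */
	}
}

int main(void)
{
	struct toy_bfqq blocked = { 1, NULL, false };
	const struct toy_bfqq waker = { 2, NULL, false };

	detect_waker(&blocked, &waker, 1000, 1100);   /* candidate */
	detect_waker(&blocked, &waker, 5000, 5150);   /* confirmed */
	printf("has_waker=%d\n", blocked.has_waker);  /* prints 1 */
	return 0;
}
```
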
| 1990 | + |
---|
| 1991 | + /* |
---|
| 1992 | + * Periodically reset inject limit, to make sure that |
---|
| 1993 | + * the latter eventually drops in case workload |
---|
| 1994 | + * changes, see step (3) in the comments on |
---|
| 1995 | + * bfq_update_inject_limit(). |
---|
| 1996 | + */ |
---|
| 1997 | + if (time_is_before_eq_jiffies(bfqq->decrease_time_jif + |
---|
| 1998 | + msecs_to_jiffies(1000))) |
---|
| 1999 | + bfq_reset_inject_limit(bfqd, bfqq); |
---|
| 2000 | + |
---|
| 2001 | + /* |
---|
| 2002 | + * The following conditions must hold to setup a new |
---|
| 2003 | + * sampling of total service time, and then a new |
---|
| 2004 | + * update of the inject limit: |
---|
| 2005 | + * - bfqq is in service, because the total service |
---|
| 2006 | + * time is evaluated only for the I/O requests of |
---|
| 2007 | + * the queues in service; |
---|
| 2008 | + * - this is the right occasion to compute or to |
---|
| 2009 | + * lower the baseline total service time, because |
---|
| 2010 | + * there are actually no requests in the drive, |
---|
| 2011 | + * or |
---|
| 2012 | + * the baseline total service time is available, and |
---|
| 2013 | + * this is the right occasion to compute the other |
---|
| 2014 | + * quantity needed to update the inject limit, i.e., |
---|
| 2015 | + * the total service time caused by the amount of |
---|
| 2016 | + * injection allowed by the current value of the |
---|
| 2017 | + * limit. It is the right occasion because injection |
---|
| 2018 | + * has actually been performed during the service |
---|
| 2019 | + * hole, and there are still in-flight requests, |
---|
| 2020 | + * which are very likely to be exactly the injected |
---|
| 2021 | + * requests, or part of them; |
---|
| 2022 | + * - the minimum interval for sampling the total |
---|
| 2023 | + * service time and updating the inject limit has |
---|
| 2024 | + * elapsed. |
---|
| 2025 | + */ |
---|
| 2026 | + if (bfqq == bfqd->in_service_queue && |
---|
| 2027 | + (bfqd->rq_in_driver == 0 || |
---|
| 2028 | + (bfqq->last_serv_time_ns > 0 && |
---|
| 2029 | + bfqd->rqs_injected && bfqd->rq_in_driver > 0)) && |
---|
| 2030 | + time_is_before_eq_jiffies(bfqq->decrease_time_jif + |
---|
| 2031 | + msecs_to_jiffies(10))) { |
---|
| 2032 | + bfqd->last_empty_occupied_ns = ktime_get_ns(); |
---|
| 2033 | + /* |
---|
| 2034 | + * Start the state machine for measuring the |
---|
| 2035 | + * total service time of rq: setting |
---|
| 2036 | + * wait_dispatch will cause bfqd->waited_rq to |
---|
| 2037 | + * be set when rq will be dispatched. |
---|
| 2038 | + */ |
---|
| 2039 | + bfqd->wait_dispatch = true; |
---|
| 2040 | + /* |
---|
| 2041 | + * If there is no I/O in service in the drive, |
---|
| 2042 | + * then possible injection occurred before the |
---|
| 2043 | + * arrival of rq will not affect the total |
---|
| 2044 | + * service time of rq. So the injection limit |
---|
| 2045 | + * must not be updated as a function of such |
---|
| 2046 | + * total service time, unless new injection |
---|
| 2047 | + * occurs before rq is completed. To have the |
---|
| 2048 | + * injection limit updated only in the latter |
---|
| 2049 | + * case, reset rqs_injected here (rqs_injected |
---|
| 2050 | + * will be set in case injection is performed |
---|
| 2051 | + * on bfqq before rq is completed). |
---|
| 2052 | + */ |
---|
| 2053 | + if (bfqd->rq_in_driver == 0) |
---|
| 2054 | + bfqd->rqs_injected = false; |
---|
| 2055 | + } |
---|
| 2056 | + } |
---|
| 2057 | + |
---|
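
As a compact restatement of the gating conditions listed in the comment above, here is a userspace sketch (invented field names, simplified time handling) of when a new total-service-time sample would be set up: only for the in-service queue, only when the drive is either empty or holds a baseline plus injected requests, and at most once every 10 ms.

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_state {
	bool bfqq_in_service;
	int rq_in_driver;
	uint64_t last_serv_time_ns;   /* 0 = no baseline yet */
	bool rqs_injected;
	uint64_t decrease_time_ms;    /* last limit-update occasion */
};

static bool may_sample(const struct toy_state *s, uint64_t now_ms)
{
	bool drive_empty = s->rq_in_driver == 0;
	bool baseline_and_injection = s->last_serv_time_ns > 0 &&
				      s->rqs_injected && s->rq_in_driver > 0;

	return s->bfqq_in_service &&
	       (drive_empty || baseline_and_injection) &&
	       now_ms - s->decrease_time_ms >= 10;
}

int main(void)
{
	struct toy_state s = { true, 0, 0, false, 0 };

	printf("%d\n", may_sample(&s, 25));  /* 1: drive empty, 25 ms elapsed */
	s.rq_in_driver = 2;
	printf("%d\n", may_sample(&s, 30));  /* 0: no baseline, drive busy */
	return 0;
}
```
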
1690 | 2058 | elv_rb_add(&bfqq->sort_list, rq); |
---|
1691 | 2059 | |
---|
1692 | 2060 | /* |
---|
.. | .. |
---|
1698 | 2066 | |
---|
1699 | 2067 | /* |
---|
1700 | 2068 | * Adjust priority tree position, if next_rq changes. |
---|
| 2069 | + * See comments on bfq_pos_tree_add_move() for the unlikely(). |
---|
1701 | 2070 | */ |
---|
1702 | | - if (prev != bfqq->next_rq) |
---|
| 2071 | + if (unlikely(!bfqd->nonrot_with_queueing && prev != bfqq->next_rq)) |
---|
1703 | 2072 | bfq_pos_tree_add_move(bfqd, bfqq); |
---|
1704 | 2073 | |
---|
1705 | 2074 | if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... */ |
---|
.. | .. |
---|
1839 | 2208 | bfqq->pos_root = NULL; |
---|
1840 | 2209 | } |
---|
1841 | 2210 | } else { |
---|
1842 | | - bfq_pos_tree_add_move(bfqd, bfqq); |
---|
| 2211 | + /* see comments on bfq_pos_tree_add_move() for the unlikely() */ |
---|
| 2212 | + if (unlikely(!bfqd->nonrot_with_queueing)) |
---|
| 2213 | + bfq_pos_tree_add_move(bfqd, bfqq); |
---|
1843 | 2214 | } |
---|
1844 | 2215 | |
---|
1845 | 2216 | if (rq->cmd_flags & REQ_META) |
---|
.. | .. |
---|
1847 | 2218 | |
---|
1848 | 2219 | } |
---|
1849 | 2220 | |
---|
1850 | | -static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) |
---|
| 2221 | +static bool bfq_bio_merge(struct request_queue *q, struct bio *bio, |
---|
| 2222 | + unsigned int nr_segs) |
---|
1851 | 2223 | { |
---|
1852 | | - struct request_queue *q = hctx->queue; |
---|
1853 | 2224 | struct bfq_data *bfqd = q->elevator->elevator_data; |
---|
1854 | 2225 | struct request *free = NULL; |
---|
1855 | 2226 | /* |
---|
.. | .. |
---|
1864 | 2235 | |
---|
1865 | 2236 | spin_lock_irq(&bfqd->lock); |
---|
1866 | 2237 | |
---|
1867 | | - if (bic) |
---|
| 2238 | + if (bic) { |
---|
| 2239 | + /* |
---|
| 2240 | + * Make sure cgroup info is uptodate for current process before |
---|
| 2241 | + * considering the merge. |
---|
| 2242 | + */ |
---|
| 2243 | + bfq_bic_update_cgroup(bic, bio); |
---|
| 2244 | + |
---|
1868 | 2245 | bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf)); |
---|
1869 | | - else |
---|
| 2246 | + } else { |
---|
1870 | 2247 | bfqd->bio_bfqq = NULL; |
---|
| 2248 | + } |
---|
1871 | 2249 | bfqd->bio_bic = bic; |
---|
1872 | 2250 | |
---|
1873 | | - ret = blk_mq_sched_try_merge(q, bio, &free); |
---|
| 2251 | + ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free); |
---|
1874 | 2252 | |
---|
1875 | 2253 | if (free) |
---|
1876 | 2254 | blk_mq_free_request(free); |
---|
.. | .. |
---|
1888 | 2266 | __rq = bfq_find_rq_fmerge(bfqd, bio, q); |
---|
1889 | 2267 | if (__rq && elv_bio_merge_ok(__rq, bio)) { |
---|
1890 | 2268 | *req = __rq; |
---|
| 2269 | + |
---|
| 2270 | + if (blk_discard_mergable(__rq)) |
---|
| 2271 | + return ELEVATOR_DISCARD_MERGE; |
---|
1891 | 2272 | return ELEVATOR_FRONT_MERGE; |
---|
1892 | 2273 | } |
---|
1893 | 2274 | |
---|
1894 | 2275 | return ELEVATOR_NO_MERGE; |
---|
1895 | 2276 | } |
---|
1896 | | - |
---|
1897 | | -static struct bfq_queue *bfq_init_rq(struct request *rq); |
---|
1898 | 2277 | |
---|
1899 | 2278 | static void bfq_request_merged(struct request_queue *q, struct request *req, |
---|
1900 | 2279 | enum elv_merge type) |
---|
.. | .. |
---|
1904 | 2283 | blk_rq_pos(req) < |
---|
1905 | 2284 | blk_rq_pos(container_of(rb_prev(&req->rb_node), |
---|
1906 | 2285 | struct request, rb_node))) { |
---|
1907 | | - struct bfq_queue *bfqq = bfq_init_rq(req); |
---|
| 2286 | + struct bfq_queue *bfqq = RQ_BFQQ(req); |
---|
1908 | 2287 | struct bfq_data *bfqd; |
---|
1909 | 2288 | struct request *prev, *next_rq; |
---|
1910 | 2289 | |
---|
.. | .. |
---|
1929 | 2308 | */ |
---|
1930 | 2309 | if (prev != bfqq->next_rq) { |
---|
1931 | 2310 | bfq_updated_next_req(bfqd, bfqq); |
---|
1932 | | - bfq_pos_tree_add_move(bfqd, bfqq); |
---|
| 2311 | + /* |
---|
| 2312 | + * See comments on bfq_pos_tree_add_move() for |
---|
| 2313 | + * the unlikely(). |
---|
| 2314 | + */ |
---|
| 2315 | + if (unlikely(!bfqd->nonrot_with_queueing)) |
---|
| 2316 | + bfq_pos_tree_add_move(bfqd, bfqq); |
---|
1933 | 2317 | } |
---|
1934 | 2318 | } |
---|
1935 | 2319 | } |
---|
.. | .. |
---|
1951 | 2335 | static void bfq_requests_merged(struct request_queue *q, struct request *rq, |
---|
1952 | 2336 | struct request *next) |
---|
1953 | 2337 | { |
---|
1954 | | - struct bfq_queue *bfqq = bfq_init_rq(rq), |
---|
1955 | | - *next_bfqq = bfq_init_rq(next); |
---|
| 2338 | + struct bfq_queue *bfqq = RQ_BFQQ(rq), |
---|
| 2339 | + *next_bfqq = RQ_BFQQ(next); |
---|
1956 | 2340 | |
---|
1957 | 2341 | if (!bfqq) |
---|
1958 | 2342 | return; |
---|
.. | .. |
---|
2131 | 2515 | if (process_refs == 0 || new_process_refs == 0) |
---|
2132 | 2516 | return NULL; |
---|
2133 | 2517 | |
---|
| 2518 | + /* |
---|
| 2519 | + * Make sure merged queues belong to the same parent. Parents could |
---|
| 2520 | + * have changed since the time we decided the two queues are suitable |
---|
| 2521 | + * for merging. |
---|
| 2522 | + */ |
---|
| 2523 | + if (new_bfqq->entity.parent != bfqq->entity.parent) |
---|
| 2524 | + return NULL; |
---|
| 2525 | + |
---|
2134 | 2526 | bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d", |
---|
2135 | 2527 | new_bfqq->pid); |
---|
2136 | 2528 | |
---|
.. | .. |
---|
2155 | 2547 | * are likely to increase the throughput. |
---|
2156 | 2548 | */ |
---|
2157 | 2549 | bfqq->new_bfqq = new_bfqq; |
---|
| 2550 | + /* |
---|
| 2551 | + * The above assignment schedules the following redirections: |
---|
| 2552 | + * each time some I/O for bfqq arrives, the process that |
---|
| 2553 | + * generated that I/O is disassociated from bfqq and |
---|
| 2554 | + * associated with new_bfqq. Here we increase new_bfqq->ref |
---|
| 2555 | + * in advance, adding the number of processes that are |
---|
| 2556 | + * expected to be associated with new_bfqq as they happen to |
---|
| 2557 | + * issue I/O. |
---|
| 2558 | + */ |
---|
2158 | 2559 | new_bfqq->ref += process_refs; |
---|
2159 | 2560 | return new_bfqq; |
---|
2160 | 2561 | } |
---|
.. | .. |
---|
2214 | 2615 | { |
---|
2215 | 2616 | struct bfq_queue *in_service_bfqq, *new_bfqq; |
---|
2216 | 2617 | |
---|
| 2618 | + /* if a merge has already been setup, then proceed with that first */ |
---|
| 2619 | + if (bfqq->new_bfqq) |
---|
| 2620 | + return bfqq->new_bfqq; |
---|
| 2621 | + |
---|
| 2622 | + /* |
---|
| 2623 | + * Do not perform queue merging if the device is non |
---|
| 2624 | + * rotational and performs internal queueing. In fact, such a |
---|
| 2625 | + * device reaches a high speed through internal parallelism |
---|
| 2626 | + * and pipelining. This means that, to reach a high |
---|
| 2627 | + * throughput, it must have many requests enqueued at the same |
---|
| 2628 | + * time. But, in this configuration, the internal scheduling |
---|
| 2629 | + * algorithm of the device does exactly the job of queue |
---|
| 2630 | + * merging: it reorders requests so as to obtain as much as |
---|
| 2631 | + * possible a sequential I/O pattern. As a consequence, with |
---|
| 2632 | + * the workload generated by processes doing interleaved I/O, |
---|
| 2633 | + * the throughput reached by the device is likely to be the |
---|
| 2634 | + * same, with and without queue merging. |
---|
| 2635 | + * |
---|
| 2636 | + * Disabling merging also provides a remarkable benefit in |
---|
| 2637 | + * terms of throughput. Merging tends to make many workloads |
---|
| 2638 | + * artificially more uneven, because of shared queues |
---|
| 2639 | + * remaining non empty for incomparably more time than |
---|
| 2640 | + * non-merged queues. This may accentuate workload |
---|
| 2641 | + * asymmetries. For example, if one of the queues in a set of |
---|
| 2642 | + * merged queues has a higher weight than a normal queue, then |
---|
| 2643 | + * the shared queue may inherit such a high weight and, by |
---|
| 2644 | + * staying almost always active, may force BFQ to perform I/O |
---|
| 2645 | + * plugging most of the time. This evidently makes it harder |
---|
| 2646 | + * for BFQ to let the device reach a high throughput. |
---|
| 2647 | + * |
---|
| 2648 | + * Finally, the likely() macro below is not used because one |
---|
| 2649 | + * of the two branches is more likely than the other, but to |
---|
| 2650 | + * have the code path after the following if() executed as |
---|
| 2651 | + * fast as possible for the case of a non rotational device |
---|
| 2652 | + * with queueing. We want it because this is the fastest kind |
---|
| 2653 | + * of device. On the opposite end, the likely() may lengthen |
---|
| 2654 | + * the execution time of BFQ for the case of slower devices |
---|
| 2655 | + * (rotational or at least without queueing). But in this case |
---|
| 2656 | + * the execution time of BFQ matters very little, if not at |
---|
| 2657 | + * all. |
---|
| 2658 | + */ |
---|
| 2659 | + if (likely(bfqd->nonrot_with_queueing)) |
---|
| 2660 | + return NULL; |
---|
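
For context, the flag tested above is presumably derived from the device being non-rotational and capable of internal command queueing; the sketch below illustrates that derivation with toy types and should not be read as the actual update site in bfq-iosched.c.

```c
/* Hedged sketch: one plausible way a nonrot_with_queueing flag can be
 * derived. Illustration only; the kernel computes it elsewhere. */
#include <stdbool.h>
#include <stdio.h>

struct toy_device {
	bool nonrot;   /* flash/SSD rather than spinning disk */
	bool hw_tag;   /* drive queues several commands internally */
};

static bool nonrot_with_queueing(const struct toy_device *dev)
{
	return dev->nonrot && dev->hw_tag;
}

int main(void)
{
	struct toy_device ssd = { true, true };
	struct toy_device hdd = { false, true };

	/* queue merging is pointless on the first, useful on the second */
	printf("%d %d\n", nonrot_with_queueing(&ssd),
	       nonrot_with_queueing(&hdd));
	return 0;
}
```
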
| 2661 | + |
---|
2217 | 2662 | /* |
---|
2218 | 2663 | * Prevent bfqq from being merged if it has been created too |
---|
2219 | 2664 | * long ago. The idea is that true cooperating processes, and |
---|
.. | .. |
---|
2228 | 2673 | if (bfq_too_late_for_merging(bfqq)) |
---|
2229 | 2674 | return NULL; |
---|
2230 | 2675 | |
---|
2231 | | - if (bfqq->new_bfqq) |
---|
2232 | | - return bfqq->new_bfqq; |
---|
2233 | | - |
---|
2234 | 2676 | if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) |
---|
2235 | 2677 | return NULL; |
---|
2236 | 2678 | |
---|
2237 | 2679 | /* If there is only one backlogged queue, don't search. */ |
---|
2238 | | - if (bfqd->busy_queues == 1) |
---|
| 2680 | + if (bfq_tot_busy_queues(bfqd) == 1) |
---|
2239 | 2681 | return NULL; |
---|
2240 | 2682 | |
---|
2241 | 2683 | in_service_bfqq = bfqd->in_service_queue; |
---|
.. | .. |
---|
2277 | 2719 | if (!bic) |
---|
2278 | 2720 | return; |
---|
2279 | 2721 | |
---|
| 2722 | + bic->saved_weight = bfqq->entity.orig_weight; |
---|
2280 | 2723 | bic->saved_ttime = bfqq->ttime; |
---|
2281 | 2724 | bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq); |
---|
2282 | 2725 | bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); |
---|
.. | .. |
---|
2295 | 2738 | * to enjoy weight raising if split soon. |
---|
2296 | 2739 | */ |
---|
2297 | 2740 | bic->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff; |
---|
| 2741 | + bic->saved_wr_start_at_switch_to_srt = bfq_smallest_from_now(); |
---|
2298 | 2742 | bic->saved_wr_cur_max_time = bfq_wr_duration(bfqq->bfqd); |
---|
2299 | 2743 | bic->saved_last_wr_start_finish = jiffies; |
---|
2300 | 2744 | } else { |
---|
.. | .. |
---|
2304 | 2748 | bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; |
---|
2305 | 2749 | bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; |
---|
2306 | 2750 | } |
---|
| 2751 | +} |
---|
| 2752 | + |
---|
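The saved_weight added above is useful only if it is read back when a split queue has its state restored. A hedged sketch of that counterpart restore step; the surrounding function and the exact fields through which the weight is re-applied are assumptions for illustration:

	static void example_resume_weight(struct bfq_io_cq *bic,
					  struct bfq_queue *bfqq)
	{
		if (bfqq->entity.orig_weight != bic->saved_weight) {
			/* schedule a weight change at the next (re)activation */
			bfqq->entity.new_weight = bic->saved_weight;
			bfqq->entity.prio_changed = 1;
		}
	}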
| 2753 | +void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
---|
| 2754 | +{ |
---|
| 2755 | + /* |
---|
| 2756 | + * To prevent bfqq's service guarantees from being violated, |
---|
| 2757 | + * bfqq may be left busy, i.e., queued for service, even if |
---|
| 2758 | + * empty (see comments in __bfq_bfqq_expire() for |
---|
| 2759 | + * details). But, if no process will send requests to bfqq any |
---|
| 2760 | + * longer, then there is no point in keeping bfqq queued for |
---|
| 2761 | + * service. In addition, keeping bfqq queued for service, but |
---|
| 2762 | + * with no process ref any longer, may have caused bfqq to be |
---|
| 2763 | + * freed when dequeued from service. But this is assumed to |
---|
| 2764 | + * never happen. |
---|
| 2765 | + */ |
---|
| 2766 | + if (bfq_bfqq_busy(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list) && |
---|
| 2767 | + bfqq != bfqd->in_service_queue) |
---|
| 2768 | + bfq_del_bfqq_busy(bfqd, bfqq, false); |
---|
| 2769 | + |
---|
| 2770 | + bfq_put_queue(bfqq); |
---|
2307 | 2771 | } |
---|
2308 | 2772 | |
---|
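A hedged usage sketch for bfq_release_process_ref(): any path that drops a process' last association with bfqq (queue split, icq exit, cgroup move) would call it in place of a bare bfq_put_queue(), so that an empty-but-still-busy bfqq is dequeued before the reference goes away. The caller below is hypothetical:

	static void example_detach_process(struct bfq_data *bfqd,
					   struct bfq_io_cq *bic, bool is_sync)
	{
		struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync);

		if (!bfqq)
			return;

		bic_set_bfqq(bic, NULL, is_sync);
		/* was: bfq_put_queue(bfqq); */
		bfq_release_process_ref(bfqd, bfqq);
	}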
2309 | 2773 | static void |
---|
.. | .. |
---|
2352 | 2816 | /* |
---|
2353 | 2817 | * Merge queues (that is, let bic redirect its requests to new_bfqq) |
---|
2354 | 2818 | */ |
---|
2355 | | - bic_set_bfqq(bic, new_bfqq, 1); |
---|
| 2819 | + bic_set_bfqq(bic, new_bfqq, true); |
---|
2356 | 2820 | bfq_mark_bfqq_coop(new_bfqq); |
---|
2357 | 2821 | /* |
---|
2358 | 2822 | * new_bfqq now belongs to at least two bics (it is a shared queue): |
---|
.. | .. |
---|
2365 | 2829 | * assignment causes no harm). |
---|
2366 | 2830 | */ |
---|
2367 | 2831 | new_bfqq->bic = NULL; |
---|
| 2832 | + /* |
---|
| 2833 | + * If the queue is shared, the pid is the pid of one of the associated |
---|
| 2834 | + * processes. Which pid depends on the exact sequence of merge events |
---|
| 2835 | + * the queue underwent. So printing such a pid is useless and confusing |
---|
| 2836 | + * because it reports a random pid among those of the associated
---|
| 2837 | + * processes. |
---|
| 2838 | + * We mark such a queue with a pid -1, and then print SHARED instead of |
---|
| 2839 | + * a pid in logging messages. |
---|
| 2840 | + */ |
---|
| 2841 | + new_bfqq->pid = -1; |
---|
2368 | 2842 | bfqq->bic = NULL; |
---|
2369 | | - /* release process reference to bfqq */ |
---|
2370 | | - bfq_put_queue(bfqq); |
---|
| 2843 | + bfq_release_process_ref(bfqd, bfqq); |
---|
2371 | 2844 | } |
---|
2372 | 2845 | |
---|
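A minimal sketch of the logging convention described above (print SHARED instead of a pid when pid == -1); the helper name and buffer handling are assumptions, not the scheduler's actual logging macro:

	static void example_bfqq_name(struct bfq_queue *bfqq, char *buf, int len)
	{
		if (bfqq->pid != -1)
			snprintf(buf, len, "bfq%d%c", bfqq->pid,
				 bfq_bfqq_sync(bfqq) ? 'S' : 'A');
		else
			snprintf(buf, len, "bfqSHARED-%c",
				 bfq_bfqq_sync(bfqq) ? 'S' : 'A');
	}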
2373 | 2846 | static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, |
---|
.. | .. |
---|
2399 | 2872 | /* |
---|
2400 | 2873 | * bic still points to bfqq, then it has not yet been |
---|
2401 | 2874 | * redirected to some other bfq_queue, and a queue |
---|
2402 | | - * merge beween bfqq and new_bfqq can be safely |
---|
2403 | | - * fulfillled, i.e., bic can be redirected to new_bfqq |
---|
| 2875 | + * merge between bfqq and new_bfqq can be safely |
---|
| 2876 | + * fulfilled, i.e., bic can be redirected to new_bfqq |
---|
2404 | 2877 | * and bfqq can be put. |
---|
2405 | 2878 | */ |
---|
2406 | 2879 | bfq_merge_bfqqs(bfqd, bfqd->bio_bic, bfqq, |
---|
.. | .. |
---|
2535 | 3008 | * queue). |
---|
2536 | 3009 | */ |
---|
2537 | 3010 | if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 && |
---|
2538 | | - bfq_symmetric_scenario(bfqd)) |
---|
| 3011 | + !bfq_asymmetric_scenario(bfqd, bfqq)) |
---|
2539 | 3012 | sl = min_t(u64, sl, BFQ_MIN_TT); |
---|
2540 | 3013 | else if (bfqq->wr_coeff > 1) |
---|
2541 | 3014 | sl = max_t(u32, sl, 20ULL * NSEC_PER_MSEC); |
---|
2542 | 3015 | |
---|
2543 | 3016 | bfqd->last_idling_start = ktime_get(); |
---|
| 3017 | + bfqd->last_idling_start_jiffies = jiffies; |
---|
| 3018 | + |
---|
2544 | 3019 | hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl), |
---|
2545 | 3020 | HRTIMER_MODE_REL); |
---|
2546 | 3021 | bfqg_stats_set_start_idle_time(bfqq_group(bfqq)); |
---|
.. | .. |
---|
2764 | 3239 | |
---|
2765 | 3240 | if ((bfqd->rq_in_driver > 0 || |
---|
2766 | 3241 | now_ns - bfqd->last_completion < BFQ_MIN_TT) |
---|
2767 | | - && get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR) |
---|
| 3242 | + && !BFQ_RQ_SEEKY(bfqd, bfqd->last_position, rq)) |
---|
2768 | 3243 | bfqd->sequential_samples++; |
---|
2769 | 3244 | |
---|
2770 | 3245 | bfqd->tot_sectors_dispatched += blk_rq_sectors(rq); |
---|
.. | .. |
---|
2816 | 3291 | bfq_remove_request(q, rq); |
---|
2817 | 3292 | } |
---|
2818 | 3293 | |
---|
2819 | | -static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
---|
| 3294 | +/* |
---|
| 3295 | + * There is a case where idling does not have to be performed for |
---|
| 3296 | + * throughput concerns, but to preserve the throughput share of |
---|
| 3297 | + * the process associated with bfqq. |
---|
| 3298 | + * |
---|
| 3299 | + * To introduce this case, we can note that allowing the drive |
---|
| 3300 | + * to enqueue more than one request at a time, and hence |
---|
| 3301 | + * delegating de facto final scheduling decisions to the |
---|
| 3302 | + * drive's internal scheduler, entails loss of control on the |
---|
| 3303 | + * actual request service order. In particular, the critical |
---|
| 3304 | + * situation is when requests from different processes happen |
---|
| 3305 | + * to be present, at the same time, in the internal queue(s) |
---|
| 3306 | + * of the drive. In such a situation, the drive, by deciding |
---|
| 3307 | + * the service order of the internally-queued requests, does |
---|
| 3308 | + * determine also the actual throughput distribution among |
---|
| 3309 | + * these processes. But the drive typically has no notion or |
---|
| 3310 | + * concern about per-process throughput distribution, and |
---|
| 3311 | + * makes its decisions only on a per-request basis. Therefore, |
---|
| 3312 | + * the service distribution enforced by the drive's internal |
---|
| 3313 | + * scheduler is likely to coincide with the desired throughput |
---|
| 3314 | + * distribution only in a completely symmetric, or favorably |
---|
| 3315 | + * skewed scenario where: |
---|
| 3316 | + * (i-a) each of these processes must get the same throughput as |
---|
| 3317 | + * the others, |
---|
| 3318 | + * (i-b) in case (i-a) does not hold, it holds that the process |
---|
| 3319 | + * associated with bfqq must receive a lower or equal |
---|
| 3320 | + * throughput than any of the other processes; |
---|
| 3321 | + * (ii) the I/O of each process has the same properties, in |
---|
| 3322 | + * terms of locality (sequential or random), direction |
---|
| 3323 | + * (reads or writes), request sizes, greediness |
---|
| 3324 | + * (from I/O-bound to sporadic), and so on; |
---|
| 3325 | + *
---|
| 3326 | + * In fact, in such a scenario, the drive tends to treat the requests |
---|
| 3327 | + * of each process in about the same way as the requests of the |
---|
| 3328 | + * others, and thus to provide each of these processes with about the |
---|
| 3329 | + * same throughput. This is exactly the desired throughput |
---|
| 3330 | + * distribution if (i-a) holds, or, if (i-b) holds instead, this is an |
---|
| 3331 | + * even more convenient distribution for (the process associated with) |
---|
| 3332 | + * bfqq. |
---|
| 3333 | + * |
---|
| 3334 | + * In contrast, in any asymmetric or unfavorable scenario, device |
---|
| 3335 | + * idling (I/O-dispatch plugging) is certainly needed to guarantee |
---|
| 3336 | + * that bfqq receives its assigned fraction of the device throughput |
---|
| 3337 | + * (see [1] for details). |
---|
| 3338 | + * |
---|
| 3339 | + * The problem is that idling may significantly reduce throughput with |
---|
| 3340 | + * certain combinations of types of I/O and devices. An important |
---|
| 3341 | + * example is sync random I/O on flash storage with command |
---|
| 3342 | + * queueing. So, unless bfqq falls in cases where idling also boosts |
---|
| 3343 | + * throughput, it is important to check conditions (i-a), (i-b) and
---|
| 3344 | + * (ii) accurately, so as to avoid idling when not strictly needed for |
---|
| 3345 | + * service guarantees. |
---|
| 3346 | + * |
---|
| 3347 | + * Unfortunately, it is extremely difficult to thoroughly check |
---|
| 3348 | + * condition (ii). And, in case there are active groups, it becomes |
---|
| 3349 | + * very difficult to check conditions (i-a) and (i-b) too. In fact, |
---|
| 3350 | + * if there are active groups, then, for conditions (i-a) or (i-b) to |
---|
| 3351 | + * become false 'indirectly', it is enough that an active group |
---|
| 3352 | + * contains more active processes or sub-groups than some other active |
---|
| 3353 | + * group. More precisely, for conditions (i-a) or (i-b) to become |
---|
| 3354 | + * false because of such a group, it is not even necessary that the |
---|
| 3355 | + * group is (still) active: it is sufficient that, even if the group |
---|
| 3356 | + * has become inactive, some of its descendant processes still have |
---|
| 3357 | + * some request already dispatched but still waiting for |
---|
| 3358 | + * completion. In fact, requests have still to be guaranteed their |
---|
| 3359 | + * share of the throughput even after being dispatched. In this |
---|
| 3360 | + * respect, it is easy to show that, if a group frequently becomes |
---|
| 3361 | + * inactive while still having in-flight requests, and if, when this |
---|
| 3362 | + * happens, the group is not considered in the calculation of whether |
---|
| 3363 | + * the scenario is asymmetric, then the group may fail to be |
---|
| 3364 | + * guaranteed its fair share of the throughput (basically because |
---|
| 3365 | + * idling may not be performed for the descendant processes of the |
---|
| 3366 | + * group, but it had to be). We address this issue with the following |
---|
| 3367 | + * bi-modal behavior, implemented in the function |
---|
| 3368 | + * bfq_asymmetric_scenario(). |
---|
| 3369 | + * |
---|
| 3370 | + * If there are groups with requests waiting for completion |
---|
| 3371 | + * (as commented above, some of these groups may even be |
---|
| 3372 | + * already inactive), then the scenario is tagged as |
---|
| 3373 | + * asymmetric, conservatively, without checking any of the |
---|
| 3374 | + * conditions (i-a), (i-b) or (ii). So the device is idled for bfqq. |
---|
| 3375 | + * This behavior matches also the fact that groups are created |
---|
| 3376 | + * exactly if controlling I/O is a primary concern (to |
---|
| 3377 | + * preserve bandwidth and latency guarantees). |
---|
| 3378 | + * |
---|
| 3379 | + * On the opposite end, if there are no groups with requests waiting |
---|
| 3380 | + * for completion, then only conditions (i-a) and (i-b) are actually |
---|
| 3381 | + * controlled, i.e., provided that conditions (i-a) or (i-b) holds, |
---|
| 3382 | + * idling is not performed, regardless of whether condition (ii) |
---|
| 3383 | + * holds. In other words, only if conditions (i-a) and (i-b) do not |
---|
| 3384 | + * hold, then idling is allowed, and the device tends to be prevented |
---|
| 3385 | + * from queueing many requests, possibly of several processes. Since |
---|
| 3386 | + * there are no groups with requests waiting for completion, then, to |
---|
| 3387 | + * control conditions (i-a) and (i-b) it is enough to check just |
---|
| 3388 | + * whether all the queues with requests waiting for completion also |
---|
| 3389 | + * have the same weight. |
---|
| 3390 | + * |
---|
| 3391 | + * Not checking condition (ii) evidently exposes bfqq to the |
---|
| 3392 | + * risk of getting less throughput than its fair share. |
---|
| 3393 | + * However, for queues with the same weight, a further |
---|
| 3394 | + * mechanism, preemption, mitigates or even eliminates this |
---|
| 3395 | + * problem. And it does so without consequences on overall |
---|
| 3396 | + * throughput. This mechanism and its benefits are explained |
---|
| 3397 | + * in the next three paragraphs. |
---|
| 3398 | + * |
---|
| 3399 | + * Even if a queue, say Q, is expired when it remains idle, Q |
---|
| 3400 | + * can still preempt the new in-service queue if the next |
---|
| 3401 | + * request of Q arrives soon (see the comments on |
---|
| 3402 | + * bfq_bfqq_update_budg_for_activation). If all queues and |
---|
| 3403 | + * groups have the same weight, this form of preemption, |
---|
| 3404 | + * combined with the hole-recovery heuristic described in the |
---|
| 3405 | + * comments on function bfq_bfqq_update_budg_for_activation, |
---|
| 3406 | + * is enough to preserve a correct bandwidth distribution in
---|
| 3407 | + * the mid term, even without idling. In fact, even if not |
---|
| 3408 | + * idling allows the internal queues of the device to contain |
---|
| 3409 | + * many requests, and thus to reorder requests, we can rather |
---|
| 3410 | + * safely assume that the internal scheduler still preserves a |
---|
| 3411 | + * minimum of mid-term fairness. |
---|
| 3412 | + * |
---|
| 3413 | + * More precisely, this preemption-based, idleless approach |
---|
| 3414 | + * provides fairness in terms of IOPS, and not sectors per |
---|
| 3415 | + * second. This can be seen with a simple example. Suppose |
---|
| 3416 | + * that there are two queues with the same weight, but that |
---|
| 3417 | + * the first queue receives requests of 8 sectors, while the |
---|
| 3418 | + * second queue receives requests of 1024 sectors. In |
---|
| 3419 | + * addition, suppose that each of the two queues contains at |
---|
| 3420 | + * most one request at a time, which implies that each queue |
---|
| 3421 | + * always remains idle after it is served. Finally, after |
---|
| 3422 | + * remaining idle, each queue receives very quickly a new |
---|
| 3423 | + * request. It follows that the two queues are served |
---|
| 3424 | + * alternatively, preempting each other if needed. This |
---|
| 3425 | + * implies that, although both queues have the same weight, |
---|
| 3426 | + * the queue with large requests receives a service that is |
---|
| 3427 | + * 1024/8 times as high as the service received by the other |
---|
| 3428 | + * queue. |
---|
| 3429 | + * |
---|
| 3430 | + * The motivation for using preemption instead of idling (for |
---|
| 3431 | + * queues with the same weight) is that, by not idling, |
---|
| 3432 | + * service guarantees are preserved (completely or at least in |
---|
| 3433 | + * part) without minimally sacrificing throughput. And, if |
---|
| 3434 | + * there is no active group, then the primary expectation for |
---|
| 3435 | + * this device is probably a high throughput. |
---|
| 3436 | + * |
---|
| 3437 | + * We are now left only with explaining the two sub-conditions in the |
---|
| 3438 | + * additional compound condition that is checked below for deciding |
---|
| 3439 | + * whether the scenario is asymmetric. To explain the first |
---|
| 3440 | + * sub-condition, we need to add that the function |
---|
| 3441 | + * bfq_asymmetric_scenario checks the weights of only |
---|
| 3442 | + * non-weight-raised queues, for efficiency reasons (see comments on |
---|
| 3443 | + * bfq_weights_tree_add()). Then the fact that bfqq is weight-raised |
---|
| 3444 | + * is checked explicitly here. More precisely, the compound condition |
---|
| 3445 | + * below takes into account also the fact that, even if bfqq is being |
---|
| 3446 | + * weight-raised, the scenario is still symmetric if all queues with |
---|
| 3447 | + * requests waiting for completion happen to be |
---|
| 3448 | + * weight-raised. Actually, we should be even more precise here, and |
---|
| 3449 | + * differentiate between interactive weight raising and soft real-time |
---|
| 3450 | + * weight raising. |
---|
| 3451 | + * |
---|
| 3452 | + * The second sub-condition checked in the compound condition is |
---|
| 3453 | + * whether there is a fair amount of already in-flight I/O not |
---|
| 3454 | + * belonging to bfqq. If so, I/O dispatching is to be plugged, for the |
---|
| 3455 | + * following reason. The drive may decide to serve in-flight |
---|
| 3456 | + * non-bfqq's I/O requests before bfqq's ones, thereby delaying the |
---|
| 3457 | + * arrival of new I/O requests for bfqq (recall that bfqq is sync). If |
---|
| 3458 | + * I/O-dispatching is not plugged, then, while bfqq remains empty, a |
---|
| 3459 | + * basically uncontrolled amount of I/O from other queues may be |
---|
| 3460 | + * dispatched too, possibly causing the service of bfqq's I/O to be |
---|
| 3461 | + * delayed even longer in the drive. This problem gets more and more |
---|
| 3462 | + * serious as the speed and the queue depth of the drive grow, |
---|
| 3463 | + * because, as these two quantities grow, the probability of finding no
---|
| 3464 | + * queue busy but many requests in flight grows too. By contrast, |
---|
| 3465 | + * plugging I/O dispatching minimizes the delay induced by already |
---|
| 3466 | + * in-flight I/O, and enables bfqq to recover the bandwidth it may |
---|
| 3467 | + * lose because of this delay. |
---|
| 3468 | + * |
---|
| 3469 | + * As a side note, it is worth considering that the above |
---|
| 3470 | + * device-idling countermeasures may however fail in the following |
---|
| 3471 | + * unlucky scenario: if I/O-dispatch plugging is (correctly) disabled |
---|
| 3472 | + * in a time period during which all symmetry sub-conditions hold, and |
---|
| 3473 | + * therefore the device is allowed to enqueue many requests, but at |
---|
| 3474 | + * some later point in time some sub-condition stops holding, then it
---|
| 3475 | + * may become impossible to make requests be served in the desired |
---|
| 3476 | + * order until all the requests already queued in the device have been |
---|
| 3477 | + * served. The last sub-condition commented above somewhat mitigates |
---|
| 3478 | + * this problem for weight-raised queues. |
---|
| 3479 | + */ |
---|
| 3480 | +static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd, |
---|
| 3481 | + struct bfq_queue *bfqq) |
---|
| 3482 | +{ |
---|
| 3483 | + /* No point in idling for bfqq if it won't get requests any longer */ |
---|
| 3484 | + if (unlikely(!bfqq_process_refs(bfqq))) |
---|
| 3485 | + return false; |
---|
| 3486 | + |
---|
| 3487 | + return (bfqq->wr_coeff > 1 && |
---|
| 3488 | + (bfqd->wr_busy_queues < |
---|
| 3489 | + bfq_tot_busy_queues(bfqd) || |
---|
| 3490 | + bfqd->rq_in_driver >= |
---|
| 3491 | + bfqq->dispatched + 4)) || |
---|
| 3492 | + bfq_asymmetric_scenario(bfqd, bfqq); |
---|
| 3493 | +} |
---|
| 3494 | + |
---|
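For readability, the compound condition returned by idling_needed_for_service_guarantees() can be decomposed into named sub-conditions. The sketch below is an equivalent reformulation for illustration only, not a proposed change:

	static bool idling_needed_sketch(struct bfq_data *bfqd,
					 struct bfq_queue *bfqq)
	{
		/* some busy queue is not weight-raised */
		bool not_all_busy_queues_wr =
			bfqd->wr_busy_queues < bfq_tot_busy_queues(bfqd);
		/* a fair amount of non-bfqq I/O is already in flight */
		bool much_other_io_in_flight =
			bfqd->rq_in_driver >= bfqq->dispatched + 4;

		if (unlikely(!bfqq_process_refs(bfqq)))
			return false;

		return (bfqq->wr_coeff > 1 &&
			(not_all_busy_queues_wr || much_other_io_in_flight)) ||
		       bfq_asymmetric_scenario(bfqd, bfqq);
	}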
| 3495 | +static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
---|
| 3496 | + enum bfqq_expiration reason) |
---|
2820 | 3497 | { |
---|
2821 | 3498 | /* |
---|
2822 | 3499 | * If this bfqq is shared between multiple processes, check |
---|
.. | .. |
---|
2827 | 3504 | if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq)) |
---|
2828 | 3505 | bfq_mark_bfqq_split_coop(bfqq); |
---|
2829 | 3506 | |
---|
2830 | | - if (RB_EMPTY_ROOT(&bfqq->sort_list)) { |
---|
| 3507 | + /* |
---|
| 3508 | + * Consider queues with a higher finish virtual time than |
---|
| 3509 | + * bfqq. If idling_needed_for_service_guarantees(bfqq) returns |
---|
| 3510 | + * true, then bfqq's bandwidth would be violated if an |
---|
| 3511 | + * uncontrolled amount of I/O from these queues were |
---|
| 3512 | + * dispatched while bfqq is waiting for its new I/O to |
---|
| 3513 | + * arrive. This is exactly what may happen if this is a forced |
---|
| 3514 | + * expiration caused by a preemption attempt, and if bfqq is |
---|
| 3515 | + * not re-scheduled. To prevent this from happening, re-queue |
---|
| 3516 | + * bfqq if it needs I/O-dispatch plugging, even if it is |
---|
| 3517 | + * empty. By doing so, bfqq is guaranteed to be served before the
---|
| 3518 | + * above queues (provided that bfqq is of course eligible). |
---|
| 3519 | + */ |
---|
| 3520 | + if (RB_EMPTY_ROOT(&bfqq->sort_list) && |
---|
| 3521 | + !(reason == BFQQE_PREEMPTED && |
---|
| 3522 | + idling_needed_for_service_guarantees(bfqd, bfqq))) { |
---|
2831 | 3523 | if (bfqq->dispatched == 0) |
---|
2832 | 3524 | /* |
---|
2833 | 3525 | * Overloading budget_timeout field to store |
---|
.. | .. |
---|
2842 | 3534 | bfq_requeue_bfqq(bfqd, bfqq, true); |
---|
2843 | 3535 | /* |
---|
2844 | 3536 | * Resort priority tree of potential close cooperators. |
---|
| 3537 | + * See comments on bfq_pos_tree_add_move() for the unlikely(). |
---|
2845 | 3538 | */ |
---|
2846 | | - bfq_pos_tree_add_move(bfqd, bfqq); |
---|
| 3539 | + if (unlikely(!bfqd->nonrot_with_queueing && |
---|
| 3540 | + !RB_EMPTY_ROOT(&bfqq->sort_list))) |
---|
| 3541 | + bfq_pos_tree_add_move(bfqd, bfqq); |
---|
2847 | 3542 | } |
---|
2848 | 3543 | |
---|
2849 | 3544 | /* |
---|
.. | .. |
---|
3217 | 3912 | jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); |
---|
3218 | 3913 | } |
---|
3219 | 3914 | |
---|
3220 | | -static bool bfq_bfqq_injectable(struct bfq_queue *bfqq) |
---|
3221 | | -{ |
---|
3222 | | - return BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 && |
---|
3223 | | - blk_queue_nonrot(bfqq->bfqd->queue) && |
---|
3224 | | - bfqq->bfqd->hw_tag; |
---|
3225 | | -} |
---|
3226 | | - |
---|
3227 | 3915 | /** |
---|
3228 | 3916 | * bfq_bfqq_expire - expire a queue. |
---|
3229 | 3917 | * @bfqd: device owning the queue. |
---|
.. | .. |
---|
3299 | 3987 | * requests, then the request pattern is isochronous |
---|
3300 | 3988 | * (see the comments on the function |
---|
3301 | 3989 | * bfq_bfqq_softrt_next_start()). Thus we can compute |
---|
3302 | | - * soft_rt_next_start. If, instead, the queue still |
---|
3303 | | - * has outstanding requests, then we have to wait for |
---|
3304 | | - * the completion of all the outstanding requests to |
---|
3305 | | - * discover whether the request pattern is actually |
---|
3306 | | - * isochronous. |
---|
| 3990 | + * soft_rt_next_start. And we do it, unless bfqq is in |
---|
| 3991 | + * interactive weight raising. We do not do it in the |
---|
| 3992 | + * latter subcase, for the following reason. bfqq may |
---|
| 3993 | + * be conveying the I/O needed to load a soft |
---|
| 3994 | + * real-time application. Such an application will |
---|
| 3995 | + * actually exhibit a soft real-time I/O pattern after |
---|
| 3996 | + * it finally starts doing its job. But, if |
---|
| 3997 | + * soft_rt_next_start is computed here for an |
---|
| 3998 | + * interactive bfqq, and bfqq had received a lot of |
---|
| 3999 | + * service before remaining with no outstanding |
---|
| 4000 | + * request (likely to happen on a fast device), then |
---|
| 4001 | + * soft_rt_next_start would be assigned such a high |
---|
| 4002 | + * value that, for a very long time, bfqq would be |
---|
| 4003 | + * prevented from being possibly considered as soft |
---|
| 4004 | + * real time. |
---|
| 4005 | + * |
---|
| 4006 | + * If, instead, the queue still has outstanding |
---|
| 4007 | + * requests, then we have to wait for the completion |
---|
| 4008 | + * of all the outstanding requests to discover whether |
---|
| 4009 | + * the request pattern is actually isochronous. |
---|
3307 | 4010 | */ |
---|
3308 | | - if (bfqq->dispatched == 0) |
---|
| 4011 | + if (bfqq->dispatched == 0 && |
---|
| 4012 | + bfqq->wr_coeff != bfqd->bfq_wr_coeff) |
---|
3309 | 4013 | bfqq->soft_rt_next_start = |
---|
3310 | 4014 | bfq_bfqq_softrt_next_start(bfqd, bfqq); |
---|
3311 | | - else { |
---|
| 4015 | + else if (bfqq->dispatched > 0) { |
---|
3312 | 4016 | /* |
---|
3313 | 4017 | * Schedule an update of soft_rt_next_start to when |
---|
3314 | 4018 | * the task may be discovered to be isochronous. |
---|
.. | .. |
---|
3322 | 4026 | slow, bfqq->dispatched, bfq_bfqq_has_short_ttime(bfqq)); |
---|
3323 | 4027 | |
---|
3324 | 4028 | /* |
---|
| 4029 | + * bfqq expired, so no total service time needs to be computed |
---|
| 4030 | + * any longer: reset state machine for measuring total service |
---|
| 4031 | + * times. |
---|
| 4032 | + */ |
---|
| 4033 | + bfqd->rqs_injected = bfqd->wait_dispatch = false; |
---|
| 4034 | + bfqd->waited_rq = NULL; |
---|
| 4035 | + |
---|
| 4036 | + /* |
---|
3325 | 4037 | * Increase, decrease or leave budget unchanged according to |
---|
3326 | 4038 | * reason. |
---|
3327 | 4039 | */ |
---|
3328 | 4040 | __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); |
---|
3329 | | - if (__bfq_bfqq_expire(bfqd, bfqq)) |
---|
| 4041 | + if (__bfq_bfqq_expire(bfqd, bfqq, reason)) |
---|
3330 | 4042 | /* bfqq is gone, no more actions on it */ |
---|
3331 | 4043 | return; |
---|
3332 | | - |
---|
3333 | | - bfqq->injected_service = 0; |
---|
3334 | 4044 | |
---|
3335 | 4045 | /* mark bfqq as waiting a request only if a bic still points to it */ |
---|
3336 | 4046 | if (!bfq_bfqq_busy(bfqq) && |
---|
.. | .. |
---|
3399 | 4109 | bfq_bfqq_budget_timeout(bfqq); |
---|
3400 | 4110 | } |
---|
3401 | 4111 | |
---|
3402 | | -/* |
---|
3403 | | - * For a queue that becomes empty, device idling is allowed only if |
---|
3404 | | - * this function returns true for the queue. As a consequence, since |
---|
3405 | | - * device idling plays a critical role in both throughput boosting and |
---|
3406 | | - * service guarantees, the return value of this function plays a |
---|
3407 | | - * critical role in both these aspects as well. |
---|
3408 | | - * |
---|
3409 | | - * In a nutshell, this function returns true only if idling is |
---|
3410 | | - * beneficial for throughput or, even if detrimental for throughput, |
---|
3411 | | - * idling is however necessary to preserve service guarantees (low |
---|
3412 | | - * latency, desired throughput distribution, ...). In particular, on |
---|
3413 | | - * NCQ-capable devices, this function tries to return false, so as to |
---|
3414 | | - * help keep the drives' internal queues full, whenever this helps the |
---|
3415 | | - * device boost the throughput without causing any service-guarantee |
---|
3416 | | - * issue. |
---|
3417 | | - * |
---|
3418 | | - * In more detail, the return value of this function is obtained by, |
---|
3419 | | - * first, computing a number of boolean variables that take into |
---|
3420 | | - * account throughput and service-guarantee issues, and, then, |
---|
3421 | | - * combining these variables in a logical expression. Most of the |
---|
3422 | | - * issues taken into account are not trivial. We discuss these issues |
---|
3423 | | - * individually while introducing the variables. |
---|
3424 | | - */ |
---|
3425 | | -static bool bfq_better_to_idle(struct bfq_queue *bfqq) |
---|
| 4112 | +static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd, |
---|
| 4113 | + struct bfq_queue *bfqq) |
---|
3426 | 4114 | { |
---|
3427 | | - struct bfq_data *bfqd = bfqq->bfqd; |
---|
3428 | 4115 | bool rot_without_queueing = |
---|
3429 | 4116 | !blk_queue_nonrot(bfqd->queue) && !bfqd->hw_tag, |
---|
3430 | 4117 | bfqq_sequential_and_IO_bound, |
---|
3431 | | - idling_boosts_thr, idling_boosts_thr_without_issues, |
---|
3432 | | - idling_needed_for_service_guarantees, |
---|
3433 | | - asymmetric_scenario; |
---|
| 4118 | + idling_boosts_thr; |
---|
3434 | 4119 | |
---|
3435 | | - if (bfqd->strict_guarantees) |
---|
3436 | | - return true; |
---|
3437 | | - |
---|
3438 | | - /* |
---|
3439 | | - * Idling is performed only if slice_idle > 0. In addition, we |
---|
3440 | | - * do not idle if |
---|
3441 | | - * (a) bfqq is async |
---|
3442 | | - * (b) bfqq is in the idle io prio class: in this case we do |
---|
3443 | | - * not idle because we want to minimize the bandwidth that |
---|
3444 | | - * queues in this class can steal to higher-priority queues |
---|
3445 | | - */ |
---|
3446 | | - if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_sync(bfqq) || |
---|
3447 | | - bfq_class_idle(bfqq)) |
---|
| 4120 | + /* No point in idling for bfqq if it won't get requests any longer */ |
---|
| 4121 | + if (unlikely(!bfqq_process_refs(bfqq))) |
---|
3448 | 4122 | return false; |
---|
3449 | 4123 | |
---|
3450 | 4124 | bfqq_sequential_and_IO_bound = !BFQQ_SEEKY(bfqq) && |
---|
.. | .. |
---|
3477 | 4151 | bfqq_sequential_and_IO_bound); |
---|
3478 | 4152 | |
---|
3479 | 4153 | /* |
---|
3480 | | - * The value of the next variable, |
---|
3481 | | - * idling_boosts_thr_without_issues, is equal to that of |
---|
| 4154 | + * The return value of this function is equal to that of |
---|
3482 | 4155 | * idling_boosts_thr, unless a special case holds. In this |
---|
3483 | 4156 | * special case, described below, idling may cause problems to |
---|
3484 | 4157 | * weight-raised queues. |
---|
.. | .. |
---|
3495 | 4168 | * which enqueue several requests in advance, and further |
---|
3496 | 4169 | * reorder internally-queued requests. |
---|
3497 | 4170 | * |
---|
3498 | | - * For this reason, we force to false the value of |
---|
3499 | | - * idling_boosts_thr_without_issues if there are weight-raised |
---|
3500 | | - * busy queues. In this case, and if bfqq is not weight-raised, |
---|
3501 | | - * this guarantees that the device is not idled for bfqq (if, |
---|
3502 | | - * instead, bfqq is weight-raised, then idling will be |
---|
3503 | | - * guaranteed by another variable, see below). Combined with |
---|
3504 | | - * the timestamping rules of BFQ (see [1] for details), this |
---|
3505 | | - * behavior causes bfqq, and hence any sync non-weight-raised |
---|
3506 | | - * queue, to get a lower number of requests served, and thus |
---|
3507 | | - * to ask for a lower number of requests from the request |
---|
3508 | | - * pool, before the busy weight-raised queues get served |
---|
3509 | | - * again. This often mitigates starvation problems in the |
---|
3510 | | - * presence of heavy write workloads and NCQ, thereby |
---|
3511 | | - * guaranteeing a higher application and system responsiveness |
---|
3512 | | - * in these hostile scenarios. |
---|
| 4171 | + * For this reason, we force to false the return value if |
---|
| 4172 | + * there are weight-raised busy queues. In this case, and if |
---|
| 4173 | + * bfqq is not weight-raised, this guarantees that the device |
---|
| 4174 | + * is not idled for bfqq (if, instead, bfqq is weight-raised, |
---|
| 4175 | + * then idling will be guaranteed by another variable, see |
---|
| 4176 | + * below). Combined with the timestamping rules of BFQ (see |
---|
| 4177 | + * [1] for details), this behavior causes bfqq, and hence any |
---|
| 4178 | + * sync non-weight-raised queue, to get a lower number of |
---|
| 4179 | + * requests served, and thus to ask for a lower number of |
---|
| 4180 | + * requests from the request pool, before the busy |
---|
| 4181 | + * weight-raised queues get served again. This often mitigates |
---|
| 4182 | + * starvation problems in the presence of heavy write |
---|
| 4183 | + * workloads and NCQ, thereby guaranteeing a higher |
---|
| 4184 | + * application and system responsiveness in these hostile |
---|
| 4185 | + * scenarios. |
---|
3513 | 4186 | */ |
---|
3514 | | - idling_boosts_thr_without_issues = idling_boosts_thr && |
---|
| 4187 | + return idling_boosts_thr && |
---|
3515 | 4188 | bfqd->wr_busy_queues == 0; |
---|
| 4189 | +} |
---|
| 4190 | + |
---|
| 4191 | +/* |
---|
| 4192 | + * For a queue that becomes empty, device idling is allowed only if |
---|
| 4193 | + * this function returns true for that queue. As a consequence, since |
---|
| 4194 | + * device idling plays a critical role for both throughput boosting |
---|
| 4195 | + * and service guarantees, the return value of this function plays a |
---|
| 4196 | + * critical role as well. |
---|
| 4197 | + * |
---|
| 4198 | + * In a nutshell, this function returns true only if idling is |
---|
| 4199 | + * beneficial for throughput or, even if detrimental for throughput, |
---|
| 4200 | + * idling is however necessary to preserve service guarantees (low |
---|
| 4201 | + * latency, desired throughput distribution, ...). In particular, on |
---|
| 4202 | + * NCQ-capable devices, this function tries to return false, so as to |
---|
| 4203 | + * help keep the drives' internal queues full, whenever this helps the |
---|
| 4204 | + * device boost the throughput without causing any service-guarantee |
---|
| 4205 | + * issue. |
---|
| 4206 | + * |
---|
| 4207 | + * Most of the issues taken into account to get the return value of |
---|
| 4208 | + * this function are not trivial. We discuss these issues in the two |
---|
| 4209 | + * functions providing the main pieces of information needed by this |
---|
| 4210 | + * function. |
---|
| 4211 | + */ |
---|
| 4212 | +static bool bfq_better_to_idle(struct bfq_queue *bfqq) |
---|
| 4213 | +{ |
---|
| 4214 | + struct bfq_data *bfqd = bfqq->bfqd; |
---|
| 4215 | + bool idling_boosts_thr_with_no_issue, idling_needed_for_service_guar; |
---|
| 4216 | + |
---|
| 4217 | + /* No point in idling for bfqq if it won't get requests any longer */ |
---|
| 4218 | + if (unlikely(!bfqq_process_refs(bfqq))) |
---|
| 4219 | + return false; |
---|
| 4220 | + |
---|
| 4221 | + if (unlikely(bfqd->strict_guarantees)) |
---|
| 4222 | + return true; |
---|
3516 | 4223 | |
---|
3517 | 4224 | /* |
---|
3518 | | - * There is then a case where idling must be performed not |
---|
3519 | | - * for throughput concerns, but to preserve service |
---|
3520 | | - * guarantees. |
---|
3521 | | - * |
---|
3522 | | - * To introduce this case, we can note that allowing the drive |
---|
3523 | | - * to enqueue more than one request at a time, and hence |
---|
3524 | | - * delegating de facto final scheduling decisions to the |
---|
3525 | | - * drive's internal scheduler, entails loss of control on the |
---|
3526 | | - * actual request service order. In particular, the critical |
---|
3527 | | - * situation is when requests from different processes happen |
---|
3528 | | - * to be present, at the same time, in the internal queue(s) |
---|
3529 | | - * of the drive. In such a situation, the drive, by deciding |
---|
3530 | | - * the service order of the internally-queued requests, does |
---|
3531 | | - * determine also the actual throughput distribution among |
---|
3532 | | - * these processes. But the drive typically has no notion or |
---|
3533 | | - * concern about per-process throughput distribution, and |
---|
3534 | | - * makes its decisions only on a per-request basis. Therefore, |
---|
3535 | | - * the service distribution enforced by the drive's internal |
---|
3536 | | - * scheduler is likely to coincide with the desired |
---|
3537 | | - * device-throughput distribution only in a completely |
---|
3538 | | - * symmetric scenario where: |
---|
3539 | | - * (i) each of these processes must get the same throughput as |
---|
3540 | | - * the others; |
---|
3541 | | - * (ii) the I/O of each process has the same properties, in |
---|
3542 | | - * terms of locality (sequential or random), direction |
---|
3543 | | - * (reads or writes), request sizes, greediness |
---|
3544 | | - * (from I/O-bound to sporadic), and so on. |
---|
3545 | | - * In fact, in such a scenario, the drive tends to treat |
---|
3546 | | - * the requests of each of these processes in about the same |
---|
3547 | | - * way as the requests of the others, and thus to provide |
---|
3548 | | - * each of these processes with about the same throughput |
---|
3549 | | - * (which is exactly the desired throughput distribution). In |
---|
3550 | | - * contrast, in any asymmetric scenario, device idling is |
---|
3551 | | - * certainly needed to guarantee that bfqq receives its |
---|
3552 | | - * assigned fraction of the device throughput (see [1] for |
---|
3553 | | - * details). |
---|
3554 | | - * The problem is that idling may significantly reduce |
---|
3555 | | - * throughput with certain combinations of types of I/O and |
---|
3556 | | - * devices. An important example is sync random I/O, on flash |
---|
3557 | | - * storage with command queueing. So, unless bfqq falls in the |
---|
3558 | | - * above cases where idling also boosts throughput, it would |
---|
3559 | | - * be important to check conditions (i) and (ii) accurately, |
---|
3560 | | - * so as to avoid idling when not strictly needed for service |
---|
3561 | | - * guarantees. |
---|
3562 | | - * |
---|
3563 | | - * Unfortunately, it is extremely difficult to thoroughly |
---|
3564 | | - * check condition (ii). And, in case there are active groups, |
---|
3565 | | - * it becomes very difficult to check condition (i) too. In |
---|
3566 | | - * fact, if there are active groups, then, for condition (i) |
---|
3567 | | - * to become false, it is enough that an active group contains |
---|
3568 | | - * more active processes or sub-groups than some other active |
---|
3569 | | - * group. More precisely, for condition (i) to hold because of |
---|
3570 | | - * such a group, it is not even necessary that the group is |
---|
3571 | | - * (still) active: it is sufficient that, even if the group |
---|
3572 | | - * has become inactive, some of its descendant processes still |
---|
3573 | | - * have some request already dispatched but still waiting for |
---|
3574 | | - * completion. In fact, requests have still to be guaranteed |
---|
3575 | | - * their share of the throughput even after being |
---|
3576 | | - * dispatched. In this respect, it is easy to show that, if a |
---|
3577 | | - * group frequently becomes inactive while still having |
---|
3578 | | - * in-flight requests, and if, when this happens, the group is |
---|
3579 | | - * not considered in the calculation of whether the scenario |
---|
3580 | | - * is asymmetric, then the group may fail to be guaranteed its |
---|
3581 | | - * fair share of the throughput (basically because idling may |
---|
3582 | | - * not be performed for the descendant processes of the group, |
---|
3583 | | - * but it had to be). We address this issue with the |
---|
3584 | | - * following bi-modal behavior, implemented in the function |
---|
3585 | | - * bfq_symmetric_scenario(). |
---|
3586 | | - * |
---|
3587 | | - * If there are groups with requests waiting for completion |
---|
3588 | | - * (as commented above, some of these groups may even be |
---|
3589 | | - * already inactive), then the scenario is tagged as |
---|
3590 | | - * asymmetric, conservatively, without checking any of the |
---|
3591 | | - * conditions (i) and (ii). So the device is idled for bfqq. |
---|
3592 | | - * This behavior matches also the fact that groups are created |
---|
3593 | | - * exactly if controlling I/O is a primary concern (to |
---|
3594 | | - * preserve bandwidth and latency guarantees). |
---|
3595 | | - * |
---|
3596 | | - * On the opposite end, if there are no groups with requests |
---|
3597 | | - * waiting for completion, then only condition (i) is actually |
---|
3598 | | - * controlled, i.e., provided that condition (i) holds, idling |
---|
3599 | | - * is not performed, regardless of whether condition (ii) |
---|
3600 | | - * holds. In other words, only if condition (i) does not hold, |
---|
3601 | | - * then idling is allowed, and the device tends to be |
---|
3602 | | - * prevented from queueing many requests, possibly of several |
---|
3603 | | - * processes. Since there are no groups with requests waiting |
---|
3604 | | - * for completion, then, to control condition (i) it is enough |
---|
3605 | | - * to check just whether all the queues with requests waiting |
---|
3606 | | - * for completion also have the same weight. |
---|
3607 | | - * |
---|
3608 | | - * Not checking condition (ii) evidently exposes bfqq to the |
---|
3609 | | - * risk of getting less throughput than its fair share. |
---|
3610 | | - * However, for queues with the same weight, a further |
---|
3611 | | - * mechanism, preemption, mitigates or even eliminates this |
---|
3612 | | - * problem. And it does so without consequences on overall |
---|
3613 | | - * throughput. This mechanism and its benefits are explained |
---|
3614 | | - * in the next three paragraphs. |
---|
3615 | | - * |
---|
3616 | | - * Even if a queue, say Q, is expired when it remains idle, Q |
---|
3617 | | - * can still preempt the new in-service queue if the next |
---|
3618 | | - * request of Q arrives soon (see the comments on |
---|
3619 | | - * bfq_bfqq_update_budg_for_activation). If all queues and |
---|
3620 | | - * groups have the same weight, this form of preemption, |
---|
3621 | | - * combined with the hole-recovery heuristic described in the |
---|
3622 | | - * comments on function bfq_bfqq_update_budg_for_activation, |
---|
3623 | | - * are enough to preserve a correct bandwidth distribution in |
---|
3624 | | - * the mid term, even without idling. In fact, even if not |
---|
3625 | | - * idling allows the internal queues of the device to contain |
---|
3626 | | - * many requests, and thus to reorder requests, we can rather |
---|
3627 | | - * safely assume that the internal scheduler still preserves a |
---|
3628 | | - * minimum of mid-term fairness. |
---|
3629 | | - * |
---|
3630 | | - * More precisely, this preemption-based, idleless approach |
---|
3631 | | - * provides fairness in terms of IOPS, and not sectors per |
---|
3632 | | - * second. This can be seen with a simple example. Suppose |
---|
3633 | | - * that there are two queues with the same weight, but that |
---|
3634 | | - * the first queue receives requests of 8 sectors, while the |
---|
3635 | | - * second queue receives requests of 1024 sectors. In |
---|
3636 | | - * addition, suppose that each of the two queues contains at |
---|
3637 | | - * most one request at a time, which implies that each queue |
---|
3638 | | - * always remains idle after it is served. Finally, after |
---|
3639 | | - * remaining idle, each queue receives very quickly a new |
---|
3640 | | - * request. It follows that the two queues are served |
---|
3641 | | - * alternatively, preempting each other if needed. This |
---|
3642 | | - * implies that, although both queues have the same weight, |
---|
3643 | | - * the queue with large requests receives a service that is |
---|
3644 | | - * 1024/8 times as high as the service received by the other |
---|
3645 | | - * queue. |
---|
3646 | | - * |
---|
3647 | | - * The motivation for using preemption instead of idling (for |
---|
3648 | | - * queues with the same weight) is that, by not idling, |
---|
3649 | | - * service guarantees are preserved (completely or at least in |
---|
3650 | | - * part) without minimally sacrificing throughput. And, if |
---|
3651 | | - * there is no active group, then the primary expectation for |
---|
3652 | | - * this device is probably a high throughput. |
---|
3653 | | - * |
---|
3654 | | - * We are now left only with explaining the additional |
---|
3655 | | - * compound condition that is checked below for deciding |
---|
3656 | | - * whether the scenario is asymmetric. To explain this |
---|
3657 | | - * compound condition, we need to add that the function |
---|
3658 | | - * bfq_symmetric_scenario checks the weights of only |
---|
3659 | | - * non-weight-raised queues, for efficiency reasons (see |
---|
3660 | | - * comments on bfq_weights_tree_add()). Then the fact that |
---|
3661 | | - * bfqq is weight-raised is checked explicitly here. More |
---|
3662 | | - * precisely, the compound condition below takes into account |
---|
3663 | | - * also the fact that, even if bfqq is being weight-raised, |
---|
3664 | | - * the scenario is still symmetric if all queues with requests |
---|
3665 | | - * waiting for completion happen to be |
---|
3666 | | - * weight-raised. Actually, we should be even more precise |
---|
3667 | | - * here, and differentiate between interactive weight raising |
---|
3668 | | - * and soft real-time weight raising. |
---|
3669 | | - * |
---|
3670 | | - * As a side note, it is worth considering that the above |
---|
3671 | | - * device-idling countermeasures may however fail in the |
---|
3672 | | - * following unlucky scenario: if idling is (correctly) |
---|
3673 | | - * disabled in a time period during which all symmetry |
---|
3674 | | - * sub-conditions hold, and hence the device is allowed to |
---|
3675 | | - * enqueue many requests, but at some later point in time some |
---|
3676 | | - * sub-condition stops to hold, then it may become impossible |
---|
3677 | | - * to let requests be served in the desired order until all |
---|
3678 | | - * the requests already queued in the device have been served. |
---|
| 4225 | + * Idling is performed only if slice_idle > 0. In addition, we |
---|
| 4226 | + * do not idle if |
---|
| 4227 | + * (a) bfqq is async |
---|
| 4228 | + * (b) bfqq is in the idle io prio class: in this case we do |
---|
| 4229 | + * not idle because we want to minimize the bandwidth that |
---|
| 4230 | + * queues in this class can steal from higher-priority queues
---|
3679 | 4231 | */ |
---|
3680 | | - asymmetric_scenario = (bfqq->wr_coeff > 1 && |
---|
3681 | | - bfqd->wr_busy_queues < bfqd->busy_queues) || |
---|
3682 | | - !bfq_symmetric_scenario(bfqd); |
---|
| 4232 | + if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_sync(bfqq) || |
---|
| 4233 | + bfq_class_idle(bfqq)) |
---|
| 4234 | + return false; |
---|
| 4235 | + |
---|
| 4236 | + idling_boosts_thr_with_no_issue = |
---|
| 4237 | + idling_boosts_thr_without_issues(bfqd, bfqq); |
---|
| 4238 | + |
---|
| 4239 | + idling_needed_for_service_guar = |
---|
| 4240 | + idling_needed_for_service_guarantees(bfqd, bfqq); |
---|
3683 | 4241 | |
---|
3684 | 4242 | /* |
---|
3685 | | - * Finally, there is a case where maximizing throughput is the |
---|
3686 | | - * best choice even if it may cause unfairness toward |
---|
3687 | | - * bfqq. Such a case is when bfqq became active in a burst of |
---|
3688 | | - * queue activations. Queues that became active during a large |
---|
3689 | | - * burst benefit only from throughput, as discussed in the |
---|
3690 | | - * comments on bfq_handle_burst. Thus, if bfqq became active |
---|
3691 | | - * in a burst and not idling the device maximizes throughput, |
---|
3692 | | - * then the device must no be idled, because not idling the |
---|
3693 | | - * device provides bfqq and all other queues in the burst with |
---|
3694 | | - * maximum benefit. Combining this and the above case, we can |
---|
3695 | | - * now establish when idling is actually needed to preserve |
---|
3696 | | - * service guarantees. |
---|
3697 | | - */ |
---|
3698 | | - idling_needed_for_service_guarantees = |
---|
3699 | | - asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq); |
---|
3700 | | - |
---|
3701 | | - /* |
---|
3702 | | - * We have now all the components we need to compute the |
---|
| 4243 | + * We have now the two components we need to compute the |
---|
3703 | 4244 | * return value of the function, which is true only if idling |
---|
3704 | 4245 | * either boosts the throughput (without issues), or is |
---|
3705 | 4246 | * necessary to preserve service guarantees. |
---|
3706 | 4247 | */ |
---|
3707 | | - return idling_boosts_thr_without_issues || |
---|
3708 | | - idling_needed_for_service_guarantees; |
---|
| 4248 | + return idling_boosts_thr_with_no_issue || |
---|
| 4249 | + idling_needed_for_service_guar; |
---|
3709 | 4250 | } |
---|
3710 | 4251 | |
---|
3711 | 4252 | /* |
---|
.. | .. |
---|
3724 | 4265 | return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_better_to_idle(bfqq); |
---|
3725 | 4266 | } |
---|
3726 | 4267 | |
---|
3727 | | -static struct bfq_queue *bfq_choose_bfqq_for_injection(struct bfq_data *bfqd) |
---|
| 4268 | +/* |
---|
| 4269 | + * This function chooses the queue from which to pick the next extra |
---|
| 4270 | + * I/O request to inject, if it finds a compatible queue. See the |
---|
| 4271 | + * comments on bfq_update_inject_limit() for details on the injection |
---|
| 4272 | + * mechanism, and for the definitions of the quantities mentioned |
---|
| 4273 | + * below. |
---|
| 4274 | + */ |
---|
| 4275 | +static struct bfq_queue * |
---|
| 4276 | +bfq_choose_bfqq_for_injection(struct bfq_data *bfqd) |
---|
3728 | 4277 | { |
---|
3729 | | - struct bfq_queue *bfqq; |
---|
| 4278 | + struct bfq_queue *bfqq, *in_serv_bfqq = bfqd->in_service_queue; |
---|
| 4279 | + unsigned int limit = in_serv_bfqq->inject_limit; |
---|
| 4280 | + /* |
---|
| 4281 | + * If |
---|
| 4282 | + * - bfqq is not weight-raised and therefore does not carry |
---|
| 4283 | + * time-critical I/O, |
---|
| 4284 | + * or |
---|
| 4285 | + * - regardless of whether bfqq is weight-raised, bfqq has |
---|
| 4286 | + * however a long think time, during which it can absorb the |
---|
| 4287 | + * effect of an appropriate number of extra I/O requests |
---|
| 4288 | + * from other queues (see bfq_update_inject_limit for |
---|
| 4289 | + * details on the computation of this number); |
---|
| 4290 | + * then injection can be performed without restrictions. |
---|
| 4291 | + */ |
---|
| 4292 | + bool in_serv_always_inject = in_serv_bfqq->wr_coeff == 1 || |
---|
| 4293 | + !bfq_bfqq_has_short_ttime(in_serv_bfqq); |
---|
3730 | 4294 | |
---|
3731 | 4295 | /* |
---|
3732 | | - * A linear search; but, with a high probability, very few |
---|
3733 | | - * steps are needed to find a candidate queue, i.e., a queue |
---|
3734 | | - * with enough budget left for its next request. In fact: |
---|
| 4296 | + * If |
---|
| 4297 | + * - the baseline total service time could not be sampled yet, |
---|
| 4298 | + * so the inject limit happens to be still 0, and |
---|
| 4299 | + * - a lot of time has elapsed since the plugging of I/O |
---|
| 4300 | + * dispatching started, so drive speed is being wasted |
---|
| 4301 | + * significantly; |
---|
| 4302 | + * then temporarily raise inject limit to one request. |
---|
| 4303 | + */ |
---|
| 4304 | + if (limit == 0 && in_serv_bfqq->last_serv_time_ns == 0 && |
---|
| 4305 | + bfq_bfqq_wait_request(in_serv_bfqq) && |
---|
| 4306 | + time_is_before_eq_jiffies(bfqd->last_idling_start_jiffies + |
---|
| 4307 | + bfqd->bfq_slice_idle) |
---|
| 4308 | + ) |
---|
| 4309 | + limit = 1; |
---|
| 4310 | + |
---|
| 4311 | + if (bfqd->rq_in_driver >= limit) |
---|
| 4312 | + return NULL; |
---|
| 4313 | + |
---|
| 4314 | + /* |
---|
| 4315 | + * Linear search of the source queue for injection; but, with |
---|
| 4316 | + * a high probability, very few steps are needed to find a |
---|
| 4317 | + * candidate queue, i.e., a queue with enough budget left for |
---|
| 4318 | + * its next request. In fact: |
---|
3735 | 4319 | * - BFQ dynamically updates the budget of every queue so as |
---|
3736 | 4320 | * to accommodate the expected backlog of the queue; |
---|
3737 | 4321 | * - if a queue gets all its requests dispatched as injected |
---|
3738 | 4322 | * service, then the queue is removed from the active list |
---|
3739 | | - * (and re-added only if it gets new requests, but with |
---|
3740 | | - * enough budget for its new backlog). |
---|
| 4323 | + * (and re-added only if it gets new requests, but then it |
---|
| 4324 | + * is assigned again enough budget for its new backlog). |
---|
3741 | 4325 | */ |
---|
3742 | 4326 | list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) |
---|
3743 | 4327 | if (!RB_EMPTY_ROOT(&bfqq->sort_list) && |
---|
| 4328 | + (in_serv_always_inject || bfqq->wr_coeff > 1) && |
---|
3744 | 4329 | bfq_serv_to_charge(bfqq->next_rq, bfqq) <= |
---|
3745 | | - bfq_bfqq_budget_left(bfqq)) |
---|
3746 | | - return bfqq; |
---|
| 4330 | + bfq_bfqq_budget_left(bfqq)) { |
---|
| 4331 | + /* |
---|
| 4332 | + * Allow for only one large in-flight request |
---|
| 4333 | + * on non-rotational devices, for the |
---|
| 4334 | + * following reason. On non-rotational drives,
---|
| 4335 | + * large requests take much longer than |
---|
| 4336 | + * smaller requests to be served. In addition, |
---|
| 4337 | + * the drive prefers to serve large requests |
---|
| 4338 | + * over small ones, if it can choose. So,
---|
| 4339 | + * having more than one large request queued
---|
| 4340 | + * in the drive may easily make the next
---|
| 4341 | + * request of the in-service queue wait so
---|
| 4342 | + * long as to break bfqq's service guarantees. On
---|
| 4343 | + * the bright side, large requests let the |
---|
| 4344 | + * drive reach a very high throughput, even if |
---|
| 4345 | + * there is only one in-flight large request |
---|
| 4346 | + * at a time. |
---|
| 4347 | + */ |
---|
| 4348 | + if (blk_queue_nonrot(bfqd->queue) && |
---|
| 4349 | + blk_rq_sectors(bfqq->next_rq) >= |
---|
| 4350 | + BFQQ_SECT_THR_NONROT) |
---|
| 4351 | + limit = min_t(unsigned int, 1, limit); |
---|
| 4352 | + else |
---|
| 4353 | + limit = in_serv_bfqq->inject_limit; |
---|
| 4354 | + |
---|
| 4355 | + if (bfqd->rq_in_driver < limit) { |
---|
| 4356 | + bfqd->rqs_injected = true; |
---|
| 4357 | + return bfqq; |
---|
| 4358 | + } |
---|
| 4359 | + } |
---|
3747 | 4360 | |
---|
3748 | 4361 | return NULL; |
---|
3749 | 4362 | } |
---|
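The comments above repeatedly refer to bfq_update_inject_limit() and to the baseline total service time kept in last_serv_time_ns, neither of which is part of this hunk. The sketch below only illustrates the kind of feedback rule those comments describe; the 10% tolerance and the grow/shrink steps are assumptions, not the actual update function:

	static void example_update_inject_limit(struct bfq_queue *bfqq,
						u64 tot_time_ns)
	{
		if (bfqq->last_serv_time_ns == 0) {
			/* first sample with no injection: use as baseline */
			bfqq->last_serv_time_ns = tot_time_ns;
		} else if (tot_time_ns * 10 < bfqq->last_serv_time_ns * 11) {
			/* within ~10% of the baseline: allow more injection */
			bfqq->inject_limit++;
		} else if (bfqq->inject_limit > 0) {
			/* service times degraded: back off */
			bfqq->inject_limit--;
		}
	}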
.. | .. |
---|
3830 | 4443 | * for a new request, or has requests waiting for a completion and |
---|
3831 | 4444 | * may idle after their completion, then keep it anyway. |
---|
3832 | 4445 | * |
---|
3833 | | - * Yet, to boost throughput, inject service from other queues if |
---|
3834 | | - * possible. |
---|
| 4446 | + * Yet, inject service from other queues if it boosts |
---|
| 4447 | + * throughput and is possible. |
---|
3835 | 4448 | */ |
---|
3836 | 4449 | if (bfq_bfqq_wait_request(bfqq) || |
---|
3837 | 4450 | (bfqq->dispatched != 0 && bfq_better_to_idle(bfqq))) { |
---|
3838 | | - if (bfq_bfqq_injectable(bfqq) && |
---|
3839 | | - bfqq->injected_service * bfqq->inject_coeff < |
---|
3840 | | - bfqq->entity.service * 10) |
---|
| 4451 | + struct bfq_queue *async_bfqq = |
---|
| 4452 | + bfqq->bic && bfqq->bic->bfqq[0] && |
---|
| 4453 | + bfq_bfqq_busy(bfqq->bic->bfqq[0]) && |
---|
| 4454 | + bfqq->bic->bfqq[0]->next_rq ? |
---|
| 4455 | + bfqq->bic->bfqq[0] : NULL; |
---|
| 4456 | + |
---|
| 4457 | + /* |
---|
| 4458 | + * The next three mutually-exclusive ifs decide |
---|
| 4459 | + * whether to try injection, and choose the queue to |
---|
| 4460 | + * pick an I/O request from. |
---|
| 4461 | + * |
---|
| 4462 | + * The first if checks whether the process associated |
---|
| 4463 | + * with bfqq has also async I/O pending. If so, it |
---|
| 4464 | + * injects such I/O unconditionally. Injecting async |
---|
| 4465 | + * I/O from the same process can cause no harm to the |
---|
| 4466 | + * process. On the contrary, it can only increase |
---|
| 4467 | + * bandwidth and reduce latency for the process. |
---|
| 4468 | + * |
---|
| 4469 | + * The second if checks whether there happens to be a |
---|
| 4470 | + * non-empty waker queue for bfqq, i.e., a queue whose |
---|
| 4471 | + * I/O needs to be completed for bfqq to receive new |
---|
| 4472 | + * I/O. This happens, e.g., if bfqq is associated with |
---|
| 4473 | + * a process that does some sync. A sync generates |
---|
| 4474 | + * extra blocking I/O, which must be completed before |
---|
| 4475 | + * the process associated with bfqq can go on with its |
---|
| 4476 | + * I/O. If the I/O of the waker queue is not served, |
---|
| 4477 | + * then bfqq remains empty, and no I/O is dispatched, |
---|
| 4478 | + * until the idle timeout fires for bfqq. This is |
---|
| 4479 | + * likely to result in lower bandwidth and higher |
---|
| 4480 | + * latencies for bfqq, and in a severe loss of total |
---|
| 4481 | + * throughput. The best action to take is therefore to |
---|
| 4482 | + * serve the waker queue as soon as possible. So do it |
---|
| 4483 | + * (without relying on the third alternative below for |
---|
| 4484 | + * eventually serving waker_bfqq's I/O; see the last |
---|
| 4485 | + * paragraph for further details). This systematic |
---|
| 4486 | + * injection of I/O from the waker queue does not |
---|
| 4487 | + * cause any delay to bfqq's I/O. On the contrary, |
---|
| 4488 | + * bfqq's next I/O is brought forward dramatically,
---|
| 4489 | + * for it is not blocked for milliseconds. |
---|
| 4490 | + * |
---|
| 4491 | + * The third if checks whether bfqq is a queue for |
---|
| 4492 | + * which it is better to avoid injection. It is so if |
---|
| 4493 | + * bfqq delivers more throughput when served without |
---|
| 4494 | + * any further I/O from other queues in the middle, or |
---|
| 4495 | + * if the service times of bfqq's I/O requests both |
---|
| 4496 | + * count more than overall throughput, and may be |
---|
| 4497 | + * easily increased by injection (this happens if bfqq |
---|
| 4498 | + * has a short think time). If none of these |
---|
| 4499 | + * conditions holds, then a candidate queue for |
---|
| 4500 | + * injection is looked for through |
---|
| 4501 | + * bfq_choose_bfqq_for_injection(). Note that the |
---|
| 4502 | + * latter may return NULL (for example if the inject |
---|
| 4503 | + * limit for bfqq is currently 0). |
---|
| 4504 | + * |
---|
| 4505 | + * NOTE: motivation for the second alternative |
---|
| 4506 | + * |
---|
| 4507 | + * Thanks to the way the inject limit is updated in |
---|
| 4508 | + * bfq_update_has_short_ttime(), it is rather likely |
---|
| 4509 | + * that, if I/O is being plugged for bfqq and the |
---|
| 4510 | + * waker queue has pending I/O requests that are |
---|
| 4511 | + * blocking bfqq's I/O, then the third alternative |
---|
| 4512 | + * above lets the waker queue get served before the |
---|
| 4513 | + * I/O-plugging timeout fires. So one may deem the |
---|
| 4514 | + * second alternative superfluous. It is not, because |
---|
| 4515 | + * the third alternative may be way less effective in |
---|
| 4516 | + * case of a synchronization. For two main |
---|
| 4517 | + * reasons. First, throughput may be low because the |
---|
| 4518 | + * inject limit may be too low to guarantee the same |
---|
| 4519 | + * amount of injected I/O, from the waker queue or |
---|
| 4520 | + * other queues, that the second alternative |
---|
| 4521 | + * guarantees (the second alternative unconditionally |
---|
| 4522 | + * injects a pending I/O request of the waker queue |
---|
| 4523 | + * for each bfq_dispatch_request()). Second, with the |
---|
| 4524 | + * third alternative, the duration of the plugging, |
---|
| 4525 | + * i.e., the time before bfqq finally receives new I/O, |
---|
| 4526 | + * may not be minimized, because the waker queue may |
---|
| 4527 | + * happen to be served only after other queues. |
---|
| 4528 | + */ |
---|
| 4529 | + if (async_bfqq && |
---|
| 4530 | + icq_to_bic(async_bfqq->next_rq->elv.icq) == bfqq->bic && |
---|
| 4531 | + bfq_serv_to_charge(async_bfqq->next_rq, async_bfqq) <= |
---|
| 4532 | + bfq_bfqq_budget_left(async_bfqq)) |
---|
| 4533 | + bfqq = bfqq->bic->bfqq[0]; |
---|
| 4534 | + else if (bfq_bfqq_has_waker(bfqq) && |
---|
| 4535 | + bfq_bfqq_busy(bfqq->waker_bfqq) && |
---|
| 4536 | + bfqq->waker_bfqq->next_rq && |
---|
| 4537 | + bfq_serv_to_charge(bfqq->waker_bfqq->next_rq, |
---|
| 4538 | + bfqq->waker_bfqq) <= |
---|
| 4539 | + bfq_bfqq_budget_left(bfqq->waker_bfqq) |
---|
| 4540 | + ) |
---|
| 4541 | + bfqq = bfqq->waker_bfqq; |
---|
| 4542 | + else if (!idling_boosts_thr_without_issues(bfqd, bfqq) && |
---|
| 4543 | + (bfqq->wr_coeff == 1 || bfqd->wr_busy_queues > 1 || |
---|
| 4544 | + !bfq_bfqq_has_short_ttime(bfqq))) |
---|
3841 | 4545 | bfqq = bfq_choose_bfqq_for_injection(bfqd); |
---|
3842 | 4546 | else |
---|
3843 | 4547 | bfqq = NULL; |
---|
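
As an aid to reading the three mutually-exclusive ifs above, the fragment below is a stripped-down sketch of the same priority order (same-process async I/O first, then the waker queue, then a generic candidate from a chooser). The struct and helper names are simplified stand-ins for illustration only, not kernel symbols, and the budget test is reduced to a single comparison.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Simplified stand-in for a bfq_queue (illustration only). */
struct q {
	const char *name;
	bool busy;
	bool has_next_rq;
	int budget_left;
	int next_rq_charge;
};

/* Can the candidate's next request be charged to its remaining budget? */
static bool fits_budget(const struct q *c)
{
	return c && c->busy && c->has_next_rq &&
	       c->next_rq_charge <= c->budget_left;
}

/*
 * Same priority order as above: async I/O of the in-service process is
 * injected unconditionally if it fits, the waker queue comes second, and
 * only then is a generic candidate considered (which may be unavailable).
 */
static const struct q *pick_injection(const struct q *same_process_async,
				      const struct q *waker,
				      const struct q *generic)
{
	if (fits_budget(same_process_async))
		return same_process_async;
	if (fits_budget(waker))
		return waker;
	return generic;			/* may be NULL */
}

int main(void)
{
	struct q async = { "async", true, true, 100, 40 };
	struct q waker = { "waker", true, true, 100, 10 };
	const struct q *pick = pick_injection(&async, &waker, NULL);

	printf("picked: %s\n", pick ? pick->name : "none");	/* "async" */
	return 0;
}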
.. | .. |
---|
3929 | 4633 | |
---|
3930 | 4634 | bfq_bfqq_served(bfqq, service_to_charge); |
---|
3931 | 4635 | |
---|
| 4636 | + if (bfqq == bfqd->in_service_queue && bfqd->wait_dispatch) { |
---|
| 4637 | + bfqd->wait_dispatch = false; |
---|
| 4638 | + bfqd->waited_rq = rq; |
---|
| 4639 | + } |
---|
| 4640 | + |
---|
3932 | 4641 | bfq_dispatch_remove(bfqd->queue, rq); |
---|
3933 | 4642 | |
---|
3934 | | - if (bfqq != bfqd->in_service_queue) { |
---|
3935 | | - if (likely(bfqd->in_service_queue)) |
---|
3936 | | - bfqd->in_service_queue->injected_service += |
---|
3937 | | - bfq_serv_to_charge(rq, bfqq); |
---|
3938 | | - |
---|
| 4643 | + if (bfqq != bfqd->in_service_queue) |
---|
3939 | 4644 | goto return_rq; |
---|
3940 | | - } |
---|
3941 | 4645 | |
---|
3942 | 4646 | /* |
---|
3943 | 4647 | * If weight raising has to terminate for bfqq, then next |
---|
.. | .. |
---|
3957 | 4661 | * belongs to CLASS_IDLE and other queues are waiting for |
---|
3958 | 4662 | * service. |
---|
3959 | 4663 | */ |
---|
3960 | | - if (!(bfqd->busy_queues > 1 && bfq_class_idle(bfqq))) |
---|
| 4664 | + if (!(bfq_tot_busy_queues(bfqd) > 1 && bfq_class_idle(bfqq))) |
---|
3961 | 4665 | goto return_rq; |
---|
3962 | 4666 | |
---|
3963 | 4667 | bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED); |
---|
.. | .. |
---|
3975 | 4679 | * most a call to dispatch for nothing |
---|
3976 | 4680 | */ |
---|
3977 | 4681 | return !list_empty_careful(&bfqd->dispatch) || |
---|
3978 | | - bfqd->busy_queues > 0; |
---|
| 4682 | + bfq_tot_busy_queues(bfqd) > 0; |
---|
3979 | 4683 | } |
---|
3980 | 4684 | |
---|
3981 | 4685 | static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) |
---|
.. | .. |
---|
4029 | 4733 | goto start_rq; |
---|
4030 | 4734 | } |
---|
4031 | 4735 | |
---|
4032 | | - bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); |
---|
| 4736 | + bfq_log(bfqd, "dispatch requests: %d busy queues", |
---|
| 4737 | + bfq_tot_busy_queues(bfqd)); |
---|
4033 | 4738 | |
---|
4034 | | - if (bfqd->busy_queues == 0) |
---|
| 4739 | + if (bfq_tot_busy_queues(bfqd) == 0) |
---|
4035 | 4740 | goto exit; |
---|
4036 | 4741 | |
---|
4037 | 4742 | /* |
---|
.. | .. |
---|
4043 | 4748 | * some unlucky request wait for as long as the device |
---|
4044 | 4749 | * wishes. |
---|
4045 | 4750 | * |
---|
4046 | | - * Of course, serving one request at at time may cause loss of |
---|
| 4751 | + * Of course, serving one request at a time may cause loss of |
---|
4047 | 4752 | * throughput. |
---|
4048 | 4753 | */ |
---|
4049 | 4754 | if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0) |
---|
.. | .. |
---|
4065 | 4770 | return rq; |
---|
4066 | 4771 | } |
---|
4067 | 4772 | |
---|
4068 | | -#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) |
---|
| 4773 | +#ifdef CONFIG_BFQ_CGROUP_DEBUG |
---|
4069 | 4774 | static void bfq_update_dispatch_stats(struct request_queue *q, |
---|
4070 | 4775 | struct request *rq, |
---|
4071 | 4776 | struct bfq_queue *in_serv_queue, |
---|
.. | .. |
---|
4089 | 4794 | * In addition, the following queue lock guarantees that |
---|
4090 | 4795 | * bfqq_group(bfqq) exists as well. |
---|
4091 | 4796 | */ |
---|
4092 | | - spin_lock_irq(q->queue_lock); |
---|
| 4797 | + spin_lock_irq(&q->queue_lock); |
---|
4093 | 4798 | if (idle_timer_disabled) |
---|
4094 | 4799 | /* |
---|
4095 | 4800 | * Since the idle timer has been disabled, |
---|
.. | .. |
---|
4108 | 4813 | bfqg_stats_set_start_empty_time(bfqg); |
---|
4109 | 4814 | bfqg_stats_update_io_remove(bfqg, rq->cmd_flags); |
---|
4110 | 4815 | } |
---|
4111 | | - spin_unlock_irq(q->queue_lock); |
---|
| 4816 | + spin_unlock_irq(&q->queue_lock); |
---|
4112 | 4817 | } |
---|
4113 | 4818 | #else |
---|
4114 | 4819 | static inline void bfq_update_dispatch_stats(struct request_queue *q, |
---|
4115 | 4820 | struct request *rq, |
---|
4116 | 4821 | struct bfq_queue *in_serv_queue, |
---|
4117 | 4822 | bool idle_timer_disabled) {} |
---|
4118 | | -#endif |
---|
| 4823 | +#endif /* CONFIG_BFQ_CGROUP_DEBUG */ |
---|
4119 | 4824 | |
---|
4120 | 4825 | static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) |
---|
4121 | 4826 | { |
---|
4122 | 4827 | struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; |
---|
4123 | 4828 | struct request *rq; |
---|
4124 | 4829 | struct bfq_queue *in_serv_queue; |
---|
4125 | | - bool waiting_rq, idle_timer_disabled; |
---|
| 4830 | + bool waiting_rq, idle_timer_disabled = false; |
---|
4126 | 4831 | |
---|
4127 | 4832 | spin_lock_irq(&bfqd->lock); |
---|
4128 | 4833 | |
---|
.. | .. |
---|
4130 | 4835 | waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue); |
---|
4131 | 4836 | |
---|
4132 | 4837 | rq = __bfq_dispatch_request(hctx); |
---|
4133 | | - |
---|
4134 | | - idle_timer_disabled = |
---|
4135 | | - waiting_rq && !bfq_bfqq_wait_request(in_serv_queue); |
---|
| 4838 | + if (in_serv_queue == bfqd->in_service_queue) { |
---|
| 4839 | + idle_timer_disabled = |
---|
| 4840 | + waiting_rq && !bfq_bfqq_wait_request(in_serv_queue); |
---|
| 4841 | + } |
---|
4136 | 4842 | |
---|
4137 | 4843 | spin_unlock_irq(&bfqd->lock); |
---|
4138 | | - |
---|
4139 | | - bfq_update_dispatch_stats(hctx->queue, rq, in_serv_queue, |
---|
4140 | | - idle_timer_disabled); |
---|
| 4844 | + bfq_update_dispatch_stats(hctx->queue, rq, |
---|
| 4845 | + idle_timer_disabled ? in_serv_queue : NULL, |
---|
| 4846 | + idle_timer_disabled); |
---|
4141 | 4847 | |
---|
4142 | 4848 | return rq; |
---|
4143 | 4849 | } |
---|
.. | .. |
---|
4151 | 4857 | */ |
---|
4152 | 4858 | void bfq_put_queue(struct bfq_queue *bfqq) |
---|
4153 | 4859 | { |
---|
4154 | | -#ifdef CONFIG_BFQ_GROUP_IOSCHED |
---|
| 4860 | + struct bfq_queue *item; |
---|
| 4861 | + struct hlist_node *n; |
---|
4155 | 4862 | struct bfq_group *bfqg = bfqq_group(bfqq); |
---|
4156 | | -#endif |
---|
4157 | 4863 | |
---|
4158 | 4864 | if (bfqq->bfqd) |
---|
4159 | 4865 | bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", |
---|
.. | .. |
---|
4195 | 4901 | bfqq->bfqd->burst_size--; |
---|
4196 | 4902 | } |
---|
4197 | 4903 | |
---|
| 4904 | + /* |
---|
| 4905 | + * bfqq does not exist any longer, so it cannot be woken by |
---|
| 4906 | + * any other queue, and cannot wake any other queue. Then bfqq |
---|
| 4907 | + * must be removed from the woken list of its possible waker |
---|
| 4908 | + * queue, and all queues in the woken list of bfqq must stop |
---|
| 4909 | + * having a waker queue. Strictly speaking, these updates |
---|
| 4910 | + * should be performed when bfqq remains with no I/O source |
---|
| 4911 | + * attached to it, which happens before bfqq gets freed. In |
---|
| 4912 | + * particular, this happens when the last process associated |
---|
| 4913 | + * with bfqq exits or gets associated with a different |
---|
| 4914 | + * queue. However, both events lead to bfqq being freed soon, |
---|
| 4915 | + * and dangling references would come out only after bfqq gets |
---|
| 4916 | + * freed. So these updates are done here, as a simple and safe |
---|
| 4917 | + * way to handle all cases. |
---|
| 4918 | + */ |
---|
| 4919 | + /* remove bfqq from woken list */ |
---|
| 4920 | + if (!hlist_unhashed(&bfqq->woken_list_node)) |
---|
| 4921 | + hlist_del_init(&bfqq->woken_list_node); |
---|
| 4922 | + |
---|
| 4923 | + /* reset waker for all queues in woken list */ |
---|
| 4924 | + hlist_for_each_entry_safe(item, n, &bfqq->woken_list, |
---|
| 4925 | + woken_list_node) { |
---|
| 4926 | + item->waker_bfqq = NULL; |
---|
| 4927 | + bfq_clear_bfqq_has_waker(item); |
---|
| 4928 | + hlist_del_init(&item->woken_list_node); |
---|
| 4929 | + } |
---|
| 4930 | + |
---|
| 4931 | + if (bfqq->bfqd && bfqq->bfqd->last_completed_rq_bfqq == bfqq) |
---|
| 4932 | + bfqq->bfqd->last_completed_rq_bfqq = NULL; |
---|
| 4933 | + |
---|
4198 | 4934 | kmem_cache_free(bfq_pool, bfqq); |
---|
4199 | | -#ifdef CONFIG_BFQ_GROUP_IOSCHED |
---|
4200 | 4935 | bfqg_and_blkg_put(bfqg); |
---|
4201 | | -#endif |
---|
4202 | 4936 | } |
---|
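
The loop above that clears each woken queue's waker pointer uses the _safe variant of the hlist iterator because it unlinks the node it is currently visiting. The plain-C analogue below shows why a second cursor is needed when tearing down list entries during the walk; the list type and names are illustrative, not kernel code.

#include <stdio.h>
#include <stdlib.h>

struct node {
	int id;
	struct node *next;
};

/*
 * Unlink and free every node. The 'next' pointer must be saved *before*
 * the current node is torn down, exactly like hlist_for_each_entry_safe()
 * keeps a spare cursor: after free(cur), cur->next is no longer valid.
 */
static void clear_all(struct node **head)
{
	struct node *cur = *head, *next;

	while (cur) {
		next = cur->next;	/* save the cursor first */
		printf("detaching node %d\n", cur->id);
		free(cur);		/* cur is now dangling */
		cur = next;
	}
	*head = NULL;
}

int main(void)
{
	struct node *head = NULL;

	for (int i = 3; i > 0; i--) {
		struct node *n = malloc(sizeof(*n));
		n->id = i;
		n->next = head;
		head = n;
	}
	clear_all(&head);
	return 0;
}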
4203 | 4937 | |
---|
4204 | | -static void bfq_put_cooperator(struct bfq_queue *bfqq) |
---|
| 4938 | +void bfq_put_cooperator(struct bfq_queue *bfqq) |
---|
4205 | 4939 | { |
---|
4206 | 4940 | struct bfq_queue *__bfqq, *next; |
---|
4207 | 4941 | |
---|
.. | .. |
---|
4223 | 4957 | static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
---|
4224 | 4958 | { |
---|
4225 | 4959 | if (bfqq == bfqd->in_service_queue) { |
---|
4226 | | - __bfq_bfqq_expire(bfqd, bfqq); |
---|
| 4960 | + __bfq_bfqq_expire(bfqd, bfqq, BFQQE_BUDGET_TIMEOUT); |
---|
4227 | 4961 | bfq_schedule_dispatch(bfqd); |
---|
4228 | 4962 | } |
---|
4229 | 4963 | |
---|
.. | .. |
---|
4231 | 4965 | |
---|
4232 | 4966 | bfq_put_cooperator(bfqq); |
---|
4233 | 4967 | |
---|
4234 | | - bfq_put_queue(bfqq); /* release process reference */ |
---|
| 4968 | + bfq_release_process_ref(bfqd, bfqq); |
---|
4235 | 4969 | } |
---|
4236 | 4970 | |
---|
4237 | 4971 | static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) |
---|
.. | .. |
---|
4246 | 4980 | unsigned long flags; |
---|
4247 | 4981 | |
---|
4248 | 4982 | spin_lock_irqsave(&bfqd->lock, flags); |
---|
4249 | | - bfqq->bic = NULL; |
---|
4250 | | - bfq_exit_bfqq(bfqd, bfqq); |
---|
4251 | 4983 | bic_set_bfqq(bic, NULL, is_sync); |
---|
| 4984 | + bfq_exit_bfqq(bfqd, bfqq); |
---|
4252 | 4985 | spin_unlock_irqrestore(&bfqd->lock, flags); |
---|
4253 | 4986 | } |
---|
4254 | 4987 | } |
---|
.. | .. |
---|
4281 | 5014 | pr_err("bdi %s: bfq: bad prio class %d\n", |
---|
4282 | 5015 | bdi_dev_name(bfqq->bfqd->queue->backing_dev_info), |
---|
4283 | 5016 | ioprio_class); |
---|
4284 | | - /* fall through */ |
---|
| 5017 | + fallthrough; |
---|
4285 | 5018 | case IOPRIO_CLASS_NONE: |
---|
4286 | 5019 | /* |
---|
4287 | 5020 | * No prio set, inherit CPU scheduling settings. |
---|
.. | .. |
---|
4334 | 5067 | |
---|
4335 | 5068 | bfqq = bic_to_bfqq(bic, false); |
---|
4336 | 5069 | if (bfqq) { |
---|
4337 | | - /* release process reference on this queue */ |
---|
4338 | | - bfq_put_queue(bfqq); |
---|
4339 | | - bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic); |
---|
| 5070 | + struct bfq_queue *old_bfqq = bfqq; |
---|
| 5071 | + |
---|
| 5072 | + bfqq = bfq_get_queue(bfqd, bio, false, bic); |
---|
4340 | 5073 | bic_set_bfqq(bic, bfqq, false); |
---|
| 5074 | + bfq_release_process_ref(bfqd, old_bfqq); |
---|
4341 | 5075 | } |
---|
4342 | 5076 | |
---|
4343 | 5077 | bfqq = bic_to_bfqq(bic, true); |
---|
.. | .. |
---|
4351 | 5085 | RB_CLEAR_NODE(&bfqq->entity.rb_node); |
---|
4352 | 5086 | INIT_LIST_HEAD(&bfqq->fifo); |
---|
4353 | 5087 | INIT_HLIST_NODE(&bfqq->burst_list_node); |
---|
| 5088 | + INIT_HLIST_NODE(&bfqq->woken_list_node); |
---|
| 5089 | + INIT_HLIST_HEAD(&bfqq->woken_list); |
---|
4354 | 5090 | |
---|
4355 | 5091 | bfqq->ref = 0; |
---|
4356 | 5092 | bfqq->bfqd = bfqd; |
---|
.. | .. |
---|
4369 | 5105 | bfq_mark_bfqq_has_short_ttime(bfqq); |
---|
4370 | 5106 | bfq_mark_bfqq_sync(bfqq); |
---|
4371 | 5107 | bfq_mark_bfqq_just_created(bfqq); |
---|
4372 | | - /* |
---|
4373 | | - * Aggressively inject a lot of service: up to 90%. |
---|
4374 | | - * This coefficient remains constant during bfqq life, |
---|
4375 | | - * but this behavior might be changed, after enough |
---|
4376 | | - * testing and tuning. |
---|
4377 | | - */ |
---|
4378 | | - bfqq->inject_coeff = 1; |
---|
4379 | 5108 | } else |
---|
4380 | 5109 | bfq_clear_bfqq_sync(bfqq); |
---|
4381 | 5110 | |
---|
.. | .. |
---|
4419 | 5148 | return &bfqg->async_bfqq[0][ioprio]; |
---|
4420 | 5149 | case IOPRIO_CLASS_NONE: |
---|
4421 | 5150 | ioprio = IOPRIO_NORM; |
---|
4422 | | - /* fall through */ |
---|
| 5151 | + fallthrough; |
---|
4423 | 5152 | case IOPRIO_CLASS_BE: |
---|
4424 | 5153 | return &bfqg->async_bfqq[1][ioprio]; |
---|
4425 | 5154 | case IOPRIO_CLASS_IDLE: |
---|
.. | .. |
---|
4439 | 5168 | struct bfq_queue *bfqq; |
---|
4440 | 5169 | struct bfq_group *bfqg; |
---|
4441 | 5170 | |
---|
4442 | | - rcu_read_lock(); |
---|
4443 | | - |
---|
4444 | | - bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio)); |
---|
4445 | | - if (!bfqg) { |
---|
4446 | | - bfqq = &bfqd->oom_bfqq; |
---|
4447 | | - goto out; |
---|
4448 | | - } |
---|
4449 | | - |
---|
| 5171 | + bfqg = bfq_bio_bfqg(bfqd, bio); |
---|
4450 | 5172 | if (!is_sync) { |
---|
4451 | 5173 | async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, |
---|
4452 | 5174 | ioprio); |
---|
.. | .. |
---|
4490 | 5212 | out: |
---|
4491 | 5213 | bfqq->ref++; /* get a process reference to this queue */ |
---|
4492 | 5214 | bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref); |
---|
4493 | | - rcu_read_unlock(); |
---|
4494 | 5215 | return bfqq; |
---|
4495 | 5216 | } |
---|
4496 | 5217 | |
---|
.. | .. |
---|
4513 | 5234 | struct request *rq) |
---|
4514 | 5235 | { |
---|
4515 | 5236 | bfqq->seek_history <<= 1; |
---|
4516 | | - bfqq->seek_history |= |
---|
4517 | | - get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR && |
---|
4518 | | - (!blk_queue_nonrot(bfqd->queue) || |
---|
4519 | | - blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT); |
---|
| 5237 | + bfqq->seek_history |= BFQ_RQ_SEEKY(bfqd, bfqq->last_request_pos, rq); |
---|
| 5238 | + |
---|
| 5239 | + if (bfqq->wr_coeff > 1 && |
---|
| 5240 | + bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && |
---|
| 5241 | + BFQQ_TOTALLY_SEEKY(bfqq)) |
---|
| 5242 | + bfq_bfqq_end_wr(bfqq); |
---|
4520 | 5243 | } |
---|
4521 | 5244 | |
---|
4522 | 5245 | static void bfq_update_has_short_ttime(struct bfq_data *bfqd, |
---|
4523 | 5246 | struct bfq_queue *bfqq, |
---|
4524 | 5247 | struct bfq_io_cq *bic) |
---|
4525 | 5248 | { |
---|
4526 | | - bool has_short_ttime = true; |
---|
| 5249 | + bool has_short_ttime = true, state_changed; |
---|
4527 | 5250 | |
---|
4528 | 5251 | /* |
---|
4529 | 5252 | * No need to update has_short_ttime if bfqq is async or in |
---|
.. | .. |
---|
4548 | 5271 | bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle)) |
---|
4549 | 5272 | has_short_ttime = false; |
---|
4550 | 5273 | |
---|
4551 | | - bfq_log_bfqq(bfqd, bfqq, "update_has_short_ttime: has_short_ttime %d", |
---|
4552 | | - has_short_ttime); |
---|
| 5274 | + state_changed = has_short_ttime != bfq_bfqq_has_short_ttime(bfqq); |
---|
4553 | 5275 | |
---|
4554 | 5276 | if (has_short_ttime) |
---|
4555 | 5277 | bfq_mark_bfqq_has_short_ttime(bfqq); |
---|
4556 | 5278 | else |
---|
4557 | 5279 | bfq_clear_bfqq_has_short_ttime(bfqq); |
---|
| 5280 | + |
---|
| 5281 | + /* |
---|
| 5282 | + * Until the base value for the total service time gets |
---|
| 5283 | + * finally computed for bfqq, the inject limit does depend on |
---|
| 5284 | + * the think-time state (short|long). In particular, the limit |
---|
| 5285 | + * is 0 or 1 if the think time is deemed, respectively, as |
---|
| 5286 | + * short or long (details in the comments in |
---|
| 5287 | + * bfq_update_inject_limit()). Accordingly, the next |
---|
| 5288 | + * instructions reset the inject limit if the think-time state |
---|
| 5289 | + * has changed and the above base value is still to be |
---|
| 5290 | + * computed. |
---|
| 5291 | + * |
---|
| 5292 | + * However, the reset is performed only if more than 100 ms |
---|
| 5293 | + * have elapsed since the last update of the inject limit, or |
---|
| 5294 | + * (inclusive) if the change is from short to long think |
---|
| 5295 | + * time. The reason for this waiting is as follows. |
---|
| 5296 | + * |
---|
| 5297 | + * bfqq may have a long think time because of a |
---|
| 5298 | + * synchronization with some other queue, i.e., because the |
---|
| 5299 | + * I/O of some other queue may need to be completed for bfqq |
---|
| 5300 | + * to receive new I/O. Details in the comments on the choice |
---|
| 5301 | + * of the queue for injection in bfq_select_queue(). |
---|
| 5302 | + * |
---|
| 5303 | + * As stressed in those comments, if such a synchronization is |
---|
| 5304 | + * actually in place, then, without injection on bfqq, the |
---|
| 5305 | + * blocking I/O cannot happen to be served while bfqq is in |
---|
| 5306 | + * service. As a consequence, if bfqq is granted |
---|
| 5307 | + * I/O-dispatch-plugging, then bfqq remains empty, and no I/O |
---|
| 5308 | + * is dispatched, until the idle timeout fires. This is likely |
---|
| 5309 | + * to result in lower bandwidth and higher latencies for bfqq, |
---|
| 5310 | + * and in a severe loss of total throughput. |
---|
| 5311 | + * |
---|
| 5312 | + * On the opposite end, a non-zero inject limit may allow the |
---|
| 5313 | + * I/O that blocks bfqq to be executed soon, and therefore |
---|
| 5314 | + * bfqq to receive new I/O soon. |
---|
| 5315 | + * |
---|
| 5316 | + * But, if the blocking gets actually eliminated, then the |
---|
| 5317 | + * next think-time sample for bfqq may be very low. This in |
---|
| 5318 | + * turn may cause bfqq's think time to be deemed |
---|
| 5319 | + * short. Without the 100 ms barrier, this new state change |
---|
| 5320 | + * would cause the body of the next if to be executed |
---|
| 5321 | + * immediately. But this would set to 0 the inject |
---|
| 5322 | + * limit. Without injection, the blocking I/O would cause the |
---|
| 5323 | + * think time of bfqq to become long again, and therefore the |
---|
| 5324 | + * inject limit to be raised again, and so on. The only effect |
---|
| 5325 | + * of such a steady oscillation between the two think-time |
---|
| 5326 | + * states would be to prevent effective injection on bfqq. |
---|
| 5327 | + * |
---|
| 5328 | + * In contrast, if the inject limit is not reset during such a |
---|
| 5329 | + * long time interval as 100 ms, then the number of short |
---|
| 5330 | + * think time samples can grow significantly before the reset |
---|
| 5331 | + * is performed. As a consequence, the think time state can |
---|
| 5332 | + * become stable before the reset. Therefore there will be no |
---|
| 5333 | + * state change when the 100 ms elapse, and no reset of the |
---|
| 5334 | + * inject limit. The inject limit remains steadily equal to 1 |
---|
| 5335 | + * both during and after the 100 ms. So injection can be |
---|
| 5336 | + * performed at all times, and throughput gets boosted. |
---|
| 5337 | + * |
---|
| 5338 | + * An inject limit equal to 1 is however in conflict, in |
---|
| 5339 | + * general, with the fact that the think time of bfqq is |
---|
| 5340 | + * short, because injection may be likely to delay bfqq's I/O |
---|
| 5341 | + * (as explained in the comments in |
---|
| 5342 | + * bfq_update_inject_limit()). But this does not happen in |
---|
| 5343 | + * this special case, because bfqq's low think time is due to |
---|
| 5344 | + * an effective handling of a synchronization, through |
---|
| 5345 | + * injection. In this special case, bfqq's I/O does not get |
---|
| 5346 | + * delayed by injection; on the contrary, bfqq's I/O is |
---|
| 5347 | + * brought forward, because it is not blocked for |
---|
| 5348 | + * milliseconds. |
---|
| 5349 | + * |
---|
| 5350 | + * In addition, serving the blocking I/O much sooner, and much |
---|
| 5351 | + * more frequently than once per I/O-plugging timeout, makes |
---|
| 5352 | + * it much quicker to detect a waker queue (the concept of |
---|
| 5353 | + * waker queue is defined in the comments in |
---|
| 5354 | + * bfq_add_request()). This makes it possible to start sooner |
---|
| 5355 | + * to boost throughput more effectively, by injecting the I/O |
---|
| 5356 | + * of the waker queue unconditionally on every |
---|
| 5357 | + * bfq_dispatch_request(). |
---|
| 5358 | + * |
---|
| 5359 | + * One last, important benefit of not resetting the inject |
---|
| 5360 | + * limit before 100 ms is that, during this time interval, the |
---|
| 5361 | + * base value for the total service time is likely to get |
---|
| 5362 | + * finally computed for bfqq, freeing the inject limit from |
---|
| 5363 | + * its relation with the think time. |
---|
| 5364 | + */ |
---|
| 5365 | + if (state_changed && bfqq->last_serv_time_ns == 0 && |
---|
| 5366 | + (time_is_before_eq_jiffies(bfqq->decrease_time_jif + |
---|
| 5367 | + msecs_to_jiffies(100)) || |
---|
| 5368 | + !has_short_ttime)) |
---|
| 5369 | + bfq_reset_inject_limit(bfqd, bfqq); |
---|
4558 | 5370 | } |
---|
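
The guard closing the function above resets the inject limit only when the think-time state actually flipped, no base service time is known yet, and either at least 100 ms have passed since the last limit update or the flip is toward a long think time. A userspace sketch of the same predicate, with millisecond timestamps standing in for jiffies (and ignoring the wraparound that the jiffies macros handle), is shown below; the names are illustrative.

#include <stdbool.h>

/*
 * 'now_ms' plays the role of jiffies, 'last_decrease_ms' the role of
 * bfqq->decrease_time_jif, 'base_time_known' the role of
 * last_serv_time_ns != 0.
 */
static bool should_reset_inject_limit(bool state_changed,
				      bool base_time_known,
				      bool has_short_ttime,
				      unsigned long now_ms,
				      unsigned long last_decrease_ms)
{
	if (!state_changed || base_time_known)
		return false;
	/* Reset right away on a short->long flip, otherwise wait out 100 ms. */
	return !has_short_ttime || (now_ms - last_decrease_ms >= 100);
}

The asymmetry matches the rationale in the comment: a switch to a long think time may signal a synchronization, so injection is re-enabled promptly, while switches back to a short think time must wait out the 100 ms barrier to avoid oscillation.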
4559 | 5371 | |
---|
4560 | 5372 | /* |
---|
.. | .. |
---|
4564 | 5376 | static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
---|
4565 | 5377 | struct request *rq) |
---|
4566 | 5378 | { |
---|
4567 | | - struct bfq_io_cq *bic = RQ_BIC(rq); |
---|
4568 | | - |
---|
4569 | 5379 | if (rq->cmd_flags & REQ_META) |
---|
4570 | 5380 | bfqq->meta_pending++; |
---|
4571 | | - |
---|
4572 | | - bfq_update_io_thinktime(bfqd, bfqq); |
---|
4573 | | - bfq_update_has_short_ttime(bfqd, bfqq, bic); |
---|
4574 | | - bfq_update_io_seektime(bfqd, bfqq, rq); |
---|
4575 | | - |
---|
4576 | | - bfq_log_bfqq(bfqd, bfqq, |
---|
4577 | | - "rq_enqueued: has_short_ttime=%d (seeky %d)", |
---|
4578 | | - bfq_bfqq_has_short_ttime(bfqq), BFQQ_SEEKY(bfqq)); |
---|
4579 | 5381 | |
---|
4580 | 5382 | bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); |
---|
4581 | 5383 | |
---|
.. | .. |
---|
4585 | 5387 | bool budget_timeout = bfq_bfqq_budget_timeout(bfqq); |
---|
4586 | 5388 | |
---|
4587 | 5389 | /* |
---|
4588 | | - * There is just this request queued: if the request |
---|
4589 | | - * is small and the queue is not to be expired, then |
---|
4590 | | - * just exit. |
---|
| 5390 | + * There is just this request queued: if |
---|
| 5391 | + * - the request is small, and |
---|
| 5392 | + * - we are idling to boost throughput, and |
---|
| 5393 | + * - the queue is not to be expired, |
---|
| 5394 | + * then just exit. |
---|
4591 | 5395 | * |
---|
4592 | 5396 | * In this way, if the device is being idled to wait |
---|
4593 | 5397 | * for a new request from the in-service queue, we |
---|
4594 | 5398 | * avoid unplugging the device and committing the |
---|
4595 | | - * device to serve just a small request. On the |
---|
4596 | | - * contrary, we wait for the block layer to decide |
---|
4597 | | - * when to unplug the device: hopefully, new requests |
---|
4598 | | - * will be merged to this one quickly, then the device |
---|
4599 | | - * will be unplugged and larger requests will be |
---|
4600 | | - * dispatched. |
---|
| 5399 | + * device to serve just a small request. In contrast |
---|
| 5400 | + * we wait for the block layer to decide when to |
---|
| 5401 | + * unplug the device: hopefully, new requests will be |
---|
| 5402 | + * merged to this one quickly, then the device will be |
---|
| 5403 | + * unplugged and larger requests will be dispatched. |
---|
4601 | 5404 | */ |
---|
4602 | | - if (small_req && !budget_timeout) |
---|
| 5405 | + if (small_req && idling_boosts_thr_without_issues(bfqd, bfqq) && |
---|
| 5406 | + !budget_timeout) |
---|
4603 | 5407 | return; |
---|
4604 | 5408 | |
---|
4605 | 5409 | /* |
---|
4606 | | - * A large enough request arrived, or the queue is to |
---|
4607 | | - * be expired: in both cases disk idling is to be |
---|
4608 | | - * stopped, so clear wait_request flag and reset |
---|
4609 | | - * timer. |
---|
| 5410 | + * A large enough request arrived, or idling is being |
---|
| 5411 | + * performed to preserve service guarantees, or |
---|
| 5412 | + * finally the queue is to be expired: in all these |
---|
| 5413 | + * cases disk idling is to be stopped, so clear |
---|
| 5414 | + * wait_request flag and reset timer. |
---|
4610 | 5415 | */ |
---|
4611 | 5416 | bfq_clear_bfqq_wait_request(bfqq); |
---|
4612 | 5417 | hrtimer_try_to_cancel(&bfqd->idle_slice_timer); |
---|
.. | .. |
---|
4632 | 5437 | bool waiting, idle_timer_disabled = false; |
---|
4633 | 5438 | |
---|
4634 | 5439 | if (new_bfqq) { |
---|
4635 | | - if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq) |
---|
4636 | | - new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1); |
---|
4637 | 5440 | /* |
---|
4638 | 5441 | * Release the request's reference to the old bfqq |
---|
4639 | 5442 | * and make sure one is taken to the shared queue. |
---|
.. | .. |
---|
4663 | 5466 | bfqq = new_bfqq; |
---|
4664 | 5467 | } |
---|
4665 | 5468 | |
---|
| 5469 | + bfq_update_io_thinktime(bfqd, bfqq); |
---|
| 5470 | + bfq_update_has_short_ttime(bfqd, bfqq, RQ_BIC(rq)); |
---|
| 5471 | + bfq_update_io_seektime(bfqd, bfqq, rq); |
---|
| 5472 | + |
---|
4666 | 5473 | waiting = bfqq && bfq_bfqq_wait_request(bfqq); |
---|
4667 | 5474 | bfq_add_request(rq); |
---|
4668 | 5475 | idle_timer_disabled = waiting && !bfq_bfqq_wait_request(bfqq); |
---|
.. | .. |
---|
4675 | 5482 | return idle_timer_disabled; |
---|
4676 | 5483 | } |
---|
4677 | 5484 | |
---|
4678 | | -#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) |
---|
| 5485 | +#ifdef CONFIG_BFQ_CGROUP_DEBUG |
---|
4679 | 5486 | static void bfq_update_insert_stats(struct request_queue *q, |
---|
4680 | 5487 | struct bfq_queue *bfqq, |
---|
4681 | 5488 | bool idle_timer_disabled, |
---|
.. | .. |
---|
4694 | 5501 | * In addition, the following queue lock guarantees that |
---|
4695 | 5502 | * bfqq_group(bfqq) exists as well. |
---|
4696 | 5503 | */ |
---|
4697 | | - spin_lock_irq(q->queue_lock); |
---|
| 5504 | + spin_lock_irq(&q->queue_lock); |
---|
4698 | 5505 | bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags); |
---|
4699 | 5506 | if (idle_timer_disabled) |
---|
4700 | 5507 | bfqg_stats_update_idle_time(bfqq_group(bfqq)); |
---|
4701 | | - spin_unlock_irq(q->queue_lock); |
---|
| 5508 | + spin_unlock_irq(&q->queue_lock); |
---|
4702 | 5509 | } |
---|
4703 | 5510 | #else |
---|
4704 | 5511 | static inline void bfq_update_insert_stats(struct request_queue *q, |
---|
4705 | 5512 | struct bfq_queue *bfqq, |
---|
4706 | 5513 | bool idle_timer_disabled, |
---|
4707 | 5514 | unsigned int cmd_flags) {} |
---|
4708 | | -#endif |
---|
| 5515 | +#endif /* CONFIG_BFQ_CGROUP_DEBUG */ |
---|
| 5516 | + |
---|
| 5517 | +static struct bfq_queue *bfq_init_rq(struct request *rq); |
---|
4709 | 5518 | |
---|
4710 | 5519 | static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, |
---|
4711 | 5520 | bool at_head) |
---|
.. | .. |
---|
4716 | 5525 | bool idle_timer_disabled = false; |
---|
4717 | 5526 | unsigned int cmd_flags; |
---|
4718 | 5527 | |
---|
| 5528 | +#ifdef CONFIG_BFQ_GROUP_IOSCHED |
---|
| 5529 | + if (!cgroup_subsys_on_dfl(io_cgrp_subsys) && rq->bio) |
---|
| 5530 | + bfqg_stats_update_legacy_io(q, rq); |
---|
| 5531 | +#endif |
---|
4719 | 5532 | spin_lock_irq(&bfqd->lock); |
---|
| 5533 | + bfqq = bfq_init_rq(rq); |
---|
4720 | 5534 | if (blk_mq_sched_try_insert_merge(q, rq)) { |
---|
4721 | 5535 | spin_unlock_irq(&bfqd->lock); |
---|
4722 | 5536 | return; |
---|
4723 | 5537 | } |
---|
4724 | 5538 | |
---|
4725 | | - spin_unlock_irq(&bfqd->lock); |
---|
4726 | | - |
---|
4727 | 5539 | blk_mq_sched_request_inserted(rq); |
---|
4728 | 5540 | |
---|
4729 | | - spin_lock_irq(&bfqd->lock); |
---|
4730 | | - bfqq = bfq_init_rq(rq); |
---|
4731 | 5541 | if (!bfqq || at_head || blk_rq_is_passthrough(rq)) { |
---|
4732 | 5542 | if (at_head) |
---|
4733 | 5543 | list_add(&rq->queuelist, &bfqd->dispatch); |
---|
.. | .. |
---|
4776 | 5586 | |
---|
4777 | 5587 | static void bfq_update_hw_tag(struct bfq_data *bfqd) |
---|
4778 | 5588 | { |
---|
| 5589 | + struct bfq_queue *bfqq = bfqd->in_service_queue; |
---|
| 5590 | + |
---|
4779 | 5591 | bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver, |
---|
4780 | 5592 | bfqd->rq_in_driver); |
---|
4781 | 5593 | |
---|
.. | .. |
---|
4788 | 5600 | * sum is not exact, as it's not taking into account deactivated |
---|
4789 | 5601 | * requests. |
---|
4790 | 5602 | */ |
---|
4791 | | - if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD) |
---|
| 5603 | + if (bfqd->rq_in_driver + bfqd->queued <= BFQ_HW_QUEUE_THRESHOLD) |
---|
| 5604 | + return; |
---|
| 5605 | + |
---|
| 5606 | + /* |
---|
| 5607 | + * If the active queue doesn't have enough requests and can idle, bfq |
---|
| 5608 | + * might not dispatch sufficient requests to the hardware. Don't zero |
---|
| 5609 | + * hw_tag in this case. |
---|
| 5610 | + */ |
---|
| 5611 | + if (bfqq && bfq_bfqq_has_short_ttime(bfqq) && |
---|
| 5612 | + bfqq->dispatched + bfqq->queued[0] + bfqq->queued[1] < |
---|
| 5613 | + BFQ_HW_QUEUE_THRESHOLD && |
---|
| 5614 | + bfqd->rq_in_driver < BFQ_HW_QUEUE_THRESHOLD) |
---|
4792 | 5615 | return; |
---|
4793 | 5616 | |
---|
4794 | 5617 | if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES) |
---|
.. | .. |
---|
4797 | 5620 | bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD; |
---|
4798 | 5621 | bfqd->max_rq_in_driver = 0; |
---|
4799 | 5622 | bfqd->hw_tag_samples = 0; |
---|
| 5623 | + |
---|
| 5624 | + bfqd->nonrot_with_queueing = |
---|
| 5625 | + blk_queue_nonrot(bfqd->queue) && bfqd->hw_tag; |
---|
4800 | 5626 | } |
---|
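
bfq_update_hw_tag() infers whether the drive internally queues commands by watching how many requests it manages to keep in flight: only after BFQ_HW_QUEUE_SAMPLES qualifying samples does it compare the observed maximum against BFQ_HW_QUEUE_THRESHOLD. The standalone sketch below replays only that core sampling logic on a synthetic trace; it omits the short-think-time exception added above, the constants mirror the ones defined earlier in this file, and everything else is illustrative.

#include <stdbool.h>
#include <stdio.h>

#define HW_QUEUE_THRESHOLD 3	/* mirrors BFQ_HW_QUEUE_THRESHOLD */
#define HW_QUEUE_SAMPLES   32	/* mirrors BFQ_HW_QUEUE_SAMPLES */

struct hw_tag_state {
	int max_in_driver;
	int samples;
	int hw_tag;		/* -1 = unknown, 0 = no, 1 = yes */
};

/* Feed one observation of "requests currently owned by the device". */
static void hw_tag_sample(struct hw_tag_state *s, int in_driver, int queued)
{
	if (in_driver > s->max_in_driver)
		s->max_in_driver = in_driver;

	/* Too little outstanding I/O to learn anything from this sample. */
	if (in_driver + queued <= HW_QUEUE_THRESHOLD)
		return;

	if (++s->samples < HW_QUEUE_SAMPLES)
		return;

	s->hw_tag = s->max_in_driver > HW_QUEUE_THRESHOLD;
	s->max_in_driver = 0;
	s->samples = 0;
}

int main(void)
{
	struct hw_tag_state s = { .hw_tag = -1 };

	/* A device that keeps ~8 requests in flight looks "queueing". */
	for (int i = 0; i < 64; i++)
		hw_tag_sample(&s, 8, 4);
	printf("hw_tag = %d\n", s.hw_tag);	/* prints 1 */
	return 0;
}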
4801 | 5627 | |
---|
4802 | 5628 | static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) |
---|
.. | .. |
---|
4852 | 5678 | 1UL<<(BFQ_RATE_SHIFT - 10)) |
---|
4853 | 5679 | bfq_update_rate_reset(bfqd, NULL); |
---|
4854 | 5680 | bfqd->last_completion = now_ns; |
---|
| 5681 | + bfqd->last_completed_rq_bfqq = bfqq; |
---|
4855 | 5682 | |
---|
4856 | 5683 | /* |
---|
4857 | 5684 | * If we are waiting to discover whether the request pattern |
---|
.. | .. |
---|
4859 | 5686 | * isochronous, and both requisites for this condition to hold |
---|
4860 | 5687 | * are now satisfied, then compute soft_rt_next_start (see the |
---|
4861 | 5688 | * comments on the function bfq_bfqq_softrt_next_start()). We |
---|
4862 | | - * schedule this delayed check when bfqq expires, if it still |
---|
4863 | | - * has in-flight requests. |
---|
| 5689 | + * do not compute soft_rt_next_start if bfqq is in interactive |
---|
| 5690 | + * weight raising (see the comments in bfq_bfqq_expire() for |
---|
| 5691 | + * an explanation). We schedule this delayed update when bfqq |
---|
| 5692 | + * expires, if it still has in-flight requests. |
---|
4864 | 5693 | */ |
---|
4865 | 5694 | if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 && |
---|
4866 | | - RB_EMPTY_ROOT(&bfqq->sort_list)) |
---|
| 5695 | + RB_EMPTY_ROOT(&bfqq->sort_list) && |
---|
| 5696 | + bfqq->wr_coeff != bfqd->bfq_wr_coeff) |
---|
4867 | 5697 | bfqq->soft_rt_next_start = |
---|
4868 | 5698 | bfq_bfqq_softrt_next_start(bfqd, bfqq); |
---|
4869 | 5699 | |
---|
.. | .. |
---|
4921 | 5751 | } |
---|
4922 | 5752 | |
---|
4923 | 5753 | /* |
---|
| 5754 | + * The processes associated with bfqq may happen to generate their |
---|
| 5755 | + * cumulative I/O at a lower rate than the rate at which the device |
---|
| 5756 | + * could serve the same I/O. This is rather probable, e.g., if only |
---|
| 5757 | + * one process is associated with bfqq and the device is an SSD. It |
---|
| 5758 | + * results in bfqq becoming often empty while in service. In this |
---|
| 5759 | + * respect, if BFQ is allowed to switch to another queue when bfqq |
---|
| 5760 | + * remains empty, then the device goes on being fed with I/O requests, |
---|
| 5761 | + * and the throughput is not affected. In contrast, if BFQ is not |
---|
| 5762 | + * allowed to switch to another queue---because bfqq is sync and |
---|
| 5763 | + * I/O-dispatch needs to be plugged while bfqq is temporarily |
---|
| 5764 | + * empty---then, during the service of bfqq, there will be frequent |
---|
| 5765 | + * "service holes", i.e., time intervals during which bfqq gets empty |
---|
| 5766 | + * and the device can only consume the I/O already queued in its |
---|
| 5767 | + * hardware queues. During service holes, the device may even end up |
---|
| 5768 | + * remaining idle. In the end, during the service of bfqq, the device |
---|
| 5769 | + * is driven at a lower speed than the one it can reach with the kind |
---|
| 5770 | + * of I/O flowing through bfqq. |
---|
| 5771 | + * |
---|
| 5772 | + * To counter this loss of throughput, BFQ implements a "request |
---|
| 5773 | + * injection mechanism", which tries to fill the above service holes |
---|
| 5774 | + * with I/O requests taken from other queues. The hard part in this |
---|
| 5775 | + * mechanism is finding the right amount of I/O to inject, so as to |
---|
| 5776 | + * both boost throughput and not break bfqq's bandwidth and latency |
---|
| 5777 | + * guarantees. In this respect, the mechanism maintains a per-queue |
---|
| 5778 | + * inject limit, computed as below. While bfqq is empty, the injection |
---|
| 5779 | + * mechanism dispatches extra I/O requests only until the total number |
---|
| 5780 | + * of I/O requests in flight---i.e., already dispatched but not yet |
---|
| 5781 | + * completed---remains lower than this limit. |
---|
| 5782 | + * |
---|
| 5783 | + * A first definition comes in handy to introduce the algorithm by |
---|
| 5784 | + * which the inject limit is computed. We define as first request for |
---|
| 5785 | + * bfqq, an I/O request for bfqq that arrives while bfqq is in |
---|
| 5786 | + * service, and causes bfqq to switch from empty to non-empty. The |
---|
| 5787 | + * algorithm updates the limit as a function of the effect of |
---|
| 5788 | + * injection on the service times of only the first requests of |
---|
| 5789 | + * bfqq. The reason for this restriction is that these are the |
---|
| 5790 | + * requests whose service time is affected most, because they are the |
---|
| 5791 | + * first to arrive after injection possibly occurred. |
---|
| 5792 | + * |
---|
| 5793 | + * To evaluate the effect of injection, the algorithm measures the |
---|
| 5794 | + * "total service time" of first requests. We define as total service |
---|
| 5795 | + * time of an I/O request, the time that elapses since when the |
---|
| 5796 | + * request is enqueued into bfqq, to when it is completed. This |
---|
| 5797 | + * quantity allows the whole effect of injection to be measured. It is |
---|
| 5798 | + * easy to see why. Suppose that some requests of other queues are |
---|
| 5799 | + * actually injected while bfqq is empty, and that a new request R |
---|
| 5800 | + * then arrives for bfqq. If the device does start to serve all or |
---|
| 5801 | + * part of the injected requests during the service hole, then, |
---|
| 5802 | + * because of this extra service, it may delay the next invocation of |
---|
| 5803 | + * the dispatch hook of BFQ. Then, even after R gets eventually |
---|
| 5804 | + * dispatched, the device may delay the actual service of R if it is |
---|
| 5805 | + * still busy serving the extra requests, or if it decides to serve, |
---|
| 5806 | + * before R, some extra request still present in its queues. As a |
---|
| 5807 | + * conclusion, the cumulative extra delay caused by injection can be |
---|
| 5808 | + * easily evaluated by just comparing the total service time of first |
---|
| 5809 | + * requests with and without injection. |
---|
| 5810 | + * |
---|
| 5811 | + * The limit-update algorithm works as follows. On the arrival of a |
---|
| 5812 | + * first request of bfqq, the algorithm measures the total time of the |
---|
| 5813 | + * request only if one of the three cases below holds, and, for each |
---|
| 5814 | + * case, it updates the limit as described below: |
---|
| 5815 | + * |
---|
| 5816 | + * (1) If there is no in-flight request. This gives a baseline for the |
---|
| 5817 | + * total service time of the requests of bfqq. If the baseline has |
---|
| 5818 | + * not been computed yet, then, after computing it, the limit is |
---|
| 5819 | + * set to 1, to start boosting throughput, and to prepare the |
---|
| 5820 | + * ground for the next case. If the baseline has already been |
---|
| 5821 | + * computed, then it is updated, in case it turns out to be lower |
---|
| 5822 | + * than the previous value. |
---|
| 5823 | + * |
---|
| 5824 | + * (2) If the limit is higher than 0 and there are in-flight |
---|
| 5825 | + * requests. By comparing the total service time in this case with |
---|
| 5826 | + * the above baseline, it is possible to know to what extent the |
---|
| 5827 | + * current value of the limit is inflating the total service |
---|
| 5828 | + * time. If the inflation is below a certain threshold, then bfqq |
---|
| 5829 | + * is assumed to be suffering from no perceivable loss of its |
---|
| 5830 | + * service guarantees, and the limit is even tentatively |
---|
| 5831 | + * increased. If the inflation is above the threshold, then the |
---|
| 5832 | + * limit is decreased. Due to the lack of any hysteresis, this |
---|
| 5833 | + * logic makes the limit oscillate even in steady workload |
---|
| 5834 | + * conditions. Yet we opted for it, because it is fast in reaching |
---|
| 5835 | + * the best value for the limit, as a function of the current I/O |
---|
| 5836 | + * workload. To reduce oscillations, this step is disabled for a |
---|
| 5837 | + * short time interval after the limit happens to be decreased. |
---|
| 5838 | + * |
---|
| 5839 | + * (3) Periodically, after resetting the limit, to make sure that the |
---|
| 5840 | + * limit eventually drops in case the workload changes. This is |
---|
| 5841 | + * needed because, after the limit has gone safely up for a |
---|
| 5842 | + * certain workload, it is impossible to guess whether the |
---|
| 5843 | + * baseline total service time may have changed, without measuring |
---|
| 5844 | + * it again without injection. A more effective version of this |
---|
| 5845 | + * step might be to just sample the baseline, by interrupting |
---|
| 5846 | + * injection only once, and then to reset/lower the limit only if |
---|
| 5847 | + * the total service time with the current limit does happen to be |
---|
| 5848 | + * too large. |
---|
| 5849 | + * |
---|
| 5850 | + * More details on each step are provided in the comments on the |
---|
| 5851 | + * pieces of code that implement these steps: the branch handling the |
---|
| 5852 | + * transition from empty to non empty in bfq_add_request(), the branch |
---|
| 5853 | + * handling injection in bfq_select_queue(), and the function |
---|
| 5854 | + * bfq_choose_bfqq_for_injection(). These comments also explain some |
---|
| 5855 | + * exceptions, made by the injection mechanism in some special cases. |
---|
| 5856 | + */ |
---|
| 5857 | +static void bfq_update_inject_limit(struct bfq_data *bfqd, |
---|
| 5858 | + struct bfq_queue *bfqq) |
---|
| 5859 | +{ |
---|
| 5860 | + u64 tot_time_ns = ktime_get_ns() - bfqd->last_empty_occupied_ns; |
---|
| 5861 | + unsigned int old_limit = bfqq->inject_limit; |
---|
| 5862 | + |
---|
| 5863 | + if (bfqq->last_serv_time_ns > 0 && bfqd->rqs_injected) { |
---|
| 5864 | + u64 threshold = (bfqq->last_serv_time_ns * 3)>>1; |
---|
| 5865 | + |
---|
| 5866 | + if (tot_time_ns >= threshold && old_limit > 0) { |
---|
| 5867 | + bfqq->inject_limit--; |
---|
| 5868 | + bfqq->decrease_time_jif = jiffies; |
---|
| 5869 | + } else if (tot_time_ns < threshold && |
---|
| 5870 | + old_limit <= bfqd->max_rq_in_driver) |
---|
| 5871 | + bfqq->inject_limit++; |
---|
| 5872 | + } |
---|
| 5873 | + |
---|
| 5874 | + /* |
---|
| 5875 | + * Either we still have to compute the base value for the |
---|
| 5876 | + * total service time, and there seem to be the right |
---|
| 5877 | + * conditions to do it, or we can lower the last base value |
---|
| 5878 | + * computed. |
---|
| 5879 | + * |
---|
| 5880 | + * NOTE: (bfqd->rq_in_driver == 1) means that there is no I/O |
---|
| 5881 | + * request in flight, because this function is in the code |
---|
| 5882 | + * path that handles the completion of a request of bfqq, and, |
---|
| 5883 | + * in particular, this function is executed before |
---|
| 5884 | + * bfqd->rq_in_driver is decremented in such a code path. |
---|
| 5885 | + */ |
---|
| 5886 | + if ((bfqq->last_serv_time_ns == 0 && bfqd->rq_in_driver == 1) || |
---|
| 5887 | + tot_time_ns < bfqq->last_serv_time_ns) { |
---|
| 5888 | + if (bfqq->last_serv_time_ns == 0) { |
---|
| 5889 | + /* |
---|
| 5890 | + * Now we certainly have a base value: make sure we |
---|
| 5891 | + * start trying injection. |
---|
| 5892 | + */ |
---|
| 5893 | + bfqq->inject_limit = max_t(unsigned int, 1, old_limit); |
---|
| 5894 | + } |
---|
| 5895 | + bfqq->last_serv_time_ns = tot_time_ns; |
---|
| 5896 | + } else if (!bfqd->rqs_injected && bfqd->rq_in_driver == 1) |
---|
| 5897 | + /* |
---|
| 5898 | + * No I/O injected and no request still in service in |
---|
| 5899 | + * the drive: these are the exact conditions for |
---|
| 5900 | + * computing the base value of the total service time |
---|
| 5901 | + * for bfqq. So let's update this value, because it is |
---|
| 5902 | + * rather variable. For example, it varies if the size |
---|
| 5903 | + * or the spatial locality of the I/O requests in bfqq |
---|
| 5904 | + * change. |
---|
| 5905 | + */ |
---|
| 5906 | + bfqq->last_serv_time_ns = tot_time_ns; |
---|
| 5907 | + |
---|
| 5908 | + |
---|
| 5909 | + /* update complete, not waiting for any request completion any longer */ |
---|
| 5910 | + bfqd->waited_rq = NULL; |
---|
| 5911 | + bfqd->rqs_injected = false; |
---|
| 5912 | +} |
---|
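
Numerically, the rule implemented above compares each measured total service time (from when waited_rq was marked at dispatch to its completion) against 1.5x the recorded baseline: with a 2 ms baseline the threshold is 3 ms, so a 3.5 ms measurement with a non-zero limit decrements the limit, while a 2.4 ms measurement lets it grow up to max_rq_in_driver. The snippet below replays just that arithmetic outside the kernel; the struct and sample values are made up for illustration, and the baseline (re)computation and the waited_rq/rqs_injected bookkeeping are left out.

#include <stdio.h>

struct inject_state {
	unsigned long long last_serv_time_ns;	/* baseline (0 = unknown) */
	unsigned int inject_limit;
	unsigned int max_rq_in_driver;
};

/* Apply the same 3/2-of-baseline threshold rule as the function above. */
static void update_limit(struct inject_state *s,
			 unsigned long long tot_time_ns, int rqs_injected)
{
	unsigned long long threshold = (s->last_serv_time_ns * 3) >> 1;

	if (s->last_serv_time_ns > 0 && rqs_injected) {
		if (tot_time_ns >= threshold && s->inject_limit > 0)
			s->inject_limit--;		/* injection hurt */
		else if (tot_time_ns < threshold &&
			 s->inject_limit <= s->max_rq_in_driver)
			s->inject_limit++;		/* room to inject more */
	}
}

int main(void)
{
	struct inject_state s = {
		.last_serv_time_ns = 2000000,	/* 2 ms baseline */
		.inject_limit = 2,
		.max_rq_in_driver = 8,
	};

	update_limit(&s, 3500000, 1);	/* 3.5 ms >= 3 ms: limit drops to 1 */
	update_limit(&s, 2400000, 1);	/* 2.4 ms <  3 ms: limit back to 2 */
	printf("inject_limit = %u\n", s.inject_limit);	/* prints 2 */
	return 0;
}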
| 5913 | + |
---|
| 5914 | +/* |
---|
4924 | 5915 | * Handle either a requeue or a finish for rq. The things to do are |
---|
4925 | 5916 | * the same in both cases: all references to rq are to be dropped. In |
---|
4926 | 5917 | * particular, rq is considered completed from the point of view of |
---|
.. | .. |
---|
4930 | 5921 | { |
---|
4931 | 5922 | struct bfq_queue *bfqq = RQ_BFQQ(rq); |
---|
4932 | 5923 | struct bfq_data *bfqd; |
---|
4933 | | - |
---|
4934 | | - /* |
---|
4935 | | - * Requeue and finish hooks are invoked in blk-mq without |
---|
4936 | | - * checking whether the involved request is actually still |
---|
4937 | | - * referenced in the scheduler. To handle this fact, the |
---|
4938 | | - * following two checks make this function exit in case of |
---|
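
The seek_history field updated above is a bit shift register: each new request shifts the window left and ORs in one bit saying whether that request looked seeky, so "how seeky is this queue" reduces to a popcount over the recent window. A compact userspace illustration is below; the window size and cutoff are made up for the example (BFQQ_SEEKY/BFQQ_TOTALLY_SEEKY derive their own cutoffs from the 32-bit history), and the names are not kernel symbols.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define HISTORY_BITS	8
#define SEEKY_BITS	5	/* "mostly seeky" if >= 5 of the last 8 were */

static uint32_t history;

/* Record one request: shift the window and OR in the new observation. */
static void record_request(bool seeky)
{
	history = ((history << 1) | (seeky ? 1 : 0)) &
		  ((1u << HISTORY_BITS) - 1);
}

static bool queue_is_seeky(void)
{
	return __builtin_popcount(history) >= SEEKY_BITS;
}

int main(void)
{
	/* Six seeky requests followed by two sequential ones. */
	for (int i = 0; i < 6; i++)
		record_request(true);
	record_request(false);
	record_request(false);

	printf("seeky: %d\n", queue_is_seeky());	/* 6 of 8 set -> 1 */
	return 0;
}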
4939 | | - * spurious invocations, for which there is nothing to do. |
---|
4940 | | - * |
---|
4941 | | - * First, check whether rq has nothing to do with an elevator. |
---|
4942 | | - */ |
---|
4943 | | - if (unlikely(!(rq->rq_flags & RQF_ELVPRIV))) |
---|
4944 | | - return; |
---|
4945 | 5924 | |
---|
4946 | 5925 | /* |
---|
4947 | 5926 | * rq either is not associated with any icq, or is an already |
---|
.. | .. |
---|
4963 | 5942 | unsigned long flags; |
---|
4964 | 5943 | |
---|
4965 | 5944 | spin_lock_irqsave(&bfqd->lock, flags); |
---|
| 5945 | + |
---|
| 5946 | + if (rq == bfqd->waited_rq) |
---|
| 5947 | + bfq_update_inject_limit(bfqd, bfqq); |
---|
4966 | 5948 | |
---|
4967 | 5949 | bfq_completed_request(bfqq, bfqd); |
---|
4968 | 5950 | bfq_finish_requeue_request_body(bfqq); |
---|
.. | .. |
---|
5012 | 5994 | } |
---|
5013 | 5995 | |
---|
5014 | 5996 | /* |
---|
| 5997 | + * Removes the association between the current task and bfqq, assuming |
---|
| 5998 | + * that bic points to the bfq iocontext of the task. |
---|
5015 | 5999 | * Returns NULL if a new bfqq should be allocated, or the old bfqq if this |
---|
5016 | 6000 | * was the last process referring to that bfqq. |
---|
5017 | 6001 | */ |
---|
.. | .. |
---|
5027 | 6011 | return bfqq; |
---|
5028 | 6012 | } |
---|
5029 | 6013 | |
---|
5030 | | - bic_set_bfqq(bic, NULL, 1); |
---|
| 6014 | + bic_set_bfqq(bic, NULL, true); |
---|
5031 | 6015 | |
---|
5032 | 6016 | bfq_put_cooperator(bfqq); |
---|
5033 | 6017 | |
---|
5034 | | - bfq_put_queue(bfqq); |
---|
| 6018 | + bfq_release_process_ref(bfqq->bfqd, bfqq); |
---|
5035 | 6019 | return NULL; |
---|
5036 | 6020 | } |
---|
5037 | 6021 | |
---|
.. | .. |
---|
5104 | 6088 | * comments on bfq_init_rq for the reason behind this delayed |
---|
5105 | 6089 | * preparation. |
---|
5106 | 6090 | */ |
---|
5107 | | -static void bfq_prepare_request(struct request *rq, struct bio *bio) |
---|
| 6091 | +static void bfq_prepare_request(struct request *rq) |
---|
5108 | 6092 | { |
---|
5109 | 6093 | /* |
---|
5110 | 6094 | * Regardless of whether we have an icq attached, we have to |
---|
.. | .. |
---|
5127 | 6111 | * preparation is that, after the prepare_request hook is invoked for |
---|
5128 | 6112 | * rq, rq may still be transformed into a request with no icq, i.e., a |
---|
5129 | 6113 | * request not associated with any queue. No bfq hook is invoked to |
---|
5130 | | - * signal this tranformation. As a consequence, should these |
---|
| 6114 | + * signal this transformation. As a consequence, should these |
---|
5131 | 6115 | * preparation operations be performed when the prepare_request hook |
---|
5132 | 6116 | * is invoked, and should rq be transformed one moment later, bfq |
---|
5133 | 6117 | * would end up in an inconsistent state, because it would have |
---|
.. | .. |
---|
5218 | 6202 | } |
---|
5219 | 6203 | } |
---|
5220 | 6204 | |
---|
5221 | | - if (unlikely(bfq_bfqq_just_created(bfqq))) |
---|
| 6205 | + /* |
---|
| 6206 | + * Consider bfqq as possibly belonging to a burst of newly |
---|
| 6207 | + * created queues only if: |
---|
| 6208 | + * 1) A burst is actually happening (bfqd->burst_size > 0) |
---|
| 6209 | + * or |
---|
| 6210 | + * 2) There is no other active queue. In fact, if, in |
---|
| 6211 | + * contrast, there are active queues not belonging to the |
---|
| 6212 | + * possible burst bfqq may belong to, then there is no gain |
---|
| 6213 | + * in considering bfqq as belonging to a burst, and |
---|
| 6214 | + * therefore in not weight-raising bfqq. See comments on |
---|
| 6215 | + * bfq_handle_burst(). |
---|
| 6216 | + * |
---|
| 6217 | + * This filtering also helps eliminating false positives, |
---|
| 6218 | + * occurring when bfqq does not belong to an actual large |
---|
| 6219 | + * burst, but some background task (e.g., a service) happens |
---|
| 6220 | + * to trigger the creation of new queues very close to when |
---|
| 6221 | + * bfqq and its possible companion queues are created. See |
---|
| 6222 | + * comments on bfq_handle_burst() for further details also on |
---|
| 6223 | + * this issue. |
---|
| 6224 | + */ |
---|
| 6225 | + if (unlikely(bfq_bfqq_just_created(bfqq) && |
---|
| 6226 | + (bfqd->burst_size > 0 || |
---|
| 6227 | + bfq_tot_busy_queues(bfqd) == 0))) |
---|
5222 | 6228 | bfq_handle_burst(bfqd, bfqq); |
---|
5223 | 6229 | |
---|
5224 | 6230 | return bfqq; |
---|
.. | .. |
---|
5267 | 6273 | bfq_bfqq_expire(bfqd, bfqq, true, reason); |
---|
5268 | 6274 | |
---|
5269 | 6275 | schedule_dispatch: |
---|
5270 | | - spin_unlock_irqrestore(&bfqd->lock, flags); |
---|
5271 | 6276 | bfq_schedule_dispatch(bfqd); |
---|
| 6277 | + spin_unlock_irqrestore(&bfqd->lock, flags); |
---|
5272 | 6278 | } |
---|
5273 | 6279 | |
---|
5274 | 6280 | /* |
---|
.. | .. |
---|
5381 | 6387 | struct blk_mq_tags *tags = hctx->sched_tags; |
---|
5382 | 6388 | unsigned int min_shallow; |
---|
5383 | 6389 | |
---|
5384 | | - min_shallow = bfq_update_depths(bfqd, &tags->bitmap_tags); |
---|
5385 | | - sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, min_shallow); |
---|
| 6390 | + min_shallow = bfq_update_depths(bfqd, tags->bitmap_tags); |
---|
| 6391 | + sbitmap_queue_min_shallow_depth(tags->bitmap_tags, min_shallow); |
---|
5386 | 6392 | } |
---|
5387 | 6393 | |
---|
5388 | 6394 | static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index) |
---|
.. | .. |
---|
5405 | 6411 | |
---|
5406 | 6412 | hrtimer_cancel(&bfqd->idle_slice_timer); |
---|
5407 | 6413 | |
---|
5408 | | -#ifdef CONFIG_BFQ_GROUP_IOSCHED |
---|
5409 | 6414 | /* release oom-queue reference to root group */ |
---|
5410 | 6415 | bfqg_and_blkg_put(bfqd->root_group); |
---|
5411 | 6416 | |
---|
| 6417 | +#ifdef CONFIG_BFQ_GROUP_IOSCHED |
---|
5412 | 6418 | blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq); |
---|
5413 | 6419 | #else |
---|
5414 | 6420 | spin_lock_irq(&bfqd->lock); |
---|
.. | .. |
---|
5454 | 6460 | } |
---|
5455 | 6461 | eq->elevator_data = bfqd; |
---|
5456 | 6462 | |
---|
5457 | | - spin_lock_irq(q->queue_lock); |
---|
| 6463 | + spin_lock_irq(&q->queue_lock); |
---|
5458 | 6464 | q->elevator = eq; |
---|
5459 | | - spin_unlock_irq(q->queue_lock); |
---|
| 6465 | + spin_unlock_irq(&q->queue_lock); |
---|
5460 | 6466 | |
---|
5461 | 6467 | /* |
---|
5462 | 6468 | * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. |
---|
.. | .. |
---|
5488 | 6494 | HRTIMER_MODE_REL); |
---|
5489 | 6495 | bfqd->idle_slice_timer.function = bfq_idle_slice_timer; |
---|
5490 | 6496 | |
---|
5491 | | - bfqd->queue_weights_tree = RB_ROOT; |
---|
| 6497 | + bfqd->queue_weights_tree = RB_ROOT_CACHED; |
---|
5492 | 6498 | bfqd->num_groups_with_pending_reqs = 0; |
---|
5493 | 6499 | |
---|
5494 | 6500 | INIT_LIST_HEAD(&bfqd->active_list); |
---|
.. | .. |
---|
5496 | 6502 | INIT_HLIST_HEAD(&bfqd->burst_list); |
---|
5497 | 6503 | |
---|
5498 | 6504 | bfqd->hw_tag = -1; |
---|
| 6505 | + bfqd->nonrot_with_queueing = blk_queue_nonrot(bfqd->queue); |
---|
5499 | 6506 | |
---|
5500 | 6507 | bfqd->bfq_max_budget = bfq_default_max_budget; |
---|
5501 | 6508 | |
---|
.. | .. |
---|
5796 | 6803 | }; |
---|
5797 | 6804 | |
---|
5798 | 6805 | static struct elevator_type iosched_bfq_mq = { |
---|
5799 | | - .ops.mq = { |
---|
| 6806 | + .ops = { |
---|
5800 | 6807 | .limit_depth = bfq_limit_depth, |
---|
5801 | 6808 | .prepare_request = bfq_prepare_request, |
---|
5802 | 6809 | .requeue_request = bfq_finish_requeue_request, |
---|
.. | .. |
---|
5818 | 6825 | .exit_sched = bfq_exit_queue, |
---|
5819 | 6826 | }, |
---|
5820 | 6827 | |
---|
5821 | | - .uses_mq = true, |
---|
5822 | 6828 | .icq_size = sizeof(struct bfq_io_cq), |
---|
5823 | 6829 | .icq_align = __alignof__(struct bfq_io_cq), |
---|
5824 | 6830 | .elevator_attrs = bfq_attrs, |
---|