.. | .. |
---|
| 1 | +/* SPDX-License-Identifier: GPL-2.0-or-later */ |
---|
1 | 2 | /* |
---|
2 | 3 | * Header file for the BFQ I/O scheduler: data structures and |
---|
3 | 4 | * prototypes of interface functions among BFQ components. |
---|
4 | | - * |
---|
5 | | - * This program is free software; you can redistribute it and/or |
---|
6 | | - * modify it under the terms of the GNU General Public License as |
---|
7 | | - * published by the Free Software Foundation; either version 2 of the |
---|
8 | | - * License, or (at your option) any later version. |
---|
9 | | - * |
---|
10 | | - * This program is distributed in the hope that it will be useful, |
---|
11 | | - * but WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
12 | | - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
---|
13 | | - * General Public License for more details. |
---|
14 | 5 | */ |
---|
15 | 6 | #ifndef _BFQ_H |
---|
16 | 7 | #define _BFQ_H |
---|
.. | .. |
---|
18 | 9 | #include <linux/blktrace_api.h> |
---|
19 | 10 | #include <linux/hrtimer.h> |
---|
20 | 11 | #include <linux/blk-cgroup.h> |
---|
| 12 | + |
---|
| 13 | +#include "blk-cgroup-rwstat.h" |
---|
21 | 14 | |
---|
22 | 15 | #define BFQ_IOPRIO_CLASSES 3 |
---|
23 | 16 | #define BFQ_CL_IDLE_TIMEOUT (HZ/5) |
---|
.. | .. |
---|
31 | 24 | #define BFQ_WEIGHT_LEGACY_DFL 100 |
---|
32 | 25 | #define BFQ_DEFAULT_GRP_IOPRIO 0 |
---|
33 | 26 | #define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE |
---|
| 27 | + |
---|
| 28 | +#define MAX_PID_STR_LENGTH 12 |
---|
34 | 29 | |
---|
35 | 30 | /* |
---|
36 | 31 | * Soft real-time applications are far more latency-sensitive |
---|
.. | .. |
---|
89 | 84 | * expiration. This peculiar definition allows for the following |
---|
90 | 85 | * optimization, not yet exploited: while a given entity is still in |
---|
91 | 86 | * service, we already know which is the best candidate for next |
---|
92 | | - * service among the other active entitities in the same parent |
---|
| 87 | + * service among the other active entities in the same parent |
---|
93 | 88 | * entity. We can then quickly compare the timestamps of the |
---|
94 | 89 | * in-service entity with those of such best candidate. |
---|
95 | 90 | * |
---|
.. | .. |
---|
140 | 135 | * |
---|
141 | 136 | * Unless cgroups are used, the weight value is calculated from the |
---|
142 | 137 | * ioprio to export the same interface as CFQ. When dealing with |
---|
143 | | - * ``well-behaved'' queues (i.e., queues that do not spend too much |
---|
| 138 | + * "well-behaved" queues (i.e., queues that do not spend too much |
---|
144 | 139 | * time to consume their budget and have true sequential behavior, and |
---|
145 | 140 | * when there are no external factors breaking anticipation) the |
---|
146 | 141 | * relative weights at each level of the cgroups hierarchy should be |
---|
.. | .. |
---|
155 | 150 | * Flag, true if the entity is on a tree (either the active or |
---|
156 | 151 | * the idle one of its service_tree) or is in service. |
---|
157 | 152 | */ |
---|
158 | | - bool on_st; |
---|
| 153 | + bool on_st_or_in_serv; |
---|
159 | 154 | |
---|
160 | 155 | /* B-WF2Q+ start and finish timestamps [sectors/weight] */ |
---|
161 | 156 | u64 start, finish; |
---|
.. | .. |
---|
175 | 170 | /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */ |
---|
176 | 171 | int budget; |
---|
177 | 172 | |
---|
| 173 | + /* device weight; if non-zero, it overrides the default weight of |
---|
| 174 | + * bfq_group_data */ |
---|
| 175 | + int dev_weight; |
---|
178 | 176 | /* weight of the queue */ |
---|
179 | 177 | int weight; |
---|
180 | 178 | /* next weight if a change is in progress */ |
---|
.. | .. |
---|
239 | 237 | unsigned short ioprio, ioprio_class; |
---|
240 | 238 | /* next ioprio and ioprio class if a change is in progress */ |
---|
241 | 239 | unsigned short new_ioprio, new_ioprio_class; |
---|
| 240 | + |
---|
| 241 | + /* last total-service-time sample, see bfq_update_inject_limit() */ |
---|
| 242 | + u64 last_serv_time_ns; |
---|
| 243 | + /* limit for request injection */ |
---|
| 244 | + unsigned int inject_limit; |
---|
| 245 | + /* last time the inject limit was decreased, in jiffies */ |
---|
| 246 | + unsigned long decrease_time_jif; |
---|
242 | 247 | |
---|
243 | 248 | /* |
---|
244 | 249 | * Shared bfq_queue if queue is cooperating with one or more |
---|
.. | .. |
---|
357 | 362 | |
---|
358 | 363 | /* max service rate measured so far */ |
---|
359 | 364 | u32 max_service_rate; |
---|
| 365 | + |
---|
360 | 366 | /* |
---|
361 | | - * Ratio between the service received by bfqq while it is in |
---|
362 | | - * service, and the cumulative service (of requests of other |
---|
363 | | - * queues) that may be injected while bfqq is empty but still |
---|
364 | | - * in service. To increase precision, the coefficient is |
---|
365 | | - * measured in tenths of unit. Here are some example of (1) |
---|
366 | | - * ratios, (2) resulting percentages of service injected |
---|
367 | | - * w.r.t. to the total service dispatched while bfqq is in |
---|
368 | | - * service, and (3) corresponding values of the coefficient: |
---|
369 | | - * 1 (50%) -> 10 |
---|
370 | | - * 2 (33%) -> 20 |
---|
371 | | - * 10 (9%) -> 100 |
---|
372 | | - * 9.9 (9%) -> 99 |
---|
373 | | - * 1.5 (40%) -> 15 |
---|
374 | | - * 0.5 (66%) -> 5 |
---|
375 | | - * 0.1 (90%) -> 1 |
---|
376 | | - * |
---|
377 | | - * So, if the coefficient is lower than 10, then |
---|
378 | | - * injected service is more than bfqq service. |
---|
| 367 | + * Pointer to the waker queue for this queue, i.e., to the |
---|
| 368 | + * queue Q such that this queue happens to get new I/O right |
---|
| 369 | + * after some I/O request of Q is completed. For details, see |
---|
| 370 | + * the comments on the choice of the queue for injection in |
---|
| 371 | + * bfq_select_queue(). |
---|
379 | 372 | */ |
---|
380 | | - unsigned int inject_coeff; |
---|
381 | | - /* amount of service injected in current service slot */ |
---|
382 | | - unsigned int injected_service; |
---|
| 373 | + struct bfq_queue *waker_bfqq; |
---|
| 374 | + /* node for woken_list, see below */ |
---|
| 375 | + struct hlist_node woken_list_node; |
---|
| 376 | + /* |
---|
| 377 | + * Head of the list of the woken queues for this queue, i.e., |
---|
| 378 | + * of the list of the queues for which this queue is a waker |
---|
| 379 | + * queue. This list is used to reset the waker_bfqq pointer in |
---|
| 380 | + * the woken queues when this queue exits. |
---|
| 381 | + */ |
---|
| 382 | + struct hlist_head woken_list; |
---|
383 | 383 | }; |
---|
384 | 384 | |
---|
385 | 385 | /** |
---|
.. | .. |
---|
419 | 419 | bool was_in_burst_list; |
---|
420 | 420 | |
---|
421 | 421 | /* |
---|
| 422 | + * Save the weight when a merge occurs, to be able |
---|
| 423 | + * to restore it in case of split. If the weight is not |
---|
| 424 | + * correctly resumed when the queue is recycled, |
---|
| 425 | + * then the weight of the recycled queue could differ |
---|
| 426 | + * from the weight of the original queue. |
---|
| 427 | + */ |
---|
| 428 | + unsigned int saved_weight; |
---|
| 429 | + |
---|
| 430 | + /* |
---|
422 | 431 | * Similar to previous fields: save wr information. |
---|
423 | 432 | */ |
---|
424 | 433 | unsigned long saved_wr_coeff; |
---|
.. | .. |
---|
450 | 459 | * weight-raised @bfq_queue (see the comments to the functions |
---|
451 | 460 | * bfq_weights_tree_[add|remove] for further details). |
---|
452 | 461 | */ |
---|
453 | | - struct rb_root queue_weights_tree; |
---|
| 462 | + struct rb_root_cached queue_weights_tree; |
---|
454 | 463 | |
---|
455 | 464 | /* |
---|
456 | 465 | * Number of groups with at least one descendant process that |
---|
.. | .. |
---|
501 | 510 | unsigned int num_groups_with_pending_reqs; |
---|
502 | 511 | |
---|
503 | 512 | /* |
---|
504 | | - * Number of bfq_queues containing requests (including the |
---|
505 | | - * queue in service, even if it is idling). |
---|
| 513 | + * Per-class (RT, BE, IDLE) number of bfq_queues containing |
---|
| 514 | + * requests (including the queue in service, even if it is |
---|
| 515 | + * idling). |
---|
506 | 516 | */ |
---|
507 | | - int busy_queues; |
---|
| 517 | + unsigned int busy_queues[3]; |
---|
508 | 518 | /* number of weight-raised busy @bfq_queues */ |
---|
509 | 519 | int wr_busy_queues; |
---|
510 | 520 | /* number of queued requests */ |
---|
511 | 521 | int queued; |
---|
512 | 522 | /* number of requests dispatched and waiting for completion */ |
---|
513 | 523 | int rq_in_driver; |
---|
| 524 | + |
---|
| 525 | + /* true if the device is non-rotational and performs queueing */ |
---|
| 526 | + bool nonrot_with_queueing; |
---|
514 | 527 | |
---|
515 | 528 | /* |
---|
516 | 529 | * Maximum number of requests in driver in the last |
---|
.. | .. |
---|
543 | 556 | /* time of last request completion (ns) */ |
---|
544 | 557 | u64 last_completion; |
---|
545 | 558 | |
---|
| 559 | + /* bfqq owning the last completed rq */ |
---|
| 560 | + struct bfq_queue *last_completed_rq_bfqq; |
---|
| 561 | + |
---|
| 562 | + /* time of last transition from empty to non-empty (ns) */ |
---|
| 563 | + u64 last_empty_occupied_ns; |
---|
| 564 | + |
---|
| 565 | + /* |
---|
| 566 | + * Flag set to activate the sampling of the total service time |
---|
| 567 | + * of a just-arrived first I/O request (see |
---|
| 568 | + * bfq_update_inject_limit()). This will cause the setting of |
---|
| 569 | + * waited_rq when the request is finally dispatched. |
---|
| 570 | + */ |
---|
| 571 | + bool wait_dispatch; |
---|
| 572 | + /* |
---|
| 573 | + * If set, then bfq_update_inject_limit() is invoked when |
---|
| 574 | + * waited_rq is eventually completed. |
---|
| 575 | + */ |
---|
| 576 | + struct request *waited_rq; |
---|
| 577 | + /* |
---|
| 578 | + * True if some request has been injected during the last service hole. |
---|
| 579 | + */ |
---|
| 580 | + bool rqs_injected; |
---|
| 581 | + |
---|
546 | 582 | /* time of first rq dispatch in current observation interval (ns) */ |
---|
547 | 583 | u64 first_dispatch; |
---|
548 | 584 | /* time of last rq dispatch in current observation interval (ns) */ |
---|
.. | .. |
---|
552 | 588 | ktime_t last_budget_start; |
---|
553 | 589 | /* beginning of the last idle slice */ |
---|
554 | 590 | ktime_t last_idling_start; |
---|
| 591 | + unsigned long last_idling_start_jiffies; |
---|
555 | 592 | |
---|
556 | 593 | /* number of samples in current observation interval */ |
---|
557 | 594 | int peak_rate_samples; |
---|
.. | .. |
---|
732 | 769 | * update |
---|
733 | 770 | */ |
---|
734 | 771 | BFQQF_coop, /* bfqq is shared */ |
---|
735 | | - BFQQF_split_coop /* shared bfqq will be split */ |
---|
| 772 | + BFQQF_split_coop, /* shared bfqq will be split */ |
---|
| 773 | + BFQQF_has_waker /* bfqq has a waker queue */ |
---|
736 | 774 | }; |
---|
737 | 775 | |
---|
738 | 776 | #define BFQ_BFQQ_FNS(name) \ |
---|
.. | .. |
---|
752 | 790 | BFQ_BFQQ_FNS(coop); |
---|
753 | 791 | BFQ_BFQQ_FNS(split_coop); |
---|
754 | 792 | BFQ_BFQQ_FNS(softrt_update); |
---|
| 793 | +BFQ_BFQQ_FNS(has_waker); |
---|
755 | 794 | #undef BFQ_BFQQ_FNS |
---|
756 | 795 | |
---|
757 | 796 | /* Expiration reasons. */ |
---|
.. | .. |
---|
766 | 805 | BFQQE_PREEMPTED /* preemption in progress */ |
---|
767 | 806 | }; |
---|
768 | 807 | |
---|
| 808 | +struct bfq_stat { |
---|
| 809 | + struct percpu_counter cpu_cnt; |
---|
| 810 | + atomic64_t aux_cnt; |
---|
| 811 | +}; |
---|
| 812 | + |
---|
769 | 813 | struct bfqg_stats { |
---|
770 | | -#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) |
---|
| 814 | + /* basic stats */ |
---|
| 815 | + struct blkg_rwstat bytes; |
---|
| 816 | + struct blkg_rwstat ios; |
---|
| 817 | +#ifdef CONFIG_BFQ_CGROUP_DEBUG |
---|
771 | 818 | /* number of ios merged */ |
---|
772 | 819 | struct blkg_rwstat merged; |
---|
773 | 820 | /* total time spent on device in ns, may not be accurate w/ queueing */ |
---|
.. | .. |
---|
777 | 824 | /* number of IOs queued up */ |
---|
778 | 825 | struct blkg_rwstat queued; |
---|
779 | 826 | /* total disk time and nr sectors dispatched by this group */ |
---|
780 | | - struct blkg_stat time; |
---|
| 827 | + struct bfq_stat time; |
---|
781 | 828 | /* sum of number of ios queued across all samples */ |
---|
782 | | - struct blkg_stat avg_queue_size_sum; |
---|
| 829 | + struct bfq_stat avg_queue_size_sum; |
---|
783 | 830 | /* count of samples taken for average */ |
---|
784 | | - struct blkg_stat avg_queue_size_samples; |
---|
| 831 | + struct bfq_stat avg_queue_size_samples; |
---|
785 | 832 | /* how many times this group has been removed from service tree */ |
---|
786 | | - struct blkg_stat dequeue; |
---|
| 833 | + struct bfq_stat dequeue; |
---|
787 | 834 | /* total time spent waiting for it to be assigned a timeslice. */ |
---|
788 | | - struct blkg_stat group_wait_time; |
---|
| 835 | + struct bfq_stat group_wait_time; |
---|
789 | 836 | /* time spent idling for this blkcg_gq */ |
---|
790 | | - struct blkg_stat idle_time; |
---|
| 837 | + struct bfq_stat idle_time; |
---|
791 | 838 | /* total time with empty current active q with other requests queued */ |
---|
792 | | - struct blkg_stat empty_time; |
---|
| 839 | + struct bfq_stat empty_time; |
---|
793 | 840 | /* fields after this shouldn't be cleared on stat reset */ |
---|
794 | 841 | u64 start_group_wait_time; |
---|
795 | 842 | u64 start_idle_time; |
---|
796 | 843 | u64 start_empty_time; |
---|
797 | 844 | uint16_t flags; |
---|
798 | | -#endif /* CONFIG_BFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */ |
---|
| 845 | +#endif /* CONFIG_BFQ_CGROUP_DEBUG */ |
---|
799 | 846 | }; |
---|
800 | 847 | |
---|
801 | 848 | #ifdef CONFIG_BFQ_GROUP_IOSCHED |
---|
.. | .. |
---|
854 | 901 | |
---|
855 | 902 | /* reference counter (see comments in bfq_bic_update_cgroup) */ |
---|
856 | 903 | int ref; |
---|
| 904 | + /* Is bfq_group still online? */ |
---|
| 905 | + bool online; |
---|
857 | 906 | |
---|
858 | 907 | struct bfq_entity entity; |
---|
859 | 908 | struct bfq_sched_data sched_data; |
---|
.. | .. |
---|
874 | 923 | |
---|
875 | 924 | #else |
---|
876 | 925 | struct bfq_group { |
---|
| 926 | + struct bfq_entity entity; |
---|
877 | 927 | struct bfq_sched_data sched_data; |
---|
878 | 928 | |
---|
879 | 929 | struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; |
---|
.. | .. |
---|
897 | 947 | struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic); |
---|
898 | 948 | void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq); |
---|
899 | 949 | void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
---|
900 | | - struct rb_root *root); |
---|
| 950 | + struct rb_root_cached *root); |
---|
901 | 951 | void __bfq_weights_tree_remove(struct bfq_data *bfqd, |
---|
902 | 952 | struct bfq_queue *bfqq, |
---|
903 | | - struct rb_root *root); |
---|
| 953 | + struct rb_root_cached *root); |
---|
904 | 954 | void bfq_weights_tree_remove(struct bfq_data *bfqd, |
---|
905 | 955 | struct bfq_queue *bfqq); |
---|
906 | 956 | void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
---|
907 | 957 | bool compensate, enum bfqq_expiration reason); |
---|
908 | 958 | void bfq_put_queue(struct bfq_queue *bfqq); |
---|
| 959 | +void bfq_put_cooperator(struct bfq_queue *bfqq); |
---|
909 | 960 | void bfq_end_wr_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); |
---|
| 961 | +void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq); |
---|
910 | 962 | void bfq_schedule_dispatch(struct bfq_data *bfqd); |
---|
911 | 963 | void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); |
---|
912 | 964 | |
---|
.. | .. |
---|
914 | 966 | |
---|
915 | 967 | /* ---------------- cgroups-support interface ---------------- */ |
---|
916 | 968 | |
---|
| 969 | +void bfqg_stats_update_legacy_io(struct request_queue *q, struct request *rq); |
---|
917 | 970 | void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, |
---|
918 | 971 | unsigned int op); |
---|
919 | 972 | void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op); |
---|
.. | .. |
---|
931 | 984 | void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg); |
---|
932 | 985 | void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio); |
---|
933 | 986 | void bfq_end_wr_async(struct bfq_data *bfqd); |
---|
934 | | -struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, |
---|
935 | | - struct blkcg *blkcg); |
---|
| 987 | +struct bfq_group *bfq_bio_bfqg(struct bfq_data *bfqd, struct bio *bio); |
---|
936 | 988 | struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); |
---|
937 | 989 | struct bfq_group *bfqq_group(struct bfq_queue *bfqq); |
---|
938 | 990 | struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node); |
---|
.. | .. |
---|
977 | 1029 | |
---|
978 | 1030 | struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq); |
---|
979 | 1031 | struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); |
---|
| 1032 | +unsigned int bfq_tot_busy_queues(struct bfq_data *bfqd); |
---|
980 | 1033 | struct bfq_service_tree *bfq_entity_service_tree(struct bfq_entity *entity); |
---|
981 | 1034 | struct bfq_entity *bfq_entity_of(struct rb_node *node); |
---|
982 | 1035 | unsigned short bfq_ioprio_to_weight(int ioprio); |
---|
.. | .. |
---|
1006 | 1059 | /* --------------- end of interface of B-WF2Q+ ---------------- */ |
---|
1007 | 1060 | |
---|
1008 | 1061 | /* Logging facilities. */ |
---|
| 1062 | +static inline void bfq_pid_to_str(int pid, char *str, int len) |
---|
| 1063 | +{ |
---|
| 1064 | + if (pid != -1) |
---|
| 1065 | + snprintf(str, len, "%d", pid); |
---|
| 1066 | + else |
---|
| 1067 | + snprintf(str, len, "SHARED-"); |
---|
| 1068 | +} |
---|
| 1069 | + |
---|
1009 | 1070 | #ifdef CONFIG_BFQ_GROUP_IOSCHED |
---|
1010 | 1071 | struct bfq_group *bfqq_group(struct bfq_queue *bfqq); |
---|
1011 | 1072 | |
---|
1012 | 1073 | #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ |
---|
| 1074 | + char pid_str[MAX_PID_STR_LENGTH]; \ |
---|
| 1075 | + if (likely(!blk_trace_note_message_enabled((bfqd)->queue))) \ |
---|
| 1076 | + break; \ |
---|
| 1077 | + bfq_pid_to_str((bfqq)->pid, pid_str, MAX_PID_STR_LENGTH); \ |
---|
1013 | 1078 | blk_add_cgroup_trace_msg((bfqd)->queue, \ |
---|
1014 | 1079 | bfqg_to_blkg(bfqq_group(bfqq))->blkcg, \ |
---|
1015 | | - "bfq%d%c " fmt, (bfqq)->pid, \ |
---|
| 1080 | + "bfq%s%c " fmt, pid_str, \ |
---|
1016 | 1081 | bfq_bfqq_sync((bfqq)) ? 'S' : 'A', ##args); \ |
---|
1017 | 1082 | } while (0) |
---|
1018 | 1083 | |
---|
.. | .. |
---|
1023 | 1088 | |
---|
1024 | 1089 | #else /* CONFIG_BFQ_GROUP_IOSCHED */ |
---|
1025 | 1090 | |
---|
1026 | | -#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ |
---|
1027 | | - blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid, \ |
---|
| 1091 | +#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ |
---|
| 1092 | + char pid_str[MAX_PID_STR_LENGTH]; \ |
---|
| 1093 | + if (likely(!blk_trace_note_message_enabled((bfqd)->queue))) \ |
---|
| 1094 | + break; \ |
---|
| 1095 | + bfq_pid_to_str((bfqq)->pid, pid_str, MAX_PID_STR_LENGTH); \ |
---|
| 1096 | + blk_add_trace_msg((bfqd)->queue, "bfq%s%c " fmt, pid_str, \ |
---|
1028 | 1097 | bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ |
---|
1029 | | - ##args) |
---|
| 1098 | + ##args); \ |
---|
| 1099 | +} while (0) |
---|
1030 | 1100 | #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) |
---|
1031 | 1101 | |
---|
1032 | 1102 | #endif /* CONFIG_BFQ_GROUP_IOSCHED */ |
---|