hc
2024-10-22 8ac6c7a54ed1b98d142dce24b11c6de6a1e239a5
kernel/block/bfq-iosched.h
....@@ -1,16 +1,7 @@
1
+/* SPDX-License-Identifier: GPL-2.0-or-later */
12 /*
23 * Header file for the BFQ I/O scheduler: data structures and
34 * prototypes of interface functions among BFQ components.
4
- *
5
- * This program is free software; you can redistribute it and/or
6
- * modify it under the terms of the GNU General Public License as
7
- * published by the Free Software Foundation; either version 2 of the
8
- * License, or (at your option) any later version.
9
- *
10
- * This program is distributed in the hope that it will be useful,
11
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
- * General Public License for more details.
145 */
156 #ifndef _BFQ_H
167 #define _BFQ_H
....@@ -18,6 +9,8 @@
189 #include <linux/blktrace_api.h>
1910 #include <linux/hrtimer.h>
2011 #include <linux/blk-cgroup.h>
12
+
13
+#include "blk-cgroup-rwstat.h"
2114
2215 #define BFQ_IOPRIO_CLASSES 3
2316 #define BFQ_CL_IDLE_TIMEOUT (HZ/5)
....@@ -31,6 +24,8 @@
3124 #define BFQ_WEIGHT_LEGACY_DFL 100
3225 #define BFQ_DEFAULT_GRP_IOPRIO 0
3326 #define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE
27
+
28
+#define MAX_PID_STR_LENGTH 12
3429
3530 /*
3631 * Soft real-time applications are extremely more latency sensitive
....@@ -89,7 +84,7 @@
8984 * expiration. This peculiar definition allows for the following
9085 * optimization, not yet exploited: while a given entity is still in
9186 * service, we already know which is the best candidate for next
92
- * service among the other active entitities in the same parent
87
+ * service among the other active entities in the same parent
9388 * entity. We can then quickly compare the timestamps of the
9489 * in-service entity with those of such best candidate.
9590 *
....@@ -140,7 +135,7 @@
140135 *
141136 * Unless cgroups are used, the weight value is calculated from the
142137 * ioprio to export the same interface as CFQ. When dealing with
143
- * ``well-behaved'' queues (i.e., queues that do not spend too much
138
+ * "well-behaved" queues (i.e., queues that do not spend too much
144139 * time to consume their budget and have true sequential behavior, and
145140 * when there are no external factors breaking anticipation) the
146141 * relative weights at each level of the cgroups hierarchy should be
....@@ -155,7 +150,7 @@
155150 * Flag, true if the entity is on a tree (either the active or
156151 * the idle one of its service_tree) or is in service.
157152 */
158
- bool on_st;
153
+ bool on_st_or_in_serv;
159154
160155 /* B-WF2Q+ start and finish timestamps [sectors/weight] */
161156 u64 start, finish;
....@@ -175,6 +170,9 @@
175170 /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */
176171 int budget;
177172
173
+ /* device weight; if non-zero, it overrides the default weight of
174
+ * bfq_group_data */
175
+ int dev_weight;
178176 /* weight of the queue */
179177 int weight;
180178 /* next weight if a change is in progress */
....@@ -239,6 +237,13 @@
239237 unsigned short ioprio, ioprio_class;
240238 /* next ioprio and ioprio class if a change is in progress */
241239 unsigned short new_ioprio, new_ioprio_class;
240
+
241
+ /* last total-service-time sample, see bfq_update_inject_limit() */
242
+ u64 last_serv_time_ns;
243
+ /* limit for request injection */
244
+ unsigned int inject_limit;
245
+ /* last time the inject limit has been decreased, in jiffies */
246
+ unsigned long decrease_time_jif;
242247
243248 /*
244249 * Shared bfq_queue if queue is cooperating with one or more
....@@ -357,29 +362,24 @@
357362
358363 /* max service rate measured so far */
359364 u32 max_service_rate;
365
+
360366 /*
361
- * Ratio between the service received by bfqq while it is in
362
- * service, and the cumulative service (of requests of other
363
- * queues) that may be injected while bfqq is empty but still
364
- * in service. To increase precision, the coefficient is
365
- * measured in tenths of unit. Here are some example of (1)
366
- * ratios, (2) resulting percentages of service injected
367
- * w.r.t. to the total service dispatched while bfqq is in
368
- * service, and (3) corresponding values of the coefficient:
369
- * 1 (50%) -> 10
370
- * 2 (33%) -> 20
371
- * 10 (9%) -> 100
372
- * 9.9 (9%) -> 99
373
- * 1.5 (40%) -> 15
374
- * 0.5 (66%) -> 5
375
- * 0.1 (90%) -> 1
376
- *
377
- * So, if the coefficient is lower than 10, then
378
- * injected service is more than bfqq service.
367
+ * Pointer to the waker queue for this queue, i.e., to the
368
+ * queue Q such that this queue happens to get new I/O right
369
+ * after some I/O request of Q is completed. For details, see
370
+ * the comments on the choice of the queue for injection in
371
+ * bfq_select_queue().
379372 */
380
- unsigned int inject_coeff;
381
- /* amount of service injected in current service slot */
382
- unsigned int injected_service;
373
+ struct bfq_queue *waker_bfqq;
374
+ /* node for woken_list, see below */
375
+ struct hlist_node woken_list_node;
376
+ /*
377
+ * Head of the list of the woken queues for this queue, i.e.,
378
+ * of the list of the queues for which this queue is a waker
379
+ * queue. This list is used to reset the waker_bfqq pointer in
380
+ * the woken queues when this queue exits.
381
+ */
382
+ struct hlist_head woken_list;
383383 };
384384
385385 /**
....@@ -419,6 +419,15 @@
419419 bool was_in_burst_list;
420420
421421 /*
422
+ * Save the weight when a merge occurs, to be able
423
+ * to restore it in case of split. If the weight is not
424
+ * correctly restored when the queue is recycled,
425
+ * then the weight of the recycled queue could differ
426
+ * from the weight of the original queue.
427
+ */
428
+ unsigned int saved_weight;
429
+
430
+ /*
422431 * Similar to previous fields: save wr information.
423432 */
424433 unsigned long saved_wr_coeff;
....@@ -450,7 +459,7 @@
450459 * weight-raised @bfq_queue (see the comments to the functions
451460 * bfq_weights_tree_[add|remove] for further details).
452461 */
453
- struct rb_root queue_weights_tree;
462
+ struct rb_root_cached queue_weights_tree;
454463
455464 /*
456465 * Number of groups with at least one descendant process that
....@@ -501,16 +510,20 @@
501510 unsigned int num_groups_with_pending_reqs;
502511
503512 /*
504
- * Number of bfq_queues containing requests (including the
505
- * queue in service, even if it is idling).
513
+ * Per-class (RT, BE, IDLE) number of bfq_queues containing
514
+ * requests (including the queue in service, even if it is
515
+ * idling).
506516 */
507
- int busy_queues;
517
+ unsigned int busy_queues[3];
508518 /* number of weight-raised busy @bfq_queues */
509519 int wr_busy_queues;
510520 /* number of queued requests */
511521 int queued;
512522 /* number of requests dispatched and waiting for completion */
513523 int rq_in_driver;
524
+
525
+ /* true if the device is non-rotational and performs queueing */
526
+ bool nonrot_with_queueing;
514527
515528 /*
516529 * Maximum number of requests in driver in the last
....@@ -543,6 +556,29 @@
543556 /* time of last request completion (ns) */
544557 u64 last_completion;
545558
559
+ /* bfqq owning the last completed rq */
560
+ struct bfq_queue *last_completed_rq_bfqq;
561
+
562
+ /* time of last transition from empty to non-empty (ns) */
563
+ u64 last_empty_occupied_ns;
564
+
565
+ /*
566
+ * Flag set to activate the sampling of the total service time
567
+ * of a just-arrived first I/O request (see
568
+ * bfq_update_inject_limit()). This will cause the setting of
569
+ * waited_rq when the request is finally dispatched.
570
+ */
571
+ bool wait_dispatch;
572
+ /*
573
+ * If set, then bfq_update_inject_limit() is invoked when
574
+ * waited_rq is eventually completed.
575
+ */
576
+ struct request *waited_rq;
577
+ /*
578
+ * True if some request has been injected during the last service hole.
579
+ */
580
+ bool rqs_injected;
581
+
546582 /* time of first rq dispatch in current observation interval (ns) */
547583 u64 first_dispatch;
548584 /* time of last rq dispatch in current observation interval (ns) */
....@@ -552,6 +588,7 @@
552588 ktime_t last_budget_start;
553589 /* beginning of the last idle slice */
554590 ktime_t last_idling_start;
591
+ unsigned long last_idling_start_jiffies;
555592
556593 /* number of samples in current observation interval */
557594 int peak_rate_samples;
....@@ -732,7 +769,8 @@
732769 * update
733770 */
734771 BFQQF_coop, /* bfqq is shared */
735
- BFQQF_split_coop /* shared bfqq will be split */
772
+ BFQQF_split_coop, /* shared bfqq will be split */
773
+ BFQQF_has_waker /* bfqq has a waker queue */
736774 };
737775
738776 #define BFQ_BFQQ_FNS(name) \
....@@ -752,6 +790,7 @@
752790 BFQ_BFQQ_FNS(coop);
753791 BFQ_BFQQ_FNS(split_coop);
754792 BFQ_BFQQ_FNS(softrt_update);
793
+BFQ_BFQQ_FNS(has_waker);
755794 #undef BFQ_BFQQ_FNS
756795
757796 /* Expiration reasons. */
....@@ -766,8 +805,16 @@
766805 BFQQE_PREEMPTED /* preemption in progress */
767806 };
768807
808
+struct bfq_stat {
809
+ struct percpu_counter cpu_cnt;
810
+ atomic64_t aux_cnt;
811
+};
812
+
769813 struct bfqg_stats {
770
-#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
814
+ /* basic stats */
815
+ struct blkg_rwstat bytes;
816
+ struct blkg_rwstat ios;
817
+#ifdef CONFIG_BFQ_CGROUP_DEBUG
771818 /* number of ios merged */
772819 struct blkg_rwstat merged;
773820 /* total time spent on device in ns, may not be accurate w/ queueing */
....@@ -777,25 +824,25 @@
777824 /* number of IOs queued up */
778825 struct blkg_rwstat queued;
779826 /* total disk time and nr sectors dispatched by this group */
780
- struct blkg_stat time;
827
+ struct bfq_stat time;
781828 /* sum of number of ios queued across all samples */
782
- struct blkg_stat avg_queue_size_sum;
829
+ struct bfq_stat avg_queue_size_sum;
783830 /* count of samples taken for average */
784
- struct blkg_stat avg_queue_size_samples;
831
+ struct bfq_stat avg_queue_size_samples;
785832 /* how many times this group has been removed from service tree */
786
- struct blkg_stat dequeue;
833
+ struct bfq_stat dequeue;
787834 /* total time spent waiting for it to be assigned a timeslice. */
788
- struct blkg_stat group_wait_time;
835
+ struct bfq_stat group_wait_time;
789836 /* time spent idling for this blkcg_gq */
790
- struct blkg_stat idle_time;
837
+ struct bfq_stat idle_time;
791838 /* total time with empty current active q with other requests queued */
792
- struct blkg_stat empty_time;
839
+ struct bfq_stat empty_time;
793840 /* fields after this shouldn't be cleared on stat reset */
794841 u64 start_group_wait_time;
795842 u64 start_idle_time;
796843 u64 start_empty_time;
797844 uint16_t flags;
798
-#endif /* CONFIG_BFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
845
+#endif /* CONFIG_BFQ_CGROUP_DEBUG */
799846 };
800847
801848 #ifdef CONFIG_BFQ_GROUP_IOSCHED
....@@ -854,6 +901,8 @@
854901
855902 /* reference counter (see comments in bfq_bic_update_cgroup) */
856903 int ref;
904
+ /* Is bfq_group still online? */
905
+ bool online;
857906
858907 struct bfq_entity entity;
859908 struct bfq_sched_data sched_data;
....@@ -874,6 +923,7 @@
874923
875924 #else
876925 struct bfq_group {
926
+ struct bfq_entity entity;
877927 struct bfq_sched_data sched_data;
878928
879929 struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
....@@ -897,16 +947,18 @@
897947 struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic);
898948 void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq);
899949 void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,
900
- struct rb_root *root);
950
+ struct rb_root_cached *root);
901951 void __bfq_weights_tree_remove(struct bfq_data *bfqd,
902952 struct bfq_queue *bfqq,
903
- struct rb_root *root);
953
+ struct rb_root_cached *root);
904954 void bfq_weights_tree_remove(struct bfq_data *bfqd,
905955 struct bfq_queue *bfqq);
906956 void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq,
907957 bool compensate, enum bfqq_expiration reason);
908958 void bfq_put_queue(struct bfq_queue *bfqq);
959
+void bfq_put_cooperator(struct bfq_queue *bfqq);
909960 void bfq_end_wr_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
961
+void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq);
910962 void bfq_schedule_dispatch(struct bfq_data *bfqd);
911963 void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
912964
....@@ -914,6 +966,7 @@
914966
915967 /* ---------------- cgroups-support interface ---------------- */
916968
969
+void bfqg_stats_update_legacy_io(struct request_queue *q, struct request *rq);
917970 void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
918971 unsigned int op);
919972 void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op);
....@@ -931,8 +984,7 @@
931984 void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg);
932985 void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio);
933986 void bfq_end_wr_async(struct bfq_data *bfqd);
934
-struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
935
- struct blkcg *blkcg);
987
+struct bfq_group *bfq_bio_bfqg(struct bfq_data *bfqd, struct bio *bio);
936988 struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);
937989 struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
938990 struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node);
....@@ -977,6 +1029,7 @@
9771029
9781030 struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq);
9791031 struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);
1032
+unsigned int bfq_tot_busy_queues(struct bfq_data *bfqd);
9801033 struct bfq_service_tree *bfq_entity_service_tree(struct bfq_entity *entity);
9811034 struct bfq_entity *bfq_entity_of(struct rb_node *node);
9821035 unsigned short bfq_ioprio_to_weight(int ioprio);
....@@ -1006,13 +1059,25 @@
10061059 /* --------------- end of interface of B-WF2Q+ ---------------- */
10071060
10081061 /* Logging facilities. */
1062
+static inline void bfq_pid_to_str(int pid, char *str, int len)
1063
+{
1064
+ if (pid != -1)
1065
+ snprintf(str, len, "%d", pid);
1066
+ else
1067
+ snprintf(str, len, "SHARED-");
1068
+}
1069
+
10091070 #ifdef CONFIG_BFQ_GROUP_IOSCHED
10101071 struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
10111072
10121073 #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \
1074
+ char pid_str[MAX_PID_STR_LENGTH]; \
1075
+ if (likely(!blk_trace_note_message_enabled((bfqd)->queue))) \
1076
+ break; \
1077
+ bfq_pid_to_str((bfqq)->pid, pid_str, MAX_PID_STR_LENGTH); \
10131078 blk_add_cgroup_trace_msg((bfqd)->queue, \
10141079 bfqg_to_blkg(bfqq_group(bfqq))->blkcg, \
1015
- "bfq%d%c " fmt, (bfqq)->pid, \
1080
+ "bfq%s%c " fmt, pid_str, \
10161081 bfq_bfqq_sync((bfqq)) ? 'S' : 'A', ##args); \
10171082 } while (0)
10181083
....@@ -1023,10 +1088,15 @@
10231088
10241089 #else /* CONFIG_BFQ_GROUP_IOSCHED */
10251090
1026
-#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \
1027
- blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid, \
1091
+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \
1092
+ char pid_str[MAX_PID_STR_LENGTH]; \
1093
+ if (likely(!blk_trace_note_message_enabled((bfqd)->queue))) \
1094
+ break; \
1095
+ bfq_pid_to_str((bfqq)->pid, pid_str, MAX_PID_STR_LENGTH); \
1096
+ blk_add_trace_msg((bfqd)->queue, "bfq%s%c " fmt, pid_str, \
10281097 bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \
1029
- ##args)
1098
+ ##args); \
1099
+} while (0)
10301100 #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0)
10311101
10321102 #endif /* CONFIG_BFQ_GROUP_IOSCHED */