.. | .. |
| 1 | +// SPDX-License-Identifier: GPL-2.0 |
1 | 2 | /* |
2 | 3 | * The Kyber I/O scheduler. Controls latency by throttling queue depths using |
3 | 4 | * scalable techniques. |
4 | 5 | * |
5 | 6 | * Copyright (C) 2017 Facebook |
6 | | - * |
7 | | - * This program is free software; you can redistribute it and/or |
8 | | - * modify it under the terms of the GNU General Public |
9 | | - * License v2 as published by the Free Software Foundation. |
10 | | - * |
11 | | - * This program is distributed in the hope that it will be useful, |
12 | | - * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | | - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | | - * General Public License for more details. |
15 | | - * |
16 | | - * You should have received a copy of the GNU General Public License |
17 | | - * along with this program. If not, see <https://www.gnu.org/licenses/>. |
18 | 7 | */ |
19 | 8 | |
20 | 9 | #include <linux/kernel.h> |
.. | .. |
29 | 18 | #include "blk-mq-debugfs.h" |
30 | 19 | #include "blk-mq-sched.h" |
31 | 20 | #include "blk-mq-tag.h" |
32 | | -#include "blk-stat.h" |
33 | 21 | |
34 | | -/* Scheduling domains. */ |
| 22 | +#define CREATE_TRACE_POINTS |
| 23 | +#include <trace/events/kyber.h> |
| 24 | + |
| 25 | +/* |
| 26 | + * Scheduling domains: the device is divided into multiple domains based on the |
| 27 | + * request type. |
| 28 | + */ |
35 | 29 | enum { |
36 | 30 | KYBER_READ, |
37 | | - KYBER_SYNC_WRITE, |
38 | | - KYBER_OTHER, /* Async writes, discard, etc. */ |
| 31 | + KYBER_WRITE, |
| 32 | + KYBER_DISCARD, |
| 33 | + KYBER_OTHER, |
39 | 34 | KYBER_NUM_DOMAINS, |
40 | 35 | }; |
41 | 36 | |
42 | | -enum { |
43 | | - KYBER_MIN_DEPTH = 256, |
| 37 | +static const char *kyber_domain_names[] = { |
| 38 | + [KYBER_READ] = "READ", |
| 39 | + [KYBER_WRITE] = "WRITE", |
| 40 | + [KYBER_DISCARD] = "DISCARD", |
| 41 | + [KYBER_OTHER] = "OTHER", |
| 42 | +}; |
44 | 43 | |
| 44 | +enum { |
45 | 45 | /* |
46 | 46 | * In order to prevent starvation of synchronous requests by a flood of |
47 | 47 | * asynchronous requests, we reserve 25% of requests for synchronous |
.. | .. |
51 | 51 | }; |
52 | 52 | |
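The reservation described above is applied later in the patch, in kyber_queue_data_alloc(), by deriving async_depth from the sched_tags shift. A minimal sketch of that arithmetic, assuming a shift of 8 (256 tags per hardware queue) purely as an example value, with KYBER_ASYNC_PERCENT coming from the elided context above:

```c
/*
 * Sketch only: mirrors the async_depth computation done in
 * kyber_queue_data_alloc(). Async requests may consume at most
 * KYBER_ASYNC_PERCENT percent of the scheduler tags; the remainder stays
 * reserved for synchronous requests.
 */
static unsigned int example_async_depth(unsigned int sched_tags_shift)
{
	return (1U << sched_tags_shift) * KYBER_ASYNC_PERCENT / 100U;
}
```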
53 | 53 | /* |
54 | | - * Initial device-wide depths for each scheduling domain. |
| 54 | + * Maximum device-wide depth for each scheduling domain. |
55 | 55 | * |
56 | | - * Even for fast devices with lots of tags like NVMe, you can saturate |
57 | | - * the device with only a fraction of the maximum possible queue depth. |
58 | | - * So, we cap these to a reasonable value. |
| 56 | + * Even for fast devices with lots of tags like NVMe, you can saturate the |
| 57 | + * device with only a fraction of the maximum possible queue depth. So, we cap |
| 58 | + * these to a reasonable value. |
59 | 59 | */ |
60 | 60 | static const unsigned int kyber_depth[] = { |
61 | 61 | [KYBER_READ] = 256, |
62 | | - [KYBER_SYNC_WRITE] = 128, |
63 | | - [KYBER_OTHER] = 64, |
| 62 | + [KYBER_WRITE] = 128, |
| 63 | + [KYBER_DISCARD] = 64, |
| 64 | + [KYBER_OTHER] = 16, |
64 | 65 | }; |
65 | 66 | |
66 | 67 | /* |
67 | | - * Scheduling domain batch sizes. We favor reads. |
| 68 | + * Default latency targets for each scheduling domain. |
| 69 | + */ |
| 70 | +static const u64 kyber_latency_targets[] = { |
| 71 | + [KYBER_READ] = 2ULL * NSEC_PER_MSEC, |
| 72 | + [KYBER_WRITE] = 10ULL * NSEC_PER_MSEC, |
| 73 | + [KYBER_DISCARD] = 5ULL * NSEC_PER_SEC, |
| 74 | +}; |
| 75 | + |
| 76 | +/* |
| 77 | + * Batch size (number of requests we'll dispatch in a row) for each scheduling |
| 78 | + * domain. |
68 | 79 | */ |
69 | 80 | static const unsigned int kyber_batch_size[] = { |
70 | 81 | [KYBER_READ] = 16, |
71 | | - [KYBER_SYNC_WRITE] = 8, |
72 | | - [KYBER_OTHER] = 8, |
| 82 | + [KYBER_WRITE] = 8, |
| 83 | + [KYBER_DISCARD] = 1, |
| 84 | + [KYBER_OTHER] = 1, |
| 85 | +}; |
| 86 | + |
| 87 | +/* |
| 88 | + * Requests latencies are recorded in a histogram with buckets defined relative |
| 89 | + * to the target latency: |
| 90 | + * |
| 91 | + * <= 1/4 * target latency |
| 92 | + * <= 1/2 * target latency |
| 93 | + * <= 3/4 * target latency |
| 94 | + * <= target latency |
| 95 | + * <= 1 1/4 * target latency |
| 96 | + * <= 1 1/2 * target latency |
| 97 | + * <= 1 3/4 * target latency |
| 98 | + * > 1 3/4 * target latency |
| 99 | + */ |
| 100 | +enum { |
| 101 | + /* |
| 102 | + * The width of the latency histogram buckets is |
| 103 | + * 1 / (1 << KYBER_LATENCY_SHIFT) * target latency. |
| 104 | + */ |
| 105 | + KYBER_LATENCY_SHIFT = 2, |
| 106 | + /* |
| 107 | + * The first (1 << KYBER_LATENCY_SHIFT) buckets are <= target latency, |
| 108 | + * thus, "good". |
| 109 | + */ |
| 110 | + KYBER_GOOD_BUCKETS = 1 << KYBER_LATENCY_SHIFT, |
| 111 | + /* There are also (1 << KYBER_LATENCY_SHIFT) "bad" buckets. */ |
| 112 | + KYBER_LATENCY_BUCKETS = 2 << KYBER_LATENCY_SHIFT, |
| 113 | +}; |
| 114 | + |
| 115 | +/* |
| 116 | + * We measure both the total latency and the I/O latency (i.e., latency after |
| 117 | + * submitting to the device). |
| 118 | + */ |
| 119 | +enum { |
| 120 | + KYBER_TOTAL_LATENCY, |
| 121 | + KYBER_IO_LATENCY, |
| 122 | +}; |
| 123 | + |
| 124 | +static const char *kyber_latency_type_names[] = { |
| 125 | + [KYBER_TOTAL_LATENCY] = "total", |
| 126 | + [KYBER_IO_LATENCY] = "I/O", |
| 127 | +}; |
| 128 | + |
| 129 | +/* |
| 130 | + * Per-cpu latency histograms: total latency and I/O latency for each scheduling |
| 131 | + * domain except for KYBER_OTHER. |
| 132 | + */ |
| 133 | +struct kyber_cpu_latency { |
| 134 | + atomic_t buckets[KYBER_OTHER][2][KYBER_LATENCY_BUCKETS]; |
73 | 135 | }; |
74 | 136 | |
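The bucket layout introduced above maps each measured latency to one of eight buckets of width target / (1 << KYBER_LATENCY_SHIFT). A standalone userspace sketch of the same arithmetic that add_latency_sample() performs later in the patch (the helper name here is hypothetical):

```c
#include <stdint.h>

/* Width of each bucket is target / 4, since KYBER_LATENCY_SHIFT == 2. */
static unsigned int latency_to_bucket(uint64_t target_ns, uint64_t latency_ns)
{
	uint64_t divisor = target_ns >> 2;
	uint64_t bucket;

	if (latency_ns == 0)
		return 0;
	if (divisor == 0)
		divisor = 1;
	/* Bucket i covers (i * divisor, (i + 1) * divisor]; bucket 7 is open-ended. */
	bucket = (latency_ns - 1) / divisor;
	return bucket < 7 ? (unsigned int)bucket : 7;
}
```

With the default 2 ms read target this gives 500 us wide buckets: a 1.9 ms completion lands in bucket 3 (still "good"), a 2.6 ms completion in bucket 5.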
75 | 137 | /* |
.. | .. |
88 | 150 | struct kyber_queue_data { |
89 | 151 | struct request_queue *q; |
90 | 152 | |
91 | | - struct blk_stat_callback *cb; |
92 | | - |
93 | 153 | /* |
94 | | - * The device is divided into multiple scheduling domains based on the |
95 | | - * request type. Each domain has a fixed number of in-flight requests of |
96 | | - * that type device-wide, limited by these tokens. |
| 154 | + * Each scheduling domain has a limited number of in-flight requests |
| 155 | + * device-wide, limited by these tokens. |
97 | 156 | */ |
98 | 157 | struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS]; |
99 | 158 | |
.. | .. |
103 | 162 | */ |
104 | 163 | unsigned int async_depth; |
105 | 164 | |
| 165 | + struct kyber_cpu_latency __percpu *cpu_latency; |
| 166 | + |
| 167 | + /* Timer for stats aggregation and adjusting domain tokens. */ |
| 168 | + struct timer_list timer; |
| 169 | + |
| 170 | + unsigned int latency_buckets[KYBER_OTHER][2][KYBER_LATENCY_BUCKETS]; |
| 171 | + |
| 172 | + unsigned long latency_timeout[KYBER_OTHER]; |
| 173 | + |
| 174 | + int domain_p99[KYBER_OTHER]; |
| 175 | + |
106 | 176 | /* Target latencies in nanoseconds. */ |
107 | | - u64 read_lat_nsec, write_lat_nsec; |
| 177 | + u64 latency_targets[KYBER_OTHER]; |
108 | 178 | }; |
109 | 179 | |
110 | 180 | struct kyber_hctx_data { |
.. | .. |
114 | 184 | unsigned int batching; |
115 | 185 | struct kyber_ctx_queue *kcqs; |
116 | 186 | struct sbitmap kcq_map[KYBER_NUM_DOMAINS]; |
117 | | - wait_queue_entry_t domain_wait[KYBER_NUM_DOMAINS]; |
| 187 | + struct sbq_wait domain_wait[KYBER_NUM_DOMAINS]; |
118 | 188 | struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS]; |
119 | 189 | atomic_t wait_index[KYBER_NUM_DOMAINS]; |
120 | 190 | }; |
.. | .. |
124 | 194 | |
125 | 195 | static unsigned int kyber_sched_domain(unsigned int op) |
126 | 196 | { |
127 | | - if ((op & REQ_OP_MASK) == REQ_OP_READ) |
| 197 | + switch (op & REQ_OP_MASK) { |
| 198 | + case REQ_OP_READ: |
128 | 199 | return KYBER_READ; |
129 | | - else if ((op & REQ_OP_MASK) == REQ_OP_WRITE && op_is_sync(op)) |
130 | | - return KYBER_SYNC_WRITE; |
131 | | - else |
| 200 | + case REQ_OP_WRITE: |
| 201 | + return KYBER_WRITE; |
| 202 | + case REQ_OP_DISCARD: |
| 203 | + return KYBER_DISCARD; |
| 204 | + default: |
132 | 205 | return KYBER_OTHER; |
| 206 | + } |
133 | 207 | } |
134 | 208 | |
135 | | -enum { |
136 | | - NONE = 0, |
137 | | - GOOD = 1, |
138 | | - GREAT = 2, |
139 | | - BAD = -1, |
140 | | - AWFUL = -2, |
141 | | -}; |
142 | | - |
143 | | -#define IS_GOOD(status) ((status) > 0) |
144 | | -#define IS_BAD(status) ((status) < 0) |
145 | | - |
146 | | -static int kyber_lat_status(struct blk_stat_callback *cb, |
147 | | - unsigned int sched_domain, u64 target) |
| 209 | +static void flush_latency_buckets(struct kyber_queue_data *kqd, |
| 210 | + struct kyber_cpu_latency *cpu_latency, |
| 211 | + unsigned int sched_domain, unsigned int type) |
148 | 212 | { |
149 | | - u64 latency; |
| 213 | + unsigned int *buckets = kqd->latency_buckets[sched_domain][type]; |
| 214 | + atomic_t *cpu_buckets = cpu_latency->buckets[sched_domain][type]; |
| 215 | + unsigned int bucket; |
150 | 216 | |
151 | | - if (!cb->stat[sched_domain].nr_samples) |
152 | | - return NONE; |
153 | | - |
154 | | - latency = cb->stat[sched_domain].mean; |
155 | | - if (latency >= 2 * target) |
156 | | - return AWFUL; |
157 | | - else if (latency > target) |
158 | | - return BAD; |
159 | | - else if (latency <= target / 2) |
160 | | - return GREAT; |
161 | | - else /* (latency <= target) */ |
162 | | - return GOOD; |
| 217 | + for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS; bucket++) |
| 218 | + buckets[bucket] += atomic_xchg(&cpu_buckets[bucket], 0); |
163 | 219 | } |
164 | 220 | |
165 | 221 | /* |
166 | | - * Adjust the read or synchronous write depth given the status of reads and |
167 | | - * writes. The goal is that the latencies of the two domains are fair (i.e., if |
168 | | - * one is good, then the other is good). |
| 222 | + * Calculate the histogram bucket with the given percentile rank, or -1 if there |
| 223 | + * aren't enough samples yet. |
169 | 224 | */ |
170 | | -static void kyber_adjust_rw_depth(struct kyber_queue_data *kqd, |
171 | | - unsigned int sched_domain, int this_status, |
172 | | - int other_status) |
| 225 | +static int calculate_percentile(struct kyber_queue_data *kqd, |
| 226 | + unsigned int sched_domain, unsigned int type, |
| 227 | + unsigned int percentile) |
173 | 228 | { |
174 | | - unsigned int orig_depth, depth; |
| 229 | + unsigned int *buckets = kqd->latency_buckets[sched_domain][type]; |
| 230 | + unsigned int bucket, samples = 0, percentile_samples; |
| 231 | + |
| 232 | + for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS; bucket++) |
| 233 | + samples += buckets[bucket]; |
| 234 | + |
| 235 | + if (!samples) |
| 236 | + return -1; |
175 | 237 | |
176 | 238 | /* |
177 | | - * If this domain had no samples, or reads and writes are both good or |
178 | | - * both bad, don't adjust the depth. |
| 239 | + * We do the calculation once we have 500 samples or one second passes |
| 240 | + * since the first sample was recorded, whichever comes first. |
179 | 241 | */ |
180 | | - if (this_status == NONE || |
181 | | - (IS_GOOD(this_status) && IS_GOOD(other_status)) || |
182 | | - (IS_BAD(this_status) && IS_BAD(other_status))) |
183 | | - return; |
184 | | - |
185 | | - orig_depth = depth = kqd->domain_tokens[sched_domain].sb.depth; |
186 | | - |
187 | | - if (other_status == NONE) { |
188 | | - depth++; |
189 | | - } else { |
190 | | - switch (this_status) { |
191 | | - case GOOD: |
192 | | - if (other_status == AWFUL) |
193 | | - depth -= max(depth / 4, 1U); |
194 | | - else |
195 | | - depth -= max(depth / 8, 1U); |
196 | | - break; |
197 | | - case GREAT: |
198 | | - if (other_status == AWFUL) |
199 | | - depth /= 2; |
200 | | - else |
201 | | - depth -= max(depth / 4, 1U); |
202 | | - break; |
203 | | - case BAD: |
204 | | - depth++; |
205 | | - break; |
206 | | - case AWFUL: |
207 | | - if (other_status == GREAT) |
208 | | - depth += 2; |
209 | | - else |
210 | | - depth++; |
211 | | - break; |
212 | | - } |
| 242 | + if (!kqd->latency_timeout[sched_domain]) |
| 243 | + kqd->latency_timeout[sched_domain] = max(jiffies + HZ, 1UL); |
| 244 | + if (samples < 500 && |
| 245 | + time_is_after_jiffies(kqd->latency_timeout[sched_domain])) { |
| 246 | + return -1; |
213 | 247 | } |
| 248 | + kqd->latency_timeout[sched_domain] = 0; |
214 | 249 | |
| 250 | + percentile_samples = DIV_ROUND_UP(samples * percentile, 100); |
| 251 | + for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS - 1; bucket++) { |
| 252 | + if (buckets[bucket] >= percentile_samples) |
| 253 | + break; |
| 254 | + percentile_samples -= buckets[bucket]; |
| 255 | + } |
| 256 | + memset(buckets, 0, sizeof(kqd->latency_buckets[sched_domain][type])); |
| 257 | + |
| 258 | + trace_kyber_latency(kqd->q, kyber_domain_names[sched_domain], |
| 259 | + kyber_latency_type_names[type], percentile, |
| 260 | + bucket + 1, 1 << KYBER_LATENCY_SHIFT, samples); |
| 261 | + |
| 262 | + return bucket; |
| 263 | +} |
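To make the scan above concrete, here is a standalone walk-through with a made-up 600-sample histogram: for p99, percentile_samples starts at DIV_ROUND_UP(600 * 99, 100) = 594 and is whittled down bucket by bucket until a bucket can cover the remainder.

```c
#include <stdio.h>

int main(void)
{
	/* Hypothetical 8-bucket histogram for one domain and latency type. */
	unsigned int buckets[8] = { 400, 120, 50, 20, 6, 2, 1, 1 };
	unsigned int bucket, samples = 0, percentile_samples;

	for (bucket = 0; bucket < 8; bucket++)
		samples += buckets[bucket];

	percentile_samples = (samples * 99 + 99) / 100;	/* DIV_ROUND_UP(samples * 99, 100) */
	for (bucket = 0; bucket < 8 - 1; bucket++) {
		if (buckets[bucket] >= percentile_samples)
			break;
		percentile_samples -= buckets[bucket];
	}
	/* 594 - 400 - 120 - 50 - 20 = 4 <= 6, so the scan stops at bucket 4. */
	printf("p99 bucket index: %u\n", bucket);
	return 0;
}
```

Bucket 4 is the first "bad" bucket (index >= KYBER_GOOD_BUCKETS), i.e. in this example the p99 just exceeded the target.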
| 264 | + |
| 265 | +static void kyber_resize_domain(struct kyber_queue_data *kqd, |
| 266 | + unsigned int sched_domain, unsigned int depth) |
| 267 | +{ |
215 | 268 | depth = clamp(depth, 1U, kyber_depth[sched_domain]); |
216 | | - if (depth != orig_depth) |
| 269 | + if (depth != kqd->domain_tokens[sched_domain].sb.depth) { |
217 | 270 | sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth); |
| 271 | + trace_kyber_adjust(kqd->q, kyber_domain_names[sched_domain], |
| 272 | + depth); |
| 273 | + } |
218 | 274 | } |
219 | 275 | |
220 | | -/* |
221 | | - * Adjust the depth of other requests given the status of reads and synchronous |
222 | | - * writes. As long as either domain is doing fine, we don't throttle, but if |
223 | | - * both domains are doing badly, we throttle heavily. |
224 | | - */ |
225 | | -static void kyber_adjust_other_depth(struct kyber_queue_data *kqd, |
226 | | - int read_status, int write_status, |
227 | | - bool have_samples) |
| 276 | +static void kyber_timer_fn(struct timer_list *t) |
228 | 277 | { |
229 | | - unsigned int orig_depth, depth; |
230 | | - int status; |
| 278 | + struct kyber_queue_data *kqd = from_timer(kqd, t, timer); |
| 279 | + unsigned int sched_domain; |
| 280 | + int cpu; |
| 281 | + bool bad = false; |
231 | 282 | |
232 | | - orig_depth = depth = kqd->domain_tokens[KYBER_OTHER].sb.depth; |
| 283 | + /* Sum all of the per-cpu latency histograms. */ |
| 284 | + for_each_online_cpu(cpu) { |
| 285 | + struct kyber_cpu_latency *cpu_latency; |
233 | 286 | |
234 | | - if (read_status == NONE && write_status == NONE) { |
235 | | - depth += 2; |
236 | | - } else if (have_samples) { |
237 | | - if (read_status == NONE) |
238 | | - status = write_status; |
239 | | - else if (write_status == NONE) |
240 | | - status = read_status; |
241 | | - else |
242 | | - status = max(read_status, write_status); |
243 | | - switch (status) { |
244 | | - case GREAT: |
245 | | - depth += 2; |
246 | | - break; |
247 | | - case GOOD: |
248 | | - depth++; |
249 | | - break; |
250 | | - case BAD: |
251 | | - depth -= max(depth / 4, 1U); |
252 | | - break; |
253 | | - case AWFUL: |
254 | | - depth /= 2; |
255 | | - break; |
| 287 | + cpu_latency = per_cpu_ptr(kqd->cpu_latency, cpu); |
| 288 | + for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) { |
| 289 | + flush_latency_buckets(kqd, cpu_latency, sched_domain, |
| 290 | + KYBER_TOTAL_LATENCY); |
| 291 | + flush_latency_buckets(kqd, cpu_latency, sched_domain, |
| 292 | + KYBER_IO_LATENCY); |
256 | 293 | } |
257 | 294 | } |
258 | 295 | |
259 | | - depth = clamp(depth, 1U, kyber_depth[KYBER_OTHER]); |
260 | | - if (depth != orig_depth) |
261 | | - sbitmap_queue_resize(&kqd->domain_tokens[KYBER_OTHER], depth); |
262 | | -} |
| 296 | + /* |
| 297 | + * Check if any domains have a high I/O latency, which might indicate |
| 298 | + * congestion in the device. Note that we use the p90; we don't want to |
| 299 | + * be too sensitive to outliers here. |
| 300 | + */ |
| 301 | + for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) { |
| 302 | + int p90; |
263 | 303 | |
264 | | -/* |
265 | | - * Apply heuristics for limiting queue depths based on gathered latency |
266 | | - * statistics. |
267 | | - */ |
268 | | -static void kyber_stat_timer_fn(struct blk_stat_callback *cb) |
269 | | -{ |
270 | | - struct kyber_queue_data *kqd = cb->data; |
271 | | - int read_status, write_status; |
272 | | - |
273 | | - read_status = kyber_lat_status(cb, KYBER_READ, kqd->read_lat_nsec); |
274 | | - write_status = kyber_lat_status(cb, KYBER_SYNC_WRITE, kqd->write_lat_nsec); |
275 | | - |
276 | | - kyber_adjust_rw_depth(kqd, KYBER_READ, read_status, write_status); |
277 | | - kyber_adjust_rw_depth(kqd, KYBER_SYNC_WRITE, write_status, read_status); |
278 | | - kyber_adjust_other_depth(kqd, read_status, write_status, |
279 | | - cb->stat[KYBER_OTHER].nr_samples != 0); |
| 304 | + p90 = calculate_percentile(kqd, sched_domain, KYBER_IO_LATENCY, |
| 305 | + 90); |
| 306 | + if (p90 >= KYBER_GOOD_BUCKETS) |
| 307 | + bad = true; |
| 308 | + } |
280 | 309 | |
281 | 310 | /* |
282 | | - * Continue monitoring latencies if we aren't hitting the targets or |
283 | | - * we're still throttling other requests. |
| 311 | + * Adjust the scheduling domain depths. If we determined that there was |
| 312 | + * congestion, we throttle all domains with good latencies. Either way, |
| 313 | + * we ease up on throttling domains with bad latencies. |
284 | 314 | */ |
285 | | - if (!blk_stat_is_active(kqd->cb) && |
286 | | - ((IS_BAD(read_status) || IS_BAD(write_status) || |
287 | | - kqd->domain_tokens[KYBER_OTHER].sb.depth < kyber_depth[KYBER_OTHER]))) |
288 | | - blk_stat_activate_msecs(kqd->cb, 100); |
| 315 | + for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) { |
| 316 | + unsigned int orig_depth, depth; |
| 317 | + int p99; |
| 318 | + |
| 319 | + p99 = calculate_percentile(kqd, sched_domain, |
| 320 | + KYBER_TOTAL_LATENCY, 99); |
| 321 | + /* |
| 322 | + * This is kind of subtle: different domains will not |
| 323 | + * necessarily have enough samples to calculate the latency |
| 324 | + * percentiles during the same window, so we have to remember |
| 325 | + * the p99 for the next time we observe congestion; once we do, |
| 326 | + * we don't want to throttle again until we get more data, so we |
| 327 | + * reset it to -1. |
| 328 | + */ |
| 329 | + if (bad) { |
| 330 | + if (p99 < 0) |
| 331 | + p99 = kqd->domain_p99[sched_domain]; |
| 332 | + kqd->domain_p99[sched_domain] = -1; |
| 333 | + } else if (p99 >= 0) { |
| 334 | + kqd->domain_p99[sched_domain] = p99; |
| 335 | + } |
| 336 | + if (p99 < 0) |
| 337 | + continue; |
| 338 | + |
| 339 | + /* |
| 340 | + * If this domain has bad latency, throttle less. Otherwise, |
| 341 | + * throttle more iff we determined that there is congestion. |
| 342 | + * |
| 343 | + * The new depth is scaled linearly with the p99 latency vs the |
| 344 | + * latency target. E.g., if the p99 is 3/4 of the target, then |
| 345 | + * we throttle down to 3/4 of the current depth, and if the p99 |
| 346 | + * is 2x the target, then we double the depth. |
| 347 | + */ |
| 348 | + if (bad || p99 >= KYBER_GOOD_BUCKETS) { |
| 349 | + orig_depth = kqd->domain_tokens[sched_domain].sb.depth; |
| 350 | + depth = (orig_depth * (p99 + 1)) >> KYBER_LATENCY_SHIFT; |
| 351 | + kyber_resize_domain(kqd, sched_domain, depth); |
| 352 | + } |
| 353 | + } |
289 | 354 | } |
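The scaling rule in the last loop reduces to a small piece of integer arithmetic; pulled out here for illustration (the helper name is hypothetical, and the clamp to [1, kyber_depth[domain]] is performed by kyber_resize_domain() above):

```c
/*
 * New depth = old depth * (p99 bucket + 1) / (1 << KYBER_LATENCY_SHIFT).
 * Examples with an old depth of 256:
 *   p99 in bucket 2 (<= 3/4 of the target)  -> 256 * 3 / 4 = 192
 *   p99 in bucket 3 (<= the target)         -> 256 * 4 / 4 = 256 (unchanged)
 *   p99 in bucket 7 (> 1 3/4 of the target) -> 256 * 8 / 4 = 512, then
 *     clamped to kyber_depth[domain] by kyber_resize_domain().
 */
static unsigned int scaled_depth(unsigned int orig_depth, int p99_bucket)
{
	return (orig_depth * (p99_bucket + 1)) >> 2;	/* KYBER_LATENCY_SHIFT */
}
```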
290 | 355 | |
291 | | -static unsigned int kyber_sched_tags_shift(struct kyber_queue_data *kqd) |
| 356 | +static unsigned int kyber_sched_tags_shift(struct request_queue *q) |
292 | 357 | { |
293 | 358 | /* |
294 | 359 | * All of the hardware queues have the same depth, so we can just grab |
295 | 360 | * the shift of the first one. |
296 | 361 | */ |
297 | | - return kqd->q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift; |
298 | | -} |
299 | | - |
300 | | -static int kyber_bucket_fn(const struct request *rq) |
301 | | -{ |
302 | | - return kyber_sched_domain(rq->cmd_flags); |
| 362 | + return q->queue_hw_ctx[0]->sched_tags->bitmap_tags->sb.shift; |
303 | 363 | } |
304 | 364 | |
305 | 365 | static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q) |
306 | 366 | { |
307 | 367 | struct kyber_queue_data *kqd; |
308 | | - unsigned int max_tokens; |
309 | 368 | unsigned int shift; |
310 | 369 | int ret = -ENOMEM; |
311 | 370 | int i; |
312 | 371 | |
313 | | - kqd = kmalloc_node(sizeof(*kqd), GFP_KERNEL, q->node); |
| 372 | + kqd = kzalloc_node(sizeof(*kqd), GFP_KERNEL, q->node); |
314 | 373 | if (!kqd) |
315 | 374 | goto err; |
| 375 | + |
316 | 376 | kqd->q = q; |
317 | 377 | |
318 | | - kqd->cb = blk_stat_alloc_callback(kyber_stat_timer_fn, kyber_bucket_fn, |
319 | | - KYBER_NUM_DOMAINS, kqd); |
320 | | - if (!kqd->cb) |
| 378 | + kqd->cpu_latency = alloc_percpu_gfp(struct kyber_cpu_latency, |
| 379 | + GFP_KERNEL | __GFP_ZERO); |
| 380 | + if (!kqd->cpu_latency) |
321 | 381 | goto err_kqd; |
322 | 382 | |
323 | | - /* |
324 | | - * The maximum number of tokens for any scheduling domain is at least |
325 | | - * the queue depth of a single hardware queue. If the hardware doesn't |
326 | | - * have many tags, still provide a reasonable number. |
327 | | - */ |
328 | | - max_tokens = max_t(unsigned int, q->tag_set->queue_depth, |
329 | | - KYBER_MIN_DEPTH); |
| 383 | + timer_setup(&kqd->timer, kyber_timer_fn, 0); |
| 384 | + |
330 | 385 | for (i = 0; i < KYBER_NUM_DOMAINS; i++) { |
331 | 386 | WARN_ON(!kyber_depth[i]); |
332 | 387 | WARN_ON(!kyber_batch_size[i]); |
333 | 388 | ret = sbitmap_queue_init_node(&kqd->domain_tokens[i], |
334 | | - max_tokens, -1, false, GFP_KERNEL, |
335 | | - q->node); |
| 389 | + kyber_depth[i], -1, false, |
| 390 | + GFP_KERNEL, q->node); |
336 | 391 | if (ret) { |
337 | 392 | while (--i >= 0) |
338 | 393 | sbitmap_queue_free(&kqd->domain_tokens[i]); |
339 | | - goto err_cb; |
| 394 | + goto err_buckets; |
340 | 395 | } |
341 | | - sbitmap_queue_resize(&kqd->domain_tokens[i], kyber_depth[i]); |
342 | 396 | } |
343 | 397 | |
344 | | - shift = kyber_sched_tags_shift(kqd); |
345 | | - kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U; |
| 398 | + for (i = 0; i < KYBER_OTHER; i++) { |
| 399 | + kqd->domain_p99[i] = -1; |
| 400 | + kqd->latency_targets[i] = kyber_latency_targets[i]; |
| 401 | + } |
346 | 402 | |
347 | | - kqd->read_lat_nsec = 2000000ULL; |
348 | | - kqd->write_lat_nsec = 10000000ULL; |
| 403 | + shift = kyber_sched_tags_shift(q); |
| 404 | + kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U; |
349 | 405 | |
350 | 406 | return kqd; |
351 | 407 | |
352 | | -err_cb: |
353 | | - blk_stat_free_callback(kqd->cb); |
| 408 | +err_buckets: |
| 409 | + free_percpu(kqd->cpu_latency); |
354 | 410 | err_kqd: |
355 | 411 | kfree(kqd); |
356 | 412 | err: |
.. | .. |
372 | 428 | return PTR_ERR(kqd); |
373 | 429 | } |
374 | 430 | |
| 431 | + blk_stat_enable_accounting(q); |
| 432 | + |
375 | 433 | eq->elevator_data = kqd; |
376 | 434 | q->elevator = eq; |
377 | | - |
378 | | - blk_stat_add_callback(q, kqd->cb); |
379 | 435 | |
380 | 436 | return 0; |
381 | 437 | } |
.. | .. |
383 | 439 | static void kyber_exit_sched(struct elevator_queue *e) |
384 | 440 | { |
385 | 441 | struct kyber_queue_data *kqd = e->elevator_data; |
386 | | - struct request_queue *q = kqd->q; |
387 | 442 | int i; |
388 | 443 | |
389 | | - blk_stat_remove_callback(q, kqd->cb); |
| 444 | + del_timer_sync(&kqd->timer); |
390 | 445 | |
391 | 446 | for (i = 0; i < KYBER_NUM_DOMAINS; i++) |
392 | 447 | sbitmap_queue_free(&kqd->domain_tokens[i]); |
393 | | - blk_stat_free_callback(kqd->cb); |
| 448 | + free_percpu(kqd->cpu_latency); |
394 | 449 | kfree(kqd); |
395 | 450 | } |
396 | 451 | |
.. | .. |
435 | 490 | |
436 | 491 | for (i = 0; i < KYBER_NUM_DOMAINS; i++) { |
437 | 492 | INIT_LIST_HEAD(&khd->rqs[i]); |
438 | | - init_waitqueue_func_entry(&khd->domain_wait[i], |
| 493 | + khd->domain_wait[i].sbq = NULL; |
| 494 | + init_waitqueue_func_entry(&khd->domain_wait[i].wait, |
439 | 495 | kyber_domain_wake); |
440 | | - khd->domain_wait[i].private = hctx; |
441 | | - INIT_LIST_HEAD(&khd->domain_wait[i].entry); |
| 496 | + khd->domain_wait[i].wait.private = hctx; |
| 497 | + INIT_LIST_HEAD(&khd->domain_wait[i].wait.entry); |
442 | 498 | atomic_set(&khd->wait_index[i], 0); |
443 | 499 | } |
444 | 500 | |
.. | .. |
446 | 502 | khd->batching = 0; |
447 | 503 | |
448 | 504 | hctx->sched_data = khd; |
449 | | - sbitmap_queue_min_shallow_depth(&hctx->sched_tags->bitmap_tags, |
| 505 | + sbitmap_queue_min_shallow_depth(hctx->sched_tags->bitmap_tags, |
450 | 506 | kqd->async_depth); |
451 | 507 | |
452 | 508 | return 0; |
.. | .. |
506 | 562 | } |
507 | 563 | } |
508 | 564 | |
509 | | -static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) |
| 565 | +static bool kyber_bio_merge(struct request_queue *q, struct bio *bio, |
| 566 | + unsigned int nr_segs) |
510 | 567 | { |
| 568 | + struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); |
| 569 | + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx); |
511 | 570 | struct kyber_hctx_data *khd = hctx->sched_data; |
512 | | - struct blk_mq_ctx *ctx = blk_mq_get_ctx(hctx->queue); |
513 | | - struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw]; |
| 571 | + struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw[hctx->type]]; |
514 | 572 | unsigned int sched_domain = kyber_sched_domain(bio->bi_opf); |
515 | 573 | struct list_head *rq_list = &kcq->rq_list[sched_domain]; |
516 | 574 | bool merged; |
517 | 575 | |
518 | 576 | spin_lock(&kcq->lock); |
519 | | - merged = blk_mq_bio_list_merge(hctx->queue, rq_list, bio); |
| 577 | + merged = blk_bio_list_merge(hctx->queue, rq_list, bio, nr_segs); |
520 | 578 | spin_unlock(&kcq->lock); |
521 | | - blk_mq_put_ctx(ctx); |
522 | 579 | |
523 | 580 | return merged; |
524 | 581 | } |
525 | 582 | |
526 | | -static void kyber_prepare_request(struct request *rq, struct bio *bio) |
| 583 | +static void kyber_prepare_request(struct request *rq) |
527 | 584 | { |
528 | 585 | rq_set_domain_token(rq, -1); |
529 | 586 | } |
.. | .. |
536 | 593 | |
537 | 594 | list_for_each_entry_safe(rq, next, rq_list, queuelist) { |
538 | 595 | unsigned int sched_domain = kyber_sched_domain(rq->cmd_flags); |
539 | | - struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw]; |
| 596 | + struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw[hctx->type]]; |
540 | 597 | struct list_head *head = &kcq->rq_list[sched_domain]; |
541 | 598 | |
542 | 599 | spin_lock(&kcq->lock); |
.. | .. |
545 | 602 | else |
546 | 603 | list_move_tail(&rq->queuelist, head); |
547 | 604 | sbitmap_set_bit(&khd->kcq_map[sched_domain], |
548 | | - rq->mq_ctx->index_hw); |
| 605 | + rq->mq_ctx->index_hw[hctx->type]); |
549 | 606 | blk_mq_sched_request_inserted(rq); |
550 | 607 | spin_unlock(&kcq->lock); |
551 | 608 | } |
.. | .. |
558 | 615 | rq_clear_domain_token(kqd, rq); |
559 | 616 | } |
560 | 617 | |
561 | | -static void kyber_completed_request(struct request *rq) |
| 618 | +static void add_latency_sample(struct kyber_cpu_latency *cpu_latency, |
| 619 | + unsigned int sched_domain, unsigned int type, |
| 620 | + u64 target, u64 latency) |
562 | 621 | { |
563 | | - struct request_queue *q = rq->q; |
564 | | - struct kyber_queue_data *kqd = q->elevator->elevator_data; |
565 | | - unsigned int sched_domain; |
566 | | - u64 now, latency, target; |
| 622 | + unsigned int bucket; |
| 623 | + u64 divisor; |
567 | 624 | |
568 | | - /* |
569 | | - * Check if this request met our latency goal. If not, quickly gather |
570 | | - * some statistics and start throttling. |
571 | | - */ |
572 | | - sched_domain = kyber_sched_domain(rq->cmd_flags); |
573 | | - switch (sched_domain) { |
574 | | - case KYBER_READ: |
575 | | - target = kqd->read_lat_nsec; |
576 | | - break; |
577 | | - case KYBER_SYNC_WRITE: |
578 | | - target = kqd->write_lat_nsec; |
579 | | - break; |
580 | | - default: |
581 | | - return; |
| 625 | + if (latency > 0) { |
| 626 | + divisor = max_t(u64, target >> KYBER_LATENCY_SHIFT, 1); |
| 627 | + bucket = min_t(unsigned int, div64_u64(latency - 1, divisor), |
| 628 | + KYBER_LATENCY_BUCKETS - 1); |
| 629 | + } else { |
| 630 | + bucket = 0; |
582 | 631 | } |
583 | 632 | |
584 | | - /* If we are already monitoring latencies, don't check again. */ |
585 | | - if (blk_stat_is_active(kqd->cb)) |
| 633 | + atomic_inc(&cpu_latency->buckets[sched_domain][type][bucket]); |
| 634 | +} |
| 635 | + |
| 636 | +static void kyber_completed_request(struct request *rq, u64 now) |
| 637 | +{ |
| 638 | + struct kyber_queue_data *kqd = rq->q->elevator->elevator_data; |
| 639 | + struct kyber_cpu_latency *cpu_latency; |
| 640 | + unsigned int sched_domain; |
| 641 | + u64 target; |
| 642 | + |
| 643 | + sched_domain = kyber_sched_domain(rq->cmd_flags); |
| 644 | + if (sched_domain == KYBER_OTHER) |
586 | 645 | return; |
587 | 646 | |
588 | | - now = ktime_get_ns(); |
589 | | - if (now < rq->io_start_time_ns) |
590 | | - return; |
| 647 | + cpu_latency = get_cpu_ptr(kqd->cpu_latency); |
| 648 | + target = kqd->latency_targets[sched_domain]; |
| 649 | + add_latency_sample(cpu_latency, sched_domain, KYBER_TOTAL_LATENCY, |
| 650 | + target, now - rq->start_time_ns); |
| 651 | + add_latency_sample(cpu_latency, sched_domain, KYBER_IO_LATENCY, target, |
| 652 | + now - rq->io_start_time_ns); |
| 653 | + put_cpu_ptr(kqd->cpu_latency); |
591 | 654 | |
592 | | - latency = now - rq->io_start_time_ns; |
593 | | - |
594 | | - if (latency > target) |
595 | | - blk_stat_activate_msecs(kqd->cb, 10); |
| 655 | + timer_reduce(&kqd->timer, jiffies + HZ / 10); |
596 | 656 | } |
597 | 657 | |
598 | 658 | struct flush_kcq_data { |
.. | .. |
629 | 689 | flush_busy_kcq, &data); |
630 | 690 | } |
631 | 691 | |
632 | | -static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags, |
| 692 | +static int kyber_domain_wake(wait_queue_entry_t *wqe, unsigned mode, int flags, |
633 | 693 | void *key) |
634 | 694 | { |
635 | | - struct blk_mq_hw_ctx *hctx = READ_ONCE(wait->private); |
| 695 | + struct blk_mq_hw_ctx *hctx = READ_ONCE(wqe->private); |
| 696 | + struct sbq_wait *wait = container_of(wqe, struct sbq_wait, wait); |
636 | 697 | |
637 | | - list_del_init(&wait->entry); |
| 698 | + sbitmap_del_wait_queue(wait); |
638 | 699 | blk_mq_run_hw_queue(hctx, true); |
639 | 700 | return 1; |
640 | 701 | } |
.. | .. |
645 | 706 | { |
646 | 707 | unsigned int sched_domain = khd->cur_domain; |
647 | 708 | struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain]; |
648 | | - wait_queue_entry_t *wait = &khd->domain_wait[sched_domain]; |
| 709 | + struct sbq_wait *wait = &khd->domain_wait[sched_domain]; |
649 | 710 | struct sbq_wait_state *ws; |
650 | 711 | int nr; |
651 | 712 | |
.. | .. |
656 | 717 | * run when one becomes available. Note that this is serialized on |
657 | 718 | * khd->lock, but we still need to be careful about the waker. |
658 | 719 | */ |
659 | | - if (nr < 0 && list_empty_careful(&wait->entry)) { |
| 720 | + if (nr < 0 && list_empty_careful(&wait->wait.entry)) { |
660 | 721 | ws = sbq_wait_ptr(domain_tokens, |
661 | 722 | &khd->wait_index[sched_domain]); |
662 | 723 | khd->domain_ws[sched_domain] = ws; |
663 | | - add_wait_queue(&ws->wait, wait); |
| 724 | + sbitmap_add_wait_queue(domain_tokens, ws, wait); |
664 | 725 | |
665 | 726 | /* |
666 | 727 | * Try again in case a token was freed before we got on the wait |
.. | .. |
676 | 737 | * between the !list_empty_careful() check and us grabbing the lock, but |
677 | 738 | * list_del_init() is okay with that. |
678 | 739 | */ |
679 | | - if (nr >= 0 && !list_empty_careful(&wait->entry)) { |
| 740 | + if (nr >= 0 && !list_empty_careful(&wait->wait.entry)) { |
680 | 741 | ws = khd->domain_ws[sched_domain]; |
681 | 742 | spin_lock_irq(&ws->wait.lock); |
682 | | - list_del_init(&wait->entry); |
| 743 | + sbitmap_del_wait_queue(wait); |
683 | 744 | spin_unlock_irq(&ws->wait.lock); |
684 | 745 | } |
685 | 746 | |
.. | .. |
713 | 774 | rq_set_domain_token(rq, nr); |
714 | 775 | list_del_init(&rq->queuelist); |
715 | 776 | return rq; |
| 777 | + } else { |
| 778 | + trace_kyber_throttled(kqd->q, |
| 779 | + kyber_domain_names[khd->cur_domain]); |
716 | 780 | } |
717 | 781 | } else if (sbitmap_any_bit_set(&khd->kcq_map[khd->cur_domain])) { |
718 | 782 | nr = kyber_get_domain_token(kqd, khd, hctx); |
.. | .. |
723 | 787 | rq_set_domain_token(rq, nr); |
724 | 788 | list_del_init(&rq->queuelist); |
725 | 789 | return rq; |
| 790 | + } else { |
| 791 | + trace_kyber_throttled(kqd->q, |
| 792 | + kyber_domain_names[khd->cur_domain]); |
726 | 793 | } |
727 | 794 | } |
728 | 795 | |
.. | .. |
790 | 857 | return false; |
791 | 858 | } |
792 | 859 | |
793 | | -#define KYBER_LAT_SHOW_STORE(op) \ |
794 | | -static ssize_t kyber_##op##_lat_show(struct elevator_queue *e, \ |
795 | | - char *page) \ |
| 860 | +#define KYBER_LAT_SHOW_STORE(domain, name) \ |
| 861 | +static ssize_t kyber_##name##_lat_show(struct elevator_queue *e, \ |
| 862 | + char *page) \ |
796 | 863 | { \ |
797 | 864 | struct kyber_queue_data *kqd = e->elevator_data; \ |
798 | 865 | \ |
799 | | - return sprintf(page, "%llu\n", kqd->op##_lat_nsec); \ |
| 866 | + return sprintf(page, "%llu\n", kqd->latency_targets[domain]); \ |
800 | 867 | } \ |
801 | 868 | \ |
802 | | -static ssize_t kyber_##op##_lat_store(struct elevator_queue *e, \ |
803 | | - const char *page, size_t count) \ |
| 869 | +static ssize_t kyber_##name##_lat_store(struct elevator_queue *e, \ |
| 870 | + const char *page, size_t count) \ |
804 | 871 | { \ |
805 | 872 | struct kyber_queue_data *kqd = e->elevator_data; \ |
806 | 873 | unsigned long long nsec; \ |
.. | .. |
810 | 877 | if (ret) \ |
811 | 878 | return ret; \ |
812 | 879 | \ |
813 | | - kqd->op##_lat_nsec = nsec; \ |
| 880 | + kqd->latency_targets[domain] = nsec; \ |
814 | 881 | \ |
815 | 882 | return count; \ |
816 | 883 | } |
817 | | -KYBER_LAT_SHOW_STORE(read); |
818 | | -KYBER_LAT_SHOW_STORE(write); |
| 884 | +KYBER_LAT_SHOW_STORE(KYBER_READ, read); |
| 885 | +KYBER_LAT_SHOW_STORE(KYBER_WRITE, write); |
819 | 886 | #undef KYBER_LAT_SHOW_STORE |
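For readability, the show half of the macro above expands (for the read domain) to roughly the following; the store half mirrors it, parsing the written value into kqd->latency_targets[KYBER_READ]:

```c
static ssize_t kyber_read_lat_show(struct elevator_queue *e, char *page)
{
	struct kyber_queue_data *kqd = e->elevator_data;

	/* Report the read domain's latency target in nanoseconds. */
	return sprintf(page, "%llu\n", kqd->latency_targets[KYBER_READ]);
}
```

These functions back the read_lat_nsec and write_lat_nsec attributes wired up by KYBER_LAT_ATTR() just below, so the per-domain targets stay tunable at runtime.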
820 | 887 | |
821 | 888 | #define KYBER_LAT_ATTR(op) __ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store) |
.. | .. |
876 | 943 | { \ |
877 | 944 | struct blk_mq_hw_ctx *hctx = data; \ |
878 | 945 | struct kyber_hctx_data *khd = hctx->sched_data; \ |
879 | | - wait_queue_entry_t *wait = &khd->domain_wait[domain]; \ |
| 946 | + wait_queue_entry_t *wait = &khd->domain_wait[domain].wait; \ |
880 | 947 | \ |
881 | 948 | seq_printf(m, "%d\n", !list_empty_careful(&wait->entry)); \ |
882 | 949 | return 0; \ |
883 | 950 | } |
884 | 951 | KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_READ, read) |
885 | | -KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_SYNC_WRITE, sync_write) |
| 952 | +KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_WRITE, write) |
| 953 | +KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_DISCARD, discard) |
886 | 954 | KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_OTHER, other) |
887 | 955 | #undef KYBER_DEBUGFS_DOMAIN_ATTRS |
888 | 956 | |
.. | .. |
900 | 968 | struct blk_mq_hw_ctx *hctx = data; |
901 | 969 | struct kyber_hctx_data *khd = hctx->sched_data; |
902 | 970 | |
903 | | - switch (khd->cur_domain) { |
904 | | - case KYBER_READ: |
905 | | - seq_puts(m, "READ\n"); |
906 | | - break; |
907 | | - case KYBER_SYNC_WRITE: |
908 | | - seq_puts(m, "SYNC_WRITE\n"); |
909 | | - break; |
910 | | - case KYBER_OTHER: |
911 | | - seq_puts(m, "OTHER\n"); |
912 | | - break; |
913 | | - default: |
914 | | - seq_printf(m, "%u\n", khd->cur_domain); |
915 | | - break; |
916 | | - } |
| 971 | + seq_printf(m, "%s\n", kyber_domain_names[khd->cur_domain]); |
917 | 972 | return 0; |
918 | 973 | } |
919 | 974 | |
.. | .. |
930 | 985 | {#name "_tokens", 0400, kyber_##name##_tokens_show} |
931 | 986 | static const struct blk_mq_debugfs_attr kyber_queue_debugfs_attrs[] = { |
932 | 987 | KYBER_QUEUE_DOMAIN_ATTRS(read), |
933 | | - KYBER_QUEUE_DOMAIN_ATTRS(sync_write), |
| 988 | + KYBER_QUEUE_DOMAIN_ATTRS(write), |
| 989 | + KYBER_QUEUE_DOMAIN_ATTRS(discard), |
934 | 990 | KYBER_QUEUE_DOMAIN_ATTRS(other), |
935 | 991 | {"async_depth", 0400, kyber_async_depth_show}, |
936 | 992 | {}, |
.. | .. |
942 | 998 | {#name "_waiting", 0400, kyber_##name##_waiting_show} |
943 | 999 | static const struct blk_mq_debugfs_attr kyber_hctx_debugfs_attrs[] = { |
944 | 1000 | KYBER_HCTX_DOMAIN_ATTRS(read), |
945 | | - KYBER_HCTX_DOMAIN_ATTRS(sync_write), |
| 1001 | + KYBER_HCTX_DOMAIN_ATTRS(write), |
| 1002 | + KYBER_HCTX_DOMAIN_ATTRS(discard), |
946 | 1003 | KYBER_HCTX_DOMAIN_ATTRS(other), |
947 | 1004 | {"cur_domain", 0400, kyber_cur_domain_show}, |
948 | 1005 | {"batching", 0400, kyber_batching_show}, |
.. | .. |
952 | 1009 | #endif |
953 | 1010 | |
954 | 1011 | static struct elevator_type kyber_sched = { |
955 | | - .ops.mq = { |
| 1012 | + .ops = { |
956 | 1013 | .init_sched = kyber_init_sched, |
957 | 1014 | .exit_sched = kyber_exit_sched, |
958 | 1015 | .init_hctx = kyber_init_hctx, |
.. | .. |
967 | 1024 | .dispatch_request = kyber_dispatch_request, |
968 | 1025 | .has_work = kyber_has_work, |
969 | 1026 | }, |
970 | | - .uses_mq = true, |
971 | 1027 | #ifdef CONFIG_BLK_DEBUG_FS |
972 | 1028 | .queue_debugfs_attrs = kyber_queue_debugfs_attrs, |
973 | 1029 | .hctx_debugfs_attrs = kyber_hctx_debugfs_attrs, |
974 | 1030 | #endif |
975 | 1031 | .elevator_attrs = kyber_sched_attrs, |
976 | 1032 | .elevator_name = "kyber", |
| 1033 | + .elevator_features = ELEVATOR_F_MQ_AWARE, |
977 | 1034 | .elevator_owner = THIS_MODULE, |
978 | 1035 | }; |
979 | 1036 | |
---|