2023-12-11 d2ccde1c8e90d38cee87a1b0309ad2827f3fd30d
kernel/block/kyber-iosched.c
@@ -1,20 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * The Kyber I/O scheduler. Controls latency by throttling queue depths using
  * scalable techniques.
  *
  * Copyright (C) 2017 Facebook
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */
 
 #include <linux/kernel.h>
@@ -29,19 +18,30 @@
 #include "blk-mq-debugfs.h"
 #include "blk-mq-sched.h"
 #include "blk-mq-tag.h"
-#include "blk-stat.h"
 
-/* Scheduling domains. */
+#define CREATE_TRACE_POINTS
+#include <trace/events/kyber.h>
+
+/*
+ * Scheduling domains: the device is divided into multiple domains based on the
+ * request type.
+ */
 enum {
         KYBER_READ,
-        KYBER_SYNC_WRITE,
-        KYBER_OTHER, /* Async writes, discard, etc. */
+        KYBER_WRITE,
+        KYBER_DISCARD,
+        KYBER_OTHER,
         KYBER_NUM_DOMAINS,
 };
 
-enum {
-        KYBER_MIN_DEPTH = 256,
+static const char *kyber_domain_names[] = {
+        [KYBER_READ] = "READ",
+        [KYBER_WRITE] = "WRITE",
+        [KYBER_DISCARD] = "DISCARD",
+        [KYBER_OTHER] = "OTHER",
+};
 
+enum {
         /*
          * In order to prevent starvation of synchronous requests by a flood of
          * asynchronous requests, we reserve 25% of requests for synchronous
@@ -51,25 +51,87 @@
 };
 
 /*
- * Initial device-wide depths for each scheduling domain.
+ * Maximum device-wide depth for each scheduling domain.
  *
- * Even for fast devices with lots of tags like NVMe, you can saturate
- * the device with only a fraction of the maximum possible queue depth.
- * So, we cap these to a reasonable value.
+ * Even for fast devices with lots of tags like NVMe, you can saturate the
+ * device with only a fraction of the maximum possible queue depth. So, we cap
+ * these to a reasonable value.
  */
 static const unsigned int kyber_depth[] = {
         [KYBER_READ] = 256,
-        [KYBER_SYNC_WRITE] = 128,
-        [KYBER_OTHER] = 64,
+        [KYBER_WRITE] = 128,
+        [KYBER_DISCARD] = 64,
+        [KYBER_OTHER] = 16,
 };
 
 /*
- * Scheduling domain batch sizes. We favor reads.
+ * Default latency targets for each scheduling domain.
+ */
+static const u64 kyber_latency_targets[] = {
+        [KYBER_READ] = 2ULL * NSEC_PER_MSEC,
+        [KYBER_WRITE] = 10ULL * NSEC_PER_MSEC,
+        [KYBER_DISCARD] = 5ULL * NSEC_PER_SEC,
+};
+
+/*
+ * Batch size (number of requests we'll dispatch in a row) for each scheduling
+ * domain.
  */
 static const unsigned int kyber_batch_size[] = {
         [KYBER_READ] = 16,
-        [KYBER_SYNC_WRITE] = 8,
-        [KYBER_OTHER] = 8,
+        [KYBER_WRITE] = 8,
+        [KYBER_DISCARD] = 1,
+        [KYBER_OTHER] = 1,
+};
+
+/*
+ * Requests latencies are recorded in a histogram with buckets defined relative
+ * to the target latency:
+ *
+ * <= 1/4 * target latency
+ * <= 1/2 * target latency
+ * <= 3/4 * target latency
+ * <= target latency
+ * <= 1 1/4 * target latency
+ * <= 1 1/2 * target latency
+ * <= 1 3/4 * target latency
+ * > 1 3/4 * target latency
+ */
+enum {
+        /*
+         * The width of the latency histogram buckets is
+         * 1 / (1 << KYBER_LATENCY_SHIFT) * target latency.
+         */
+        KYBER_LATENCY_SHIFT = 2,
+        /*
+         * The first (1 << KYBER_LATENCY_SHIFT) buckets are <= target latency,
+         * thus, "good".
+         */
+        KYBER_GOOD_BUCKETS = 1 << KYBER_LATENCY_SHIFT,
+        /* There are also (1 << KYBER_LATENCY_SHIFT) "bad" buckets. */
+        KYBER_LATENCY_BUCKETS = 2 << KYBER_LATENCY_SHIFT,
+};
+
+/*
+ * We measure both the total latency and the I/O latency (i.e., latency after
+ * submitting to the device).
+ */
+enum {
+        KYBER_TOTAL_LATENCY,
+        KYBER_IO_LATENCY,
+};
+
+static const char *kyber_latency_type_names[] = {
+        [KYBER_TOTAL_LATENCY] = "total",
+        [KYBER_IO_LATENCY] = "I/O",
+};
+
+/*
+ * Per-cpu latency histograms: total latency and I/O latency for each scheduling
+ * domain except for KYBER_OTHER.
+ */
+struct kyber_cpu_latency {
+        atomic_t buckets[KYBER_OTHER][2][KYBER_LATENCY_BUCKETS];
 };
 
 /*
@@ -88,12 +150,9 @@
 struct kyber_queue_data {
         struct request_queue *q;
 
-        struct blk_stat_callback *cb;
-
         /*
-         * The device is divided into multiple scheduling domains based on the
-         * request type. Each domain has a fixed number of in-flight requests of
-         * that type device-wide, limited by these tokens.
+         * Each scheduling domain has a limited number of in-flight requests
+         * device-wide, limited by these tokens.
          */
         struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS];
 
@@ -103,8 +162,19 @@
          */
         unsigned int async_depth;
 
+        struct kyber_cpu_latency __percpu *cpu_latency;
+
+        /* Timer for stats aggregation and adjusting domain tokens. */
+        struct timer_list timer;
+
+        unsigned int latency_buckets[KYBER_OTHER][2][KYBER_LATENCY_BUCKETS];
+
+        unsigned long latency_timeout[KYBER_OTHER];
+
+        int domain_p99[KYBER_OTHER];
+
         /* Target latencies in nanoseconds. */
-        u64 read_lat_nsec, write_lat_nsec;
+        u64 latency_targets[KYBER_OTHER];
 };
 
 struct kyber_hctx_data {
@@ -114,7 +184,7 @@
         unsigned int batching;
         struct kyber_ctx_queue *kcqs;
         struct sbitmap kcq_map[KYBER_NUM_DOMAINS];
-        wait_queue_entry_t domain_wait[KYBER_NUM_DOMAINS];
+        struct sbq_wait domain_wait[KYBER_NUM_DOMAINS];
         struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS];
         atomic_t wait_index[KYBER_NUM_DOMAINS];
 };
@@ -124,233 +194,219 @@
 
 static unsigned int kyber_sched_domain(unsigned int op)
 {
-        if ((op & REQ_OP_MASK) == REQ_OP_READ)
+        switch (op & REQ_OP_MASK) {
+        case REQ_OP_READ:
                 return KYBER_READ;
-        else if ((op & REQ_OP_MASK) == REQ_OP_WRITE && op_is_sync(op))
-                return KYBER_SYNC_WRITE;
-        else
+        case REQ_OP_WRITE:
+                return KYBER_WRITE;
+        case REQ_OP_DISCARD:
+                return KYBER_DISCARD;
+        default:
                 return KYBER_OTHER;
+        }
 }
 
-enum {
-        NONE = 0,
-        GOOD = 1,
-        GREAT = 2,
-        BAD = -1,
-        AWFUL = -2,
-};
-
-#define IS_GOOD(status) ((status) > 0)
-#define IS_BAD(status) ((status) < 0)
-
-static int kyber_lat_status(struct blk_stat_callback *cb,
-                            unsigned int sched_domain, u64 target)
+static void flush_latency_buckets(struct kyber_queue_data *kqd,
+                                  struct kyber_cpu_latency *cpu_latency,
+                                  unsigned int sched_domain, unsigned int type)
 {
-        u64 latency;
+        unsigned int *buckets = kqd->latency_buckets[sched_domain][type];
+        atomic_t *cpu_buckets = cpu_latency->buckets[sched_domain][type];
+        unsigned int bucket;
 
-        if (!cb->stat[sched_domain].nr_samples)
-                return NONE;
-
-        latency = cb->stat[sched_domain].mean;
-        if (latency >= 2 * target)
-                return AWFUL;
-        else if (latency > target)
-                return BAD;
-        else if (latency <= target / 2)
-                return GREAT;
-        else /* (latency <= target) */
-                return GOOD;
+        for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS; bucket++)
+                buckets[bucket] += atomic_xchg(&cpu_buckets[bucket], 0);
 }
 
 /*
- * Adjust the read or synchronous write depth given the status of reads and
- * writes. The goal is that the latencies of the two domains are fair (i.e., if
- * one is good, then the other is good).
+ * Calculate the histogram bucket with the given percentile rank, or -1 if there
+ * aren't enough samples yet.
  */
-static void kyber_adjust_rw_depth(struct kyber_queue_data *kqd,
-                                  unsigned int sched_domain, int this_status,
-                                  int other_status)
+static int calculate_percentile(struct kyber_queue_data *kqd,
+                                unsigned int sched_domain, unsigned int type,
+                                unsigned int percentile)
 {
-        unsigned int orig_depth, depth;
+        unsigned int *buckets = kqd->latency_buckets[sched_domain][type];
+        unsigned int bucket, samples = 0, percentile_samples;
+
+        for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS; bucket++)
+                samples += buckets[bucket];
+
+        if (!samples)
+                return -1;
 
         /*
-         * If this domain had no samples, or reads and writes are both good or
-         * both bad, don't adjust the depth.
+         * We do the calculation once we have 500 samples or one second passes
+         * since the first sample was recorded, whichever comes first.
          */
-        if (this_status == NONE ||
-            (IS_GOOD(this_status) && IS_GOOD(other_status)) ||
-            (IS_BAD(this_status) && IS_BAD(other_status)))
-                return;
-
-        orig_depth = depth = kqd->domain_tokens[sched_domain].sb.depth;
-
-        if (other_status == NONE) {
-                depth++;
-        } else {
-                switch (this_status) {
-                case GOOD:
-                        if (other_status == AWFUL)
-                                depth -= max(depth / 4, 1U);
-                        else
-                                depth -= max(depth / 8, 1U);
-                        break;
-                case GREAT:
-                        if (other_status == AWFUL)
-                                depth /= 2;
-                        else
-                                depth -= max(depth / 4, 1U);
-                        break;
-                case BAD:
-                        depth++;
-                        break;
-                case AWFUL:
-                        if (other_status == GREAT)
-                                depth += 2;
-                        else
-                                depth++;
-                        break;
-                }
+        if (!kqd->latency_timeout[sched_domain])
+                kqd->latency_timeout[sched_domain] = max(jiffies + HZ, 1UL);
+        if (samples < 500 &&
+            time_is_after_jiffies(kqd->latency_timeout[sched_domain])) {
+                return -1;
         }
+        kqd->latency_timeout[sched_domain] = 0;
 
+        percentile_samples = DIV_ROUND_UP(samples * percentile, 100);
+        for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS - 1; bucket++) {
+                if (buckets[bucket] >= percentile_samples)
+                        break;
+                percentile_samples -= buckets[bucket];
+        }
+        memset(buckets, 0, sizeof(kqd->latency_buckets[sched_domain][type]));
+
+        trace_kyber_latency(kqd->q, kyber_domain_names[sched_domain],
+                            kyber_latency_type_names[type], percentile,
+                            bucket + 1, 1 << KYBER_LATENCY_SHIFT, samples);
+
+        return bucket;
+}
+
+static void kyber_resize_domain(struct kyber_queue_data *kqd,
+                                unsigned int sched_domain, unsigned int depth)
+{
         depth = clamp(depth, 1U, kyber_depth[sched_domain]);
-        if (depth != orig_depth)
+        if (depth != kqd->domain_tokens[sched_domain].sb.depth) {
                 sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth);
+                trace_kyber_adjust(kqd->q, kyber_domain_names[sched_domain],
+                                   depth);
+        }
 }
 
-/*
- * Adjust the depth of other requests given the status of reads and synchronous
- * writes. As long as either domain is doing fine, we don't throttle, but if
- * both domains are doing badly, we throttle heavily.
- */
-static void kyber_adjust_other_depth(struct kyber_queue_data *kqd,
-                                     int read_status, int write_status,
-                                     bool have_samples)
+static void kyber_timer_fn(struct timer_list *t)
 {
-        unsigned int orig_depth, depth;
-        int status;
+        struct kyber_queue_data *kqd = from_timer(kqd, t, timer);
+        unsigned int sched_domain;
+        int cpu;
+        bool bad = false;
 
-        orig_depth = depth = kqd->domain_tokens[KYBER_OTHER].sb.depth;
+        /* Sum all of the per-cpu latency histograms. */
+        for_each_online_cpu(cpu) {
+                struct kyber_cpu_latency *cpu_latency;
 
-        if (read_status == NONE && write_status == NONE) {
-                depth += 2;
-        } else if (have_samples) {
-                if (read_status == NONE)
-                        status = write_status;
-                else if (write_status == NONE)
-                        status = read_status;
-                else
-                        status = max(read_status, write_status);
-                switch (status) {
-                case GREAT:
-                        depth += 2;
-                        break;
-                case GOOD:
-                        depth++;
-                        break;
-                case BAD:
-                        depth -= max(depth / 4, 1U);
-                        break;
-                case AWFUL:
-                        depth /= 2;
-                        break;
+                cpu_latency = per_cpu_ptr(kqd->cpu_latency, cpu);
+                for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) {
+                        flush_latency_buckets(kqd, cpu_latency, sched_domain,
+                                              KYBER_TOTAL_LATENCY);
+                        flush_latency_buckets(kqd, cpu_latency, sched_domain,
+                                              KYBER_IO_LATENCY);
                 }
         }
 
-        depth = clamp(depth, 1U, kyber_depth[KYBER_OTHER]);
-        if (depth != orig_depth)
-                sbitmap_queue_resize(&kqd->domain_tokens[KYBER_OTHER], depth);
-}
+        /*
+         * Check if any domains have a high I/O latency, which might indicate
+         * congestion in the device. Note that we use the p90; we don't want to
+         * be too sensitive to outliers here.
+         */
+        for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) {
+                int p90;
 
-/*
- * Apply heuristics for limiting queue depths based on gathered latency
- * statistics.
- */
-static void kyber_stat_timer_fn(struct blk_stat_callback *cb)
-{
-        struct kyber_queue_data *kqd = cb->data;
-        int read_status, write_status;
-
-        read_status = kyber_lat_status(cb, KYBER_READ, kqd->read_lat_nsec);
-        write_status = kyber_lat_status(cb, KYBER_SYNC_WRITE, kqd->write_lat_nsec);
-
-        kyber_adjust_rw_depth(kqd, KYBER_READ, read_status, write_status);
-        kyber_adjust_rw_depth(kqd, KYBER_SYNC_WRITE, write_status, read_status);
-        kyber_adjust_other_depth(kqd, read_status, write_status,
-                                 cb->stat[KYBER_OTHER].nr_samples != 0);
+                p90 = calculate_percentile(kqd, sched_domain, KYBER_IO_LATENCY,
+                                           90);
+                if (p90 >= KYBER_GOOD_BUCKETS)
+                        bad = true;
+        }
 
         /*
-         * Continue monitoring latencies if we aren't hitting the targets or
-         * we're still throttling other requests.
+         * Adjust the scheduling domain depths. If we determined that there was
+         * congestion, we throttle all domains with good latencies. Either way,
+         * we ease up on throttling domains with bad latencies.
          */
-        if (!blk_stat_is_active(kqd->cb) &&
-            ((IS_BAD(read_status) || IS_BAD(write_status) ||
-              kqd->domain_tokens[KYBER_OTHER].sb.depth < kyber_depth[KYBER_OTHER])))
-                blk_stat_activate_msecs(kqd->cb, 100);
+        for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) {
+                unsigned int orig_depth, depth;
+                int p99;
+
+                p99 = calculate_percentile(kqd, sched_domain,
+                                           KYBER_TOTAL_LATENCY, 99);
+                /*
+                 * This is kind of subtle: different domains will not
+                 * necessarily have enough samples to calculate the latency
+                 * percentiles during the same window, so we have to remember
+                 * the p99 for the next time we observe congestion; once we do,
+                 * we don't want to throttle again until we get more data, so we
+                 * reset it to -1.
+                 */
+                if (bad) {
+                        if (p99 < 0)
+                                p99 = kqd->domain_p99[sched_domain];
+                        kqd->domain_p99[sched_domain] = -1;
+                } else if (p99 >= 0) {
+                        kqd->domain_p99[sched_domain] = p99;
+                }
+                if (p99 < 0)
+                        continue;
+
+                /*
+                 * If this domain has bad latency, throttle less. Otherwise,
+                 * throttle more iff we determined that there is congestion.
+                 *
+                 * The new depth is scaled linearly with the p99 latency vs the
+                 * latency target. E.g., if the p99 is 3/4 of the target, then
+                 * we throttle down to 3/4 of the current depth, and if the p99
+                 * is 2x the target, then we double the depth.
+                 */
+                if (bad || p99 >= KYBER_GOOD_BUCKETS) {
+                        orig_depth = kqd->domain_tokens[sched_domain].sb.depth;
+                        depth = (orig_depth * (p99 + 1)) >> KYBER_LATENCY_SHIFT;
+                        kyber_resize_domain(kqd, sched_domain, depth);
+                }
+        }
 }
 
-static unsigned int kyber_sched_tags_shift(struct kyber_queue_data *kqd)
+static unsigned int kyber_sched_tags_shift(struct request_queue *q)
 {
         /*
          * All of the hardware queues have the same depth, so we can just grab
          * the shift of the first one.
          */
-        return kqd->q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift;
-}
-
-static int kyber_bucket_fn(const struct request *rq)
-{
-        return kyber_sched_domain(rq->cmd_flags);
+        return q->queue_hw_ctx[0]->sched_tags->bitmap_tags->sb.shift;
 }
 
 static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
 {
         struct kyber_queue_data *kqd;
-        unsigned int max_tokens;
         unsigned int shift;
         int ret = -ENOMEM;
         int i;
 
-        kqd = kmalloc_node(sizeof(*kqd), GFP_KERNEL, q->node);
+        kqd = kzalloc_node(sizeof(*kqd), GFP_KERNEL, q->node);
         if (!kqd)
                 goto err;
+
         kqd->q = q;
 
-        kqd->cb = blk_stat_alloc_callback(kyber_stat_timer_fn, kyber_bucket_fn,
-                                          KYBER_NUM_DOMAINS, kqd);
-        if (!kqd->cb)
+        kqd->cpu_latency = alloc_percpu_gfp(struct kyber_cpu_latency,
+                                            GFP_KERNEL | __GFP_ZERO);
+        if (!kqd->cpu_latency)
                 goto err_kqd;
 
-        /*
-         * The maximum number of tokens for any scheduling domain is at least
-         * the queue depth of a single hardware queue. If the hardware doesn't
-         * have many tags, still provide a reasonable number.
-         */
-        max_tokens = max_t(unsigned int, q->tag_set->queue_depth,
-                           KYBER_MIN_DEPTH);
+        timer_setup(&kqd->timer, kyber_timer_fn, 0);
+
         for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
                 WARN_ON(!kyber_depth[i]);
                 WARN_ON(!kyber_batch_size[i]);
                 ret = sbitmap_queue_init_node(&kqd->domain_tokens[i],
-                                              max_tokens, -1, false, GFP_KERNEL,
-                                              q->node);
+                                              kyber_depth[i], -1, false,
+                                              GFP_KERNEL, q->node);
                 if (ret) {
                         while (--i >= 0)
                                 sbitmap_queue_free(&kqd->domain_tokens[i]);
-                        goto err_cb;
+                        goto err_buckets;
                 }
-                sbitmap_queue_resize(&kqd->domain_tokens[i], kyber_depth[i]);
         }
 
-        shift = kyber_sched_tags_shift(kqd);
-        kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
+        for (i = 0; i < KYBER_OTHER; i++) {
+                kqd->domain_p99[i] = -1;
+                kqd->latency_targets[i] = kyber_latency_targets[i];
+        }
 
-        kqd->read_lat_nsec = 2000000ULL;
-        kqd->write_lat_nsec = 10000000ULL;
+        shift = kyber_sched_tags_shift(q);
+        kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
 
         return kqd;
 
-err_cb:
-        blk_stat_free_callback(kqd->cb);
+err_buckets:
+        free_percpu(kqd->cpu_latency);
 err_kqd:
         kfree(kqd);
 err:
@@ -372,10 +428,10 @@
                 return PTR_ERR(kqd);
         }
 
+        blk_stat_enable_accounting(q);
+
         eq->elevator_data = kqd;
         q->elevator = eq;
-
-        blk_stat_add_callback(q, kqd->cb);
 
         return 0;
 }
@@ -383,14 +439,13 @@
 static void kyber_exit_sched(struct elevator_queue *e)
 {
         struct kyber_queue_data *kqd = e->elevator_data;
-        struct request_queue *q = kqd->q;
         int i;
 
-        blk_stat_remove_callback(q, kqd->cb);
+        del_timer_sync(&kqd->timer);
 
         for (i = 0; i < KYBER_NUM_DOMAINS; i++)
                 sbitmap_queue_free(&kqd->domain_tokens[i]);
-        blk_stat_free_callback(kqd->cb);
+        free_percpu(kqd->cpu_latency);
         kfree(kqd);
 }
 
@@ -435,10 +490,11 @@
 
         for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
                 INIT_LIST_HEAD(&khd->rqs[i]);
-                init_waitqueue_func_entry(&khd->domain_wait[i],
+                khd->domain_wait[i].sbq = NULL;
+                init_waitqueue_func_entry(&khd->domain_wait[i].wait,
                                           kyber_domain_wake);
-                khd->domain_wait[i].private = hctx;
-                INIT_LIST_HEAD(&khd->domain_wait[i].entry);
+                khd->domain_wait[i].wait.private = hctx;
+                INIT_LIST_HEAD(&khd->domain_wait[i].wait.entry);
                 atomic_set(&khd->wait_index[i], 0);
         }
 
@@ -446,7 +502,7 @@
         khd->batching = 0;
 
         hctx->sched_data = khd;
-        sbitmap_queue_min_shallow_depth(&hctx->sched_tags->bitmap_tags,
+        sbitmap_queue_min_shallow_depth(hctx->sched_tags->bitmap_tags,
                                         kqd->async_depth);
 
         return 0;
@@ -506,24 +562,25 @@
         }
 }
 
-static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
+static bool kyber_bio_merge(struct request_queue *q, struct bio *bio,
+                            unsigned int nr_segs)
 {
+        struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
+        struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx);
         struct kyber_hctx_data *khd = hctx->sched_data;
-        struct blk_mq_ctx *ctx = blk_mq_get_ctx(hctx->queue);
-        struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw];
+        struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw[hctx->type]];
         unsigned int sched_domain = kyber_sched_domain(bio->bi_opf);
         struct list_head *rq_list = &kcq->rq_list[sched_domain];
         bool merged;
 
         spin_lock(&kcq->lock);
-        merged = blk_mq_bio_list_merge(hctx->queue, rq_list, bio);
+        merged = blk_bio_list_merge(hctx->queue, rq_list, bio, nr_segs);
         spin_unlock(&kcq->lock);
-        blk_mq_put_ctx(ctx);
 
         return merged;
 }
 
-static void kyber_prepare_request(struct request *rq, struct bio *bio)
+static void kyber_prepare_request(struct request *rq)
 {
         rq_set_domain_token(rq, -1);
 }
@@ -536,7 +593,7 @@
 
         list_for_each_entry_safe(rq, next, rq_list, queuelist) {
                 unsigned int sched_domain = kyber_sched_domain(rq->cmd_flags);
-                struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw];
+                struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw[hctx->type]];
                 struct list_head *head = &kcq->rq_list[sched_domain];
 
                 spin_lock(&kcq->lock);
@@ -545,7 +602,7 @@
                 else
                         list_move_tail(&rq->queuelist, head);
                 sbitmap_set_bit(&khd->kcq_map[sched_domain],
-                                rq->mq_ctx->index_hw);
+                                rq->mq_ctx->index_hw[hctx->type]);
                 blk_mq_sched_request_inserted(rq);
                 spin_unlock(&kcq->lock);
         }
@@ -558,41 +615,44 @@
         rq_clear_domain_token(kqd, rq);
 }
 
-static void kyber_completed_request(struct request *rq)
+static void add_latency_sample(struct kyber_cpu_latency *cpu_latency,
+                               unsigned int sched_domain, unsigned int type,
+                               u64 target, u64 latency)
 {
-        struct request_queue *q = rq->q;
-        struct kyber_queue_data *kqd = q->elevator->elevator_data;
-        unsigned int sched_domain;
-        u64 now, latency, target;
+        unsigned int bucket;
+        u64 divisor;
 
-        /*
-         * Check if this request met our latency goal. If not, quickly gather
-         * some statistics and start throttling.
-         */
-        sched_domain = kyber_sched_domain(rq->cmd_flags);
-        switch (sched_domain) {
-        case KYBER_READ:
-                target = kqd->read_lat_nsec;
-                break;
-        case KYBER_SYNC_WRITE:
-                target = kqd->write_lat_nsec;
-                break;
-        default:
-                return;
+        if (latency > 0) {
+                divisor = max_t(u64, target >> KYBER_LATENCY_SHIFT, 1);
+                bucket = min_t(unsigned int, div64_u64(latency - 1, divisor),
+                               KYBER_LATENCY_BUCKETS - 1);
+        } else {
+                bucket = 0;
         }
 
-        /* If we are already monitoring latencies, don't check again. */
-        if (blk_stat_is_active(kqd->cb))
+        atomic_inc(&cpu_latency->buckets[sched_domain][type][bucket]);
+}
+
+static void kyber_completed_request(struct request *rq, u64 now)
+{
+        struct kyber_queue_data *kqd = rq->q->elevator->elevator_data;
+        struct kyber_cpu_latency *cpu_latency;
+        unsigned int sched_domain;
+        u64 target;
+
+        sched_domain = kyber_sched_domain(rq->cmd_flags);
+        if (sched_domain == KYBER_OTHER)
                 return;
 
-        now = ktime_get_ns();
-        if (now < rq->io_start_time_ns)
-                return;
+        cpu_latency = get_cpu_ptr(kqd->cpu_latency);
+        target = kqd->latency_targets[sched_domain];
+        add_latency_sample(cpu_latency, sched_domain, KYBER_TOTAL_LATENCY,
+                           target, now - rq->start_time_ns);
+        add_latency_sample(cpu_latency, sched_domain, KYBER_IO_LATENCY, target,
+                           now - rq->io_start_time_ns);
+        put_cpu_ptr(kqd->cpu_latency);
 
-        latency = now - rq->io_start_time_ns;
-
-        if (latency > target)
-                blk_stat_activate_msecs(kqd->cb, 10);
+        timer_reduce(&kqd->timer, jiffies + HZ / 10);
 }
 
 struct flush_kcq_data {
@@ -629,12 +689,13 @@
                                   flush_busy_kcq, &data);
 }
 
-static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
+static int kyber_domain_wake(wait_queue_entry_t *wqe, unsigned mode, int flags,
                              void *key)
 {
-        struct blk_mq_hw_ctx *hctx = READ_ONCE(wait->private);
+        struct blk_mq_hw_ctx *hctx = READ_ONCE(wqe->private);
+        struct sbq_wait *wait = container_of(wqe, struct sbq_wait, wait);
 
-        list_del_init(&wait->entry);
+        sbitmap_del_wait_queue(wait);
         blk_mq_run_hw_queue(hctx, true);
         return 1;
 }
@@ -645,7 +706,7 @@
 {
         unsigned int sched_domain = khd->cur_domain;
         struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain];
-        wait_queue_entry_t *wait = &khd->domain_wait[sched_domain];
+        struct sbq_wait *wait = &khd->domain_wait[sched_domain];
         struct sbq_wait_state *ws;
         int nr;
 
@@ -656,11 +717,11 @@
          * run when one becomes available. Note that this is serialized on
          * khd->lock, but we still need to be careful about the waker.
          */
-        if (nr < 0 && list_empty_careful(&wait->entry)) {
+        if (nr < 0 && list_empty_careful(&wait->wait.entry)) {
                 ws = sbq_wait_ptr(domain_tokens,
                                   &khd->wait_index[sched_domain]);
                 khd->domain_ws[sched_domain] = ws;
-                add_wait_queue(&ws->wait, wait);
+                sbitmap_add_wait_queue(domain_tokens, ws, wait);
 
                 /*
                  * Try again in case a token was freed before we got on the wait
@@ -676,10 +737,10 @@
          * between the !list_empty_careful() check and us grabbing the lock, but
          * list_del_init() is okay with that.
          */
-        if (nr >= 0 && !list_empty_careful(&wait->entry)) {
+        if (nr >= 0 && !list_empty_careful(&wait->wait.entry)) {
                 ws = khd->domain_ws[sched_domain];
                 spin_lock_irq(&ws->wait.lock);
-                list_del_init(&wait->entry);
+                sbitmap_del_wait_queue(wait);
                 spin_unlock_irq(&ws->wait.lock);
         }
 
@@ -713,6 +774,9 @@
                         rq_set_domain_token(rq, nr);
                         list_del_init(&rq->queuelist);
                         return rq;
+                } else {
+                        trace_kyber_throttled(kqd->q,
+                                              kyber_domain_names[khd->cur_domain]);
                 }
         } else if (sbitmap_any_bit_set(&khd->kcq_map[khd->cur_domain])) {
                 nr = kyber_get_domain_token(kqd, khd, hctx);
@@ -723,6 +787,9 @@
                         rq_set_domain_token(rq, nr);
                         list_del_init(&rq->queuelist);
                         return rq;
+                } else {
+                        trace_kyber_throttled(kqd->q,
+                                              kyber_domain_names[khd->cur_domain]);
                 }
         }
 
@@ -790,17 +857,17 @@
         return false;
 }
 
-#define KYBER_LAT_SHOW_STORE(op) \
-static ssize_t kyber_##op##_lat_show(struct elevator_queue *e, \
-                                     char *page) \
+#define KYBER_LAT_SHOW_STORE(domain, name) \
+static ssize_t kyber_##name##_lat_show(struct elevator_queue *e, \
+                                       char *page) \
 { \
         struct kyber_queue_data *kqd = e->elevator_data; \
         \
-        return sprintf(page, "%llu\n", kqd->op##_lat_nsec); \
+        return sprintf(page, "%llu\n", kqd->latency_targets[domain]); \
 } \
 \
-static ssize_t kyber_##op##_lat_store(struct elevator_queue *e, \
-                                      const char *page, size_t count) \
+static ssize_t kyber_##name##_lat_store(struct elevator_queue *e, \
+                                        const char *page, size_t count) \
 { \
         struct kyber_queue_data *kqd = e->elevator_data; \
         unsigned long long nsec; \
@@ -810,12 +877,12 @@
         if (ret) \
                 return ret; \
         \
-        kqd->op##_lat_nsec = nsec; \
+        kqd->latency_targets[domain] = nsec; \
         \
         return count; \
 }
-KYBER_LAT_SHOW_STORE(read);
-KYBER_LAT_SHOW_STORE(write);
+KYBER_LAT_SHOW_STORE(KYBER_READ, read);
+KYBER_LAT_SHOW_STORE(KYBER_WRITE, write);
 #undef KYBER_LAT_SHOW_STORE
 
 #define KYBER_LAT_ATTR(op) __ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store)
@@ -876,13 +943,14 @@
 { \
         struct blk_mq_hw_ctx *hctx = data; \
         struct kyber_hctx_data *khd = hctx->sched_data; \
-        wait_queue_entry_t *wait = &khd->domain_wait[domain]; \
+        wait_queue_entry_t *wait = &khd->domain_wait[domain].wait; \
         \
         seq_printf(m, "%d\n", !list_empty_careful(&wait->entry)); \
         return 0; \
 }
 KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_READ, read)
-KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_SYNC_WRITE, sync_write)
+KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_WRITE, write)
+KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_DISCARD, discard)
 KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_OTHER, other)
 #undef KYBER_DEBUGFS_DOMAIN_ATTRS
 
@@ -900,20 +968,7 @@
         struct blk_mq_hw_ctx *hctx = data;
         struct kyber_hctx_data *khd = hctx->sched_data;
 
-        switch (khd->cur_domain) {
-        case KYBER_READ:
-                seq_puts(m, "READ\n");
-                break;
-        case KYBER_SYNC_WRITE:
-                seq_puts(m, "SYNC_WRITE\n");
-                break;
-        case KYBER_OTHER:
-                seq_puts(m, "OTHER\n");
-                break;
-        default:
-                seq_printf(m, "%u\n", khd->cur_domain);
-                break;
-        }
+        seq_printf(m, "%s\n", kyber_domain_names[khd->cur_domain]);
         return 0;
 }
 
@@ -930,7 +985,8 @@
         {#name "_tokens", 0400, kyber_##name##_tokens_show}
 static const struct blk_mq_debugfs_attr kyber_queue_debugfs_attrs[] = {
         KYBER_QUEUE_DOMAIN_ATTRS(read),
-        KYBER_QUEUE_DOMAIN_ATTRS(sync_write),
+        KYBER_QUEUE_DOMAIN_ATTRS(write),
+        KYBER_QUEUE_DOMAIN_ATTRS(discard),
         KYBER_QUEUE_DOMAIN_ATTRS(other),
         {"async_depth", 0400, kyber_async_depth_show},
         {},
@@ -942,7 +998,8 @@
         {#name "_waiting", 0400, kyber_##name##_waiting_show}
 static const struct blk_mq_debugfs_attr kyber_hctx_debugfs_attrs[] = {
         KYBER_HCTX_DOMAIN_ATTRS(read),
-        KYBER_HCTX_DOMAIN_ATTRS(sync_write),
+        KYBER_HCTX_DOMAIN_ATTRS(write),
+        KYBER_HCTX_DOMAIN_ATTRS(discard),
         KYBER_HCTX_DOMAIN_ATTRS(other),
         {"cur_domain", 0400, kyber_cur_domain_show},
         {"batching", 0400, kyber_batching_show},
@@ -952,7 +1009,7 @@
 #endif
 
 static struct elevator_type kyber_sched = {
-        .ops.mq = {
+        .ops = {
                 .init_sched = kyber_init_sched,
                 .exit_sched = kyber_exit_sched,
                 .init_hctx = kyber_init_hctx,
@@ -967,13 +1024,13 @@
                 .dispatch_request = kyber_dispatch_request,
                 .has_work = kyber_has_work,
         },
-        .uses_mq = true,
 #ifdef CONFIG_BLK_DEBUG_FS
         .queue_debugfs_attrs = kyber_queue_debugfs_attrs,
         .hctx_debugfs_attrs = kyber_hctx_debugfs_attrs,
 #endif
         .elevator_attrs = kyber_sched_attrs,
         .elevator_name = "kyber",
+        .elevator_features = ELEVATOR_F_MQ_AWARE,
         .elevator_owner = THIS_MODULE,
 };
 