| .. | .. |
|---|
| 1 | +// SPDX-License-Identifier: GPL-2.0 |
|---|
| 1 | 2 | /* |
|---|
| 2 | 3 | * The Kyber I/O scheduler. Controls latency by throttling queue depths using |
|---|
| 3 | 4 | * scalable techniques. |
|---|
| 4 | 5 | * |
|---|
| 5 | 6 | * Copyright (C) 2017 Facebook |
|---|
| 6 | | - * |
|---|
| 7 | | - * This program is free software; you can redistribute it and/or |
|---|
| 8 | | - * modify it under the terms of the GNU General Public |
|---|
| 9 | | - * License v2 as published by the Free Software Foundation. |
|---|
| 10 | | - * |
|---|
| 11 | | - * This program is distributed in the hope that it will be useful, |
|---|
| 12 | | - * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|---|
| 13 | | - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|---|
| 14 | | - * General Public License for more details. |
|---|
| 15 | | - * |
|---|
| 16 | | - * You should have received a copy of the GNU General Public License |
|---|
| 17 | | - * along with this program. If not, see <https://www.gnu.org/licenses/>. |
|---|
| 18 | 7 | */ |
|---|
| 19 | 8 | |
|---|
| 20 | 9 | #include <linux/kernel.h> |
|---|
| .. | .. |
|---|
| 29 | 18 | #include "blk-mq-debugfs.h" |
|---|
| 30 | 19 | #include "blk-mq-sched.h" |
|---|
| 31 | 20 | #include "blk-mq-tag.h" |
|---|
| 32 | | -#include "blk-stat.h" |
|---|
| 33 | 21 | |
|---|
| 34 | | -/* Scheduling domains. */ |
|---|
| 22 | +#define CREATE_TRACE_POINTS |
|---|
| 23 | +#include <trace/events/kyber.h> |
|---|
| 24 | + |
|---|
| 25 | +/* |
|---|
| 26 | + * Scheduling domains: the device is divided into multiple domains based on the |
|---|
| 27 | + * request type. |
|---|
| 28 | + */ |
|---|
| 35 | 29 | enum { |
|---|
| 36 | 30 | KYBER_READ, |
|---|
| 37 | | - KYBER_SYNC_WRITE, |
|---|
| 38 | | - KYBER_OTHER, /* Async writes, discard, etc. */ |
|---|
| 31 | + KYBER_WRITE, |
|---|
| 32 | + KYBER_DISCARD, |
|---|
| 33 | + KYBER_OTHER, |
|---|
| 39 | 34 | KYBER_NUM_DOMAINS, |
|---|
| 40 | 35 | }; |
|---|
| 41 | 36 | |
|---|
| 42 | | -enum { |
|---|
| 43 | | - KYBER_MIN_DEPTH = 256, |
|---|
| 37 | +static const char *kyber_domain_names[] = { |
|---|
| 38 | + [KYBER_READ] = "READ", |
|---|
| 39 | + [KYBER_WRITE] = "WRITE", |
|---|
| 40 | + [KYBER_DISCARD] = "DISCARD", |
|---|
| 41 | + [KYBER_OTHER] = "OTHER", |
|---|
| 42 | +}; |
|---|
| 44 | 43 | |
|---|
| 44 | +enum { |
|---|
| 45 | 45 | /* |
|---|
| 46 | 46 | * In order to prevent starvation of synchronous requests by a flood of |
|---|
| 47 | 47 | * asynchronous requests, we reserve 25% of requests for synchronous |
|---|
| .. | .. |
|---|
| 51 | 51 | }; |
|---|
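The 25% reservation above feeds the `async_depth` computation that appears later in `kyber_queue_data_alloc()` (`(1U << shift) * KYBER_ASYNC_PERCENT / 100U`). A minimal userspace sketch of that arithmetic, assuming the elided `KYBER_ASYNC_PERCENT` works out to `100 - 25 = 75` as the comment implies; the shift and tag counts are illustrative values, not taken from any particular device:

```c
#include <stdio.h>

/* Assumption: 100% minus the 25% reserved for synchronous requests. */
#define ASYNC_PERCENT	75

int main(void)
{
	unsigned int shift = 6;			/* hypothetical sched_tags shift */
	unsigned int depth = 1U << shift;	/* 64 schedulable tags per hctx */
	unsigned int async_depth = depth * ASYNC_PERCENT / 100U;

	/* 48 tags usable by async I/O, leaving 16 that only sync requests can take. */
	printf("async_depth = %u of %u\n", async_depth, depth);
	return 0;
}
```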
| 52 | 52 | |
|---|
| 53 | 53 | /* |
|---|
| 54 | | - * Initial device-wide depths for each scheduling domain. |
|---|
| 54 | + * Maximum device-wide depth for each scheduling domain. |
|---|
| 55 | 55 | * |
|---|
| 56 | | - * Even for fast devices with lots of tags like NVMe, you can saturate |
|---|
| 57 | | - * the device with only a fraction of the maximum possible queue depth. |
|---|
| 58 | | - * So, we cap these to a reasonable value. |
|---|
| 56 | + * Even for fast devices with lots of tags like NVMe, you can saturate the |
|---|
| 57 | + * device with only a fraction of the maximum possible queue depth. So, we cap |
|---|
| 58 | + * these to a reasonable value. |
|---|
| 59 | 59 | */ |
|---|
| 60 | 60 | static const unsigned int kyber_depth[] = { |
|---|
| 61 | 61 | [KYBER_READ] = 256, |
|---|
| 62 | | - [KYBER_SYNC_WRITE] = 128, |
|---|
| 63 | | - [KYBER_OTHER] = 64, |
|---|
| 62 | + [KYBER_WRITE] = 128, |
|---|
| 63 | + [KYBER_DISCARD] = 64, |
|---|
| 64 | + [KYBER_OTHER] = 16, |
|---|
| 64 | 65 | }; |
|---|
| 65 | 66 | |
|---|
| 66 | 67 | /* |
|---|
| 67 | | - * Scheduling domain batch sizes. We favor reads. |
|---|
| 68 | + * Default latency targets for each scheduling domain. |
|---|
| 69 | + */ |
|---|
| 70 | +static const u64 kyber_latency_targets[] = { |
|---|
| 71 | + [KYBER_READ] = 2ULL * NSEC_PER_MSEC, |
|---|
| 72 | + [KYBER_WRITE] = 10ULL * NSEC_PER_MSEC, |
|---|
| 73 | + [KYBER_DISCARD] = 5ULL * NSEC_PER_SEC, |
|---|
| 74 | +}; |
|---|
| 75 | + |
|---|
| 76 | +/* |
|---|
| 77 | + * Batch size (number of requests we'll dispatch in a row) for each scheduling |
|---|
| 78 | + * domain. |
|---|
| 68 | 79 | */ |
|---|
| 69 | 80 | static const unsigned int kyber_batch_size[] = { |
|---|
| 70 | 81 | [KYBER_READ] = 16, |
|---|
| 71 | | - [KYBER_SYNC_WRITE] = 8, |
|---|
| 72 | | - [KYBER_OTHER] = 8, |
|---|
| 82 | + [KYBER_WRITE] = 8, |
|---|
| 83 | + [KYBER_DISCARD] = 1, |
|---|
| 84 | + [KYBER_OTHER] = 1, |
|---|
| 85 | +}; |
|---|
| 86 | + |
|---|
| 87 | +/* |
|---|
| 88 | + * Request latencies are recorded in a histogram with buckets defined relative |
|---|
| 89 | + * to the target latency: |
|---|
| 90 | + * |
|---|
| 91 | + * <= 1/4 * target latency |
|---|
| 92 | + * <= 1/2 * target latency |
|---|
| 93 | + * <= 3/4 * target latency |
|---|
| 94 | + * <= target latency |
|---|
| 95 | + * <= 1 1/4 * target latency |
|---|
| 96 | + * <= 1 1/2 * target latency |
|---|
| 97 | + * <= 1 3/4 * target latency |
|---|
| 98 | + * > 1 3/4 * target latency |
|---|
| 99 | + */ |
|---|
| 100 | +enum { |
|---|
| 101 | + /* |
|---|
| 102 | + * The width of the latency histogram buckets is |
|---|
| 103 | + * 1 / (1 << KYBER_LATENCY_SHIFT) * target latency. |
|---|
| 104 | + */ |
|---|
| 105 | + KYBER_LATENCY_SHIFT = 2, |
|---|
| 106 | + /* |
|---|
| 107 | + * The first (1 << KYBER_LATENCY_SHIFT) buckets are <= target latency, |
|---|
| 108 | + * thus, "good". |
|---|
| 109 | + */ |
|---|
| 110 | + KYBER_GOOD_BUCKETS = 1 << KYBER_LATENCY_SHIFT, |
|---|
| 111 | + /* There are also (1 << KYBER_LATENCY_SHIFT) "bad" buckets. */ |
|---|
| 112 | + KYBER_LATENCY_BUCKETS = 2 << KYBER_LATENCY_SHIFT, |
|---|
| 113 | +}; |
|---|
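To make the bucket scheme concrete, here is a minimal, self-contained userspace sketch (not the kernel code itself) of how one sample maps to a bucket when `KYBER_LATENCY_SHIFT` is 2, mirroring the arithmetic used further down in `add_latency_sample()`: the bucket width is a quarter of the target, the first four buckets are "good", and anything beyond 1 3/4 of the target collapses into the last "bad" bucket. The helper name and sample values are illustrative only.

```c
#include <stdint.h>
#include <stdio.h>

#define KYBER_LATENCY_SHIFT	2
#define KYBER_LATENCY_BUCKETS	(2 << KYBER_LATENCY_SHIFT)

/* Map a latency sample (ns) to one of the eight buckets described above. */
static unsigned int latency_bucket(uint64_t target, uint64_t latency)
{
	uint64_t width = target >> KYBER_LATENCY_SHIFT;	/* target / 4 */
	uint64_t bucket;

	if (!latency)
		return 0;
	if (!width)
		width = 1;
	bucket = (latency - 1) / width;
	return bucket < KYBER_LATENCY_BUCKETS - 1 ?
	       (unsigned int)bucket : KYBER_LATENCY_BUCKETS - 1;
}

int main(void)
{
	uint64_t target = 2000000;	/* 2 ms read target, in nanoseconds */
	uint64_t samples[] = { 400000, 2000000, 2100000, 9000000 };

	/* Prints buckets 0, 3, 4, 7: the first two "good", the last two "bad". */
	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("%llu ns -> bucket %u\n",
		       (unsigned long long)samples[i],
		       latency_bucket(target, samples[i]));
	return 0;
}
```

The kernel version guards the divisor the same way (`max_t(u64, target >> KYBER_LATENCY_SHIFT, 1)`), so a tiny or zero target cannot cause a divide by zero.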
| 114 | + |
|---|
| 115 | +/* |
|---|
| 116 | + * We measure both the total latency and the I/O latency (i.e., latency after |
|---|
| 117 | + * submitting to the device). |
|---|
| 118 | + */ |
|---|
| 119 | +enum { |
|---|
| 120 | + KYBER_TOTAL_LATENCY, |
|---|
| 121 | + KYBER_IO_LATENCY, |
|---|
| 122 | +}; |
|---|
| 123 | + |
|---|
| 124 | +static const char *kyber_latency_type_names[] = { |
|---|
| 125 | + [KYBER_TOTAL_LATENCY] = "total", |
|---|
| 126 | + [KYBER_IO_LATENCY] = "I/O", |
|---|
| 127 | +}; |
|---|
| 128 | + |
|---|
| 129 | +/* |
|---|
| 130 | + * Per-cpu latency histograms: total latency and I/O latency for each scheduling |
|---|
| 131 | + * domain except for KYBER_OTHER. |
|---|
| 132 | + */ |
|---|
| 133 | +struct kyber_cpu_latency { |
|---|
| 134 | + atomic_t buckets[KYBER_OTHER][2][KYBER_LATENCY_BUCKETS]; |
|---|
| 73 | 135 | }; |
|---|
| 74 | 136 | |
|---|
| 75 | 137 | /* |
|---|
| .. | .. |
|---|
| 88 | 150 | struct kyber_queue_data { |
|---|
| 89 | 151 | struct request_queue *q; |
|---|
| 90 | 152 | |
|---|
| 91 | | - struct blk_stat_callback *cb; |
|---|
| 92 | | - |
|---|
| 93 | 153 | /* |
|---|
| 94 | | - * The device is divided into multiple scheduling domains based on the |
|---|
| 95 | | - * request type. Each domain has a fixed number of in-flight requests of |
|---|
| 96 | | - * that type device-wide, limited by these tokens. |
|---|
| 154 | + * Each scheduling domain has a limited number of in-flight requests |
|---|
| 155 | + * device-wide, capped by these tokens. |
|---|
| 97 | 156 | */ |
|---|
| 98 | 157 | struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS]; |
|---|
| 99 | 158 | |
|---|
| .. | .. |
|---|
| 103 | 162 | */ |
|---|
| 104 | 163 | unsigned int async_depth; |
|---|
| 105 | 164 | |
|---|
| 165 | + struct kyber_cpu_latency __percpu *cpu_latency; |
|---|
| 166 | + |
|---|
| 167 | + /* Timer for stats aggregation and adjusting domain tokens. */ |
|---|
| 168 | + struct timer_list timer; |
|---|
| 169 | + |
|---|
| 170 | + unsigned int latency_buckets[KYBER_OTHER][2][KYBER_LATENCY_BUCKETS]; |
|---|
| 171 | + |
|---|
| 172 | + unsigned long latency_timeout[KYBER_OTHER]; |
|---|
| 173 | + |
|---|
| 174 | + int domain_p99[KYBER_OTHER]; |
|---|
| 175 | + |
|---|
| 106 | 176 | /* Target latencies in nanoseconds. */ |
|---|
| 107 | | - u64 read_lat_nsec, write_lat_nsec; |
|---|
| 177 | + u64 latency_targets[KYBER_OTHER]; |
|---|
| 108 | 178 | }; |
|---|
| 109 | 179 | |
|---|
| 110 | 180 | struct kyber_hctx_data { |
|---|
| .. | .. |
|---|
| 114 | 184 | unsigned int batching; |
|---|
| 115 | 185 | struct kyber_ctx_queue *kcqs; |
|---|
| 116 | 186 | struct sbitmap kcq_map[KYBER_NUM_DOMAINS]; |
|---|
| 117 | | - wait_queue_entry_t domain_wait[KYBER_NUM_DOMAINS]; |
|---|
| 187 | + struct sbq_wait domain_wait[KYBER_NUM_DOMAINS]; |
|---|
| 118 | 188 | struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS]; |
|---|
| 119 | 189 | atomic_t wait_index[KYBER_NUM_DOMAINS]; |
|---|
| 120 | 190 | }; |
|---|
| .. | .. |
|---|
| 124 | 194 | |
|---|
| 125 | 195 | static unsigned int kyber_sched_domain(unsigned int op) |
|---|
| 126 | 196 | { |
|---|
| 127 | | - if ((op & REQ_OP_MASK) == REQ_OP_READ) |
|---|
| 197 | + switch (op & REQ_OP_MASK) { |
|---|
| 198 | + case REQ_OP_READ: |
|---|
| 128 | 199 | return KYBER_READ; |
|---|
| 129 | | - else if ((op & REQ_OP_MASK) == REQ_OP_WRITE && op_is_sync(op)) |
|---|
| 130 | | - return KYBER_SYNC_WRITE; |
|---|
| 131 | | - else |
|---|
| 200 | + case REQ_OP_WRITE: |
|---|
| 201 | + return KYBER_WRITE; |
|---|
| 202 | + case REQ_OP_DISCARD: |
|---|
| 203 | + return KYBER_DISCARD; |
|---|
| 204 | + default: |
|---|
| 132 | 205 | return KYBER_OTHER; |
|---|
| 206 | + } |
|---|
| 133 | 207 | } |
|---|
| 134 | 208 | |
|---|
| 135 | | -enum { |
|---|
| 136 | | - NONE = 0, |
|---|
| 137 | | - GOOD = 1, |
|---|
| 138 | | - GREAT = 2, |
|---|
| 139 | | - BAD = -1, |
|---|
| 140 | | - AWFUL = -2, |
|---|
| 141 | | -}; |
|---|
| 142 | | - |
|---|
| 143 | | -#define IS_GOOD(status) ((status) > 0) |
|---|
| 144 | | -#define IS_BAD(status) ((status) < 0) |
|---|
| 145 | | - |
|---|
| 146 | | -static int kyber_lat_status(struct blk_stat_callback *cb, |
|---|
| 147 | | - unsigned int sched_domain, u64 target) |
|---|
| 209 | +static void flush_latency_buckets(struct kyber_queue_data *kqd, |
|---|
| 210 | + struct kyber_cpu_latency *cpu_latency, |
|---|
| 211 | + unsigned int sched_domain, unsigned int type) |
|---|
| 148 | 212 | { |
|---|
| 149 | | - u64 latency; |
|---|
| 213 | + unsigned int *buckets = kqd->latency_buckets[sched_domain][type]; |
|---|
| 214 | + atomic_t *cpu_buckets = cpu_latency->buckets[sched_domain][type]; |
|---|
| 215 | + unsigned int bucket; |
|---|
| 150 | 216 | |
|---|
| 151 | | - if (!cb->stat[sched_domain].nr_samples) |
|---|
| 152 | | - return NONE; |
|---|
| 153 | | - |
|---|
| 154 | | - latency = cb->stat[sched_domain].mean; |
|---|
| 155 | | - if (latency >= 2 * target) |
|---|
| 156 | | - return AWFUL; |
|---|
| 157 | | - else if (latency > target) |
|---|
| 158 | | - return BAD; |
|---|
| 159 | | - else if (latency <= target / 2) |
|---|
| 160 | | - return GREAT; |
|---|
| 161 | | - else /* (latency <= target) */ |
|---|
| 162 | | - return GOOD; |
|---|
| 217 | + for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS; bucket++) |
|---|
| 218 | + buckets[bucket] += atomic_xchg(&cpu_buckets[bucket], 0); |
|---|
| 163 | 219 | } |
|---|
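`flush_latency_buckets()` drains a per-cpu histogram into the device-wide one with an atomic exchange, so a sample added concurrently by a completion on that CPU is either picked up by this flush or left in place for the next one; it is never lost or counted twice. A small userspace sketch of the same pattern using C11 atomics; the CPU count, bucket count, and helper names are illustrative, not taken from the kernel:

```c
#include <stdatomic.h>
#include <stdio.h>

#define NR_CPUS		4
#define NR_BUCKETS	8

/* One histogram per "CPU"; each producer only increments its own copy. */
static atomic_uint cpu_buckets[NR_CPUS][NR_BUCKETS];

/*
 * Drain every per-cpu histogram into a global one. The atomic exchange zeroes
 * each counter as it reads it, so concurrent increments land either before the
 * swap (collected now) or after it (collected on the next flush).
 */
static void flush(unsigned int buckets[NR_BUCKETS])
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		for (int b = 0; b < NR_BUCKETS; b++)
			buckets[b] += atomic_exchange(&cpu_buckets[cpu][b], 0);
}

int main(void)
{
	unsigned int buckets[NR_BUCKETS] = { 0 };

	atomic_fetch_add(&cpu_buckets[0][3], 10);
	atomic_fetch_add(&cpu_buckets[2][3], 5);
	flush(buckets);
	printf("bucket 3 holds %u samples\n", buckets[3]);	/* 15 */
	return 0;
}
```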
| 164 | 220 | |
|---|
| 165 | 221 | /* |
|---|
| 166 | | - * Adjust the read or synchronous write depth given the status of reads and |
|---|
| 167 | | - * writes. The goal is that the latencies of the two domains are fair (i.e., if |
|---|
| 168 | | - * one is good, then the other is good). |
|---|
| 222 | + * Calculate the histogram bucket with the given percentile rank, or -1 if there |
|---|
| 223 | + * aren't enough samples yet. |
|---|
| 169 | 224 | */ |
|---|
| 170 | | -static void kyber_adjust_rw_depth(struct kyber_queue_data *kqd, |
|---|
| 171 | | - unsigned int sched_domain, int this_status, |
|---|
| 172 | | - int other_status) |
|---|
| 225 | +static int calculate_percentile(struct kyber_queue_data *kqd, |
|---|
| 226 | + unsigned int sched_domain, unsigned int type, |
|---|
| 227 | + unsigned int percentile) |
|---|
| 173 | 228 | { |
|---|
| 174 | | - unsigned int orig_depth, depth; |
|---|
| 229 | + unsigned int *buckets = kqd->latency_buckets[sched_domain][type]; |
|---|
| 230 | + unsigned int bucket, samples = 0, percentile_samples; |
|---|
| 231 | + |
|---|
| 232 | + for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS; bucket++) |
|---|
| 233 | + samples += buckets[bucket]; |
|---|
| 234 | + |
|---|
| 235 | + if (!samples) |
|---|
| 236 | + return -1; |
|---|
| 175 | 237 | |
|---|
| 176 | 238 | /* |
|---|
| 177 | | - * If this domain had no samples, or reads and writes are both good or |
|---|
| 178 | | - * both bad, don't adjust the depth. |
|---|
| 239 | + * We do the calculation once we have 500 samples or one second passes |
|---|
| 240 | + * since the first sample was recorded, whichever comes first. |
|---|
| 179 | 241 | */ |
|---|
| 180 | | - if (this_status == NONE || |
|---|
| 181 | | - (IS_GOOD(this_status) && IS_GOOD(other_status)) || |
|---|
| 182 | | - (IS_BAD(this_status) && IS_BAD(other_status))) |
|---|
| 183 | | - return; |
|---|
| 184 | | - |
|---|
| 185 | | - orig_depth = depth = kqd->domain_tokens[sched_domain].sb.depth; |
|---|
| 186 | | - |
|---|
| 187 | | - if (other_status == NONE) { |
|---|
| 188 | | - depth++; |
|---|
| 189 | | - } else { |
|---|
| 190 | | - switch (this_status) { |
|---|
| 191 | | - case GOOD: |
|---|
| 192 | | - if (other_status == AWFUL) |
|---|
| 193 | | - depth -= max(depth / 4, 1U); |
|---|
| 194 | | - else |
|---|
| 195 | | - depth -= max(depth / 8, 1U); |
|---|
| 196 | | - break; |
|---|
| 197 | | - case GREAT: |
|---|
| 198 | | - if (other_status == AWFUL) |
|---|
| 199 | | - depth /= 2; |
|---|
| 200 | | - else |
|---|
| 201 | | - depth -= max(depth / 4, 1U); |
|---|
| 202 | | - break; |
|---|
| 203 | | - case BAD: |
|---|
| 204 | | - depth++; |
|---|
| 205 | | - break; |
|---|
| 206 | | - case AWFUL: |
|---|
| 207 | | - if (other_status == GREAT) |
|---|
| 208 | | - depth += 2; |
|---|
| 209 | | - else |
|---|
| 210 | | - depth++; |
|---|
| 211 | | - break; |
|---|
| 212 | | - } |
|---|
| 242 | + if (!kqd->latency_timeout[sched_domain]) |
|---|
| 243 | + kqd->latency_timeout[sched_domain] = max(jiffies + HZ, 1UL); |
|---|
| 244 | + if (samples < 500 && |
|---|
| 245 | + time_is_after_jiffies(kqd->latency_timeout[sched_domain])) { |
|---|
| 246 | + return -1; |
|---|
| 213 | 247 | } |
|---|
| 248 | + kqd->latency_timeout[sched_domain] = 0; |
|---|
| 214 | 249 | |
|---|
| 250 | + percentile_samples = DIV_ROUND_UP(samples * percentile, 100); |
|---|
| 251 | + for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS - 1; bucket++) { |
|---|
| 252 | + if (buckets[bucket] >= percentile_samples) |
|---|
| 253 | + break; |
|---|
| 254 | + percentile_samples -= buckets[bucket]; |
|---|
| 255 | + } |
|---|
| 256 | + memset(buckets, 0, sizeof(kqd->latency_buckets[sched_domain][type])); |
|---|
| 257 | + |
|---|
| 258 | + trace_kyber_latency(kqd->q, kyber_domain_names[sched_domain], |
|---|
| 259 | + kyber_latency_type_names[type], percentile, |
|---|
| 260 | + bucket + 1, 1 << KYBER_LATENCY_SHIFT, samples); |
|---|
| 261 | + |
|---|
| 262 | + return bucket; |
|---|
| 263 | +} |
|---|
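A minimal userspace sketch of the percentile walk above, leaving out the 500-sample/one-second gating: sum the histogram, round the target rank up, then walk buckets until the rank is covered. The function name and the sample distribution are made up for illustration; with `KYBER_LATENCY_SHIFT` = 2, any result of 4 or more (`KYBER_GOOD_BUCKETS`) means that percentile missed the latency target.

```c
#include <stdio.h>

#define KYBER_LATENCY_BUCKETS	8

/* Return the bucket holding the sample at the given percentile rank. */
static int percentile_bucket(const unsigned int buckets[KYBER_LATENCY_BUCKETS],
			     unsigned int percentile)
{
	unsigned int bucket, samples = 0, percentile_samples;

	for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS; bucket++)
		samples += buckets[bucket];
	if (!samples)
		return -1;

	/* Rank of the target sample, rounded up (DIV_ROUND_UP in the kernel). */
	percentile_samples = (samples * percentile + 99) / 100;
	for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS - 1; bucket++) {
		if (buckets[bucket] >= percentile_samples)
			break;
		percentile_samples -= buckets[bucket];
	}
	return bucket;
}

int main(void)
{
	/* 600 samples: most meet the target, but there is a slow tail. */
	unsigned int buckets[KYBER_LATENCY_BUCKETS] = {
		300, 150, 60, 30, 24, 18, 12, 6
	};

	printf("p90 bucket = %d\n", percentile_bucket(buckets, 90));	/* 3 */
	printf("p99 bucket = %d\n", percentile_bucket(buckets, 99));	/* 6 */
	return 0;
}
```

In this made-up distribution the p90 still lands in a "good" bucket while the p99 does not, which is the kind of split the timer below cares about when it checks for congestion and decides how to resize each domain.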
| 264 | + |
|---|
| 265 | +static void kyber_resize_domain(struct kyber_queue_data *kqd, |
|---|
| 266 | + unsigned int sched_domain, unsigned int depth) |
|---|
| 267 | +{ |
|---|
| 215 | 268 | depth = clamp(depth, 1U, kyber_depth[sched_domain]); |
|---|
| 216 | | - if (depth != orig_depth) |
|---|
| 269 | + if (depth != kqd->domain_tokens[sched_domain].sb.depth) { |
|---|
| 217 | 270 | sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth); |
|---|
| 271 | + trace_kyber_adjust(kqd->q, kyber_domain_names[sched_domain], |
|---|
| 272 | + depth); |
|---|
| 273 | + } |
|---|
| 218 | 274 | } |
|---|
| 219 | 275 | |
|---|
| 220 | | -/* |
|---|
| 221 | | - * Adjust the depth of other requests given the status of reads and synchronous |
|---|
| 222 | | - * writes. As long as either domain is doing fine, we don't throttle, but if |
|---|
| 223 | | - * both domains are doing badly, we throttle heavily. |
|---|
| 224 | | - */ |
|---|
| 225 | | -static void kyber_adjust_other_depth(struct kyber_queue_data *kqd, |
|---|
| 226 | | - int read_status, int write_status, |
|---|
| 227 | | - bool have_samples) |
|---|
| 276 | +static void kyber_timer_fn(struct timer_list *t) |
|---|
| 228 | 277 | { |
|---|
| 229 | | - unsigned int orig_depth, depth; |
|---|
| 230 | | - int status; |
|---|
| 278 | + struct kyber_queue_data *kqd = from_timer(kqd, t, timer); |
|---|
| 279 | + unsigned int sched_domain; |
|---|
| 280 | + int cpu; |
|---|
| 281 | + bool bad = false; |
|---|
| 231 | 282 | |
|---|
| 232 | | - orig_depth = depth = kqd->domain_tokens[KYBER_OTHER].sb.depth; |
|---|
| 283 | + /* Sum all of the per-cpu latency histograms. */ |
|---|
| 284 | + for_each_online_cpu(cpu) { |
|---|
| 285 | + struct kyber_cpu_latency *cpu_latency; |
|---|
| 233 | 286 | |
|---|
| 234 | | - if (read_status == NONE && write_status == NONE) { |
|---|
| 235 | | - depth += 2; |
|---|
| 236 | | - } else if (have_samples) { |
|---|
| 237 | | - if (read_status == NONE) |
|---|
| 238 | | - status = write_status; |
|---|
| 239 | | - else if (write_status == NONE) |
|---|
| 240 | | - status = read_status; |
|---|
| 241 | | - else |
|---|
| 242 | | - status = max(read_status, write_status); |
|---|
| 243 | | - switch (status) { |
|---|
| 244 | | - case GREAT: |
|---|
| 245 | | - depth += 2; |
|---|
| 246 | | - break; |
|---|
| 247 | | - case GOOD: |
|---|
| 248 | | - depth++; |
|---|
| 249 | | - break; |
|---|
| 250 | | - case BAD: |
|---|
| 251 | | - depth -= max(depth / 4, 1U); |
|---|
| 252 | | - break; |
|---|
| 253 | | - case AWFUL: |
|---|
| 254 | | - depth /= 2; |
|---|
| 255 | | - break; |
|---|
| 287 | + cpu_latency = per_cpu_ptr(kqd->cpu_latency, cpu); |
|---|
| 288 | + for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) { |
|---|
| 289 | + flush_latency_buckets(kqd, cpu_latency, sched_domain, |
|---|
| 290 | + KYBER_TOTAL_LATENCY); |
|---|
| 291 | + flush_latency_buckets(kqd, cpu_latency, sched_domain, |
|---|
| 292 | + KYBER_IO_LATENCY); |
|---|
| 256 | 293 | } |
|---|
| 257 | 294 | } |
|---|
| 258 | 295 | |
|---|
| 259 | | - depth = clamp(depth, 1U, kyber_depth[KYBER_OTHER]); |
|---|
| 260 | | - if (depth != orig_depth) |
|---|
| 261 | | - sbitmap_queue_resize(&kqd->domain_tokens[KYBER_OTHER], depth); |
|---|
| 262 | | -} |
|---|
| 296 | + /* |
|---|
| 297 | + * Check if any domains have a high I/O latency, which might indicate |
|---|
| 298 | + * congestion in the device. Note that we use the p90; we don't want to |
|---|
| 299 | + * be too sensitive to outliers here. |
|---|
| 300 | + */ |
|---|
| 301 | + for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) { |
|---|
| 302 | + int p90; |
|---|
| 263 | 303 | |
|---|
| 264 | | -/* |
|---|
| 265 | | - * Apply heuristics for limiting queue depths based on gathered latency |
|---|
| 266 | | - * statistics. |
|---|
| 267 | | - */ |
|---|
| 268 | | -static void kyber_stat_timer_fn(struct blk_stat_callback *cb) |
|---|
| 269 | | -{ |
|---|
| 270 | | - struct kyber_queue_data *kqd = cb->data; |
|---|
| 271 | | - int read_status, write_status; |
|---|
| 272 | | - |
|---|
| 273 | | - read_status = kyber_lat_status(cb, KYBER_READ, kqd->read_lat_nsec); |
|---|
| 274 | | - write_status = kyber_lat_status(cb, KYBER_SYNC_WRITE, kqd->write_lat_nsec); |
|---|
| 275 | | - |
|---|
| 276 | | - kyber_adjust_rw_depth(kqd, KYBER_READ, read_status, write_status); |
|---|
| 277 | | - kyber_adjust_rw_depth(kqd, KYBER_SYNC_WRITE, write_status, read_status); |
|---|
| 278 | | - kyber_adjust_other_depth(kqd, read_status, write_status, |
|---|
| 279 | | - cb->stat[KYBER_OTHER].nr_samples != 0); |
|---|
| 304 | + p90 = calculate_percentile(kqd, sched_domain, KYBER_IO_LATENCY, |
|---|
| 305 | + 90); |
|---|
| 306 | + if (p90 >= KYBER_GOOD_BUCKETS) |
|---|
| 307 | + bad = true; |
|---|
| 308 | + } |
|---|
| 280 | 309 | |
|---|
| 281 | 310 | /* |
|---|
| 282 | | - * Continue monitoring latencies if we aren't hitting the targets or |
|---|
| 283 | | - * we're still throttling other requests. |
|---|
| 311 | + * Adjust the scheduling domain depths. If we determined that there was |
|---|
| 312 | + * congestion, we throttle all domains with good latencies. Either way, |
|---|
| 313 | + * we ease up on throttling domains with bad latencies. |
|---|
| 284 | 314 | */ |
|---|
| 285 | | - if (!blk_stat_is_active(kqd->cb) && |
|---|
| 286 | | - ((IS_BAD(read_status) || IS_BAD(write_status) || |
|---|
| 287 | | - kqd->domain_tokens[KYBER_OTHER].sb.depth < kyber_depth[KYBER_OTHER]))) |
|---|
| 288 | | - blk_stat_activate_msecs(kqd->cb, 100); |
|---|
| 315 | + for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) { |
|---|
| 316 | + unsigned int orig_depth, depth; |
|---|
| 317 | + int p99; |
|---|
| 318 | + |
|---|
| 319 | + p99 = calculate_percentile(kqd, sched_domain, |
|---|
| 320 | + KYBER_TOTAL_LATENCY, 99); |
|---|
| 321 | + /* |
|---|
| 322 | + * This is kind of subtle: different domains will not |
|---|
| 323 | + * necessarily have enough samples to calculate the latency |
|---|
| 324 | + * percentiles during the same window, so we have to remember |
|---|
| 325 | + * the p99 for the next time we observe congestion; once we do, |
|---|
| 326 | + * we don't want to throttle again until we get more data, so we |
|---|
| 327 | + * reset it to -1. |
|---|
| 328 | + */ |
|---|
| 329 | + if (bad) { |
|---|
| 330 | + if (p99 < 0) |
|---|
| 331 | + p99 = kqd->domain_p99[sched_domain]; |
|---|
| 332 | + kqd->domain_p99[sched_domain] = -1; |
|---|
| 333 | + } else if (p99 >= 0) { |
|---|
| 334 | + kqd->domain_p99[sched_domain] = p99; |
|---|
| 335 | + } |
|---|
| 336 | + if (p99 < 0) |
|---|
| 337 | + continue; |
|---|
| 338 | + |
|---|
| 339 | + /* |
|---|
| 340 | + * If this domain has bad latency, throttle less. Otherwise, |
|---|
| 341 | + * throttle more iff we determined that there is congestion. |
|---|
| 342 | + * |
|---|
| 343 | + * The new depth is scaled linearly with the p99 latency vs the |
|---|
| 344 | + * latency target. E.g., if the p99 is 3/4 of the target, then |
|---|
| 345 | + * we throttle down to 3/4 of the current depth, and if the p99 |
|---|
| 346 | + * is 2x the target, then we double the depth. |
|---|
| 347 | + */ |
|---|
| 348 | + if (bad || p99 >= KYBER_GOOD_BUCKETS) { |
|---|
| 349 | + orig_depth = kqd->domain_tokens[sched_domain].sb.depth; |
|---|
| 350 | + depth = (orig_depth * (p99 + 1)) >> KYBER_LATENCY_SHIFT; |
|---|
| 351 | + kyber_resize_domain(kqd, sched_domain, depth); |
|---|
| 352 | + } |
|---|
| 353 | + } |
|---|
| 289 | 354 | } |
|---|
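The scaling rule described in the comment above ("throttle down to 3/4 of the current depth ... double the depth") falls out of `(orig_depth * (p99 + 1)) >> KYBER_LATENCY_SHIFT`, where p99 is a bucket index, and it is only applied when congestion was detected or the domain itself is over target. A tiny sketch of that arithmetic; the clamp to `[1, kyber_depth[domain]]` that `kyber_resize_domain()` performs afterwards is only noted in a comment here:

```c
#include <stdio.h>

#define KYBER_LATENCY_SHIFT	2

/* New token depth for a domain, given its p99 latency bucket index. */
static unsigned int scaled_depth(unsigned int depth, int p99_bucket)
{
	/* kyber_resize_domain() additionally clamps to [1, kyber_depth[domain]]. */
	return (depth * (p99_bucket + 1)) >> KYBER_LATENCY_SHIFT;
}

int main(void)
{
	printf("%u\n", scaled_depth(256, 2));	/* p99 <= 3/4 of target: 192 */
	printf("%u\n", scaled_depth(256, 3));	/* p99 <= target: unchanged, 256 */
	printf("%u\n", scaled_depth(256, 7));	/* p99 > 1 3/4 target: 512, then clamped */
	return 0;
}
```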
| 290 | 355 | |
|---|
| 291 | | -static unsigned int kyber_sched_tags_shift(struct kyber_queue_data *kqd) |
|---|
| 356 | +static unsigned int kyber_sched_tags_shift(struct request_queue *q) |
|---|
| 292 | 357 | { |
|---|
| 293 | 358 | /* |
|---|
| 294 | 359 | * All of the hardware queues have the same depth, so we can just grab |
|---|
| 295 | 360 | * the shift of the first one. |
|---|
| 296 | 361 | */ |
|---|
| 297 | | - return kqd->q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift; |
|---|
| 298 | | -} |
|---|
| 299 | | - |
|---|
| 300 | | -static int kyber_bucket_fn(const struct request *rq) |
|---|
| 301 | | -{ |
|---|
| 302 | | - return kyber_sched_domain(rq->cmd_flags); |
|---|
| 362 | + return q->queue_hw_ctx[0]->sched_tags->bitmap_tags->sb.shift; |
|---|
| 303 | 363 | } |
|---|
| 304 | 364 | |
|---|
| 305 | 365 | static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q) |
|---|
| 306 | 366 | { |
|---|
| 307 | 367 | struct kyber_queue_data *kqd; |
|---|
| 308 | | - unsigned int max_tokens; |
|---|
| 309 | 368 | unsigned int shift; |
|---|
| 310 | 369 | int ret = -ENOMEM; |
|---|
| 311 | 370 | int i; |
|---|
| 312 | 371 | |
|---|
| 313 | | - kqd = kmalloc_node(sizeof(*kqd), GFP_KERNEL, q->node); |
|---|
| 372 | + kqd = kzalloc_node(sizeof(*kqd), GFP_KERNEL, q->node); |
|---|
| 314 | 373 | if (!kqd) |
|---|
| 315 | 374 | goto err; |
|---|
| 375 | + |
|---|
| 316 | 376 | kqd->q = q; |
|---|
| 317 | 377 | |
|---|
| 318 | | - kqd->cb = blk_stat_alloc_callback(kyber_stat_timer_fn, kyber_bucket_fn, |
|---|
| 319 | | - KYBER_NUM_DOMAINS, kqd); |
|---|
| 320 | | - if (!kqd->cb) |
|---|
| 378 | + kqd->cpu_latency = alloc_percpu_gfp(struct kyber_cpu_latency, |
|---|
| 379 | + GFP_KERNEL | __GFP_ZERO); |
|---|
| 380 | + if (!kqd->cpu_latency) |
|---|
| 321 | 381 | goto err_kqd; |
|---|
| 322 | 382 | |
|---|
| 323 | | - /* |
|---|
| 324 | | - * The maximum number of tokens for any scheduling domain is at least |
|---|
| 325 | | - * the queue depth of a single hardware queue. If the hardware doesn't |
|---|
| 326 | | - * have many tags, still provide a reasonable number. |
|---|
| 327 | | - */ |
|---|
| 328 | | - max_tokens = max_t(unsigned int, q->tag_set->queue_depth, |
|---|
| 329 | | - KYBER_MIN_DEPTH); |
|---|
| 383 | + timer_setup(&kqd->timer, kyber_timer_fn, 0); |
|---|
| 384 | + |
|---|
| 330 | 385 | for (i = 0; i < KYBER_NUM_DOMAINS; i++) { |
|---|
| 331 | 386 | WARN_ON(!kyber_depth[i]); |
|---|
| 332 | 387 | WARN_ON(!kyber_batch_size[i]); |
|---|
| 333 | 388 | ret = sbitmap_queue_init_node(&kqd->domain_tokens[i], |
|---|
| 334 | | - max_tokens, -1, false, GFP_KERNEL, |
|---|
| 335 | | - q->node); |
|---|
| 389 | + kyber_depth[i], -1, false, |
|---|
| 390 | + GFP_KERNEL, q->node); |
|---|
| 336 | 391 | if (ret) { |
|---|
| 337 | 392 | while (--i >= 0) |
|---|
| 338 | 393 | sbitmap_queue_free(&kqd->domain_tokens[i]); |
|---|
| 339 | | - goto err_cb; |
|---|
| 394 | + goto err_buckets; |
|---|
| 340 | 395 | } |
|---|
| 341 | | - sbitmap_queue_resize(&kqd->domain_tokens[i], kyber_depth[i]); |
|---|
| 342 | 396 | } |
|---|
| 343 | 397 | |
|---|
| 344 | | - shift = kyber_sched_tags_shift(kqd); |
|---|
| 345 | | - kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U; |
|---|
| 398 | + for (i = 0; i < KYBER_OTHER; i++) { |
|---|
| 399 | + kqd->domain_p99[i] = -1; |
|---|
| 400 | + kqd->latency_targets[i] = kyber_latency_targets[i]; |
|---|
| 401 | + } |
|---|
| 346 | 402 | |
|---|
| 347 | | - kqd->read_lat_nsec = 2000000ULL; |
|---|
| 348 | | - kqd->write_lat_nsec = 10000000ULL; |
|---|
| 403 | + shift = kyber_sched_tags_shift(q); |
|---|
| 404 | + kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U; |
|---|
| 349 | 405 | |
|---|
| 350 | 406 | return kqd; |
|---|
| 351 | 407 | |
|---|
| 352 | | -err_cb: |
|---|
| 353 | | - blk_stat_free_callback(kqd->cb); |
|---|
| 408 | +err_buckets: |
|---|
| 409 | + free_percpu(kqd->cpu_latency); |
|---|
| 354 | 410 | err_kqd: |
|---|
| 355 | 411 | kfree(kqd); |
|---|
| 356 | 412 | err: |
|---|
| .. | .. |
|---|
| 372 | 428 | return PTR_ERR(kqd); |
|---|
| 373 | 429 | } |
|---|
| 374 | 430 | |
|---|
| 431 | + blk_stat_enable_accounting(q); |
|---|
| 432 | + |
|---|
| 375 | 433 | eq->elevator_data = kqd; |
|---|
| 376 | 434 | q->elevator = eq; |
|---|
| 377 | | - |
|---|
| 378 | | - blk_stat_add_callback(q, kqd->cb); |
|---|
| 379 | 435 | |
|---|
| 380 | 436 | return 0; |
|---|
| 381 | 437 | } |
|---|
| .. | .. |
|---|
| 383 | 439 | static void kyber_exit_sched(struct elevator_queue *e) |
|---|
| 384 | 440 | { |
|---|
| 385 | 441 | struct kyber_queue_data *kqd = e->elevator_data; |
|---|
| 386 | | - struct request_queue *q = kqd->q; |
|---|
| 387 | 442 | int i; |
|---|
| 388 | 443 | |
|---|
| 389 | | - blk_stat_remove_callback(q, kqd->cb); |
|---|
| 444 | + del_timer_sync(&kqd->timer); |
|---|
| 390 | 445 | |
|---|
| 391 | 446 | for (i = 0; i < KYBER_NUM_DOMAINS; i++) |
|---|
| 392 | 447 | sbitmap_queue_free(&kqd->domain_tokens[i]); |
|---|
| 393 | | - blk_stat_free_callback(kqd->cb); |
|---|
| 448 | + free_percpu(kqd->cpu_latency); |
|---|
| 394 | 449 | kfree(kqd); |
|---|
| 395 | 450 | } |
|---|
| 396 | 451 | |
|---|
| .. | .. |
|---|
| 435 | 490 | |
|---|
| 436 | 491 | for (i = 0; i < KYBER_NUM_DOMAINS; i++) { |
|---|
| 437 | 492 | INIT_LIST_HEAD(&khd->rqs[i]); |
|---|
| 438 | | - init_waitqueue_func_entry(&khd->domain_wait[i], |
|---|
| 493 | + khd->domain_wait[i].sbq = NULL; |
|---|
| 494 | + init_waitqueue_func_entry(&khd->domain_wait[i].wait, |
|---|
| 439 | 495 | kyber_domain_wake); |
|---|
| 440 | | - khd->domain_wait[i].private = hctx; |
|---|
| 441 | | - INIT_LIST_HEAD(&khd->domain_wait[i].entry); |
|---|
| 496 | + khd->domain_wait[i].wait.private = hctx; |
|---|
| 497 | + INIT_LIST_HEAD(&khd->domain_wait[i].wait.entry); |
|---|
| 442 | 498 | atomic_set(&khd->wait_index[i], 0); |
|---|
| 443 | 499 | } |
|---|
| 444 | 500 | |
|---|
| .. | .. |
|---|
| 446 | 502 | khd->batching = 0; |
|---|
| 447 | 503 | |
|---|
| 448 | 504 | hctx->sched_data = khd; |
|---|
| 449 | | - sbitmap_queue_min_shallow_depth(&hctx->sched_tags->bitmap_tags, |
|---|
| 505 | + sbitmap_queue_min_shallow_depth(hctx->sched_tags->bitmap_tags, |
|---|
| 450 | 506 | kqd->async_depth); |
|---|
| 451 | 507 | |
|---|
| 452 | 508 | return 0; |
|---|
| .. | .. |
|---|
| 506 | 562 | } |
|---|
| 507 | 563 | } |
|---|
| 508 | 564 | |
|---|
| 509 | | -static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) |
|---|
| 565 | +static bool kyber_bio_merge(struct request_queue *q, struct bio *bio, |
|---|
| 566 | + unsigned int nr_segs) |
|---|
| 510 | 567 | { |
|---|
| 568 | + struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); |
|---|
| 569 | + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx); |
|---|
| 511 | 570 | struct kyber_hctx_data *khd = hctx->sched_data; |
|---|
| 512 | | - struct blk_mq_ctx *ctx = blk_mq_get_ctx(hctx->queue); |
|---|
| 513 | | - struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw]; |
|---|
| 571 | + struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw[hctx->type]]; |
|---|
| 514 | 572 | unsigned int sched_domain = kyber_sched_domain(bio->bi_opf); |
|---|
| 515 | 573 | struct list_head *rq_list = &kcq->rq_list[sched_domain]; |
|---|
| 516 | 574 | bool merged; |
|---|
| 517 | 575 | |
|---|
| 518 | 576 | spin_lock(&kcq->lock); |
|---|
| 519 | | - merged = blk_mq_bio_list_merge(hctx->queue, rq_list, bio); |
|---|
| 577 | + merged = blk_bio_list_merge(hctx->queue, rq_list, bio, nr_segs); |
|---|
| 520 | 578 | spin_unlock(&kcq->lock); |
|---|
| 521 | | - blk_mq_put_ctx(ctx); |
|---|
| 522 | 579 | |
|---|
| 523 | 580 | return merged; |
|---|
| 524 | 581 | } |
|---|
| 525 | 582 | |
|---|
| 526 | | -static void kyber_prepare_request(struct request *rq, struct bio *bio) |
|---|
| 583 | +static void kyber_prepare_request(struct request *rq) |
|---|
| 527 | 584 | { |
|---|
| 528 | 585 | rq_set_domain_token(rq, -1); |
|---|
| 529 | 586 | } |
|---|
| .. | .. |
|---|
| 536 | 593 | |
|---|
| 537 | 594 | list_for_each_entry_safe(rq, next, rq_list, queuelist) { |
|---|
| 538 | 595 | unsigned int sched_domain = kyber_sched_domain(rq->cmd_flags); |
|---|
| 539 | | - struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw]; |
|---|
| 596 | + struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw[hctx->type]]; |
|---|
| 540 | 597 | struct list_head *head = &kcq->rq_list[sched_domain]; |
|---|
| 541 | 598 | |
|---|
| 542 | 599 | spin_lock(&kcq->lock); |
|---|
| .. | .. |
|---|
| 545 | 602 | else |
|---|
| 546 | 603 | list_move_tail(&rq->queuelist, head); |
|---|
| 547 | 604 | sbitmap_set_bit(&khd->kcq_map[sched_domain], |
|---|
| 548 | | - rq->mq_ctx->index_hw); |
|---|
| 605 | + rq->mq_ctx->index_hw[hctx->type]); |
|---|
| 549 | 606 | blk_mq_sched_request_inserted(rq); |
|---|
| 550 | 607 | spin_unlock(&kcq->lock); |
|---|
| 551 | 608 | } |
|---|
| .. | .. |
|---|
| 558 | 615 | rq_clear_domain_token(kqd, rq); |
|---|
| 559 | 616 | } |
|---|
| 560 | 617 | |
|---|
| 561 | | -static void kyber_completed_request(struct request *rq) |
|---|
| 618 | +static void add_latency_sample(struct kyber_cpu_latency *cpu_latency, |
|---|
| 619 | + unsigned int sched_domain, unsigned int type, |
|---|
| 620 | + u64 target, u64 latency) |
|---|
| 562 | 621 | { |
|---|
| 563 | | - struct request_queue *q = rq->q; |
|---|
| 564 | | - struct kyber_queue_data *kqd = q->elevator->elevator_data; |
|---|
| 565 | | - unsigned int sched_domain; |
|---|
| 566 | | - u64 now, latency, target; |
|---|
| 622 | + unsigned int bucket; |
|---|
| 623 | + u64 divisor; |
|---|
| 567 | 624 | |
|---|
| 568 | | - /* |
|---|
| 569 | | - * Check if this request met our latency goal. If not, quickly gather |
|---|
| 570 | | - * some statistics and start throttling. |
|---|
| 571 | | - */ |
|---|
| 572 | | - sched_domain = kyber_sched_domain(rq->cmd_flags); |
|---|
| 573 | | - switch (sched_domain) { |
|---|
| 574 | | - case KYBER_READ: |
|---|
| 575 | | - target = kqd->read_lat_nsec; |
|---|
| 576 | | - break; |
|---|
| 577 | | - case KYBER_SYNC_WRITE: |
|---|
| 578 | | - target = kqd->write_lat_nsec; |
|---|
| 579 | | - break; |
|---|
| 580 | | - default: |
|---|
| 581 | | - return; |
|---|
| 625 | + if (latency > 0) { |
|---|
| 626 | + divisor = max_t(u64, target >> KYBER_LATENCY_SHIFT, 1); |
|---|
| 627 | + bucket = min_t(unsigned int, div64_u64(latency - 1, divisor), |
|---|
| 628 | + KYBER_LATENCY_BUCKETS - 1); |
|---|
| 629 | + } else { |
|---|
| 630 | + bucket = 0; |
|---|
| 582 | 631 | } |
|---|
| 583 | 632 | |
|---|
| 584 | | - /* If we are already monitoring latencies, don't check again. */ |
|---|
| 585 | | - if (blk_stat_is_active(kqd->cb)) |
|---|
| 633 | + atomic_inc(&cpu_latency->buckets[sched_domain][type][bucket]); |
|---|
| 634 | +} |
|---|
| 635 | + |
|---|
| 636 | +static void kyber_completed_request(struct request *rq, u64 now) |
|---|
| 637 | +{ |
|---|
| 638 | + struct kyber_queue_data *kqd = rq->q->elevator->elevator_data; |
|---|
| 639 | + struct kyber_cpu_latency *cpu_latency; |
|---|
| 640 | + unsigned int sched_domain; |
|---|
| 641 | + u64 target; |
|---|
| 642 | + |
|---|
| 643 | + sched_domain = kyber_sched_domain(rq->cmd_flags); |
|---|
| 644 | + if (sched_domain == KYBER_OTHER) |
|---|
| 586 | 645 | return; |
|---|
| 587 | 646 | |
|---|
| 588 | | - now = ktime_get_ns(); |
|---|
| 589 | | - if (now < rq->io_start_time_ns) |
|---|
| 590 | | - return; |
|---|
| 647 | + cpu_latency = get_cpu_ptr(kqd->cpu_latency); |
|---|
| 648 | + target = kqd->latency_targets[sched_domain]; |
|---|
| 649 | + add_latency_sample(cpu_latency, sched_domain, KYBER_TOTAL_LATENCY, |
|---|
| 650 | + target, now - rq->start_time_ns); |
|---|
| 651 | + add_latency_sample(cpu_latency, sched_domain, KYBER_IO_LATENCY, target, |
|---|
| 652 | + now - rq->io_start_time_ns); |
|---|
| 653 | + put_cpu_ptr(kqd->cpu_latency); |
|---|
| 591 | 654 | |
|---|
| 592 | | - latency = now - rq->io_start_time_ns; |
|---|
| 593 | | - |
|---|
| 594 | | - if (latency > target) |
|---|
| 595 | | - blk_stat_activate_msecs(kqd->cb, 10); |
|---|
| 655 | + timer_reduce(&kqd->timer, jiffies + HZ / 10); |
|---|
| 596 | 656 | } |
|---|
| 597 | 657 | |
|---|
| 598 | 658 | struct flush_kcq_data { |
|---|
| .. | .. |
|---|
| 629 | 689 | flush_busy_kcq, &data); |
|---|
| 630 | 690 | } |
|---|
| 631 | 691 | |
|---|
| 632 | | -static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags, |
|---|
| 692 | +static int kyber_domain_wake(wait_queue_entry_t *wqe, unsigned mode, int flags, |
|---|
| 633 | 693 | void *key) |
|---|
| 634 | 694 | { |
|---|
| 635 | | - struct blk_mq_hw_ctx *hctx = READ_ONCE(wait->private); |
|---|
| 695 | + struct blk_mq_hw_ctx *hctx = READ_ONCE(wqe->private); |
|---|
| 696 | + struct sbq_wait *wait = container_of(wqe, struct sbq_wait, wait); |
|---|
| 636 | 697 | |
|---|
| 637 | | - list_del_init(&wait->entry); |
|---|
| 698 | + sbitmap_del_wait_queue(wait); |
|---|
| 638 | 699 | blk_mq_run_hw_queue(hctx, true); |
|---|
| 639 | 700 | return 1; |
|---|
| 640 | 701 | } |
|---|
| .. | .. |
|---|
| 645 | 706 | { |
|---|
| 646 | 707 | unsigned int sched_domain = khd->cur_domain; |
|---|
| 647 | 708 | struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain]; |
|---|
| 648 | | - wait_queue_entry_t *wait = &khd->domain_wait[sched_domain]; |
|---|
| 709 | + struct sbq_wait *wait = &khd->domain_wait[sched_domain]; |
|---|
| 649 | 710 | struct sbq_wait_state *ws; |
|---|
| 650 | 711 | int nr; |
|---|
| 651 | 712 | |
|---|
| .. | .. |
|---|
| 656 | 717 | * run when one becomes available. Note that this is serialized on |
|---|
| 657 | 718 | * khd->lock, but we still need to be careful about the waker. |
|---|
| 658 | 719 | */ |
|---|
| 659 | | - if (nr < 0 && list_empty_careful(&wait->entry)) { |
|---|
| 720 | + if (nr < 0 && list_empty_careful(&wait->wait.entry)) { |
|---|
| 660 | 721 | ws = sbq_wait_ptr(domain_tokens, |
|---|
| 661 | 722 | &khd->wait_index[sched_domain]); |
|---|
| 662 | 723 | khd->domain_ws[sched_domain] = ws; |
|---|
| 663 | | - add_wait_queue(&ws->wait, wait); |
|---|
| 724 | + sbitmap_add_wait_queue(domain_tokens, ws, wait); |
|---|
| 664 | 725 | |
|---|
| 665 | 726 | /* |
|---|
| 666 | 727 | * Try again in case a token was freed before we got on the wait |
|---|
| .. | .. |
|---|
| 676 | 737 | * between the !list_empty_careful() check and us grabbing the lock, but |
|---|
| 677 | 738 | * list_del_init() is okay with that. |
|---|
| 678 | 739 | */ |
|---|
| 679 | | - if (nr >= 0 && !list_empty_careful(&wait->entry)) { |
|---|
| 740 | + if (nr >= 0 && !list_empty_careful(&wait->wait.entry)) { |
|---|
| 680 | 741 | ws = khd->domain_ws[sched_domain]; |
|---|
| 681 | 742 | spin_lock_irq(&ws->wait.lock); |
|---|
| 682 | | - list_del_init(&wait->entry); |
|---|
| 743 | + sbitmap_del_wait_queue(wait); |
|---|
| 683 | 744 | spin_unlock_irq(&ws->wait.lock); |
|---|
| 684 | 745 | } |
|---|
| 685 | 746 | |
|---|
| .. | .. |
|---|
| 713 | 774 | rq_set_domain_token(rq, nr); |
|---|
| 714 | 775 | list_del_init(&rq->queuelist); |
|---|
| 715 | 776 | return rq; |
|---|
| 777 | + } else { |
|---|
| 778 | + trace_kyber_throttled(kqd->q, |
|---|
| 779 | + kyber_domain_names[khd->cur_domain]); |
|---|
| 716 | 780 | } |
|---|
| 717 | 781 | } else if (sbitmap_any_bit_set(&khd->kcq_map[khd->cur_domain])) { |
|---|
| 718 | 782 | nr = kyber_get_domain_token(kqd, khd, hctx); |
|---|
| .. | .. |
|---|
| 723 | 787 | rq_set_domain_token(rq, nr); |
|---|
| 724 | 788 | list_del_init(&rq->queuelist); |
|---|
| 725 | 789 | return rq; |
|---|
| 790 | + } else { |
|---|
| 791 | + trace_kyber_throttled(kqd->q, |
|---|
| 792 | + kyber_domain_names[khd->cur_domain]); |
|---|
| 726 | 793 | } |
|---|
| 727 | 794 | } |
|---|
| 728 | 795 | |
|---|
| .. | .. |
|---|
| 790 | 857 | return false; |
|---|
| 791 | 858 | } |
|---|
| 792 | 859 | |
|---|
| 793 | | -#define KYBER_LAT_SHOW_STORE(op) \ |
|---|
| 794 | | -static ssize_t kyber_##op##_lat_show(struct elevator_queue *e, \ |
|---|
| 795 | | - char *page) \ |
|---|
| 860 | +#define KYBER_LAT_SHOW_STORE(domain, name) \ |
|---|
| 861 | +static ssize_t kyber_##name##_lat_show(struct elevator_queue *e, \ |
|---|
| 862 | + char *page) \ |
|---|
| 796 | 863 | { \ |
|---|
| 797 | 864 | struct kyber_queue_data *kqd = e->elevator_data; \ |
|---|
| 798 | 865 | \ |
|---|
| 799 | | - return sprintf(page, "%llu\n", kqd->op##_lat_nsec); \ |
|---|
| 866 | + return sprintf(page, "%llu\n", kqd->latency_targets[domain]); \ |
|---|
| 800 | 867 | } \ |
|---|
| 801 | 868 | \ |
|---|
| 802 | | -static ssize_t kyber_##op##_lat_store(struct elevator_queue *e, \ |
|---|
| 803 | | - const char *page, size_t count) \ |
|---|
| 869 | +static ssize_t kyber_##name##_lat_store(struct elevator_queue *e, \ |
|---|
| 870 | + const char *page, size_t count) \ |
|---|
| 804 | 871 | { \ |
|---|
| 805 | 872 | struct kyber_queue_data *kqd = e->elevator_data; \ |
|---|
| 806 | 873 | unsigned long long nsec; \ |
|---|
| .. | .. |
|---|
| 810 | 877 | if (ret) \ |
|---|
| 811 | 878 | return ret; \ |
|---|
| 812 | 879 | \ |
|---|
| 813 | | - kqd->op##_lat_nsec = nsec; \ |
|---|
| 880 | + kqd->latency_targets[domain] = nsec; \ |
|---|
| 814 | 881 | \ |
|---|
| 815 | 882 | return count; \ |
|---|
| 816 | 883 | } |
|---|
| 817 | | -KYBER_LAT_SHOW_STORE(read); |
|---|
| 818 | | -KYBER_LAT_SHOW_STORE(write); |
|---|
| 884 | +KYBER_LAT_SHOW_STORE(KYBER_READ, read); |
|---|
| 885 | +KYBER_LAT_SHOW_STORE(KYBER_WRITE, write); |
|---|
| 819 | 886 | #undef KYBER_LAT_SHOW_STORE |
|---|
| 820 | 887 | |
|---|
| 821 | 888 | #define KYBER_LAT_ATTR(op) __ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store) |
|---|
| .. | .. |
|---|
| 876 | 943 | { \ |
|---|
| 877 | 944 | struct blk_mq_hw_ctx *hctx = data; \ |
|---|
| 878 | 945 | struct kyber_hctx_data *khd = hctx->sched_data; \ |
|---|
| 879 | | - wait_queue_entry_t *wait = &khd->domain_wait[domain]; \ |
|---|
| 946 | + wait_queue_entry_t *wait = &khd->domain_wait[domain].wait; \ |
|---|
| 880 | 947 | \ |
|---|
| 881 | 948 | seq_printf(m, "%d\n", !list_empty_careful(&wait->entry)); \ |
|---|
| 882 | 949 | return 0; \ |
|---|
| 883 | 950 | } |
|---|
| 884 | 951 | KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_READ, read) |
|---|
| 885 | | -KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_SYNC_WRITE, sync_write) |
|---|
| 952 | +KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_WRITE, write) |
|---|
| 953 | +KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_DISCARD, discard) |
|---|
| 886 | 954 | KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_OTHER, other) |
|---|
| 887 | 955 | #undef KYBER_DEBUGFS_DOMAIN_ATTRS |
|---|
| 888 | 956 | |
|---|
| .. | .. |
|---|
| 900 | 968 | struct blk_mq_hw_ctx *hctx = data; |
|---|
| 901 | 969 | struct kyber_hctx_data *khd = hctx->sched_data; |
|---|
| 902 | 970 | |
|---|
| 903 | | - switch (khd->cur_domain) { |
|---|
| 904 | | - case KYBER_READ: |
|---|
| 905 | | - seq_puts(m, "READ\n"); |
|---|
| 906 | | - break; |
|---|
| 907 | | - case KYBER_SYNC_WRITE: |
|---|
| 908 | | - seq_puts(m, "SYNC_WRITE\n"); |
|---|
| 909 | | - break; |
|---|
| 910 | | - case KYBER_OTHER: |
|---|
| 911 | | - seq_puts(m, "OTHER\n"); |
|---|
| 912 | | - break; |
|---|
| 913 | | - default: |
|---|
| 914 | | - seq_printf(m, "%u\n", khd->cur_domain); |
|---|
| 915 | | - break; |
|---|
| 916 | | - } |
|---|
| 971 | + seq_printf(m, "%s\n", kyber_domain_names[khd->cur_domain]); |
|---|
| 917 | 972 | return 0; |
|---|
| 918 | 973 | } |
|---|
| 919 | 974 | |
|---|
| .. | .. |
|---|
| 930 | 985 | {#name "_tokens", 0400, kyber_##name##_tokens_show} |
|---|
| 931 | 986 | static const struct blk_mq_debugfs_attr kyber_queue_debugfs_attrs[] = { |
|---|
| 932 | 987 | KYBER_QUEUE_DOMAIN_ATTRS(read), |
|---|
| 933 | | - KYBER_QUEUE_DOMAIN_ATTRS(sync_write), |
|---|
| 988 | + KYBER_QUEUE_DOMAIN_ATTRS(write), |
|---|
| 989 | + KYBER_QUEUE_DOMAIN_ATTRS(discard), |
|---|
| 934 | 990 | KYBER_QUEUE_DOMAIN_ATTRS(other), |
|---|
| 935 | 991 | {"async_depth", 0400, kyber_async_depth_show}, |
|---|
| 936 | 992 | {}, |
|---|
| .. | .. |
|---|
| 942 | 998 | {#name "_waiting", 0400, kyber_##name##_waiting_show} |
|---|
| 943 | 999 | static const struct blk_mq_debugfs_attr kyber_hctx_debugfs_attrs[] = { |
|---|
| 944 | 1000 | KYBER_HCTX_DOMAIN_ATTRS(read), |
|---|
| 945 | | - KYBER_HCTX_DOMAIN_ATTRS(sync_write), |
|---|
| 1001 | + KYBER_HCTX_DOMAIN_ATTRS(write), |
|---|
| 1002 | + KYBER_HCTX_DOMAIN_ATTRS(discard), |
|---|
| 946 | 1003 | KYBER_HCTX_DOMAIN_ATTRS(other), |
|---|
| 947 | 1004 | {"cur_domain", 0400, kyber_cur_domain_show}, |
|---|
| 948 | 1005 | {"batching", 0400, kyber_batching_show}, |
|---|
| .. | .. |
|---|
| 952 | 1009 | #endif |
|---|
| 953 | 1010 | |
|---|
| 954 | 1011 | static struct elevator_type kyber_sched = { |
|---|
| 955 | | - .ops.mq = { |
|---|
| 1012 | + .ops = { |
|---|
| 956 | 1013 | .init_sched = kyber_init_sched, |
|---|
| 957 | 1014 | .exit_sched = kyber_exit_sched, |
|---|
| 958 | 1015 | .init_hctx = kyber_init_hctx, |
|---|
| .. | .. |
|---|
| 967 | 1024 | .dispatch_request = kyber_dispatch_request, |
|---|
| 968 | 1025 | .has_work = kyber_has_work, |
|---|
| 969 | 1026 | }, |
|---|
| 970 | | - .uses_mq = true, |
|---|
| 971 | 1027 | #ifdef CONFIG_BLK_DEBUG_FS |
|---|
| 972 | 1028 | .queue_debugfs_attrs = kyber_queue_debugfs_attrs, |
|---|
| 973 | 1029 | .hctx_debugfs_attrs = kyber_hctx_debugfs_attrs, |
|---|
| 974 | 1030 | #endif |
|---|
| 975 | 1031 | .elevator_attrs = kyber_sched_attrs, |
|---|
| 976 | 1032 | .elevator_name = "kyber", |
|---|
| 1033 | + .elevator_features = ELEVATOR_F_MQ_AWARE, |
|---|
| 977 | 1034 | .elevator_owner = THIS_MODULE, |
|---|
| 978 | 1035 | }; |
|---|
| 979 | 1036 | |
|---|