.. | ..
| 1 | +// SPDX-License-Identifier: GPL-2.0
1 | 2 | /*
2 | 3 |  * blk-mq scheduling framework
3 | 4 |  *
.. | ..
6 | 7 | #include <linux/kernel.h>
7 | 8 | #include <linux/module.h>
8 | 9 | #include <linux/blk-mq.h>
| 10 | +#include <linux/list_sort.h>
9 | 11 |
10 | 12 | #include <trace/events/block.h>
11 | 13 |
.. | ..
16 | 18 | #include "blk-mq-tag.h"
17 | 19 | #include "blk-wbt.h"
18 | 20 |
19 | | -void blk_mq_sched_free_hctx_data(struct request_queue *q,
20 | | -				 void (*exit)(struct blk_mq_hw_ctx *))
21 | | -{
22 | | -	struct blk_mq_hw_ctx *hctx;
23 | | -	int i;
24 | | -
25 | | -	queue_for_each_hw_ctx(q, hctx, i) {
26 | | -		if (exit && hctx->sched_data)
27 | | -			exit(hctx);
28 | | -		kfree(hctx->sched_data);
29 | | -		hctx->sched_data = NULL;
30 | | -	}
31 | | -}
32 | | -EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data);
33 | | -
34 | | -void blk_mq_sched_assign_ioc(struct request *rq, struct bio *bio)
| 21 | +void blk_mq_sched_assign_ioc(struct request *rq)
35 | 22 | {
36 | 23 | 	struct request_queue *q = rq->q;
37 | | -	struct io_context *ioc = rq_ioc(bio);
| 24 | +	struct io_context *ioc;
38 | 25 | 	struct io_cq *icq;
39 | 26 |
40 | | -	spin_lock_irq(q->queue_lock);
| 27 | +	/*
| 28 | +	 * May not have an IO context if it's a passthrough request
| 29 | +	 */
| 30 | +	ioc = current->io_context;
| 31 | +	if (!ioc)
| 32 | +		return;
| 33 | +
| 34 | +	spin_lock_irq(&q->queue_lock);
41 | 35 | 	icq = ioc_lookup_icq(ioc, q);
42 | | -	spin_unlock_irq(q->queue_lock);
| 36 | +	spin_unlock_irq(&q->queue_lock);
43 | 37 |
44 | 38 | 	if (!icq) {
45 | 39 | 		icq = ioc_create_icq(ioc, q, GFP_ATOMIC);
.. | ..
81 | 75 | 	blk_mq_run_hw_queue(hctx, true);
82 | 76 | }
83 | 77 |
| 78 | +static int sched_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
| 79 | +{
| 80 | +	struct request *rqa = container_of(a, struct request, queuelist);
| 81 | +	struct request *rqb = container_of(b, struct request, queuelist);
| 82 | +
| 83 | +	return rqa->mq_hctx > rqb->mq_hctx;
| 84 | +}
| 85 | +
| 86 | +static bool blk_mq_dispatch_hctx_list(struct list_head *rq_list)
| 87 | +{
| 88 | +	struct blk_mq_hw_ctx *hctx =
| 89 | +		list_first_entry(rq_list, struct request, queuelist)->mq_hctx;
| 90 | +	struct request *rq;
| 91 | +	LIST_HEAD(hctx_list);
| 92 | +	unsigned int count = 0;
| 93 | +
| 94 | +	list_for_each_entry(rq, rq_list, queuelist) {
| 95 | +		if (rq->mq_hctx != hctx) {
| 96 | +			list_cut_before(&hctx_list, rq_list, &rq->queuelist);
| 97 | +			goto dispatch;
| 98 | +		}
| 99 | +		count++;
| 100 | +	}
| 101 | +	list_splice_tail_init(rq_list, &hctx_list);
| 102 | +
| 103 | +dispatch:
| 104 | +	return blk_mq_dispatch_rq_list(hctx, &hctx_list, count);
| 105 | +}
| 106 | +
| 107 | +#define BLK_MQ_BUDGET_DELAY	3		/* ms units */
| 108 | +
84 | 109 | /*
85 | 110 |  * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
86 | 111 |  * its queue by itself in its completion handler, so we don't need to
87 | 112 |  * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
| 113 | + *
| 114 | + * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to
| 115 | + * be run again. This is necessary to avoid starving flushes.
88 | 116 |  */
89 | | -static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
| 117 | +static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
90 | 118 | {
91 | 119 | 	struct request_queue *q = hctx->queue;
92 | 120 | 	struct elevator_queue *e = q->elevator;
| 121 | +	bool multi_hctxs = false, run_queue = false;
| 122 | +	bool dispatched = false, busy = false;
| 123 | +	unsigned int max_dispatch;
93 | 124 | 	LIST_HEAD(rq_list);
| 125 | +	int count = 0;
| 126 | +
| 127 | +	if (hctx->dispatch_busy)
| 128 | +		max_dispatch = 1;
| 129 | +	else
| 130 | +		max_dispatch = hctx->queue->nr_requests;
94 | 131 |
95 | 132 | 	do {
96 | 133 | 		struct request *rq;
97 | 134 |
98 | | -		if (e->type->ops.mq.has_work &&
99 | | -		    !e->type->ops.mq.has_work(hctx))
| 135 | +		if (e->type->ops.has_work && !e->type->ops.has_work(hctx))
100 | 136 | 			break;
101 | 137 |
102 | | -		if (!blk_mq_get_dispatch_budget(hctx))
| 138 | +		if (!list_empty_careful(&hctx->dispatch)) {
| 139 | +			busy = true;
| 140 | +			break;
| 141 | +		}
| 142 | +
| 143 | +		if (!blk_mq_get_dispatch_budget(q))
103 | 144 | 			break;
104 | 145 |
105 | | -		rq = e->type->ops.mq.dispatch_request(hctx);
| 146 | +		rq = e->type->ops.dispatch_request(hctx);
106 | 147 | 		if (!rq) {
107 | | -			blk_mq_put_dispatch_budget(hctx);
| 148 | +			blk_mq_put_dispatch_budget(q);
| 149 | +			/*
| 150 | +			 * We're releasing without dispatching. Holding the
| 151 | +			 * budget could have blocked any "hctx"s with the
| 152 | +			 * same queue and if we didn't dispatch then there's
| 153 | +			 * no guarantee anyone will kick the queue. Kick it
| 154 | +			 * ourselves.
| 155 | +			 */
| 156 | +			run_queue = true;
108 | 157 | 			break;
109 | 158 | 		}
110 | 159 |
.. | ..
113 | 162 | 		 * if this rq won't be queued to driver via .queue_rq()
114 | 163 | 		 * in blk_mq_dispatch_rq_list().
115 | 164 | 		 */
116 | | -		list_add(&rq->queuelist, &rq_list);
117 | | -	} while (blk_mq_dispatch_rq_list(q, &rq_list, true));
| 165 | +		list_add_tail(&rq->queuelist, &rq_list);
| 166 | +		if (rq->mq_hctx != hctx)
| 167 | +			multi_hctxs = true;
| 168 | +	} while (++count < max_dispatch);
| 169 | +
| 170 | +	if (!count) {
| 171 | +		if (run_queue)
| 172 | +			blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY);
| 173 | +	} else if (multi_hctxs) {
| 174 | +		/*
| 175 | +		 * Requests from different hctx may be dequeued from some
| 176 | +		 * schedulers, such as bfq and deadline.
| 177 | +		 *
| 178 | +		 * Sort the requests in the list according to their hctx,
| 179 | +		 * dispatch batching requests from same hctx at a time.
| 180 | +		 */
| 181 | +		list_sort(NULL, &rq_list, sched_rq_cmp);
| 182 | +		do {
| 183 | +			dispatched |= blk_mq_dispatch_hctx_list(&rq_list);
| 184 | +		} while (!list_empty(&rq_list));
| 185 | +	} else {
| 186 | +		dispatched = blk_mq_dispatch_rq_list(hctx, &rq_list, count);
| 187 | +	}
| 188 | +
| 189 | +	if (busy)
| 190 | +		return -EAGAIN;
| 191 | +	return !!dispatched;
| 192 | +}
| 193 | +
| 194 | +static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
| 195 | +{
| 196 | +	unsigned long end = jiffies + HZ;
| 197 | +	int ret;
| 198 | +
| 199 | +	do {
| 200 | +		ret = __blk_mq_do_dispatch_sched(hctx);
| 201 | +		if (ret != 1)
| 202 | +			break;
| 203 | +		if (need_resched() || time_is_before_jiffies(end)) {
| 204 | +			blk_mq_delay_run_hw_queue(hctx, 0);
| 205 | +			break;
| 206 | +		}
| 207 | +	} while (1);
| 208 | +
| 209 | +	return ret;
118 | 210 | }
119 | 211 |
120 | 212 | static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
121 | 213 | 					  struct blk_mq_ctx *ctx)
122 | 214 | {
123 | | -	unsigned idx = ctx->index_hw;
| 215 | +	unsigned short idx = ctx->index_hw[hctx->type];
124 | 216 |
125 | 217 | 	if (++idx == hctx->nr_ctx)
126 | 218 | 		idx = 0;
.. | ..
132 | 224 |  * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
133 | 225 |  * its queue by itself in its completion handler, so we don't need to
134 | 226 |  * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
| 227 | + *
| 228 | + * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to
| 229 | + * be run again. This is necessary to avoid starving flushes.
135 | 230 |  */
136 | | -static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
| 231 | +static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
137 | 232 | {
138 | 233 | 	struct request_queue *q = hctx->queue;
139 | 234 | 	LIST_HEAD(rq_list);
140 | 235 | 	struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
| 236 | +	int ret = 0;
| 237 | +	struct request *rq;
141 | 238 |
142 | 239 | 	do {
143 | | -		struct request *rq;
| 240 | +		if (!list_empty_careful(&hctx->dispatch)) {
| 241 | +			ret = -EAGAIN;
| 242 | +			break;
| 243 | +		}
144 | 244 |
145 | 245 | 		if (!sbitmap_any_bit_set(&hctx->ctx_map))
146 | 246 | 			break;
147 | 247 |
148 | | -		if (!blk_mq_get_dispatch_budget(hctx))
| 248 | +		if (!blk_mq_get_dispatch_budget(q))
149 | 249 | 			break;
150 | 250 |
151 | 251 | 		rq = blk_mq_dequeue_from_ctx(hctx, ctx);
152 | 252 | 		if (!rq) {
153 | | -			blk_mq_put_dispatch_budget(hctx);
| 253 | +			blk_mq_put_dispatch_budget(q);
| 254 | +			/*
| 255 | +			 * We're releasing without dispatching. Holding the
| 256 | +			 * budget could have blocked any "hctx"s with the
| 257 | +			 * same queue and if we didn't dispatch then there's
| 258 | +			 * no guarantee anyone will kick the queue. Kick it
| 259 | +			 * ourselves.
| 260 | +			 */
| 261 | +			blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY);
154 | 262 | 			break;
155 | 263 | 		}
156 | 264 |
.. | ..
164 | 272 | 		/* round robin for fair dispatch */
165 | 273 | 		ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);
166 | 274 |
167 | | -	} while (blk_mq_dispatch_rq_list(q, &rq_list, true));
| 275 | +	} while (blk_mq_dispatch_rq_list(rq->mq_hctx, &rq_list, 1));
168 | 276 |
169 | 277 | 	WRITE_ONCE(hctx->dispatch_from, ctx);
| 278 | +	return ret;
170 | 279 | }
171 | 280 |
172 | | -void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
| 281 | +static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
173 | 282 | {
174 | 283 | 	struct request_queue *q = hctx->queue;
175 | 284 | 	struct elevator_queue *e = q->elevator;
176 | | -	const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request;
| 285 | +	const bool has_sched_dispatch = e && e->type->ops.dispatch_request;
| 286 | +	int ret = 0;
177 | 287 | 	LIST_HEAD(rq_list);
178 | | -
179 | | -	/* RCU or SRCU read lock is needed before checking quiesced flag */
180 | | -	if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
181 | | -		return;
182 | | -
183 | | -	hctx->run++;
184 | 288 |
185 | 289 | 	/*
186 | 290 | 	 * If we have previous entries on our dispatch list, grab them first for
.. | ..
208 | 312 | 	 */
209 | 313 | 	if (!list_empty(&rq_list)) {
210 | 314 | 		blk_mq_sched_mark_restart_hctx(hctx);
211 | | -		if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
| 315 | +		if (blk_mq_dispatch_rq_list(hctx, &rq_list, 0)) {
212 | 316 | 			if (has_sched_dispatch)
213 | | -				blk_mq_do_dispatch_sched(hctx);
| 317 | +				ret = blk_mq_do_dispatch_sched(hctx);
214 | 318 | 			else
215 | | -				blk_mq_do_dispatch_ctx(hctx);
| 319 | +				ret = blk_mq_do_dispatch_ctx(hctx);
216 | 320 | 		}
217 | 321 | 	} else if (has_sched_dispatch) {
218 | | -		blk_mq_do_dispatch_sched(hctx);
| 322 | +		ret = blk_mq_do_dispatch_sched(hctx);
219 | 323 | 	} else if (hctx->dispatch_busy) {
220 | 324 | 		/* dequeue request one by one from sw queue if queue is busy */
221 | | -		blk_mq_do_dispatch_ctx(hctx);
| 325 | +		ret = blk_mq_do_dispatch_ctx(hctx);
222 | 326 | 	} else {
223 | 327 | 		blk_mq_flush_busy_ctxs(hctx, &rq_list);
224 | | -		blk_mq_dispatch_rq_list(q, &rq_list, false);
| 328 | +		blk_mq_dispatch_rq_list(hctx, &rq_list, 0);
225 | 329 | 	}
| 330 | +
| 331 | +	return ret;
226 | 332 | }
227 | 333 |
228 | | -bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
229 | | -			    struct request **merged_request)
| 334 | +void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
230 | 335 | {
231 | | -	struct request *rq;
| 336 | +	struct request_queue *q = hctx->queue;
232 | 337 |
233 | | -	switch (elv_merge(q, &rq, bio)) {
234 | | -	case ELEVATOR_BACK_MERGE:
235 | | -		if (!blk_mq_sched_allow_merge(q, rq, bio))
236 | | -			return false;
237 | | -		if (!bio_attempt_back_merge(q, rq, bio))
238 | | -			return false;
239 | | -		*merged_request = attempt_back_merge(q, rq);
240 | | -		if (!*merged_request)
241 | | -			elv_merged_request(q, rq, ELEVATOR_BACK_MERGE);
242 | | -		return true;
243 | | -	case ELEVATOR_FRONT_MERGE:
244 | | -		if (!blk_mq_sched_allow_merge(q, rq, bio))
245 | | -			return false;
246 | | -		if (!bio_attempt_front_merge(q, rq, bio))
247 | | -			return false;
248 | | -		*merged_request = attempt_front_merge(q, rq);
249 | | -		if (!*merged_request)
250 | | -			elv_merged_request(q, rq, ELEVATOR_FRONT_MERGE);
251 | | -		return true;
252 | | -	case ELEVATOR_DISCARD_MERGE:
253 | | -		return bio_attempt_discard_merge(q, rq, bio);
254 | | -	default:
255 | | -		return false;
| 338 | +	/* RCU or SRCU read lock is needed before checking quiesced flag */
| 339 | +	if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
| 340 | +		return;
| 341 | +
| 342 | +	hctx->run++;
| 343 | +
| 344 | +	/*
| 345 | +	 * A return of -EAGAIN is an indication that hctx->dispatch is not
| 346 | +	 * empty and we must run again in order to avoid starving flushes.
| 347 | +	 */
| 348 | +	if (__blk_mq_sched_dispatch_requests(hctx) == -EAGAIN) {
| 349 | +		if (__blk_mq_sched_dispatch_requests(hctx) == -EAGAIN)
| 350 | +			blk_mq_run_hw_queue(hctx, true);
256 | 351 | 	}
257 | 352 | }
258 | | -EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge);
259 | 353 |
260 | | -/*
261 | | - * Iterate list of requests and see if we can merge this bio with any
262 | | - * of them.
263 | | - */
264 | | -bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list,
265 | | -			   struct bio *bio)
266 | | -{
267 | | -	struct request *rq;
268 | | -	int checked = 8;
269 | | -
270 | | -	list_for_each_entry_reverse(rq, list, queuelist) {
271 | | -		bool merged = false;
272 | | -
273 | | -		if (!checked--)
274 | | -			break;
275 | | -
276 | | -		if (!blk_rq_merge_ok(rq, bio))
277 | | -			continue;
278 | | -
279 | | -		switch (blk_try_merge(rq, bio)) {
280 | | -		case ELEVATOR_BACK_MERGE:
281 | | -			if (blk_mq_sched_allow_merge(q, rq, bio))
282 | | -				merged = bio_attempt_back_merge(q, rq, bio);
283 | | -			break;
284 | | -		case ELEVATOR_FRONT_MERGE:
285 | | -			if (blk_mq_sched_allow_merge(q, rq, bio))
286 | | -				merged = bio_attempt_front_merge(q, rq, bio);
287 | | -			break;
288 | | -		case ELEVATOR_DISCARD_MERGE:
289 | | -			merged = bio_attempt_discard_merge(q, rq, bio);
290 | | -			break;
291 | | -		default:
292 | | -			continue;
293 | | -		}
294 | | -
295 | | -		return merged;
296 | | -	}
297 | | -
298 | | -	return false;
299 | | -}
300 | | -EXPORT_SYMBOL_GPL(blk_mq_bio_list_merge);
301 | | -
302 | | -/*
303 | | - * Reverse check our software queue for entries that we could potentially
304 | | - * merge with. Currently includes a hand-wavy stop count of 8, to not spend
305 | | - * too much time checking for merges.
306 | | - */
307 | | -static bool blk_mq_attempt_merge(struct request_queue *q,
308 | | -				 struct blk_mq_ctx *ctx, struct bio *bio)
309 | | -{
310 | | -	lockdep_assert_held(&ctx->lock);
311 | | -
312 | | -	if (blk_mq_bio_list_merge(q, &ctx->rq_list, bio)) {
313 | | -		ctx->rq_merged++;
314 | | -		return true;
315 | | -	}
316 | | -
317 | | -	return false;
318 | | -}
319 | | -
320 | | -bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
| 354 | +bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
| 355 | +		unsigned int nr_segs)
321 | 356 | {
322 | 357 | 	struct elevator_queue *e = q->elevator;
323 | | -	struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
324 | | -	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
| 358 | +	struct blk_mq_ctx *ctx;
| 359 | +	struct blk_mq_hw_ctx *hctx;
325 | 360 | 	bool ret = false;
| 361 | +	enum hctx_type type;
326 | 362 |
327 | | -	if (e && e->type->ops.mq.bio_merge) {
328 | | -		blk_mq_put_ctx(ctx);
329 | | -		return e->type->ops.mq.bio_merge(hctx, bio);
| 363 | +	if (e && e->type->ops.bio_merge)
| 364 | +		return e->type->ops.bio_merge(q, bio, nr_segs);
| 365 | +
| 366 | +	ctx = blk_mq_get_ctx(q);
| 367 | +	hctx = blk_mq_map_queue(q, bio->bi_opf, ctx);
| 368 | +	type = hctx->type;
| 369 | +	if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE) ||
| 370 | +	    list_empty_careful(&ctx->rq_lists[type]))
| 371 | +		return false;
| 372 | +
| 373 | +	/* default per sw-queue merge */
| 374 | +	spin_lock(&ctx->lock);
| 375 | +	/*
| 376 | +	 * Reverse check our software queue for entries that we could
| 377 | +	 * potentially merge with. Currently includes a hand-wavy stop
| 378 | +	 * count of 8, to not spend too much time checking for merges.
| 379 | +	 */
| 380 | +	if (blk_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs)) {
| 381 | +		ctx->rq_merged++;
| 382 | +		ret = true;
330 | 383 | 	}
331 | 384 |
332 | | -	if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
333 | | -	    !list_empty_careful(&ctx->rq_list)) {
334 | | -		/* default per sw-queue merge */
335 | | -		spin_lock(&ctx->lock);
336 | | -		ret = blk_mq_attempt_merge(q, ctx, bio);
337 | | -		spin_unlock(&ctx->lock);
338 | | -	}
| 385 | +	spin_unlock(&ctx->lock);
339 | 386 |
340 | | -	blk_mq_put_ctx(ctx);
341 | 387 | 	return ret;
342 | 388 | }
343 | 389 |
.. | ..
357 | 403 | 				  bool has_sched,
358 | 404 | 				  struct request *rq)
359 | 405 | {
360 | | -	/* dispatch flush rq directly */
361 | | -	if (rq->rq_flags & RQF_FLUSH_SEQ) {
362 | | -		spin_lock(&hctx->lock);
363 | | -		list_add(&rq->queuelist, &hctx->dispatch);
364 | | -		spin_unlock(&hctx->lock);
| 406 | +	/*
| 407 | +	 * dispatch flush and passthrough rq directly
| 408 | +	 *
| 409 | +	 * passthrough request has to be added to hctx->dispatch directly.
| 410 | +	 * For some reason, device may be in one situation which can't
| 411 | +	 * handle FS request, so STS_RESOURCE is always returned and the
| 412 | +	 * FS request will be added to hctx->dispatch. However passthrough
| 413 | +	 * request may be required at that time for fixing the problem. If
| 414 | +	 * passthrough request is added to scheduler queue, there isn't any
| 415 | +	 * chance to dispatch it given we prioritize requests in hctx->dispatch.
| 416 | +	 */
| 417 | +	if ((rq->rq_flags & RQF_FLUSH_SEQ) || blk_rq_is_passthrough(rq))
365 | 418 | 		return true;
366 | | -	}
367 | 419 |
368 | 420 | 	if (has_sched)
369 | 421 | 		rq->rq_flags |= RQF_SORTED;
.. | ..
377 | 429 | 	struct request_queue *q = rq->q;
378 | 430 | 	struct elevator_queue *e = q->elevator;
379 | 431 | 	struct blk_mq_ctx *ctx = rq->mq_ctx;
380 | | -	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
| 432 | +	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
381 | 433 |
382 | | -	/* flush rq in flush machinery need to be dispatched directly */
383 | | -	if (!(rq->rq_flags & RQF_FLUSH_SEQ) && op_is_flush(rq->cmd_flags)) {
384 | | -		blk_insert_flush(rq);
| 434 | +	WARN_ON(e && (rq->tag != BLK_MQ_NO_TAG));
| 435 | +
| 436 | +	if (blk_mq_sched_bypass_insert(hctx, !!e, rq)) {
| 437 | +		/*
| 438 | +		 * Firstly normal IO request is inserted to scheduler queue or
| 439 | +		 * sw queue, meantime we add flush request to dispatch queue(
| 440 | +		 * hctx->dispatch) directly and there is at most one in-flight
| 441 | +		 * flush request for each hw queue, so it doesn't matter to add
| 442 | +		 * flush request to tail or front of the dispatch queue.
| 443 | +		 *
| 444 | +		 * Secondly in case of NCQ, flush request belongs to non-NCQ
| 445 | +		 * command, and queueing it will fail when there is any
| 446 | +		 * in-flight normal IO request(NCQ command). When adding flush
| 447 | +		 * rq to the front of hctx->dispatch, it is easier to introduce
| 448 | +		 * extra time to flush rq's latency because of S_SCHED_RESTART
| 449 | +		 * compared with adding to the tail of dispatch queue, then
| 450 | +		 * chance of flush merge is increased, and less flush requests
| 451 | +		 * will be issued to controller. It is observed that ~10% time
| 452 | +		 * is saved in blktests block/004 on disk attached to AHCI/NCQ
| 453 | +		 * drive when adding flush rq to the front of hctx->dispatch.
| 454 | +		 *
| 455 | +		 * Simply queue flush rq to the front of hctx->dispatch so that
| 456 | +		 * intensive flush workloads can benefit in case of NCQ HW.
| 457 | +		 */
| 458 | +		at_head = (rq->rq_flags & RQF_FLUSH_SEQ) ? true : at_head;
| 459 | +		blk_mq_request_bypass_insert(rq, at_head, false);
385 | 460 | 		goto run;
386 | 461 | 	}
387 | 462 |
388 | | -	WARN_ON(e && (rq->tag != -1));
389 | | -
390 | | -	if (blk_mq_sched_bypass_insert(hctx, !!e, rq))
391 | | -		goto run;
392 | | -
393 | | -	if (e && e->type->ops.mq.insert_requests) {
| 463 | +	if (e && e->type->ops.insert_requests) {
394 | 464 | 		LIST_HEAD(list);
395 | 465 |
396 | 466 | 		list_add(&rq->queuelist, &list);
397 | | -		e->type->ops.mq.insert_requests(hctx, &list, at_head);
| 467 | +		e->type->ops.insert_requests(hctx, &list, at_head);
398 | 468 | 	} else {
399 | 469 | 		spin_lock(&ctx->lock);
400 | 470 | 		__blk_mq_insert_request(hctx, rq, at_head);
.. | ..
406 | 476 | 	blk_mq_run_hw_queue(hctx, async);
407 | 477 | }
408 | 478 |
409 | | -void blk_mq_sched_insert_requests(struct request_queue *q,
| 479 | +void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx,
410 | 480 | 				  struct blk_mq_ctx *ctx,
411 | 481 | 				  struct list_head *list, bool run_queue_async)
412 | 482 | {
413 | | -	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
414 | | -	struct elevator_queue *e = hctx->queue->elevator;
| 483 | +	struct elevator_queue *e;
| 484 | +	struct request_queue *q = hctx->queue;
415 | 485 |
416 | | -	if (e && e->type->ops.mq.insert_requests)
417 | | -		e->type->ops.mq.insert_requests(hctx, list, false);
| 486 | +	/*
| 487 | +	 * blk_mq_sched_insert_requests() is called from flush plug
| 488 | +	 * context only, and hold one usage counter to prevent queue
| 489 | +	 * from being released.
| 490 | +	 */
| 491 | +	percpu_ref_get(&q->q_usage_counter);
| 492 | +
| 493 | +	e = hctx->queue->elevator;
| 494 | +	if (e && e->type->ops.insert_requests)
| 495 | +		e->type->ops.insert_requests(hctx, list, false);
418 | 496 | 	else {
419 | 497 | 		/*
420 | 498 | 		 * try to issue requests directly if the hw queue isn't
.. | ..
424 | 502 | 		if (!hctx->dispatch_busy && !e && !run_queue_async) {
425 | 503 | 			blk_mq_try_issue_list_directly(hctx, list);
426 | 504 | 			if (list_empty(list))
427 | | -				return;
| 505 | +				goto out;
428 | 506 | 		}
429 | 507 | 		blk_mq_insert_requests(hctx, ctx, list);
430 | 508 | 	}
431 | 509 |
432 | 510 | 	blk_mq_run_hw_queue(hctx, run_queue_async);
| 511 | + out:
| 512 | +	percpu_ref_put(&q->q_usage_counter);
433 | 513 | }
434 | 514 |
435 | 515 | static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set,
436 | 516 | 				   struct blk_mq_hw_ctx *hctx,
437 | 517 | 				   unsigned int hctx_idx)
438 | 518 | {
| 519 | +	unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED;
| 520 | +
439 | 521 | 	if (hctx->sched_tags) {
440 | 522 | 		blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx);
441 | | -		blk_mq_free_rq_map(hctx->sched_tags);
| 523 | +		blk_mq_free_rq_map(hctx->sched_tags, flags);
442 | 524 | 		hctx->sched_tags = NULL;
443 | 525 | 	}
444 | 526 | }
.. | ..
448 | 530 | 				   unsigned int hctx_idx)
449 | 531 | {
450 | 532 | 	struct blk_mq_tag_set *set = q->tag_set;
| 533 | +	/* Clear HCTX_SHARED so tags are init'ed */
| 534 | +	unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED;
451 | 535 | 	int ret;
452 | 536 |
453 | 537 | 	hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests,
454 | | -					       set->reserved_tags);
| 538 | +					       set->reserved_tags, flags);
455 | 539 | 	if (!hctx->sched_tags)
456 | 540 | 		return -ENOMEM;
457 | 541 |
.. | ..
462 | 546 | 	return ret;
463 | 547 | }
464 | 548 |
| 549 | +/* called in queue's release handler, tagset has gone away */
465 | 550 | static void blk_mq_sched_tags_teardown(struct request_queue *q)
466 | 551 | {
467 | | -	struct blk_mq_tag_set *set = q->tag_set;
468 | 552 | 	struct blk_mq_hw_ctx *hctx;
469 | 553 | 	int i;
470 | 554 |
471 | | -	queue_for_each_hw_ctx(q, hctx, i)
472 | | -		blk_mq_sched_free_tags(set, hctx, i);
| 555 | +	queue_for_each_hw_ctx(q, hctx, i) {
| 556 | +		/* Clear HCTX_SHARED so tags are freed */
| 557 | +		unsigned int flags = hctx->flags & ~BLK_MQ_F_TAG_HCTX_SHARED;
| 558 | +
| 559 | +		if (hctx->sched_tags) {
| 560 | +			blk_mq_free_rq_map(hctx->sched_tags, flags);
| 561 | +			hctx->sched_tags = NULL;
| 562 | +		}
| 563 | +	}
473 | 564 | }
474 | 565 |
475 | 566 | int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
.. | ..
499 | 590 | 		goto err;
500 | 591 | 	}
501 | 592 |
502 | | -	ret = e->ops.mq.init_sched(q, e);
| 593 | +	ret = e->ops.init_sched(q, e);
503 | 594 | 	if (ret)
504 | 595 | 		goto err;
505 | 596 |
506 | 597 | 	blk_mq_debugfs_register_sched(q);
507 | 598 |
508 | 599 | 	queue_for_each_hw_ctx(q, hctx, i) {
509 | | -		if (e->ops.mq.init_hctx) {
510 | | -			ret = e->ops.mq.init_hctx(hctx, i);
| 600 | +		if (e->ops.init_hctx) {
| 601 | +			ret = e->ops.init_hctx(hctx, i);
511 | 602 | 			if (ret) {
512 | 603 | 				eq = q->elevator;
| 604 | +				blk_mq_sched_free_requests(q);
513 | 605 | 				blk_mq_exit_sched(q, eq);
514 | 606 | 				kobject_put(&eq->kobj);
515 | 607 | 				return ret;
.. | ..
521 | 613 | 	return 0;
522 | 614 |
523 | 615 | err:
| 616 | +	blk_mq_sched_free_requests(q);
524 | 617 | 	blk_mq_sched_tags_teardown(q);
525 | 618 | 	q->elevator = NULL;
526 | 619 | 	return ret;
| 620 | +}
| 621 | +
| 622 | +/*
| 623 | + * called in either blk_queue_cleanup or elevator_switch, tagset
| 624 | + * is required for freeing requests
| 625 | + */
| 626 | +void blk_mq_sched_free_requests(struct request_queue *q)
| 627 | +{
| 628 | +	struct blk_mq_hw_ctx *hctx;
| 629 | +	int i;
| 630 | +
| 631 | +	queue_for_each_hw_ctx(q, hctx, i) {
| 632 | +		if (hctx->sched_tags)
| 633 | +			blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i);
| 634 | +	}
527 | 635 | }
528 | 636 |
529 | 637 | void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
.. | ..
533 | 641 |
534 | 642 | 	queue_for_each_hw_ctx(q, hctx, i) {
535 | 643 | 		blk_mq_debugfs_unregister_sched_hctx(hctx);
536 | | -		if (e->type->ops.mq.exit_hctx && hctx->sched_data) {
537 | | -			e->type->ops.mq.exit_hctx(hctx, i);
| 644 | +		if (e->type->ops.exit_hctx && hctx->sched_data) {
| 645 | +			e->type->ops.exit_hctx(hctx, i);
538 | 646 | 			hctx->sched_data = NULL;
539 | 647 | 		}
540 | 648 | 	}
541 | 649 | 	blk_mq_debugfs_unregister_sched(q);
542 | | -	if (e->type->ops.mq.exit_sched)
543 | | -		e->type->ops.mq.exit_sched(e);
| 650 | +	if (e->type->ops.exit_sched)
| 651 | +		e->type->ops.exit_sched(e);
544 | 652 | 	blk_mq_sched_tags_teardown(q);
545 | 653 | 	q->elevator = NULL;
546 | 654 | }
---|