+// SPDX-License-Identifier: GPL-2.0
 /*
  * blk-mq scheduling framework
  *
---|
...
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/blk-mq.h>
+#include <linux/list_sort.h>
 
 #include <trace/events/block.h>
 
---|
...
 #include "blk-mq-tag.h"
 #include "blk-wbt.h"
 
-void blk_mq_sched_free_hctx_data(struct request_queue *q,
-                                 void (*exit)(struct blk_mq_hw_ctx *))
-{
-        struct blk_mq_hw_ctx *hctx;
-        int i;
-
-        queue_for_each_hw_ctx(q, hctx, i) {
-                if (exit && hctx->sched_data)
-                        exit(hctx);
-                kfree(hctx->sched_data);
-                hctx->sched_data = NULL;
-        }
-}
-EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data);
-
-void blk_mq_sched_assign_ioc(struct request *rq, struct bio *bio)
+void blk_mq_sched_assign_ioc(struct request *rq)
 {
         struct request_queue *q = rq->q;
-        struct io_context *ioc = rq_ioc(bio);
+        struct io_context *ioc;
         struct io_cq *icq;
 
-        spin_lock_irq(q->queue_lock);
+        /*
+         * May not have an IO context if it's a passthrough request
+         */
+        ioc = current->io_context;
+        if (!ioc)
+                return;
+
+        spin_lock_irq(&q->queue_lock);
         icq = ioc_lookup_icq(ioc, q);
-        spin_unlock_irq(q->queue_lock);
+        spin_unlock_irq(&q->queue_lock);
 
         if (!icq) {
                 icq = ioc_create_icq(ioc, q, GFP_ATOMIC);
---|
...
 }
 
 /*
- * Mark a hardware queue as needing a restart. For shared queues, maintain
- * a count of how many hardware queues are marked for restart.
+ * Mark a hardware queue as needing a restart.
  */
 void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
 {
---|
...
         blk_mq_run_hw_queue(hctx, true);
 }
 
+static int sched_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+        struct request *rqa = container_of(a, struct request, queuelist);
+        struct request *rqb = container_of(b, struct request, queuelist);
+
+        return rqa->mq_hctx > rqb->mq_hctx;
+}
+
+static bool blk_mq_dispatch_hctx_list(struct list_head *rq_list)
+{
+        struct blk_mq_hw_ctx *hctx =
+                list_first_entry(rq_list, struct request, queuelist)->mq_hctx;
+        struct request *rq;
+        LIST_HEAD(hctx_list);
+        unsigned int count = 0;
+
+        list_for_each_entry(rq, rq_list, queuelist) {
+                if (rq->mq_hctx != hctx) {
+                        list_cut_before(&hctx_list, rq_list, &rq->queuelist);
+                        goto dispatch;
+                }
+                count++;
+        }
+        list_splice_tail_init(rq_list, &hctx_list);
+
+dispatch:
+        return blk_mq_dispatch_rq_list(hctx, &hctx_list, count);
+}
+
+#define BLK_MQ_BUDGET_DELAY 3 /* ms units */
+
 /*
  * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
  * its queue by itself in its completion handler, so we don't need to
- * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
+ * restart queue if .get_budget() fails to get the budget.
+ *
+ * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to
+ * be run again. This is necessary to avoid starving flushes.
  */
-static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
+static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
 {
         struct request_queue *q = hctx->queue;
         struct elevator_queue *e = q->elevator;
+        bool multi_hctxs = false, run_queue = false;
+        bool dispatched = false, busy = false;
+        unsigned int max_dispatch;
         LIST_HEAD(rq_list);
+        int count = 0;
+
+        if (hctx->dispatch_busy)
+                max_dispatch = 1;
+        else
+                max_dispatch = hctx->queue->nr_requests;
 
         do {
                 struct request *rq;
 
-                if (e->type->ops.mq.has_work &&
-                    !e->type->ops.mq.has_work(hctx))
+                if (e->type->ops.has_work && !e->type->ops.has_work(hctx))
                         break;
 
-                if (!blk_mq_get_dispatch_budget(hctx))
+                if (!list_empty_careful(&hctx->dispatch)) {
+                        busy = true;
+                        break;
+                }
+
+                if (!blk_mq_get_dispatch_budget(q))
                         break;
 
-                rq = e->type->ops.mq.dispatch_request(hctx);
+                rq = e->type->ops.dispatch_request(hctx);
                 if (!rq) {
-                        blk_mq_put_dispatch_budget(hctx);
+                        blk_mq_put_dispatch_budget(q);
+                        /*
+                         * We're releasing without dispatching. Holding the
+                         * budget could have blocked any "hctx"s with the
+                         * same queue and if we didn't dispatch then there's
+                         * no guarantee anyone will kick the queue. Kick it
+                         * ourselves.
+                         */
+                        run_queue = true;
                         break;
                 }
 
...
                  * if this rq won't be queued to driver via .queue_rq()
                  * in blk_mq_dispatch_rq_list().
                  */
-                list_add(&rq->queuelist, &rq_list);
-        } while (blk_mq_dispatch_rq_list(q, &rq_list, true));
+                list_add_tail(&rq->queuelist, &rq_list);
+                if (rq->mq_hctx != hctx)
+                        multi_hctxs = true;
+        } while (++count < max_dispatch);
+
+        if (!count) {
+                if (run_queue)
+                        blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY);
+        } else if (multi_hctxs) {
+                /*
+                 * Requests from different hctx may be dequeued from some
+                 * schedulers, such as bfq and deadline.
+                 *
+                 * Sort the requests in the list according to their hctx,
+                 * dispatch batching requests from same hctx at a time.
+                 */
+                list_sort(NULL, &rq_list, sched_rq_cmp);
+                do {
+                        dispatched |= blk_mq_dispatch_hctx_list(&rq_list);
+                } while (!list_empty(&rq_list));
+        } else {
+                dispatched = blk_mq_dispatch_rq_list(hctx, &rq_list, count);
+        }
+
+        if (busy)
+                return -EAGAIN;
+        return !!dispatched;
+}
+
+static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
+{
+        unsigned long end = jiffies + HZ;
+        int ret;
+
+        do {
+                ret = __blk_mq_do_dispatch_sched(hctx);
+                if (ret != 1)
+                        break;
+                if (need_resched() || time_is_before_jiffies(end)) {
+                        blk_mq_delay_run_hw_queue(hctx, 0);
+                        break;
+                }
+        } while (1);
+
+        return ret;
 }
 
 static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
                                           struct blk_mq_ctx *ctx)
 {
-        unsigned idx = ctx->index_hw;
+        unsigned short idx = ctx->index_hw[hctx->type];
 
         if (++idx == hctx->nr_ctx)
                 idx = 0;
---|
...
 /*
  * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
  * its queue by itself in its completion handler, so we don't need to
- * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
+ * restart queue if .get_budget() fails to get the budget.
+ *
+ * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to
+ * be run again. This is necessary to avoid starving flushes.
  */
-static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
+static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
 {
         struct request_queue *q = hctx->queue;
         LIST_HEAD(rq_list);
         struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
+        int ret = 0;
+        struct request *rq;
 
         do {
-                struct request *rq;
+                if (!list_empty_careful(&hctx->dispatch)) {
+                        ret = -EAGAIN;
+                        break;
+                }
 
                 if (!sbitmap_any_bit_set(&hctx->ctx_map))
                         break;
 
-                if (!blk_mq_get_dispatch_budget(hctx))
+                if (!blk_mq_get_dispatch_budget(q))
                         break;
 
                 rq = blk_mq_dequeue_from_ctx(hctx, ctx);
                 if (!rq) {
-                        blk_mq_put_dispatch_budget(hctx);
+                        blk_mq_put_dispatch_budget(q);
+                        /*
+                         * We're releasing without dispatching. Holding the
+                         * budget could have blocked any "hctx"s with the
+                         * same queue and if we didn't dispatch then there's
+                         * no guarantee anyone will kick the queue. Kick it
+                         * ourselves.
+                         */
+                        blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY);
                         break;
                 }
 
---|
...
                 /* round robin for fair dispatch */
                 ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);
 
-        } while (blk_mq_dispatch_rq_list(q, &rq_list, true));
+        } while (blk_mq_dispatch_rq_list(rq->mq_hctx, &rq_list, 1));
 
         WRITE_ONCE(hctx->dispatch_from, ctx);
+        return ret;
 }
 
-void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
+static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 {
         struct request_queue *q = hctx->queue;
         struct elevator_queue *e = q->elevator;
-        const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request;
+        const bool has_sched_dispatch = e && e->type->ops.dispatch_request;
+        int ret = 0;
         LIST_HEAD(rq_list);
-
-        /* RCU or SRCU read lock is needed before checking quiesced flag */
-        if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
-                return;
-
-        hctx->run++;
 
         /*
          * If we have previous entries on our dispatch list, grab them first for
---|
...
          */
         if (!list_empty(&rq_list)) {
                 blk_mq_sched_mark_restart_hctx(hctx);
-                if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
+                if (blk_mq_dispatch_rq_list(hctx, &rq_list, 0)) {
                         if (has_sched_dispatch)
-                                blk_mq_do_dispatch_sched(hctx);
+                                ret = blk_mq_do_dispatch_sched(hctx);
                         else
-                                blk_mq_do_dispatch_ctx(hctx);
+                                ret = blk_mq_do_dispatch_ctx(hctx);
                 }
         } else if (has_sched_dispatch) {
-                blk_mq_do_dispatch_sched(hctx);
+                ret = blk_mq_do_dispatch_sched(hctx);
         } else if (hctx->dispatch_busy) {
                 /* dequeue request one by one from sw queue if queue is busy */
-                blk_mq_do_dispatch_ctx(hctx);
+                ret = blk_mq_do_dispatch_ctx(hctx);
         } else {
                 blk_mq_flush_busy_ctxs(hctx, &rq_list);
-                blk_mq_dispatch_rq_list(q, &rq_list, false);
+                blk_mq_dispatch_rq_list(hctx, &rq_list, 0);
         }
+
+        return ret;
 }
 
-bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
-                            struct request **merged_request)
+void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 {
-        struct request *rq;
+        struct request_queue *q = hctx->queue;
 
-        switch (elv_merge(q, &rq, bio)) {
-        case ELEVATOR_BACK_MERGE:
-                if (!blk_mq_sched_allow_merge(q, rq, bio))
-                        return false;
-                if (!bio_attempt_back_merge(q, rq, bio))
-                        return false;
-                *merged_request = attempt_back_merge(q, rq);
-                if (!*merged_request)
-                        elv_merged_request(q, rq, ELEVATOR_BACK_MERGE);
-                return true;
-        case ELEVATOR_FRONT_MERGE:
-                if (!blk_mq_sched_allow_merge(q, rq, bio))
-                        return false;
-                if (!bio_attempt_front_merge(q, rq, bio))
-                        return false;
-                *merged_request = attempt_front_merge(q, rq);
-                if (!*merged_request)
-                        elv_merged_request(q, rq, ELEVATOR_FRONT_MERGE);
-                return true;
-        case ELEVATOR_DISCARD_MERGE:
-                return bio_attempt_discard_merge(q, rq, bio);
-        default:
-                return false;
+        /* RCU or SRCU read lock is needed before checking quiesced flag */
+        if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
+                return;
+
+        hctx->run++;
+
+        /*
+         * A return of -EAGAIN is an indication that hctx->dispatch is not
+         * empty and we must run again in order to avoid starving flushes.
+         */
+        if (__blk_mq_sched_dispatch_requests(hctx) == -EAGAIN) {
+                if (__blk_mq_sched_dispatch_requests(hctx) == -EAGAIN)
+                        blk_mq_run_hw_queue(hctx, true);
         }
 }
-EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge);
 
-/*
- * Iterate list of requests and see if we can merge this bio with any
- * of them.
- */
-bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list,
-                           struct bio *bio)
-{
-        struct request *rq;
-        int checked = 8;
-
-        list_for_each_entry_reverse(rq, list, queuelist) {
-                bool merged = false;
-
-                if (!checked--)
-                        break;
-
-                if (!blk_rq_merge_ok(rq, bio))
-                        continue;
-
-                switch (blk_try_merge(rq, bio)) {
-                case ELEVATOR_BACK_MERGE:
-                        if (blk_mq_sched_allow_merge(q, rq, bio))
-                                merged = bio_attempt_back_merge(q, rq, bio);
-                        break;
-                case ELEVATOR_FRONT_MERGE:
-                        if (blk_mq_sched_allow_merge(q, rq, bio))
-                                merged = bio_attempt_front_merge(q, rq, bio);
-                        break;
-                case ELEVATOR_DISCARD_MERGE:
-                        merged = bio_attempt_discard_merge(q, rq, bio);
-                        break;
-                default:
-                        continue;
-                }
-
-                return merged;
-        }
-
-        return false;
-}
-EXPORT_SYMBOL_GPL(blk_mq_bio_list_merge);
-
-/*
- * Reverse check our software queue for entries that we could potentially
- * merge with. Currently includes a hand-wavy stop count of 8, to not spend
- * too much time checking for merges.
- */
-static bool blk_mq_attempt_merge(struct request_queue *q,
-                                 struct blk_mq_ctx *ctx, struct bio *bio)
-{
-        lockdep_assert_held(&ctx->lock);
-
-        if (blk_mq_bio_list_merge(q, &ctx->rq_list, bio)) {
-                ctx->rq_merged++;
-                return true;
-        }
-
-        return false;
-}
-
-bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
+bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
+                unsigned int nr_segs)
 {
         struct elevator_queue *e = q->elevator;
-        struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
-        struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
+        struct blk_mq_ctx *ctx;
+        struct blk_mq_hw_ctx *hctx;
         bool ret = false;
+        enum hctx_type type;
 
-        if (e && e->type->ops.mq.bio_merge) {
-                blk_mq_put_ctx(ctx);
-                return e->type->ops.mq.bio_merge(hctx, bio);
+        if (e && e->type->ops.bio_merge)
+                return e->type->ops.bio_merge(q, bio, nr_segs);
+
+        ctx = blk_mq_get_ctx(q);
+        hctx = blk_mq_map_queue(q, bio->bi_opf, ctx);
+        type = hctx->type;
+        if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE) ||
+            list_empty_careful(&ctx->rq_lists[type]))
+                return false;
+
+        /* default per sw-queue merge */
+        spin_lock(&ctx->lock);
+        /*
+         * Reverse check our software queue for entries that we could
+         * potentially merge with. Currently includes a hand-wavy stop
+         * count of 8, to not spend too much time checking for merges.
+         */
+        if (blk_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs)) {
+                ctx->rq_merged++;
+                ret = true;
         }
 
-        if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
-            !list_empty_careful(&ctx->rq_list)) {
-                /* default per sw-queue merge */
-                spin_lock(&ctx->lock);
-                ret = blk_mq_attempt_merge(q, ctx, bio);
-                spin_unlock(&ctx->lock);
-        }
+        spin_unlock(&ctx->lock);
 
-        blk_mq_put_ctx(ctx);
         return ret;
 }
 
---|
...
                                        bool has_sched,
                                        struct request *rq)
 {
-        /* dispatch flush rq directly */
-        if (rq->rq_flags & RQF_FLUSH_SEQ) {
-                spin_lock(&hctx->lock);
-                list_add(&rq->queuelist, &hctx->dispatch);
-                spin_unlock(&hctx->lock);
+        /*
+         * dispatch flush and passthrough rq directly
+         *
+         * passthrough request has to be added to hctx->dispatch directly.
+         * For some reason, device may be in one situation which can't
+         * handle FS request, so STS_RESOURCE is always returned and the
+         * FS request will be added to hctx->dispatch. However passthrough
+         * request may be required at that time for fixing the problem. If
+         * passthrough request is added to scheduler queue, there isn't any
+         * chance to dispatch it given we prioritize requests in hctx->dispatch.
+         */
+        if ((rq->rq_flags & RQF_FLUSH_SEQ) || blk_rq_is_passthrough(rq))
                 return true;
-        }
 
         if (has_sched)
                 rq->rq_flags |= RQF_SORTED;
---|
...
         struct request_queue *q = rq->q;
         struct elevator_queue *e = q->elevator;
         struct blk_mq_ctx *ctx = rq->mq_ctx;
-        struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
+        struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
 
-        /* flush rq in flush machinery need to be dispatched directly */
-        if (!(rq->rq_flags & RQF_FLUSH_SEQ) && op_is_flush(rq->cmd_flags)) {
-                blk_insert_flush(rq);
+        WARN_ON(e && (rq->tag != BLK_MQ_NO_TAG));
+
+        if (blk_mq_sched_bypass_insert(hctx, !!e, rq)) {
+                /*
+                 * Firstly normal IO request is inserted to scheduler queue or
+                 * sw queue, meantime we add flush request to dispatch queue(
+                 * hctx->dispatch) directly and there is at most one in-flight
+                 * flush request for each hw queue, so it doesn't matter to add
+                 * flush request to tail or front of the dispatch queue.
+                 *
+                 * Secondly in case of NCQ, flush request belongs to non-NCQ
+                 * command, and queueing it will fail when there is any
+                 * in-flight normal IO request(NCQ command). When adding flush
+                 * rq to the front of hctx->dispatch, it is easier to introduce
+                 * extra time to flush rq's latency because of S_SCHED_RESTART
+                 * compared with adding to the tail of dispatch queue, then
+                 * chance of flush merge is increased, and less flush requests
+                 * will be issued to controller. It is observed that ~10% time
+                 * is saved in blktests block/004 on disk attached to AHCI/NCQ
+                 * drive when adding flush rq to the front of hctx->dispatch.
+                 *
+                 * Simply queue flush rq to the front of hctx->dispatch so that
+                 * intensive flush workloads can benefit in case of NCQ HW.
+                 */
+                at_head = (rq->rq_flags & RQF_FLUSH_SEQ) ? true : at_head;
+                blk_mq_request_bypass_insert(rq, at_head, false);
                 goto run;
         }
 
-        WARN_ON(e && (rq->tag != -1));
-
-        if (blk_mq_sched_bypass_insert(hctx, !!e, rq))
-                goto run;
-
-        if (e && e->type->ops.mq.insert_requests) {
+        if (e && e->type->ops.insert_requests) {
                 LIST_HEAD(list);
 
                 list_add(&rq->queuelist, &list);
-                e->type->ops.mq.insert_requests(hctx, &list, at_head);
+                e->type->ops.insert_requests(hctx, &list, at_head);
         } else {
                 spin_lock(&ctx->lock);
                 __blk_mq_insert_request(hctx, rq, at_head);
---|
...
         blk_mq_run_hw_queue(hctx, async);
 }
 
-void blk_mq_sched_insert_requests(struct request_queue *q,
+void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx,
                                   struct blk_mq_ctx *ctx,
                                   struct list_head *list, bool run_queue_async)
 {
-        struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
-        struct elevator_queue *e = hctx->queue->elevator;
+        struct elevator_queue *e;
+        struct request_queue *q = hctx->queue;
 
-        if (e && e->type->ops.mq.insert_requests)
-                e->type->ops.mq.insert_requests(hctx, list, false);
+        /*
+         * blk_mq_sched_insert_requests() is called from flush plug
+         * context only, and hold one usage counter to prevent queue
+         * from being released.
+         */
+        percpu_ref_get(&q->q_usage_counter);
+
+        e = hctx->queue->elevator;
+        if (e && e->type->ops.insert_requests)
+                e->type->ops.insert_requests(hctx, list, false);
         else {
                 /*
                  * try to issue requests directly if the hw queue isn't
---|
...
                 if (!hctx->dispatch_busy && !e && !run_queue_async) {
                         blk_mq_try_issue_list_directly(hctx, list);
                         if (list_empty(list))
-                                return;
+                                goto out;
                 }
                 blk_mq_insert_requests(hctx, ctx, list);
         }
 
         blk_mq_run_hw_queue(hctx, run_queue_async);
+ out:
+        percpu_ref_put(&q->q_usage_counter);
 }
 
 static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set,
                                    struct blk_mq_hw_ctx *hctx,
                                    unsigned int hctx_idx)
 {
+        unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED;
+
         if (hctx->sched_tags) {
                 blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx);
-                blk_mq_free_rq_map(hctx->sched_tags);
+                blk_mq_free_rq_map(hctx->sched_tags, flags);
                 hctx->sched_tags = NULL;
         }
 }
---|
...
                                    unsigned int hctx_idx)
 {
         struct blk_mq_tag_set *set = q->tag_set;
+        /* Clear HCTX_SHARED so tags are init'ed */
+        unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED;
         int ret;
 
         hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests,
-                                               set->reserved_tags);
+                                               set->reserved_tags, flags);
         if (!hctx->sched_tags)
                 return -ENOMEM;
 
---|
...
         return ret;
 }
 
+/* called in queue's release handler, tagset has gone away */
 static void blk_mq_sched_tags_teardown(struct request_queue *q)
 {
-        struct blk_mq_tag_set *set = q->tag_set;
         struct blk_mq_hw_ctx *hctx;
         int i;
 
-        queue_for_each_hw_ctx(q, hctx, i)
-                blk_mq_sched_free_tags(set, hctx, i);
+        queue_for_each_hw_ctx(q, hctx, i) {
+                /* Clear HCTX_SHARED so tags are freed */
+                unsigned int flags = hctx->flags & ~BLK_MQ_F_TAG_HCTX_SHARED;
+
+                if (hctx->sched_tags) {
+                        blk_mq_free_rq_map(hctx->sched_tags, flags);
+                        hctx->sched_tags = NULL;
+                }
+        }
 }
 
 int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
---|
...
                         goto err;
         }
 
-        ret = e->ops.mq.init_sched(q, e);
+        ret = e->ops.init_sched(q, e);
         if (ret)
                 goto err;
 
         blk_mq_debugfs_register_sched(q);
 
         queue_for_each_hw_ctx(q, hctx, i) {
-                if (e->ops.mq.init_hctx) {
-                        ret = e->ops.mq.init_hctx(hctx, i);
+                if (e->ops.init_hctx) {
+                        ret = e->ops.init_hctx(hctx, i);
                         if (ret) {
                                 eq = q->elevator;
+                                blk_mq_sched_free_requests(q);
                                 blk_mq_exit_sched(q, eq);
                                 kobject_put(&eq->kobj);
                                 return ret;
---|
...
         return 0;
 
 err:
+        blk_mq_sched_free_requests(q);
         blk_mq_sched_tags_teardown(q);
         q->elevator = NULL;
         return ret;
+}
+
+/*
+ * called in either blk_queue_cleanup or elevator_switch, tagset
+ * is required for freeing requests
+ */
+void blk_mq_sched_free_requests(struct request_queue *q)
+{
+        struct blk_mq_hw_ctx *hctx;
+        int i;
+
+        queue_for_each_hw_ctx(q, hctx, i) {
+                if (hctx->sched_tags)
+                        blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i);
+        }
 }
 
 void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
---|
...
 
         queue_for_each_hw_ctx(q, hctx, i) {
                 blk_mq_debugfs_unregister_sched_hctx(hctx);
-                if (e->type->ops.mq.exit_hctx && hctx->sched_data) {
-                        e->type->ops.mq.exit_hctx(hctx, i);
+                if (e->type->ops.exit_hctx && hctx->sched_data) {
+                        e->type->ops.exit_hctx(hctx, i);
                         hctx->sched_data = NULL;
                 }
         }
         blk_mq_debugfs_unregister_sched(q);
-        if (e->type->ops.mq.exit_sched)
-                e->type->ops.mq.exit_sched(e);
+        if (e->type->ops.exit_sched)
+                e->type->ops.exit_sched(e);
         blk_mq_sched_tags_teardown(q);
         q->elevator = NULL;
 }
---|