| .. | .. |
|---|
| 1 | +// SPDX-License-Identifier: GPL-2.0 |
|---|
| 1 | 2 | /* |
|---|
| 2 | 3 | * blk-mq scheduling framework |
|---|
| 3 | 4 | * |
|---|
| .. | .. |
|---|
| 6 | 7 | #include <linux/kernel.h> |
|---|
| 7 | 8 | #include <linux/module.h> |
|---|
| 8 | 9 | #include <linux/blk-mq.h> |
|---|
| 10 | +#include <linux/list_sort.h> |
|---|
| 9 | 11 | |
|---|
| 10 | 12 | #include <trace/events/block.h> |
|---|
| 11 | 13 | |
|---|
| .. | .. |
|---|
| 16 | 18 | #include "blk-mq-tag.h" |
|---|
| 17 | 19 | #include "blk-wbt.h" |
|---|
| 18 | 20 | |
|---|
| 19 | | -void blk_mq_sched_free_hctx_data(struct request_queue *q, |
|---|
| 20 | | - void (*exit)(struct blk_mq_hw_ctx *)) |
|---|
| 21 | | -{ |
|---|
| 22 | | - struct blk_mq_hw_ctx *hctx; |
|---|
| 23 | | - int i; |
|---|
| 24 | | - |
|---|
| 25 | | - queue_for_each_hw_ctx(q, hctx, i) { |
|---|
| 26 | | - if (exit && hctx->sched_data) |
|---|
| 27 | | - exit(hctx); |
|---|
| 28 | | - kfree(hctx->sched_data); |
|---|
| 29 | | - hctx->sched_data = NULL; |
|---|
| 30 | | - } |
|---|
| 31 | | -} |
|---|
| 32 | | -EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data); |
|---|
| 33 | | - |
|---|
| 34 | | -void blk_mq_sched_assign_ioc(struct request *rq, struct bio *bio) |
|---|
| 21 | +void blk_mq_sched_assign_ioc(struct request *rq) |
|---|
| 35 | 22 | { |
|---|
| 36 | 23 | struct request_queue *q = rq->q; |
|---|
| 37 | | - struct io_context *ioc = rq_ioc(bio); |
|---|
| 24 | + struct io_context *ioc; |
|---|
| 38 | 25 | struct io_cq *icq; |
|---|
| 39 | 26 | |
|---|
| 40 | | - spin_lock_irq(q->queue_lock); |
|---|
| 27 | + /* |
|---|
| 28 | + * May not have an IO context if it's a passthrough request |
|---|
| 29 | + */ |
|---|
| 30 | + ioc = current->io_context; |
|---|
| 31 | + if (!ioc) |
|---|
| 32 | + return; |
|---|
| 33 | + |
|---|
| 34 | + spin_lock_irq(&q->queue_lock); |
|---|
| 41 | 35 | icq = ioc_lookup_icq(ioc, q); |
|---|
| 42 | | - spin_unlock_irq(q->queue_lock); |
|---|
| 36 | + spin_unlock_irq(&q->queue_lock); |
|---|
| 43 | 37 | |
|---|
| 44 | 38 | if (!icq) { |
|---|
| 45 | 39 | icq = ioc_create_icq(ioc, q, GFP_ATOMIC); |
|---|
| .. | .. |
|---|
| 51 | 45 | } |
|---|
| 52 | 46 | |
|---|
| 53 | 47 | /* |
|---|
| 54 | | - * Mark a hardware queue as needing a restart. For shared queues, maintain |
|---|
| 55 | | - * a count of how many hardware queues are marked for restart. |
|---|
| 48 | + * Mark a hardware queue as needing a restart. |
|---|
| 56 | 49 | */ |
|---|
| 57 | 50 | void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx) |
|---|
| 58 | 51 | { |
|---|
| .. | .. |
|---|
| 81 | 74 | blk_mq_run_hw_queue(hctx, true); |
|---|
| 82 | 75 | } |
|---|
| 83 | 76 | |
|---|
| 77 | +static int sched_rq_cmp(void *priv, struct list_head *a, struct list_head *b) |
|---|
| 78 | +{ |
|---|
| 79 | + struct request *rqa = container_of(a, struct request, queuelist); |
|---|
| 80 | + struct request *rqb = container_of(b, struct request, queuelist); |
|---|
| 81 | + |
|---|
| 82 | + return rqa->mq_hctx > rqb->mq_hctx; |
|---|
| 83 | +} |
|---|
| 84 | + |
|---|
/*
 * Dispatch the leading run of requests on @rq_list that all target the same
 * hardware queue as the first entry.
 *
 * The list is expected to be sorted by hctx (see sched_rq_cmp()): requests
 * for the first hctx are cut onto a local list and handed to
 * blk_mq_dispatch_rq_list(); any requests for other hctxs remain on
 * @rq_list for the caller to process.
 *
 * Returns true if at least one request was dispatched.
 */
static bool blk_mq_dispatch_hctx_list(struct list_head *rq_list)
{
	struct blk_mq_hw_ctx *hctx =
		list_first_entry(rq_list, struct request, queuelist)->mq_hctx;
	struct request *rq;
	LIST_HEAD(hctx_list);
	unsigned int count = 0;

	list_for_each_entry(rq, rq_list, queuelist) {
		if (rq->mq_hctx != hctx) {
			/* first request of a different hctx: split here */
			list_cut_before(&hctx_list, rq_list, &rq->queuelist);
			goto dispatch;
		}
		count++;
	}
	/* every request targets the same hctx: take them all */
	list_splice_tail_init(rq_list, &hctx_list);

dispatch:
	return blk_mq_dispatch_rq_list(hctx, &hctx_list, count);
}
|---|
| 105 | + |
|---|
| 106 | +#define BLK_MQ_BUDGET_DELAY 3 /* ms units */ |
|---|
| 107 | + |
|---|
/*
 * Pull requests from the I/O scheduler and dispatch them to the driver.
 *
 * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
 * its queue by itself in its completion handler, so we don't need to
 * restart queue if .get_budget() fails to get the budget.
 *
 * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to
 * be run again. This is necessary to avoid starving flushes.  Otherwise
 * returns 1 if anything was dispatched, 0 if not.
 */
static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	struct elevator_queue *e = q->elevator;
	bool multi_hctxs = false, run_queue = false;
	bool dispatched = false, busy = false;
	unsigned int max_dispatch;
	LIST_HEAD(rq_list);
	int count = 0;

	/* dispatch one-by-one while the queue is busy, otherwise batch up */
	if (hctx->dispatch_busy)
		max_dispatch = 1;
	else
		max_dispatch = hctx->queue->nr_requests;

	do {
		struct request *rq;

		if (e->type->ops.has_work && !e->type->ops.has_work(hctx))
			break;

		/* requests already on hctx->dispatch take priority over us */
		if (!list_empty_careful(&hctx->dispatch)) {
			busy = true;
			break;
		}

		if (!blk_mq_get_dispatch_budget(q))
			break;

		rq = e->type->ops.dispatch_request(hctx);
		if (!rq) {
			blk_mq_put_dispatch_budget(q);
			/*
			 * We're releasing without dispatching. Holding the
			 * budget could have blocked any "hctx"s with the
			 * same queue and if we didn't dispatch then there's
			 * no guarantee anyone will kick the queue. Kick it
			 * ourselves.
			 */
			run_queue = true;
			break;
		}

		/*
		 * Now this rq owns the budget which has to be released
		 * if this rq won't be queued to driver via .queue_rq()
		 * in blk_mq_dispatch_rq_list().
		 */
		list_add_tail(&rq->queuelist, &rq_list);
		if (rq->mq_hctx != hctx)
			multi_hctxs = true;
	} while (++count < max_dispatch);

	if (!count) {
		if (run_queue)
			blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY);
	} else if (multi_hctxs) {
		/*
		 * Requests from different hctx may be dequeued from some
		 * schedulers, such as bfq and deadline.
		 *
		 * Sort the requests in the list according to their hctx,
		 * dispatch batching requests from same hctx at a time.
		 */
		list_sort(NULL, &rq_list, sched_rq_cmp);
		do {
			dispatched |= blk_mq_dispatch_hctx_list(&rq_list);
		} while (!list_empty(&rq_list));
	} else {
		dispatched = blk_mq_dispatch_rq_list(hctx, &rq_list, count);
	}

	if (busy)
		return -EAGAIN;
	return !!dispatched;
}
|---|
| 192 | + |
|---|
/*
 * Repeatedly invoke __blk_mq_do_dispatch_sched() while it reports progress
 * (return value 1).  To avoid hogging the CPU, stop after roughly one
 * second (jiffies + HZ) or when a reschedule is pending, and in that case
 * punt further work to an asynchronous queue run.
 *
 * Propagates the last return of __blk_mq_do_dispatch_sched() (-EAGAIN, 0
 * or 1) to the caller.
 */
static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
{
	unsigned long end = jiffies + HZ;
	int ret;

	do {
		ret = __blk_mq_do_dispatch_sched(hctx);
		if (ret != 1)
			break;
		if (need_resched() || time_is_before_jiffies(end)) {
			/* yield: continue dispatching from run_work */
			blk_mq_delay_run_hw_queue(hctx, 0);
			break;
		}
	} while (1);

	return ret;
}
|---|
| 119 | 210 | |
|---|
| 120 | 211 | static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx, |
|---|
| 121 | 212 | struct blk_mq_ctx *ctx) |
|---|
| 122 | 213 | { |
|---|
| 123 | | - unsigned idx = ctx->index_hw; |
|---|
| 214 | + unsigned short idx = ctx->index_hw[hctx->type]; |
|---|
| 124 | 215 | |
|---|
| 125 | 216 | if (++idx == hctx->nr_ctx) |
|---|
| 126 | 217 | idx = 0; |
|---|
| .. | .. |
|---|
| 131 | 222 | /* |
|---|
| 132 | 223 | * Only SCSI implements .get_budget and .put_budget, and SCSI restarts |
|---|
| 133 | 224 | * its queue by itself in its completion handler, so we don't need to |
|---|
| 134 | | - * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE. |
|---|
| 225 | + * restart queue if .get_budget() fails to get the budget. |
|---|
| 226 | + * |
|---|
| 227 | + * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to |
|---|
| 228 | + * be run again. This is necessary to avoid starving flushes. |
|---|
| 135 | 229 | */ |
|---|
| 136 | | -static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) |
|---|
| 230 | +static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) |
|---|
| 137 | 231 | { |
|---|
| 138 | 232 | struct request_queue *q = hctx->queue; |
|---|
| 139 | 233 | LIST_HEAD(rq_list); |
|---|
| 140 | 234 | struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from); |
|---|
| 235 | + int ret = 0; |
|---|
| 236 | + struct request *rq; |
|---|
| 141 | 237 | |
|---|
| 142 | 238 | do { |
|---|
| 143 | | - struct request *rq; |
|---|
| 239 | + if (!list_empty_careful(&hctx->dispatch)) { |
|---|
| 240 | + ret = -EAGAIN; |
|---|
| 241 | + break; |
|---|
| 242 | + } |
|---|
| 144 | 243 | |
|---|
| 145 | 244 | if (!sbitmap_any_bit_set(&hctx->ctx_map)) |
|---|
| 146 | 245 | break; |
|---|
| 147 | 246 | |
|---|
| 148 | | - if (!blk_mq_get_dispatch_budget(hctx)) |
|---|
| 247 | + if (!blk_mq_get_dispatch_budget(q)) |
|---|
| 149 | 248 | break; |
|---|
| 150 | 249 | |
|---|
| 151 | 250 | rq = blk_mq_dequeue_from_ctx(hctx, ctx); |
|---|
| 152 | 251 | if (!rq) { |
|---|
| 153 | | - blk_mq_put_dispatch_budget(hctx); |
|---|
| 252 | + blk_mq_put_dispatch_budget(q); |
|---|
| 253 | + /* |
|---|
| 254 | + * We're releasing without dispatching. Holding the |
|---|
| 255 | + * budget could have blocked any "hctx"s with the |
|---|
| 256 | + * same queue and if we didn't dispatch then there's |
|---|
| 257 | + * no guarantee anyone will kick the queue. Kick it |
|---|
| 258 | + * ourselves. |
|---|
| 259 | + */ |
|---|
| 260 | + blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY); |
|---|
| 154 | 261 | break; |
|---|
| 155 | 262 | } |
|---|
| 156 | 263 | |
|---|
| .. | .. |
|---|
| 164 | 271 | /* round robin for fair dispatch */ |
|---|
| 165 | 272 | ctx = blk_mq_next_ctx(hctx, rq->mq_ctx); |
|---|
| 166 | 273 | |
|---|
| 167 | | - } while (blk_mq_dispatch_rq_list(q, &rq_list, true)); |
|---|
| 274 | + } while (blk_mq_dispatch_rq_list(rq->mq_hctx, &rq_list, 1)); |
|---|
| 168 | 275 | |
|---|
| 169 | 276 | WRITE_ONCE(hctx->dispatch_from, ctx); |
|---|
| 277 | + return ret; |
|---|
| 170 | 278 | } |
|---|
| 171 | 279 | |
|---|
| 172 | | -void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) |
|---|
| 280 | +static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) |
|---|
| 173 | 281 | { |
|---|
| 174 | 282 | struct request_queue *q = hctx->queue; |
|---|
| 175 | 283 | struct elevator_queue *e = q->elevator; |
|---|
| 176 | | - const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request; |
|---|
| 284 | + const bool has_sched_dispatch = e && e->type->ops.dispatch_request; |
|---|
| 285 | + int ret = 0; |
|---|
| 177 | 286 | LIST_HEAD(rq_list); |
|---|
| 178 | | - |
|---|
| 179 | | - /* RCU or SRCU read lock is needed before checking quiesced flag */ |
|---|
| 180 | | - if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q))) |
|---|
| 181 | | - return; |
|---|
| 182 | | - |
|---|
| 183 | | - hctx->run++; |
|---|
| 184 | 287 | |
|---|
| 185 | 288 | /* |
|---|
| 186 | 289 | * If we have previous entries on our dispatch list, grab them first for |
|---|
| .. | .. |
|---|
| 208 | 311 | */ |
|---|
| 209 | 312 | if (!list_empty(&rq_list)) { |
|---|
| 210 | 313 | blk_mq_sched_mark_restart_hctx(hctx); |
|---|
| 211 | | - if (blk_mq_dispatch_rq_list(q, &rq_list, false)) { |
|---|
| 314 | + if (blk_mq_dispatch_rq_list(hctx, &rq_list, 0)) { |
|---|
| 212 | 315 | if (has_sched_dispatch) |
|---|
| 213 | | - blk_mq_do_dispatch_sched(hctx); |
|---|
| 316 | + ret = blk_mq_do_dispatch_sched(hctx); |
|---|
| 214 | 317 | else |
|---|
| 215 | | - blk_mq_do_dispatch_ctx(hctx); |
|---|
| 318 | + ret = blk_mq_do_dispatch_ctx(hctx); |
|---|
| 216 | 319 | } |
|---|
| 217 | 320 | } else if (has_sched_dispatch) { |
|---|
| 218 | | - blk_mq_do_dispatch_sched(hctx); |
|---|
| 321 | + ret = blk_mq_do_dispatch_sched(hctx); |
|---|
| 219 | 322 | } else if (hctx->dispatch_busy) { |
|---|
| 220 | 323 | /* dequeue request one by one from sw queue if queue is busy */ |
|---|
| 221 | | - blk_mq_do_dispatch_ctx(hctx); |
|---|
| 324 | + ret = blk_mq_do_dispatch_ctx(hctx); |
|---|
| 222 | 325 | } else { |
|---|
| 223 | 326 | blk_mq_flush_busy_ctxs(hctx, &rq_list); |
|---|
| 224 | | - blk_mq_dispatch_rq_list(q, &rq_list, false); |
|---|
| 327 | + blk_mq_dispatch_rq_list(hctx, &rq_list, 0); |
|---|
| 225 | 328 | } |
|---|
| 329 | + |
|---|
| 330 | + return ret; |
|---|
| 226 | 331 | } |
|---|
| 227 | 332 | |
|---|
/*
 * Run one dispatch pass for @hctx, retrying once synchronously if the pass
 * was cut short by pending hctx->dispatch entries, then falling back to an
 * async queue run so flush requests cannot be starved.
 */
void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;

	/* RCU or SRCU read lock is needed before checking quiesced flag */
	if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
		return;

	hctx->run++;

	/*
	 * A return of -EAGAIN is an indication that hctx->dispatch is not
	 * empty and we must run again in order to avoid starving flushes.
	 */
	if (__blk_mq_sched_dispatch_requests(hctx) == -EAGAIN) {
		if (__blk_mq_sched_dispatch_requests(hctx) == -EAGAIN)
			blk_mq_run_hw_queue(hctx, true);
	}
}
|---|
| 258 | | -EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge); |
|---|
| 259 | 352 | |
|---|
| 260 | | -/* |
|---|
| 261 | | - * Iterate list of requests and see if we can merge this bio with any |
|---|
| 262 | | - * of them. |
|---|
| 263 | | - */ |
|---|
| 264 | | -bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list, |
|---|
| 265 | | - struct bio *bio) |
|---|
| 266 | | -{ |
|---|
| 267 | | - struct request *rq; |
|---|
| 268 | | - int checked = 8; |
|---|
| 269 | | - |
|---|
| 270 | | - list_for_each_entry_reverse(rq, list, queuelist) { |
|---|
| 271 | | - bool merged = false; |
|---|
| 272 | | - |
|---|
| 273 | | - if (!checked--) |
|---|
| 274 | | - break; |
|---|
| 275 | | - |
|---|
| 276 | | - if (!blk_rq_merge_ok(rq, bio)) |
|---|
| 277 | | - continue; |
|---|
| 278 | | - |
|---|
| 279 | | - switch (blk_try_merge(rq, bio)) { |
|---|
| 280 | | - case ELEVATOR_BACK_MERGE: |
|---|
| 281 | | - if (blk_mq_sched_allow_merge(q, rq, bio)) |
|---|
| 282 | | - merged = bio_attempt_back_merge(q, rq, bio); |
|---|
| 283 | | - break; |
|---|
| 284 | | - case ELEVATOR_FRONT_MERGE: |
|---|
| 285 | | - if (blk_mq_sched_allow_merge(q, rq, bio)) |
|---|
| 286 | | - merged = bio_attempt_front_merge(q, rq, bio); |
|---|
| 287 | | - break; |
|---|
| 288 | | - case ELEVATOR_DISCARD_MERGE: |
|---|
| 289 | | - merged = bio_attempt_discard_merge(q, rq, bio); |
|---|
| 290 | | - break; |
|---|
| 291 | | - default: |
|---|
| 292 | | - continue; |
|---|
| 293 | | - } |
|---|
| 294 | | - |
|---|
| 295 | | - return merged; |
|---|
| 296 | | - } |
|---|
| 297 | | - |
|---|
| 298 | | - return false; |
|---|
| 299 | | -} |
|---|
| 300 | | -EXPORT_SYMBOL_GPL(blk_mq_bio_list_merge); |
|---|
| 301 | | - |
|---|
| 302 | | -/* |
|---|
| 303 | | - * Reverse check our software queue for entries that we could potentially |
|---|
| 304 | | - * merge with. Currently includes a hand-wavy stop count of 8, to not spend |
|---|
| 305 | | - * too much time checking for merges. |
|---|
| 306 | | - */ |
|---|
| 307 | | -static bool blk_mq_attempt_merge(struct request_queue *q, |
|---|
| 308 | | - struct blk_mq_ctx *ctx, struct bio *bio) |
|---|
| 309 | | -{ |
|---|
| 310 | | - lockdep_assert_held(&ctx->lock); |
|---|
| 311 | | - |
|---|
| 312 | | - if (blk_mq_bio_list_merge(q, &ctx->rq_list, bio)) { |
|---|
| 313 | | - ctx->rq_merged++; |
|---|
| 314 | | - return true; |
|---|
| 315 | | - } |
|---|
| 316 | | - |
|---|
| 317 | | - return false; |
|---|
| 318 | | -} |
|---|
| 319 | | - |
|---|
/*
 * Try to merge @bio (split into @nr_segs segments) into an already-queued
 * request.  If the elevator provides a ->bio_merge() hook, the decision is
 * delegated to it entirely; otherwise a default merge attempt is made
 * against the software queue this bio maps to.
 *
 * Returns true if the bio was merged and needs no new request.
 */
bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
		unsigned int nr_segs)
{
	struct elevator_queue *e = q->elevator;
	struct blk_mq_ctx *ctx;
	struct blk_mq_hw_ctx *hctx;
	bool ret = false;
	enum hctx_type type;

	if (e && e->type->ops.bio_merge)
		return e->type->ops.bio_merge(q, bio, nr_segs);

	ctx = blk_mq_get_ctx(q);
	hctx = blk_mq_map_queue(q, bio->bi_opf, ctx);
	type = hctx->type;
	if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE) ||
	    list_empty_careful(&ctx->rq_lists[type]))
		return false;

	/* default per sw-queue merge */
	spin_lock(&ctx->lock);
	/*
	 * Reverse check our software queue for entries that we could
	 * potentially merge with. Currently includes a hand-wavy stop
	 * count of 8, to not spend too much time checking for merges.
	 */
	if (blk_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs)) {
		ctx->rq_merged++;
		ret = true;
	}

	spin_unlock(&ctx->lock);

	return ret;
}
|---|
| 343 | 388 | |
|---|
| .. | .. |
|---|
| 357 | 402 | bool has_sched, |
|---|
| 358 | 403 | struct request *rq) |
|---|
| 359 | 404 | { |
|---|
| 360 | | - /* dispatch flush rq directly */ |
|---|
| 361 | | - if (rq->rq_flags & RQF_FLUSH_SEQ) { |
|---|
| 362 | | - spin_lock(&hctx->lock); |
|---|
| 363 | | - list_add(&rq->queuelist, &hctx->dispatch); |
|---|
| 364 | | - spin_unlock(&hctx->lock); |
|---|
| 405 | + /* |
|---|
| 406 | + * dispatch flush and passthrough rq directly |
|---|
| 407 | + * |
|---|
| 408 | + * passthrough request has to be added to hctx->dispatch directly. |
|---|
| 409 | + * For some reason, device may be in one situation which can't |
|---|
| 410 | + * handle FS request, so STS_RESOURCE is always returned and the |
|---|
| 411 | + * FS request will be added to hctx->dispatch. However passthrough |
|---|
| 412 | + * request may be required at that time for fixing the problem. If |
|---|
| 413 | + * passthrough request is added to scheduler queue, there isn't any |
|---|
| 414 | + * chance to dispatch it given we prioritize requests in hctx->dispatch. |
|---|
| 415 | + */ |
|---|
| 416 | + if ((rq->rq_flags & RQF_FLUSH_SEQ) || blk_rq_is_passthrough(rq)) |
|---|
| 365 | 417 | return true; |
|---|
| 366 | | - } |
|---|
| 367 | 418 | |
|---|
| 368 | 419 | if (has_sched) |
|---|
| 369 | 420 | rq->rq_flags |= RQF_SORTED; |
|---|
| .. | .. |
|---|
| 377 | 428 | struct request_queue *q = rq->q; |
|---|
| 378 | 429 | struct elevator_queue *e = q->elevator; |
|---|
| 379 | 430 | struct blk_mq_ctx *ctx = rq->mq_ctx; |
|---|
| 380 | | - struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); |
|---|
| 431 | + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; |
|---|
| 381 | 432 | |
|---|
| 382 | | - /* flush rq in flush machinery need to be dispatched directly */ |
|---|
| 383 | | - if (!(rq->rq_flags & RQF_FLUSH_SEQ) && op_is_flush(rq->cmd_flags)) { |
|---|
| 384 | | - blk_insert_flush(rq); |
|---|
| 433 | + WARN_ON(e && (rq->tag != BLK_MQ_NO_TAG)); |
|---|
| 434 | + |
|---|
| 435 | + if (blk_mq_sched_bypass_insert(hctx, !!e, rq)) { |
|---|
| 436 | + /* |
|---|
| 437 | + * Firstly normal IO request is inserted to scheduler queue or |
|---|
| 438 | + * sw queue, meantime we add flush request to dispatch queue( |
|---|
| 439 | + * hctx->dispatch) directly and there is at most one in-flight |
|---|
| 440 | + * flush request for each hw queue, so it doesn't matter to add |
|---|
| 441 | + * flush request to tail or front of the dispatch queue. |
|---|
| 442 | + * |
|---|
| 443 | + * Secondly in case of NCQ, flush request belongs to non-NCQ |
|---|
| 444 | + * command, and queueing it will fail when there is any |
|---|
| 445 | + * in-flight normal IO request(NCQ command). When adding flush |
|---|
| 446 | + * rq to the front of hctx->dispatch, it is easier to introduce |
|---|
| 447 | + * extra time to flush rq's latency because of S_SCHED_RESTART |
|---|
| 448 | + * compared with adding to the tail of dispatch queue, then |
|---|
| 449 | + * chance of flush merge is increased, and less flush requests |
|---|
| 450 | + * will be issued to controller. It is observed that ~10% time |
|---|
| 451 | + * is saved in blktests block/004 on disk attached to AHCI/NCQ |
|---|
| 452 | + * drive when adding flush rq to the front of hctx->dispatch. |
|---|
| 453 | + * |
|---|
| 454 | + * Simply queue flush rq to the front of hctx->dispatch so that |
|---|
| 455 | + * intensive flush workloads can benefit in case of NCQ HW. |
|---|
| 456 | + */ |
|---|
| 457 | + at_head = (rq->rq_flags & RQF_FLUSH_SEQ) ? true : at_head; |
|---|
| 458 | + blk_mq_request_bypass_insert(rq, at_head, false); |
|---|
| 385 | 459 | goto run; |
|---|
| 386 | 460 | } |
|---|
| 387 | 461 | |
|---|
| 388 | | - WARN_ON(e && (rq->tag != -1)); |
|---|
| 389 | | - |
|---|
| 390 | | - if (blk_mq_sched_bypass_insert(hctx, !!e, rq)) |
|---|
| 391 | | - goto run; |
|---|
| 392 | | - |
|---|
| 393 | | - if (e && e->type->ops.mq.insert_requests) { |
|---|
| 462 | + if (e && e->type->ops.insert_requests) { |
|---|
| 394 | 463 | LIST_HEAD(list); |
|---|
| 395 | 464 | |
|---|
| 396 | 465 | list_add(&rq->queuelist, &list); |
|---|
| 397 | | - e->type->ops.mq.insert_requests(hctx, &list, at_head); |
|---|
| 466 | + e->type->ops.insert_requests(hctx, &list, at_head); |
|---|
| 398 | 467 | } else { |
|---|
| 399 | 468 | spin_lock(&ctx->lock); |
|---|
| 400 | 469 | __blk_mq_insert_request(hctx, rq, at_head); |
|---|
| .. | .. |
|---|
| 406 | 475 | blk_mq_run_hw_queue(hctx, async); |
|---|
| 407 | 476 | } |
|---|
| 408 | 477 | |
|---|
| 409 | | -void blk_mq_sched_insert_requests(struct request_queue *q, |
|---|
| 478 | +void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx, |
|---|
| 410 | 479 | struct blk_mq_ctx *ctx, |
|---|
| 411 | 480 | struct list_head *list, bool run_queue_async) |
|---|
| 412 | 481 | { |
|---|
| 413 | | - struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); |
|---|
| 414 | | - struct elevator_queue *e = hctx->queue->elevator; |
|---|
| 482 | + struct elevator_queue *e; |
|---|
| 483 | + struct request_queue *q = hctx->queue; |
|---|
| 415 | 484 | |
|---|
| 416 | | - if (e && e->type->ops.mq.insert_requests) |
|---|
| 417 | | - e->type->ops.mq.insert_requests(hctx, list, false); |
|---|
| 485 | + /* |
|---|
| 486 | + * blk_mq_sched_insert_requests() is called from flush plug |
|---|
| 487 | + * context only, and hold one usage counter to prevent queue |
|---|
| 488 | + * from being released. |
|---|
| 489 | + */ |
|---|
| 490 | + percpu_ref_get(&q->q_usage_counter); |
|---|
| 491 | + |
|---|
| 492 | + e = hctx->queue->elevator; |
|---|
| 493 | + if (e && e->type->ops.insert_requests) |
|---|
| 494 | + e->type->ops.insert_requests(hctx, list, false); |
|---|
| 418 | 495 | else { |
|---|
| 419 | 496 | /* |
|---|
| 420 | 497 | * try to issue requests directly if the hw queue isn't |
|---|
| .. | .. |
|---|
| 424 | 501 | if (!hctx->dispatch_busy && !e && !run_queue_async) { |
|---|
| 425 | 502 | blk_mq_try_issue_list_directly(hctx, list); |
|---|
| 426 | 503 | if (list_empty(list)) |
|---|
| 427 | | - return; |
|---|
| 504 | + goto out; |
|---|
| 428 | 505 | } |
|---|
| 429 | 506 | blk_mq_insert_requests(hctx, ctx, list); |
|---|
| 430 | 507 | } |
|---|
| 431 | 508 | |
|---|
| 432 | 509 | blk_mq_run_hw_queue(hctx, run_queue_async); |
|---|
| 510 | + out: |
|---|
| 511 | + percpu_ref_put(&q->q_usage_counter); |
|---|
| 433 | 512 | } |
|---|
| 434 | 513 | |
|---|
| 435 | 514 | static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set, |
|---|
| 436 | 515 | struct blk_mq_hw_ctx *hctx, |
|---|
| 437 | 516 | unsigned int hctx_idx) |
|---|
| 438 | 517 | { |
|---|
| 518 | + unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED; |
|---|
| 519 | + |
|---|
| 439 | 520 | if (hctx->sched_tags) { |
|---|
| 440 | 521 | blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx); |
|---|
| 441 | | - blk_mq_free_rq_map(hctx->sched_tags); |
|---|
| 522 | + blk_mq_free_rq_map(hctx->sched_tags, flags); |
|---|
| 442 | 523 | hctx->sched_tags = NULL; |
|---|
| 443 | 524 | } |
|---|
| 444 | 525 | } |
|---|
| .. | .. |
|---|
| 448 | 529 | unsigned int hctx_idx) |
|---|
| 449 | 530 | { |
|---|
| 450 | 531 | struct blk_mq_tag_set *set = q->tag_set; |
|---|
| 532 | + /* Clear HCTX_SHARED so tags are init'ed */ |
|---|
| 533 | + unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED; |
|---|
| 451 | 534 | int ret; |
|---|
| 452 | 535 | |
|---|
| 453 | 536 | hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests, |
|---|
| 454 | | - set->reserved_tags); |
|---|
| 537 | + set->reserved_tags, flags); |
|---|
| 455 | 538 | if (!hctx->sched_tags) |
|---|
| 456 | 539 | return -ENOMEM; |
|---|
| 457 | 540 | |
|---|
| .. | .. |
|---|
| 462 | 545 | return ret; |
|---|
| 463 | 546 | } |
|---|
| 464 | 547 | |
|---|
| 548 | +/* called in queue's release handler, tagset has gone away */ |
|---|
| 465 | 549 | static void blk_mq_sched_tags_teardown(struct request_queue *q) |
|---|
| 466 | 550 | { |
|---|
| 467 | | - struct blk_mq_tag_set *set = q->tag_set; |
|---|
| 468 | 551 | struct blk_mq_hw_ctx *hctx; |
|---|
| 469 | 552 | int i; |
|---|
| 470 | 553 | |
|---|
| 471 | | - queue_for_each_hw_ctx(q, hctx, i) |
|---|
| 472 | | - blk_mq_sched_free_tags(set, hctx, i); |
|---|
| 554 | + queue_for_each_hw_ctx(q, hctx, i) { |
|---|
| 555 | + /* Clear HCTX_SHARED so tags are freed */ |
|---|
| 556 | + unsigned int flags = hctx->flags & ~BLK_MQ_F_TAG_HCTX_SHARED; |
|---|
| 557 | + |
|---|
| 558 | + if (hctx->sched_tags) { |
|---|
| 559 | + blk_mq_free_rq_map(hctx->sched_tags, flags); |
|---|
| 560 | + hctx->sched_tags = NULL; |
|---|
| 561 | + } |
|---|
| 562 | + } |
|---|
| 473 | 563 | } |
|---|
| 474 | 564 | |
|---|
| 475 | 565 | int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) |
|---|
| .. | .. |
|---|
| 499 | 589 | goto err; |
|---|
| 500 | 590 | } |
|---|
| 501 | 591 | |
|---|
| 502 | | - ret = e->ops.mq.init_sched(q, e); |
|---|
| 592 | + ret = e->ops.init_sched(q, e); |
|---|
| 503 | 593 | if (ret) |
|---|
| 504 | 594 | goto err; |
|---|
| 505 | 595 | |
|---|
| 506 | 596 | blk_mq_debugfs_register_sched(q); |
|---|
| 507 | 597 | |
|---|
| 508 | 598 | queue_for_each_hw_ctx(q, hctx, i) { |
|---|
| 509 | | - if (e->ops.mq.init_hctx) { |
|---|
| 510 | | - ret = e->ops.mq.init_hctx(hctx, i); |
|---|
| 599 | + if (e->ops.init_hctx) { |
|---|
| 600 | + ret = e->ops.init_hctx(hctx, i); |
|---|
| 511 | 601 | if (ret) { |
|---|
| 512 | 602 | eq = q->elevator; |
|---|
| 603 | + blk_mq_sched_free_requests(q); |
|---|
| 513 | 604 | blk_mq_exit_sched(q, eq); |
|---|
| 514 | 605 | kobject_put(&eq->kobj); |
|---|
| 515 | 606 | return ret; |
|---|
| .. | .. |
|---|
| 521 | 612 | return 0; |
|---|
| 522 | 613 | |
|---|
| 523 | 614 | err: |
|---|
| 615 | + blk_mq_sched_free_requests(q); |
|---|
| 524 | 616 | blk_mq_sched_tags_teardown(q); |
|---|
| 525 | 617 | q->elevator = NULL; |
|---|
| 526 | 618 | return ret; |
|---|
| 619 | +} |
|---|
| 620 | + |
|---|
| 621 | +/* |
|---|
| 622 | + * called in either blk_queue_cleanup or elevator_switch, tagset |
|---|
| 623 | + * is required for freeing requests |
|---|
| 624 | + */ |
|---|
| 625 | +void blk_mq_sched_free_requests(struct request_queue *q) |
|---|
| 626 | +{ |
|---|
| 627 | + struct blk_mq_hw_ctx *hctx; |
|---|
| 628 | + int i; |
|---|
| 629 | + |
|---|
| 630 | + queue_for_each_hw_ctx(q, hctx, i) { |
|---|
| 631 | + if (hctx->sched_tags) |
|---|
| 632 | + blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i); |
|---|
| 633 | + } |
|---|
| 527 | 634 | } |
|---|
| 528 | 635 | |
|---|
| 529 | 636 | void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e) |
|---|
| .. | .. |
|---|
| 533 | 640 | |
|---|
| 534 | 641 | queue_for_each_hw_ctx(q, hctx, i) { |
|---|
| 535 | 642 | blk_mq_debugfs_unregister_sched_hctx(hctx); |
|---|
| 536 | | - if (e->type->ops.mq.exit_hctx && hctx->sched_data) { |
|---|
| 537 | | - e->type->ops.mq.exit_hctx(hctx, i); |
|---|
| 643 | + if (e->type->ops.exit_hctx && hctx->sched_data) { |
|---|
| 644 | + e->type->ops.exit_hctx(hctx, i); |
|---|
| 538 | 645 | hctx->sched_data = NULL; |
|---|
| 539 | 646 | } |
|---|
| 540 | 647 | } |
|---|
| 541 | 648 | blk_mq_debugfs_unregister_sched(q); |
|---|
| 542 | | - if (e->type->ops.mq.exit_sched) |
|---|
| 543 | | - e->type->ops.mq.exit_sched(e); |
|---|
| 649 | + if (e->type->ops.exit_sched) |
|---|
| 650 | + e->type->ops.exit_sched(e); |
|---|
| 544 | 651 | blk_mq_sched_tags_teardown(q); |
|---|
| 545 | 652 | q->elevator = NULL; |
|---|
| 546 | 653 | } |
|---|