2023-12-11 d2ccde1c8e90d38cee87a1b0309ad2827f3fd30d
kernel/block/blk-core.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * Copyright (C) 1991, 1992 Linus Torvalds
  * Copyright (C) 1994, Karl Keyte: Added support for disk statistics
@@ -19,6 +20,7 @@
 #include <linux/blk-mq.h>
 #include <linux/highmem.h>
 #include <linux/mm.h>
+#include <linux/pagemap.h>
 #include <linux/kernel_stat.h>
 #include <linux/string.h>
 #include <linux/init.h>
@@ -33,9 +35,11 @@
 #include <linux/ratelimit.h>
 #include <linux/pm_runtime.h>
 #include <linux/blk-cgroup.h>
+#include <linux/t10-pi.h>
 #include <linux/debugfs.h>
 #include <linux/bpf.h>
 #include <linux/psi.h>
+#include <linux/sched/sysctl.h>
 #include <linux/blk-crypto.h>
 
 #define CREATE_TRACE_POINTS
@@ -44,24 +48,25 @@
 #include "blk.h"
 #include "blk-mq.h"
 #include "blk-mq-sched.h"
+#include "blk-pm.h"
 #include "blk-rq-qos.h"
 
-#ifdef CONFIG_DEBUG_FS
 struct dentry *blk_debugfs_root;
-#endif
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_split);
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug);
+EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_queue);
+EXPORT_TRACEPOINT_SYMBOL_GPL(block_getrq);
+EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_insert);
+EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_issue);
+EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_merge);
+EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_requeue);
+EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_complete);
 
 DEFINE_IDA(blk_queue_ida);
-
-/*
- * For the allocated request tables
- */
-struct kmem_cache *request_cachep;
 
 /*
  * For queue allocation
@@ -80,11 +85,7 @@
  */
 void blk_queue_flag_set(unsigned int flag, struct request_queue *q)
 {
-        unsigned long flags;
-
-        spin_lock_irqsave(q->queue_lock, flags);
-        queue_flag_set(flag, q);
-        spin_unlock_irqrestore(q->queue_lock, flags);
+        set_bit(flag, &q->queue_flags);
 }
 EXPORT_SYMBOL(blk_queue_flag_set);
 
@@ -95,11 +96,7 @@
  */
 void blk_queue_flag_clear(unsigned int flag, struct request_queue *q)
 {
-        unsigned long flags;
-
-        spin_lock_irqsave(q->queue_lock, flags);
-        queue_flag_clear(flag, q);
-        spin_unlock_irqrestore(q->queue_lock, flags);
+        clear_bit(flag, &q->queue_flags);
 }
 EXPORT_SYMBOL(blk_queue_flag_clear);
 
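Note: the two hunks above replace the spinlock-protected queue_flag_set()/queue_flag_clear() helpers with plain atomic bitops on q->queue_flags, so callers no longer need q->queue_lock. A minimal sketch of driver-side usage after this change (the wrapper function and the NONROT flag choice are illustrative, not part of this diff):

#include <linux/blkdev.h>

/* Flip a queue flag from driver code; blk_queue_flag_set()/_clear() are
 * now backed by set_bit()/clear_bit(), so no queue_lock is taken here. */
static void example_mark_queue_nonrot(struct request_queue *q, bool nonrot)
{
        if (nonrot)
                blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
        else
                blk_queue_flag_clear(QUEUE_FLAG_NONROT, q);
}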
@@ -113,96 +110,67 @@
  */
 bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q)
 {
-        unsigned long flags;
-        bool res;
-
-        spin_lock_irqsave(q->queue_lock, flags);
-        res = queue_flag_test_and_set(flag, q);
-        spin_unlock_irqrestore(q->queue_lock, flags);
-
-        return res;
+        return test_and_set_bit(flag, &q->queue_flags);
 }
 EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_set);
-
-/**
- * blk_queue_flag_test_and_clear - atomically test and clear a queue flag
- * @flag: flag to be cleared
- * @q: request queue
- *
- * Returns the previous value of @flag - 0 if the flag was not set and 1 if
- * the flag was set.
- */
-bool blk_queue_flag_test_and_clear(unsigned int flag, struct request_queue *q)
-{
-        unsigned long flags;
-        bool res;
-
-        spin_lock_irqsave(q->queue_lock, flags);
-        res = queue_flag_test_and_clear(flag, q);
-        spin_unlock_irqrestore(q->queue_lock, flags);
-
-        return res;
-}
-EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_clear);
-
-static void blk_clear_congested(struct request_list *rl, int sync)
-{
-#ifdef CONFIG_CGROUP_WRITEBACK
-        clear_wb_congested(rl->blkg->wb_congested, sync);
-#else
-        /*
-         * If !CGROUP_WRITEBACK, all blkg's map to bdi->wb and we shouldn't
-         * flip its congestion state for events on other blkcgs.
-         */
-        if (rl == &rl->q->root_rl)
-                clear_wb_congested(rl->q->backing_dev_info->wb.congested, sync);
-#endif
-}
-
-static void blk_set_congested(struct request_list *rl, int sync)
-{
-#ifdef CONFIG_CGROUP_WRITEBACK
-        set_wb_congested(rl->blkg->wb_congested, sync);
-#else
-        /* see blk_clear_congested() */
-        if (rl == &rl->q->root_rl)
-                set_wb_congested(rl->q->backing_dev_info->wb.congested, sync);
-#endif
-}
-
-void blk_queue_congestion_threshold(struct request_queue *q)
-{
-        int nr;
-
-        nr = q->nr_requests - (q->nr_requests / 8) + 1;
-        if (nr > q->nr_requests)
-                nr = q->nr_requests;
-        q->nr_congestion_on = nr;
-
-        nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1;
-        if (nr < 1)
-                nr = 1;
-        q->nr_congestion_off = nr;
-}
 
 void blk_rq_init(struct request_queue *q, struct request *rq)
 {
         memset(rq, 0, sizeof(*rq));
 
         INIT_LIST_HEAD(&rq->queuelist);
-        INIT_LIST_HEAD(&rq->timeout_list);
-        rq->cpu = -1;
         rq->q = q;
         rq->__sector = (sector_t) -1;
         INIT_HLIST_NODE(&rq->hash);
         RB_CLEAR_NODE(&rq->rb_node);
-        rq->tag = -1;
-        rq->internal_tag = -1;
+        rq->tag = BLK_MQ_NO_TAG;
+        rq->internal_tag = BLK_MQ_NO_TAG;
         rq->start_time_ns = ktime_get_ns();
         rq->part = NULL;
-        refcount_set(&rq->ref, 1);
+        blk_crypto_rq_set_defaults(rq);
 }
 EXPORT_SYMBOL(blk_rq_init);
+
+#define REQ_OP_NAME(name) [REQ_OP_##name] = #name
+static const char *const blk_op_name[] = {
+        REQ_OP_NAME(READ),
+        REQ_OP_NAME(WRITE),
+        REQ_OP_NAME(FLUSH),
+        REQ_OP_NAME(DISCARD),
+        REQ_OP_NAME(SECURE_ERASE),
+        REQ_OP_NAME(ZONE_RESET),
+        REQ_OP_NAME(ZONE_RESET_ALL),
+        REQ_OP_NAME(ZONE_OPEN),
+        REQ_OP_NAME(ZONE_CLOSE),
+        REQ_OP_NAME(ZONE_FINISH),
+        REQ_OP_NAME(ZONE_APPEND),
+        REQ_OP_NAME(WRITE_SAME),
+        REQ_OP_NAME(WRITE_ZEROES),
+        REQ_OP_NAME(SCSI_IN),
+        REQ_OP_NAME(SCSI_OUT),
+        REQ_OP_NAME(DRV_IN),
+        REQ_OP_NAME(DRV_OUT),
+};
+#undef REQ_OP_NAME
+
+/**
+ * blk_op_str - Return string XXX in the REQ_OP_XXX.
+ * @op: REQ_OP_XXX.
+ *
+ * Description: Centralize block layer function to convert REQ_OP_XXX into
+ * string format. Useful in the debugging and tracing bio or request. For
+ * invalid REQ_OP_XXX it returns string "UNKNOWN".
+ */
+inline const char *blk_op_str(unsigned int op)
+{
+        const char *op_str = "UNKNOWN";
+
+        if (op < ARRAY_SIZE(blk_op_name) && blk_op_name[op])
+                op_str = blk_op_name[op];
+
+        return op_str;
+}
+EXPORT_SYMBOL_GPL(blk_op_str);
 
 static const struct {
         int errno;
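Note: blk_op_str() added above is aimed at debugging and tracing output. A minimal usage sketch (the pr_debug() call and the helper name are illustrative):

#include <linux/blkdev.h>

/* Log the symbolic name of a request's operation, e.g. "READ" or "FLUSH";
 * blk_op_str() returns "UNKNOWN" for out-of-range or unnamed values. */
static void example_log_rq_op(struct request *rq)
{
        pr_debug("op %u (%s)\n", req_op(rq), blk_op_str(req_op(rq)));
}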
@@ -223,6 +191,10 @@
 
         /* device mapper special case, should not leak out: */
         [BLK_STS_DM_REQUEUE]    = { -EREMCHG, "dm internal retry" },
+
+        /* zone device specific errors */
+        [BLK_STS_ZONE_OPEN_RESOURCE]    = { -ETOOMANYREFS, "open zones exceeded" },
+        [BLK_STS_ZONE_ACTIVE_RESOURCE]  = { -EOVERFLOW, "active zones exceeded" },
 
         /* everything else not covered above: */
         [BLK_STS_IOERR]         = { -EIO, "I/O" },
@@ -251,17 +223,23 @@
 }
 EXPORT_SYMBOL_GPL(blk_status_to_errno);
 
-static void print_req_error(struct request *req, blk_status_t status)
+static void print_req_error(struct request *req, blk_status_t status,
+                const char *caller)
 {
         int idx = (__force int)status;
 
         if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors)))
                 return;
 
-        printk_ratelimited(KERN_ERR "%s: %s error, dev %s, sector %llu\n",
-                                __func__, blk_errors[idx].name, req->rq_disk ?
-                                req->rq_disk->disk_name : "?",
-                                (unsigned long long)blk_rq_pos(req));
+        printk_ratelimited(KERN_ERR
+                "%s: %s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x "
+                "phys_seg %u prio class %u\n",
+                caller, blk_errors[idx].name,
+                req->rq_disk ? req->rq_disk->disk_name : "?",
+                blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)),
+                req->cmd_flags & ~REQ_OP_MASK,
+                req->nr_phys_segments,
+                IOPRIO_PRIO_CLASS(req->ioprio));
 }
 
 static void req_bio_endio(struct request *rq, struct bio *bio,
@@ -274,6 +252,17 @@
                 bio_set_flag(bio, BIO_QUIET);
 
         bio_advance(bio, nbytes);
+
+        if (req_op(rq) == REQ_OP_ZONE_APPEND && error == BLK_STS_OK) {
+                /*
+                 * Partial zone append completions cannot be supported as the
+                 * BIO fragments may end up not being written sequentially.
+                 */
+                if (bio->bi_iter.bi_size)
+                        bio->bi_status = BLK_STS_IOERR;
+                else
+                        bio->bi_iter.bi_sector = rq->__sector;
+        }
 
         /* don't actually finish bio if it's part of flush sequence */
         if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ))
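Note: the req_bio_endio() hunk above is what lets a REQ_OP_ZONE_APPEND submitter learn where its data actually landed: on success the block layer rewrites bio->bi_iter.bi_sector to the request's start sector, and partial completions are turned into errors. A hedged sketch of a completion handler consuming that value (the callback and context struct are hypothetical, not from this file):

#include <linux/bio.h>
#include <linux/blkdev.h>

/* Hypothetical per-bio context, used only for this sketch. */
struct zone_append_ctx {
        sector_t written_sector;
        int error;
};

/* bi_end_io for a zone-append bio: on success the block layer has stored
 * the actual write position in bio->bi_iter.bi_sector (see hunk above). */
static void example_zone_append_end_io(struct bio *bio)
{
        struct zone_append_ctx *ctx = bio->bi_private;

        ctx->error = blk_status_to_errno(bio->bi_status);
        if (!ctx->error)
                ctx->written_sector = bio->bi_iter.bi_sector;
        bio_put(bio);
}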
@@ -294,99 +283,6 @@
 }
 EXPORT_SYMBOL(blk_dump_rq_flags);
 
297
-static void blk_delay_work(struct work_struct *work)
298
-{
299
- struct request_queue *q;
300
-
301
- q = container_of(work, struct request_queue, delay_work.work);
302
- spin_lock_irq(q->queue_lock);
303
- __blk_run_queue(q);
304
- spin_unlock_irq(q->queue_lock);
305
-}
306
-
307
-/**
308
- * blk_delay_queue - restart queueing after defined interval
309
- * @q: The &struct request_queue in question
310
- * @msecs: Delay in msecs
311
- *
312
- * Description:
313
- * Sometimes queueing needs to be postponed for a little while, to allow
314
- * resources to come back. This function will make sure that queueing is
315
- * restarted around the specified time.
316
- */
317
-void blk_delay_queue(struct request_queue *q, unsigned long msecs)
318
-{
319
- lockdep_assert_held(q->queue_lock);
320
- WARN_ON_ONCE(q->mq_ops);
321
-
322
- if (likely(!blk_queue_dead(q)))
323
- queue_delayed_work(kblockd_workqueue, &q->delay_work,
324
- msecs_to_jiffies(msecs));
325
-}
326
-EXPORT_SYMBOL(blk_delay_queue);
327
-
328
-/**
329
- * blk_start_queue_async - asynchronously restart a previously stopped queue
330
- * @q: The &struct request_queue in question
331
- *
332
- * Description:
333
- * blk_start_queue_async() will clear the stop flag on the queue, and
334
- * ensure that the request_fn for the queue is run from an async
335
- * context.
336
- **/
337
-void blk_start_queue_async(struct request_queue *q)
338
-{
339
- lockdep_assert_held(q->queue_lock);
340
- WARN_ON_ONCE(q->mq_ops);
341
-
342
- queue_flag_clear(QUEUE_FLAG_STOPPED, q);
343
- blk_run_queue_async(q);
344
-}
345
-EXPORT_SYMBOL(blk_start_queue_async);
346
-
347
-/**
348
- * blk_start_queue - restart a previously stopped queue
349
- * @q: The &struct request_queue in question
350
- *
351
- * Description:
352
- * blk_start_queue() will clear the stop flag on the queue, and call
353
- * the request_fn for the queue if it was in a stopped state when
354
- * entered. Also see blk_stop_queue().
355
- **/
356
-void blk_start_queue(struct request_queue *q)
357
-{
358
- lockdep_assert_held(q->queue_lock);
359
- WARN_ON_ONCE(q->mq_ops);
360
-
361
- queue_flag_clear(QUEUE_FLAG_STOPPED, q);
362
- __blk_run_queue(q);
363
-}
364
-EXPORT_SYMBOL(blk_start_queue);
365
-
366
-/**
367
- * blk_stop_queue - stop a queue
368
- * @q: The &struct request_queue in question
369
- *
370
- * Description:
371
- * The Linux block layer assumes that a block driver will consume all
372
- * entries on the request queue when the request_fn strategy is called.
373
- * Often this will not happen, because of hardware limitations (queue
374
- * depth settings). If a device driver gets a 'queue full' response,
375
- * or if it simply chooses not to queue more I/O at one point, it can
376
- * call this function to prevent the request_fn from being called until
377
- * the driver has signalled it's ready to go again. This happens by calling
378
- * blk_start_queue() to restart queue operations.
379
- **/
380
-void blk_stop_queue(struct request_queue *q)
381
-{
382
- lockdep_assert_held(q->queue_lock);
383
- WARN_ON_ONCE(q->mq_ops);
384
-
385
- cancel_delayed_work(&q->delay_work);
386
- queue_flag_set(QUEUE_FLAG_STOPPED, q);
387
-}
388
-EXPORT_SYMBOL(blk_stop_queue);
389
-
390286 /**
391287 * blk_sync_queue - cancel any pending callbacks on a queue
392288 * @q: the queue
@@ -397,7 +293,7 @@
  * A block device may call blk_sync_queue to ensure that any
  * such activity is cancelled, thus allowing it to release resources
  * that the callbacks might use. The caller must already have made sure
- * that its ->make_request_fn will not re-add plugging prior to calling
+ * that its ->submit_bio will not re-add plugging prior to calling
  * this function.
  *
  * This function does not cancel any asynchronous activity arising
@@ -409,16 +305,6 @@
 {
         del_timer_sync(&q->timeout);
         cancel_work_sync(&q->timeout_work);
-
-        if (q->mq_ops) {
-                struct blk_mq_hw_ctx *hctx;
-                int i;
-
-                queue_for_each_hw_ctx(q, hctx, i)
-                        cancel_delayed_work_sync(&hctx->run_work);
-        } else {
-                cancel_delayed_work_sync(&q->delay_work);
-        }
 }
 EXPORT_SYMBOL(blk_sync_queue);
 
@@ -444,248 +330,20 @@
444330 EXPORT_SYMBOL_GPL(blk_clear_pm_only);
445331
446332 /**
447
- * __blk_run_queue_uncond - run a queue whether or not it has been stopped
448
- * @q: The queue to run
333
+ * blk_put_queue - decrement the request_queue refcount
334
+ * @q: the request_queue structure to decrement the refcount for
449335 *
450
- * Description:
451
- * Invoke request handling on a queue if there are any pending requests.
452
- * May be used to restart request handling after a request has completed.
453
- * This variant runs the queue whether or not the queue has been
454
- * stopped. Must be called with the queue lock held and interrupts
455
- * disabled. See also @blk_run_queue.
336
+ * Decrements the refcount of the request_queue kobject. When this reaches 0
337
+ * we'll have blk_release_queue() called.
338
+ *
339
+ * Context: Any context, but the last reference must not be dropped from
340
+ * atomic context.
456341 */
457
-inline void __blk_run_queue_uncond(struct request_queue *q)
458
-{
459
- lockdep_assert_held(q->queue_lock);
460
- WARN_ON_ONCE(q->mq_ops);
461
-
462
- if (unlikely(blk_queue_dead(q)))
463
- return;
464
-
465
- /*
466
- * Some request_fn implementations, e.g. scsi_request_fn(), unlock
467
- * the queue lock internally. As a result multiple threads may be
468
- * running such a request function concurrently. Keep track of the
469
- * number of active request_fn invocations such that blk_drain_queue()
470
- * can wait until all these request_fn calls have finished.
471
- */
472
- q->request_fn_active++;
473
- q->request_fn(q);
474
- q->request_fn_active--;
475
-}
476
-EXPORT_SYMBOL_GPL(__blk_run_queue_uncond);
477
-
478
-/**
479
- * __blk_run_queue - run a single device queue
480
- * @q: The queue to run
481
- *
482
- * Description:
483
- * See @blk_run_queue.
484
- */
485
-void __blk_run_queue(struct request_queue *q)
486
-{
487
- lockdep_assert_held(q->queue_lock);
488
- WARN_ON_ONCE(q->mq_ops);
489
-
490
- if (unlikely(blk_queue_stopped(q)))
491
- return;
492
-
493
- __blk_run_queue_uncond(q);
494
-}
495
-EXPORT_SYMBOL(__blk_run_queue);
496
-
497
-/**
498
- * blk_run_queue_async - run a single device queue in workqueue context
499
- * @q: The queue to run
500
- *
501
- * Description:
502
- * Tells kblockd to perform the equivalent of @blk_run_queue on behalf
503
- * of us.
504
- *
505
- * Note:
506
- * Since it is not allowed to run q->delay_work after blk_cleanup_queue()
507
- * has canceled q->delay_work, callers must hold the queue lock to avoid
508
- * race conditions between blk_cleanup_queue() and blk_run_queue_async().
509
- */
510
-void blk_run_queue_async(struct request_queue *q)
511
-{
512
- lockdep_assert_held(q->queue_lock);
513
- WARN_ON_ONCE(q->mq_ops);
514
-
515
- if (likely(!blk_queue_stopped(q) && !blk_queue_dead(q)))
516
- mod_delayed_work(kblockd_workqueue, &q->delay_work, 0);
517
-}
518
-EXPORT_SYMBOL(blk_run_queue_async);
519
-
520
-/**
521
- * blk_run_queue - run a single device queue
522
- * @q: The queue to run
523
- *
524
- * Description:
525
- * Invoke request handling on this queue, if it has pending work to do.
526
- * May be used to restart queueing when a request has completed.
527
- */
528
-void blk_run_queue(struct request_queue *q)
529
-{
530
- unsigned long flags;
531
-
532
- WARN_ON_ONCE(q->mq_ops);
533
-
534
- spin_lock_irqsave(q->queue_lock, flags);
535
- __blk_run_queue(q);
536
- spin_unlock_irqrestore(q->queue_lock, flags);
537
-}
538
-EXPORT_SYMBOL(blk_run_queue);
539
-
540342 void blk_put_queue(struct request_queue *q)
541343 {
542344 kobject_put(&q->kobj);
543345 }
544346 EXPORT_SYMBOL(blk_put_queue);
545
-
546
-/**
547
- * __blk_drain_queue - drain requests from request_queue
548
- * @q: queue to drain
549
- * @drain_all: whether to drain all requests or only the ones w/ ELVPRIV
550
- *
551
- * Drain requests from @q. If @drain_all is set, all requests are drained.
552
- * If not, only ELVPRIV requests are drained. The caller is responsible
553
- * for ensuring that no new requests which need to be drained are queued.
554
- */
555
-static void __blk_drain_queue(struct request_queue *q, bool drain_all)
556
- __releases(q->queue_lock)
557
- __acquires(q->queue_lock)
558
-{
559
- int i;
560
-
561
- lockdep_assert_held(q->queue_lock);
562
- WARN_ON_ONCE(q->mq_ops);
563
-
564
- while (true) {
565
- bool drain = false;
566
-
567
- /*
568
- * The caller might be trying to drain @q before its
569
- * elevator is initialized.
570
- */
571
- if (q->elevator)
572
- elv_drain_elevator(q);
573
-
574
- blkcg_drain_queue(q);
575
-
576
- /*
577
- * This function might be called on a queue which failed
578
- * driver init after queue creation or is not yet fully
579
- * active yet. Some drivers (e.g. fd and loop) get unhappy
580
- * in such cases. Kick queue iff dispatch queue has
581
- * something on it and @q has request_fn set.
582
- */
583
- if (!list_empty(&q->queue_head) && q->request_fn)
584
- __blk_run_queue(q);
585
-
586
- drain |= q->nr_rqs_elvpriv;
587
- drain |= q->request_fn_active;
588
-
589
- /*
590
- * Unfortunately, requests are queued at and tracked from
591
- * multiple places and there's no single counter which can
592
- * be drained. Check all the queues and counters.
593
- */
594
- if (drain_all) {
595
- struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL);
596
- drain |= !list_empty(&q->queue_head);
597
- for (i = 0; i < 2; i++) {
598
- drain |= q->nr_rqs[i];
599
- drain |= q->in_flight[i];
600
- if (fq)
601
- drain |= !list_empty(&fq->flush_queue[i]);
602
- }
603
- }
604
-
605
- if (!drain)
606
- break;
607
-
608
- spin_unlock_irq(q->queue_lock);
609
-
610
- msleep(10);
611
-
612
- spin_lock_irq(q->queue_lock);
613
- }
614
-
615
- /*
616
- * With queue marked dead, any woken up waiter will fail the
617
- * allocation path, so the wakeup chaining is lost and we're
618
- * left with hung waiters. We need to wake up those waiters.
619
- */
620
- if (q->request_fn) {
621
- struct request_list *rl;
622
-
623
- blk_queue_for_each_rl(rl, q)
624
- for (i = 0; i < ARRAY_SIZE(rl->wait); i++)
625
- wake_up_all(&rl->wait[i]);
626
- }
627
-}
628
-
629
-void blk_drain_queue(struct request_queue *q)
630
-{
631
- spin_lock_irq(q->queue_lock);
632
- __blk_drain_queue(q, true);
633
- spin_unlock_irq(q->queue_lock);
634
-}
635
-
636
-/**
637
- * blk_queue_bypass_start - enter queue bypass mode
638
- * @q: queue of interest
639
- *
640
- * In bypass mode, only the dispatch FIFO queue of @q is used. This
641
- * function makes @q enter bypass mode and drains all requests which were
642
- * throttled or issued before. On return, it's guaranteed that no request
643
- * is being throttled or has ELVPRIV set and blk_queue_bypass() %true
644
- * inside queue or RCU read lock.
645
- */
646
-void blk_queue_bypass_start(struct request_queue *q)
647
-{
648
- WARN_ON_ONCE(q->mq_ops);
649
-
650
- spin_lock_irq(q->queue_lock);
651
- q->bypass_depth++;
652
- queue_flag_set(QUEUE_FLAG_BYPASS, q);
653
- spin_unlock_irq(q->queue_lock);
654
-
655
- /*
656
- * Queues start drained. Skip actual draining till init is
657
- * complete. This avoids lenghty delays during queue init which
658
- * can happen many times during boot.
659
- */
660
- if (blk_queue_init_done(q)) {
661
- spin_lock_irq(q->queue_lock);
662
- __blk_drain_queue(q, false);
663
- spin_unlock_irq(q->queue_lock);
664
-
665
- /* ensure blk_queue_bypass() is %true inside RCU read lock */
666
- synchronize_rcu();
667
- }
668
-}
669
-EXPORT_SYMBOL_GPL(blk_queue_bypass_start);
670
-
671
-/**
672
- * blk_queue_bypass_end - leave queue bypass mode
673
- * @q: queue of interest
674
- *
675
- * Leave bypass mode and restore the normal queueing behavior.
676
- *
677
- * Note: although blk_queue_bypass_start() is only called for blk-sq queues,
678
- * this function is called for both blk-sq and blk-mq queues.
679
- */
680
-void blk_queue_bypass_end(struct request_queue *q)
681
-{
682
- spin_lock_irq(q->queue_lock);
683
- if (!--q->bypass_depth)
684
- queue_flag_clear(QUEUE_FLAG_BYPASS, q);
685
- WARN_ON_ONCE(q->bypass_depth < 0);
686
- spin_unlock_irq(q->queue_lock);
687
-}
688
-EXPORT_SYMBOL_GPL(blk_queue_bypass_end);
689347
 void blk_set_queue_dying(struct request_queue *q)
 {
@@ -698,54 +356,13 @@
  */
         blk_freeze_queue_start(q);
 
-        if (q->mq_ops)
+        if (queue_is_mq(q))
                 blk_mq_wake_waiters(q);
-        else {
-                struct request_list *rl;
-
-                spin_lock_irq(q->queue_lock);
-                blk_queue_for_each_rl(rl, q) {
-                        if (rl->rq_pool) {
-                                wake_up_all(&rl->wait[BLK_RW_SYNC]);
-                                wake_up_all(&rl->wait[BLK_RW_ASYNC]);
-                        }
-                }
-                spin_unlock_irq(q->queue_lock);
-        }
 
         /* Make blk_queue_enter() reexamine the DYING flag. */
         wake_up_all(&q->mq_freeze_wq);
 }
 EXPORT_SYMBOL_GPL(blk_set_queue_dying);
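Note: the hunk above replaces the open-coded q->mq_ops test with queue_is_mq(). A one-line sketch of the idiom as other call sites tend to use it (the surrounding helper is illustrative):

#include <linux/blkdev.h>
#include <linux/blk-mq.h>

/* Only poke the hardware queues when the queue is managed by blk-mq. */
static void example_kick_queue(struct request_queue *q)
{
        if (queue_is_mq(q))
                blk_mq_run_hw_queues(q, true);
}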
720
-
721
-/* Unconfigure the I/O scheduler and dissociate from the cgroup controller. */
722
-void blk_exit_queue(struct request_queue *q)
723
-{
724
- /*
725
- * Since the I/O scheduler exit code may access cgroup information,
726
- * perform I/O scheduler exit before disassociating from the block
727
- * cgroup controller.
728
- */
729
- if (q->elevator) {
730
- ioc_clear_queue(q);
731
- elevator_exit(q, q->elevator);
732
- q->elevator = NULL;
733
- }
734
-
735
- /*
736
- * Remove all references to @q from the block cgroup controller before
737
- * restoring @q->queue_lock to avoid that restoring this pointer causes
738
- * e.g. blkcg_print_blkgs() to crash.
739
- */
740
- blkcg_exit_queue(q);
741
-
742
- /*
743
- * Since the cgroup code may dereference the @q->backing_dev_info
744
- * pointer, only decrease its reference count after having removed the
745
- * association with the block cgroup controller.
746
- */
747
- bdi_put(q->backing_dev_info);
748
-}
749366
750367 /**
751368 * blk_cleanup_queue - shutdown a request queue
@@ -753,57 +370,32 @@
  *
  * Mark @q DYING, drain all pending requests, mark @q DEAD, destroy and
  * put it.  All future requests will be failed immediately with -ENODEV.
+ *
+ * Context: can sleep
  */
 void blk_cleanup_queue(struct request_queue *q)
 {
-        spinlock_t *lock = q->queue_lock;
+        /* cannot be called from atomic context */
+        might_sleep();
+
+        WARN_ON_ONCE(blk_queue_registered(q));
 
761383 /* mark @q DYING, no new request or merges will be allowed afterwards */
762
- mutex_lock(&q->sysfs_lock);
763384 blk_set_queue_dying(q);
764
- spin_lock_irq(lock);
765385
766
- /*
767
- * A dying queue is permanently in bypass mode till released. Note
768
- * that, unlike blk_queue_bypass_start(), we aren't performing
769
- * synchronize_rcu() after entering bypass mode to avoid the delay
770
- * as some drivers create and destroy a lot of queues while
771
- * probing. This is still safe because blk_release_queue() will be
772
- * called only after the queue refcnt drops to zero and nothing,
773
- * RCU or not, would be traversing the queue by then.
774
- */
775
- q->bypass_depth++;
776
- queue_flag_set(QUEUE_FLAG_BYPASS, q);
777
-
778
- queue_flag_set(QUEUE_FLAG_NOMERGES, q);
779
- queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
780
- queue_flag_set(QUEUE_FLAG_DYING, q);
781
- spin_unlock_irq(lock);
782
- mutex_unlock(&q->sysfs_lock);
386
+ blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q);
387
+ blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
783388
784389 /*
785390 * Drain all requests queued before DYING marking. Set DEAD flag to
786
- * prevent that q->request_fn() gets invoked after draining finished.
391
+ * prevent that blk_mq_run_hw_queues() accesses the hardware queues
392
+ * after draining finished.
787393 */
788394 blk_freeze_queue(q);
789395
790396 rq_qos_exit(q);
791397
792
- spin_lock_irq(lock);
793
- queue_flag_set(QUEUE_FLAG_DEAD, q);
794
- spin_unlock_irq(lock);
795
-
796
- /*
797
- * make sure all in-progress dispatch are completed because
798
- * blk_freeze_queue() can only complete all requests, and
799
- * dispatch may still be in-progress since we dispatch requests
800
- * from more than one contexts.
801
- *
802
- * We rely on driver to deal with the race in case that queue
803
- * initialization isn't done.
804
- */
805
- if (q->mq_ops && blk_queue_init_done(q))
806
- blk_mq_quiesce_queue(q);
398
+ blk_queue_flag_set(QUEUE_FLAG_DEAD, q);
807399
808400 /* for synchronous bio-based driver finish in-flight integrity i/o */
809401 blk_flush_integrity();
@@ -812,118 +404,37 @@
812404 del_timer_sync(&q->backing_dev_info->laptop_mode_wb_timer);
813405 blk_sync_queue(q);
814406
815
- /*
816
- * I/O scheduler exit is only safe after the sysfs scheduler attribute
817
- * has been removed.
818
- */
819
- WARN_ON_ONCE(q->kobj.state_in_sysfs);
820
-
821
- blk_exit_queue(q);
822
-
823
- if (q->mq_ops)
407
+ if (queue_is_mq(q))
824408 blk_mq_exit_queue(q);
825409
826
- percpu_ref_exit(&q->q_usage_counter);
410
+ /*
411
+ * In theory, request pool of sched_tags belongs to request queue.
412
+ * However, the current implementation requires tag_set for freeing
413
+ * requests, so free the pool now.
414
+ *
415
+ * Queue has become frozen, there can't be any in-queue requests, so
416
+ * it is safe to free requests now.
417
+ */
418
+ mutex_lock(&q->sysfs_lock);
419
+ if (q->elevator)
420
+ blk_mq_sched_free_requests(q);
421
+ mutex_unlock(&q->sysfs_lock);
827422
828
- spin_lock_irq(lock);
829
- if (q->queue_lock != &q->__queue_lock)
830
- q->queue_lock = &q->__queue_lock;
831
- spin_unlock_irq(lock);
423
+ percpu_ref_exit(&q->q_usage_counter);
832424
833425 /* @q is and will stay empty, shutdown and put */
834426 blk_put_queue(q);
835427 }
836428 EXPORT_SYMBOL(blk_cleanup_queue);
837429
838
-/* Allocate memory local to the request queue */
839
-static void *alloc_request_simple(gfp_t gfp_mask, void *data)
840
-{
841
- struct request_queue *q = data;
842
-
843
- return kmem_cache_alloc_node(request_cachep, gfp_mask, q->node);
844
-}
845
-
846
-static void free_request_simple(void *element, void *data)
847
-{
848
- kmem_cache_free(request_cachep, element);
849
-}
850
-
851
-static void *alloc_request_size(gfp_t gfp_mask, void *data)
852
-{
853
- struct request_queue *q = data;
854
- struct request *rq;
855
-
856
- rq = kmalloc_node(sizeof(struct request) + q->cmd_size, gfp_mask,
857
- q->node);
858
- if (rq && q->init_rq_fn && q->init_rq_fn(q, rq, gfp_mask) < 0) {
859
- kfree(rq);
860
- rq = NULL;
861
- }
862
- return rq;
863
-}
864
-
865
-static void free_request_size(void *element, void *data)
866
-{
867
- struct request_queue *q = data;
868
-
869
- if (q->exit_rq_fn)
870
- q->exit_rq_fn(q, element);
871
- kfree(element);
872
-}
873
-
874
-int blk_init_rl(struct request_list *rl, struct request_queue *q,
875
- gfp_t gfp_mask)
876
-{
877
- if (unlikely(rl->rq_pool) || q->mq_ops)
878
- return 0;
879
-
880
- rl->q = q;
881
- rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0;
882
- rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0;
883
- init_waitqueue_head(&rl->wait[BLK_RW_SYNC]);
884
- init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]);
885
-
886
- if (q->cmd_size) {
887
- rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ,
888
- alloc_request_size, free_request_size,
889
- q, gfp_mask, q->node);
890
- } else {
891
- rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ,
892
- alloc_request_simple, free_request_simple,
893
- q, gfp_mask, q->node);
894
- }
895
- if (!rl->rq_pool)
896
- return -ENOMEM;
897
-
898
- if (rl != &q->root_rl)
899
- WARN_ON_ONCE(!blk_get_queue(q));
900
-
901
- return 0;
902
-}
903
-
904
-void blk_exit_rl(struct request_queue *q, struct request_list *rl)
905
-{
906
- if (rl->rq_pool) {
907
- mempool_destroy(rl->rq_pool);
908
- if (rl != &q->root_rl)
909
- blk_put_queue(q);
910
- }
911
-}
912
-
913
-struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
914
-{
915
- return blk_alloc_queue_node(gfp_mask, NUMA_NO_NODE, NULL);
916
-}
917
-EXPORT_SYMBOL(blk_alloc_queue);
918
-
 /**
  * blk_queue_enter() - try to increase q->q_usage_counter
  * @q: request queue pointer
- * @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PREEMPT
+ * @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PM
  */
 int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
 {
-        const bool pm = flags & BLK_MQ_REQ_PREEMPT;
+        const bool pm = flags & BLK_MQ_REQ_PM;
 
         while (true) {
                 bool success = false;
@@ -959,12 +470,30 @@
                 smp_rmb();
 
                 wait_event(q->mq_freeze_wq,
-                           (atomic_read(&q->mq_freeze_depth) == 0 &&
-                            (pm || !blk_queue_pm_only(q))) ||
+                           (!q->mq_freeze_depth &&
+                            (pm || (blk_pm_request_resume(q),
+                                    !blk_queue_pm_only(q)))) ||
                            blk_queue_dying(q));
                 if (blk_queue_dying(q))
                         return -ENODEV;
         }
+}
+
+static inline int bio_queue_enter(struct bio *bio)
+{
+        struct request_queue *q = bio->bi_disk->queue;
+        bool nowait = bio->bi_opf & REQ_NOWAIT;
+        int ret;
+
+        ret = blk_queue_enter(q, nowait ? BLK_MQ_REQ_NOWAIT : 0);
+        if (unlikely(ret)) {
+                if (nowait && !blk_queue_dying(q))
+                        bio_wouldblock_error(bio);
+                else
+                        bio_io_error(bio);
+        }
+
+        return ret;
 }
 
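Note: bio_queue_enter() added above packages the q_usage_counter reference pattern for bios. A minimal sketch of the underlying blk_queue_enter()/blk_queue_exit() pairing as a non-bio caller might use it (the wrapper and the "do work" placeholder are illustrative):

#include <linux/blkdev.h>
#include <linux/blk-mq.h>

/* Take a usage reference on the queue, do work against it, drop it again;
 * BLK_MQ_REQ_NOWAIT makes the enter fail instead of sleeping on a freeze. */
static int example_with_queue_ref(struct request_queue *q)
{
        int ret;

        ret = blk_queue_enter(q, BLK_MQ_REQ_NOWAIT);
        if (ret)
                return ret;     /* queue is dying or frozen */

        /* ... work that relies on q->q_usage_counter being elevated ... */

        blk_queue_exit(q);
        return 0;
}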
 void blk_queue_exit(struct request_queue *q)
@@ -987,40 +516,23 @@
987516 kblockd_schedule_work(&q->timeout_work);
988517 }
989518
990
-static void blk_timeout_work_dummy(struct work_struct *work)
519
+static void blk_timeout_work(struct work_struct *work)
991520 {
992521 }
993522
994
-/**
995
- * blk_alloc_queue_node - allocate a request queue
996
- * @gfp_mask: memory allocation flags
997
- * @node_id: NUMA node to allocate memory from
998
- * @lock: For legacy queues, pointer to a spinlock that will be used to e.g.
999
- * serialize calls to the legacy .request_fn() callback. Ignored for
1000
- * blk-mq request queues.
1001
- *
1002
- * Note: pass the queue lock as the third argument to this function instead of
1003
- * setting the queue lock pointer explicitly to avoid triggering a sporadic
1004
- * crash in the blkcg code. This function namely calls blkcg_init_queue() and
1005
- * the queue lock pointer must be set before blkcg_init_queue() is called.
1006
- */
1007
-struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id,
1008
- spinlock_t *lock)
523
+struct request_queue *blk_alloc_queue(int node_id)
1009524 {
1010525 struct request_queue *q;
1011526 int ret;
1012527
1013528 q = kmem_cache_alloc_node(blk_requestq_cachep,
1014
- gfp_mask | __GFP_ZERO, node_id);
529
+ GFP_KERNEL | __GFP_ZERO, node_id);
1015530 if (!q)
1016531 return NULL;
1017532
1018
- INIT_LIST_HEAD(&q->queue_head);
1019533 q->last_merge = NULL;
1020
- q->end_sector = 0;
1021
- q->boundary_rq = NULL;
1022534
1023
- q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask);
535
+ q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL);
1024536 if (q->id < 0)
1025537 goto fail_q;
1026538
@@ -1028,7 +540,7 @@
1028540 if (ret)
1029541 goto fail_id;
1030542
1031
- q->backing_dev_info = bdi_alloc_node(gfp_mask, node_id);
543
+ q->backing_dev_info = bdi_alloc(node_id);
1032544 if (!q->backing_dev_info)
1033545 goto fail_split;
1034546
@@ -1036,46 +548,28 @@
1036548 if (!q->stats)
1037549 goto fail_stats;
1038550
1039
- q->backing_dev_info->ra_pages =
1040
- (VM_MAX_READAHEAD * 1024) / PAGE_SIZE;
1041
- q->backing_dev_info->io_pages =
1042
- (VM_MAX_READAHEAD * 1024) / PAGE_SIZE;
1043
- q->backing_dev_info->capabilities = BDI_CAP_CGROUP_WRITEBACK;
1044
- q->backing_dev_info->name = "block";
1045551 q->node = node_id;
552
+
553
+ atomic_set(&q->nr_active_requests_shared_sbitmap, 0);
1046554
1047555 timer_setup(&q->backing_dev_info->laptop_mode_wb_timer,
1048556 laptop_mode_timer_fn, 0);
1049557 timer_setup(&q->timeout, blk_rq_timed_out_timer, 0);
1050
- INIT_WORK(&q->timeout_work, blk_timeout_work_dummy);
1051
- INIT_LIST_HEAD(&q->timeout_list);
558
+ INIT_WORK(&q->timeout_work, blk_timeout_work);
1052559 INIT_LIST_HEAD(&q->icq_list);
1053560 #ifdef CONFIG_BLK_CGROUP
1054561 INIT_LIST_HEAD(&q->blkg_list);
1055562 #endif
1056
- INIT_DELAYED_WORK(&q->delay_work, blk_delay_work);
1057563
1058564 kobject_init(&q->kobj, &blk_queue_ktype);
1059565
1060
-#ifdef CONFIG_BLK_DEV_IO_TRACE
1061
- mutex_init(&q->blk_trace_mutex);
1062
-#endif
566
+ mutex_init(&q->debugfs_mutex);
1063567 mutex_init(&q->sysfs_lock);
1064
- spin_lock_init(&q->__queue_lock);
1065
-
1066
- if (!q->mq_ops)
1067
- q->queue_lock = lock ? : &q->__queue_lock;
1068
-
1069
- /*
1070
- * A queue starts its life with bypass turned on to avoid
1071
- * unnecessary bypass on/off overhead and nasty surprises during
1072
- * init. The initial bypass will be finished when the queue is
1073
- * registered by blk_register_queue().
1074
- */
1075
- q->bypass_depth = 1;
1076
- queue_flag_set_unlocked(QUEUE_FLAG_BYPASS, q);
568
+ mutex_init(&q->sysfs_dir_lock);
569
+ spin_lock_init(&q->queue_lock);
1077570
1078571 init_waitqueue_head(&q->mq_freeze_wq);
572
+ mutex_init(&q->mq_freeze_lock);
1079573
1080574 /*
1081575 * Init percpu_ref in atomic mode so that it's faster to shutdown.
@@ -1088,6 +582,10 @@
1088582
1089583 if (blkcg_init_queue(q))
1090584 goto fail_ref;
585
+
586
+ blk_queue_dma_alignment(q, 511);
587
+ blk_set_default_limits(&q->limits);
588
+ q->nr_requests = BLKDEV_MAX_RQ;
1091589
1092590 return q;
1093591
@@ -1105,107 +603,16 @@
1105603 kmem_cache_free(blk_requestq_cachep, q);
1106604 return NULL;
1107605 }
1108
-EXPORT_SYMBOL(blk_alloc_queue_node);
606
+EXPORT_SYMBOL(blk_alloc_queue);
1109607
1110608 /**
1111
- * blk_init_queue - prepare a request queue for use with a block device
1112
- * @rfn: The function to be called to process requests that have been
1113
- * placed on the queue.
1114
- * @lock: Request queue spin lock
609
+ * blk_get_queue - increment the request_queue refcount
610
+ * @q: the request_queue structure to increment the refcount for
1115611 *
1116
- * Description:
1117
- * If a block device wishes to use the standard request handling procedures,
1118
- * which sorts requests and coalesces adjacent requests, then it must
1119
- * call blk_init_queue(). The function @rfn will be called when there
1120
- * are requests on the queue that need to be processed. If the device
1121
- * supports plugging, then @rfn may not be called immediately when requests
1122
- * are available on the queue, but may be called at some time later instead.
1123
- * Plugged queues are generally unplugged when a buffer belonging to one
1124
- * of the requests on the queue is needed, or due to memory pressure.
612
+ * Increment the refcount of the request_queue kobject.
1125613 *
1126
- * @rfn is not required, or even expected, to remove all requests off the
1127
- * queue, but only as many as it can handle at a time. If it does leave
1128
- * requests on the queue, it is responsible for arranging that the requests
1129
- * get dealt with eventually.
1130
- *
1131
- * The queue spin lock must be held while manipulating the requests on the
1132
- * request queue; this lock will be taken also from interrupt context, so irq
1133
- * disabling is needed for it.
1134
- *
1135
- * Function returns a pointer to the initialized request queue, or %NULL if
1136
- * it didn't succeed.
1137
- *
1138
- * Note:
1139
- * blk_init_queue() must be paired with a blk_cleanup_queue() call
1140
- * when the block device is deactivated (such as at module unload).
1141
- **/
1142
-
1143
-struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock)
1144
-{
1145
- return blk_init_queue_node(rfn, lock, NUMA_NO_NODE);
1146
-}
1147
-EXPORT_SYMBOL(blk_init_queue);
1148
-
1149
-struct request_queue *
1150
-blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
1151
-{
1152
- struct request_queue *q;
1153
-
1154
- q = blk_alloc_queue_node(GFP_KERNEL, node_id, lock);
1155
- if (!q)
1156
- return NULL;
1157
-
1158
- q->request_fn = rfn;
1159
- if (blk_init_allocated_queue(q) < 0) {
1160
- blk_cleanup_queue(q);
1161
- return NULL;
1162
- }
1163
-
1164
- return q;
1165
-}
1166
-EXPORT_SYMBOL(blk_init_queue_node);
1167
-
1168
-static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio);
1169
-
1170
-
1171
-int blk_init_allocated_queue(struct request_queue *q)
1172
-{
1173
- WARN_ON_ONCE(q->mq_ops);
1174
-
1175
- q->fq = blk_alloc_flush_queue(q, NUMA_NO_NODE, q->cmd_size, GFP_KERNEL);
1176
- if (!q->fq)
1177
- return -ENOMEM;
1178
-
1179
- if (q->init_rq_fn && q->init_rq_fn(q, q->fq->flush_rq, GFP_KERNEL))
1180
- goto out_free_flush_queue;
1181
-
1182
- if (blk_init_rl(&q->root_rl, q, GFP_KERNEL))
1183
- goto out_exit_flush_rq;
1184
-
1185
- INIT_WORK(&q->timeout_work, blk_timeout_work);
1186
- q->queue_flags |= QUEUE_FLAG_DEFAULT;
1187
-
1188
- /*
1189
- * This also sets hw/phys segments, boundary and size
1190
- */
1191
- blk_queue_make_request(q, blk_queue_bio);
1192
-
1193
- q->sg_reserved_size = INT_MAX;
1194
-
1195
- if (elevator_init(q))
1196
- goto out_exit_flush_rq;
1197
- return 0;
1198
-
1199
-out_exit_flush_rq:
1200
- if (q->exit_rq_fn)
1201
- q->exit_rq_fn(q, q->fq->flush_rq);
1202
-out_free_flush_queue:
1203
- blk_free_flush_queue(q->fq);
1204
- q->fq = NULL;
1205
- return -ENOMEM;
1206
-}
1207
-EXPORT_SYMBOL(blk_init_allocated_queue);
1208
-
+ * Context: Any context.
+ */
 bool blk_get_queue(struct request_queue *q)
 {
         if (likely(!blk_queue_dying(q))) {
@@ -1216,406 +623,6 @@
                 return false;
 }
 EXPORT_SYMBOL(blk_get_queue);
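Note: the kobject-refcount comments added for blk_get_queue()/blk_put_queue() spell out the get/put contract. A short sketch of the pairing (the caller is illustrative; per the new comment, the final put must not happen from atomic context):

#include <linux/blkdev.h>

/* Pin a request_queue while inspecting it, then release the reference. */
static void example_inspect_queue(struct request_queue *q)
{
        if (!blk_get_queue(q))          /* fails once the queue is dying */
                return;

        /* ... safely dereference q here ... */

        blk_put_queue(q);               /* may free q; not from atomic context */
}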
1219
-
1220
-static inline void blk_free_request(struct request_list *rl, struct request *rq)
1221
-{
1222
- if (rq->rq_flags & RQF_ELVPRIV) {
1223
- elv_put_request(rl->q, rq);
1224
- if (rq->elv.icq)
1225
- put_io_context(rq->elv.icq->ioc);
1226
- }
1227
-
1228
- mempool_free(rq, rl->rq_pool);
1229
-}
1230
-
1231
-/*
1232
- * ioc_batching returns true if the ioc is a valid batching request and
1233
- * should be given priority access to a request.
1234
- */
1235
-static inline int ioc_batching(struct request_queue *q, struct io_context *ioc)
1236
-{
1237
- if (!ioc)
1238
- return 0;
1239
-
1240
- /*
1241
- * Make sure the process is able to allocate at least 1 request
1242
- * even if the batch times out, otherwise we could theoretically
1243
- * lose wakeups.
1244
- */
1245
- return ioc->nr_batch_requests == q->nr_batching ||
1246
- (ioc->nr_batch_requests > 0
1247
- && time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME));
1248
-}
1249
-
1250
-/*
1251
- * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This
1252
- * will cause the process to be a "batcher" on all queues in the system. This
1253
- * is the behaviour we want though - once it gets a wakeup it should be given
1254
- * a nice run.
1255
- */
1256
-static void ioc_set_batching(struct request_queue *q, struct io_context *ioc)
1257
-{
1258
- if (!ioc || ioc_batching(q, ioc))
1259
- return;
1260
-
1261
- ioc->nr_batch_requests = q->nr_batching;
1262
- ioc->last_waited = jiffies;
1263
-}
1264
-
1265
-static void __freed_request(struct request_list *rl, int sync)
1266
-{
1267
- struct request_queue *q = rl->q;
1268
-
1269
- if (rl->count[sync] < queue_congestion_off_threshold(q))
1270
- blk_clear_congested(rl, sync);
1271
-
1272
- if (rl->count[sync] + 1 <= q->nr_requests) {
1273
- if (waitqueue_active(&rl->wait[sync]))
1274
- wake_up(&rl->wait[sync]);
1275
-
1276
- blk_clear_rl_full(rl, sync);
1277
- }
1278
-}
1279
-
1280
-/*
1281
- * A request has just been released. Account for it, update the full and
1282
- * congestion status, wake up any waiters. Called under q->queue_lock.
1283
- */
1284
-static void freed_request(struct request_list *rl, bool sync,
1285
- req_flags_t rq_flags)
1286
-{
1287
- struct request_queue *q = rl->q;
1288
-
1289
- q->nr_rqs[sync]--;
1290
- rl->count[sync]--;
1291
- if (rq_flags & RQF_ELVPRIV)
1292
- q->nr_rqs_elvpriv--;
1293
-
1294
- __freed_request(rl, sync);
1295
-
1296
- if (unlikely(rl->starved[sync ^ 1]))
1297
- __freed_request(rl, sync ^ 1);
1298
-}
1299
-
1300
-int blk_update_nr_requests(struct request_queue *q, unsigned int nr)
1301
-{
1302
- struct request_list *rl;
1303
- int on_thresh, off_thresh;
1304
-
1305
- WARN_ON_ONCE(q->mq_ops);
1306
-
1307
- spin_lock_irq(q->queue_lock);
1308
- q->nr_requests = nr;
1309
- blk_queue_congestion_threshold(q);
1310
- on_thresh = queue_congestion_on_threshold(q);
1311
- off_thresh = queue_congestion_off_threshold(q);
1312
-
1313
- blk_queue_for_each_rl(rl, q) {
1314
- if (rl->count[BLK_RW_SYNC] >= on_thresh)
1315
- blk_set_congested(rl, BLK_RW_SYNC);
1316
- else if (rl->count[BLK_RW_SYNC] < off_thresh)
1317
- blk_clear_congested(rl, BLK_RW_SYNC);
1318
-
1319
- if (rl->count[BLK_RW_ASYNC] >= on_thresh)
1320
- blk_set_congested(rl, BLK_RW_ASYNC);
1321
- else if (rl->count[BLK_RW_ASYNC] < off_thresh)
1322
- blk_clear_congested(rl, BLK_RW_ASYNC);
1323
-
1324
- if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
1325
- blk_set_rl_full(rl, BLK_RW_SYNC);
1326
- } else {
1327
- blk_clear_rl_full(rl, BLK_RW_SYNC);
1328
- wake_up(&rl->wait[BLK_RW_SYNC]);
1329
- }
1330
-
1331
- if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) {
1332
- blk_set_rl_full(rl, BLK_RW_ASYNC);
1333
- } else {
1334
- blk_clear_rl_full(rl, BLK_RW_ASYNC);
1335
- wake_up(&rl->wait[BLK_RW_ASYNC]);
1336
- }
1337
- }
1338
-
1339
- spin_unlock_irq(q->queue_lock);
1340
- return 0;
1341
-}
1342
-
1343
-/**
1344
- * __get_request - get a free request
1345
- * @rl: request list to allocate from
1346
- * @op: operation and flags
1347
- * @bio: bio to allocate request for (can be %NULL)
1348
- * @flags: BLQ_MQ_REQ_* flags
1349
- * @gfp_mask: allocator flags
1350
- *
1351
- * Get a free request from @q. This function may fail under memory
1352
- * pressure or if @q is dead.
1353
- *
1354
- * Must be called with @q->queue_lock held and,
1355
- * Returns ERR_PTR on failure, with @q->queue_lock held.
1356
- * Returns request pointer on success, with @q->queue_lock *not held*.
1357
- */
1358
-static struct request *__get_request(struct request_list *rl, unsigned int op,
1359
- struct bio *bio, blk_mq_req_flags_t flags, gfp_t gfp_mask)
1360
-{
1361
- struct request_queue *q = rl->q;
1362
- struct request *rq;
1363
- struct elevator_type *et = q->elevator->type;
1364
- struct io_context *ioc = rq_ioc(bio);
1365
- struct io_cq *icq = NULL;
1366
- const bool is_sync = op_is_sync(op);
1367
- int may_queue;
1368
- req_flags_t rq_flags = RQF_ALLOCED;
1369
-
1370
- lockdep_assert_held(q->queue_lock);
1371
-
1372
- if (unlikely(blk_queue_dying(q)))
1373
- return ERR_PTR(-ENODEV);
1374
-
1375
- may_queue = elv_may_queue(q, op);
1376
- if (may_queue == ELV_MQUEUE_NO)
1377
- goto rq_starved;
1378
-
1379
- if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) {
1380
- if (rl->count[is_sync]+1 >= q->nr_requests) {
1381
- /*
1382
- * The queue will fill after this allocation, so set
1383
- * it as full, and mark this process as "batching".
1384
- * This process will be allowed to complete a batch of
1385
- * requests, others will be blocked.
1386
- */
1387
- if (!blk_rl_full(rl, is_sync)) {
1388
- ioc_set_batching(q, ioc);
1389
- blk_set_rl_full(rl, is_sync);
1390
- } else {
1391
- if (may_queue != ELV_MQUEUE_MUST
1392
- && !ioc_batching(q, ioc)) {
1393
- /*
1394
- * The queue is full and the allocating
1395
- * process is not a "batcher", and not
1396
- * exempted by the IO scheduler
1397
- */
1398
- return ERR_PTR(-ENOMEM);
1399
- }
1400
- }
1401
- }
1402
- blk_set_congested(rl, is_sync);
1403
- }
1404
-
1405
- /*
1406
- * Only allow batching queuers to allocate up to 50% over the defined
1407
- * limit of requests, otherwise we could have thousands of requests
1408
- * allocated with any setting of ->nr_requests
1409
- */
1410
- if (rl->count[is_sync] >= (3 * q->nr_requests / 2))
1411
- return ERR_PTR(-ENOMEM);
1412
-
1413
- q->nr_rqs[is_sync]++;
1414
- rl->count[is_sync]++;
1415
- rl->starved[is_sync] = 0;
1416
-
1417
- /*
1418
- * Decide whether the new request will be managed by elevator. If
1419
- * so, mark @rq_flags and increment elvpriv. Non-zero elvpriv will
1420
- * prevent the current elevator from being destroyed until the new
1421
- * request is freed. This guarantees icq's won't be destroyed and
1422
- * makes creating new ones safe.
1423
- *
1424
- * Flush requests do not use the elevator so skip initialization.
1425
- * This allows a request to share the flush and elevator data.
1426
- *
1427
- * Also, lookup icq while holding queue_lock. If it doesn't exist,
1428
- * it will be created after releasing queue_lock.
1429
- */
1430
- if (!op_is_flush(op) && !blk_queue_bypass(q)) {
1431
- rq_flags |= RQF_ELVPRIV;
1432
- q->nr_rqs_elvpriv++;
1433
- if (et->icq_cache && ioc)
1434
- icq = ioc_lookup_icq(ioc, q);
1435
- }
1436
-
1437
- if (blk_queue_io_stat(q))
1438
- rq_flags |= RQF_IO_STAT;
1439
- spin_unlock_irq(q->queue_lock);
1440
-
1441
- /* allocate and init request */
1442
- rq = mempool_alloc(rl->rq_pool, gfp_mask);
1443
- if (!rq)
1444
- goto fail_alloc;
1445
-
1446
- blk_rq_init(q, rq);
1447
- blk_rq_set_rl(rq, rl);
1448
- rq->cmd_flags = op;
1449
- rq->rq_flags = rq_flags;
1450
- if (flags & BLK_MQ_REQ_PREEMPT)
1451
- rq->rq_flags |= RQF_PREEMPT;
1452
-
1453
- /* init elvpriv */
1454
- if (rq_flags & RQF_ELVPRIV) {
1455
- if (unlikely(et->icq_cache && !icq)) {
1456
- if (ioc)
1457
- icq = ioc_create_icq(ioc, q, gfp_mask);
1458
- if (!icq)
1459
- goto fail_elvpriv;
1460
- }
1461
-
1462
- rq->elv.icq = icq;
1463
- if (unlikely(elv_set_request(q, rq, bio, gfp_mask)))
1464
- goto fail_elvpriv;
1465
-
1466
- /* @rq->elv.icq holds io_context until @rq is freed */
1467
- if (icq)
1468
- get_io_context(icq->ioc);
1469
- }
1470
-out:
1471
- /*
1472
- * ioc may be NULL here, and ioc_batching will be false. That's
1473
- * OK, if the queue is under the request limit then requests need
1474
- * not count toward the nr_batch_requests limit. There will always
1475
- * be some limit enforced by BLK_BATCH_TIME.
1476
- */
1477
- if (ioc_batching(q, ioc))
1478
- ioc->nr_batch_requests--;
1479
-
1480
- trace_block_getrq(q, bio, op);
1481
- return rq;
1482
-
1483
-fail_elvpriv:
1484
- /*
1485
- * elvpriv init failed. ioc, icq and elvpriv aren't mempool backed
1486
- * and may fail indefinitely under memory pressure and thus
1487
- * shouldn't stall IO. Treat this request as !elvpriv. This will
1488
- * disturb iosched and blkcg but weird is bettern than dead.
1489
- */
1490
- printk_ratelimited(KERN_WARNING "%s: dev %s: request aux data allocation failed, iosched may be disturbed\n",
1491
- __func__, dev_name(q->backing_dev_info->dev));
1492
-
1493
- rq->rq_flags &= ~RQF_ELVPRIV;
1494
- rq->elv.icq = NULL;
1495
-
1496
- spin_lock_irq(q->queue_lock);
1497
- q->nr_rqs_elvpriv--;
1498
- spin_unlock_irq(q->queue_lock);
1499
- goto out;
1500
-
1501
-fail_alloc:
1502
- /*
1503
- * Allocation failed presumably due to memory. Undo anything we
1504
- * might have messed up.
1505
- *
1506
- * Allocating task should really be put onto the front of the wait
1507
- * queue, but this is pretty rare.
1508
- */
1509
- spin_lock_irq(q->queue_lock);
1510
- freed_request(rl, is_sync, rq_flags);
1511
-
1512
- /*
1513
- * in the very unlikely event that allocation failed and no
1514
- * requests for this direction was pending, mark us starved so that
1515
- * freeing of a request in the other direction will notice
1516
- * us. another possible fix would be to split the rq mempool into
1517
- * READ and WRITE
1518
- */
1519
-rq_starved:
1520
- if (unlikely(rl->count[is_sync] == 0))
1521
- rl->starved[is_sync] = 1;
1522
- return ERR_PTR(-ENOMEM);
1523
-}
1524
-
1525
-/**
1526
- * get_request - get a free request
1527
- * @q: request_queue to allocate request from
1528
- * @op: operation and flags
1529
- * @bio: bio to allocate request for (can be %NULL)
1530
- * @flags: BLK_MQ_REQ_* flags.
1531
- * @gfp: allocator flags
1532
- *
1533
- * Get a free request from @q. If %BLK_MQ_REQ_NOWAIT is set in @flags,
1534
- * this function keeps retrying under memory pressure and fails iff @q is dead.
1535
- *
1536
- * Must be called with @q->queue_lock held and,
1537
- * Returns ERR_PTR on failure, with @q->queue_lock held.
1538
- * Returns request pointer on success, with @q->queue_lock *not held*.
1539
- */
1540
-static struct request *get_request(struct request_queue *q, unsigned int op,
1541
- struct bio *bio, blk_mq_req_flags_t flags, gfp_t gfp)
1542
-{
1543
- const bool is_sync = op_is_sync(op);
1544
- DEFINE_WAIT(wait);
1545
- struct request_list *rl;
1546
- struct request *rq;
1547
-
1548
- lockdep_assert_held(q->queue_lock);
1549
- WARN_ON_ONCE(q->mq_ops);
1550
-
1551
- rl = blk_get_rl(q, bio); /* transferred to @rq on success */
1552
-retry:
1553
- rq = __get_request(rl, op, bio, flags, gfp);
1554
- if (!IS_ERR(rq))
1555
- return rq;
1556
-
1557
- if (op & REQ_NOWAIT) {
1558
- blk_put_rl(rl);
1559
- return ERR_PTR(-EAGAIN);
1560
- }
1561
-
1562
- if ((flags & BLK_MQ_REQ_NOWAIT) || unlikely(blk_queue_dying(q))) {
1563
- blk_put_rl(rl);
1564
- return rq;
1565
- }
1566
-
1567
- /* wait on @rl and retry */
1568
- prepare_to_wait_exclusive(&rl->wait[is_sync], &wait,
1569
- TASK_UNINTERRUPTIBLE);
1570
-
1571
- trace_block_sleeprq(q, bio, op);
1572
-
1573
- spin_unlock_irq(q->queue_lock);
1574
- io_schedule();
1575
-
1576
- /*
1577
- * After sleeping, we become a "batching" process and will be able
1578
- * to allocate at least one request, and up to a big batch of them
1579
- * for a small period time. See ioc_batching, ioc_set_batching
1580
- */
1581
- ioc_set_batching(q, current->io_context);
1582
-
1583
- spin_lock_irq(q->queue_lock);
1584
- finish_wait(&rl->wait[is_sync], &wait);
1585
-
1586
- goto retry;
1587
-}
1588
-
1589
-/* flags: BLK_MQ_REQ_PREEMPT and/or BLK_MQ_REQ_NOWAIT. */
1590
-static struct request *blk_old_get_request(struct request_queue *q,
1591
- unsigned int op, blk_mq_req_flags_t flags)
1592
-{
1593
- struct request *rq;
1594
- gfp_t gfp_mask = flags & BLK_MQ_REQ_NOWAIT ? GFP_ATOMIC : GFP_NOIO;
1595
- int ret = 0;
1596
-
1597
- WARN_ON_ONCE(q->mq_ops);
1598
-
1599
- /* create ioc upfront */
1600
- create_io_context(gfp_mask, q->node);
1601
-
1602
- ret = blk_queue_enter(q, flags);
1603
- if (ret)
1604
- return ERR_PTR(ret);
1605
- spin_lock_irq(q->queue_lock);
1606
- rq = get_request(q, op, NULL, flags, gfp_mask);
1607
- if (IS_ERR(rq)) {
1608
- spin_unlock_irq(q->queue_lock);
1609
- blk_queue_exit(q);
1610
- return rq;
1611
- }
1612
-
1613
- /* q->queue_lock is unlocked at this point */
1614
- rq->__data_len = 0;
1615
- rq->__sector = (sector_t) -1;
1616
- rq->bio = rq->biotail = NULL;
1617
- return rq;
1618
-}
1619626
 /**
  * blk_get_request - allocate a request
@@ -1629,511 +636,30 @@
         struct request *req;
 
         WARN_ON_ONCE(op & REQ_NOWAIT);
-        WARN_ON_ONCE(flags & ~(BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PREEMPT));
+        WARN_ON_ONCE(flags & ~(BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PM));
 
-        if (q->mq_ops) {
-                req = blk_mq_alloc_request(q, op, flags);
-                if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn)
-                        q->mq_ops->initialize_rq_fn(req);
-        } else {
-                req = blk_old_get_request(q, op, flags);
-                if (!IS_ERR(req) && q->initialize_rq_fn)
-                        q->initialize_rq_fn(req);
-        }
+        req = blk_mq_alloc_request(q, op, flags);
+        if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn)
+                q->mq_ops->initialize_rq_fn(req);
 
         return req;
 }
 EXPORT_SYMBOL(blk_get_request);
 
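Note: after this hunk blk_get_request() is a thin wrapper around blk_mq_alloc_request(), and blk_put_request() (further below) reduces to blk_mq_free_request(). A hedged sketch of the allocate/use/free pattern a driver might follow (the REQ_OP_DRV_IN choice and the helper are illustrative only):

#include <linux/blkdev.h>
#include <linux/err.h>

/* Allocate a driver-private request, let the driver prep it, then free it. */
static int example_alloc_and_free_rq(struct request_queue *q)
{
        struct request *rq;

        rq = blk_get_request(q, REQ_OP_DRV_IN, 0);
        if (IS_ERR(rq))
                return PTR_ERR(rq);

        /* ... fill in and execute the request, e.g. via blk_execute_rq() ... */

        blk_put_request(rq);
        return 0;
}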
1648
-/**
1649
- * blk_requeue_request - put a request back on queue
1650
- * @q: request queue where request should be inserted
1651
- * @rq: request to be inserted
1652
- *
1653
- * Description:
1654
- * Drivers often keep queueing requests until the hardware cannot accept
1655
- * more, when that condition happens we need to put the request back
1656
- * on the queue. Must be called with queue lock held.
1657
- */
1658
-void blk_requeue_request(struct request_queue *q, struct request *rq)
1659
-{
1660
- lockdep_assert_held(q->queue_lock);
1661
- WARN_ON_ONCE(q->mq_ops);
1662
-
1663
- blk_delete_timer(rq);
1664
- blk_clear_rq_complete(rq);
1665
- trace_block_rq_requeue(q, rq);
1666
- rq_qos_requeue(q, rq);
1667
-
1668
- if (rq->rq_flags & RQF_QUEUED)
1669
- blk_queue_end_tag(q, rq);
1670
-
1671
- BUG_ON(blk_queued_rq(rq));
1672
-
1673
- elv_requeue_request(q, rq);
1674
-}
1675
-EXPORT_SYMBOL(blk_requeue_request);
1676
-
1677
-static void add_acct_request(struct request_queue *q, struct request *rq,
1678
- int where)
1679
-{
1680
- blk_account_io_start(rq, true);
1681
- __elv_add_request(q, rq, where);
1682
-}
1683
-
1684
-static void part_round_stats_single(struct request_queue *q, int cpu,
1685
- struct hd_struct *part, unsigned long now,
1686
- unsigned int inflight)
1687
-{
1688
- if (inflight) {
1689
- __part_stat_add(cpu, part, time_in_queue,
1690
- inflight * (now - part->stamp));
1691
- __part_stat_add(cpu, part, io_ticks, (now - part->stamp));
1692
- }
1693
- part->stamp = now;
1694
-}
1695
-
1696
-/**
1697
- * part_round_stats() - Round off the performance stats on a struct disk_stats.
1698
- * @q: target block queue
1699
- * @cpu: cpu number for stats access
1700
- * @part: target partition
1701
- *
1702
- * The average IO queue length and utilisation statistics are maintained
1703
- * by observing the current state of the queue length and the amount of
1704
- * time it has been in this state for.
1705
- *
1706
- * Normally, that accounting is done on IO completion, but that can result
1707
- * in more than a second's worth of IO being accounted for within any one
1708
- * second, leading to >100% utilisation. To deal with that, we call this
1709
- * function to do a round-off before returning the results when reading
1710
- * /proc/diskstats. This accounts immediately for all queue usage up to
1711
- * the current jiffies and restarts the counters again.
1712
- */
1713
-void part_round_stats(struct request_queue *q, int cpu, struct hd_struct *part)
1714
-{
1715
- struct hd_struct *part2 = NULL;
1716
- unsigned long now = jiffies;
1717
- unsigned int inflight[2];
1718
- int stats = 0;
1719
-
1720
- if (part->stamp != now)
1721
- stats |= 1;
1722
-
1723
- if (part->partno) {
1724
- part2 = &part_to_disk(part)->part0;
1725
- if (part2->stamp != now)
1726
- stats |= 2;
1727
- }
1728
-
1729
- if (!stats)
1730
- return;
1731
-
1732
- part_in_flight(q, part, inflight);
1733
-
1734
- if (stats & 2)
1735
- part_round_stats_single(q, cpu, part2, now, inflight[1]);
1736
- if (stats & 1)
1737
- part_round_stats_single(q, cpu, part, now, inflight[0]);
1738
-}
1739
-EXPORT_SYMBOL_GPL(part_round_stats);
1740
-
1741
-#ifdef CONFIG_PM
1742
-static void blk_pm_put_request(struct request *rq)
1743
-{
1744
- if (rq->q->dev && !(rq->rq_flags & RQF_PM) && !--rq->q->nr_pending)
1745
- pm_runtime_mark_last_busy(rq->q->dev);
1746
-}
1747
-#else
1748
-static inline void blk_pm_put_request(struct request *rq) {}
1749
-#endif
1750
-
1751
-void __blk_put_request(struct request_queue *q, struct request *req)
1752
-{
1753
- req_flags_t rq_flags = req->rq_flags;
1754
-
1755
- if (unlikely(!q))
1756
- return;
1757
-
1758
- if (q->mq_ops) {
1759
- blk_mq_free_request(req);
1760
- return;
1761
- }
1762
-
1763
- lockdep_assert_held(q->queue_lock);
1764
-
1765
- blk_req_zone_write_unlock(req);
1766
- blk_pm_put_request(req);
1767
-
1768
- elv_completed_request(q, req);
1769
-
1770
- /* this is a bio leak */
1771
- WARN_ON(req->bio != NULL);
1772
-
1773
- rq_qos_done(q, req);
1774
-
1775
- /*
1776
- * Request may not have originated from ll_rw_blk. if not,
1777
- * it didn't come out of our reserved rq pools
1778
- */
1779
- if (rq_flags & RQF_ALLOCED) {
1780
- struct request_list *rl = blk_rq_rl(req);
1781
- bool sync = op_is_sync(req->cmd_flags);
1782
-
1783
- BUG_ON(!list_empty(&req->queuelist));
1784
- BUG_ON(ELV_ON_HASH(req));
1785
-
1786
- blk_free_request(rl, req);
1787
- freed_request(rl, sync, rq_flags);
1788
- blk_put_rl(rl);
1789
- blk_queue_exit(q);
1790
- }
1791
-}
1792
-EXPORT_SYMBOL_GPL(__blk_put_request);
1793
-
1794649 void blk_put_request(struct request *req)
1795650 {
1796
- struct request_queue *q = req->q;
1797
-
1798
- if (q->mq_ops)
1799
- blk_mq_free_request(req);
1800
- else {
1801
- unsigned long flags;
1802
-
1803
- spin_lock_irqsave(q->queue_lock, flags);
1804
- __blk_put_request(q, req);
1805
- spin_unlock_irqrestore(q->queue_lock, flags);
1806
- }
651
+ blk_mq_free_request(req);
1807652 }
1808653 EXPORT_SYMBOL(blk_put_request);
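With the legacy path gone, blk_put_request() is just the release half of the usual passthrough-request pattern. A minimal sketch of that pairing, assuming a driver that already holds a request_queue; the function name is hypothetical and not part of this patch:

#include <linux/blkdev.h>

/* Illustrative only: allocate a passthrough request, issue it, release it. */
static int example_send_passthrough(struct request_queue *q)
{
	struct request *rq;

	rq = blk_get_request(q, REQ_OP_DRV_IN, 0);	/* may sleep */
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	/* ... fill in driver-private command data here ... */

	blk_execute_rq(q, NULL, rq, 0);		/* issue and wait for completion */
	blk_put_request(rq);			/* now simply blk_mq_free_request() */
	return 0;
}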
1809
-
1810
-bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
1811
- struct bio *bio)
1812
-{
1813
- const int ff = bio->bi_opf & REQ_FAILFAST_MASK;
1814
-
1815
- if (!ll_back_merge_fn(q, req, bio))
1816
- return false;
1817
-
1818
- trace_block_bio_backmerge(q, req, bio);
1819
-
1820
- if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
1821
- blk_rq_set_mixed_merge(req);
1822
-
1823
- req->biotail->bi_next = bio;
1824
- req->biotail = bio;
1825
- req->__data_len += bio->bi_iter.bi_size;
1826
- req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
1827
-
1828
- blk_account_io_start(req, false);
1829
- return true;
1830
-}
1831
-
1832
-bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
1833
- struct bio *bio)
1834
-{
1835
- const int ff = bio->bi_opf & REQ_FAILFAST_MASK;
1836
-
1837
- if (!ll_front_merge_fn(q, req, bio))
1838
- return false;
1839
-
1840
- trace_block_bio_frontmerge(q, req, bio);
1841
-
1842
- if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
1843
- blk_rq_set_mixed_merge(req);
1844
-
1845
- bio->bi_next = req->bio;
1846
- req->bio = bio;
1847
-
1848
- req->__sector = bio->bi_iter.bi_sector;
1849
- req->__data_len += bio->bi_iter.bi_size;
1850
- req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
1851
-
1852
- blk_account_io_start(req, false);
1853
- return true;
1854
-}
1855
-
1856
-bool bio_attempt_discard_merge(struct request_queue *q, struct request *req,
1857
- struct bio *bio)
1858
-{
1859
- unsigned short segments = blk_rq_nr_discard_segments(req);
1860
-
1861
- if (segments >= queue_max_discard_segments(q))
1862
- goto no_merge;
1863
- if (blk_rq_sectors(req) + bio_sectors(bio) >
1864
- blk_rq_get_max_sectors(req, blk_rq_pos(req)))
1865
- goto no_merge;
1866
-
1867
- req->biotail->bi_next = bio;
1868
- req->biotail = bio;
1869
- req->__data_len += bio->bi_iter.bi_size;
1870
- req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
1871
- req->nr_phys_segments = segments + 1;
1872
-
1873
- blk_account_io_start(req, false);
1874
- return true;
1875
-no_merge:
1876
- req_set_nomerge(q, req);
1877
- return false;
1878
-}
1879
-
1880
-/**
1881
- * blk_attempt_plug_merge - try to merge with %current's plugged list
1882
- * @q: request_queue new bio is being queued at
1883
- * @bio: new bio being queued
1884
- * @request_count: out parameter for number of traversed plugged requests
1885
- * @same_queue_rq: pointer to &struct request that gets filled in when
1886
- * another request associated with @q is found on the plug list
1887
- * (optional, may be %NULL)
1888
- *
1889
- * Determine whether @bio being queued on @q can be merged with a request
1890
- * on %current's plugged list. Returns %true if merge was successful,
1891
- * otherwise %false.
1892
- *
1893
- * Plugging coalesces IOs from the same issuer for the same purpose without
1894
- * going through @q->queue_lock. As such it's more of an issuing mechanism
1895
- * than scheduling, and the request, while may have elvpriv data, is not
1896
- * added on the elevator at this point. In addition, we don't have
1897
- * reliable access to the elevator outside queue lock. Only check basic
1898
- * merging parameters without querying the elevator.
1899
- *
1900
- * Caller must ensure !blk_queue_nomerges(q) beforehand.
1901
- */
1902
-bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
1903
- unsigned int *request_count,
1904
- struct request **same_queue_rq)
1905
-{
1906
- struct blk_plug *plug;
1907
- struct request *rq;
1908
- struct list_head *plug_list;
1909
-
1910
- plug = current->plug;
1911
- if (!plug)
1912
- return false;
1913
- *request_count = 0;
1914
-
1915
- if (q->mq_ops)
1916
- plug_list = &plug->mq_list;
1917
- else
1918
- plug_list = &plug->list;
1919
-
1920
- list_for_each_entry_reverse(rq, plug_list, queuelist) {
1921
- bool merged = false;
1922
-
1923
- if (rq->q == q) {
1924
- (*request_count)++;
1925
- /*
1926
- * Only blk-mq multiple hardware queues case checks the
1927
- * rq in the same queue, there should be only one such
1928
- * rq in a queue
1929
- **/
1930
- if (same_queue_rq)
1931
- *same_queue_rq = rq;
1932
- }
1933
-
1934
- if (rq->q != q || !blk_rq_merge_ok(rq, bio))
1935
- continue;
1936
-
1937
- switch (blk_try_merge(rq, bio)) {
1938
- case ELEVATOR_BACK_MERGE:
1939
- merged = bio_attempt_back_merge(q, rq, bio);
1940
- break;
1941
- case ELEVATOR_FRONT_MERGE:
1942
- merged = bio_attempt_front_merge(q, rq, bio);
1943
- break;
1944
- case ELEVATOR_DISCARD_MERGE:
1945
- merged = bio_attempt_discard_merge(q, rq, bio);
1946
- break;
1947
- default:
1948
- break;
1949
- }
1950
-
1951
- if (merged)
1952
- return true;
1953
- }
1954
-
1955
- return false;
1956
-}
1957
-
1958
-unsigned int blk_plug_queued_count(struct request_queue *q)
1959
-{
1960
- struct blk_plug *plug;
1961
- struct request *rq;
1962
- struct list_head *plug_list;
1963
- unsigned int ret = 0;
1964
-
1965
- plug = current->plug;
1966
- if (!plug)
1967
- goto out;
1968
-
1969
- if (q->mq_ops)
1970
- plug_list = &plug->mq_list;
1971
- else
1972
- plug_list = &plug->list;
1973
-
1974
- list_for_each_entry(rq, plug_list, queuelist) {
1975
- if (rq->q == q)
1976
- ret++;
1977
- }
1978
-out:
1979
- return ret;
1980
-}
1981
-
1982
-void blk_init_request_from_bio(struct request *req, struct bio *bio)
1983
-{
1984
- struct io_context *ioc = rq_ioc(bio);
1985
-
1986
- if (bio->bi_opf & REQ_RAHEAD)
1987
- req->cmd_flags |= REQ_FAILFAST_MASK;
1988
-
1989
- req->__sector = bio->bi_iter.bi_sector;
1990
- if (ioprio_valid(bio_prio(bio)))
1991
- req->ioprio = bio_prio(bio);
1992
- else if (ioc)
1993
- req->ioprio = ioc->ioprio;
1994
- else
1995
- req->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
1996
- req->write_hint = bio->bi_write_hint;
1997
- blk_rq_bio_prep(req->q, req, bio);
1998
-}
1999
-EXPORT_SYMBOL_GPL(blk_init_request_from_bio);
2000
-
2001
-static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
2002
-{
2003
- struct blk_plug *plug;
2004
- int where = ELEVATOR_INSERT_SORT;
2005
- struct request *req, *free;
2006
- unsigned int request_count = 0;
2007
-
2008
- /*
2009
- * low level driver can indicate that it wants pages above a
2010
- * certain limit bounced to low memory (ie for highmem, or even
2011
- * ISA dma in theory)
2012
- */
2013
- blk_queue_bounce(q, &bio);
2014
-
2015
- blk_queue_split(q, &bio);
2016
-
2017
- if (!bio_integrity_prep(bio))
2018
- return BLK_QC_T_NONE;
2019
-
2020
- if (op_is_flush(bio->bi_opf)) {
2021
- spin_lock_irq(q->queue_lock);
2022
- where = ELEVATOR_INSERT_FLUSH;
2023
- goto get_rq;
2024
- }
2025
-
2026
- /*
2027
- * Check if we can merge with the plugged list before grabbing
2028
- * any locks.
2029
- */
2030
- if (!blk_queue_nomerges(q)) {
2031
- if (blk_attempt_plug_merge(q, bio, &request_count, NULL))
2032
- return BLK_QC_T_NONE;
2033
- } else
2034
- request_count = blk_plug_queued_count(q);
2035
-
2036
- spin_lock_irq(q->queue_lock);
2037
-
2038
- switch (elv_merge(q, &req, bio)) {
2039
- case ELEVATOR_BACK_MERGE:
2040
- if (!bio_attempt_back_merge(q, req, bio))
2041
- break;
2042
- elv_bio_merged(q, req, bio);
2043
- free = attempt_back_merge(q, req);
2044
- if (free)
2045
- __blk_put_request(q, free);
2046
- else
2047
- elv_merged_request(q, req, ELEVATOR_BACK_MERGE);
2048
- goto out_unlock;
2049
- case ELEVATOR_FRONT_MERGE:
2050
- if (!bio_attempt_front_merge(q, req, bio))
2051
- break;
2052
- elv_bio_merged(q, req, bio);
2053
- free = attempt_front_merge(q, req);
2054
- if (free)
2055
- __blk_put_request(q, free);
2056
- else
2057
- elv_merged_request(q, req, ELEVATOR_FRONT_MERGE);
2058
- goto out_unlock;
2059
- default:
2060
- break;
2061
- }
2062
-
2063
-get_rq:
2064
- rq_qos_throttle(q, bio, q->queue_lock);
2065
-
2066
- /*
2067
- * Grab a free request. This is might sleep but can not fail.
2068
- * Returns with the queue unlocked.
2069
- */
2070
- blk_queue_enter_live(q);
2071
- req = get_request(q, bio->bi_opf, bio, 0, GFP_NOIO);
2072
- if (IS_ERR(req)) {
2073
- blk_queue_exit(q);
2074
- rq_qos_cleanup(q, bio);
2075
- if (PTR_ERR(req) == -ENOMEM)
2076
- bio->bi_status = BLK_STS_RESOURCE;
2077
- else
2078
- bio->bi_status = BLK_STS_IOERR;
2079
- bio_endio(bio);
2080
- goto out_unlock;
2081
- }
2082
-
2083
- rq_qos_track(q, req, bio);
2084
-
2085
- /*
2086
- * After dropping the lock and possibly sleeping here, our request
2087
- * may now be mergeable after it had proven unmergeable (above).
2088
- * We don't worry about that case for efficiency. It won't happen
2089
- * often, and the elevators are able to handle it.
2090
- */
2091
- blk_init_request_from_bio(req, bio);
2092
-
2093
- if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags))
2094
- req->cpu = raw_smp_processor_id();
2095
-
2096
- plug = current->plug;
2097
- if (plug) {
2098
- /*
2099
- * If this is the first request added after a plug, fire
2100
- * of a plug trace.
2101
- *
2102
- * @request_count may become stale because of schedule
2103
- * out, so check plug list again.
2104
- */
2105
- if (!request_count || list_empty(&plug->list))
2106
- trace_block_plug(q);
2107
- else {
2108
- struct request *last = list_entry_rq(plug->list.prev);
2109
- if (request_count >= BLK_MAX_REQUEST_COUNT ||
2110
- blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE) {
2111
- blk_flush_plug_list(plug, false);
2112
- trace_block_plug(q);
2113
- }
2114
- }
2115
- list_add_tail(&req->queuelist, &plug->list);
2116
- blk_account_io_start(req, true);
2117
- } else {
2118
- spin_lock_irq(q->queue_lock);
2119
- add_acct_request(q, req, where);
2120
- __blk_run_queue(q);
2121
-out_unlock:
2122
- spin_unlock_irq(q->queue_lock);
2123
- }
2124
-
2125
- return BLK_QC_T_NONE;
2126
-}
2127654
2128655 static void handle_bad_sector(struct bio *bio, sector_t maxsector)
2129656 {
2130657 char b[BDEVNAME_SIZE];
2131658
2132
- printk(KERN_INFO "attempt to access beyond end of device\n");
2133
- printk(KERN_INFO "%s: rw=%d, want=%Lu, limit=%Lu\n",
2134
- bio_devname(bio, b), bio->bi_opf,
2135
- (unsigned long long)bio_end_sector(bio),
2136
- (long long)maxsector);
659
+ pr_info_ratelimited("attempt to access beyond end of device\n"
660
+ "%s: rw=%d, want=%llu, limit=%llu\n",
661
+ bio_devname(bio, b), bio->bi_opf,
662
+ bio_end_sector(bio), maxsector);
2137663 }
2138664
2139665 #ifdef CONFIG_FAIL_MAKE_REQUEST
....@@ -2182,8 +708,7 @@
2182708 return false;
2183709
2184710 WARN_ONCE(1,
2185
- "generic_make_request: Trying to write "
2186
- "to read-only block-device %s (partno %d)\n",
711
+ "Trying to write to read-only block-device %s (partno %d)\n",
2187712 bio_devname(bio, b), part->partno);
2188713 /* Older lvm-tools actually trigger this */
2189714 return false;
....@@ -2235,11 +760,7 @@
2235760 if (unlikely(bio_check_ro(bio, p)))
2236761 goto out;
2237762
2238
- /*
2239
- * Zone reset does not include bi_size so bio_sectors() is always 0.
2240
- * Include a test for the reset op code and perform the remap if needed.
2241
- */
2242
- if (bio_sectors(bio) || bio_op(bio) == REQ_OP_ZONE_RESET) {
763
+ if (bio_sectors(bio)) {
2243764 if (bio_check_eod(bio, part_nr_sects_read(p)))
2244765 goto out;
2245766 bio->bi_iter.bi_sector += p->start_sect;
....@@ -2253,30 +774,58 @@
2253774 return ret;
2254775 }
2255776
2256
-static noinline_for_stack bool
2257
-generic_make_request_checks(struct bio *bio)
777
+/*
778
+ * Check write append to a zoned block device.
779
+ */
780
+static inline blk_status_t blk_check_zone_append(struct request_queue *q,
781
+ struct bio *bio)
2258782 {
2259
- struct request_queue *q;
783
+ sector_t pos = bio->bi_iter.bi_sector;
2260784 int nr_sectors = bio_sectors(bio);
785
+
786
+ /* Only applicable to zoned block devices */
787
+ if (!blk_queue_is_zoned(q))
788
+ return BLK_STS_NOTSUPP;
789
+
790
+ /* The bio sector must point to the start of a sequential zone */
791
+ if (pos & (blk_queue_zone_sectors(q) - 1) ||
792
+ !blk_queue_zone_is_seq(q, pos))
793
+ return BLK_STS_IOERR;
794
+
795
+ /*
796
+ * Not allowed to cross zone boundaries. Otherwise, the BIO will be
797
+ * split and could result in non-contiguous sectors being written in
798
+ * different zones.
799
+ */
800
+ if (nr_sectors > q->limits.chunk_sectors)
801
+ return BLK_STS_IOERR;
802
+
803
+ /* Make sure the BIO is small enough and will not get split */
804
+ if (nr_sectors > q->limits.max_zone_append_sectors)
805
+ return BLK_STS_IOERR;
806
+
807
+ bio->bi_opf |= REQ_NOMERGE;
808
+
809
+ return BLK_STS_OK;
810
+}
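The checks above spell out the zone-append contract: the bio must start at the beginning of a sequential zone, must not cross a zone boundary, and must stay under max_zone_append_sectors so it can never be split. A hedged sketch of how an issuer might size such a bio against the queue limits; the helper is hypothetical, not part of this patch:

#include <linux/kernel.h>
#include <linux/blkdev.h>

/*
 * Illustrative only: largest sector count a single REQ_OP_ZONE_APPEND bio
 * may carry on @q without tripping blk_check_zone_append().
 */
static unsigned int example_zone_append_max_sectors(struct request_queue *q)
{
	return min(q->limits.chunk_sectors,
		   q->limits.max_zone_append_sectors);
}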
811
+
812
+static noinline_for_stack bool submit_bio_checks(struct bio *bio)
813
+{
814
+ struct request_queue *q = bio->bi_disk->queue;
2261815 blk_status_t status = BLK_STS_IOERR;
2262
- char b[BDEVNAME_SIZE];
816
+ struct blk_plug *plug;
2263817
2264818 might_sleep();
2265819
2266
- q = bio->bi_disk->queue;
2267
- if (unlikely(!q)) {
2268
- printk(KERN_ERR
2269
- "generic_make_request: Trying to access "
2270
- "nonexistent block-device %s (%Lu)\n",
2271
- bio_devname(bio, b), (long long)bio->bi_iter.bi_sector);
2272
- goto end_io;
2273
- }
820
+ plug = blk_mq_plug(q, bio);
821
+ if (plug && plug->nowait)
822
+ bio->bi_opf |= REQ_NOWAIT;
2274823
2275824 /*
2276825 * For a REQ_NOWAIT based request, return -EOPNOTSUPP
2277
- * if queue is not a request based queue.
826
+ * if queue does not support NOWAIT.
2278827 */
2279
- if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_rq_based(q))
828
+ if ((bio->bi_opf & REQ_NOWAIT) && !blk_queue_nowait(q))
2280829 goto not_supported;
2281830
2282831 if (should_fail_bio(bio))
....@@ -2293,18 +842,20 @@
2293842 }
2294843
2295844 /*
2296
- * Filter flush bio's early so that make_request based
2297
- * drivers without flush support don't have to worry
2298
- * about them.
845
+	 * Filter flush bios early so that bio-based drivers without flush
846
+ * support don't have to worry about them.
2299847 */
2300848 if (op_is_flush(bio->bi_opf) &&
2301849 !test_bit(QUEUE_FLAG_WC, &q->queue_flags)) {
2302850 bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA);
2303
- if (!nr_sectors) {
851
+ if (!bio_sectors(bio)) {
2304852 status = BLK_STS_OK;
2305853 goto end_io;
2306854 }
2307855 }
856
+
857
+ if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
858
+ bio->bi_opf &= ~REQ_HIPRI;
2308859
2309860 switch (bio_op(bio)) {
2310861 case REQ_OP_DISCARD:
....@@ -2319,9 +870,20 @@
2319870 if (!q->limits.max_write_same_sectors)
2320871 goto not_supported;
2321872 break;
2322
- case REQ_OP_ZONE_REPORT:
873
+ case REQ_OP_ZONE_APPEND:
874
+ status = blk_check_zone_append(q, bio);
875
+ if (status != BLK_STS_OK)
876
+ goto end_io;
877
+ break;
2323878 case REQ_OP_ZONE_RESET:
879
+ case REQ_OP_ZONE_OPEN:
880
+ case REQ_OP_ZONE_CLOSE:
881
+ case REQ_OP_ZONE_FINISH:
2324882 if (!blk_queue_is_zoned(q))
883
+ goto not_supported;
884
+ break;
885
+ case REQ_OP_ZONE_RESET_ALL:
886
+ if (!blk_queue_is_zoned(q) || !blk_queue_zone_resetall(q))
2325887 goto not_supported;
2326888 break;
2327889 case REQ_OP_WRITE_ZEROES:
....@@ -2333,15 +895,19 @@
2333895 }
2334896
2335897 /*
2336
- * Various block parts want %current->io_context and lazy ioc
2337
- * allocation ends up trading a lot of pain for a small amount of
2338
- * memory. Just allocate it upfront. This may fail and block
2339
- * layer knows how to live with it.
898
+ * Various block parts want %current->io_context, so allocate it up
899
+ * front rather than dealing with lots of pain to allocate it only
900
+ * where needed. This may fail and the block layer knows how to live
901
+ * with it.
2340902 */
2341
- create_io_context(GFP_ATOMIC, q->node);
903
+ if (unlikely(!current->io_context))
904
+ create_task_io_context(current, GFP_ATOMIC, q->node);
2342905
2343
- if (!blkcg_bio_issue_check(q, bio))
906
+ if (blk_throtl_bio(bio))
2344907 return false;
908
+
909
+ blk_cgroup_bio_start(bio);
910
+ blkcg_bio_issue_init(bio);
2345911
2346912 if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) {
2347913 trace_block_bio_queue(q, bio);
....@@ -2360,197 +926,162 @@
2360926 return false;
2361927 }
2362928
2363
-/**
2364
- * generic_make_request - hand a buffer to its device driver for I/O
2365
- * @bio: The bio describing the location in memory and on the device.
2366
- *
2367
- * generic_make_request() is used to make I/O requests of block
2368
- * devices. It is passed a &struct bio, which describes the I/O that needs
2369
- * to be done.
2370
- *
2371
- * generic_make_request() does not return any status. The
2372
- * success/failure status of the request, along with notification of
2373
- * completion, is delivered asynchronously through the bio->bi_end_io
2374
- * function described (one day) else where.
2375
- *
2376
- * The caller of generic_make_request must make sure that bi_io_vec
2377
- * are set to describe the memory buffer, and that bi_dev and bi_sector are
2378
- * set to describe the device address, and the
2379
- * bi_end_io and optionally bi_private are set to describe how
2380
- * completion notification should be signaled.
2381
- *
2382
- * generic_make_request and the drivers it calls may use bi_next if this
2383
- * bio happens to be merged with someone else, and may resubmit the bio to
2384
- * a lower device by calling into generic_make_request recursively, which
2385
- * means the bio should NOT be touched after the call to ->make_request_fn.
2386
- */
2387
-blk_qc_t generic_make_request(struct bio *bio)
929
+static blk_qc_t __submit_bio(struct bio *bio)
2388930 {
2389
- /*
2390
- * bio_list_on_stack[0] contains bios submitted by the current
2391
- * make_request_fn.
2392
- * bio_list_on_stack[1] contains bios that were submitted before
2393
- * the current make_request_fn, but that haven't been processed
2394
- * yet.
2395
- */
2396
- struct bio_list bio_list_on_stack[2];
2397
- blk_mq_req_flags_t flags = 0;
2398
- struct request_queue *q = bio->bi_disk->queue;
931
+ struct gendisk *disk = bio->bi_disk;
2399932 blk_qc_t ret = BLK_QC_T_NONE;
2400933
2401
- if (bio->bi_opf & REQ_NOWAIT)
2402
- flags = BLK_MQ_REQ_NOWAIT;
2403
- if (bio_flagged(bio, BIO_QUEUE_ENTERED))
2404
- blk_queue_enter_live(q);
2405
- else if (blk_queue_enter(q, flags) < 0) {
2406
- if (!blk_queue_dying(q) && (bio->bi_opf & REQ_NOWAIT))
2407
- bio_wouldblock_error(bio);
2408
- else
2409
- bio_io_error(bio);
2410
- return ret;
934
+ if (blk_crypto_bio_prep(&bio)) {
935
+ if (!disk->fops->submit_bio)
936
+ return blk_mq_submit_bio(bio);
937
+ ret = disk->fops->submit_bio(bio);
2411938 }
939
+ blk_queue_exit(disk->queue);
940
+ return ret;
941
+}
2412942
2413
- if (!generic_make_request_checks(bio))
2414
- goto out;
943
+/*
944
+ * The loop in this function may be a bit non-obvious, and so deserves some
945
+ * explanation:
946
+ *
947
+ * - Before entering the loop, bio->bi_next is NULL (as all callers ensure
948
+ * that), so we have a list with a single bio.
949
+ * - We pretend that we have just taken it off a longer list, so we assign
950
+ * bio_list to a pointer to the bio_list_on_stack, thus initialising the
951
+ * bio_list of new bios to be added. ->submit_bio() may indeed add some more
952
+ * bios through a recursive call to submit_bio_noacct. If it did, we find a
953
+ * non-NULL value in bio_list and re-enter the loop from the top.
954
+ * - In this case we really did just take the bio off the top of the list (no
955
+ * pretending) and so remove it from bio_list, and call into ->submit_bio()
956
+ * again.
957
+ *
958
+ * bio_list_on_stack[0] contains bios submitted by the current ->submit_bio.
959
+ * bio_list_on_stack[1] contains bios that were submitted before the current
960
+ * ->submit_bio, but that haven't been processed yet.
961
+ */
962
+static blk_qc_t __submit_bio_noacct(struct bio *bio)
963
+{
964
+ struct bio_list bio_list_on_stack[2];
965
+ blk_qc_t ret = BLK_QC_T_NONE;
966
+
967
+ BUG_ON(bio->bi_next);
968
+
969
+ bio_list_init(&bio_list_on_stack[0]);
970
+ current->bio_list = bio_list_on_stack;
971
+
972
+ do {
973
+ struct request_queue *q = bio->bi_disk->queue;
974
+ struct bio_list lower, same;
975
+
976
+ if (unlikely(bio_queue_enter(bio) != 0))
977
+ continue;
978
+
979
+ /*
980
+ * Create a fresh bio_list for all subordinate requests.
981
+ */
982
+ bio_list_on_stack[1] = bio_list_on_stack[0];
983
+ bio_list_init(&bio_list_on_stack[0]);
984
+
985
+ ret = __submit_bio(bio);
986
+
987
+ /*
988
+ * Sort new bios into those for a lower level and those for the
989
+ * same level.
990
+ */
991
+ bio_list_init(&lower);
992
+ bio_list_init(&same);
993
+ while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL)
994
+ if (q == bio->bi_disk->queue)
995
+ bio_list_add(&same, bio);
996
+ else
997
+ bio_list_add(&lower, bio);
998
+
999
+ /*
1000
+ * Now assemble so we handle the lowest level first.
1001
+ */
1002
+ bio_list_merge(&bio_list_on_stack[0], &lower);
1003
+ bio_list_merge(&bio_list_on_stack[0], &same);
1004
+ bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]);
1005
+ } while ((bio = bio_list_pop(&bio_list_on_stack[0])));
1006
+
1007
+ current->bio_list = NULL;
1008
+ return ret;
1009
+}
1010
+
1011
+static blk_qc_t __submit_bio_noacct_mq(struct bio *bio)
1012
+{
1013
+ struct bio_list bio_list[2] = { };
1014
+ blk_qc_t ret = BLK_QC_T_NONE;
1015
+
1016
+ current->bio_list = bio_list;
1017
+
1018
+ do {
1019
+ struct gendisk *disk = bio->bi_disk;
1020
+
1021
+ if (unlikely(bio_queue_enter(bio) != 0))
1022
+ continue;
1023
+
1024
+ if (!blk_crypto_bio_prep(&bio)) {
1025
+ blk_queue_exit(disk->queue);
1026
+ ret = BLK_QC_T_NONE;
1027
+ continue;
1028
+ }
1029
+
1030
+ ret = blk_mq_submit_bio(bio);
1031
+ } while ((bio = bio_list_pop(&bio_list[0])));
1032
+
1033
+ current->bio_list = NULL;
1034
+ return ret;
1035
+}
1036
+
1037
+/**
1038
+ * submit_bio_noacct - re-submit a bio to the block device layer for I/O
1039
+ * @bio: The bio describing the location in memory and on the device.
1040
+ *
1041
+ * This is a version of submit_bio() that shall only be used for I/O that is
1042
+ * resubmitted to lower level drivers by stacking block drivers. All file
1043
+ * systems and other upper level users of the block layer should use
1044
+ * submit_bio() instead.
1045
+ */
1046
+blk_qc_t submit_bio_noacct(struct bio *bio)
1047
+{
1048
+ if (!submit_bio_checks(bio))
1049
+ return BLK_QC_T_NONE;
24151050
24161051 /*
2417
- * We only want one ->make_request_fn to be active at a time, else
2418
- * stack usage with stacked devices could be a problem. So use
2419
- * current->bio_list to keep a list of requests submited by a
2420
- * make_request_fn function. current->bio_list is also used as a
2421
- * flag to say if generic_make_request is currently active in this
2422
- * task or not. If it is NULL, then no make_request is active. If
2423
- * it is non-NULL, then a make_request is active, and new requests
2424
- * should be added at the tail
1052
+ * We only want one ->submit_bio to be active at a time, else stack
1053
+ * usage with stacked devices could be a problem. Use current->bio_list
1054
+	 * to collect a list of requests submitted by a ->submit_bio method while
1055
+	 * it is active, and then process them after it returns.
24251056 */
24261057 if (current->bio_list) {
24271058 bio_list_add(&current->bio_list[0], bio);
2428
- goto out;
2429
- }
2430
-
2431
- /* following loop may be a bit non-obvious, and so deserves some
2432
- * explanation.
2433
- * Before entering the loop, bio->bi_next is NULL (as all callers
2434
- * ensure that) so we have a list with a single bio.
2435
- * We pretend that we have just taken it off a longer list, so
2436
- * we assign bio_list to a pointer to the bio_list_on_stack,
2437
- * thus initialising the bio_list of new bios to be
2438
- * added. ->make_request() may indeed add some more bios
2439
- * through a recursive call to generic_make_request. If it
2440
- * did, we find a non-NULL value in bio_list and re-enter the loop
2441
- * from the top. In this case we really did just take the bio
2442
- * of the top of the list (no pretending) and so remove it from
2443
- * bio_list, and call into ->make_request() again.
2444
- */
2445
- BUG_ON(bio->bi_next);
2446
- bio_list_init(&bio_list_on_stack[0]);
2447
- current->bio_list = bio_list_on_stack;
2448
- do {
2449
- bool enter_succeeded = true;
2450
-
2451
- if (unlikely(q != bio->bi_disk->queue)) {
2452
- if (q)
2453
- blk_queue_exit(q);
2454
- q = bio->bi_disk->queue;
2455
- flags = 0;
2456
- if (bio->bi_opf & REQ_NOWAIT)
2457
- flags = BLK_MQ_REQ_NOWAIT;
2458
- if (blk_queue_enter(q, flags) < 0)
2459
- enter_succeeded = false;
2460
- }
2461
-
2462
- if (enter_succeeded) {
2463
- struct bio_list lower, same;
2464
-
2465
- /* Create a fresh bio_list for all subordinate requests */
2466
- bio_list_on_stack[1] = bio_list_on_stack[0];
2467
- bio_list_init(&bio_list_on_stack[0]);
2468
-
2469
- if (!blk_crypto_submit_bio(&bio))
2470
- ret = q->make_request_fn(q, bio);
2471
-
2472
- /* sort new bios into those for a lower level
2473
- * and those for the same level
2474
- */
2475
- bio_list_init(&lower);
2476
- bio_list_init(&same);
2477
- while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL)
2478
- if (q == bio->bi_disk->queue)
2479
- bio_list_add(&same, bio);
2480
- else
2481
- bio_list_add(&lower, bio);
2482
- /* now assemble so we handle the lowest level first */
2483
- bio_list_merge(&bio_list_on_stack[0], &lower);
2484
- bio_list_merge(&bio_list_on_stack[0], &same);
2485
- bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]);
2486
- } else {
2487
- if (unlikely(!blk_queue_dying(q) &&
2488
- (bio->bi_opf & REQ_NOWAIT)))
2489
- bio_wouldblock_error(bio);
2490
- else
2491
- bio_io_error(bio);
2492
- q = NULL;
2493
- }
2494
- bio = bio_list_pop(&bio_list_on_stack[0]);
2495
- } while (bio);
2496
- current->bio_list = NULL; /* deactivate */
2497
-
2498
-out:
2499
- if (q)
2500
- blk_queue_exit(q);
2501
- return ret;
2502
-}
2503
-EXPORT_SYMBOL(generic_make_request);
2504
-
2505
-/**
2506
- * direct_make_request - hand a buffer directly to its device driver for I/O
2507
- * @bio: The bio describing the location in memory and on the device.
2508
- *
2509
- * This function behaves like generic_make_request(), but does not protect
2510
- * against recursion. Must only be used if the called driver is known
2511
- * to not call generic_make_request (or direct_make_request) again from
2512
- * its make_request function. (Calling direct_make_request again from
2513
- * a workqueue is perfectly fine as that doesn't recurse).
2514
- */
2515
-blk_qc_t direct_make_request(struct bio *bio)
2516
-{
2517
- struct request_queue *q = bio->bi_disk->queue;
2518
- bool nowait = bio->bi_opf & REQ_NOWAIT;
2519
- blk_qc_t ret = BLK_QC_T_NONE;
2520
-
2521
- if (!generic_make_request_checks(bio))
2522
- return BLK_QC_T_NONE;
2523
-
2524
- if (unlikely(blk_queue_enter(q, nowait ? BLK_MQ_REQ_NOWAIT : 0))) {
2525
- if (nowait && !blk_queue_dying(q))
2526
- bio->bi_status = BLK_STS_AGAIN;
2527
- else
2528
- bio->bi_status = BLK_STS_IOERR;
2529
- bio_endio(bio);
25301059 return BLK_QC_T_NONE;
25311060 }
25321061
2533
- if (!blk_crypto_submit_bio(&bio))
2534
- ret = q->make_request_fn(q, bio);
2535
- blk_queue_exit(q);
2536
- return ret;
1062
+ if (!bio->bi_disk->fops->submit_bio)
1063
+ return __submit_bio_noacct_mq(bio);
1064
+ return __submit_bio_noacct(bio);
25371065 }
2538
-EXPORT_SYMBOL_GPL(direct_make_request);
1066
+EXPORT_SYMBOL(submit_bio_noacct);
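The kernel-doc above reserves submit_bio_noacct() for bios that stacking drivers resubmit to lower devices. A hedged sketch of that pattern with hypothetical names: a bio-based remapping driver retargets the bio and hands it back to the core, which defers it on current->bio_list in __submit_bio_noacct() instead of recursing:

#include <linux/bio.h>
#include <linux/blkdev.h>

/* Hypothetical 1:1 remapping target sitting on top of a lower device. */
struct example_target {
	struct block_device *lower_bdev;
	sector_t start;			/* offset on the lower device */
};

static blk_qc_t example_remap_submit_bio(struct bio *bio)
{
	struct example_target *t = bio->bi_disk->private_data;

	bio_set_dev(bio, t->lower_bdev);	/* retarget bi_disk/bi_partno */
	bio->bi_iter.bi_sector += t->start;

	/* Resubmit; the core queues it on the caller's bio_list, no deep recursion. */
	return submit_bio_noacct(bio);
}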
25391067
25401068 /**
25411069 * submit_bio - submit a bio to the block device layer for I/O
25421070 * @bio: The &struct bio which describes the I/O
25431071 *
2544
- * submit_bio() is very similar in purpose to generic_make_request(), and
2545
- * uses that function to do most of the work. Both are fairly rough
2546
- * interfaces; @bio must be presetup and ready for I/O.
1072
+ * submit_bio() is used to submit I/O requests to block devices. It is passed a
1073
+ * fully set up &struct bio that describes the I/O that needs to be done. The
1074
+ * bio will be sent to the device described by the bi_disk and bi_partno fields.
25471075 *
1076
+ * The success/failure status of the request, along with notification of
1077
+ * completion, is delivered asynchronously through the ->bi_end_io() callback
1078
+ * in @bio. The bio must NOT be touched by the caller until ->bi_end_io() has
1079
+ * been called.
25481080 */
25491081 blk_qc_t submit_bio(struct bio *bio)
25501082 {
2551
- bool workingset_read = false;
2552
- unsigned long pflags;
2553
- blk_qc_t ret;
1083
+ if (blkcg_punt_bio_submit(bio))
1084
+ return BLK_QC_T_NONE;
25541085
25551086 /*
25561087 * If it's a regular read/write or a barrier with data attached,
....@@ -2567,8 +1098,6 @@
25671098 if (op_is_write(bio_op(bio))) {
25681099 count_vm_events(PGPGOUT, count);
25691100 } else {
2570
- if (bio_flagged(bio, BIO_WORKINGSET))
2571
- workingset_read = true;
25721101 task_io_account_read(bio->bi_iter.bi_size);
25731102 count_vm_events(PGPGIN, count);
25741103 }
....@@ -2584,37 +1113,30 @@
25841113 }
25851114
25861115 /*
2587
- * If we're reading data that is part of the userspace
2588
- * workingset, count submission time as memory stall. When the
2589
- * device is congested, or the submitting cgroup IO-throttled,
2590
- * submission can be a significant part of overall IO time.
1116
+ * If we're reading data that is part of the userspace workingset, count
1117
+ * submission time as memory stall. When the device is congested, or
1118
+ * the submitting cgroup IO-throttled, submission can be a significant
1119
+ * part of overall IO time.
25911120 */
2592
- if (workingset_read)
1121
+ if (unlikely(bio_op(bio) == REQ_OP_READ &&
1122
+ bio_flagged(bio, BIO_WORKINGSET))) {
1123
+ unsigned long pflags;
1124
+ blk_qc_t ret;
1125
+
25931126 psi_memstall_enter(&pflags);
2594
-
2595
- ret = generic_make_request(bio);
2596
-
2597
- if (workingset_read)
1127
+ ret = submit_bio_noacct(bio);
25981128 psi_memstall_leave(&pflags);
25991129
2600
- return ret;
1130
+ return ret;
1131
+ }
1132
+
1133
+ return submit_bio_noacct(bio);
26011134 }
26021135 EXPORT_SYMBOL(submit_bio);
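The kernel-doc above requires the bio to be fully set up before submission and left untouched until ->bi_end_io() runs. A minimal synchronous-read sketch of that contract; the names are hypothetical and not part of this patch:

#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/completion.h>

static void example_read_end_io(struct bio *bio)
{
	/* Completion context: only now may the submitter look at the bio again. */
	complete(bio->bi_private);
}

static int example_read_page(struct block_device *bdev, struct page *page,
			     sector_t sector)
{
	DECLARE_COMPLETION_ONSTACK(done);
	struct bio *bio = bio_alloc(GFP_KERNEL, 1);
	int err;

	bio_set_dev(bio, bdev);
	bio->bi_iter.bi_sector = sector;
	bio->bi_opf = REQ_OP_READ;
	bio_add_page(bio, page, PAGE_SIZE, 0);	/* always fits a one-vec bio */
	bio->bi_end_io = example_read_end_io;
	bio->bi_private = &done;

	submit_bio(bio);		/* hands-off until example_read_end_io() */
	wait_for_completion(&done);

	err = blk_status_to_errno(bio->bi_status);
	bio_put(bio);
	return err;
}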
26031136
2604
-bool blk_poll(struct request_queue *q, blk_qc_t cookie)
2605
-{
2606
- if (!q->poll_fn || !blk_qc_t_valid(cookie))
2607
- return false;
2608
-
2609
- if (current->plug)
2610
- blk_flush_plug_list(current->plug, false);
2611
- return q->poll_fn(q, cookie);
2612
-}
2613
-EXPORT_SYMBOL_GPL(blk_poll);
2614
-
26151137 /**
26161138 * blk_cloned_rq_check_limits - Helper function to check a cloned request
2617
- * for new the queue limits
1139
+ * for the new queue limits
26181140 * @q: the queue
26191141 * @rq: the request being checked
26201142 *
....@@ -2629,12 +1151,28 @@
26291151 * limits when retrying requests on other queues. Those requests need
26301152 * to be checked against the new queue limits again during dispatch.
26311153 */
2632
-static int blk_cloned_rq_check_limits(struct request_queue *q,
1154
+static blk_status_t blk_cloned_rq_check_limits(struct request_queue *q,
26331155 struct request *rq)
26341156 {
2635
- if (blk_rq_sectors(rq) > blk_queue_get_max_sectors(q, req_op(rq))) {
2636
- printk(KERN_ERR "%s: over max size limit.\n", __func__);
2637
- return -EIO;
1157
+ unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq));
1158
+
1159
+ if (blk_rq_sectors(rq) > max_sectors) {
1160
+ /*
1161
+ * SCSI device does not have a good way to return if
1162
+ * Write Same/Zero is actually supported. If a device rejects
1163
+		 * a non-read/write command (discard, write same, etc.), the
1164
+ * low-level device driver will set the relevant queue limit to
1165
+ * 0 to prevent blk-lib from issuing more of the offending
1166
+ * operations. Commands queued prior to the queue limit being
1167
+ * reset need to be completed with BLK_STS_NOTSUPP to avoid I/O
1168
+ * errors being propagated to upper layers.
1169
+ */
1170
+ if (max_sectors == 0)
1171
+ return BLK_STS_NOTSUPP;
1172
+
1173
+ printk(KERN_ERR "%s: over max size limit. (%u > %u)\n",
1174
+ __func__, blk_rq_sectors(rq), max_sectors);
1175
+ return BLK_STS_IOERR;
26381176 }
26391177
26401178 /*
....@@ -2643,13 +1181,14 @@
26431181 * Recalculate it to check the request correctly on this queue's
26441182 * limitation.
26451183 */
2646
- blk_recalc_rq_segments(rq);
1184
+ rq->nr_phys_segments = blk_recalc_rq_segments(rq);
26471185 if (rq->nr_phys_segments > queue_max_segments(q)) {
2648
- printk(KERN_ERR "%s: over max segments limit.\n", __func__);
2649
- return -EIO;
1186
+ printk(KERN_ERR "%s: over max segments limit. (%hu > %hu)\n",
1187
+ __func__, rq->nr_phys_segments, queue_max_segments(q));
1188
+ return BLK_STS_IOERR;
26501189 }
26511190
2652
- return 0;
1191
+ return BLK_STS_OK;
26531192 }
26541193
26551194 /**
....@@ -2659,48 +1198,28 @@
26591198 */
26601199 blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq)
26611200 {
2662
- unsigned long flags;
2663
- int where = ELEVATOR_INSERT_BACK;
1201
+ blk_status_t ret;
26641202
2665
- if (blk_cloned_rq_check_limits(q, rq))
2666
- return BLK_STS_IOERR;
1203
+ ret = blk_cloned_rq_check_limits(q, rq);
1204
+ if (ret != BLK_STS_OK)
1205
+ return ret;
26671206
26681207 if (rq->rq_disk &&
26691208 should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq)))
26701209 return BLK_STS_IOERR;
26711210
2672
- if (q->mq_ops) {
2673
- if (blk_queue_io_stat(q))
2674
- blk_account_io_start(rq, true);
2675
- /*
2676
- * Since we have a scheduler attached on the top device,
2677
- * bypass a potential scheduler on the bottom device for
2678
- * insert.
2679
- */
2680
- return blk_mq_request_issue_directly(rq);
2681
- }
2682
-
2683
- spin_lock_irqsave(q->queue_lock, flags);
2684
- if (unlikely(blk_queue_dying(q))) {
2685
- spin_unlock_irqrestore(q->queue_lock, flags);
1211
+ if (blk_crypto_insert_cloned_request(rq))
26861212 return BLK_STS_IOERR;
2687
- }
1213
+
1214
+ if (blk_queue_io_stat(q))
1215
+ blk_account_io_start(rq);
26881216
26891217 /*
2690
- * Submitting request must be dequeued before calling this function
2691
- * because it will be linked to another request_queue
1218
+ * Since we have a scheduler attached on the top device,
1219
+ * bypass a potential scheduler on the bottom device for
1220
+ * insert.
26921221 */
2693
- BUG_ON(blk_queued_rq(rq));
2694
-
2695
- if (op_is_flush(rq->cmd_flags))
2696
- where = ELEVATOR_INSERT_FLUSH;
2697
-
2698
- add_acct_request(q, rq, where);
2699
- if (where == ELEVATOR_INSERT_FLUSH)
2700
- __blk_run_queue(q);
2701
- spin_unlock_irqrestore(q->queue_lock, flags);
2702
-
2703
- return BLK_STS_OK;
1222
+ return blk_mq_request_issue_directly(rq, true);
27041223 }
27051224 EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
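blk_insert_cloned_request() now reports the limits check and the direct-issue result as a blk_status_t, so a request-stacking driver can tell a hard error from a temporary resource shortage. A hedged sketch of a dispatch helper, with hypothetical names and loosely modeled on how request-based stacking drivers consume this API:

#include <linux/blkdev.h>

static int example_dispatch_clone(struct request_queue *lower_q,
				  struct request *clone)
{
	blk_status_t ret = blk_insert_cloned_request(lower_q, clone);

	switch (ret) {
	case BLK_STS_OK:
		return 0;
	case BLK_STS_RESOURCE:
	case BLK_STS_DEV_RESOURCE:
		return -EBUSY;	/* back off, requeue the original and retry */
	default:
		return blk_status_to_errno(ret);	/* hard failure */
	}
}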
27061225
....@@ -2745,16 +1264,30 @@
27451264 }
27461265 EXPORT_SYMBOL_GPL(blk_rq_err_bytes);
27471266
2748
-void blk_account_io_completion(struct request *req, unsigned int bytes)
1267
+static void update_io_ticks(struct hd_struct *part, unsigned long now, bool end)
27491268 {
2750
- if (blk_do_io_stat(req)) {
1269
+ unsigned long stamp;
1270
+again:
1271
+ stamp = READ_ONCE(part->stamp);
1272
+ if (unlikely(stamp != now)) {
1273
+ if (likely(cmpxchg(&part->stamp, stamp, now) == stamp))
1274
+ __part_stat_add(part, io_ticks, end ? now - stamp : 1);
1275
+ }
1276
+ if (part->partno) {
1277
+ part = &part_to_disk(part)->part0;
1278
+ goto again;
1279
+ }
1280
+}
1281
+
1282
+static void blk_account_io_completion(struct request *req, unsigned int bytes)
1283
+{
1284
+ if (req->part && blk_do_io_stat(req)) {
27511285 const int sgrp = op_stat_group(req_op(req));
27521286 struct hd_struct *part;
2753
- int cpu;
27541287
2755
- cpu = part_stat_lock();
1288
+ part_stat_lock();
27561289 part = req->part;
2757
- part_stat_add(cpu, part, sectors[sgrp], bytes >> 9);
1290
+ part_stat_add(part, sectors[sgrp], bytes >> 9);
27581291 part_stat_unlock();
27591292 }
27601293 }
....@@ -2766,299 +1299,95 @@
27661299 * normal IO on queueing nor completion. Accounting the
27671300 * containing request is enough.
27681301 */
2769
- if (blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) {
1302
+ if (req->part && blk_do_io_stat(req) &&
1303
+ !(req->rq_flags & RQF_FLUSH_SEQ)) {
27701304 const int sgrp = op_stat_group(req_op(req));
27711305 struct hd_struct *part;
2772
- int cpu;
27731306
2774
- cpu = part_stat_lock();
1307
+ part_stat_lock();
27751308 part = req->part;
27761309
2777
- part_stat_inc(cpu, part, ios[sgrp]);
2778
- part_stat_add(cpu, part, nsecs[sgrp], now - req->start_time_ns);
2779
- part_round_stats(req->q, cpu, part);
2780
- part_dec_in_flight(req->q, part, rq_data_dir(req));
1310
+ update_io_ticks(part, jiffies, true);
1311
+ part_stat_inc(part, ios[sgrp]);
1312
+ part_stat_add(part, nsecs[sgrp], now - req->start_time_ns);
1313
+ part_stat_unlock();
27811314
27821315 hd_struct_put(part);
2783
- part_stat_unlock();
27841316 }
27851317 }
27861318
2787
-#ifdef CONFIG_PM
2788
-/*
2789
- * Don't process normal requests when queue is suspended
2790
- * or in the process of suspending/resuming
2791
- */
2792
-static bool blk_pm_allow_request(struct request *rq)
1319
+void blk_account_io_start(struct request *rq)
27931320 {
2794
- switch (rq->q->rpm_status) {
2795
- case RPM_RESUMING:
2796
- case RPM_SUSPENDING:
2797
- return rq->rq_flags & RQF_PM;
2798
- case RPM_SUSPENDED:
2799
- return false;
2800
- default:
2801
- return true;
2802
- }
2803
-}
2804
-#else
2805
-static bool blk_pm_allow_request(struct request *rq)
2806
-{
2807
- return true;
2808
-}
2809
-#endif
2810
-
2811
-void blk_account_io_start(struct request *rq, bool new_io)
2812
-{
2813
- struct hd_struct *part;
2814
- int rw = rq_data_dir(rq);
2815
- int cpu;
2816
-
28171321 if (!blk_do_io_stat(rq))
28181322 return;
28191323
2820
- cpu = part_stat_lock();
1324
+ rq->part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
28211325
2822
- if (!new_io) {
2823
- part = rq->part;
2824
- part_stat_inc(cpu, part, merges[rw]);
2825
- } else {
2826
- part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
2827
- if (!hd_struct_try_get(part)) {
2828
- /*
2829
- * The partition is already being removed,
2830
- * the request will be accounted on the disk only
2831
- *
2832
- * We take a reference on disk->part0 although that
2833
- * partition will never be deleted, so we can treat
2834
- * it as any other partition.
2835
- */
2836
- part = &rq->rq_disk->part0;
2837
- hd_struct_get(part);
2838
- }
2839
- part_round_stats(rq->q, cpu, part);
2840
- part_inc_in_flight(rq->q, part, rw);
2841
- rq->part = part;
2842
- }
2843
-
1326
+ part_stat_lock();
1327
+ update_io_ticks(rq->part, jiffies, false);
28441328 part_stat_unlock();
28451329 }
28461330
2847
-static struct request *elv_next_request(struct request_queue *q)
1331
+static unsigned long __part_start_io_acct(struct hd_struct *part,
1332
+ unsigned int sectors, unsigned int op)
28481333 {
2849
- struct request *rq;
2850
- struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL);
1334
+ const int sgrp = op_stat_group(op);
1335
+ unsigned long now = READ_ONCE(jiffies);
28511336
2852
- WARN_ON_ONCE(q->mq_ops);
1337
+ part_stat_lock();
1338
+ update_io_ticks(part, now, false);
1339
+ part_stat_inc(part, ios[sgrp]);
1340
+ part_stat_add(part, sectors[sgrp], sectors);
1341
+ part_stat_local_inc(part, in_flight[op_is_write(op)]);
1342
+ part_stat_unlock();
28531343
2854
- while (1) {
2855
- list_for_each_entry(rq, &q->queue_head, queuelist) {
2856
- if (blk_pm_allow_request(rq))
2857
- return rq;
2858
-
2859
- if (rq->rq_flags & RQF_SOFTBARRIER)
2860
- break;
2861
- }
2862
-
2863
- /*
2864
- * Flush request is running and flush request isn't queueable
2865
- * in the drive, we can hold the queue till flush request is
2866
- * finished. Even we don't do this, driver can't dispatch next
2867
- * requests and will requeue them. And this can improve
2868
- * throughput too. For example, we have request flush1, write1,
2869
- * flush 2. flush1 is dispatched, then queue is hold, write1
2870
- * isn't inserted to queue. After flush1 is finished, flush2
2871
- * will be dispatched. Since disk cache is already clean,
2872
- * flush2 will be finished very soon, so looks like flush2 is
2873
- * folded to flush1.
2874
- * Since the queue is hold, a flag is set to indicate the queue
2875
- * should be restarted later. Please see flush_end_io() for
2876
- * details.
2877
- */
2878
- if (fq->flush_pending_idx != fq->flush_running_idx &&
2879
- !queue_flush_queueable(q)) {
2880
- fq->flush_queue_delayed = 1;
2881
- return NULL;
2882
- }
2883
- if (unlikely(blk_queue_bypass(q)) ||
2884
- !q->elevator->type->ops.sq.elevator_dispatch_fn(q, 0))
2885
- return NULL;
2886
- }
1344
+ return now;
28871345 }
28881346
2889
-/**
2890
- * blk_peek_request - peek at the top of a request queue
2891
- * @q: request queue to peek at
2892
- *
2893
- * Description:
2894
- * Return the request at the top of @q. The returned request
2895
- * should be started using blk_start_request() before LLD starts
2896
- * processing it.
2897
- *
2898
- * Return:
2899
- * Pointer to the request at the top of @q if available. Null
2900
- * otherwise.
2901
- */
2902
-struct request *blk_peek_request(struct request_queue *q)
1347
+unsigned long part_start_io_acct(struct gendisk *disk, struct hd_struct **part,
1348
+ struct bio *bio)
29031349 {
2904
- struct request *rq;
2905
- int ret;
1350
+ *part = disk_map_sector_rcu(disk, bio->bi_iter.bi_sector);
29061351
2907
- lockdep_assert_held(q->queue_lock);
2908
- WARN_ON_ONCE(q->mq_ops);
2909
-
2910
- while ((rq = elv_next_request(q)) != NULL) {
2911
- if (!(rq->rq_flags & RQF_STARTED)) {
2912
- /*
2913
- * This is the first time the device driver
2914
- * sees this request (possibly after
2915
- * requeueing). Notify IO scheduler.
2916
- */
2917
- if (rq->rq_flags & RQF_SORTED)
2918
- elv_activate_rq(q, rq);
2919
-
2920
- /*
2921
- * just mark as started even if we don't start
2922
- * it, a request that has been delayed should
2923
- * not be passed by new incoming requests
2924
- */
2925
- rq->rq_flags |= RQF_STARTED;
2926
- trace_block_rq_issue(q, rq);
2927
- }
2928
-
2929
- if (!q->boundary_rq || q->boundary_rq == rq) {
2930
- q->end_sector = rq_end_sector(rq);
2931
- q->boundary_rq = NULL;
2932
- }
2933
-
2934
- if (rq->rq_flags & RQF_DONTPREP)
2935
- break;
2936
-
2937
- if (q->dma_drain_size && blk_rq_bytes(rq)) {
2938
- /*
2939
- * make sure space for the drain appears we
2940
- * know we can do this because max_hw_segments
2941
- * has been adjusted to be one fewer than the
2942
- * device can handle
2943
- */
2944
- rq->nr_phys_segments++;
2945
- }
2946
-
2947
- if (!q->prep_rq_fn)
2948
- break;
2949
-
2950
- ret = q->prep_rq_fn(q, rq);
2951
- if (ret == BLKPREP_OK) {
2952
- break;
2953
- } else if (ret == BLKPREP_DEFER) {
2954
- /*
2955
- * the request may have been (partially) prepped.
2956
- * we need to keep this request in the front to
2957
- * avoid resource deadlock. RQF_STARTED will
2958
- * prevent other fs requests from passing this one.
2959
- */
2960
- if (q->dma_drain_size && blk_rq_bytes(rq) &&
2961
- !(rq->rq_flags & RQF_DONTPREP)) {
2962
- /*
2963
- * remove the space for the drain we added
2964
- * so that we don't add it again
2965
- */
2966
- --rq->nr_phys_segments;
2967
- }
2968
-
2969
- rq = NULL;
2970
- break;
2971
- } else if (ret == BLKPREP_KILL || ret == BLKPREP_INVALID) {
2972
- rq->rq_flags |= RQF_QUIET;
2973
- /*
2974
- * Mark this request as started so we don't trigger
2975
- * any debug logic in the end I/O path.
2976
- */
2977
- blk_start_request(rq);
2978
- __blk_end_request_all(rq, ret == BLKPREP_INVALID ?
2979
- BLK_STS_TARGET : BLK_STS_IOERR);
2980
- } else {
2981
- printk(KERN_ERR "%s: bad return=%d\n", __func__, ret);
2982
- break;
2983
- }
2984
- }
2985
-
2986
- return rq;
1352
+ return __part_start_io_acct(*part, bio_sectors(bio), bio_op(bio));
29871353 }
2988
-EXPORT_SYMBOL(blk_peek_request);
1354
+EXPORT_SYMBOL_GPL(part_start_io_acct);
29891355
2990
-static void blk_dequeue_request(struct request *rq)
1356
+unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors,
1357
+ unsigned int op)
29911358 {
2992
- struct request_queue *q = rq->q;
1359
+ return __part_start_io_acct(&disk->part0, sectors, op);
1360
+}
1361
+EXPORT_SYMBOL(disk_start_io_acct);
29931362
2994
- BUG_ON(list_empty(&rq->queuelist));
2995
- BUG_ON(ELV_ON_HASH(rq));
1363
+static void __part_end_io_acct(struct hd_struct *part, unsigned int op,
1364
+ unsigned long start_time)
1365
+{
1366
+ const int sgrp = op_stat_group(op);
1367
+ unsigned long now = READ_ONCE(jiffies);
1368
+ unsigned long duration = now - start_time;
29961369
2997
- list_del_init(&rq->queuelist);
2998
-
2999
- /*
3000
- * the time frame between a request being removed from the lists
3001
- * and to it is freed is accounted as io that is in progress at
3002
- * the driver side.
3003
- */
3004
- if (blk_account_rq(rq))
3005
- q->in_flight[rq_is_sync(rq)]++;
1370
+ part_stat_lock();
1371
+ update_io_ticks(part, now, true);
1372
+ part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration));
1373
+ part_stat_local_dec(part, in_flight[op_is_write(op)]);
1374
+ part_stat_unlock();
30061375 }
30071376
3008
-/**
3009
- * blk_start_request - start request processing on the driver
3010
- * @req: request to dequeue
3011
- *
3012
- * Description:
3013
- * Dequeue @req and start timeout timer on it. This hands off the
3014
- * request to the driver.
3015
- */
3016
-void blk_start_request(struct request *req)
1377
+void part_end_io_acct(struct hd_struct *part, struct bio *bio,
1378
+ unsigned long start_time)
30171379 {
3018
- lockdep_assert_held(req->q->queue_lock);
3019
- WARN_ON_ONCE(req->q->mq_ops);
3020
-
3021
- blk_dequeue_request(req);
3022
-
3023
- if (test_bit(QUEUE_FLAG_STATS, &req->q->queue_flags)) {
3024
- req->io_start_time_ns = ktime_get_ns();
3025
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
3026
- req->throtl_size = blk_rq_sectors(req);
3027
-#endif
3028
- req->rq_flags |= RQF_STATS;
3029
- rq_qos_issue(req->q, req);
3030
- }
3031
-
3032
- BUG_ON(blk_rq_is_complete(req));
3033
- blk_add_timer(req);
1380
+ __part_end_io_acct(part, bio_op(bio), start_time);
1381
+ hd_struct_put(part);
30341382 }
3035
-EXPORT_SYMBOL(blk_start_request);
1383
+EXPORT_SYMBOL_GPL(part_end_io_acct);
30361384
3037
-/**
3038
- * blk_fetch_request - fetch a request from a request queue
3039
- * @q: request queue to fetch a request from
3040
- *
3041
- * Description:
3042
- * Return the request at the top of @q. The request is started on
3043
- * return and LLD can start processing it immediately.
3044
- *
3045
- * Return:
3046
- * Pointer to the request at the top of @q if available. Null
3047
- * otherwise.
3048
- */
3049
-struct request *blk_fetch_request(struct request_queue *q)
1385
+void disk_end_io_acct(struct gendisk *disk, unsigned int op,
1386
+ unsigned long start_time)
30501387 {
3051
- struct request *rq;
3052
-
3053
- lockdep_assert_held(q->queue_lock);
3054
- WARN_ON_ONCE(q->mq_ops);
3055
-
3056
- rq = blk_peek_request(q);
3057
- if (rq)
3058
- blk_start_request(rq);
3059
- return rq;
1388
+ __part_end_io_acct(&disk->part0, op, start_time);
30601389 }
3061
-EXPORT_SYMBOL(blk_fetch_request);
1390
+EXPORT_SYMBOL(disk_end_io_acct);
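With accounting detached from the removed request paths, bio-based drivers are expected to bracket each bio with the start/end helpers exported above. A hedged sketch of a trivial ->submit_bio doing so; the driver is hypothetical:

#include <linux/bio.h>
#include <linux/blkdev.h>

static blk_qc_t example_ramdisk_submit_bio(struct bio *bio)
{
	struct gendisk *disk = bio->bi_disk;
	unsigned int op = bio_op(bio);
	unsigned long start;

	start = disk_start_io_acct(disk, bio_sectors(bio), op);

	/* ... service the bio here (copy to/from the backing store) ... */

	disk_end_io_acct(disk, op, start);
	bio_endio(bio);
	return BLK_QC_T_NONE;
}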
30621391
30631392 /*
30641393 * Steal bios from a request and add them to a bio list.
....@@ -3094,7 +1423,7 @@
30941423 *
30951424 * This special helper function is only for request stacking drivers
30961425 * (e.g. request-based dm) so that they can handle partial completion.
3097
- * Actual device drivers should use blk_end_request instead.
1426
+ * Actual device drivers should use blk_mq_end_request instead.
30981427 *
30991428 * Passing the result of blk_rq_bytes() as @nr_bytes guarantees
31001429 * %false return from this function.
....@@ -3117,9 +1446,15 @@
31171446 if (!req->bio)
31181447 return false;
31191448
1449
+#ifdef CONFIG_BLK_DEV_INTEGRITY
1450
+ if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ &&
1451
+ error == BLK_STS_OK)
1452
+ req->q->integrity.profile->complete_fn(req, nr_bytes);
1453
+#endif
1454
+
31201455 if (unlikely(error && !blk_rq_is_passthrough(req) &&
31211456 !(req->rq_flags & RQF_QUIET)))
3122
- print_req_error(req, error);
1457
+ print_req_error(req, error, __func__);
31231458
31241459 blk_account_io_completion(req, nr_bytes);
31251460
....@@ -3178,276 +1513,12 @@
31781513 }
31791514
31801515 /* recalculate the number of segments */
3181
- blk_recalc_rq_segments(req);
1516
+ req->nr_phys_segments = blk_recalc_rq_segments(req);
31821517 }
31831518
31841519 return true;
31851520 }
31861521 EXPORT_SYMBOL_GPL(blk_update_request);
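blk_update_request() is the partial-completion primitive for request-stacking drivers: it retires @nr_bytes worth of bios and says whether anything is left. A hedged sketch of the return-value contract; the helper name is hypothetical and the flow only loosely follows how SCSI-style drivers consume it:

#include <linux/blk-mq.h>

static void example_complete_bytes(struct request *rq, unsigned int good_bytes)
{
	if (!blk_update_request(rq, BLK_STS_OK, good_bytes)) {
		/* false: all bios retired, the request itself can be ended. */
		__blk_mq_end_request(rq, BLK_STS_OK);
		return;
	}

	/* true: bios remain; re-issue the updated request for the rest. */
	blk_mq_requeue_request(rq, true);
}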
3187
-
3188
-static bool blk_update_bidi_request(struct request *rq, blk_status_t error,
3189
- unsigned int nr_bytes,
3190
- unsigned int bidi_bytes)
3191
-{
3192
- if (blk_update_request(rq, error, nr_bytes))
3193
- return true;
3194
-
3195
- /* Bidi request must be completed as a whole */
3196
- if (unlikely(blk_bidi_rq(rq)) &&
3197
- blk_update_request(rq->next_rq, error, bidi_bytes))
3198
- return true;
3199
-
3200
- if (blk_queue_add_random(rq->q))
3201
- add_disk_randomness(rq->rq_disk);
3202
-
3203
- return false;
3204
-}
3205
-
3206
-/**
3207
- * blk_unprep_request - unprepare a request
3208
- * @req: the request
3209
- *
3210
- * This function makes a request ready for complete resubmission (or
3211
- * completion). It happens only after all error handling is complete,
3212
- * so represents the appropriate moment to deallocate any resources
3213
- * that were allocated to the request in the prep_rq_fn. The queue
3214
- * lock is held when calling this.
3215
- */
3216
-void blk_unprep_request(struct request *req)
3217
-{
3218
- struct request_queue *q = req->q;
3219
-
3220
- req->rq_flags &= ~RQF_DONTPREP;
3221
- if (q->unprep_rq_fn)
3222
- q->unprep_rq_fn(q, req);
3223
-}
3224
-EXPORT_SYMBOL_GPL(blk_unprep_request);
3225
-
3226
-void blk_finish_request(struct request *req, blk_status_t error)
3227
-{
3228
- struct request_queue *q = req->q;
3229
- u64 now = ktime_get_ns();
3230
-
3231
- lockdep_assert_held(req->q->queue_lock);
3232
- WARN_ON_ONCE(q->mq_ops);
3233
-
3234
- if (req->rq_flags & RQF_STATS)
3235
- blk_stat_add(req, now);
3236
-
3237
- if (req->rq_flags & RQF_QUEUED)
3238
- blk_queue_end_tag(q, req);
3239
-
3240
- BUG_ON(blk_queued_rq(req));
3241
-
3242
- if (unlikely(laptop_mode) && !blk_rq_is_passthrough(req))
3243
- laptop_io_completion(req->q->backing_dev_info);
3244
-
3245
- blk_delete_timer(req);
3246
-
3247
- if (req->rq_flags & RQF_DONTPREP)
3248
- blk_unprep_request(req);
3249
-
3250
- blk_account_io_done(req, now);
3251
-
3252
- if (req->end_io) {
3253
- rq_qos_done(q, req);
3254
- req->end_io(req, error);
3255
- } else {
3256
- if (blk_bidi_rq(req))
3257
- __blk_put_request(req->next_rq->q, req->next_rq);
3258
-
3259
- __blk_put_request(q, req);
3260
- }
3261
-}
3262
-EXPORT_SYMBOL(blk_finish_request);
3263
-
3264
-/**
3265
- * blk_end_bidi_request - Complete a bidi request
3266
- * @rq: the request to complete
3267
- * @error: block status code
3268
- * @nr_bytes: number of bytes to complete @rq
3269
- * @bidi_bytes: number of bytes to complete @rq->next_rq
3270
- *
3271
- * Description:
3272
- * Ends I/O on a number of bytes attached to @rq and @rq->next_rq.
3273
- * Drivers that supports bidi can safely call this member for any
3274
- * type of request, bidi or uni. In the later case @bidi_bytes is
3275
- * just ignored.
3276
- *
3277
- * Return:
3278
- * %false - we are done with this request
3279
- * %true - still buffers pending for this request
3280
- **/
3281
-static bool blk_end_bidi_request(struct request *rq, blk_status_t error,
3282
- unsigned int nr_bytes, unsigned int bidi_bytes)
3283
-{
3284
- struct request_queue *q = rq->q;
3285
- unsigned long flags;
3286
-
3287
- WARN_ON_ONCE(q->mq_ops);
3288
-
3289
- if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes))
3290
- return true;
3291
-
3292
- spin_lock_irqsave(q->queue_lock, flags);
3293
- blk_finish_request(rq, error);
3294
- spin_unlock_irqrestore(q->queue_lock, flags);
3295
-
3296
- return false;
3297
-}
3298
-
3299
-/**
3300
- * __blk_end_bidi_request - Complete a bidi request with queue lock held
3301
- * @rq: the request to complete
3302
- * @error: block status code
3303
- * @nr_bytes: number of bytes to complete @rq
3304
- * @bidi_bytes: number of bytes to complete @rq->next_rq
3305
- *
3306
- * Description:
3307
- * Identical to blk_end_bidi_request() except that queue lock is
3308
- * assumed to be locked on entry and remains so on return.
3309
- *
3310
- * Return:
3311
- * %false - we are done with this request
3312
- * %true - still buffers pending for this request
3313
- **/
3314
-static bool __blk_end_bidi_request(struct request *rq, blk_status_t error,
3315
- unsigned int nr_bytes, unsigned int bidi_bytes)
3316
-{
3317
- lockdep_assert_held(rq->q->queue_lock);
3318
- WARN_ON_ONCE(rq->q->mq_ops);
3319
-
3320
- if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes))
3321
- return true;
3322
-
3323
- blk_finish_request(rq, error);
3324
-
3325
- return false;
3326
-}
3327
-
3328
-/**
3329
- * blk_end_request - Helper function for drivers to complete the request.
3330
- * @rq: the request being processed
3331
- * @error: block status code
3332
- * @nr_bytes: number of bytes to complete
3333
- *
3334
- * Description:
3335
- * Ends I/O on a number of bytes attached to @rq.
3336
- * If @rq has leftover, sets it up for the next range of segments.
3337
- *
3338
- * Return:
3339
- * %false - we are done with this request
3340
- * %true - still buffers pending for this request
3341
- **/
3342
-bool blk_end_request(struct request *rq, blk_status_t error,
3343
- unsigned int nr_bytes)
3344
-{
3345
- WARN_ON_ONCE(rq->q->mq_ops);
3346
- return blk_end_bidi_request(rq, error, nr_bytes, 0);
3347
-}
3348
-EXPORT_SYMBOL(blk_end_request);
3349
-
3350
-/**
3351
- * blk_end_request_all - Helper function for drives to finish the request.
3352
- * @rq: the request to finish
3353
- * @error: block status code
3354
- *
3355
- * Description:
3356
- * Completely finish @rq.
3357
- */
3358
-void blk_end_request_all(struct request *rq, blk_status_t error)
3359
-{
3360
- bool pending;
3361
- unsigned int bidi_bytes = 0;
3362
-
3363
- if (unlikely(blk_bidi_rq(rq)))
3364
- bidi_bytes = blk_rq_bytes(rq->next_rq);
3365
-
3366
- pending = blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes);
3367
- BUG_ON(pending);
3368
-}
3369
-EXPORT_SYMBOL(blk_end_request_all);
3370
-
3371
-/**
3372
- * __blk_end_request - Helper function for drivers to complete the request.
3373
- * @rq: the request being processed
3374
- * @error: block status code
3375
- * @nr_bytes: number of bytes to complete
3376
- *
3377
- * Description:
3378
- * Must be called with queue lock held unlike blk_end_request().
3379
- *
3380
- * Return:
3381
- * %false - we are done with this request
3382
- * %true - still buffers pending for this request
3383
- **/
3384
-bool __blk_end_request(struct request *rq, blk_status_t error,
3385
- unsigned int nr_bytes)
3386
-{
3387
- lockdep_assert_held(rq->q->queue_lock);
3388
- WARN_ON_ONCE(rq->q->mq_ops);
3389
-
3390
- return __blk_end_bidi_request(rq, error, nr_bytes, 0);
3391
-}
3392
-EXPORT_SYMBOL(__blk_end_request);
3393
-
3394
-/**
3395
- * __blk_end_request_all - Helper function for drives to finish the request.
3396
- * @rq: the request to finish
3397
- * @error: block status code
3398
- *
3399
- * Description:
3400
- * Completely finish @rq. Must be called with queue lock held.
3401
- */
3402
-void __blk_end_request_all(struct request *rq, blk_status_t error)
3403
-{
3404
- bool pending;
3405
- unsigned int bidi_bytes = 0;
3406
-
3407
- lockdep_assert_held(rq->q->queue_lock);
3408
- WARN_ON_ONCE(rq->q->mq_ops);
3409
-
3410
- if (unlikely(blk_bidi_rq(rq)))
3411
- bidi_bytes = blk_rq_bytes(rq->next_rq);
3412
-
3413
- pending = __blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes);
3414
- BUG_ON(pending);
3415
-}
3416
-EXPORT_SYMBOL(__blk_end_request_all);
3417
-
3418
-/**
3419
- * __blk_end_request_cur - Helper function to finish the current request chunk.
3420
- * @rq: the request to finish the current chunk for
3421
- * @error: block status code
3422
- *
3423
- * Description:
3424
- * Complete the current consecutively mapped chunk from @rq. Must
3425
- * be called with queue lock held.
3426
- *
3427
- * Return:
3428
- * %false - we are done with this request
3429
- * %true - still buffers pending for this request
3430
- */
3431
-bool __blk_end_request_cur(struct request *rq, blk_status_t error)
3432
-{
3433
- return __blk_end_request(rq, error, blk_rq_cur_bytes(rq));
3434
-}
3435
-EXPORT_SYMBOL(__blk_end_request_cur);
3436
-
3437
-void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
3438
- struct bio *bio)
3439
-{
3440
- if (bio_has_data(bio))
3441
- rq->nr_phys_segments = bio_phys_segments(q, bio);
3442
- else if (bio_op(bio) == REQ_OP_DISCARD)
3443
- rq->nr_phys_segments = 1;
3444
-
3445
- rq->__data_len = bio->bi_iter.bi_size;
3446
- rq->bio = rq->biotail = bio;
3447
-
3448
- if (bio->bi_disk)
3449
- rq->rq_disk = bio->bi_disk;
3450
-}
34511522
34521523 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
34531524 /**
....@@ -3489,8 +1560,8 @@
34891560 */
34901561 int blk_lld_busy(struct request_queue *q)
34911562 {
3492
- if (q->lld_busy_fn)
3493
- return q->lld_busy_fn(q);
1563
+ if (queue_is_mq(q) && q->mq_ops->busy)
1564
+ return q->mq_ops->busy(q);
34941565
34951566 return 0;
34961567 }
....@@ -3515,24 +1586,6 @@
35151586 }
35161587 EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);
35171588
3518
-/*
3519
- * Copy attributes of the original request to the clone request.
3520
- * The actual data parts (e.g. ->cmd, ->sense) are not copied.
3521
- */
3522
-static void __blk_rq_prep_clone(struct request *dst, struct request *src)
3523
-{
3524
- dst->cpu = src->cpu;
3525
- dst->__sector = blk_rq_pos(src);
3526
- dst->__data_len = blk_rq_bytes(src);
3527
- if (src->rq_flags & RQF_SPECIAL_PAYLOAD) {
3528
- dst->rq_flags |= RQF_SPECIAL_PAYLOAD;
3529
- dst->special_vec = src->special_vec;
3530
- }
3531
- dst->nr_phys_segments = src->nr_phys_segments;
3532
- dst->ioprio = src->ioprio;
3533
- dst->extra_len = src->extra_len;
3534
-}
3535
-
35361589 /**
35371590 * blk_rq_prep_clone - Helper function to setup clone request
35381591 * @rq: the request to be setup
....@@ -3545,8 +1598,6 @@
35451598 *
35461599 * Description:
35471600 * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq.
3548
- * The actual data parts of @rq_src (e.g. ->cmd, ->sense)
3549
- * are not copied, and copying such parts is the caller's responsibility.
35501601 * Also, pages which the original bios are pointing to are not copied
35511602 * and the cloned bios just point same pages.
35521603 * So cloned bios must be completed before original bios, which means
....@@ -3573,11 +1624,24 @@
35731624 if (rq->bio) {
35741625 rq->biotail->bi_next = bio;
35751626 rq->biotail = bio;
3576
- } else
1627
+ } else {
35771628 rq->bio = rq->biotail = bio;
1629
+ }
1630
+ bio = NULL;
35781631 }
35791632
3580
- __blk_rq_prep_clone(rq, rq_src);
1633
+ /* Copy attributes of the original request to the clone request. */
1634
+ rq->__sector = blk_rq_pos(rq_src);
1635
+ rq->__data_len = blk_rq_bytes(rq_src);
1636
+ if (rq_src->rq_flags & RQF_SPECIAL_PAYLOAD) {
1637
+ rq->rq_flags |= RQF_SPECIAL_PAYLOAD;
1638
+ rq->special_vec = rq_src->special_vec;
1639
+ }
1640
+ rq->nr_phys_segments = rq_src->nr_phys_segments;
1641
+ rq->ioprio = rq_src->ioprio;
1642
+
1643
+ if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0)
1644
+ goto free_and_out;
35811645
35821646 return 0;
35831647
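The completion-ordering rule in the description above (the clone must be finished before the request it was cloned from) is the caller's responsibility. The sketch below shows the shape of a clone completion handler that honours it; it is illustrative only: the mydrv_ name is hypothetical, and stashing the original request in end_io_data is just one way a stacking driver might track it.

#include <linux/blkdev.h>
#include <linux/blk-mq.h>

/*
 * Hypothetical end_io handler for a request cloned with blk_rq_prep_clone().
 * The clone's resources are released first; only then is the original
 * request completed, preserving the "clone before original" ordering.
 * Freeing the clone request itself is driver-specific and not shown.
 */
static void mydrv_clone_end_io(struct request *clone, blk_status_t error)
{
        struct request *orig = clone->end_io_data; /* stashed at clone time */

        blk_rq_unprep_clone(clone);       /* drop the cloned bios first... */
        blk_mq_end_request(orig, error);  /* ...then finish the original */
}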
....@@ -3596,12 +1660,6 @@
35961660 }
35971661 EXPORT_SYMBOL(kblockd_schedule_work);
35981662
3599
-int kblockd_schedule_work_on(int cpu, struct work_struct *work)
3600
-{
3601
- return queue_work_on(cpu, kblockd_workqueue, work);
3602
-}
3603
-EXPORT_SYMBOL(kblockd_schedule_work_on);
3604
-
36051663 int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork,
36061664 unsigned long delay)
36071665 {
....@@ -3614,6 +1672,15 @@
36141672 * @plug: The &struct blk_plug that needs to be initialized
36151673 *
36161674 * Description:
1675
+ * blk_start_plug() indicates to the block layer an intent by the caller
1676
+ * to submit multiple I/O requests in a batch. The block layer may use
1677
+ * this hint to defer submitting I/Os from the caller until blk_finish_plug()
1678
+ * is called. However, the block layer may choose to submit requests
1679
+ * before a call to blk_finish_plug() if the number of queued I/Os
1680
+ * exceeds %BLK_MAX_REQUEST_COUNT, or if the size of the I/O is larger than
1681
+ * %BLK_PLUG_FLUSH_SIZE. The queued I/Os may also be submitted early if
1682
+ * the task schedules (see below).
1683
+ *
36171684 * Tracking blk_plug inside the task_struct will help with auto-flushing the
36181685 * pending I/O should the task end up blocking between blk_start_plug() and
36191686 * blk_finish_plug(). This is important from a performance perspective, but
....@@ -3633,9 +1700,12 @@
36331700 if (tsk->plug)
36341701 return;
36351702
3636
- INIT_LIST_HEAD(&plug->list);
36371703 INIT_LIST_HEAD(&plug->mq_list);
36381704 INIT_LIST_HEAD(&plug->cb_list);
1705
+ plug->rq_count = 0;
1706
+ plug->multiple_queues = false;
1707
+ plug->nowait = false;
1708
+
36391709 /*
36401710 * Store ordering should not be needed here, since a potential
36411711 * preempt will imply a full memory barrier
....@@ -3643,36 +1713,6 @@
36431713 tsk->plug = plug;
36441714 }
36451715 EXPORT_SYMBOL(blk_start_plug);
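A minimal usage sketch of the plugging interface documented above, assuming a caller that already holds a set of prepared bios (the helper name and its arguments are illustrative):

#include <linux/bio.h>
#include <linux/blkdev.h>

/*
 * Batch several bio submissions under one plug so the block layer can
 * merge and defer them until blk_finish_plug(), or earlier if the task
 * schedules or the per-plug limits mentioned above are exceeded.
 */
static void mydrv_submit_batch(struct bio **bios, unsigned int nr)
{
        struct blk_plug plug;
        unsigned int i;

        blk_start_plug(&plug);
        for (i = 0; i < nr; i++)
                submit_bio(bios[i]);
        blk_finish_plug(&plug);
}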
3646
-
3647
-static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
3648
-{
3649
- struct request *rqa = container_of(a, struct request, queuelist);
3650
- struct request *rqb = container_of(b, struct request, queuelist);
3651
-
3652
- return !(rqa->q < rqb->q ||
3653
- (rqa->q == rqb->q && blk_rq_pos(rqa) < blk_rq_pos(rqb)));
3654
-}
3655
-
3656
-/*
3657
- * If 'from_schedule' is true, then postpone the dispatch of requests
3658
- * until a safe kblockd context. We do this to avoid accidental big
3659
- * additional stack usage in driver dispatch, in places where the original
3660
- * plugger did not intend it.
3661
- */
3662
-static void queue_unplugged(struct request_queue *q, unsigned int depth,
3663
- bool from_schedule)
3664
- __releases(q->queue_lock)
3665
-{
3666
- lockdep_assert_held(q->queue_lock);
3667
-
3668
- trace_block_unplug(q, depth, !from_schedule);
3669
-
3670
- if (from_schedule)
3671
- blk_run_queue_async(q);
3672
- else
3673
- __blk_run_queue(q);
3674
- spin_unlock_irq(q->queue_lock);
3675
-}
36761716
36771717 static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule)
36781718 {
....@@ -3718,67 +1758,22 @@
37181758
37191759 void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
37201760 {
3721
- struct request_queue *q;
3722
- struct request *rq;
3723
- LIST_HEAD(list);
3724
- unsigned int depth;
3725
-
37261761 flush_plug_callbacks(plug, from_schedule);
37271762
37281763 if (!list_empty(&plug->mq_list))
37291764 blk_mq_flush_plug_list(plug, from_schedule);
3730
-
3731
- if (list_empty(&plug->list))
3732
- return;
3733
-
3734
- list_splice_init(&plug->list, &list);
3735
-
3736
- list_sort(NULL, &list, plug_rq_cmp);
3737
-
3738
- q = NULL;
3739
- depth = 0;
3740
-
3741
- while (!list_empty(&list)) {
3742
- rq = list_entry_rq(list.next);
3743
- list_del_init(&rq->queuelist);
3744
- BUG_ON(!rq->q);
3745
- if (rq->q != q) {
3746
- /*
3747
- * This drops the queue lock
3748
- */
3749
- if (q)
3750
- queue_unplugged(q, depth, from_schedule);
3751
- q = rq->q;
3752
- depth = 0;
3753
- spin_lock_irq(q->queue_lock);
3754
- }
3755
-
3756
- /*
3757
- * Short-circuit if @q is dead
3758
- */
3759
- if (unlikely(blk_queue_dying(q))) {
3760
- __blk_end_request_all(rq, BLK_STS_IOERR);
3761
- continue;
3762
- }
3763
-
3764
- /*
3765
- * rq is already accounted, so use raw insert
3766
- */
3767
- if (op_is_flush(rq->cmd_flags))
3768
- __elv_add_request(q, rq, ELEVATOR_INSERT_FLUSH);
3769
- else
3770
- __elv_add_request(q, rq, ELEVATOR_INSERT_SORT_MERGE);
3771
-
3772
- depth++;
3773
- }
3774
-
3775
- /*
3776
- * This drops the queue lock
3777
- */
3778
- if (q)
3779
- queue_unplugged(q, depth, from_schedule);
37801765 }
37811766
1767
+/**
1768
+ * blk_finish_plug - mark the end of a batch of submitted I/O
1769
+ * @plug: The &struct blk_plug passed to blk_start_plug()
1770
+ *
1771
+ * Description:
1772
+ * Indicate that a batch of I/O submissions is complete. This function
1773
+ * must be paired with an initial call to blk_start_plug(). The intent
1774
+ * is to allow the block layer to optimize I/O submission. See the
1775
+ * documentation for blk_start_plug() for more information.
1776
+ */
37821777 void blk_finish_plug(struct blk_plug *plug)
37831778 {
37841779 if (plug != current->plug)
....@@ -3789,198 +1784,25 @@
37891784 }
37901785 EXPORT_SYMBOL(blk_finish_plug);
37911786
3792
-#ifdef CONFIG_PM
3793
-/**
3794
- * blk_pm_runtime_init - Block layer runtime PM initialization routine
3795
- * @q: the queue of the device
3796
- * @dev: the device the queue belongs to
3797
- *
3798
- * Description:
3799
- * Initialize runtime-PM-related fields for @q and start auto suspend for
3800
- * @dev. Drivers that want to take advantage of request-based runtime PM
3801
- * should call this function after @dev has been initialized, and its
3802
- * request queue @q has been allocated, and runtime PM for it cannot happen
3803
- * yet (either due to disabled/forbidden or its usage_count > 0). In most
3804
- * cases, driver should call this function before any I/O has taken place.
3805
- *
3806
- * This function takes care of setting up autosuspend for the device;
3807
- * the autosuspend delay is set to -1 to make runtime suspend impossible
3808
- * until an updated value is either set by user or by driver. Drivers do
3809
- * not need to touch other autosuspend settings.
3810
- *
3811
- * The block layer runtime PM is request based, so only works for drivers
3812
- * that use request as their IO unit instead of those that directly use bios.
3813
- */
3814
-void blk_pm_runtime_init(struct request_queue *q, struct device *dev)
1787
+void blk_io_schedule(void)
38151788 {
3816
- /* Don't enable runtime PM for blk-mq until it is ready */
3817
- if (q->mq_ops) {
3818
- pm_runtime_disable(dev);
3819
- return;
3820
- }
1789
+ /* Prevent hang_check timer from firing at us during very long I/O */
1790
+ unsigned long timeout = sysctl_hung_task_timeout_secs * HZ / 2;
38211791
3822
- q->dev = dev;
3823
- q->rpm_status = RPM_ACTIVE;
3824
- pm_runtime_set_autosuspend_delay(q->dev, -1);
3825
- pm_runtime_use_autosuspend(q->dev);
1792
+ if (timeout)
1793
+ io_schedule_timeout(timeout);
1794
+ else
1795
+ io_schedule();
38261796 }
3827
-EXPORT_SYMBOL(blk_pm_runtime_init);
3828
-
3829
-/**
3830
- * blk_pre_runtime_suspend - Pre runtime suspend check
3831
- * @q: the queue of the device
3832
- *
3833
- * Description:
3834
- * This function will check if runtime suspend is allowed for the device
3835
- * by examining if there are any requests pending in the queue. If there
3836
- * are requests pending, the device cannot be runtime suspended; otherwise,
3837
- * the queue's status will be updated to SUSPENDING and the driver can
3838
- * proceed to suspend the device.
3839
- *
3840
- * If suspend is not allowed, we mark last busy for the device so that
3841
- * runtime PM core will try to autosuspend it some time later.
3842
- *
3843
- * This function should be called near the start of the device's
3844
- * runtime_suspend callback.
3845
- *
3846
- * Return:
3847
- * 0 - OK to runtime suspend the device
3848
- * -EBUSY - Device should not be runtime suspended
3849
- */
3850
-int blk_pre_runtime_suspend(struct request_queue *q)
3851
-{
3852
- int ret = 0;
3853
-
3854
- if (!q->dev)
3855
- return ret;
3856
-
3857
- spin_lock_irq(q->queue_lock);
3858
- if (q->nr_pending) {
3859
- ret = -EBUSY;
3860
- pm_runtime_mark_last_busy(q->dev);
3861
- } else {
3862
- q->rpm_status = RPM_SUSPENDING;
3863
- }
3864
- spin_unlock_irq(q->queue_lock);
3865
- return ret;
3866
-}
3867
-EXPORT_SYMBOL(blk_pre_runtime_suspend);
3868
-
3869
-/**
3870
- * blk_post_runtime_suspend - Post runtime suspend processing
3871
- * @q: the queue of the device
3872
- * @err: return value of the device's runtime_suspend function
3873
- *
3874
- * Description:
3875
- * Update the queue's runtime status according to the return value of the
3876
- * device's runtime suspend function and mark last busy for the device so
3877
- * that PM core will try to auto suspend the device at a later time.
3878
- *
3879
- * This function should be called near the end of the device's
3880
- * runtime_suspend callback.
3881
- */
3882
-void blk_post_runtime_suspend(struct request_queue *q, int err)
3883
-{
3884
- if (!q->dev)
3885
- return;
3886
-
3887
- spin_lock_irq(q->queue_lock);
3888
- if (!err) {
3889
- q->rpm_status = RPM_SUSPENDED;
3890
- } else {
3891
- q->rpm_status = RPM_ACTIVE;
3892
- pm_runtime_mark_last_busy(q->dev);
3893
- }
3894
- spin_unlock_irq(q->queue_lock);
3895
-}
3896
-EXPORT_SYMBOL(blk_post_runtime_suspend);
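blk_pre_runtime_suspend() and blk_post_runtime_suspend() are meant to bracket a driver's runtime_suspend callback as described above. A hedged sketch of that shape follows; struct mydrv, mydrv_hw_suspend() and the drvdata lookup are hypothetical, and blk_pm_runtime_init() is assumed to have been called at probe time.

#include <linux/blk-pm.h>
#include <linux/device.h>
#include <linux/pm_runtime.h>

struct mydrv {                          /* hypothetical driver private data */
        struct request_queue *queue;
};

static int mydrv_hw_suspend(struct mydrv *md)
{
        return 0;                       /* placeholder for real power-down */
}

static int mydrv_runtime_suspend(struct device *dev)
{
        struct mydrv *md = dev_get_drvdata(dev);
        int err;

        /* Refuse (-EBUSY) while requests are still pending on the queue. */
        err = blk_pre_runtime_suspend(md->queue);
        if (err)
                return err;

        err = mydrv_hw_suspend(md);

        /* Record the outcome so the queue status matches the device state. */
        blk_post_runtime_suspend(md->queue, err);
        return err;
}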
3897
-
3898
-/**
3899
- * blk_pre_runtime_resume - Pre runtime resume processing
3900
- * @q: the queue of the device
3901
- *
3902
- * Description:
3903
- * Update the queue's runtime status to RESUMING in preparation for the
3904
- * runtime resume of the device.
3905
- *
3906
- * This function should be called near the start of the device's
3907
- * runtime_resume callback.
3908
- */
3909
-void blk_pre_runtime_resume(struct request_queue *q)
3910
-{
3911
- if (!q->dev)
3912
- return;
3913
-
3914
- spin_lock_irq(q->queue_lock);
3915
- q->rpm_status = RPM_RESUMING;
3916
- spin_unlock_irq(q->queue_lock);
3917
-}
3918
-EXPORT_SYMBOL(blk_pre_runtime_resume);
3919
-
3920
-/**
3921
- * blk_post_runtime_resume - Post runtime resume processing
3922
- * @q: the queue of the device
3923
- * @err: return value of the device's runtime_resume function
3924
- *
3925
- * Description:
3926
- * Update the queue's runtime status according to the return value of the
3927
- * device's runtime_resume function. If it is successfully resumed, process
3928
- * the requests that are queued into the device's queue when it is resuming
3929
- * and then mark last busy and initiate autosuspend for it.
3930
- *
3931
- * This function should be called near the end of the device's
3932
- * runtime_resume callback.
3933
- */
3934
-void blk_post_runtime_resume(struct request_queue *q, int err)
3935
-{
3936
- if (!q->dev)
3937
- return;
3938
-
3939
- spin_lock_irq(q->queue_lock);
3940
- if (!err) {
3941
- q->rpm_status = RPM_ACTIVE;
3942
- __blk_run_queue(q);
3943
- pm_runtime_mark_last_busy(q->dev);
3944
- pm_request_autosuspend(q->dev);
3945
- } else {
3946
- q->rpm_status = RPM_SUSPENDED;
3947
- }
3948
- spin_unlock_irq(q->queue_lock);
3949
-}
3950
-EXPORT_SYMBOL(blk_post_runtime_resume);
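Correspondingly, blk_pre_runtime_resume() and blk_post_runtime_resume() bracket the runtime_resume callback. A sketch continuing the hypothetical mydrv example (mydrv_hw_resume() is a placeholder):

static int mydrv_hw_resume(struct mydrv *md)
{
        return 0;                       /* placeholder for real power-up */
}

static int mydrv_runtime_resume(struct device *dev)
{
        struct mydrv *md = dev_get_drvdata(dev);
        int err;

        /* Mark the queue RESUMING before touching the hardware. */
        blk_pre_runtime_resume(md->queue);

        err = mydrv_hw_resume(md);

        /*
         * On success this restarts any requests queued while suspended and
         * re-arms autosuspend; on failure the queue goes back to SUSPENDED.
         */
        blk_post_runtime_resume(md->queue, err);
        return err;
}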
3951
-
3952
-/**
3953
- * blk_set_runtime_active - Force runtime status of the queue to be active
3954
- * @q: the queue of the device
3955
- *
3956
- * If the device is left runtime suspended during system suspend, the resume
3957
- * hook typically resumes the device and corrects runtime status
3958
- * accordingly. However, that does not affect the queue runtime PM status
3959
- * which is still "suspended". This prevents processing requests from the
3960
- * queue.
3961
- *
3962
- * This function can be used in driver's resume hook to correct queue
3963
- * runtime PM status and re-enable peeking requests from the queue. It
3964
- * should be called before first request is added to the queue.
3965
- */
3966
-void blk_set_runtime_active(struct request_queue *q)
3967
-{
3968
- spin_lock_irq(q->queue_lock);
3969
- q->rpm_status = RPM_ACTIVE;
3970
- pm_runtime_mark_last_busy(q->dev);
3971
- pm_request_autosuspend(q->dev);
3972
- spin_unlock_irq(q->queue_lock);
3973
-}
3974
-EXPORT_SYMBOL(blk_set_runtime_active);
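As the description notes, blk_set_runtime_active() belongs in the driver's system-resume path rather than its runtime-resume path. A hedged sketch, again reusing the hypothetical mydrv names:

static int mydrv_system_resume(struct device *dev)
{
        struct mydrv *md = dev_get_drvdata(dev);
        int err = mydrv_hw_resume(md);

        /*
         * The hardware is powered up here even though the queue may still
         * think it is runtime suspended, so realign the queue's status
         * before the first request is queued.
         */
        if (!err)
                blk_set_runtime_active(md->queue);
        return err;
}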
3975
-#endif
1797
+EXPORT_SYMBOL_GPL(blk_io_schedule);
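blk_io_schedule() is intended as a drop-in for a bare io_schedule() in submitters that may sleep for a long time waiting on I/O: each individual sleep stays below half the hung-task timeout, so the watchdog is not tripped by one long wait. A minimal wait-loop sketch, where the done flag and the wake-up from the completion side are assumed:

#include <linux/sched.h>
#include <linux/blkdev.h>

/*
 * Hypothetical wait for a batch of I/O. The completion path is assumed to
 * set *done and wake_up_process() this task once everything has finished.
 */
static void mydrv_wait_for_io(bool *done)
{
        for (;;) {
                set_current_state(TASK_UNINTERRUPTIBLE);
                if (READ_ONCE(*done))
                        break;
                blk_io_schedule();
        }
        __set_current_state(TASK_RUNNING);
}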
39761798
39771799 int __init blk_dev_init(void)
39781800 {
39791801 BUILD_BUG_ON(REQ_OP_LAST >= (1 << REQ_OP_BITS));
39801802 BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
3981
- FIELD_SIZEOF(struct request, cmd_flags));
1803
+ sizeof_field(struct request, cmd_flags));
39821804 BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
3983
- FIELD_SIZEOF(struct bio, bi_opf));
1805
+ sizeof_field(struct bio, bi_opf));
39841806
39851807 /* used for unplugging and affects IO latency/throughput - HIGHPRI */
39861808 kblockd_workqueue = alloc_workqueue("kblockd",
....@@ -3988,21 +1810,10 @@
39881810 if (!kblockd_workqueue)
39891811 panic("Failed to create kblockd\n");
39901812
3991
- request_cachep = kmem_cache_create("blkdev_requests",
3992
- sizeof(struct request), 0, SLAB_PANIC, NULL);
3993
-
39941813 blk_requestq_cachep = kmem_cache_create("request_queue",
39951814 sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
39961815
3997
-#ifdef CONFIG_DEBUG_FS
39981816 blk_debugfs_root = debugfs_create_dir("block", NULL);
3999
-#endif
4000
-
4001
- if (bio_crypt_ctx_init() < 0)
4002
- panic("Failed to allocate mem for bio crypt ctxs\n");
4003
-
4004
- if (blk_crypto_fallback_init() < 0)
4005
- panic("Failed to init blk-crypto-fallback\n");
40061817
40071818 return 0;
40081819 }