2024-02-20 102a0743326a03cd1a1202ceda21e175b7d3575c
kernel/block/blk-core.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * Copyright (C) 1991, 1992 Linus Torvalds
  * Copyright (C) 1994, Karl Keyte: Added support for disk statistics
@@ -19,6 +20,7 @@
 #include <linux/blk-mq.h>
 #include <linux/highmem.h>
 #include <linux/mm.h>
+#include <linux/pagemap.h>
 #include <linux/kernel_stat.h>
 #include <linux/string.h>
 #include <linux/init.h>
@@ -33,9 +35,11 @@
 #include <linux/ratelimit.h>
 #include <linux/pm_runtime.h>
 #include <linux/blk-cgroup.h>
+#include <linux/t10-pi.h>
 #include <linux/debugfs.h>
 #include <linux/bpf.h>
 #include <linux/psi.h>
+#include <linux/sched/sysctl.h>
 #include <linux/blk-crypto.h>
 
 #define CREATE_TRACE_POINTS
@@ -44,24 +48,25 @@
 #include "blk.h"
 #include "blk-mq.h"
 #include "blk-mq-sched.h"
+#include "blk-pm.h"
 #include "blk-rq-qos.h"
 
-#ifdef CONFIG_DEBUG_FS
 struct dentry *blk_debugfs_root;
-#endif
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_split);
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug);
+EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_queue);
+EXPORT_TRACEPOINT_SYMBOL_GPL(block_getrq);
+EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_insert);
+EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_issue);
+EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_merge);
+EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_requeue);
+EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_complete);
 
 DEFINE_IDA(blk_queue_ida);
-
-/*
- * For the allocated request tables
- */
-struct kmem_cache *request_cachep;
 
 /*
  * For queue allocation
@@ -80,11 +85,7 @@
  */
 void blk_queue_flag_set(unsigned int flag, struct request_queue *q)
 {
-	unsigned long flags;
-
-	spin_lock_irqsave(q->queue_lock, flags);
-	queue_flag_set(flag, q);
-	spin_unlock_irqrestore(q->queue_lock, flags);
+	set_bit(flag, &q->queue_flags);
 }
 EXPORT_SYMBOL(blk_queue_flag_set);
 
@@ -95,11 +96,7 @@
  */
 void blk_queue_flag_clear(unsigned int flag, struct request_queue *q)
 {
-	unsigned long flags;
-
-	spin_lock_irqsave(q->queue_lock, flags);
-	queue_flag_clear(flag, q);
-	spin_unlock_irqrestore(q->queue_lock, flags);
+	clear_bit(flag, &q->queue_flags);
 }
 EXPORT_SYMBOL(blk_queue_flag_clear);
 
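For context, after the hunks above the queue flag helpers are plain atomic bitops on q->queue_flags and no longer need q->queue_lock. A minimal sketch of driver-side usage; the function name example_setup_queue() is hypothetical, the flags and helpers are the real ones touched by this patch:

	static void example_setup_queue(struct request_queue *q)
	{
		/* set_bit()/clear_bit() under the hood; safe without the queue lock */
		blk_queue_flag_set(QUEUE_FLAG_NONROT, q);	/* non-rotational media */
		blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, q);	/* no entropy contribution */

		/* test_and_set returns the previous value of the flag */
		if (!blk_queue_flag_test_and_set(QUEUE_FLAG_NOMERGES, q))
			pr_debug("NOMERGES was not set before\n");
	}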
@@ -113,99 +110,67 @@
113110 */
114111 bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q)
115112 {
116
- unsigned long flags;
117
- bool res;
118
-
119
- spin_lock_irqsave(q->queue_lock, flags);
120
- res = queue_flag_test_and_set(flag, q);
121
- spin_unlock_irqrestore(q->queue_lock, flags);
122
-
123
- return res;
113
+ return test_and_set_bit(flag, &q->queue_flags);
124114 }
125115 EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_set);
126
-
127
-/**
128
- * blk_queue_flag_test_and_clear - atomically test and clear a queue flag
129
- * @flag: flag to be cleared
130
- * @q: request queue
131
- *
132
- * Returns the previous value of @flag - 0 if the flag was not set and 1 if
133
- * the flag was set.
134
- */
135
-bool blk_queue_flag_test_and_clear(unsigned int flag, struct request_queue *q)
136
-{
137
- unsigned long flags;
138
- bool res;
139
-
140
- spin_lock_irqsave(q->queue_lock, flags);
141
- res = queue_flag_test_and_clear(flag, q);
142
- spin_unlock_irqrestore(q->queue_lock, flags);
143
-
144
- return res;
145
-}
146
-EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_clear);
147
-
148
-static void blk_clear_congested(struct request_list *rl, int sync)
149
-{
150
-#ifdef CONFIG_CGROUP_WRITEBACK
151
- clear_wb_congested(rl->blkg->wb_congested, sync);
152
-#else
153
- /*
154
- * If !CGROUP_WRITEBACK, all blkg's map to bdi->wb and we shouldn't
155
- * flip its congestion state for events on other blkcgs.
156
- */
157
- if (rl == &rl->q->root_rl)
158
- clear_wb_congested(rl->q->backing_dev_info->wb.congested, sync);
159
-#endif
160
-}
161
-
162
-static void blk_set_congested(struct request_list *rl, int sync)
163
-{
164
-#ifdef CONFIG_CGROUP_WRITEBACK
165
- set_wb_congested(rl->blkg->wb_congested, sync);
166
-#else
167
- /* see blk_clear_congested() */
168
- if (rl == &rl->q->root_rl)
169
- set_wb_congested(rl->q->backing_dev_info->wb.congested, sync);
170
-#endif
171
-}
172
-
173
-void blk_queue_congestion_threshold(struct request_queue *q)
174
-{
175
- int nr;
176
-
177
- nr = q->nr_requests - (q->nr_requests / 8) + 1;
178
- if (nr > q->nr_requests)
179
- nr = q->nr_requests;
180
- q->nr_congestion_on = nr;
181
-
182
- nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1;
183
- if (nr < 1)
184
- nr = 1;
185
- q->nr_congestion_off = nr;
186
-}
187116
188117 void blk_rq_init(struct request_queue *q, struct request *rq)
189118 {
190119 memset(rq, 0, sizeof(*rq));
191120
192121 INIT_LIST_HEAD(&rq->queuelist);
193
- INIT_LIST_HEAD(&rq->timeout_list);
194
-#ifdef CONFIG_PREEMPT_RT_FULL
195
- INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work);
196
-#endif
197
- rq->cpu = -1;
198122 rq->q = q;
199123 rq->__sector = (sector_t) -1;
200124 INIT_HLIST_NODE(&rq->hash);
201125 RB_CLEAR_NODE(&rq->rb_node);
202
- rq->tag = -1;
203
- rq->internal_tag = -1;
126
+ rq->tag = BLK_MQ_NO_TAG;
127
+ rq->internal_tag = BLK_MQ_NO_TAG;
204128 rq->start_time_ns = ktime_get_ns();
205129 rq->part = NULL;
206
- refcount_set(&rq->ref, 1);
130
+ blk_crypto_rq_set_defaults(rq);
207131 }
208132 EXPORT_SYMBOL(blk_rq_init);
133
+
134
+#define REQ_OP_NAME(name) [REQ_OP_##name] = #name
135
+static const char *const blk_op_name[] = {
136
+ REQ_OP_NAME(READ),
137
+ REQ_OP_NAME(WRITE),
138
+ REQ_OP_NAME(FLUSH),
139
+ REQ_OP_NAME(DISCARD),
140
+ REQ_OP_NAME(SECURE_ERASE),
141
+ REQ_OP_NAME(ZONE_RESET),
142
+ REQ_OP_NAME(ZONE_RESET_ALL),
143
+ REQ_OP_NAME(ZONE_OPEN),
144
+ REQ_OP_NAME(ZONE_CLOSE),
145
+ REQ_OP_NAME(ZONE_FINISH),
146
+ REQ_OP_NAME(ZONE_APPEND),
147
+ REQ_OP_NAME(WRITE_SAME),
148
+ REQ_OP_NAME(WRITE_ZEROES),
149
+ REQ_OP_NAME(SCSI_IN),
150
+ REQ_OP_NAME(SCSI_OUT),
151
+ REQ_OP_NAME(DRV_IN),
152
+ REQ_OP_NAME(DRV_OUT),
153
+};
154
+#undef REQ_OP_NAME
155
+
156
+/**
157
+ * blk_op_str - Return string XXX in the REQ_OP_XXX.
158
+ * @op: REQ_OP_XXX.
159
+ *
160
+ * Description: Centralize block layer function to convert REQ_OP_XXX into
161
+ * string format. Useful in the debugging and tracing bio or request. For
162
+ * invalid REQ_OP_XXX it returns string "UNKNOWN".
163
+ */
164
+inline const char *blk_op_str(unsigned int op)
165
+{
166
+ const char *op_str = "UNKNOWN";
167
+
168
+ if (op < ARRAY_SIZE(blk_op_name) && blk_op_name[op])
169
+ op_str = blk_op_name[op];
170
+
171
+ return op_str;
172
+}
173
+EXPORT_SYMBOL_GPL(blk_op_str);
209174
210175 static const struct {
211176 int errno;
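For context, blk_op_str() added above is aimed at debugging and tracing; a small usage sketch, assuming a struct request *rq is in scope (the message text is illustrative only):

	/* prints e.g. "completing op WRITE"; invalid ops become "UNKNOWN" */
	pr_debug("completing op %s\n", blk_op_str(req_op(rq)));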
@@ -226,6 +191,10 @@
226191
227192 /* device mapper special case, should not leak out: */
228193 [BLK_STS_DM_REQUEUE] = { -EREMCHG, "dm internal retry" },
194
+
195
+ /* zone device specific errors */
196
+ [BLK_STS_ZONE_OPEN_RESOURCE] = { -ETOOMANYREFS, "open zones exceeded" },
197
+ [BLK_STS_ZONE_ACTIVE_RESOURCE] = { -EOVERFLOW, "active zones exceeded" },
229198
230199 /* everything else not covered above: */
231200 [BLK_STS_IOERR] = { -EIO, "I/O" },
@@ -254,17 +223,23 @@
254223 }
255224 EXPORT_SYMBOL_GPL(blk_status_to_errno);
256225
257
-static void print_req_error(struct request *req, blk_status_t status)
226
+static void print_req_error(struct request *req, blk_status_t status,
227
+ const char *caller)
258228 {
259229 int idx = (__force int)status;
260230
261231 if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors)))
262232 return;
263233
264
- printk_ratelimited(KERN_ERR "%s: %s error, dev %s, sector %llu\n",
265
- __func__, blk_errors[idx].name, req->rq_disk ?
266
- req->rq_disk->disk_name : "?",
267
- (unsigned long long)blk_rq_pos(req));
234
+ printk_ratelimited(KERN_ERR
235
+ "%s: %s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x "
236
+ "phys_seg %u prio class %u\n",
237
+ caller, blk_errors[idx].name,
238
+ req->rq_disk ? req->rq_disk->disk_name : "?",
239
+ blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)),
240
+ req->cmd_flags & ~REQ_OP_MASK,
241
+ req->nr_phys_segments,
242
+ IOPRIO_PRIO_CLASS(req->ioprio));
268243 }
269244
270245 static void req_bio_endio(struct request *rq, struct bio *bio,
@@ -277,6 +252,17 @@
277252 bio_set_flag(bio, BIO_QUIET);
278253
279254 bio_advance(bio, nbytes);
255
+
256
+ if (req_op(rq) == REQ_OP_ZONE_APPEND && error == BLK_STS_OK) {
257
+ /*
258
+ * Partial zone append completions cannot be supported as the
259
+ * BIO fragments may end up not being written sequentially.
260
+ */
261
+ if (bio->bi_iter.bi_size)
262
+ bio->bi_status = BLK_STS_IOERR;
263
+ else
264
+ bio->bi_iter.bi_sector = rq->__sector;
265
+ }
280266
281267 /* don't actually finish bio if it's part of flush sequence */
282268 if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ))
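For context, the zone-append handling added above reports the sector the data actually landed at back through the bio. A sketch of a completion callback consuming it; the function name and debug message are illustrative only:

	static void example_zone_append_end_io(struct bio *bio)
	{
		if (bio->bi_status == BLK_STS_OK) {
			/* req_bio_endio() stored the request's start sector here,
			 * i.e. where the append was actually written */
			pr_debug("zone append wrote at sector %llu\n",
				 (unsigned long long)bio->bi_iter.bi_sector);
		}
		bio_put(bio);
	}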
@@ -297,99 +283,6 @@
297283 }
298284 EXPORT_SYMBOL(blk_dump_rq_flags);
299285
300
-static void blk_delay_work(struct work_struct *work)
301
-{
302
- struct request_queue *q;
303
-
304
- q = container_of(work, struct request_queue, delay_work.work);
305
- spin_lock_irq(q->queue_lock);
306
- __blk_run_queue(q);
307
- spin_unlock_irq(q->queue_lock);
308
-}
309
-
310
-/**
311
- * blk_delay_queue - restart queueing after defined interval
312
- * @q: The &struct request_queue in question
313
- * @msecs: Delay in msecs
314
- *
315
- * Description:
316
- * Sometimes queueing needs to be postponed for a little while, to allow
317
- * resources to come back. This function will make sure that queueing is
318
- * restarted around the specified time.
319
- */
320
-void blk_delay_queue(struct request_queue *q, unsigned long msecs)
321
-{
322
- lockdep_assert_held(q->queue_lock);
323
- WARN_ON_ONCE(q->mq_ops);
324
-
325
- if (likely(!blk_queue_dead(q)))
326
- queue_delayed_work(kblockd_workqueue, &q->delay_work,
327
- msecs_to_jiffies(msecs));
328
-}
329
-EXPORT_SYMBOL(blk_delay_queue);
330
-
331
-/**
332
- * blk_start_queue_async - asynchronously restart a previously stopped queue
333
- * @q: The &struct request_queue in question
334
- *
335
- * Description:
336
- * blk_start_queue_async() will clear the stop flag on the queue, and
337
- * ensure that the request_fn for the queue is run from an async
338
- * context.
339
- **/
340
-void blk_start_queue_async(struct request_queue *q)
341
-{
342
- lockdep_assert_held(q->queue_lock);
343
- WARN_ON_ONCE(q->mq_ops);
344
-
345
- queue_flag_clear(QUEUE_FLAG_STOPPED, q);
346
- blk_run_queue_async(q);
347
-}
348
-EXPORT_SYMBOL(blk_start_queue_async);
349
-
350
-/**
351
- * blk_start_queue - restart a previously stopped queue
352
- * @q: The &struct request_queue in question
353
- *
354
- * Description:
355
- * blk_start_queue() will clear the stop flag on the queue, and call
356
- * the request_fn for the queue if it was in a stopped state when
357
- * entered. Also see blk_stop_queue().
358
- **/
359
-void blk_start_queue(struct request_queue *q)
360
-{
361
- lockdep_assert_held(q->queue_lock);
362
- WARN_ON_ONCE(q->mq_ops);
363
-
364
- queue_flag_clear(QUEUE_FLAG_STOPPED, q);
365
- __blk_run_queue(q);
366
-}
367
-EXPORT_SYMBOL(blk_start_queue);
368
-
369
-/**
370
- * blk_stop_queue - stop a queue
371
- * @q: The &struct request_queue in question
372
- *
373
- * Description:
374
- * The Linux block layer assumes that a block driver will consume all
375
- * entries on the request queue when the request_fn strategy is called.
376
- * Often this will not happen, because of hardware limitations (queue
377
- * depth settings). If a device driver gets a 'queue full' response,
378
- * or if it simply chooses not to queue more I/O at one point, it can
379
- * call this function to prevent the request_fn from being called until
380
- * the driver has signalled it's ready to go again. This happens by calling
381
- * blk_start_queue() to restart queue operations.
382
- **/
383
-void blk_stop_queue(struct request_queue *q)
384
-{
385
- lockdep_assert_held(q->queue_lock);
386
- WARN_ON_ONCE(q->mq_ops);
387
-
388
- cancel_delayed_work(&q->delay_work);
389
- queue_flag_set(QUEUE_FLAG_STOPPED, q);
390
-}
391
-EXPORT_SYMBOL(blk_stop_queue);
392
-
393286 /**
394287 * blk_sync_queue - cancel any pending callbacks on a queue
395288 * @q: the queue
@@ -400,7 +293,7 @@
400293 * A block device may call blk_sync_queue to ensure that any
401294 * such activity is cancelled, thus allowing it to release resources
402295 * that the callbacks might use. The caller must already have made sure
403
- * that its ->make_request_fn will not re-add plugging prior to calling
296
+ * that its ->submit_bio will not re-add plugging prior to calling
404297 * this function.
405298 *
406299 * This function does not cancel any asynchronous activity arising
@@ -412,16 +305,6 @@
412305 {
413306 del_timer_sync(&q->timeout);
414307 cancel_work_sync(&q->timeout_work);
415
-
416
- if (q->mq_ops) {
417
- struct blk_mq_hw_ctx *hctx;
418
- int i;
419
-
420
- queue_for_each_hw_ctx(q, hctx, i)
421
- cancel_delayed_work_sync(&hctx->run_work);
422
- } else {
423
- cancel_delayed_work_sync(&q->delay_work);
424
- }
425308 }
426309 EXPORT_SYMBOL(blk_sync_queue);
427310
@@ -447,248 +330,20 @@
447330 EXPORT_SYMBOL_GPL(blk_clear_pm_only);
448331
449332 /**
450
- * __blk_run_queue_uncond - run a queue whether or not it has been stopped
451
- * @q: The queue to run
333
+ * blk_put_queue - decrement the request_queue refcount
334
+ * @q: the request_queue structure to decrement the refcount for
452335 *
453
- * Description:
454
- * Invoke request handling on a queue if there are any pending requests.
455
- * May be used to restart request handling after a request has completed.
456
- * This variant runs the queue whether or not the queue has been
457
- * stopped. Must be called with the queue lock held and interrupts
458
- * disabled. See also @blk_run_queue.
336
+ * Decrements the refcount of the request_queue kobject. When this reaches 0
337
+ * we'll have blk_release_queue() called.
338
+ *
339
+ * Context: Any context, but the last reference must not be dropped from
340
+ * atomic context.
459341 */
460
-inline void __blk_run_queue_uncond(struct request_queue *q)
461
-{
462
- lockdep_assert_held(q->queue_lock);
463
- WARN_ON_ONCE(q->mq_ops);
464
-
465
- if (unlikely(blk_queue_dead(q)))
466
- return;
467
-
468
- /*
469
- * Some request_fn implementations, e.g. scsi_request_fn(), unlock
470
- * the queue lock internally. As a result multiple threads may be
471
- * running such a request function concurrently. Keep track of the
472
- * number of active request_fn invocations such that blk_drain_queue()
473
- * can wait until all these request_fn calls have finished.
474
- */
475
- q->request_fn_active++;
476
- q->request_fn(q);
477
- q->request_fn_active--;
478
-}
479
-EXPORT_SYMBOL_GPL(__blk_run_queue_uncond);
480
-
481
-/**
482
- * __blk_run_queue - run a single device queue
483
- * @q: The queue to run
484
- *
485
- * Description:
486
- * See @blk_run_queue.
487
- */
488
-void __blk_run_queue(struct request_queue *q)
489
-{
490
- lockdep_assert_held(q->queue_lock);
491
- WARN_ON_ONCE(q->mq_ops);
492
-
493
- if (unlikely(blk_queue_stopped(q)))
494
- return;
495
-
496
- __blk_run_queue_uncond(q);
497
-}
498
-EXPORT_SYMBOL(__blk_run_queue);
499
-
500
-/**
501
- * blk_run_queue_async - run a single device queue in workqueue context
502
- * @q: The queue to run
503
- *
504
- * Description:
505
- * Tells kblockd to perform the equivalent of @blk_run_queue on behalf
506
- * of us.
507
- *
508
- * Note:
509
- * Since it is not allowed to run q->delay_work after blk_cleanup_queue()
510
- * has canceled q->delay_work, callers must hold the queue lock to avoid
511
- * race conditions between blk_cleanup_queue() and blk_run_queue_async().
512
- */
513
-void blk_run_queue_async(struct request_queue *q)
514
-{
515
- lockdep_assert_held(q->queue_lock);
516
- WARN_ON_ONCE(q->mq_ops);
517
-
518
- if (likely(!blk_queue_stopped(q) && !blk_queue_dead(q)))
519
- mod_delayed_work(kblockd_workqueue, &q->delay_work, 0);
520
-}
521
-EXPORT_SYMBOL(blk_run_queue_async);
522
-
523
-/**
524
- * blk_run_queue - run a single device queue
525
- * @q: The queue to run
526
- *
527
- * Description:
528
- * Invoke request handling on this queue, if it has pending work to do.
529
- * May be used to restart queueing when a request has completed.
530
- */
531
-void blk_run_queue(struct request_queue *q)
532
-{
533
- unsigned long flags;
534
-
535
- WARN_ON_ONCE(q->mq_ops);
536
-
537
- spin_lock_irqsave(q->queue_lock, flags);
538
- __blk_run_queue(q);
539
- spin_unlock_irqrestore(q->queue_lock, flags);
540
-}
541
-EXPORT_SYMBOL(blk_run_queue);
542
-
543342 void blk_put_queue(struct request_queue *q)
544343 {
545344 kobject_put(&q->kobj);
546345 }
547346 EXPORT_SYMBOL(blk_put_queue);
548
-
549
-/**
550
- * __blk_drain_queue - drain requests from request_queue
551
- * @q: queue to drain
552
- * @drain_all: whether to drain all requests or only the ones w/ ELVPRIV
553
- *
554
- * Drain requests from @q. If @drain_all is set, all requests are drained.
555
- * If not, only ELVPRIV requests are drained. The caller is responsible
556
- * for ensuring that no new requests which need to be drained are queued.
557
- */
558
-static void __blk_drain_queue(struct request_queue *q, bool drain_all)
559
- __releases(q->queue_lock)
560
- __acquires(q->queue_lock)
561
-{
562
- int i;
563
-
564
- lockdep_assert_held(q->queue_lock);
565
- WARN_ON_ONCE(q->mq_ops);
566
-
567
- while (true) {
568
- bool drain = false;
569
-
570
- /*
571
- * The caller might be trying to drain @q before its
572
- * elevator is initialized.
573
- */
574
- if (q->elevator)
575
- elv_drain_elevator(q);
576
-
577
- blkcg_drain_queue(q);
578
-
579
- /*
580
- * This function might be called on a queue which failed
581
- * driver init after queue creation or is not yet fully
582
- * active yet. Some drivers (e.g. fd and loop) get unhappy
583
- * in such cases. Kick queue iff dispatch queue has
584
- * something on it and @q has request_fn set.
585
- */
586
- if (!list_empty(&q->queue_head) && q->request_fn)
587
- __blk_run_queue(q);
588
-
589
- drain |= q->nr_rqs_elvpriv;
590
- drain |= q->request_fn_active;
591
-
592
- /*
593
- * Unfortunately, requests are queued at and tracked from
594
- * multiple places and there's no single counter which can
595
- * be drained. Check all the queues and counters.
596
- */
597
- if (drain_all) {
598
- struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL);
599
- drain |= !list_empty(&q->queue_head);
600
- for (i = 0; i < 2; i++) {
601
- drain |= q->nr_rqs[i];
602
- drain |= q->in_flight[i];
603
- if (fq)
604
- drain |= !list_empty(&fq->flush_queue[i]);
605
- }
606
- }
607
-
608
- if (!drain)
609
- break;
610
-
611
- spin_unlock_irq(q->queue_lock);
612
-
613
- msleep(10);
614
-
615
- spin_lock_irq(q->queue_lock);
616
- }
617
-
618
- /*
619
- * With queue marked dead, any woken up waiter will fail the
620
- * allocation path, so the wakeup chaining is lost and we're
621
- * left with hung waiters. We need to wake up those waiters.
622
- */
623
- if (q->request_fn) {
624
- struct request_list *rl;
625
-
626
- blk_queue_for_each_rl(rl, q)
627
- for (i = 0; i < ARRAY_SIZE(rl->wait); i++)
628
- wake_up_all(&rl->wait[i]);
629
- }
630
-}
631
-
632
-void blk_drain_queue(struct request_queue *q)
633
-{
634
- spin_lock_irq(q->queue_lock);
635
- __blk_drain_queue(q, true);
636
- spin_unlock_irq(q->queue_lock);
637
-}
638
-
639
-/**
640
- * blk_queue_bypass_start - enter queue bypass mode
641
- * @q: queue of interest
642
- *
643
- * In bypass mode, only the dispatch FIFO queue of @q is used. This
644
- * function makes @q enter bypass mode and drains all requests which were
645
- * throttled or issued before. On return, it's guaranteed that no request
646
- * is being throttled or has ELVPRIV set and blk_queue_bypass() %true
647
- * inside queue or RCU read lock.
648
- */
649
-void blk_queue_bypass_start(struct request_queue *q)
650
-{
651
- WARN_ON_ONCE(q->mq_ops);
652
-
653
- spin_lock_irq(q->queue_lock);
654
- q->bypass_depth++;
655
- queue_flag_set(QUEUE_FLAG_BYPASS, q);
656
- spin_unlock_irq(q->queue_lock);
657
-
658
- /*
659
- * Queues start drained. Skip actual draining till init is
660
- * complete. This avoids lenghty delays during queue init which
661
- * can happen many times during boot.
662
- */
663
- if (blk_queue_init_done(q)) {
664
- spin_lock_irq(q->queue_lock);
665
- __blk_drain_queue(q, false);
666
- spin_unlock_irq(q->queue_lock);
667
-
668
- /* ensure blk_queue_bypass() is %true inside RCU read lock */
669
- synchronize_rcu();
670
- }
671
-}
672
-EXPORT_SYMBOL_GPL(blk_queue_bypass_start);
673
-
674
-/**
675
- * blk_queue_bypass_end - leave queue bypass mode
676
- * @q: queue of interest
677
- *
678
- * Leave bypass mode and restore the normal queueing behavior.
679
- *
680
- * Note: although blk_queue_bypass_start() is only called for blk-sq queues,
681
- * this function is called for both blk-sq and blk-mq queues.
682
- */
683
-void blk_queue_bypass_end(struct request_queue *q)
684
-{
685
- spin_lock_irq(q->queue_lock);
686
- if (!--q->bypass_depth)
687
- queue_flag_clear(QUEUE_FLAG_BYPASS, q);
688
- WARN_ON_ONCE(q->bypass_depth < 0);
689
- spin_unlock_irq(q->queue_lock);
690
-}
691
-EXPORT_SYMBOL_GPL(blk_queue_bypass_end);
692347
693348 void blk_set_queue_dying(struct request_queue *q)
694349 {
@@ -701,54 +356,13 @@
701356 */
702357 blk_freeze_queue_start(q);
703358
704
- if (q->mq_ops)
359
+ if (queue_is_mq(q))
705360 blk_mq_wake_waiters(q);
706
- else {
707
- struct request_list *rl;
708
-
709
- spin_lock_irq(q->queue_lock);
710
- blk_queue_for_each_rl(rl, q) {
711
- if (rl->rq_pool) {
712
- wake_up_all(&rl->wait[BLK_RW_SYNC]);
713
- wake_up_all(&rl->wait[BLK_RW_ASYNC]);
714
- }
715
- }
716
- spin_unlock_irq(q->queue_lock);
717
- }
718361
719362 /* Make blk_queue_enter() reexamine the DYING flag. */
720363 wake_up_all(&q->mq_freeze_wq);
721364 }
722365 EXPORT_SYMBOL_GPL(blk_set_queue_dying);
723
-
724
-/* Unconfigure the I/O scheduler and dissociate from the cgroup controller. */
725
-void blk_exit_queue(struct request_queue *q)
726
-{
727
- /*
728
- * Since the I/O scheduler exit code may access cgroup information,
729
- * perform I/O scheduler exit before disassociating from the block
730
- * cgroup controller.
731
- */
732
- if (q->elevator) {
733
- ioc_clear_queue(q);
734
- elevator_exit(q, q->elevator);
735
- q->elevator = NULL;
736
- }
737
-
738
- /*
739
- * Remove all references to @q from the block cgroup controller before
740
- * restoring @q->queue_lock to avoid that restoring this pointer causes
741
- * e.g. blkcg_print_blkgs() to crash.
742
- */
743
- blkcg_exit_queue(q);
744
-
745
- /*
746
- * Since the cgroup code may dereference the @q->backing_dev_info
747
- * pointer, only decrease its reference count after having removed the
748
- * association with the block cgroup controller.
749
- */
750
- bdi_put(q->backing_dev_info);
751
-}
752366
753367 /**
754368 * blk_cleanup_queue - shutdown a request queue
@@ -756,57 +370,32 @@
756370 *
757371 * Mark @q DYING, drain all pending requests, mark @q DEAD, destroy and
758372 * put it. All future requests will be failed immediately with -ENODEV.
373
+ *
374
+ * Context: can sleep
759375 */
760376 void blk_cleanup_queue(struct request_queue *q)
761377 {
762
- spinlock_t *lock = q->queue_lock;
378
+ /* cannot be called from atomic context */
379
+ might_sleep();
380
+
381
+ WARN_ON_ONCE(blk_queue_registered(q));
763382
764383 /* mark @q DYING, no new request or merges will be allowed afterwards */
765
- mutex_lock(&q->sysfs_lock);
766384 blk_set_queue_dying(q);
767
- spin_lock_irq(lock);
768385
769
- /*
770
- * A dying queue is permanently in bypass mode till released. Note
771
- * that, unlike blk_queue_bypass_start(), we aren't performing
772
- * synchronize_rcu() after entering bypass mode to avoid the delay
773
- * as some drivers create and destroy a lot of queues while
774
- * probing. This is still safe because blk_release_queue() will be
775
- * called only after the queue refcnt drops to zero and nothing,
776
- * RCU or not, would be traversing the queue by then.
777
- */
778
- q->bypass_depth++;
779
- queue_flag_set(QUEUE_FLAG_BYPASS, q);
780
-
781
- queue_flag_set(QUEUE_FLAG_NOMERGES, q);
782
- queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
783
- queue_flag_set(QUEUE_FLAG_DYING, q);
784
- spin_unlock_irq(lock);
785
- mutex_unlock(&q->sysfs_lock);
386
+ blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q);
387
+ blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
786388
787389 /*
788390 * Drain all requests queued before DYING marking. Set DEAD flag to
789
- * prevent that q->request_fn() gets invoked after draining finished.
391
+ * prevent that blk_mq_run_hw_queues() accesses the hardware queues
392
+ * after draining finished.
790393 */
791394 blk_freeze_queue(q);
792395
793396 rq_qos_exit(q);
794397
795
- spin_lock_irq(lock);
796
- queue_flag_set(QUEUE_FLAG_DEAD, q);
797
- spin_unlock_irq(lock);
798
-
799
- /*
800
- * make sure all in-progress dispatch are completed because
801
- * blk_freeze_queue() can only complete all requests, and
802
- * dispatch may still be in-progress since we dispatch requests
803
- * from more than one contexts.
804
- *
805
- * We rely on driver to deal with the race in case that queue
806
- * initialization isn't done.
807
- */
808
- if (q->mq_ops && blk_queue_init_done(q))
809
- blk_mq_quiesce_queue(q);
398
+ blk_queue_flag_set(QUEUE_FLAG_DEAD, q);
810399
811400 /* for synchronous bio-based driver finish in-flight integrity i/o */
812401 blk_flush_integrity();
@@ -815,118 +404,35 @@
815404 del_timer_sync(&q->backing_dev_info->laptop_mode_wb_timer);
816405 blk_sync_queue(q);
817406
818
- /*
819
- * I/O scheduler exit is only safe after the sysfs scheduler attribute
820
- * has been removed.
821
- */
822
- WARN_ON_ONCE(q->kobj.state_in_sysfs);
823
-
824
- blk_exit_queue(q);
825
-
826
- if (q->mq_ops)
407
+ if (queue_is_mq(q))
827408 blk_mq_exit_queue(q);
828409
829
- percpu_ref_exit(&q->q_usage_counter);
830
-
831
- spin_lock_irq(lock);
832
- if (q->queue_lock != &q->__queue_lock)
833
- q->queue_lock = &q->__queue_lock;
834
- spin_unlock_irq(lock);
410
+ /*
411
+ * In theory, request pool of sched_tags belongs to request queue.
412
+ * However, the current implementation requires tag_set for freeing
413
+ * requests, so free the pool now.
414
+ *
415
+ * Queue has become frozen, there can't be any in-queue requests, so
416
+ * it is safe to free requests now.
417
+ */
418
+ mutex_lock(&q->sysfs_lock);
419
+ if (q->elevator)
420
+ blk_mq_sched_free_requests(q);
421
+ mutex_unlock(&q->sysfs_lock);
835422
836423 /* @q is and will stay empty, shutdown and put */
837424 blk_put_queue(q);
838425 }
839426 EXPORT_SYMBOL(blk_cleanup_queue);
840427
841
-/* Allocate memory local to the request queue */
842
-static void *alloc_request_simple(gfp_t gfp_mask, void *data)
843
-{
844
- struct request_queue *q = data;
845
-
846
- return kmem_cache_alloc_node(request_cachep, gfp_mask, q->node);
847
-}
848
-
849
-static void free_request_simple(void *element, void *data)
850
-{
851
- kmem_cache_free(request_cachep, element);
852
-}
853
-
854
-static void *alloc_request_size(gfp_t gfp_mask, void *data)
855
-{
856
- struct request_queue *q = data;
857
- struct request *rq;
858
-
859
- rq = kmalloc_node(sizeof(struct request) + q->cmd_size, gfp_mask,
860
- q->node);
861
- if (rq && q->init_rq_fn && q->init_rq_fn(q, rq, gfp_mask) < 0) {
862
- kfree(rq);
863
- rq = NULL;
864
- }
865
- return rq;
866
-}
867
-
868
-static void free_request_size(void *element, void *data)
869
-{
870
- struct request_queue *q = data;
871
-
872
- if (q->exit_rq_fn)
873
- q->exit_rq_fn(q, element);
874
- kfree(element);
875
-}
876
-
877
-int blk_init_rl(struct request_list *rl, struct request_queue *q,
878
- gfp_t gfp_mask)
879
-{
880
- if (unlikely(rl->rq_pool) || q->mq_ops)
881
- return 0;
882
-
883
- rl->q = q;
884
- rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0;
885
- rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0;
886
- init_waitqueue_head(&rl->wait[BLK_RW_SYNC]);
887
- init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]);
888
-
889
- if (q->cmd_size) {
890
- rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ,
891
- alloc_request_size, free_request_size,
892
- q, gfp_mask, q->node);
893
- } else {
894
- rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ,
895
- alloc_request_simple, free_request_simple,
896
- q, gfp_mask, q->node);
897
- }
898
- if (!rl->rq_pool)
899
- return -ENOMEM;
900
-
901
- if (rl != &q->root_rl)
902
- WARN_ON_ONCE(!blk_get_queue(q));
903
-
904
- return 0;
905
-}
906
-
907
-void blk_exit_rl(struct request_queue *q, struct request_list *rl)
908
-{
909
- if (rl->rq_pool) {
910
- mempool_destroy(rl->rq_pool);
911
- if (rl != &q->root_rl)
912
- blk_put_queue(q);
913
- }
914
-}
915
-
916
-struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
917
-{
918
- return blk_alloc_queue_node(gfp_mask, NUMA_NO_NODE, NULL);
919
-}
920
-EXPORT_SYMBOL(blk_alloc_queue);
921
-
922428 /**
923429 * blk_queue_enter() - try to increase q->q_usage_counter
924430 * @q: request queue pointer
925
- * @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PREEMPT
431
+ * @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PM
926432 */
927433 int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
928434 {
929
- const bool pm = flags & BLK_MQ_REQ_PREEMPT;
435
+ const bool pm = flags & BLK_MQ_REQ_PM;
930436
931437 while (true) {
932438 bool success = false;
@@ -962,12 +468,30 @@
962468 smp_rmb();
963469
964470 wait_event(q->mq_freeze_wq,
965
- (atomic_read(&q->mq_freeze_depth) == 0 &&
966
- (pm || !blk_queue_pm_only(q))) ||
471
+ (!q->mq_freeze_depth &&
472
+ (pm || (blk_pm_request_resume(q),
473
+ !blk_queue_pm_only(q)))) ||
967474 blk_queue_dying(q));
968475 if (blk_queue_dying(q))
969476 return -ENODEV;
970477 }
478
+}
479
+
480
+static inline int bio_queue_enter(struct bio *bio)
481
+{
482
+ struct request_queue *q = bio->bi_disk->queue;
483
+ bool nowait = bio->bi_opf & REQ_NOWAIT;
484
+ int ret;
485
+
486
+ ret = blk_queue_enter(q, nowait ? BLK_MQ_REQ_NOWAIT : 0);
487
+ if (unlikely(ret)) {
488
+ if (nowait && !blk_queue_dying(q))
489
+ bio_wouldblock_error(bio);
490
+ else
491
+ bio_io_error(bio);
492
+ }
493
+
494
+ return ret;
971495 }
972496
973497 void blk_queue_exit(struct request_queue *q)
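For context, bio_queue_enter() added above turns a failed queue entry into BLK_STS_AGAIN when the submitter asked for REQ_NOWAIT, instead of sleeping. A sketch of the caller side (illustrative only; setup and error handling elided):

	bio->bi_opf |= REQ_NOWAIT;
	submit_bio(bio);

	/* later, in the bio's bi_end_io callback: */
	if (bio->bi_status == BLK_STS_AGAIN) {
		/* queue was frozen or pm-only; resubmit from a context
		 * that may block, or fail the I/O back to the caller */
	}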
@@ -975,21 +499,12 @@
975499 percpu_ref_put(&q->q_usage_counter);
976500 }
977501
978
-static void blk_queue_usage_counter_release_wrk(struct work_struct *work)
979
-{
980
- struct request_queue *q =
981
- container_of(work, struct request_queue, mq_pcpu_wake);
982
-
983
- wake_up_all(&q->mq_freeze_wq);
984
-}
985
-
986502 static void blk_queue_usage_counter_release(struct percpu_ref *ref)
987503 {
988504 struct request_queue *q =
989505 container_of(ref, struct request_queue, q_usage_counter);
990506
991
- if (wq_has_sleeper(&q->mq_freeze_wq))
992
- schedule_work(&q->mq_pcpu_wake);
507
+ wake_up_all(&q->mq_freeze_wq);
993508 }
994509
995510 static void blk_rq_timed_out_timer(struct timer_list *t)
@@ -999,40 +514,23 @@
999514 kblockd_schedule_work(&q->timeout_work);
1000515 }
1001516
1002
-static void blk_timeout_work_dummy(struct work_struct *work)
517
+static void blk_timeout_work(struct work_struct *work)
1003518 {
1004519 }
1005520
1006
-/**
1007
- * blk_alloc_queue_node - allocate a request queue
1008
- * @gfp_mask: memory allocation flags
1009
- * @node_id: NUMA node to allocate memory from
1010
- * @lock: For legacy queues, pointer to a spinlock that will be used to e.g.
1011
- * serialize calls to the legacy .request_fn() callback. Ignored for
1012
- * blk-mq request queues.
1013
- *
1014
- * Note: pass the queue lock as the third argument to this function instead of
1015
- * setting the queue lock pointer explicitly to avoid triggering a sporadic
1016
- * crash in the blkcg code. This function namely calls blkcg_init_queue() and
1017
- * the queue lock pointer must be set before blkcg_init_queue() is called.
1018
- */
1019
-struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id,
1020
- spinlock_t *lock)
521
+struct request_queue *blk_alloc_queue(int node_id)
1021522 {
1022523 struct request_queue *q;
1023524 int ret;
1024525
1025526 q = kmem_cache_alloc_node(blk_requestq_cachep,
1026
- gfp_mask | __GFP_ZERO, node_id);
527
+ GFP_KERNEL | __GFP_ZERO, node_id);
1027528 if (!q)
1028529 return NULL;
1029530
1030
- INIT_LIST_HEAD(&q->queue_head);
1031531 q->last_merge = NULL;
1032
- q->end_sector = 0;
1033
- q->boundary_rq = NULL;
1034532
1035
- q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask);
533
+ q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL);
1036534 if (q->id < 0)
1037535 goto fail_q;
1038536
@@ -1040,7 +538,7 @@
1040538 if (ret)
1041539 goto fail_id;
1042540
1043
- q->backing_dev_info = bdi_alloc_node(gfp_mask, node_id);
541
+ q->backing_dev_info = bdi_alloc(node_id);
1044542 if (!q->backing_dev_info)
1045543 goto fail_split;
1046544
@@ -1048,47 +546,28 @@
1048546 if (!q->stats)
1049547 goto fail_stats;
1050548
1051
- q->backing_dev_info->ra_pages =
1052
- (VM_MAX_READAHEAD * 1024) / PAGE_SIZE;
1053
- q->backing_dev_info->io_pages =
1054
- (VM_MAX_READAHEAD * 1024) / PAGE_SIZE;
1055
- q->backing_dev_info->capabilities = BDI_CAP_CGROUP_WRITEBACK;
1056
- q->backing_dev_info->name = "block";
1057549 q->node = node_id;
550
+
551
+ atomic_set(&q->nr_active_requests_shared_sbitmap, 0);
1058552
1059553 timer_setup(&q->backing_dev_info->laptop_mode_wb_timer,
1060554 laptop_mode_timer_fn, 0);
1061555 timer_setup(&q->timeout, blk_rq_timed_out_timer, 0);
1062
- INIT_WORK(&q->timeout_work, blk_timeout_work_dummy);
1063
- INIT_LIST_HEAD(&q->timeout_list);
556
+ INIT_WORK(&q->timeout_work, blk_timeout_work);
1064557 INIT_LIST_HEAD(&q->icq_list);
1065558 #ifdef CONFIG_BLK_CGROUP
1066559 INIT_LIST_HEAD(&q->blkg_list);
1067560 #endif
1068
- INIT_DELAYED_WORK(&q->delay_work, blk_delay_work);
1069561
1070562 kobject_init(&q->kobj, &blk_queue_ktype);
1071563
1072
-#ifdef CONFIG_BLK_DEV_IO_TRACE
1073
- mutex_init(&q->blk_trace_mutex);
1074
-#endif
564
+ mutex_init(&q->debugfs_mutex);
1075565 mutex_init(&q->sysfs_lock);
1076
- spin_lock_init(&q->__queue_lock);
1077
-
1078
- if (!q->mq_ops)
1079
- q->queue_lock = lock ? : &q->__queue_lock;
1080
-
1081
- /*
1082
- * A queue starts its life with bypass turned on to avoid
1083
- * unnecessary bypass on/off overhead and nasty surprises during
1084
- * init. The initial bypass will be finished when the queue is
1085
- * registered by blk_register_queue().
1086
- */
1087
- q->bypass_depth = 1;
1088
- queue_flag_set_unlocked(QUEUE_FLAG_BYPASS, q);
566
+ mutex_init(&q->sysfs_dir_lock);
567
+ spin_lock_init(&q->queue_lock);
1089568
1090569 init_waitqueue_head(&q->mq_freeze_wq);
1091
- INIT_WORK(&q->mq_pcpu_wake, blk_queue_usage_counter_release_wrk);
570
+ mutex_init(&q->mq_freeze_lock);
1092571
1093572 /*
1094573 * Init percpu_ref in atomic mode so that it's faster to shutdown.
@@ -1101,6 +580,10 @@
1101580
1102581 if (blkcg_init_queue(q))
1103582 goto fail_ref;
583
+
584
+ blk_queue_dma_alignment(q, 511);
585
+ blk_set_default_limits(&q->limits);
586
+ q->nr_requests = BLKDEV_MAX_RQ;
1104587
1105588 return q;
1106589
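For context, with the changes above a bio-based driver allocates its queue with only a NUMA node and then adjusts the defaults coming from blk_set_default_limits(). A minimal sketch; the limit values are illustrative, not prescriptive:

	struct request_queue *q;

	q = blk_alloc_queue(NUMA_NO_NODE);
	if (!q)
		return -ENOMEM;

	/* override only what the hardware requires */
	blk_queue_logical_block_size(q, 512);
	blk_queue_max_hw_sectors(q, 256);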
@@ -1118,107 +601,16 @@
1118601 kmem_cache_free(blk_requestq_cachep, q);
1119602 return NULL;
1120603 }
1121
-EXPORT_SYMBOL(blk_alloc_queue_node);
604
+EXPORT_SYMBOL(blk_alloc_queue);
1122605
1123606 /**
1124
- * blk_init_queue - prepare a request queue for use with a block device
1125
- * @rfn: The function to be called to process requests that have been
1126
- * placed on the queue.
1127
- * @lock: Request queue spin lock
607
+ * blk_get_queue - increment the request_queue refcount
608
+ * @q: the request_queue structure to increment the refcount for
1128609 *
1129
- * Description:
1130
- * If a block device wishes to use the standard request handling procedures,
1131
- * which sorts requests and coalesces adjacent requests, then it must
1132
- * call blk_init_queue(). The function @rfn will be called when there
1133
- * are requests on the queue that need to be processed. If the device
1134
- * supports plugging, then @rfn may not be called immediately when requests
1135
- * are available on the queue, but may be called at some time later instead.
1136
- * Plugged queues are generally unplugged when a buffer belonging to one
1137
- * of the requests on the queue is needed, or due to memory pressure.
610
+ * Increment the refcount of the request_queue kobject.
1138611 *
1139
- * @rfn is not required, or even expected, to remove all requests off the
1140
- * queue, but only as many as it can handle at a time. If it does leave
1141
- * requests on the queue, it is responsible for arranging that the requests
1142
- * get dealt with eventually.
1143
- *
1144
- * The queue spin lock must be held while manipulating the requests on the
1145
- * request queue; this lock will be taken also from interrupt context, so irq
1146
- * disabling is needed for it.
1147
- *
1148
- * Function returns a pointer to the initialized request queue, or %NULL if
1149
- * it didn't succeed.
1150
- *
1151
- * Note:
1152
- * blk_init_queue() must be paired with a blk_cleanup_queue() call
1153
- * when the block device is deactivated (such as at module unload).
1154
- **/
1155
-
1156
-struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock)
1157
-{
1158
- return blk_init_queue_node(rfn, lock, NUMA_NO_NODE);
1159
-}
1160
-EXPORT_SYMBOL(blk_init_queue);
1161
-
1162
-struct request_queue *
1163
-blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
1164
-{
1165
- struct request_queue *q;
1166
-
1167
- q = blk_alloc_queue_node(GFP_KERNEL, node_id, lock);
1168
- if (!q)
1169
- return NULL;
1170
-
1171
- q->request_fn = rfn;
1172
- if (blk_init_allocated_queue(q) < 0) {
1173
- blk_cleanup_queue(q);
1174
- return NULL;
1175
- }
1176
-
1177
- return q;
1178
-}
1179
-EXPORT_SYMBOL(blk_init_queue_node);
1180
-
1181
-static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio);
1182
-
1183
-
1184
-int blk_init_allocated_queue(struct request_queue *q)
1185
-{
1186
- WARN_ON_ONCE(q->mq_ops);
1187
-
1188
- q->fq = blk_alloc_flush_queue(q, NUMA_NO_NODE, q->cmd_size, GFP_KERNEL);
1189
- if (!q->fq)
1190
- return -ENOMEM;
1191
-
1192
- if (q->init_rq_fn && q->init_rq_fn(q, q->fq->flush_rq, GFP_KERNEL))
1193
- goto out_free_flush_queue;
1194
-
1195
- if (blk_init_rl(&q->root_rl, q, GFP_KERNEL))
1196
- goto out_exit_flush_rq;
1197
-
1198
- INIT_WORK(&q->timeout_work, blk_timeout_work);
1199
- q->queue_flags |= QUEUE_FLAG_DEFAULT;
1200
-
1201
- /*
1202
- * This also sets hw/phys segments, boundary and size
1203
- */
1204
- blk_queue_make_request(q, blk_queue_bio);
1205
-
1206
- q->sg_reserved_size = INT_MAX;
1207
-
1208
- if (elevator_init(q))
1209
- goto out_exit_flush_rq;
1210
- return 0;
1211
-
1212
-out_exit_flush_rq:
1213
- if (q->exit_rq_fn)
1214
- q->exit_rq_fn(q, q->fq->flush_rq);
1215
-out_free_flush_queue:
1216
- blk_free_flush_queue(q->fq);
1217
- q->fq = NULL;
1218
- return -ENOMEM;
1219
-}
1220
-EXPORT_SYMBOL(blk_init_allocated_queue);
1221
-
612
+ * Context: Any context.
613
+ */
1222614 bool blk_get_queue(struct request_queue *q)
1223615 {
1224616 if (likely(!blk_queue_dying(q))) {
@@ -1229,406 +621,6 @@
1229621 return false;
1230622 }
1231623 EXPORT_SYMBOL(blk_get_queue);
1232
-
1233
-static inline void blk_free_request(struct request_list *rl, struct request *rq)
1234
-{
1235
- if (rq->rq_flags & RQF_ELVPRIV) {
1236
- elv_put_request(rl->q, rq);
1237
- if (rq->elv.icq)
1238
- put_io_context(rq->elv.icq->ioc);
1239
- }
1240
-
1241
- mempool_free(rq, rl->rq_pool);
1242
-}
1243
-
1244
-/*
1245
- * ioc_batching returns true if the ioc is a valid batching request and
1246
- * should be given priority access to a request.
1247
- */
1248
-static inline int ioc_batching(struct request_queue *q, struct io_context *ioc)
1249
-{
1250
- if (!ioc)
1251
- return 0;
1252
-
1253
- /*
1254
- * Make sure the process is able to allocate at least 1 request
1255
- * even if the batch times out, otherwise we could theoretically
1256
- * lose wakeups.
1257
- */
1258
- return ioc->nr_batch_requests == q->nr_batching ||
1259
- (ioc->nr_batch_requests > 0
1260
- && time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME));
1261
-}
1262
-
1263
-/*
1264
- * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This
1265
- * will cause the process to be a "batcher" on all queues in the system. This
1266
- * is the behaviour we want though - once it gets a wakeup it should be given
1267
- * a nice run.
1268
- */
1269
-static void ioc_set_batching(struct request_queue *q, struct io_context *ioc)
1270
-{
1271
- if (!ioc || ioc_batching(q, ioc))
1272
- return;
1273
-
1274
- ioc->nr_batch_requests = q->nr_batching;
1275
- ioc->last_waited = jiffies;
1276
-}
1277
-
1278
-static void __freed_request(struct request_list *rl, int sync)
1279
-{
1280
- struct request_queue *q = rl->q;
1281
-
1282
- if (rl->count[sync] < queue_congestion_off_threshold(q))
1283
- blk_clear_congested(rl, sync);
1284
-
1285
- if (rl->count[sync] + 1 <= q->nr_requests) {
1286
- if (waitqueue_active(&rl->wait[sync]))
1287
- wake_up(&rl->wait[sync]);
1288
-
1289
- blk_clear_rl_full(rl, sync);
1290
- }
1291
-}
1292
-
1293
-/*
1294
- * A request has just been released. Account for it, update the full and
1295
- * congestion status, wake up any waiters. Called under q->queue_lock.
1296
- */
1297
-static void freed_request(struct request_list *rl, bool sync,
1298
- req_flags_t rq_flags)
1299
-{
1300
- struct request_queue *q = rl->q;
1301
-
1302
- q->nr_rqs[sync]--;
1303
- rl->count[sync]--;
1304
- if (rq_flags & RQF_ELVPRIV)
1305
- q->nr_rqs_elvpriv--;
1306
-
1307
- __freed_request(rl, sync);
1308
-
1309
- if (unlikely(rl->starved[sync ^ 1]))
1310
- __freed_request(rl, sync ^ 1);
1311
-}
1312
-
1313
-int blk_update_nr_requests(struct request_queue *q, unsigned int nr)
1314
-{
1315
- struct request_list *rl;
1316
- int on_thresh, off_thresh;
1317
-
1318
- WARN_ON_ONCE(q->mq_ops);
1319
-
1320
- spin_lock_irq(q->queue_lock);
1321
- q->nr_requests = nr;
1322
- blk_queue_congestion_threshold(q);
1323
- on_thresh = queue_congestion_on_threshold(q);
1324
- off_thresh = queue_congestion_off_threshold(q);
1325
-
1326
- blk_queue_for_each_rl(rl, q) {
1327
- if (rl->count[BLK_RW_SYNC] >= on_thresh)
1328
- blk_set_congested(rl, BLK_RW_SYNC);
1329
- else if (rl->count[BLK_RW_SYNC] < off_thresh)
1330
- blk_clear_congested(rl, BLK_RW_SYNC);
1331
-
1332
- if (rl->count[BLK_RW_ASYNC] >= on_thresh)
1333
- blk_set_congested(rl, BLK_RW_ASYNC);
1334
- else if (rl->count[BLK_RW_ASYNC] < off_thresh)
1335
- blk_clear_congested(rl, BLK_RW_ASYNC);
1336
-
1337
- if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
1338
- blk_set_rl_full(rl, BLK_RW_SYNC);
1339
- } else {
1340
- blk_clear_rl_full(rl, BLK_RW_SYNC);
1341
- wake_up(&rl->wait[BLK_RW_SYNC]);
1342
- }
1343
-
1344
- if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) {
1345
- blk_set_rl_full(rl, BLK_RW_ASYNC);
1346
- } else {
1347
- blk_clear_rl_full(rl, BLK_RW_ASYNC);
1348
- wake_up(&rl->wait[BLK_RW_ASYNC]);
1349
- }
1350
- }
1351
-
1352
- spin_unlock_irq(q->queue_lock);
1353
- return 0;
1354
-}
1355
-
1356
-/**
1357
- * __get_request - get a free request
1358
- * @rl: request list to allocate from
1359
- * @op: operation and flags
1360
- * @bio: bio to allocate request for (can be %NULL)
1361
- * @flags: BLQ_MQ_REQ_* flags
1362
- * @gfp_mask: allocator flags
1363
- *
1364
- * Get a free request from @q. This function may fail under memory
1365
- * pressure or if @q is dead.
1366
- *
1367
- * Must be called with @q->queue_lock held and,
1368
- * Returns ERR_PTR on failure, with @q->queue_lock held.
1369
- * Returns request pointer on success, with @q->queue_lock *not held*.
1370
- */
1371
-static struct request *__get_request(struct request_list *rl, unsigned int op,
1372
- struct bio *bio, blk_mq_req_flags_t flags, gfp_t gfp_mask)
1373
-{
1374
- struct request_queue *q = rl->q;
1375
- struct request *rq;
1376
- struct elevator_type *et = q->elevator->type;
1377
- struct io_context *ioc = rq_ioc(bio);
1378
- struct io_cq *icq = NULL;
1379
- const bool is_sync = op_is_sync(op);
1380
- int may_queue;
1381
- req_flags_t rq_flags = RQF_ALLOCED;
1382
-
1383
- lockdep_assert_held(q->queue_lock);
1384
-
1385
- if (unlikely(blk_queue_dying(q)))
1386
- return ERR_PTR(-ENODEV);
1387
-
1388
- may_queue = elv_may_queue(q, op);
1389
- if (may_queue == ELV_MQUEUE_NO)
1390
- goto rq_starved;
1391
-
1392
- if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) {
1393
- if (rl->count[is_sync]+1 >= q->nr_requests) {
1394
- /*
1395
- * The queue will fill after this allocation, so set
1396
- * it as full, and mark this process as "batching".
1397
- * This process will be allowed to complete a batch of
1398
- * requests, others will be blocked.
1399
- */
1400
- if (!blk_rl_full(rl, is_sync)) {
1401
- ioc_set_batching(q, ioc);
1402
- blk_set_rl_full(rl, is_sync);
1403
- } else {
1404
- if (may_queue != ELV_MQUEUE_MUST
1405
- && !ioc_batching(q, ioc)) {
1406
- /*
1407
- * The queue is full and the allocating
1408
- * process is not a "batcher", and not
1409
- * exempted by the IO scheduler
1410
- */
1411
- return ERR_PTR(-ENOMEM);
1412
- }
1413
- }
1414
- }
1415
- blk_set_congested(rl, is_sync);
1416
- }
1417
-
1418
- /*
1419
- * Only allow batching queuers to allocate up to 50% over the defined
1420
- * limit of requests, otherwise we could have thousands of requests
1421
- * allocated with any setting of ->nr_requests
1422
- */
1423
- if (rl->count[is_sync] >= (3 * q->nr_requests / 2))
1424
- return ERR_PTR(-ENOMEM);
1425
-
1426
- q->nr_rqs[is_sync]++;
1427
- rl->count[is_sync]++;
1428
- rl->starved[is_sync] = 0;
1429
-
1430
- /*
1431
- * Decide whether the new request will be managed by elevator. If
1432
- * so, mark @rq_flags and increment elvpriv. Non-zero elvpriv will
1433
- * prevent the current elevator from being destroyed until the new
1434
- * request is freed. This guarantees icq's won't be destroyed and
1435
- * makes creating new ones safe.
1436
- *
1437
- * Flush requests do not use the elevator so skip initialization.
1438
- * This allows a request to share the flush and elevator data.
1439
- *
1440
- * Also, lookup icq while holding queue_lock. If it doesn't exist,
1441
- * it will be created after releasing queue_lock.
1442
- */
1443
- if (!op_is_flush(op) && !blk_queue_bypass(q)) {
1444
- rq_flags |= RQF_ELVPRIV;
1445
- q->nr_rqs_elvpriv++;
1446
- if (et->icq_cache && ioc)
1447
- icq = ioc_lookup_icq(ioc, q);
1448
- }
1449
-
1450
- if (blk_queue_io_stat(q))
1451
- rq_flags |= RQF_IO_STAT;
1452
- spin_unlock_irq(q->queue_lock);
1453
-
1454
- /* allocate and init request */
1455
- rq = mempool_alloc(rl->rq_pool, gfp_mask);
1456
- if (!rq)
1457
- goto fail_alloc;
1458
-
1459
- blk_rq_init(q, rq);
1460
- blk_rq_set_rl(rq, rl);
1461
- rq->cmd_flags = op;
1462
- rq->rq_flags = rq_flags;
1463
- if (flags & BLK_MQ_REQ_PREEMPT)
1464
- rq->rq_flags |= RQF_PREEMPT;
1465
-
1466
- /* init elvpriv */
1467
- if (rq_flags & RQF_ELVPRIV) {
1468
- if (unlikely(et->icq_cache && !icq)) {
1469
- if (ioc)
1470
- icq = ioc_create_icq(ioc, q, gfp_mask);
1471
- if (!icq)
1472
- goto fail_elvpriv;
1473
- }
1474
-
1475
- rq->elv.icq = icq;
1476
- if (unlikely(elv_set_request(q, rq, bio, gfp_mask)))
1477
- goto fail_elvpriv;
1478
-
1479
- /* @rq->elv.icq holds io_context until @rq is freed */
1480
- if (icq)
1481
- get_io_context(icq->ioc);
1482
- }
1483
-out:
1484
- /*
1485
- * ioc may be NULL here, and ioc_batching will be false. That's
1486
- * OK, if the queue is under the request limit then requests need
1487
- * not count toward the nr_batch_requests limit. There will always
1488
- * be some limit enforced by BLK_BATCH_TIME.
1489
- */
1490
- if (ioc_batching(q, ioc))
1491
- ioc->nr_batch_requests--;
1492
-
1493
- trace_block_getrq(q, bio, op);
1494
- return rq;
1495
-
1496
-fail_elvpriv:
1497
- /*
1498
- * elvpriv init failed. ioc, icq and elvpriv aren't mempool backed
1499
- * and may fail indefinitely under memory pressure and thus
1500
- * shouldn't stall IO. Treat this request as !elvpriv. This will
1501
- * disturb iosched and blkcg but weird is bettern than dead.
1502
- */
1503
- printk_ratelimited(KERN_WARNING "%s: dev %s: request aux data allocation failed, iosched may be disturbed\n",
1504
- __func__, dev_name(q->backing_dev_info->dev));
1505
-
1506
- rq->rq_flags &= ~RQF_ELVPRIV;
1507
- rq->elv.icq = NULL;
1508
-
1509
- spin_lock_irq(q->queue_lock);
1510
- q->nr_rqs_elvpriv--;
1511
- spin_unlock_irq(q->queue_lock);
1512
- goto out;
1513
-
1514
-fail_alloc:
1515
- /*
1516
- * Allocation failed presumably due to memory. Undo anything we
1517
- * might have messed up.
1518
- *
1519
- * Allocating task should really be put onto the front of the wait
1520
- * queue, but this is pretty rare.
1521
- */
1522
- spin_lock_irq(q->queue_lock);
1523
- freed_request(rl, is_sync, rq_flags);
1524
-
1525
- /*
1526
- * in the very unlikely event that allocation failed and no
1527
- * requests for this direction was pending, mark us starved so that
1528
- * freeing of a request in the other direction will notice
1529
- * us. another possible fix would be to split the rq mempool into
1530
- * READ and WRITE
1531
- */
1532
-rq_starved:
1533
- if (unlikely(rl->count[is_sync] == 0))
1534
- rl->starved[is_sync] = 1;
1535
- return ERR_PTR(-ENOMEM);
1536
-}
1537
-
1538
-/**
1539
- * get_request - get a free request
1540
- * @q: request_queue to allocate request from
1541
- * @op: operation and flags
1542
- * @bio: bio to allocate request for (can be %NULL)
1543
- * @flags: BLK_MQ_REQ_* flags.
1544
- * @gfp: allocator flags
1545
- *
1546
- * Get a free request from @q. If %BLK_MQ_REQ_NOWAIT is set in @flags,
1547
- * this function keeps retrying under memory pressure and fails iff @q is dead.
1548
- *
1549
- * Must be called with @q->queue_lock held and,
1550
- * Returns ERR_PTR on failure, with @q->queue_lock held.
1551
- * Returns request pointer on success, with @q->queue_lock *not held*.
1552
- */
1553
-static struct request *get_request(struct request_queue *q, unsigned int op,
1554
- struct bio *bio, blk_mq_req_flags_t flags, gfp_t gfp)
1555
-{
1556
- const bool is_sync = op_is_sync(op);
1557
- DEFINE_WAIT(wait);
1558
- struct request_list *rl;
1559
- struct request *rq;
1560
-
1561
- lockdep_assert_held(q->queue_lock);
1562
- WARN_ON_ONCE(q->mq_ops);
1563
-
1564
- rl = blk_get_rl(q, bio); /* transferred to @rq on success */
1565
-retry:
1566
- rq = __get_request(rl, op, bio, flags, gfp);
1567
- if (!IS_ERR(rq))
1568
- return rq;
1569
-
1570
- if (op & REQ_NOWAIT) {
1571
- blk_put_rl(rl);
1572
- return ERR_PTR(-EAGAIN);
1573
- }
1574
-
1575
- if ((flags & BLK_MQ_REQ_NOWAIT) || unlikely(blk_queue_dying(q))) {
1576
- blk_put_rl(rl);
1577
- return rq;
1578
- }
1579
-
1580
- /* wait on @rl and retry */
1581
- prepare_to_wait_exclusive(&rl->wait[is_sync], &wait,
1582
- TASK_UNINTERRUPTIBLE);
1583
-
1584
- trace_block_sleeprq(q, bio, op);
1585
-
1586
- spin_unlock_irq(q->queue_lock);
1587
- io_schedule();
1588
-
1589
- /*
1590
- * After sleeping, we become a "batching" process and will be able
1591
- * to allocate at least one request, and up to a big batch of them
1592
- * for a small period time. See ioc_batching, ioc_set_batching
1593
- */
1594
- ioc_set_batching(q, current->io_context);
1595
-
1596
- spin_lock_irq(q->queue_lock);
1597
- finish_wait(&rl->wait[is_sync], &wait);
1598
-
1599
- goto retry;
1600
-}
1601
-
1602
-/* flags: BLK_MQ_REQ_PREEMPT and/or BLK_MQ_REQ_NOWAIT. */
1603
-static struct request *blk_old_get_request(struct request_queue *q,
1604
- unsigned int op, blk_mq_req_flags_t flags)
1605
-{
1606
- struct request *rq;
1607
- gfp_t gfp_mask = flags & BLK_MQ_REQ_NOWAIT ? GFP_ATOMIC : GFP_NOIO;
1608
- int ret = 0;
1609
-
1610
- WARN_ON_ONCE(q->mq_ops);
1611
-
1612
- /* create ioc upfront */
1613
- create_io_context(gfp_mask, q->node);
1614
-
1615
- ret = blk_queue_enter(q, flags);
1616
- if (ret)
1617
- return ERR_PTR(ret);
1618
- spin_lock_irq(q->queue_lock);
1619
- rq = get_request(q, op, NULL, flags, gfp_mask);
1620
- if (IS_ERR(rq)) {
1621
- spin_unlock_irq(q->queue_lock);
1622
- blk_queue_exit(q);
1623
- return rq;
1624
- }
1625
-
1626
- /* q->queue_lock is unlocked at this point */
1627
- rq->__data_len = 0;
1628
- rq->__sector = (sector_t) -1;
1629
- rq->bio = rq->biotail = NULL;
1630
- return rq;
1631
-}
1632624
1633625 /**
1634626 * blk_get_request - allocate a request
@@ -1642,511 +634,30 @@
1642634 struct request *req;
1643635
1644636 WARN_ON_ONCE(op & REQ_NOWAIT);
1645
- WARN_ON_ONCE(flags & ~(BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PREEMPT));
637
+ WARN_ON_ONCE(flags & ~(BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PM));
1646638
1647
- if (q->mq_ops) {
1648
- req = blk_mq_alloc_request(q, op, flags);
1649
- if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn)
1650
- q->mq_ops->initialize_rq_fn(req);
1651
- } else {
1652
- req = blk_old_get_request(q, op, flags);
1653
- if (!IS_ERR(req) && q->initialize_rq_fn)
1654
- q->initialize_rq_fn(req);
1655
- }
639
+ req = blk_mq_alloc_request(q, op, flags);
640
+ if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn)
641
+ q->mq_ops->initialize_rq_fn(req);
1656642
1657643 return req;
1658644 }
1659645 EXPORT_SYMBOL(blk_get_request);
1660646
1661
-/**
1662
- * blk_requeue_request - put a request back on queue
1663
- * @q: request queue where request should be inserted
1664
- * @rq: request to be inserted
1665
- *
1666
- * Description:
1667
- * Drivers often keep queueing requests until the hardware cannot accept
1668
- * more, when that condition happens we need to put the request back
1669
- * on the queue. Must be called with queue lock held.
1670
- */
1671
-void blk_requeue_request(struct request_queue *q, struct request *rq)
1672
-{
1673
- lockdep_assert_held(q->queue_lock);
1674
- WARN_ON_ONCE(q->mq_ops);
1675
-
1676
- blk_delete_timer(rq);
1677
- blk_clear_rq_complete(rq);
1678
- trace_block_rq_requeue(q, rq);
1679
- rq_qos_requeue(q, rq);
1680
-
1681
- if (rq->rq_flags & RQF_QUEUED)
1682
- blk_queue_end_tag(q, rq);
1683
-
1684
- BUG_ON(blk_queued_rq(rq));
1685
-
1686
- elv_requeue_request(q, rq);
1687
-}
1688
-EXPORT_SYMBOL(blk_requeue_request);
1689
-
1690
-static void add_acct_request(struct request_queue *q, struct request *rq,
1691
- int where)
1692
-{
1693
- blk_account_io_start(rq, true);
1694
- __elv_add_request(q, rq, where);
1695
-}
1696
-
1697
-static void part_round_stats_single(struct request_queue *q, int cpu,
1698
- struct hd_struct *part, unsigned long now,
1699
- unsigned int inflight)
1700
-{
1701
- if (inflight) {
1702
- __part_stat_add(cpu, part, time_in_queue,
1703
- inflight * (now - part->stamp));
1704
- __part_stat_add(cpu, part, io_ticks, (now - part->stamp));
1705
- }
1706
- part->stamp = now;
1707
-}
1708
-
1709
-/**
1710
- * part_round_stats() - Round off the performance stats on a struct disk_stats.
1711
- * @q: target block queue
1712
- * @cpu: cpu number for stats access
1713
- * @part: target partition
1714
- *
1715
- * The average IO queue length and utilisation statistics are maintained
1716
- * by observing the current state of the queue length and the amount of
1717
- * time it has been in this state for.
1718
- *
1719
- * Normally, that accounting is done on IO completion, but that can result
1720
- * in more than a second's worth of IO being accounted for within any one
1721
- * second, leading to >100% utilisation. To deal with that, we call this
1722
- * function to do a round-off before returning the results when reading
1723
- * /proc/diskstats. This accounts immediately for all queue usage up to
1724
- * the current jiffies and restarts the counters again.
1725
- */
1726
-void part_round_stats(struct request_queue *q, int cpu, struct hd_struct *part)
1727
-{
1728
- struct hd_struct *part2 = NULL;
1729
- unsigned long now = jiffies;
1730
- unsigned int inflight[2];
1731
- int stats = 0;
1732
-
1733
- if (part->stamp != now)
1734
- stats |= 1;
1735
-
1736
- if (part->partno) {
1737
- part2 = &part_to_disk(part)->part0;
1738
- if (part2->stamp != now)
1739
- stats |= 2;
1740
- }
1741
-
1742
- if (!stats)
1743
- return;
1744
-
1745
- part_in_flight(q, part, inflight);
1746
-
1747
- if (stats & 2)
1748
- part_round_stats_single(q, cpu, part2, now, inflight[1]);
1749
- if (stats & 1)
1750
- part_round_stats_single(q, cpu, part, now, inflight[0]);
1751
-}
1752
-EXPORT_SYMBOL_GPL(part_round_stats);
1753
-
1754
-#ifdef CONFIG_PM
1755
-static void blk_pm_put_request(struct request *rq)
1756
-{
1757
- if (rq->q->dev && !(rq->rq_flags & RQF_PM) && !--rq->q->nr_pending)
1758
- pm_runtime_mark_last_busy(rq->q->dev);
1759
-}
1760
-#else
1761
-static inline void blk_pm_put_request(struct request *rq) {}
1762
-#endif
1763
-
1764
-void __blk_put_request(struct request_queue *q, struct request *req)
1765
-{
1766
- req_flags_t rq_flags = req->rq_flags;
1767
-
1768
- if (unlikely(!q))
1769
- return;
1770
-
1771
- if (q->mq_ops) {
1772
- blk_mq_free_request(req);
1773
- return;
1774
- }
1775
-
1776
- lockdep_assert_held(q->queue_lock);
1777
-
1778
- blk_req_zone_write_unlock(req);
1779
- blk_pm_put_request(req);
1780
-
1781
- elv_completed_request(q, req);
1782
-
1783
- /* this is a bio leak */
1784
- WARN_ON(req->bio != NULL);
1785
-
1786
- rq_qos_done(q, req);
1787
-
1788
- /*
1789
- * Request may not have originated from ll_rw_blk. if not,
1790
- * it didn't come out of our reserved rq pools
1791
- */
1792
- if (rq_flags & RQF_ALLOCED) {
1793
- struct request_list *rl = blk_rq_rl(req);
1794
- bool sync = op_is_sync(req->cmd_flags);
1795
-
1796
- BUG_ON(!list_empty(&req->queuelist));
1797
- BUG_ON(ELV_ON_HASH(req));
1798
-
1799
- blk_free_request(rl, req);
1800
- freed_request(rl, sync, rq_flags);
1801
- blk_put_rl(rl);
1802
- blk_queue_exit(q);
1803
- }
1804
-}
1805
-EXPORT_SYMBOL_GPL(__blk_put_request);
1806
-
1807647 void blk_put_request(struct request *req)
1808648 {
1809
- struct request_queue *q = req->q;
1810
-
1811
- if (q->mq_ops)
1812
- blk_mq_free_request(req);
1813
- else {
1814
- unsigned long flags;
1815
-
1816
- spin_lock_irqsave(q->queue_lock, flags);
1817
- __blk_put_request(q, req);
1818
- spin_unlock_irqrestore(q->queue_lock, flags);
1819
- }
649
+ blk_mq_free_request(req);
1820650 }
1821651 EXPORT_SYMBOL(blk_put_request);
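/*
 * Editor's illustrative sketch (not part of this change): how a driver
 * might pair the simplified blk_get_request() above with blk_put_request().
 * The function name "example_alloc_pt_rq" and the queue "q" are hypothetical,
 * and the passthrough command setup/execution is elided.
 */
static int example_alloc_pt_rq(struct request_queue *q)
{
	struct request *rq;

	rq = blk_get_request(q, REQ_OP_DRV_IN, 0);
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	/*
	 * A real caller would fill in its passthrough command here and
	 * typically execute it (e.g. via blk_execute_rq()) before freeing.
	 */
	blk_put_request(rq);
	return 0;
}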
1822
-
1823
-bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
1824
- struct bio *bio)
1825
-{
1826
- const int ff = bio->bi_opf & REQ_FAILFAST_MASK;
1827
-
1828
- if (!ll_back_merge_fn(q, req, bio))
1829
- return false;
1830
-
1831
- trace_block_bio_backmerge(q, req, bio);
1832
-
1833
- if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
1834
- blk_rq_set_mixed_merge(req);
1835
-
1836
- req->biotail->bi_next = bio;
1837
- req->biotail = bio;
1838
- req->__data_len += bio->bi_iter.bi_size;
1839
- req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
1840
-
1841
- blk_account_io_start(req, false);
1842
- return true;
1843
-}
1844
-
1845
-bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
1846
- struct bio *bio)
1847
-{
1848
- const int ff = bio->bi_opf & REQ_FAILFAST_MASK;
1849
-
1850
- if (!ll_front_merge_fn(q, req, bio))
1851
- return false;
1852
-
1853
- trace_block_bio_frontmerge(q, req, bio);
1854
-
1855
- if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
1856
- blk_rq_set_mixed_merge(req);
1857
-
1858
- bio->bi_next = req->bio;
1859
- req->bio = bio;
1860
-
1861
- req->__sector = bio->bi_iter.bi_sector;
1862
- req->__data_len += bio->bi_iter.bi_size;
1863
- req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
1864
-
1865
- blk_account_io_start(req, false);
1866
- return true;
1867
-}
1868
-
1869
-bool bio_attempt_discard_merge(struct request_queue *q, struct request *req,
1870
- struct bio *bio)
1871
-{
1872
- unsigned short segments = blk_rq_nr_discard_segments(req);
1873
-
1874
- if (segments >= queue_max_discard_segments(q))
1875
- goto no_merge;
1876
- if (blk_rq_sectors(req) + bio_sectors(bio) >
1877
- blk_rq_get_max_sectors(req, blk_rq_pos(req)))
1878
- goto no_merge;
1879
-
1880
- req->biotail->bi_next = bio;
1881
- req->biotail = bio;
1882
- req->__data_len += bio->bi_iter.bi_size;
1883
- req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
1884
- req->nr_phys_segments = segments + 1;
1885
-
1886
- blk_account_io_start(req, false);
1887
- return true;
1888
-no_merge:
1889
- req_set_nomerge(q, req);
1890
- return false;
1891
-}
1892
-
1893
-/**
1894
- * blk_attempt_plug_merge - try to merge with %current's plugged list
1895
- * @q: request_queue new bio is being queued at
1896
- * @bio: new bio being queued
1897
- * @request_count: out parameter for number of traversed plugged requests
1898
- * @same_queue_rq: pointer to &struct request that gets filled in when
1899
- * another request associated with @q is found on the plug list
1900
- * (optional, may be %NULL)
1901
- *
1902
- * Determine whether @bio being queued on @q can be merged with a request
1903
- * on %current's plugged list. Returns %true if merge was successful,
1904
- * otherwise %false.
1905
- *
1906
- * Plugging coalesces IOs from the same issuer for the same purpose without
1907
- * going through @q->queue_lock. As such it's more of an issuing mechanism
1908
- * than scheduling, and the request, while may have elvpriv data, is not
1909
- * added on the elevator at this point. In addition, we don't have
1910
- * reliable access to the elevator outside queue lock. Only check basic
1911
- * merging parameters without querying the elevator.
1912
- *
1913
- * Caller must ensure !blk_queue_nomerges(q) beforehand.
1914
- */
1915
-bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
1916
- unsigned int *request_count,
1917
- struct request **same_queue_rq)
1918
-{
1919
- struct blk_plug *plug;
1920
- struct request *rq;
1921
- struct list_head *plug_list;
1922
-
1923
- plug = current->plug;
1924
- if (!plug)
1925
- return false;
1926
- *request_count = 0;
1927
-
1928
- if (q->mq_ops)
1929
- plug_list = &plug->mq_list;
1930
- else
1931
- plug_list = &plug->list;
1932
-
1933
- list_for_each_entry_reverse(rq, plug_list, queuelist) {
1934
- bool merged = false;
1935
-
1936
- if (rq->q == q) {
1937
- (*request_count)++;
1938
- /*
1939
- * Only blk-mq multiple hardware queues case checks the
1940
- * rq in the same queue, there should be only one such
1941
- * rq in a queue
1942
- **/
1943
- if (same_queue_rq)
1944
- *same_queue_rq = rq;
1945
- }
1946
-
1947
- if (rq->q != q || !blk_rq_merge_ok(rq, bio))
1948
- continue;
1949
-
1950
- switch (blk_try_merge(rq, bio)) {
1951
- case ELEVATOR_BACK_MERGE:
1952
- merged = bio_attempt_back_merge(q, rq, bio);
1953
- break;
1954
- case ELEVATOR_FRONT_MERGE:
1955
- merged = bio_attempt_front_merge(q, rq, bio);
1956
- break;
1957
- case ELEVATOR_DISCARD_MERGE:
1958
- merged = bio_attempt_discard_merge(q, rq, bio);
1959
- break;
1960
- default:
1961
- break;
1962
- }
1963
-
1964
- if (merged)
1965
- return true;
1966
- }
1967
-
1968
- return false;
1969
-}
1970
-
1971
-unsigned int blk_plug_queued_count(struct request_queue *q)
1972
-{
1973
- struct blk_plug *plug;
1974
- struct request *rq;
1975
- struct list_head *plug_list;
1976
- unsigned int ret = 0;
1977
-
1978
- plug = current->plug;
1979
- if (!plug)
1980
- goto out;
1981
-
1982
- if (q->mq_ops)
1983
- plug_list = &plug->mq_list;
1984
- else
1985
- plug_list = &plug->list;
1986
-
1987
- list_for_each_entry(rq, plug_list, queuelist) {
1988
- if (rq->q == q)
1989
- ret++;
1990
- }
1991
-out:
1992
- return ret;
1993
-}
1994
-
1995
-void blk_init_request_from_bio(struct request *req, struct bio *bio)
1996
-{
1997
- struct io_context *ioc = rq_ioc(bio);
1998
-
1999
- if (bio->bi_opf & REQ_RAHEAD)
2000
- req->cmd_flags |= REQ_FAILFAST_MASK;
2001
-
2002
- req->__sector = bio->bi_iter.bi_sector;
2003
- if (ioprio_valid(bio_prio(bio)))
2004
- req->ioprio = bio_prio(bio);
2005
- else if (ioc)
2006
- req->ioprio = ioc->ioprio;
2007
- else
2008
- req->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
2009
- req->write_hint = bio->bi_write_hint;
2010
- blk_rq_bio_prep(req->q, req, bio);
2011
-}
2012
-EXPORT_SYMBOL_GPL(blk_init_request_from_bio);
2013
-
2014
-static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
2015
-{
2016
- struct blk_plug *plug;
2017
- int where = ELEVATOR_INSERT_SORT;
2018
- struct request *req, *free;
2019
- unsigned int request_count = 0;
2020
-
2021
- /*
2022
- * low level driver can indicate that it wants pages above a
2023
- * certain limit bounced to low memory (ie for highmem, or even
2024
- * ISA dma in theory)
2025
- */
2026
- blk_queue_bounce(q, &bio);
2027
-
2028
- blk_queue_split(q, &bio);
2029
-
2030
- if (!bio_integrity_prep(bio))
2031
- return BLK_QC_T_NONE;
2032
-
2033
- if (op_is_flush(bio->bi_opf)) {
2034
- spin_lock_irq(q->queue_lock);
2035
- where = ELEVATOR_INSERT_FLUSH;
2036
- goto get_rq;
2037
- }
2038
-
2039
- /*
2040
- * Check if we can merge with the plugged list before grabbing
2041
- * any locks.
2042
- */
2043
- if (!blk_queue_nomerges(q)) {
2044
- if (blk_attempt_plug_merge(q, bio, &request_count, NULL))
2045
- return BLK_QC_T_NONE;
2046
- } else
2047
- request_count = blk_plug_queued_count(q);
2048
-
2049
- spin_lock_irq(q->queue_lock);
2050
-
2051
- switch (elv_merge(q, &req, bio)) {
2052
- case ELEVATOR_BACK_MERGE:
2053
- if (!bio_attempt_back_merge(q, req, bio))
2054
- break;
2055
- elv_bio_merged(q, req, bio);
2056
- free = attempt_back_merge(q, req);
2057
- if (free)
2058
- __blk_put_request(q, free);
2059
- else
2060
- elv_merged_request(q, req, ELEVATOR_BACK_MERGE);
2061
- goto out_unlock;
2062
- case ELEVATOR_FRONT_MERGE:
2063
- if (!bio_attempt_front_merge(q, req, bio))
2064
- break;
2065
- elv_bio_merged(q, req, bio);
2066
- free = attempt_front_merge(q, req);
2067
- if (free)
2068
- __blk_put_request(q, free);
2069
- else
2070
- elv_merged_request(q, req, ELEVATOR_FRONT_MERGE);
2071
- goto out_unlock;
2072
- default:
2073
- break;
2074
- }
2075
-
2076
-get_rq:
2077
- rq_qos_throttle(q, bio, q->queue_lock);
2078
-
2079
- /*
2080
- * Grab a free request. This might sleep but cannot fail.
2081
- * Returns with the queue unlocked.
2082
- */
2083
- blk_queue_enter_live(q);
2084
- req = get_request(q, bio->bi_opf, bio, 0, GFP_NOIO);
2085
- if (IS_ERR(req)) {
2086
- blk_queue_exit(q);
2087
- rq_qos_cleanup(q, bio);
2088
- if (PTR_ERR(req) == -ENOMEM)
2089
- bio->bi_status = BLK_STS_RESOURCE;
2090
- else
2091
- bio->bi_status = BLK_STS_IOERR;
2092
- bio_endio(bio);
2093
- goto out_unlock;
2094
- }
2095
-
2096
- rq_qos_track(q, req, bio);
2097
-
2098
- /*
2099
- * After dropping the lock and possibly sleeping here, our request
2100
- * may now be mergeable after it had proven unmergeable (above).
2101
- * We don't worry about that case for efficiency. It won't happen
2102
- * often, and the elevators are able to handle it.
2103
- */
2104
- blk_init_request_from_bio(req, bio);
2105
-
2106
- if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags))
2107
- req->cpu = raw_smp_processor_id();
2108
-
2109
- plug = current->plug;
2110
- if (plug) {
2111
- /*
2112
- * If this is the first request added after a plug, fire
2113
- * off a plug trace.
2114
- *
2115
- * @request_count may become stale because of schedule
2116
- * out, so check plug list again.
2117
- */
2118
- if (!request_count || list_empty(&plug->list))
2119
- trace_block_plug(q);
2120
- else {
2121
- struct request *last = list_entry_rq(plug->list.prev);
2122
- if (request_count >= BLK_MAX_REQUEST_COUNT ||
2123
- blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE) {
2124
- blk_flush_plug_list(plug, false);
2125
- trace_block_plug(q);
2126
- }
2127
- }
2128
- list_add_tail(&req->queuelist, &plug->list);
2129
- blk_account_io_start(req, true);
2130
- } else {
2131
- spin_lock_irq(q->queue_lock);
2132
- add_acct_request(q, req, where);
2133
- __blk_run_queue(q);
2134
-out_unlock:
2135
- spin_unlock_irq(q->queue_lock);
2136
- }
2137
-
2138
- return BLK_QC_T_NONE;
2139
-}
2140652
2141653 static void handle_bad_sector(struct bio *bio, sector_t maxsector)
2142654 {
2143655 char b[BDEVNAME_SIZE];
2144656
2145
- printk(KERN_INFO "attempt to access beyond end of device\n");
2146
- printk(KERN_INFO "%s: rw=%d, want=%Lu, limit=%Lu\n",
2147
- bio_devname(bio, b), bio->bi_opf,
2148
- (unsigned long long)bio_end_sector(bio),
2149
- (long long)maxsector);
657
+ pr_info_ratelimited("attempt to access beyond end of device\n"
658
+ "%s: rw=%d, want=%llu, limit=%llu\n",
659
+ bio_devname(bio, b), bio->bi_opf,
660
+ bio_end_sector(bio), maxsector);
2150661 }
2151662
2152663 #ifdef CONFIG_FAIL_MAKE_REQUEST
....@@ -2193,10 +704,7 @@
2193704
2194705 if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
2195706 return false;
2196
-
2197
- WARN_ONCE(1,
2198
- "generic_make_request: Trying to write "
2199
- "to read-only block-device %s (partno %d)\n",
707
+ pr_warn("Trying to write to read-only block-device %s (partno %d)\n",
2200708 bio_devname(bio, b), part->partno);
2201709 /* Older lvm-tools actually trigger this */
2202710 return false;
....@@ -2248,11 +756,7 @@
2248756 if (unlikely(bio_check_ro(bio, p)))
2249757 goto out;
2250758
2251
- /*
2252
- * Zone reset does not include bi_size so bio_sectors() is always 0.
2253
- * Include a test for the reset op code and perform the remap if needed.
2254
- */
2255
- if (bio_sectors(bio) || bio_op(bio) == REQ_OP_ZONE_RESET) {
759
+ if (bio_sectors(bio)) {
2256760 if (bio_check_eod(bio, part_nr_sects_read(p)))
2257761 goto out;
2258762 bio->bi_iter.bi_sector += p->start_sect;
....@@ -2266,30 +770,58 @@
2266770 return ret;
2267771 }
2268772
2269
-static noinline_for_stack bool
2270
-generic_make_request_checks(struct bio *bio)
773
+/*
774
+ * Check a zone append write to a zoned block device.
775
+ */
776
+static inline blk_status_t blk_check_zone_append(struct request_queue *q,
777
+ struct bio *bio)
2271778 {
2272
- struct request_queue *q;
779
+ sector_t pos = bio->bi_iter.bi_sector;
2273780 int nr_sectors = bio_sectors(bio);
781
+
782
+ /* Only applicable to zoned block devices */
783
+ if (!blk_queue_is_zoned(q))
784
+ return BLK_STS_NOTSUPP;
785
+
786
+ /* The bio sector must point to the start of a sequential zone */
787
+ if (pos & (blk_queue_zone_sectors(q) - 1) ||
788
+ !blk_queue_zone_is_seq(q, pos))
789
+ return BLK_STS_IOERR;
790
+
791
+ /*
792
+ * Not allowed to cross zone boundaries. Otherwise, the BIO will be
793
+ * split and could result in non-contiguous sectors being written in
794
+ * different zones.
795
+ */
796
+ if (nr_sectors > q->limits.chunk_sectors)
797
+ return BLK_STS_IOERR;
798
+
799
+ /* Make sure the BIO is small enough and will not get split */
800
+ if (nr_sectors > q->limits.max_zone_append_sectors)
801
+ return BLK_STS_IOERR;
802
+
803
+ bio->bi_opf |= REQ_NOMERGE;
804
+
805
+ return BLK_STS_OK;
806
+}
807
+
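/*
 * Editor's illustrative sketch (not part of this change): issuing a
 * REQ_OP_ZONE_APPEND bio subject to the checks in blk_check_zone_append()
 * above.  The target sector is the start of a sequential zone; on
 * completion the device-chosen write location is reported back in
 * bio->bi_iter.bi_sector.  The "example_*" names are hypothetical and the
 * payload must stay within q->limits.max_zone_append_sectors.
 */
static void example_zone_append_end_io(struct bio *bio)
{
	/* bio->bi_iter.bi_sector now holds the sector actually written */
	bio_put(bio);
}

static void example_zone_append(struct block_device *bdev, struct page *page,
				sector_t zone_start_sector)
{
	struct bio *bio = bio_alloc(GFP_KERNEL, 1);

	bio_set_dev(bio, bdev);
	bio->bi_iter.bi_sector = zone_start_sector;
	bio->bi_opf = REQ_OP_ZONE_APPEND;
	bio->bi_end_io = example_zone_append_end_io;
	if (bio_add_zone_append_page(bio, page, PAGE_SIZE, 0) != PAGE_SIZE) {
		bio_put(bio);
		return;
	}
	submit_bio(bio);
}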
808
+static noinline_for_stack bool submit_bio_checks(struct bio *bio)
809
+{
810
+ struct request_queue *q = bio->bi_disk->queue;
2274811 blk_status_t status = BLK_STS_IOERR;
2275
- char b[BDEVNAME_SIZE];
812
+ struct blk_plug *plug;
2276813
2277814 might_sleep();
2278815
2279
- q = bio->bi_disk->queue;
2280
- if (unlikely(!q)) {
2281
- printk(KERN_ERR
2282
- "generic_make_request: Trying to access "
2283
- "nonexistent block-device %s (%Lu)\n",
2284
- bio_devname(bio, b), (long long)bio->bi_iter.bi_sector);
2285
- goto end_io;
2286
- }
816
+ plug = blk_mq_plug(q, bio);
817
+ if (plug && plug->nowait)
818
+ bio->bi_opf |= REQ_NOWAIT;
2287819
2288820 /*
2289821 * For a REQ_NOWAIT based request, return -EOPNOTSUPP
2290
- * if queue is not a request based queue.
822
+ * if queue does not support NOWAIT.
2291823 */
2292
- if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_rq_based(q))
824
+ if ((bio->bi_opf & REQ_NOWAIT) && !blk_queue_nowait(q))
2293825 goto not_supported;
2294826
2295827 if (should_fail_bio(bio))
....@@ -2306,18 +838,20 @@
2306838 }
2307839
2308840 /*
2309
- * Filter flush bio's early so that make_request based
2310
- * drivers without flush support don't have to worry
2311
- * about them.
841
+ * Filter flush bio's early so that bio based drivers without flush
842
+ * support don't have to worry about them.
2312843 */
2313844 if (op_is_flush(bio->bi_opf) &&
2314845 !test_bit(QUEUE_FLAG_WC, &q->queue_flags)) {
2315846 bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA);
2316
- if (!nr_sectors) {
847
+ if (!bio_sectors(bio)) {
2317848 status = BLK_STS_OK;
2318849 goto end_io;
2319850 }
2320851 }
852
+
853
+ if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
854
+ bio->bi_opf &= ~REQ_HIPRI;
2321855
2322856 switch (bio_op(bio)) {
2323857 case REQ_OP_DISCARD:
....@@ -2332,9 +866,20 @@
2332866 if (!q->limits.max_write_same_sectors)
2333867 goto not_supported;
2334868 break;
2335
- case REQ_OP_ZONE_REPORT:
869
+ case REQ_OP_ZONE_APPEND:
870
+ status = blk_check_zone_append(q, bio);
871
+ if (status != BLK_STS_OK)
872
+ goto end_io;
873
+ break;
2336874 case REQ_OP_ZONE_RESET:
875
+ case REQ_OP_ZONE_OPEN:
876
+ case REQ_OP_ZONE_CLOSE:
877
+ case REQ_OP_ZONE_FINISH:
2337878 if (!blk_queue_is_zoned(q))
879
+ goto not_supported;
880
+ break;
881
+ case REQ_OP_ZONE_RESET_ALL:
882
+ if (!blk_queue_is_zoned(q) || !blk_queue_zone_resetall(q))
2338883 goto not_supported;
2339884 break;
2340885 case REQ_OP_WRITE_ZEROES:
....@@ -2346,15 +891,19 @@
2346891 }
2347892
2348893 /*
2349
- * Various block parts want %current->io_context and lazy ioc
2350
- * allocation ends up trading a lot of pain for a small amount of
2351
- * memory. Just allocate it upfront. This may fail and block
2352
- * layer knows how to live with it.
894
+ * Various block parts want %current->io_context, so allocate it up
895
+ * front rather than dealing with lots of pain to allocate it only
896
+ * where needed. This may fail and the block layer knows how to live
897
+ * with it.
2353898 */
2354
- create_io_context(GFP_ATOMIC, q->node);
899
+ if (unlikely(!current->io_context))
900
+ create_task_io_context(current, GFP_ATOMIC, q->node);
2355901
2356
- if (!blkcg_bio_issue_check(q, bio))
902
+ if (blk_throtl_bio(bio))
2357903 return false;
904
+
905
+ blk_cgroup_bio_start(bio);
906
+ blkcg_bio_issue_init(bio);
2358907
2359908 if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) {
2360909 trace_block_bio_queue(q, bio);
....@@ -2373,197 +922,162 @@
2373922 return false;
2374923 }
2375924
2376
-/**
2377
- * generic_make_request - hand a buffer to its device driver for I/O
2378
- * @bio: The bio describing the location in memory and on the device.
2379
- *
2380
- * generic_make_request() is used to make I/O requests of block
2381
- * devices. It is passed a &struct bio, which describes the I/O that needs
2382
- * to be done.
2383
- *
2384
- * generic_make_request() does not return any status. The
2385
- * success/failure status of the request, along with notification of
2386
- * completion, is delivered asynchronously through the bio->bi_end_io
2387
- * function described (one day) else where.
2388
- *
2389
- * The caller of generic_make_request must make sure that bi_io_vec
2390
- * are set to describe the memory buffer, and that bi_dev and bi_sector are
2391
- * set to describe the device address, and the
2392
- * bi_end_io and optionally bi_private are set to describe how
2393
- * completion notification should be signaled.
2394
- *
2395
- * generic_make_request and the drivers it calls may use bi_next if this
2396
- * bio happens to be merged with someone else, and may resubmit the bio to
2397
- * a lower device by calling into generic_make_request recursively, which
2398
- * means the bio should NOT be touched after the call to ->make_request_fn.
2399
- */
2400
-blk_qc_t generic_make_request(struct bio *bio)
925
+static blk_qc_t __submit_bio(struct bio *bio)
2401926 {
2402
- /*
2403
- * bio_list_on_stack[0] contains bios submitted by the current
2404
- * make_request_fn.
2405
- * bio_list_on_stack[1] contains bios that were submitted before
2406
- * the current make_request_fn, but that haven't been processed
2407
- * yet.
2408
- */
2409
- struct bio_list bio_list_on_stack[2];
2410
- blk_mq_req_flags_t flags = 0;
2411
- struct request_queue *q = bio->bi_disk->queue;
927
+ struct gendisk *disk = bio->bi_disk;
2412928 blk_qc_t ret = BLK_QC_T_NONE;
2413929
2414
- if (bio->bi_opf & REQ_NOWAIT)
2415
- flags = BLK_MQ_REQ_NOWAIT;
2416
- if (bio_flagged(bio, BIO_QUEUE_ENTERED))
2417
- blk_queue_enter_live(q);
2418
- else if (blk_queue_enter(q, flags) < 0) {
2419
- if (!blk_queue_dying(q) && (bio->bi_opf & REQ_NOWAIT))
2420
- bio_wouldblock_error(bio);
2421
- else
2422
- bio_io_error(bio);
2423
- return ret;
930
+ if (blk_crypto_bio_prep(&bio)) {
931
+ if (!disk->fops->submit_bio)
932
+ return blk_mq_submit_bio(bio);
933
+ ret = disk->fops->submit_bio(bio);
2424934 }
935
+ blk_queue_exit(disk->queue);
936
+ return ret;
937
+}
2425938
2426
- if (!generic_make_request_checks(bio))
2427
- goto out;
939
+/*
940
+ * The loop in this function may be a bit non-obvious, and so deserves some
941
+ * explanation:
942
+ *
943
+ * - Before entering the loop, bio->bi_next is NULL (as all callers ensure
944
+ * that), so we have a list with a single bio.
945
+ * - We pretend that we have just taken it off a longer list, so we assign
946
+ * bio_list to a pointer to the bio_list_on_stack, thus initialising the
947
+ * bio_list of new bios to be added. ->submit_bio() may indeed add some more
948
+ * bios through a recursive call to submit_bio_noacct. If it did, we find a
949
+ * non-NULL value in bio_list and re-enter the loop from the top.
950
+ * - In this case we really did just take the bio of the top of the list (no
951
+ * pretending) and so remove it from bio_list, and call into ->submit_bio()
952
+ * again.
953
+ *
954
+ * bio_list_on_stack[0] contains bios submitted by the current ->submit_bio.
955
+ * bio_list_on_stack[1] contains bios that were submitted before the current
956
+ * ->submit_bio, but that haven't been processed yet.
957
+ */
958
+static blk_qc_t __submit_bio_noacct(struct bio *bio)
959
+{
960
+ struct bio_list bio_list_on_stack[2];
961
+ blk_qc_t ret = BLK_QC_T_NONE;
962
+
963
+ BUG_ON(bio->bi_next);
964
+
965
+ bio_list_init(&bio_list_on_stack[0]);
966
+ current->bio_list = bio_list_on_stack;
967
+
968
+ do {
969
+ struct request_queue *q = bio->bi_disk->queue;
970
+ struct bio_list lower, same;
971
+
972
+ if (unlikely(bio_queue_enter(bio) != 0))
973
+ continue;
974
+
975
+ /*
976
+ * Create a fresh bio_list for all subordinate requests.
977
+ */
978
+ bio_list_on_stack[1] = bio_list_on_stack[0];
979
+ bio_list_init(&bio_list_on_stack[0]);
980
+
981
+ ret = __submit_bio(bio);
982
+
983
+ /*
984
+ * Sort new bios into those for a lower level and those for the
985
+ * same level.
986
+ */
987
+ bio_list_init(&lower);
988
+ bio_list_init(&same);
989
+ while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL)
990
+ if (q == bio->bi_disk->queue)
991
+ bio_list_add(&same, bio);
992
+ else
993
+ bio_list_add(&lower, bio);
994
+
995
+ /*
996
+ * Now assemble so we handle the lowest level first.
997
+ */
998
+ bio_list_merge(&bio_list_on_stack[0], &lower);
999
+ bio_list_merge(&bio_list_on_stack[0], &same);
1000
+ bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]);
1001
+ } while ((bio = bio_list_pop(&bio_list_on_stack[0])));
1002
+
1003
+ current->bio_list = NULL;
1004
+ return ret;
1005
+}
1006
+
1007
+static blk_qc_t __submit_bio_noacct_mq(struct bio *bio)
1008
+{
1009
+ struct bio_list bio_list[2] = { };
1010
+ blk_qc_t ret = BLK_QC_T_NONE;
1011
+
1012
+ current->bio_list = bio_list;
1013
+
1014
+ do {
1015
+ struct gendisk *disk = bio->bi_disk;
1016
+
1017
+ if (unlikely(bio_queue_enter(bio) != 0))
1018
+ continue;
1019
+
1020
+ if (!blk_crypto_bio_prep(&bio)) {
1021
+ blk_queue_exit(disk->queue);
1022
+ ret = BLK_QC_T_NONE;
1023
+ continue;
1024
+ }
1025
+
1026
+ ret = blk_mq_submit_bio(bio);
1027
+ } while ((bio = bio_list_pop(&bio_list[0])));
1028
+
1029
+ current->bio_list = NULL;
1030
+ return ret;
1031
+}
1032
+
1033
+/**
1034
+ * submit_bio_noacct - re-submit a bio to the block device layer for I/O
1035
+ * @bio: The bio describing the location in memory and on the device.
1036
+ *
1037
+ * This is a version of submit_bio() that shall only be used for I/O that is
1038
+ * resubmitted to lower level drivers by stacking block drivers. All file
1039
+ * systems and other upper level users of the block layer should use
1040
+ * submit_bio() instead.
1041
+ */
1042
+blk_qc_t submit_bio_noacct(struct bio *bio)
1043
+{
1044
+ if (!submit_bio_checks(bio))
1045
+ return BLK_QC_T_NONE;
24281046
24291047 /*
2430
- * We only want one ->make_request_fn to be active at a time, else
2431
- * stack usage with stacked devices could be a problem. So use
2432
- * current->bio_list to keep a list of requests submitted by a
2433
- * make_request_fn function. current->bio_list is also used as a
2434
- * flag to say if generic_make_request is currently active in this
2435
- * task or not. If it is NULL, then no make_request is active. If
2436
- * it is non-NULL, then a make_request is active, and new requests
2437
- * should be added at the tail
1048
+ * We only want one ->submit_bio to be active at a time, else stack
1049
+ * usage with stacked devices could be a problem. Use current->bio_list
1050
+ * to collect a list of requests submitted by a ->submit_bio method while
1051
+ * it is active, and then process them after it returned.
24381052 */
24391053 if (current->bio_list) {
24401054 bio_list_add(&current->bio_list[0], bio);
2441
- goto out;
2442
- }
2443
-
2444
- /* following loop may be a bit non-obvious, and so deserves some
2445
- * explanation.
2446
- * Before entering the loop, bio->bi_next is NULL (as all callers
2447
- * ensure that) so we have a list with a single bio.
2448
- * We pretend that we have just taken it off a longer list, so
2449
- * we assign bio_list to a pointer to the bio_list_on_stack,
2450
- * thus initialising the bio_list of new bios to be
2451
- * added. ->make_request() may indeed add some more bios
2452
- * through a recursive call to generic_make_request. If it
2453
- * did, we find a non-NULL value in bio_list and re-enter the loop
2454
- * from the top. In this case we really did just take the bio
2455
- * of the top of the list (no pretending) and so remove it from
2456
- * bio_list, and call into ->make_request() again.
2457
- */
2458
- BUG_ON(bio->bi_next);
2459
- bio_list_init(&bio_list_on_stack[0]);
2460
- current->bio_list = bio_list_on_stack;
2461
- do {
2462
- bool enter_succeeded = true;
2463
-
2464
- if (unlikely(q != bio->bi_disk->queue)) {
2465
- if (q)
2466
- blk_queue_exit(q);
2467
- q = bio->bi_disk->queue;
2468
- flags = 0;
2469
- if (bio->bi_opf & REQ_NOWAIT)
2470
- flags = BLK_MQ_REQ_NOWAIT;
2471
- if (blk_queue_enter(q, flags) < 0)
2472
- enter_succeeded = false;
2473
- }
2474
-
2475
- if (enter_succeeded) {
2476
- struct bio_list lower, same;
2477
-
2478
- /* Create a fresh bio_list for all subordinate requests */
2479
- bio_list_on_stack[1] = bio_list_on_stack[0];
2480
- bio_list_init(&bio_list_on_stack[0]);
2481
-
2482
- if (!blk_crypto_submit_bio(&bio))
2483
- ret = q->make_request_fn(q, bio);
2484
-
2485
- /* sort new bios into those for a lower level
2486
- * and those for the same level
2487
- */
2488
- bio_list_init(&lower);
2489
- bio_list_init(&same);
2490
- while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL)
2491
- if (q == bio->bi_disk->queue)
2492
- bio_list_add(&same, bio);
2493
- else
2494
- bio_list_add(&lower, bio);
2495
- /* now assemble so we handle the lowest level first */
2496
- bio_list_merge(&bio_list_on_stack[0], &lower);
2497
- bio_list_merge(&bio_list_on_stack[0], &same);
2498
- bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]);
2499
- } else {
2500
- if (unlikely(!blk_queue_dying(q) &&
2501
- (bio->bi_opf & REQ_NOWAIT)))
2502
- bio_wouldblock_error(bio);
2503
- else
2504
- bio_io_error(bio);
2505
- q = NULL;
2506
- }
2507
- bio = bio_list_pop(&bio_list_on_stack[0]);
2508
- } while (bio);
2509
- current->bio_list = NULL; /* deactivate */
2510
-
2511
-out:
2512
- if (q)
2513
- blk_queue_exit(q);
2514
- return ret;
2515
-}
2516
-EXPORT_SYMBOL(generic_make_request);
2517
-
2518
-/**
2519
- * direct_make_request - hand a buffer directly to its device driver for I/O
2520
- * @bio: The bio describing the location in memory and on the device.
2521
- *
2522
- * This function behaves like generic_make_request(), but does not protect
2523
- * against recursion. Must only be used if the called driver is known
2524
- * to not call generic_make_request (or direct_make_request) again from
2525
- * its make_request function. (Calling direct_make_request again from
2526
- * a workqueue is perfectly fine as that doesn't recurse).
2527
- */
2528
-blk_qc_t direct_make_request(struct bio *bio)
2529
-{
2530
- struct request_queue *q = bio->bi_disk->queue;
2531
- bool nowait = bio->bi_opf & REQ_NOWAIT;
2532
- blk_qc_t ret = BLK_QC_T_NONE;
2533
-
2534
- if (!generic_make_request_checks(bio))
2535
- return BLK_QC_T_NONE;
2536
-
2537
- if (unlikely(blk_queue_enter(q, nowait ? BLK_MQ_REQ_NOWAIT : 0))) {
2538
- if (nowait && !blk_queue_dying(q))
2539
- bio->bi_status = BLK_STS_AGAIN;
2540
- else
2541
- bio->bi_status = BLK_STS_IOERR;
2542
- bio_endio(bio);
25431055 return BLK_QC_T_NONE;
25441056 }
25451057
2546
- if (!blk_crypto_submit_bio(&bio))
2547
- ret = q->make_request_fn(q, bio);
2548
- blk_queue_exit(q);
2549
- return ret;
1058
+ if (!bio->bi_disk->fops->submit_bio)
1059
+ return __submit_bio_noacct_mq(bio);
1060
+ return __submit_bio_noacct(bio);
25501061 }
2551
-EXPORT_SYMBOL_GPL(direct_make_request);
1062
+EXPORT_SYMBOL(submit_bio_noacct);
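/*
 * Editor's illustrative sketch (not part of this change): a stacking
 * driver's ->submit_bio() method remapping a bio to a lower device and
 * handing it back to the block layer via submit_bio_noacct(), so the
 * accounting already done by submit_bio() is not repeated.  The names
 * "example_remap_submit", "lower_bdev" and "start_sect" are hypothetical.
 */
static blk_qc_t example_remap_submit(struct bio *bio,
				     struct block_device *lower_bdev,
				     sector_t start_sect)
{
	bio_set_dev(bio, lower_bdev);
	bio->bi_iter.bi_sector += start_sect;
	return submit_bio_noacct(bio);
}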
25521063
25531064 /**
25541065 * submit_bio - submit a bio to the block device layer for I/O
25551066 * @bio: The &struct bio which describes the I/O
25561067 *
2557
- * submit_bio() is very similar in purpose to generic_make_request(), and
2558
- * uses that function to do most of the work. Both are fairly rough
2559
- * interfaces; @bio must be presetup and ready for I/O.
1068
+ * submit_bio() is used to submit I/O requests to block devices. It is passed a
1069
+ * fully set up &struct bio that describes the I/O that needs to be done. The
1070
+ * bio will be sent to the device described by the bi_disk and bi_partno fields.
25601071 *
1072
+ * The success/failure status of the request, along with notification of
1073
+ * completion, is delivered asynchronously through the ->bi_end_io() callback
1074
+ * in @bio. The bio must NOT be touched by the caller until ->bi_end_io() has
1075
+ * been called.
25611076 */
25621077 blk_qc_t submit_bio(struct bio *bio)
25631078 {
2564
- bool workingset_read = false;
2565
- unsigned long pflags;
2566
- blk_qc_t ret;
1079
+ if (blkcg_punt_bio_submit(bio))
1080
+ return BLK_QC_T_NONE;
25671081
25681082 /*
25691083 * If it's a regular read/write or a barrier with data attached,
....@@ -2580,8 +1094,6 @@
25801094 if (op_is_write(bio_op(bio))) {
25811095 count_vm_events(PGPGOUT, count);
25821096 } else {
2583
- if (bio_flagged(bio, BIO_WORKINGSET))
2584
- workingset_read = true;
25851097 task_io_account_read(bio->bi_iter.bi_size);
25861098 count_vm_events(PGPGIN, count);
25871099 }
....@@ -2597,37 +1109,30 @@
25971109 }
25981110
25991111 /*
2600
- * If we're reading data that is part of the userspace
2601
- * workingset, count submission time as memory stall. When the
2602
- * device is congested, or the submitting cgroup IO-throttled,
2603
- * submission can be a significant part of overall IO time.
1112
+ * If we're reading data that is part of the userspace workingset, count
1113
+ * submission time as memory stall. When the device is congested, or
1114
+ * the submitting cgroup IO-throttled, submission can be a significant
1115
+ * part of overall IO time.
26041116 */
2605
- if (workingset_read)
1117
+ if (unlikely(bio_op(bio) == REQ_OP_READ &&
1118
+ bio_flagged(bio, BIO_WORKINGSET))) {
1119
+ unsigned long pflags;
1120
+ blk_qc_t ret;
1121
+
26061122 psi_memstall_enter(&pflags);
2607
-
2608
- ret = generic_make_request(bio);
2609
-
2610
- if (workingset_read)
1123
+ ret = submit_bio_noacct(bio);
26111124 psi_memstall_leave(&pflags);
26121125
2613
- return ret;
1126
+ return ret;
1127
+ }
1128
+
1129
+ return submit_bio_noacct(bio);
26141130 }
26151131 EXPORT_SYMBOL(submit_bio);
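/*
 * Editor's illustrative sketch (not part of this change): a minimal
 * asynchronous one-page read through submit_bio().  As documented above,
 * the submitter must not touch the bio again until ->bi_end_io() runs.
 * All "example_*" names are hypothetical.
 */
static void example_read_end_io(struct bio *bio)
{
	/* check bio->bi_status here before consuming the page data */
	bio_put(bio);
}

static void example_read_page(struct block_device *bdev, struct page *page,
			      sector_t sector)
{
	struct bio *bio = bio_alloc(GFP_KERNEL, 1);

	bio_set_dev(bio, bdev);
	bio->bi_iter.bi_sector = sector;
	bio->bi_opf = REQ_OP_READ;
	bio->bi_end_io = example_read_end_io;
	bio_add_page(bio, page, PAGE_SIZE, 0);
	submit_bio(bio);
}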
26161132
2617
-bool blk_poll(struct request_queue *q, blk_qc_t cookie)
2618
-{
2619
- if (!q->poll_fn || !blk_qc_t_valid(cookie))
2620
- return false;
2621
-
2622
- if (current->plug)
2623
- blk_flush_plug_list(current->plug, false);
2624
- return q->poll_fn(q, cookie);
2625
-}
2626
-EXPORT_SYMBOL_GPL(blk_poll);
2627
-
26281133 /**
26291134 * blk_cloned_rq_check_limits - Helper function to check a cloned request
2630
- * for new the queue limits
1135
+ * for the new queue limits
26311136 * @q: the queue
26321137 * @rq: the request being checked
26331138 *
....@@ -2642,12 +1147,28 @@
26421147 * limits when retrying requests on other queues. Those requests need
26431148 * to be checked against the new queue limits again during dispatch.
26441149 */
2645
-static int blk_cloned_rq_check_limits(struct request_queue *q,
1150
+static blk_status_t blk_cloned_rq_check_limits(struct request_queue *q,
26461151 struct request *rq)
26471152 {
2648
- if (blk_rq_sectors(rq) > blk_queue_get_max_sectors(q, req_op(rq))) {
2649
- printk(KERN_ERR "%s: over max size limit.\n", __func__);
2650
- return -EIO;
1153
+ unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq));
1154
+
1155
+ if (blk_rq_sectors(rq) > max_sectors) {
1156
+ /*
1157
+ * SCSI device does not have a good way to return if
1158
+ * Write Same/Zero is actually supported. If a device rejects
1159
+ * a non-read/write command (discard, write same, etc.), the
1160
+ * low-level device driver will set the relevant queue limit to
1161
+ * 0 to prevent blk-lib from issuing more of the offending
1162
+ * operations. Commands queued prior to the queue limit being
1163
+ * reset need to be completed with BLK_STS_NOTSUPP to avoid I/O
1164
+ * errors being propagated to upper layers.
1165
+ */
1166
+ if (max_sectors == 0)
1167
+ return BLK_STS_NOTSUPP;
1168
+
1169
+ printk(KERN_ERR "%s: over max size limit. (%u > %u)\n",
1170
+ __func__, blk_rq_sectors(rq), max_sectors);
1171
+ return BLK_STS_IOERR;
26511172 }
26521173
26531174 /*
....@@ -2656,13 +1177,14 @@
26561177 * Recalculate it to check the request correctly on this queue's
26571178 * limitation.
26581179 */
2659
- blk_recalc_rq_segments(rq);
1180
+ rq->nr_phys_segments = blk_recalc_rq_segments(rq);
26601181 if (rq->nr_phys_segments > queue_max_segments(q)) {
2661
- printk(KERN_ERR "%s: over max segments limit.\n", __func__);
2662
- return -EIO;
1182
+ printk(KERN_ERR "%s: over max segments limit. (%hu > %hu)\n",
1183
+ __func__, rq->nr_phys_segments, queue_max_segments(q));
1184
+ return BLK_STS_IOERR;
26631185 }
26641186
2665
- return 0;
1187
+ return BLK_STS_OK;
26661188 }
26671189
26681190 /**
....@@ -2672,48 +1194,28 @@
26721194 */
26731195 blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq)
26741196 {
2675
- unsigned long flags;
2676
- int where = ELEVATOR_INSERT_BACK;
1197
+ blk_status_t ret;
26771198
2678
- if (blk_cloned_rq_check_limits(q, rq))
2679
- return BLK_STS_IOERR;
1199
+ ret = blk_cloned_rq_check_limits(q, rq);
1200
+ if (ret != BLK_STS_OK)
1201
+ return ret;
26801202
26811203 if (rq->rq_disk &&
26821204 should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq)))
26831205 return BLK_STS_IOERR;
26841206
2685
- if (q->mq_ops) {
2686
- if (blk_queue_io_stat(q))
2687
- blk_account_io_start(rq, true);
2688
- /*
2689
- * Since we have a scheduler attached on the top device,
2690
- * bypass a potential scheduler on the bottom device for
2691
- * insert.
2692
- */
2693
- return blk_mq_request_issue_directly(rq);
2694
- }
2695
-
2696
- spin_lock_irqsave(q->queue_lock, flags);
2697
- if (unlikely(blk_queue_dying(q))) {
2698
- spin_unlock_irqrestore(q->queue_lock, flags);
1207
+ if (blk_crypto_insert_cloned_request(rq))
26991208 return BLK_STS_IOERR;
2700
- }
1209
+
1210
+ if (blk_queue_io_stat(q))
1211
+ blk_account_io_start(rq);
27011212
27021213 /*
2703
- * Submitting request must be dequeued before calling this function
2704
- * because it will be linked to another request_queue
1214
+ * Since we have a scheduler attached on the top device,
1215
+ * bypass a potential scheduler on the bottom device for
1216
+ * insert.
27051217 */
2706
- BUG_ON(blk_queued_rq(rq));
2707
-
2708
- if (op_is_flush(rq->cmd_flags))
2709
- where = ELEVATOR_INSERT_FLUSH;
2710
-
2711
- add_acct_request(q, rq, where);
2712
- if (where == ELEVATOR_INSERT_FLUSH)
2713
- __blk_run_queue(q);
2714
- spin_unlock_irqrestore(q->queue_lock, flags);
2715
-
2716
- return BLK_STS_OK;
1218
+ return blk_mq_request_issue_directly(rq, true);
27171219 }
27181220 EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
27191221
....@@ -2758,16 +1260,30 @@
27581260 }
27591261 EXPORT_SYMBOL_GPL(blk_rq_err_bytes);
27601262
2761
-void blk_account_io_completion(struct request *req, unsigned int bytes)
1263
+static void update_io_ticks(struct hd_struct *part, unsigned long now, bool end)
27621264 {
2763
- if (blk_do_io_stat(req)) {
1265
+ unsigned long stamp;
1266
+again:
1267
+ stamp = READ_ONCE(part->stamp);
1268
+ if (unlikely(stamp != now)) {
1269
+ if (likely(cmpxchg(&part->stamp, stamp, now) == stamp))
1270
+ __part_stat_add(part, io_ticks, end ? now - stamp : 1);
1271
+ }
1272
+ if (part->partno) {
1273
+ part = &part_to_disk(part)->part0;
1274
+ goto again;
1275
+ }
1276
+}
1277
+
1278
+static void blk_account_io_completion(struct request *req, unsigned int bytes)
1279
+{
1280
+ if (req->part && blk_do_io_stat(req)) {
27641281 const int sgrp = op_stat_group(req_op(req));
27651282 struct hd_struct *part;
2766
- int cpu;
27671283
2768
- cpu = part_stat_lock();
1284
+ part_stat_lock();
27691285 part = req->part;
2770
- part_stat_add(cpu, part, sectors[sgrp], bytes >> 9);
1286
+ part_stat_add(part, sectors[sgrp], bytes >> 9);
27711287 part_stat_unlock();
27721288 }
27731289 }
....@@ -2779,299 +1295,95 @@
27791295 * normal IO on queueing nor completion. Accounting the
27801296 * containing request is enough.
27811297 */
2782
- if (blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) {
1298
+ if (req->part && blk_do_io_stat(req) &&
1299
+ !(req->rq_flags & RQF_FLUSH_SEQ)) {
27831300 const int sgrp = op_stat_group(req_op(req));
27841301 struct hd_struct *part;
2785
- int cpu;
27861302
2787
- cpu = part_stat_lock();
1303
+ part_stat_lock();
27881304 part = req->part;
27891305
2790
- part_stat_inc(cpu, part, ios[sgrp]);
2791
- part_stat_add(cpu, part, nsecs[sgrp], now - req->start_time_ns);
2792
- part_round_stats(req->q, cpu, part);
2793
- part_dec_in_flight(req->q, part, rq_data_dir(req));
1306
+ update_io_ticks(part, jiffies, true);
1307
+ part_stat_inc(part, ios[sgrp]);
1308
+ part_stat_add(part, nsecs[sgrp], now - req->start_time_ns);
1309
+ part_stat_unlock();
27941310
27951311 hd_struct_put(part);
2796
- part_stat_unlock();
27971312 }
27981313 }
27991314
2800
-#ifdef CONFIG_PM
2801
-/*
2802
- * Don't process normal requests when queue is suspended
2803
- * or in the process of suspending/resuming
2804
- */
2805
-static bool blk_pm_allow_request(struct request *rq)
1315
+void blk_account_io_start(struct request *rq)
28061316 {
2807
- switch (rq->q->rpm_status) {
2808
- case RPM_RESUMING:
2809
- case RPM_SUSPENDING:
2810
- return rq->rq_flags & RQF_PM;
2811
- case RPM_SUSPENDED:
2812
- return false;
2813
- default:
2814
- return true;
2815
- }
2816
-}
2817
-#else
2818
-static bool blk_pm_allow_request(struct request *rq)
2819
-{
2820
- return true;
2821
-}
2822
-#endif
2823
-
2824
-void blk_account_io_start(struct request *rq, bool new_io)
2825
-{
2826
- struct hd_struct *part;
2827
- int rw = rq_data_dir(rq);
2828
- int cpu;
2829
-
28301317 if (!blk_do_io_stat(rq))
28311318 return;
28321319
2833
- cpu = part_stat_lock();
1320
+ rq->part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
28341321
2835
- if (!new_io) {
2836
- part = rq->part;
2837
- part_stat_inc(cpu, part, merges[rw]);
2838
- } else {
2839
- part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
2840
- if (!hd_struct_try_get(part)) {
2841
- /*
2842
- * The partition is already being removed,
2843
- * the request will be accounted on the disk only
2844
- *
2845
- * We take a reference on disk->part0 although that
2846
- * partition will never be deleted, so we can treat
2847
- * it as any other partition.
2848
- */
2849
- part = &rq->rq_disk->part0;
2850
- hd_struct_get(part);
2851
- }
2852
- part_round_stats(rq->q, cpu, part);
2853
- part_inc_in_flight(rq->q, part, rw);
2854
- rq->part = part;
2855
- }
2856
-
1322
+ part_stat_lock();
1323
+ update_io_ticks(rq->part, jiffies, false);
28571324 part_stat_unlock();
28581325 }
28591326
2860
-static struct request *elv_next_request(struct request_queue *q)
1327
+static unsigned long __part_start_io_acct(struct hd_struct *part,
1328
+ unsigned int sectors, unsigned int op)
28611329 {
2862
- struct request *rq;
2863
- struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL);
1330
+ const int sgrp = op_stat_group(op);
1331
+ unsigned long now = READ_ONCE(jiffies);
28641332
2865
- WARN_ON_ONCE(q->mq_ops);
1333
+ part_stat_lock();
1334
+ update_io_ticks(part, now, false);
1335
+ part_stat_inc(part, ios[sgrp]);
1336
+ part_stat_add(part, sectors[sgrp], sectors);
1337
+ part_stat_local_inc(part, in_flight[op_is_write(op)]);
1338
+ part_stat_unlock();
28661339
2867
- while (1) {
2868
- list_for_each_entry(rq, &q->queue_head, queuelist) {
2869
- if (blk_pm_allow_request(rq))
2870
- return rq;
2871
-
2872
- if (rq->rq_flags & RQF_SOFTBARRIER)
2873
- break;
2874
- }
2875
-
2876
- /*
2877
- * Flush request is running and flush request isn't queueable
2878
- * in the drive, we can hold the queue till flush request is
2879
- * finished. Even we don't do this, driver can't dispatch next
2880
- * requests and will requeue them. And this can improve
2881
- * throughput too. For example, we have request flush1, write1,
2882
- * flush 2. flush1 is dispatched, then queue is hold, write1
2883
- * isn't inserted to queue. After flush1 is finished, flush2
2884
- * will be dispatched. Since disk cache is already clean,
2885
- * flush2 will be finished very soon, so looks like flush2 is
2886
- * folded to flush1.
2887
- * Since the queue is hold, a flag is set to indicate the queue
2888
- * should be restarted later. Please see flush_end_io() for
2889
- * details.
2890
- */
2891
- if (fq->flush_pending_idx != fq->flush_running_idx &&
2892
- !queue_flush_queueable(q)) {
2893
- fq->flush_queue_delayed = 1;
2894
- return NULL;
2895
- }
2896
- if (unlikely(blk_queue_bypass(q)) ||
2897
- !q->elevator->type->ops.sq.elevator_dispatch_fn(q, 0))
2898
- return NULL;
2899
- }
1340
+ return now;
29001341 }
29011342
2902
-/**
2903
- * blk_peek_request - peek at the top of a request queue
2904
- * @q: request queue to peek at
2905
- *
2906
- * Description:
2907
- * Return the request at the top of @q. The returned request
2908
- * should be started using blk_start_request() before LLD starts
2909
- * processing it.
2910
- *
2911
- * Return:
2912
- * Pointer to the request at the top of @q if available. Null
2913
- * otherwise.
2914
- */
2915
-struct request *blk_peek_request(struct request_queue *q)
1343
+unsigned long part_start_io_acct(struct gendisk *disk, struct hd_struct **part,
1344
+ struct bio *bio)
29161345 {
2917
- struct request *rq;
2918
- int ret;
1346
+ *part = disk_map_sector_rcu(disk, bio->bi_iter.bi_sector);
29191347
2920
- lockdep_assert_held(q->queue_lock);
2921
- WARN_ON_ONCE(q->mq_ops);
2922
-
2923
- while ((rq = elv_next_request(q)) != NULL) {
2924
- if (!(rq->rq_flags & RQF_STARTED)) {
2925
- /*
2926
- * This is the first time the device driver
2927
- * sees this request (possibly after
2928
- * requeueing). Notify IO scheduler.
2929
- */
2930
- if (rq->rq_flags & RQF_SORTED)
2931
- elv_activate_rq(q, rq);
2932
-
2933
- /*
2934
- * just mark as started even if we don't start
2935
- * it, a request that has been delayed should
2936
- * not be passed by new incoming requests
2937
- */
2938
- rq->rq_flags |= RQF_STARTED;
2939
- trace_block_rq_issue(q, rq);
2940
- }
2941
-
2942
- if (!q->boundary_rq || q->boundary_rq == rq) {
2943
- q->end_sector = rq_end_sector(rq);
2944
- q->boundary_rq = NULL;
2945
- }
2946
-
2947
- if (rq->rq_flags & RQF_DONTPREP)
2948
- break;
2949
-
2950
- if (q->dma_drain_size && blk_rq_bytes(rq)) {
2951
- /*
2952
- * make sure space for the drain appears we
2953
- * know we can do this because max_hw_segments
2954
- * has been adjusted to be one fewer than the
2955
- * device can handle
2956
- */
2957
- rq->nr_phys_segments++;
2958
- }
2959
-
2960
- if (!q->prep_rq_fn)
2961
- break;
2962
-
2963
- ret = q->prep_rq_fn(q, rq);
2964
- if (ret == BLKPREP_OK) {
2965
- break;
2966
- } else if (ret == BLKPREP_DEFER) {
2967
- /*
2968
- * the request may have been (partially) prepped.
2969
- * we need to keep this request in the front to
2970
- * avoid resource deadlock. RQF_STARTED will
2971
- * prevent other fs requests from passing this one.
2972
- */
2973
- if (q->dma_drain_size && blk_rq_bytes(rq) &&
2974
- !(rq->rq_flags & RQF_DONTPREP)) {
2975
- /*
2976
- * remove the space for the drain we added
2977
- * so that we don't add it again
2978
- */
2979
- --rq->nr_phys_segments;
2980
- }
2981
-
2982
- rq = NULL;
2983
- break;
2984
- } else if (ret == BLKPREP_KILL || ret == BLKPREP_INVALID) {
2985
- rq->rq_flags |= RQF_QUIET;
2986
- /*
2987
- * Mark this request as started so we don't trigger
2988
- * any debug logic in the end I/O path.
2989
- */
2990
- blk_start_request(rq);
2991
- __blk_end_request_all(rq, ret == BLKPREP_INVALID ?
2992
- BLK_STS_TARGET : BLK_STS_IOERR);
2993
- } else {
2994
- printk(KERN_ERR "%s: bad return=%d\n", __func__, ret);
2995
- break;
2996
- }
2997
- }
2998
-
2999
- return rq;
1348
+ return __part_start_io_acct(*part, bio_sectors(bio), bio_op(bio));
30001349 }
3001
-EXPORT_SYMBOL(blk_peek_request);
1350
+EXPORT_SYMBOL_GPL(part_start_io_acct);
30021351
3003
-static void blk_dequeue_request(struct request *rq)
1352
+unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors,
1353
+ unsigned int op)
30041354 {
3005
- struct request_queue *q = rq->q;
1355
+ return __part_start_io_acct(&disk->part0, sectors, op);
1356
+}
1357
+EXPORT_SYMBOL(disk_start_io_acct);
30061358
3007
- BUG_ON(list_empty(&rq->queuelist));
3008
- BUG_ON(ELV_ON_HASH(rq));
1359
+static void __part_end_io_acct(struct hd_struct *part, unsigned int op,
1360
+ unsigned long start_time)
1361
+{
1362
+ const int sgrp = op_stat_group(op);
1363
+ unsigned long now = READ_ONCE(jiffies);
1364
+ unsigned long duration = now - start_time;
30091365
3010
- list_del_init(&rq->queuelist);
3011
-
3012
- /*
3013
- * the time frame between a request being removed from the lists
3014
- * and to it is freed is accounted as io that is in progress at
3015
- * the driver side.
3016
- */
3017
- if (blk_account_rq(rq))
3018
- q->in_flight[rq_is_sync(rq)]++;
1366
+ part_stat_lock();
1367
+ update_io_ticks(part, now, true);
1368
+ part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration));
1369
+ part_stat_local_dec(part, in_flight[op_is_write(op)]);
1370
+ part_stat_unlock();
30191371 }
30201372
3021
-/**
3022
- * blk_start_request - start request processing on the driver
3023
- * @req: request to dequeue
3024
- *
3025
- * Description:
3026
- * Dequeue @req and start timeout timer on it. This hands off the
3027
- * request to the driver.
3028
- */
3029
-void blk_start_request(struct request *req)
1373
+void part_end_io_acct(struct hd_struct *part, struct bio *bio,
1374
+ unsigned long start_time)
30301375 {
3031
- lockdep_assert_held(req->q->queue_lock);
3032
- WARN_ON_ONCE(req->q->mq_ops);
3033
-
3034
- blk_dequeue_request(req);
3035
-
3036
- if (test_bit(QUEUE_FLAG_STATS, &req->q->queue_flags)) {
3037
- req->io_start_time_ns = ktime_get_ns();
3038
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
3039
- req->throtl_size = blk_rq_sectors(req);
3040
-#endif
3041
- req->rq_flags |= RQF_STATS;
3042
- rq_qos_issue(req->q, req);
3043
- }
3044
-
3045
- BUG_ON(blk_rq_is_complete(req));
3046
- blk_add_timer(req);
1376
+ __part_end_io_acct(part, bio_op(bio), start_time);
1377
+ hd_struct_put(part);
30471378 }
3048
-EXPORT_SYMBOL(blk_start_request);
1379
+EXPORT_SYMBOL_GPL(part_end_io_acct);
30491380
3050
-/**
3051
- * blk_fetch_request - fetch a request from a request queue
3052
- * @q: request queue to fetch a request from
3053
- *
3054
- * Description:
3055
- * Return the request at the top of @q. The request is started on
3056
- * return and LLD can start processing it immediately.
3057
- *
3058
- * Return:
3059
- * Pointer to the request at the top of @q if available. Null
3060
- * otherwise.
3061
- */
3062
-struct request *blk_fetch_request(struct request_queue *q)
1381
+void disk_end_io_acct(struct gendisk *disk, unsigned int op,
1382
+ unsigned long start_time)
30631383 {
3064
- struct request *rq;
3065
-
3066
- lockdep_assert_held(q->queue_lock);
3067
- WARN_ON_ONCE(q->mq_ops);
3068
-
3069
- rq = blk_peek_request(q);
3070
- if (rq)
3071
- blk_start_request(rq);
3072
- return rq;
1384
+ __part_end_io_acct(&disk->part0, op, start_time);
30731385 }
3074
-EXPORT_SYMBOL(blk_fetch_request);
1386
+EXPORT_SYMBOL(disk_end_io_acct);
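/*
 * Editor's illustrative sketch (not part of this change): a bio-based
 * driver bracketing its ->submit_bio() work with the accounting helpers
 * above.  "example_submit_bio" is a hypothetical method and the actual
 * processing of the bio is elided.
 */
static blk_qc_t example_submit_bio(struct bio *bio)
{
	struct gendisk *disk = bio->bi_disk;
	unsigned long start;

	start = disk_start_io_acct(disk, bio_sectors(bio), bio_op(bio));

	/* ... perform or queue the actual I/O here ... */

	disk_end_io_acct(disk, bio_op(bio), start);
	bio_endio(bio);
	return BLK_QC_T_NONE;
}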
30751387
30761388 /*
30771389 * Steal bios from a request and add them to a bio list.
....@@ -3107,7 +1419,7 @@
31071419 *
31081420 * This special helper function is only for request stacking drivers
31091421 * (e.g. request-based dm) so that they can handle partial completion.
3110
- * Actual device drivers should use blk_end_request instead.
1422
+ * Actual device drivers should use blk_mq_end_request instead.
31111423 *
31121424 * Passing the result of blk_rq_bytes() as @nr_bytes guarantees
31131425 * %false return from this function.
....@@ -3130,9 +1442,22 @@
31301442 if (!req->bio)
31311443 return false;
31321444
1445
+#ifdef CONFIG_BLK_DEV_INTEGRITY
1446
+ if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ &&
1447
+ error == BLK_STS_OK)
1448
+ req->q->integrity.profile->complete_fn(req, nr_bytes);
1449
+#endif
1450
+
1451
+ /*
1452
+ * Upper layers may call blk_crypto_evict_key() anytime after the last
1453
+ * bio_endio(). Therefore, the keyslot must be released before that.
1454
+ */
1455
+ if (blk_crypto_rq_has_keyslot(req) && nr_bytes >= blk_rq_bytes(req))
1456
+ __blk_crypto_rq_put_keyslot(req);
1457
+
31331458 if (unlikely(error && !blk_rq_is_passthrough(req) &&
31341459 !(req->rq_flags & RQF_QUIET)))
3135
- print_req_error(req, error);
1460
+ print_req_error(req, error, __func__);
31361461
31371462 blk_account_io_completion(req, nr_bytes);
31381463
....@@ -3191,276 +1516,12 @@
31911516 }
31921517
31931518 /* recalculate the number of segments */
3194
- blk_recalc_rq_segments(req);
1519
+ req->nr_phys_segments = blk_recalc_rq_segments(req);
31951520 }
31961521
31971522 return true;
31981523 }
31991524 EXPORT_SYMBOL_GPL(blk_update_request);
3200
-
3201
-static bool blk_update_bidi_request(struct request *rq, blk_status_t error,
3202
- unsigned int nr_bytes,
3203
- unsigned int bidi_bytes)
3204
-{
3205
- if (blk_update_request(rq, error, nr_bytes))
3206
- return true;
3207
-
3208
- /* Bidi request must be completed as a whole */
3209
- if (unlikely(blk_bidi_rq(rq)) &&
3210
- blk_update_request(rq->next_rq, error, bidi_bytes))
3211
- return true;
3212
-
3213
- if (blk_queue_add_random(rq->q))
3214
- add_disk_randomness(rq->rq_disk);
3215
-
3216
- return false;
3217
-}
3218
-
3219
-/**
3220
- * blk_unprep_request - unprepare a request
3221
- * @req: the request
3222
- *
3223
- * This function makes a request ready for complete resubmission (or
3224
- * completion). It happens only after all error handling is complete,
3225
- * so represents the appropriate moment to deallocate any resources
3226
- * that were allocated to the request in the prep_rq_fn. The queue
3227
- * lock is held when calling this.
3228
- */
3229
-void blk_unprep_request(struct request *req)
3230
-{
3231
- struct request_queue *q = req->q;
3232
-
3233
- req->rq_flags &= ~RQF_DONTPREP;
3234
- if (q->unprep_rq_fn)
3235
- q->unprep_rq_fn(q, req);
3236
-}
3237
-EXPORT_SYMBOL_GPL(blk_unprep_request);
3238
-
3239
-void blk_finish_request(struct request *req, blk_status_t error)
3240
-{
3241
- struct request_queue *q = req->q;
3242
- u64 now = ktime_get_ns();
3243
-
3244
- lockdep_assert_held(req->q->queue_lock);
3245
- WARN_ON_ONCE(q->mq_ops);
3246
-
3247
- if (req->rq_flags & RQF_STATS)
3248
- blk_stat_add(req, now);
3249
-
3250
- if (req->rq_flags & RQF_QUEUED)
3251
- blk_queue_end_tag(q, req);
3252
-
3253
- BUG_ON(blk_queued_rq(req));
3254
-
3255
- if (unlikely(laptop_mode) && !blk_rq_is_passthrough(req))
3256
- laptop_io_completion(req->q->backing_dev_info);
3257
-
3258
- blk_delete_timer(req);
3259
-
3260
- if (req->rq_flags & RQF_DONTPREP)
3261
- blk_unprep_request(req);
3262
-
3263
- blk_account_io_done(req, now);
3264
-
3265
- if (req->end_io) {
3266
- rq_qos_done(q, req);
3267
- req->end_io(req, error);
3268
- } else {
3269
- if (blk_bidi_rq(req))
3270
- __blk_put_request(req->next_rq->q, req->next_rq);
3271
-
3272
- __blk_put_request(q, req);
3273
- }
3274
-}
3275
-EXPORT_SYMBOL(blk_finish_request);
3276
-
3277
-/**
3278
- * blk_end_bidi_request - Complete a bidi request
3279
- * @rq: the request to complete
3280
- * @error: block status code
3281
- * @nr_bytes: number of bytes to complete @rq
3282
- * @bidi_bytes: number of bytes to complete @rq->next_rq
- *
- * Description:
- *     Ends I/O on a number of bytes attached to @rq and @rq->next_rq.
- *     Drivers that supports bidi can safely call this member for any
- *     type of request, bidi or uni. In the later case @bidi_bytes is
- *     just ignored.
- *
- * Return:
- *     %false - we are done with this request
- *     %true  - still buffers pending for this request
- **/
-static bool blk_end_bidi_request(struct request *rq, blk_status_t error,
-                unsigned int nr_bytes, unsigned int bidi_bytes)
-{
-        struct request_queue *q = rq->q;
-        unsigned long flags;
-
-        WARN_ON_ONCE(q->mq_ops);
-
-        if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes))
-                return true;
-
-        spin_lock_irqsave(q->queue_lock, flags);
-        blk_finish_request(rq, error);
-        spin_unlock_irqrestore(q->queue_lock, flags);
-
-        return false;
-}
-
-/**
- * __blk_end_bidi_request - Complete a bidi request with queue lock held
- * @rq:         the request to complete
- * @error:      block status code
- * @nr_bytes:   number of bytes to complete @rq
- * @bidi_bytes: number of bytes to complete @rq->next_rq
- *
- * Description:
- *     Identical to blk_end_bidi_request() except that queue lock is
- *     assumed to be locked on entry and remains so on return.
- *
- * Return:
- *     %false - we are done with this request
- *     %true  - still buffers pending for this request
- **/
-static bool __blk_end_bidi_request(struct request *rq, blk_status_t error,
-                unsigned int nr_bytes, unsigned int bidi_bytes)
-{
-        lockdep_assert_held(rq->q->queue_lock);
-        WARN_ON_ONCE(rq->q->mq_ops);
-
-        if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes))
-                return true;
-
-        blk_finish_request(rq, error);
-
-        return false;
-}
-
-/**
- * blk_end_request - Helper function for drivers to complete the request.
- * @rq:       the request being processed
- * @error:    block status code
- * @nr_bytes: number of bytes to complete
- *
- * Description:
- *     Ends I/O on a number of bytes attached to @rq.
- *     If @rq has leftover, sets it up for the next range of segments.
- *
- * Return:
- *     %false - we are done with this request
- *     %true  - still buffers pending for this request
- **/
-bool blk_end_request(struct request *rq, blk_status_t error,
-                unsigned int nr_bytes)
-{
-        WARN_ON_ONCE(rq->q->mq_ops);
-        return blk_end_bidi_request(rq, error, nr_bytes, 0);
-}
-EXPORT_SYMBOL(blk_end_request);
-
-/**
- * blk_end_request_all - Helper function for drives to finish the request.
- * @rq: the request to finish
- * @error: block status code
- *
- * Description:
- *     Completely finish @rq.
- */
-void blk_end_request_all(struct request *rq, blk_status_t error)
-{
-        bool pending;
-        unsigned int bidi_bytes = 0;
-
-        if (unlikely(blk_bidi_rq(rq)))
-                bidi_bytes = blk_rq_bytes(rq->next_rq);
-
-        pending = blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes);
-        BUG_ON(pending);
-}
-EXPORT_SYMBOL(blk_end_request_all);
-
-/**
- * __blk_end_request - Helper function for drivers to complete the request.
- * @rq:       the request being processed
- * @error:    block status code
- * @nr_bytes: number of bytes to complete
- *
- * Description:
- *     Must be called with queue lock held unlike blk_end_request().
- *
- * Return:
- *     %false - we are done with this request
- *     %true  - still buffers pending for this request
- **/
-bool __blk_end_request(struct request *rq, blk_status_t error,
-                unsigned int nr_bytes)
-{
-        lockdep_assert_held(rq->q->queue_lock);
-        WARN_ON_ONCE(rq->q->mq_ops);
-
-        return __blk_end_bidi_request(rq, error, nr_bytes, 0);
-}
-EXPORT_SYMBOL(__blk_end_request);
-
-/**
- * __blk_end_request_all - Helper function for drives to finish the request.
- * @rq: the request to finish
- * @error: block status code
- *
- * Description:
- *     Completely finish @rq. Must be called with queue lock held.
- */
-void __blk_end_request_all(struct request *rq, blk_status_t error)
-{
-        bool pending;
-        unsigned int bidi_bytes = 0;
-
-        lockdep_assert_held(rq->q->queue_lock);
-        WARN_ON_ONCE(rq->q->mq_ops);
-
-        if (unlikely(blk_bidi_rq(rq)))
-                bidi_bytes = blk_rq_bytes(rq->next_rq);
-
-        pending = __blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes);
-        BUG_ON(pending);
-}
-EXPORT_SYMBOL(__blk_end_request_all);
-
-/**
- * __blk_end_request_cur - Helper function to finish the current request chunk.
- * @rq: the request to finish the current chunk for
- * @error: block status code
- *
- * Description:
- *     Complete the current consecutively mapped chunk from @rq. Must
- *     be called with queue lock held.
- *
- * Return:
- *     %false - we are done with this request
- *     %true  - still buffers pending for this request
- */
-bool __blk_end_request_cur(struct request *rq, blk_status_t error)
-{
-        return __blk_end_request(rq, error, blk_rq_cur_bytes(rq));
-}
-EXPORT_SYMBOL(__blk_end_request_cur);
-
-void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
-                struct bio *bio)
-{
-        if (bio_has_data(bio))
-                rq->nr_phys_segments = bio_phys_segments(q, bio);
-        else if (bio_op(bio) == REQ_OP_DISCARD)
-                rq->nr_phys_segments = 1;
-
-        rq->__data_len = bio->bi_iter.bi_size;
-        rq->bio = rq->biotail = bio;
-
-        if (bio->bi_disk)
-                rq->rq_disk = bio->bi_disk;
-}
 
 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
 /**
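/*
 * Editor's sketch (not part of the patch): with the legacy
 * blk_end_request()/__blk_end_request_all() helpers removed above,
 * request completion goes through the blk-mq helpers instead.  A
 * minimal driver completion path might look like this;
 * "my_drv_complete_rq" is a hypothetical name and error handling is
 * trimmed.
 */
#include <linux/blk-mq.h>

static void my_drv_complete_rq(struct request *rq, blk_status_t status)
{
        /*
         * blk_update_request() handles partial progress: it returns
         * true while segments are still pending on the request.
         */
        if (blk_update_request(rq, status, blk_rq_bytes(rq)))
                return;

        /* Everything transferred: finish the request. */
        __blk_mq_end_request(rq, status);
}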
@@ -3502,8 +1563,8 @@
  */
 int blk_lld_busy(struct request_queue *q)
 {
-        if (q->lld_busy_fn)
-                return q->lld_busy_fn(q);
+        if (queue_is_mq(q) && q->mq_ops->busy)
+                return q->mq_ops->busy(q);
 
         return 0;
 }
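/*
 * Editor's sketch (not part of the patch): blk_lld_busy() now asks the
 * driver through the optional ->busy hook in struct blk_mq_ops.  A
 * driver that can report "device busy" might wire it up like this;
 * "struct my_device" and its field are hypothetical.
 */
#include <linux/blk-mq.h>

struct my_device {
        atomic_t in_flight;             /* commands issued to hardware */
};

static bool my_mq_busy(struct request_queue *q)
{
        struct my_device *dev = q->queuedata;

        return atomic_read(&dev->in_flight) > 0;
}

/* Installed via .busy = my_mq_busy in the driver's blk_mq_ops. */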
@@ -3528,24 +1589,6 @@
 }
 EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);
 
-/*
- * Copy attributes of the original request to the clone request.
- * The actual data parts (e.g. ->cmd, ->sense) are not copied.
- */
-static void __blk_rq_prep_clone(struct request *dst, struct request *src)
-{
-        dst->cpu = src->cpu;
-        dst->__sector = blk_rq_pos(src);
-        dst->__data_len = blk_rq_bytes(src);
-        if (src->rq_flags & RQF_SPECIAL_PAYLOAD) {
-                dst->rq_flags |= RQF_SPECIAL_PAYLOAD;
-                dst->special_vec = src->special_vec;
-        }
-        dst->nr_phys_segments = src->nr_phys_segments;
-        dst->ioprio = src->ioprio;
-        dst->extra_len = src->extra_len;
-}
-
 /**
  * blk_rq_prep_clone - Helper function to setup clone request
  * @rq: the request to be setup
@@ -3558,8 +1601,6 @@
  *
  * Description:
  *     Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq.
- *     The actual data parts of @rq_src (e.g. ->cmd, ->sense)
- *     are not copied, and copying such parts is the caller's responsibility.
  *     Also, pages which the original bios are pointing to are not copied
  *     and the cloned bios just point same pages.
  *     So cloned bios must be completed before original bios, which means
@@ -3586,11 +1627,24 @@
                 if (rq->bio) {
                         rq->biotail->bi_next = bio;
                         rq->biotail = bio;
-                } else
+                } else {
                         rq->bio = rq->biotail = bio;
+                }
+                bio = NULL;
         }
 
-        __blk_rq_prep_clone(rq, rq_src);
+        /* Copy attributes of the original request to the clone request. */
+        rq->__sector = blk_rq_pos(rq_src);
+        rq->__data_len = blk_rq_bytes(rq_src);
+        if (rq_src->rq_flags & RQF_SPECIAL_PAYLOAD) {
+                rq->rq_flags |= RQF_SPECIAL_PAYLOAD;
+                rq->special_vec = rq_src->special_vec;
+        }
+        rq->nr_phys_segments = rq_src->nr_phys_segments;
+        rq->ioprio = rq_src->ioprio;
+
+        if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0)
+                goto free_and_out;
 
         return 0;
 
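/*
 * Editor's sketch (not part of the patch): a request-stacking driver
 * (dm-rq style) cloning a request before handing it to a lower device.
 * "my_setup_clone" and "my_bs" are hypothetical; error handling is
 * trimmed.  blk_rq_unprep_clone() releases the cloned bios again.
 */
static int my_setup_clone(struct request *clone, struct request *rq_src,
                          struct bio_set *my_bs)
{
        int ret;

        /* Clone the bios of @rq_src into @clone and copy its attributes. */
        ret = blk_rq_prep_clone(clone, rq_src, my_bs, GFP_ATOMIC,
                                NULL, NULL);
        if (ret)
                return ret;

        clone->end_io = NULL;           /* completion handled by the caller */
        clone->end_io_data = NULL;
        return 0;
}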
@@ -3609,12 +1663,6 @@
 }
 EXPORT_SYMBOL(kblockd_schedule_work);
 
-int kblockd_schedule_work_on(int cpu, struct work_struct *work)
-{
-        return queue_work_on(cpu, kblockd_workqueue, work);
-}
-EXPORT_SYMBOL(kblockd_schedule_work_on);
-
 int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork,
                                 unsigned long delay)
 {
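/*
 * Editor's sketch (not part of the patch): with kblockd_schedule_work_on()
 * removed, deferred per-CPU block-layer work is queued with
 * kblockd_mod_delayed_work_on().  "my_work_fn" and "my_dwork" are
 * hypothetical; <linux/blkdev.h> and <linux/workqueue.h> are assumed.
 */
static void my_work_fn(struct work_struct *work)
{
        /* e.g. re-run completions or kick the hardware queue */
}

static DECLARE_DELAYED_WORK(my_dwork, my_work_fn);

static void my_kick_later(int cpu)
{
        /* run my_work_fn on @cpu on the kblockd workqueue after ~1ms */
        kblockd_mod_delayed_work_on(cpu, &my_dwork, msecs_to_jiffies(1));
}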
@@ -3627,6 +1675,15 @@
  * @plug:	The &struct blk_plug that needs to be initialized
  *
  * Description:
+ *   blk_start_plug() indicates to the block layer an intent by the caller
+ *   to submit multiple I/O requests in a batch.  The block layer may use
+ *   this hint to defer submitting I/Os from the caller until blk_finish_plug()
+ *   is called.  However, the block layer may choose to submit requests
+ *   before a call to blk_finish_plug() if the number of queued I/Os
+ *   exceeds %BLK_MAX_REQUEST_COUNT, or if the size of the I/O is larger than
+ *   %BLK_PLUG_FLUSH_SIZE.  The queued I/Os may also be submitted early if
+ *   the task schedules (see below).
+ *
  * Tracking blk_plug inside the task_struct will help with auto-flushing the
  * pending I/O should the task end up blocking between blk_start_plug() and
  * blk_finish_plug(). This is important from a performance perspective, but
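/*
 * Editor's sketch (not part of the patch): typical use of the plugging
 * API documented above.  The bios are assumed to be allocated and set
 * up already; "my_submit_batch" is a hypothetical name.
 */
static void my_submit_batch(struct bio **bios, int nr)
{
        struct blk_plug plug;
        int i;

        blk_start_plug(&plug);          /* start batching for this task */
        for (i = 0; i < nr; i++)
                submit_bio(bios[i]);
        blk_finish_plug(&plug);         /* flush whatever is still plugged */
}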
@@ -3646,9 +1703,12 @@
         if (tsk->plug)
                 return;
 
-        INIT_LIST_HEAD(&plug->list);
         INIT_LIST_HEAD(&plug->mq_list);
         INIT_LIST_HEAD(&plug->cb_list);
+        plug->rq_count = 0;
+        plug->multiple_queues = false;
+        plug->nowait = false;
+
         /*
          * Store ordering should not be needed here, since a potential
          * preempt will imply a full memory barrier
@@ -3656,36 +1716,6 @@
         tsk->plug = plug;
 }
 EXPORT_SYMBOL(blk_start_plug);
-
-static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
-{
-        struct request *rqa = container_of(a, struct request, queuelist);
-        struct request *rqb = container_of(b, struct request, queuelist);
-
-        return !(rqa->q < rqb->q ||
-                (rqa->q == rqb->q && blk_rq_pos(rqa) < blk_rq_pos(rqb)));
-}
-
-/*
- * If 'from_schedule' is true, then postpone the dispatch of requests
- * until a safe kblockd context. We due this to avoid accidental big
- * additional stack usage in driver dispatch, in places where the originally
- * plugger did not intend it.
- */
-static void queue_unplugged(struct request_queue *q, unsigned int depth,
-                bool from_schedule)
-        __releases(q->queue_lock)
-{
-        lockdep_assert_held(q->queue_lock);
-
-        trace_block_unplug(q, depth, !from_schedule);
-
-        if (from_schedule)
-                blk_run_queue_async(q);
-        else
-                __blk_run_queue(q);
-        spin_unlock_irq(q->queue_lock);
-}
 
 static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule)
 {
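/*
 * Editor's sketch (not part of the patch): the callbacks run by
 * flush_plug_callbacks() are registered against the current plug with
 * blk_check_plugged(), the pattern used by stacking drivers such as
 * md/raid.  "my_unplug", "my_try_plug" and "my_data" are hypothetical.
 */
static void my_unplug(struct blk_plug_cb *cb, bool from_schedule)
{
        /* submit whatever was batched while the task was plugged */
        kfree(cb);                      /* the callback owns the cb */
}

static bool my_try_plug(void *my_data)
{
        /* Returns false if no plug is active and I/O must go out now. */
        return blk_check_plugged(my_unplug, my_data,
                                 sizeof(struct blk_plug_cb)) != NULL;
}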
@@ -3731,67 +1761,22 @@
 
 void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 {
-        struct request_queue *q;
-        struct request *rq;
-        LIST_HEAD(list);
-        unsigned int depth;
-
         flush_plug_callbacks(plug, from_schedule);
 
         if (!list_empty(&plug->mq_list))
                 blk_mq_flush_plug_list(plug, from_schedule);
-
-        if (list_empty(&plug->list))
-                return;
-
-        list_splice_init(&plug->list, &list);
-
-        list_sort(NULL, &list, plug_rq_cmp);
-
-        q = NULL;
-        depth = 0;
-
-        while (!list_empty(&list)) {
-                rq = list_entry_rq(list.next);
-                list_del_init(&rq->queuelist);
-                BUG_ON(!rq->q);
-                if (rq->q != q) {
-                        /*
-                         * This drops the queue lock
-                         */
-                        if (q)
-                                queue_unplugged(q, depth, from_schedule);
-                        q = rq->q;
-                        depth = 0;
-                        spin_lock_irq(q->queue_lock);
-                }
-
-                /*
-                 * Short-circuit if @q is dead
-                 */
-                if (unlikely(blk_queue_dying(q))) {
-                        __blk_end_request_all(rq, BLK_STS_IOERR);
-                        continue;
-                }
-
-                /*
-                 * rq is already accounted, so use raw insert
-                 */
-                if (op_is_flush(rq->cmd_flags))
-                        __elv_add_request(q, rq, ELEVATOR_INSERT_FLUSH);
-                else
-                        __elv_add_request(q, rq, ELEVATOR_INSERT_SORT_MERGE);
-
-                depth++;
-        }
-
-        /*
-         * This drops the queue lock
-         */
-        if (q)
-                queue_unplugged(q, depth, from_schedule);
 }
 
+/**
+ * blk_finish_plug - mark the end of a batch of submitted I/O
+ * @plug:	The &struct blk_plug passed to blk_start_plug()
+ *
+ * Description:
+ * Indicate that a batch of I/O submissions is complete.  This function
+ * must be paired with an initial call to blk_start_plug().  The intent
+ * is to allow the block layer to optimize I/O submission.  See the
+ * documentation for blk_start_plug() for more information.
+ */
 void blk_finish_plug(struct blk_plug *plug)
 {
         if (plug != current->plug)
@@ -3802,198 +1787,25 @@
 }
 EXPORT_SYMBOL(blk_finish_plug);
 
-#ifdef CONFIG_PM
-/**
- * blk_pm_runtime_init - Block layer runtime PM initialization routine
- * @q: the queue of the device
- * @dev: the device the queue belongs to
- *
- * Description:
- *    Initialize runtime-PM-related fields for @q and start auto suspend for
- *    @dev. Drivers that want to take advantage of request-based runtime PM
- *    should call this function after @dev has been initialized, and its
- *    request queue @q has been allocated, and runtime PM for it can not happen
- *    yet(either due to disabled/forbidden or its usage_count > 0). In most
- *    cases, driver should call this function before any I/O has taken place.
- *
- *    This function takes care of setting up using auto suspend for the device,
- *    the autosuspend delay is set to -1 to make runtime suspend impossible
- *    until an updated value is either set by user or by driver. Drivers do
- *    not need to touch other autosuspend settings.
- *
- *    The block layer runtime PM is request based, so only works for drivers
- *    that use request as their IO unit instead of those directly use bio's.
- */
-void blk_pm_runtime_init(struct request_queue *q, struct device *dev)
+void blk_io_schedule(void)
 {
-        /* Don't enable runtime PM for blk-mq until it is ready */
-        if (q->mq_ops) {
-                pm_runtime_disable(dev);
-                return;
-        }
+        /* Prevent hang_check timer from firing at us during very long I/O */
+        unsigned long timeout = sysctl_hung_task_timeout_secs * HZ / 2;
 
-        q->dev = dev;
-        q->rpm_status = RPM_ACTIVE;
-        pm_runtime_set_autosuspend_delay(q->dev, -1);
-        pm_runtime_use_autosuspend(q->dev);
+        if (timeout)
+                io_schedule_timeout(timeout);
+        else
+                io_schedule();
 }
-EXPORT_SYMBOL(blk_pm_runtime_init);
-
-/**
- * blk_pre_runtime_suspend - Pre runtime suspend check
- * @q: the queue of the device
- *
- * Description:
- *    This function will check if runtime suspend is allowed for the device
- *    by examining if there are any requests pending in the queue. If there
- *    are requests pending, the device can not be runtime suspended; otherwise,
- *    the queue's status will be updated to SUSPENDING and the driver can
- *    proceed to suspend the device.
- *
- *    For the not allowed case, we mark last busy for the device so that
- *    runtime PM core will try to autosuspend it some time later.
- *
- *    This function should be called near the start of the device's
- *    runtime_suspend callback.
- *
- * Return:
- *    0         - OK to runtime suspend the device
- *    -EBUSY    - Device should not be runtime suspended
- */
-int blk_pre_runtime_suspend(struct request_queue *q)
-{
-        int ret = 0;
-
-        if (!q->dev)
-                return ret;
-
-        spin_lock_irq(q->queue_lock);
-        if (q->nr_pending) {
-                ret = -EBUSY;
-                pm_runtime_mark_last_busy(q->dev);
-        } else {
-                q->rpm_status = RPM_SUSPENDING;
-        }
-        spin_unlock_irq(q->queue_lock);
-        return ret;
-}
-EXPORT_SYMBOL(blk_pre_runtime_suspend);
-
-/**
- * blk_post_runtime_suspend - Post runtime suspend processing
- * @q: the queue of the device
- * @err: return value of the device's runtime_suspend function
- *
- * Description:
- *    Update the queue's runtime status according to the return value of the
- *    device's runtime suspend function and mark last busy for the device so
- *    that PM core will try to auto suspend the device at a later time.
- *
- *    This function should be called near the end of the device's
- *    runtime_suspend callback.
- */
-void blk_post_runtime_suspend(struct request_queue *q, int err)
-{
-        if (!q->dev)
-                return;
-
-        spin_lock_irq(q->queue_lock);
-        if (!err) {
-                q->rpm_status = RPM_SUSPENDED;
-        } else {
-                q->rpm_status = RPM_ACTIVE;
-                pm_runtime_mark_last_busy(q->dev);
-        }
-        spin_unlock_irq(q->queue_lock);
-}
-EXPORT_SYMBOL(blk_post_runtime_suspend);
-
-/**
- * blk_pre_runtime_resume - Pre runtime resume processing
- * @q: the queue of the device
- *
- * Description:
- *    Update the queue's runtime status to RESUMING in preparation for the
- *    runtime resume of the device.
- *
- *    This function should be called near the start of the device's
- *    runtime_resume callback.
- */
-void blk_pre_runtime_resume(struct request_queue *q)
-{
-        if (!q->dev)
-                return;
-
-        spin_lock_irq(q->queue_lock);
-        q->rpm_status = RPM_RESUMING;
-        spin_unlock_irq(q->queue_lock);
-}
-EXPORT_SYMBOL(blk_pre_runtime_resume);
-
-/**
- * blk_post_runtime_resume - Post runtime resume processing
- * @q: the queue of the device
- * @err: return value of the device's runtime_resume function
- *
- * Description:
- *    Update the queue's runtime status according to the return value of the
- *    device's runtime_resume function. If it is successfully resumed, process
- *    the requests that are queued into the device's queue when it is resuming
- *    and then mark last busy and initiate autosuspend for it.
- *
- *    This function should be called near the end of the device's
- *    runtime_resume callback.
- */
-void blk_post_runtime_resume(struct request_queue *q, int err)
-{
-        if (!q->dev)
-                return;
-
-        spin_lock_irq(q->queue_lock);
-        if (!err) {
-                q->rpm_status = RPM_ACTIVE;
-                __blk_run_queue(q);
-                pm_runtime_mark_last_busy(q->dev);
-                pm_request_autosuspend(q->dev);
-        } else {
-                q->rpm_status = RPM_SUSPENDED;
-        }
-        spin_unlock_irq(q->queue_lock);
-}
-EXPORT_SYMBOL(blk_post_runtime_resume);
-
-/**
- * blk_set_runtime_active - Force runtime status of the queue to be active
- * @q: the queue of the device
- *
- * If the device is left runtime suspended during system suspend the resume
- * hook typically resumes the device and corrects runtime status
- * accordingly. However, that does not affect the queue runtime PM status
- * which is still "suspended". This prevents processing requests from the
- * queue.
- *
- * This function can be used in driver's resume hook to correct queue
- * runtime PM status and re-enable peeking requests from the queue. It
- * should be called before first request is added to the queue.
- */
-void blk_set_runtime_active(struct request_queue *q)
-{
-        spin_lock_irq(q->queue_lock);
-        q->rpm_status = RPM_ACTIVE;
-        pm_runtime_mark_last_busy(q->dev);
-        pm_request_autosuspend(q->dev);
-        spin_unlock_irq(q->queue_lock);
-}
-EXPORT_SYMBOL(blk_set_runtime_active);
-#endif
+EXPORT_SYMBOL_GPL(blk_io_schedule);
 
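/*
 * Editor's sketch (not part of the patch): blk_io_schedule() is meant
 * for callers that sleep waiting for I/O and do not want to trip the
 * hung-task detector on very long waits, e.g. a polling wait loop in
 * the style of __blkdev_direct_IO().  "my_done" is hypothetical.
 */
static void my_wait_for_io(atomic_t *my_done)
{
        for (;;) {
                set_current_state(TASK_UNINTERRUPTIBLE);
                if (atomic_read(my_done))
                        break;
                blk_io_schedule();      /* sleeps at most half the hung-task timeout */
        }
        __set_current_state(TASK_RUNNING);
}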
 int __init blk_dev_init(void)
 {
         BUILD_BUG_ON(REQ_OP_LAST >= (1 << REQ_OP_BITS));
         BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
-                        FIELD_SIZEOF(struct request, cmd_flags));
+                        sizeof_field(struct request, cmd_flags));
         BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
-                        FIELD_SIZEOF(struct bio, bi_opf));
+                        sizeof_field(struct bio, bi_opf));
 
         /* used for unplugging and affects IO latency/throughput - HIGHPRI */
         kblockd_workqueue = alloc_workqueue("kblockd",
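/*
 * Editor's note (not part of the patch): FIELD_SIZEOF(t, m) was replaced
 * tree-wide by sizeof_field(t, m) from <linux/stddef.h>; both expand to
 * sizeof(((t *)0)->m).  Purely illustrative example:
 *
 *      BUILD_BUG_ON(sizeof_field(struct bio, bi_opf) != sizeof(unsigned int));
 */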
@@ -4001,21 +1813,10 @@
         if (!kblockd_workqueue)
                 panic("Failed to create kblockd\n");
 
-        request_cachep = kmem_cache_create("blkdev_requests",
-                        sizeof(struct request), 0, SLAB_PANIC, NULL);
-
         blk_requestq_cachep = kmem_cache_create("request_queue",
                         sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
 
-#ifdef CONFIG_DEBUG_FS
         blk_debugfs_root = debugfs_create_dir("block", NULL);
-#endif
-
-        if (bio_crypt_ctx_init() < 0)
-                panic("Failed to allocate mem for bio crypt ctxs\n");
-
-        if (blk_crypto_fallback_init() < 0)
-                panic("Failed to init blk-crypto-fallback\n");
 
         return 0;
 }