2024-02-20 102a0743326a03cd1a1202ceda21e175b7d3575c
kernel/block/blk-flush.c
@@ -1,10 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * Functions to sequence PREFLUSH and FUA writes.
  *
  * Copyright (C) 2011		Max Planck Institute for Gravitational Physics
  * Copyright (C) 2011		Tejun Heo <tj@kernel.org>
- *
- * This file is released under the GPLv2.
  *
  * REQ_{PREFLUSH|FUA} requests are decomposed to sequences consisted of three
  * optional steps - PREFLUSH, DATA and POSTFLUSH - according to the request
@@ -70,6 +69,7 @@
 #include <linux/blkdev.h>
 #include <linux/gfp.h>
 #include <linux/blk-mq.h>
+#include <linux/lockdep.h>
 
 #include "blk.h"
 #include "blk-mq.h"
@@ -93,7 +93,7 @@
 	FLUSH_PENDING_TIMEOUT	= 5 * HZ,
 };
 
-static bool blk_kick_flush(struct request_queue *q,
+static void blk_kick_flush(struct request_queue *q,
 			   struct blk_flush_queue *fq, unsigned int flags);
 
 static unsigned int blk_flush_policy(unsigned long fflags, struct request *rq)
@@ -132,18 +132,20 @@
 	rq->end_io = rq->flush.saved_end_io;
 }
 
-static bool blk_flush_queue_rq(struct request *rq, bool add_front)
+static void blk_flush_queue_rq(struct request *rq, bool add_front)
 {
-	if (rq->q->mq_ops) {
-		blk_mq_add_to_requeue_list(rq, add_front, true);
-		return false;
-	} else {
-		if (add_front)
-			list_add(&rq->queuelist, &rq->q->queue_head);
-		else
-			list_add_tail(&rq->queuelist, &rq->q->queue_head);
-		return true;
-	}
+	blk_mq_add_to_requeue_list(rq, add_front, true);
+}
+
+static void blk_account_io_flush(struct request *rq)
+{
+	struct hd_struct *part = &rq->rq_disk->part0;
+
+	part_stat_lock();
+	part_stat_inc(part, ios[STAT_FLUSH]);
+	part_stat_add(part, nsecs[STAT_FLUSH],
+		      ktime_get_ns() - rq->start_time_ns);
+	part_stat_unlock();
 }
 
 /**
@@ -157,18 +159,14 @@
  * completion and trigger the next step.
  *
  * CONTEXT:
- * spin_lock_irq(q->queue_lock or fq->mq_flush_lock)
- *
- * RETURNS:
- * %true if requests were added to the dispatch queue, %false otherwise.
+ * spin_lock_irq(fq->mq_flush_lock)
  */
-static bool blk_flush_complete_seq(struct request *rq,
+static void blk_flush_complete_seq(struct request *rq,
 				   struct blk_flush_queue *fq,
 				   unsigned int seq, blk_status_t error)
 {
 	struct request_queue *q = rq->q;
 	struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx];
-	bool queued = false, kicked;
 	unsigned int cmd_flags;
 
 	BUG_ON(rq->flush.seq & seq);
@@ -191,12 +189,12 @@
 
 	case REQ_FSEQ_DATA:
 		list_move_tail(&rq->flush.list, &fq->flush_data_in_flight);
-		queued = blk_flush_queue_rq(rq, true);
+		blk_flush_queue_rq(rq, true);
 		break;
 
 	case REQ_FSEQ_DONE:
 		/*
-		 * @rq was previously adjusted by blk_flush_issue() for
+		 * @rq was previously adjusted by blk_insert_flush() for
 		 * flush sequencing and may already have gone through the
 		 * flush data request completion path. Restore @rq for
 		 * normal completion and end it.
@@ -204,52 +202,50 @@
 		BUG_ON(!list_empty(&rq->queuelist));
 		list_del_init(&rq->flush.list);
 		blk_flush_restore_request(rq);
-		if (q->mq_ops)
-			blk_mq_end_request(rq, error);
-		else
-			__blk_end_request_all(rq, error);
+		blk_mq_end_request(rq, error);
 		break;
 
 	default:
 		BUG();
 	}
 
-	kicked = blk_kick_flush(q, fq, cmd_flags);
-	return kicked | queued;
+	blk_kick_flush(q, fq, cmd_flags);
 }
 
 static void flush_end_io(struct request *flush_rq, blk_status_t error)
 {
 	struct request_queue *q = flush_rq->q;
 	struct list_head *running;
-	bool queued = false;
 	struct request *rq, *n;
 	unsigned long flags = 0;
 	struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx);
 
-	if (q->mq_ops) {
-		struct blk_mq_hw_ctx *hctx;
+	/* release the tag's ownership to the req cloned from */
+	spin_lock_irqsave(&fq->mq_flush_lock, flags);
 
-		/* release the tag's ownership to the req cloned from */
-		spin_lock_irqsave(&fq->mq_flush_lock, flags);
+	if (!refcount_dec_and_test(&flush_rq->ref)) {
+		fq->rq_status = error;
+		spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
+		return;
+	}
 
-		if (!refcount_dec_and_test(&flush_rq->ref)) {
-			fq->rq_status = error;
-			spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
-			return;
-		}
+	blk_account_io_flush(flush_rq);
+	/*
+	 * Flush request has to be marked as IDLE when it is really ended
+	 * because its .end_io() is called from timeout code path too for
+	 * avoiding use-after-free.
+	 */
+	WRITE_ONCE(flush_rq->state, MQ_RQ_IDLE);
+	if (fq->rq_status != BLK_STS_OK) {
+		error = fq->rq_status;
+		fq->rq_status = BLK_STS_OK;
+	}
 
-		if (fq->rq_status != BLK_STS_OK)
-			error = fq->rq_status;
-
-		hctx = blk_mq_map_queue(q, flush_rq->mq_ctx->cpu);
-		if (!q->elevator) {
-			blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq);
-			flush_rq->tag = -1;
-		} else {
-			blk_mq_put_driver_tag_hctx(hctx, flush_rq);
-			flush_rq->internal_tag = -1;
-		}
+	if (!q->elevator) {
+		flush_rq->tag = BLK_MQ_NO_TAG;
+	} else {
+		blk_mq_put_driver_tag(flush_rq);
+		flush_rq->internal_tag = BLK_MQ_NO_TAG;
 	}
 
 	running = &fq->flush_queue[fq->flush_running_idx];
@@ -258,35 +254,20 @@
 	/* account completion of the flush request */
 	fq->flush_running_idx ^= 1;
 
-	if (!q->mq_ops)
-		elv_completed_request(q, flush_rq);
-
 	/* and push the waiting requests to the next stage */
 	list_for_each_entry_safe(rq, n, running, flush.list) {
 		unsigned int seq = blk_flush_cur_seq(rq);
 
 		BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH);
-		queued |= blk_flush_complete_seq(rq, fq, seq, error);
+		blk_flush_complete_seq(rq, fq, seq, error);
 	}
 
-	/*
-	 * Kick the queue to avoid stall for two cases:
-	 * 1. Moving a request silently to empty queue_head may stall the
-	 * queue.
-	 * 2. When flush request is running in non-queueable queue, the
-	 * queue is hold. Restart the queue after flush request is finished
-	 * to avoid stall.
-	 * This function is called from request completion path and calling
-	 * directly into request_fn may confuse the driver. Always use
-	 * kblockd.
-	 */
-	if (queued || fq->flush_queue_delayed) {
-		WARN_ON(q->mq_ops);
-		blk_run_queue_async(q);
-	}
-	fq->flush_queue_delayed = 0;
-	if (q->mq_ops)
-		spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
+	spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
+}
+
+bool is_flush_rq(struct request *rq)
+{
+	return rq->end_io == flush_end_io;
 }
 
 /**
@@ -299,12 +280,10 @@
  * Please read the comment at the top of this file for more info.
  *
  * CONTEXT:
- * spin_lock_irq(q->queue_lock or fq->mq_flush_lock)
+ * spin_lock_irq(fq->mq_flush_lock)
  *
- * RETURNS:
- * %true if flush was issued, %false otherwise.
  */
-static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
+static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
 			   unsigned int flags)
 {
 	struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx];
@@ -314,19 +293,13 @@
 
 	/* C1 described at the top of this file */
 	if (fq->flush_pending_idx != fq->flush_running_idx || list_empty(pending))
-		return false;
+		return;
 
-	/* C2 and C3
-	 *
-	 * For blk-mq + scheduling, we can risk having all driver tags
-	 * assigned to empty flushes, and we deadlock if we are expecting
-	 * other requests to make progress. Don't defer for that case.
-	 */
+	/* C2 and C3 */
 	if (!list_empty(&fq->flush_data_in_flight) &&
-	    !(q->mq_ops && q->elevator) &&
 	    time_before(jiffies,
 			fq->flush_pending_since + FLUSH_PENDING_TIMEOUT))
-		return false;
+		return;
 
 	/*
 	 * Issue flush and toggle pending_idx. This makes pending_idx
@@ -344,86 +317,49 @@
 	 * In case of IO scheduler, flush rq need to borrow scheduler tag
 	 * just for cheating put/get driver tag.
 	 */
-	if (q->mq_ops) {
-		struct blk_mq_hw_ctx *hctx;
+	flush_rq->mq_ctx = first_rq->mq_ctx;
+	flush_rq->mq_hctx = first_rq->mq_hctx;
 
-		flush_rq->mq_ctx = first_rq->mq_ctx;
+	if (!q->elevator) {
+		flush_rq->tag = first_rq->tag;
 
-		if (!q->elevator) {
-			fq->orig_rq = first_rq;
-			flush_rq->tag = first_rq->tag;
-			hctx = blk_mq_map_queue(q, first_rq->mq_ctx->cpu);
-			blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq);
-		} else {
-			flush_rq->internal_tag = first_rq->internal_tag;
-		}
-	}
+		/*
+		 * We borrow data request's driver tag, so have to mark
+		 * this flush request as INFLIGHT for avoiding double
+		 * account of this driver tag
+		 */
+		flush_rq->rq_flags |= RQF_MQ_INFLIGHT;
+	} else
+		flush_rq->internal_tag = first_rq->internal_tag;
 
 	flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH;
 	flush_rq->cmd_flags |= (flags & REQ_DRV) | (flags & REQ_FAILFAST_MASK);
 	flush_rq->rq_flags |= RQF_FLUSH_SEQ;
 	flush_rq->rq_disk = first_rq->rq_disk;
 	flush_rq->end_io = flush_end_io;
-
-	return blk_flush_queue_rq(flush_rq, false);
-}
-
-static void flush_data_end_io(struct request *rq, blk_status_t error)
-{
-	struct request_queue *q = rq->q;
-	struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL);
-
-	lockdep_assert_held(q->queue_lock);
-
 	/*
-	 * Updating q->in_flight[] here for making this tag usable
-	 * early. Because in blk_queue_start_tag(),
-	 * q->in_flight[BLK_RW_ASYNC] is used to limit async I/O and
-	 * reserve tags for sync I/O.
-	 *
-	 * More importantly this way can avoid the following I/O
-	 * deadlock:
-	 *
-	 * - suppose there are 40 fua requests comming to flush queue
-	 *   and queue depth is 31
-	 * - 30 rqs are scheduled then blk_queue_start_tag() can't alloc
-	 *   tag for async I/O any more
-	 * - all the 30 rqs are completed before FLUSH_PENDING_TIMEOUT
-	 *   and flush_data_end_io() is called
-	 * - the other rqs still can't go ahead if not updating
-	 *   q->in_flight[BLK_RW_ASYNC] here, meantime these rqs
-	 *   are held in flush data queue and make no progress of
-	 *   handling post flush rq
-	 * - only after the post flush rq is handled, all these rqs
-	 *   can be completed
+	 * Order WRITE ->end_io and WRITE rq->ref, and its pair is the one
+	 * implied in refcount_inc_not_zero() called from
+	 * blk_mq_find_and_get_req(), which orders WRITE/READ flush_rq->ref
+	 * and READ flush_rq->end_io
 	 */
+	smp_wmb();
+	refcount_set(&flush_rq->ref, 1);
 
-	elv_completed_request(q, rq);
-
-	/* for avoiding double accounting */
-	rq->rq_flags &= ~RQF_STARTED;
-
-	/*
-	 * After populating an empty queue, kick it to avoid stall. Read
-	 * the comment in flush_end_io().
-	 */
-	if (blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error))
-		blk_run_queue_async(q);
+	blk_flush_queue_rq(flush_rq, false);
 }
 
 static void mq_flush_data_end_io(struct request *rq, blk_status_t error)
 {
 	struct request_queue *q = rq->q;
-	struct blk_mq_hw_ctx *hctx;
+	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
 	struct blk_mq_ctx *ctx = rq->mq_ctx;
 	unsigned long flags;
 	struct blk_flush_queue *fq = blk_get_flush_queue(q, ctx);
 
-	hctx = blk_mq_map_queue(q, ctx->cpu);
-
 	if (q->elevator) {
 		WARN_ON(rq->tag < 0);
-		blk_mq_put_driver_tag_hctx(hctx, rq);
+		blk_mq_put_driver_tag(rq);
 	}
 
 	/*
@@ -453,9 +389,6 @@
 	unsigned int policy = blk_flush_policy(fflags, rq);
 	struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx);
 
-	if (!q->mq_ops)
-		lockdep_assert_held(q->queue_lock);
-
 	/*
 	 * @policy now records what operations need to be done. Adjust
 	 * REQ_PREFLUSH and FUA for the driver.
@@ -478,10 +411,7 @@
 	 * complete the request.
 	 */
 	if (!policy) {
-		if (q->mq_ops)
-			blk_mq_end_request(rq, 0);
-		else
-			__blk_end_request(rq, 0, 0);
+		blk_mq_end_request(rq, 0);
 		return;
 	}
 
@@ -494,10 +424,7 @@
 	 */
 	if ((policy & REQ_FSEQ_DATA) &&
 	    !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
-		if (q->mq_ops)
-			blk_mq_request_bypass_insert(rq, false);
-		else
-			list_add_tail(&rq->queuelist, &q->queue_head);
+		blk_mq_request_bypass_insert(rq, false, false);
 		return;
 	}
 
@@ -509,74 +436,39 @@
 	INIT_LIST_HEAD(&rq->flush.list);
 	rq->rq_flags |= RQF_FLUSH_SEQ;
 	rq->flush.saved_end_io = rq->end_io; /* Usually NULL */
-	if (q->mq_ops) {
-		rq->end_io = mq_flush_data_end_io;
 
-		spin_lock_irq(&fq->mq_flush_lock);
-		blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0);
-		spin_unlock_irq(&fq->mq_flush_lock);
-		return;
-	}
-	rq->end_io = flush_data_end_io;
+	rq->end_io = mq_flush_data_end_io;
 
+	spin_lock_irq(&fq->mq_flush_lock);
 	blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0);
+	spin_unlock_irq(&fq->mq_flush_lock);
 }
 
 /**
  * blkdev_issue_flush - queue a flush
  * @bdev:	blockdev to issue flush for
  * @gfp_mask:	memory allocation flags (for bio_alloc)
- * @error_sector:	error sector
  *
  * Description:
- *    Issue a flush for the block device in question. Caller can supply
- *    room for storing the error offset in case of a flush error, if they
- *    wish to.
+ *    Issue a flush for the block device in question.
  */
-int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
-		sector_t *error_sector)
+int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask)
 {
-	struct request_queue *q;
 	struct bio *bio;
 	int ret = 0;
-
-	if (bdev->bd_disk == NULL)
-		return -ENXIO;
-
-	q = bdev_get_queue(bdev);
-	if (!q)
-		return -ENXIO;
-
-	/*
-	 * some block devices may not have their queue correctly set up here
-	 * (e.g. loop device without a backing file) and so issuing a flush
-	 * here will panic. Ensure there is a request function before issuing
-	 * the flush.
-	 */
-	if (!q->make_request_fn)
-		return -ENXIO;
 
 	bio = bio_alloc(gfp_mask, 0);
 	bio_set_dev(bio, bdev);
 	bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
 
 	ret = submit_bio_wait(bio);
-
-	/*
-	 * The driver must store the error location in ->bi_sector, if
-	 * it supports it. For non-stacked drivers, this should be
-	 * copied from blk_rq_pos(rq).
-	 */
-	if (error_sector)
-		*error_sector = bio->bi_iter.bi_sector;
-
 	bio_put(bio);
 	return ret;
 }
 EXPORT_SYMBOL(blkdev_issue_flush);
 
-struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q,
-		int node, int cmd_size, gfp_t flags)
+struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size,
+					      gfp_t flags)
 {
 	struct blk_flush_queue *fq;
 	int rq_sz = sizeof(struct request);
@@ -585,8 +477,7 @@
 	if (!fq)
 		goto fail;
 
-	if (q->mq_ops)
-		spin_lock_init(&fq->mq_flush_lock);
+	spin_lock_init(&fq->mq_flush_lock);
 
 	rq_sz = round_up(rq_sz + cmd_size, cache_line_size());
 	fq->flush_rq = kzalloc_node(rq_sz, flags, node);
@@ -596,6 +487,9 @@
 	INIT_LIST_HEAD(&fq->flush_queue[0]);
 	INIT_LIST_HEAD(&fq->flush_queue[1]);
 	INIT_LIST_HEAD(&fq->flush_data_in_flight);
+
+	lockdep_register_key(&fq->key);
+	lockdep_set_class(&fq->mq_flush_lock, &fq->key);
 
 	return fq;
 
@@ -611,6 +505,7 @@
 	if (!fq)
 		return;
 
+	lockdep_unregister_key(&fq->key);
 	kfree(fq->flush_rq);
 	kfree(fq);
 }
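
Note (not part of the patch): with the signature change above, blkdev_issue_flush() now takes only the block device and the allocation mask; the error_sector out-parameter is gone. Below is a minimal kernel-module sketch of a caller using the new two-argument form. The target path "/dev/sdb", the module name, and the error handling are illustrative assumptions; blkdev_get_by_path()/blkdev_put() are ordinary helpers of this kernel era, not something this patch adds.

// Hypothetical module-context sketch of the new two-argument
// blkdev_issue_flush(); device path and naming are illustrative.
#include <linux/module.h>
#include <linux/blkdev.h>

static int __init flush_demo_init(void)
{
	struct block_device *bdev;
	int err;

	/* Open a block device by path; write access is needed to flush it. */
	bdev = blkdev_get_by_path("/dev/sdb", FMODE_READ | FMODE_WRITE, NULL);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);

	/* New signature: no error_sector out-parameter any more. */
	err = blkdev_issue_flush(bdev, GFP_KERNEL);
	if (err)
		pr_warn("flush_demo: cache flush failed: %d\n", err);

	blkdev_put(bdev, FMODE_READ | FMODE_WRITE);
	return err;
}

static void __exit flush_demo_exit(void)
{
}

module_init(flush_demo_init);
module_exit(flush_demo_exit);
MODULE_LICENSE("GPL");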
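
Note (not part of the patch): the new blk_account_io_flush() charges STAT_FLUSH counts and latency to the disk's part0 statistics. On kernels that carry this accounting they surface as the flush fields described in Documentation/admin-guide/iostats.rst. The userspace sketch below assumes those are the last two columns of /proc/diskstats (flushes completed, then milliseconds spent flushing); verify the column layout against the documentation for your kernel before relying on it.

// Hypothetical userspace reader for the flush counters exposed via
// blk_account_io_flush(); assumes the flush fields are the last two
// columns of /proc/diskstats on kernels that carry this accounting.
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[512];
	FILE *f = fopen("/proc/diskstats", "r");

	if (!f) {
		perror("diskstats");
		return 1;
	}

	while (fgets(line, sizeof(line), f)) {
		char *tok, *name = NULL, *prev = NULL, *last = NULL;
		int field = 0;

		for (tok = strtok(line, " \t\n"); tok; tok = strtok(NULL, " \t\n")) {
			if (++field == 3)
				name = tok;	/* third column is the device name */
			prev = last;
			last = tok;
		}
		/* Only lines long enough to include the two flush columns. */
		if (field >= 20 && name && prev)
			printf("%s: %s flushes, %s ms flushing\n", name, prev, last);
	}
	fclose(f);
	return 0;
}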