2024-02-20 102a0743326a03cd1a1202ceda21e175b7d3575c
kernel/drivers/md/dm.c
@@ -26,6 +26,7 @@
2626 #include <linux/wait.h>
2727 #include <linux/pr.h>
2828 #include <linux/refcount.h>
29
+#include <linux/part_stat.h>
2930 #include <linux/blk-crypto.h>
3031 #include <linux/keyslot-manager.h>
3132
@@ -148,6 +149,16 @@
148149 #define DM_NUMA_NODE NUMA_NO_NODE
149150 static int dm_numa_node = DM_NUMA_NODE;
150151
152
+#define DEFAULT_SWAP_BIOS (8 * 1048576 / PAGE_SIZE)
153
+static int swap_bios = DEFAULT_SWAP_BIOS;
154
+static int get_swap_bios(void)
155
+{
156
+ int latch = READ_ONCE(swap_bios);
157
+ if (unlikely(latch <= 0))
158
+ latch = DEFAULT_SWAP_BIOS;
159
+ return latch;
160
+}
161
+
151162 /*
152163 * For mempools pre-allocation at the table loading time.
153164 */
@@ -161,9 +172,6 @@
161172 refcount_t count;
162173 struct dm_dev dm_dev;
163174 };
164
-
165
-static struct kmem_cache *_rq_tio_cache;
166
-static struct kmem_cache *_rq_cache;
167175
168176 /*
169177 * Bio-based DM's mempools' reserved IOs set by the user.
@@ -226,20 +234,11 @@
226234
227235 static int __init local_init(void)
228236 {
229
- int r = -ENOMEM;
230
-
231
- _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
232
- if (!_rq_tio_cache)
233
- return r;
234
-
235
- _rq_cache = kmem_cache_create("dm_old_clone_request", sizeof(struct request),
236
- __alignof__(struct request), 0, NULL);
237
- if (!_rq_cache)
238
- goto out_free_rq_tio_cache;
237
+ int r;
239238
240239 r = dm_uevent_init();
241240 if (r)
242
- goto out_free_rq_cache;
241
+ return r;
243242
244243 deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
245244 if (!deferred_remove_workqueue) {
@@ -261,21 +260,14 @@
261260 destroy_workqueue(deferred_remove_workqueue);
262261 out_uevent_exit:
263262 dm_uevent_exit();
264
-out_free_rq_cache:
265
- kmem_cache_destroy(_rq_cache);
266
-out_free_rq_tio_cache:
267
- kmem_cache_destroy(_rq_tio_cache);
268263
269264 return r;
270265 }
271266
272267 static void local_exit(void)
273268 {
274
- flush_scheduled_work();
275269 destroy_workqueue(deferred_remove_workqueue);
276270
277
- kmem_cache_destroy(_rq_cache);
278
- kmem_cache_destroy(_rq_tio_cache);
279271 unregister_blkdev(_major, _name);
280272 dm_uevent_exit();
281273
@@ -440,27 +432,90 @@
440432 dm_deferred_remove();
441433 }
442434
443
-sector_t dm_get_size(struct mapped_device *md)
444
-{
445
- return get_capacity(md->disk);
446
-}
447
-
448
-struct request_queue *dm_get_md_queue(struct mapped_device *md)
449
-{
450
- return md->queue;
451
-}
452
-
453
-struct dm_stats *dm_get_stats(struct mapped_device *md)
454
-{
455
- return &md->stats;
456
-}
457
-
458435 static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
459436 {
460437 struct mapped_device *md = bdev->bd_disk->private_data;
461438
462439 return dm_get_geometry(md, geo);
463440 }
441
+
442
+#ifdef CONFIG_BLK_DEV_ZONED
443
+int dm_report_zones_cb(struct blk_zone *zone, unsigned int idx, void *data)
444
+{
445
+ struct dm_report_zones_args *args = data;
446
+ sector_t sector_diff = args->tgt->begin - args->start;
447
+
448
+ /*
449
+ * Ignore zones beyond the target range.
450
+ */
451
+ if (zone->start >= args->start + args->tgt->len)
452
+ return 0;
453
+
454
+ /*
455
+ * Remap the start sector and write pointer position of the zone
456
+ * to match its position in the target range.
457
+ */
458
+ zone->start += sector_diff;
459
+ if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) {
460
+ if (zone->cond == BLK_ZONE_COND_FULL)
461
+ zone->wp = zone->start + zone->len;
462
+ else if (zone->cond == BLK_ZONE_COND_EMPTY)
463
+ zone->wp = zone->start;
464
+ else
465
+ zone->wp += sector_diff;
466
+ }
467
+
468
+ args->next_sector = zone->start + zone->len;
469
+ return args->orig_cb(zone, args->zone_idx++, args->orig_data);
470
+}
471
+EXPORT_SYMBOL_GPL(dm_report_zones_cb);
472
+
473
+static int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
474
+ unsigned int nr_zones, report_zones_cb cb, void *data)
475
+{
476
+ struct mapped_device *md = disk->private_data;
477
+ struct dm_table *map;
478
+ int srcu_idx, ret;
479
+ struct dm_report_zones_args args = {
480
+ .next_sector = sector,
481
+ .orig_data = data,
482
+ .orig_cb = cb,
483
+ };
484
+
485
+ if (dm_suspended_md(md))
486
+ return -EAGAIN;
487
+
488
+ map = dm_get_live_table(md, &srcu_idx);
489
+ if (!map) {
490
+ ret = -EIO;
491
+ goto out;
492
+ }
493
+
494
+ do {
495
+ struct dm_target *tgt;
496
+
497
+ tgt = dm_table_find_target(map, args.next_sector);
498
+ if (WARN_ON_ONCE(!tgt->type->report_zones)) {
499
+ ret = -EIO;
500
+ goto out;
501
+ }
502
+
503
+ args.tgt = tgt;
504
+ ret = tgt->type->report_zones(tgt, &args,
505
+ nr_zones - args.zone_idx);
506
+ if (ret < 0)
507
+ goto out;
508
+ } while (args.zone_idx < nr_zones &&
509
+ args.next_sector < get_capacity(disk));
510
+
511
+ ret = args.zone_idx;
512
+out:
513
+ dm_put_live_table(md, srcu_idx);
514
+ return ret;
515
+}
516
+#else
517
+#define dm_blk_report_zones NULL
518
+#endif /* CONFIG_BLK_DEV_ZONED */
464519
465520 static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx,
466521 struct block_device **bdev)
@@ -531,7 +586,45 @@
531586 return r;
532587 }
533588
534
-static void start_io_acct(struct dm_io *io);
589
+u64 dm_start_time_ns_from_clone(struct bio *bio)
590
+{
591
+ struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
592
+ struct dm_io *io = tio->io;
593
+
594
+ return jiffies_to_nsecs(io->start_time);
595
+}
596
+EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone);
597
+
598
+static void start_io_acct(struct dm_io *io)
599
+{
600
+ struct mapped_device *md = io->md;
601
+ struct bio *bio = io->orig_bio;
602
+
603
+ io->start_time = bio_start_io_acct(bio);
604
+ if (unlikely(dm_stats_used(&md->stats)))
605
+ dm_stats_account_io(&md->stats, bio_data_dir(bio),
606
+ bio->bi_iter.bi_sector, bio_sectors(bio),
607
+ false, 0, &io->stats_aux);
608
+}
609
+
610
+static void end_io_acct(struct mapped_device *md, struct bio *bio,
611
+ unsigned long start_time, struct dm_stats_aux *stats_aux)
612
+{
613
+ unsigned long duration = jiffies - start_time;
614
+
615
+ if (unlikely(dm_stats_used(&md->stats)))
616
+ dm_stats_account_io(&md->stats, bio_data_dir(bio),
617
+ bio->bi_iter.bi_sector, bio_sectors(bio),
618
+ true, duration, stats_aux);
619
+
620
+ smp_wmb();
621
+
622
+ bio_end_io_acct(bio, start_time);
623
+
624
+ /* nudge anyone waiting on suspend queue */
625
+ if (unlikely(wq_has_sleeper(&md->wait)))
626
+ wake_up(&md->wait);
627
+}
535628
536629 static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
537630 {
@@ -595,61 +688,6 @@
595688 if (tio->inside_dm_io)
596689 return;
597690 bio_put(&tio->clone);
598
-}
599
-
600
-int md_in_flight(struct mapped_device *md)
601
-{
602
- return atomic_read(&md->pending[READ]) +
603
- atomic_read(&md->pending[WRITE]);
604
-}
605
-
606
-static void start_io_acct(struct dm_io *io)
607
-{
608
- struct mapped_device *md = io->md;
609
- struct bio *bio = io->orig_bio;
610
- int rw = bio_data_dir(bio);
611
-
612
- io->start_time = jiffies;
613
-
614
- generic_start_io_acct(md->queue, bio_op(bio), bio_sectors(bio),
615
- &dm_disk(md)->part0);
616
-
617
- atomic_set(&dm_disk(md)->part0.in_flight[rw],
618
- atomic_inc_return(&md->pending[rw]));
619
-
620
- if (unlikely(dm_stats_used(&md->stats)))
621
- dm_stats_account_io(&md->stats, bio_data_dir(bio),
622
- bio->bi_iter.bi_sector, bio_sectors(bio),
623
- false, 0, &io->stats_aux);
624
-}
625
-
626
-static void end_io_acct(struct dm_io *io)
627
-{
628
- struct mapped_device *md = io->md;
629
- struct bio *bio = io->orig_bio;
630
- unsigned long duration = jiffies - io->start_time;
631
- int pending;
632
- int rw = bio_data_dir(bio);
633
-
634
- generic_end_io_acct(md->queue, bio_op(bio), &dm_disk(md)->part0,
635
- io->start_time);
636
-
637
- if (unlikely(dm_stats_used(&md->stats)))
638
- dm_stats_account_io(&md->stats, bio_data_dir(bio),
639
- bio->bi_iter.bi_sector, bio_sectors(bio),
640
- true, duration, &io->stats_aux);
641
-
642
- /*
643
- * After this is decremented the bio must not be touched if it is
644
- * a flush.
645
- */
646
- pending = atomic_dec_return(&md->pending[rw]);
647
- atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
648
- pending += atomic_read(&md->pending[rw^0x1]);
649
-
650
- /* nudge anyone waiting on suspend queue */
651
- if (!pending)
652
- wake_up(&md->wait);
653691 }
654692
655693 /*
@@ -748,7 +786,8 @@
748786 }
749787
750788 static struct table_device *find_table_device(struct list_head *l, dev_t dev,
751
- fmode_t mode) {
789
+ fmode_t mode)
790
+{
752791 struct table_device *td;
753792
754793 list_for_each_entry(td, l, list)
@@ -759,7 +798,8 @@
759798 }
760799
761800 int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
762
- struct dm_dev **result) {
801
+ struct dm_dev **result)
802
+{
763803 int r;
764804 struct table_device *td;
765805
@@ -864,6 +904,8 @@
864904 blk_status_t io_error;
865905 struct bio *bio;
866906 struct mapped_device *md = io->md;
907
+ unsigned long start_time = 0;
908
+ struct dm_stats_aux stats_aux;
867909
868910 /* Push-back supersedes any I/O errors */
869911 if (unlikely(error)) {
@@ -890,8 +932,10 @@
890932
891933 io_error = io->status;
892934 bio = io->orig_bio;
893
- end_io_acct(io);
935
+ start_time = io->start_time;
936
+ stats_aux = io->stats_aux;
894937 free_io(md, io);
938
+ end_io_acct(md, bio, start_time, &stats_aux);
895939
896940 if (io_error == BLK_STS_DM_REQUEUE)
897941 return;
@@ -937,6 +981,11 @@
937981 limits->max_write_zeroes_sectors = 0;
938982 }
939983
984
+static bool swap_bios_limit(struct dm_target *ti, struct bio *bio)
985
+{
986
+ return unlikely((bio->bi_opf & REQ_SWAP) != 0) && unlikely(ti->limit_swap_bios);
987
+}
988
+
940989 static void clone_endio(struct bio *bio)
941990 {
942991 blk_status_t error = bio->bi_status;
@@ -944,8 +993,9 @@
944993 struct dm_io *io = tio->io;
945994 struct mapped_device *md = tio->io->md;
946995 dm_endio_fn endio = tio->ti->type->end_io;
996
+ struct bio *orig_bio = io->orig_bio;
947997
948
- if (unlikely(error == BLK_STS_TARGET) && md->type != DM_TYPE_NVME_BIO_BASED) {
998
+ if (unlikely(error == BLK_STS_TARGET)) {
949999 if (bio_op(bio) == REQ_OP_DISCARD &&
9501000 !bio->bi_disk->queue->limits.max_discard_sectors)
9511001 disable_discard(md);
@@ -957,12 +1007,24 @@
9571007 disable_write_zeroes(md);
9581008 }
9591009
1010
+ /*
1011
+ * For zone-append bios get offset in zone of the written
1012
+ * sector and add that to the original bio sector pos.
1013
+ */
1014
+ if (bio_op(orig_bio) == REQ_OP_ZONE_APPEND) {
1015
+ sector_t written_sector = bio->bi_iter.bi_sector;
1016
+ struct request_queue *q = orig_bio->bi_disk->queue;
1017
+ u64 mask = (u64)blk_queue_zone_sectors(q) - 1;
1018
+
1019
+ orig_bio->bi_iter.bi_sector += written_sector & mask;
1020
+ }
1021
+
9601022 if (endio) {
9611023 int r = endio(tio->ti, bio, &error);
9621024 switch (r) {
9631025 case DM_ENDIO_REQUEUE:
9641026 error = BLK_STS_DM_REQUEUE;
965
- /*FALLTHRU*/
1027
+ fallthrough;
9661028 case DM_ENDIO_DONE:
9671029 break;
9681030 case DM_ENDIO_INCOMPLETE:
@@ -974,6 +1036,11 @@
9741036 }
9751037 }
9761038
1039
+ if (unlikely(swap_bios_limit(tio->ti, bio))) {
1040
+ struct mapped_device *md = io->md;
1041
+ up(&md->swap_bios_semaphore);
1042
+ }
1043
+
9771044 free_tio(tio);
9781045 dec_pending(io, error);
9791046 }
@@ -982,29 +1049,28 @@
9821049 * Return maximum size of I/O possible at the supplied sector up to the current
9831050 * target boundary.
9841051 */
985
-static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti)
1052
+static inline sector_t max_io_len_target_boundary(struct dm_target *ti,
1053
+ sector_t target_offset)
9861054 {
987
- sector_t target_offset = dm_target_offset(ti, sector);
988
-
9891055 return ti->len - target_offset;
9901056 }
9911057
992
-static sector_t max_io_len(sector_t sector, struct dm_target *ti)
1058
+static sector_t max_io_len(struct dm_target *ti, sector_t sector)
9931059 {
994
- sector_t len = max_io_len_target_boundary(sector, ti);
995
- sector_t offset, max_len;
1060
+ sector_t target_offset = dm_target_offset(ti, sector);
1061
+ sector_t len = max_io_len_target_boundary(ti, target_offset);
1062
+ sector_t max_len;
9961063
9971064 /*
998
- * Does the target need to split even further?
1065
+ * Does the target need to split IO even further?
1066
+ * - varied (per target) IO splitting is a tenet of DM; this
1067
+ * explains why stacked chunk_sectors based splitting via
1068
+ * blk_max_size_offset() isn't possible here. So pass in
1069
+ * ti->max_io_len to override stacked chunk_sectors.
9991070 */
10001071 if (ti->max_io_len) {
1001
- offset = dm_target_offset(ti, sector);
1002
- if (unlikely(ti->max_io_len & (ti->max_io_len - 1)))
1003
- max_len = sector_div(offset, ti->max_io_len);
1004
- else
1005
- max_len = offset & (ti->max_io_len - 1);
1006
- max_len = ti->max_io_len - max_len;
1007
-
1072
+ max_len = blk_max_size_offset(ti->table->md->queue,
1073
+ target_offset, ti->max_io_len);
10081074 if (len > max_len)
10091075 len = max_len;
10101076 }
@@ -1039,7 +1105,7 @@
10391105 return NULL;
10401106
10411107 ti = dm_table_find_target(map, sector);
1042
- if (!dm_target_is_valid(ti))
1108
+ if (!ti)
10431109 return NULL;
10441110
10451111 return ti;
@@ -1060,13 +1126,33 @@
10601126 goto out;
10611127 if (!ti->type->direct_access)
10621128 goto out;
1063
- len = max_io_len(sector, ti) / PAGE_SECTORS;
1129
+ len = max_io_len(ti, sector) / PAGE_SECTORS;
10641130 if (len < 1)
10651131 goto out;
10661132 nr_pages = min(len, nr_pages);
10671133 ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn);
10681134
10691135 out:
1136
+ dm_put_live_table(md, srcu_idx);
1137
+
1138
+ return ret;
1139
+}
1140
+
1141
+static bool dm_dax_supported(struct dax_device *dax_dev, struct block_device *bdev,
1142
+ int blocksize, sector_t start, sector_t len)
1143
+{
1144
+ struct mapped_device *md = dax_get_private(dax_dev);
1145
+ struct dm_table *map;
1146
+ bool ret = false;
1147
+ int srcu_idx;
1148
+
1149
+ map = dm_get_live_table(md, &srcu_idx);
1150
+ if (!map)
1151
+ goto out;
1152
+
1153
+ ret = dm_table_supports_dax(map, device_not_dax_capable, &blocksize);
1154
+
1155
+out:
10701156 dm_put_live_table(md, srcu_idx);
10711157
10721158 return ret;
@@ -1120,9 +1206,37 @@
11201206 return ret;
11211207 }
11221208
1209
+static int dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
1210
+ size_t nr_pages)
1211
+{
1212
+ struct mapped_device *md = dax_get_private(dax_dev);
1213
+ sector_t sector = pgoff * PAGE_SECTORS;
1214
+ struct dm_target *ti;
1215
+ int ret = -EIO;
1216
+ int srcu_idx;
1217
+
1218
+ ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1219
+
1220
+ if (!ti)
1221
+ goto out;
1222
+ if (WARN_ON(!ti->type->dax_zero_page_range)) {
1223
+ /*
1224
+ * ->zero_page_range() is mandatory dax operation. If we are
1225
+ * here, something is wrong.
1226
+ */
1227
+ goto out;
1228
+ }
1229
+ ret = ti->type->dax_zero_page_range(ti, pgoff, nr_pages);
1230
+ out:
1231
+ dm_put_live_table(md, srcu_idx);
1232
+
1233
+ return ret;
1234
+}
1235
+
11231236 /*
11241237 * A target may call dm_accept_partial_bio only from the map routine. It is
1125
- * allowed for all bio types except REQ_PREFLUSH and REQ_OP_ZONE_RESET.
1238
+ * allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_* zone management
1239
+ * operations and REQ_OP_ZONE_APPEND (zone append writes).
11261240 *
11271241 * dm_accept_partial_bio informs the dm that the target only wants to process
11281242 * additional n_sectors sectors of the bio and the rest of the data should be
@@ -1152,105 +1266,33 @@
11521266 {
11531267 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
11541268 unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT;
1269
+
11551270 BUG_ON(bio->bi_opf & REQ_PREFLUSH);
1271
+ BUG_ON(op_is_zone_mgmt(bio_op(bio)));
1272
+ BUG_ON(bio_op(bio) == REQ_OP_ZONE_APPEND);
11561273 BUG_ON(bi_size > *tio->len_ptr);
11571274 BUG_ON(n_sectors > bi_size);
1275
+
11581276 *tio->len_ptr -= bi_size - n_sectors;
11591277 bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;
11601278 }
11611279 EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
11621280
1163
-/*
1164
- * The zone descriptors obtained with a zone report indicate zone positions
1165
- * within the target backing device, regardless of that device is a partition
1166
- * and regardless of the target mapping start sector on the device or partition.
1167
- * The zone descriptors start sector and write pointer position must be adjusted
1168
- * to match their relative position within the dm device.
1169
- * A target may call dm_remap_zone_report() after completion of a
1170
- * REQ_OP_ZONE_REPORT bio to remap the zone descriptors obtained from the
1171
- * backing device.
1172
- */
1173
-void dm_remap_zone_report(struct dm_target *ti, struct bio *bio, sector_t start)
1281
+static noinline void __set_swap_bios_limit(struct mapped_device *md, int latch)
11741282 {
1175
-#ifdef CONFIG_BLK_DEV_ZONED
1176
- struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
1177
- struct bio *report_bio = tio->io->orig_bio;
1178
- struct blk_zone_report_hdr *hdr = NULL;
1179
- struct blk_zone *zone;
1180
- unsigned int nr_rep = 0;
1181
- unsigned int ofst;
1182
- sector_t part_offset;
1183
- struct bio_vec bvec;
1184
- struct bvec_iter iter;
1185
- void *addr;
1186
-
1187
- if (bio->bi_status)
1188
- return;
1189
-
1190
- /*
1191
- * bio sector was incremented by the request size on completion. Taking
1192
- * into account the original request sector, the target start offset on
1193
- * the backing device and the target mapping offset (ti->begin), the
1194
- * start sector of the backing device. The partition offset is always 0
1195
- * if the target uses a whole device.
1196
- */
1197
- part_offset = bio->bi_iter.bi_sector + ti->begin - (start + bio_end_sector(report_bio));
1198
-
1199
- /*
1200
- * Remap the start sector of the reported zones. For sequential zones,
1201
- * also remap the write pointer position.
1202
- */
1203
- bio_for_each_segment(bvec, report_bio, iter) {
1204
- addr = kmap_atomic(bvec.bv_page);
1205
-
1206
- /* Remember the report header in the first page */
1207
- if (!hdr) {
1208
- hdr = addr;
1209
- ofst = sizeof(struct blk_zone_report_hdr);
1210
- } else
1211
- ofst = 0;
1212
-
1213
- /* Set zones start sector */
1214
- while (hdr->nr_zones && ofst < bvec.bv_len) {
1215
- zone = addr + ofst;
1216
- zone->start -= part_offset;
1217
- if (zone->start >= start + ti->len) {
1218
- hdr->nr_zones = 0;
1219
- break;
1220
- }
1221
- zone->start = zone->start + ti->begin - start;
1222
- if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) {
1223
- if (zone->cond == BLK_ZONE_COND_FULL)
1224
- zone->wp = zone->start + zone->len;
1225
- else if (zone->cond == BLK_ZONE_COND_EMPTY)
1226
- zone->wp = zone->start;
1227
- else
1228
- zone->wp = zone->wp + ti->begin - start - part_offset;
1229
- }
1230
- ofst += sizeof(struct blk_zone);
1231
- hdr->nr_zones--;
1232
- nr_rep++;
1233
- }
1234
-
1235
- if (addr != hdr)
1236
- kunmap_atomic(addr);
1237
-
1238
- if (!hdr->nr_zones)
1239
- break;
1283
+ mutex_lock(&md->swap_bios_lock);
1284
+ while (latch < md->swap_bios) {
1285
+ cond_resched();
1286
+ down(&md->swap_bios_semaphore);
1287
+ md->swap_bios--;
12401288 }
1241
-
1242
- if (hdr) {
1243
- hdr->nr_zones = nr_rep;
1244
- kunmap_atomic(hdr);
1289
+ while (latch > md->swap_bios) {
1290
+ cond_resched();
1291
+ up(&md->swap_bios_semaphore);
1292
+ md->swap_bios++;
12451293 }
1246
-
1247
- bio_advance(report_bio, report_bio->bi_iter.bi_size);
1248
-
1249
-#else /* !CONFIG_BLK_DEV_ZONED */
1250
- bio->bi_status = BLK_STS_NOTSUPP;
1251
-#endif
1294
+ mutex_unlock(&md->swap_bios_lock);
12521295 }
1253
-EXPORT_SYMBOL_GPL(dm_remap_zone_report);
12541296
12551297 static blk_qc_t __map_bio(struct dm_target_io *tio)
12561298 {
@@ -1258,7 +1300,6 @@
12581300 sector_t sector;
12591301 struct bio *clone = &tio->clone;
12601302 struct dm_io *io = tio->io;
1261
- struct mapped_device *md = io->md;
12621303 struct dm_target *ti = tio->ti;
12631304 blk_qc_t ret = BLK_QC_T_NONE;
12641305
@@ -1272,6 +1313,14 @@
12721313 atomic_inc(&io->io_count);
12731314 sector = clone->bi_iter.bi_sector;
12741315
1316
+ if (unlikely(swap_bios_limit(ti, clone))) {
1317
+ struct mapped_device *md = io->md;
1318
+ int latch = get_swap_bios();
1319
+ if (unlikely(latch != md->swap_bios))
1320
+ __set_swap_bios_limit(md, latch);
1321
+ down(&md->swap_bios_semaphore);
1322
+ }
1323
+
12751324 r = ti->type->map(ti, clone);
12761325 switch (r) {
12771326 case DM_MAPIO_SUBMITTED:
@@ -1280,16 +1329,21 @@
12801329 /* the bio has been remapped so dispatch it */
12811330 trace_block_bio_remap(clone->bi_disk->queue, clone,
12821331 bio_dev(io->orig_bio), sector);
1283
- if (md->type == DM_TYPE_NVME_BIO_BASED)
1284
- ret = direct_make_request(clone);
1285
- else
1286
- ret = generic_make_request(clone);
1332
+ ret = submit_bio_noacct(clone);
12871333 break;
12881334 case DM_MAPIO_KILL:
1335
+ if (unlikely(swap_bios_limit(ti, clone))) {
1336
+ struct mapped_device *md = io->md;
1337
+ up(&md->swap_bios_semaphore);
1338
+ }
12891339 free_tio(tio);
12901340 dec_pending(io, BLK_STS_IOERR);
12911341 break;
12921342 case DM_MAPIO_REQUEUE:
1343
+ if (unlikely(swap_bios_limit(ti, clone))) {
1344
+ struct mapped_device *md = io->md;
1345
+ up(&md->swap_bios_semaphore);
1346
+ }
12931347 free_tio(tio);
12941348 dec_pending(io, BLK_STS_DM_REQUEUE);
12951349 break;
@@ -1314,13 +1368,15 @@
13141368 sector_t sector, unsigned len)
13151369 {
13161370 struct bio *clone = &tio->clone;
1371
+ int r;
13171372
13181373 __bio_clone_fast(clone, bio);
13191374
1320
- bio_crypt_clone(clone, bio, GFP_NOIO);
1375
+ r = bio_crypt_clone(clone, bio, GFP_NOIO);
1376
+ if (r < 0)
1377
+ return r;
13211378
1322
- if (unlikely(bio_integrity(bio) != NULL)) {
1323
- int r;
1379
+ if (bio_integrity(bio)) {
13241380 if (unlikely(!dm_target_has_integrity(tio->ti->type) &&
13251381 !dm_target_passes_integrity(tio->ti->type))) {
13261382 DMWARN("%s: the target %s doesn't support integrity data.",
....@@ -1334,11 +1390,10 @@
13341390 return r;
13351391 }
13361392
1337
- if (bio_op(bio) != REQ_OP_ZONE_REPORT)
1338
- bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
1393
+ bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
13391394 clone->bi_iter.bi_size = to_bytes(len);
13401395
1341
- if (unlikely(bio_integrity(bio) != NULL))
1396
+ if (bio_integrity(bio))
13421397 bio_integrity_trim(clone);
13431398
13441399 return 0;
@@ -1417,11 +1472,32 @@
14171472 {
14181473 unsigned target_nr = 0;
14191474 struct dm_target *ti;
1475
+ struct bio flush_bio;
1476
+
1477
+ /*
1478
+ * Use an on-stack bio for this, it's safe since we don't
1479
+ * need to reference it after submit. It's just used as
1480
+ * the basis for the clone(s).
1481
+ */
1482
+ bio_init(&flush_bio, NULL, 0);
1483
+ flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
1484
+ ci->bio = &flush_bio;
1485
+ ci->sector_count = 0;
1486
+
1487
+ /*
1488
+ * Empty flush uses a statically initialized bio, as the base for
1489
+ * cloning. However, blkg association requires that a bdev is
1490
+ * associated with a gendisk, which doesn't happen until the bdev is
1491
+ * opened. So, blkg association is done at issue time of the flush
1492
+ * rather than when the device is created in alloc_dev().
1493
+ */
1494
+ bio_set_dev(ci->bio, ci->io->md->bdev);
14201495
14211496 BUG_ON(bio_has_data(ci->bio));
14221497 while ((ti = dm_table_get_target(ci->map, target_nr++)))
14231498 __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
14241499
1500
+ bio_uninit(ci->bio);
14251501 return 0;
14261502 }
14271503
@@ -1444,41 +1520,10 @@
14441520 return 0;
14451521 }
14461522
1447
-typedef unsigned (*get_num_bios_fn)(struct dm_target *ti);
1448
-
1449
-static unsigned get_num_discard_bios(struct dm_target *ti)
1450
-{
1451
- return ti->num_discard_bios;
1452
-}
1453
-
1454
-static unsigned get_num_secure_erase_bios(struct dm_target *ti)
1455
-{
1456
- return ti->num_secure_erase_bios;
1457
-}
1458
-
1459
-static unsigned get_num_write_same_bios(struct dm_target *ti)
1460
-{
1461
- return ti->num_write_same_bios;
1462
-}
1463
-
1464
-static unsigned get_num_write_zeroes_bios(struct dm_target *ti)
1465
-{
1466
- return ti->num_write_zeroes_bios;
1467
-}
1468
-
1469
-typedef bool (*is_split_required_fn)(struct dm_target *ti);
1470
-
1471
-static bool is_split_required_for_discard(struct dm_target *ti)
1472
-{
1473
- return ti->split_discard_bios;
1474
-}
1475
-
14761523 static int __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti,
1477
- get_num_bios_fn get_num_bios,
1478
- is_split_required_fn is_split_required)
1524
+ unsigned num_bios)
14791525 {
14801526 unsigned len;
1481
- unsigned num_bios;
14821527
14831528 /*
14841529 * Even though the device advertised support for this type of
@@ -1486,14 +1531,11 @@
14861531 * reconfiguration might also have changed that since the
14871532 * check was performed.
14881533 */
1489
- num_bios = get_num_bios ? get_num_bios(ti) : 0;
14901534 if (!num_bios)
14911535 return -EOPNOTSUPP;
14921536
1493
- if (is_split_required && !is_split_required(ti))
1494
- len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
1495
- else
1496
- len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti));
1537
+ len = min_t(sector_t, ci->sector_count,
1538
+ max_io_len_target_boundary(ti, dm_target_offset(ti, ci->sector)));
14971539
14981540 __send_duplicate_bios(ci, ti, num_bios, &len);
14991541
@@ -1503,43 +1545,46 @@
15031545 return 0;
15041546 }
15051547
1506
-static int __send_discard(struct clone_info *ci, struct dm_target *ti)
1548
+static bool is_abnormal_io(struct bio *bio)
15071549 {
1508
- return __send_changing_extent_only(ci, ti, get_num_discard_bios,
1509
- is_split_required_for_discard);
1510
-}
1550
+ bool r = false;
15111551
1512
-static int __send_secure_erase(struct clone_info *ci, struct dm_target *ti)
1513
-{
1514
- return __send_changing_extent_only(ci, ti, get_num_secure_erase_bios, NULL);
1515
-}
1552
+ switch (bio_op(bio)) {
1553
+ case REQ_OP_DISCARD:
1554
+ case REQ_OP_SECURE_ERASE:
1555
+ case REQ_OP_WRITE_SAME:
1556
+ case REQ_OP_WRITE_ZEROES:
1557
+ r = true;
1558
+ break;
1559
+ }
15161560
1517
-static int __send_write_same(struct clone_info *ci, struct dm_target *ti)
1518
-{
1519
- return __send_changing_extent_only(ci, ti, get_num_write_same_bios, NULL);
1520
-}
1521
-
1522
-static int __send_write_zeroes(struct clone_info *ci, struct dm_target *ti)
1523
-{
1524
- return __send_changing_extent_only(ci, ti, get_num_write_zeroes_bios, NULL);
1561
+ return r;
15251562 }
15261563
15271564 static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti,
15281565 int *result)
15291566 {
15301567 struct bio *bio = ci->bio;
1568
+ unsigned num_bios = 0;
15311569
1532
- if (bio_op(bio) == REQ_OP_DISCARD)
1533
- *result = __send_discard(ci, ti);
1534
- else if (bio_op(bio) == REQ_OP_SECURE_ERASE)
1535
- *result = __send_secure_erase(ci, ti);
1536
- else if (bio_op(bio) == REQ_OP_WRITE_SAME)
1537
- *result = __send_write_same(ci, ti);
1538
- else if (bio_op(bio) == REQ_OP_WRITE_ZEROES)
1539
- *result = __send_write_zeroes(ci, ti);
1540
- else
1570
+ switch (bio_op(bio)) {
1571
+ case REQ_OP_DISCARD:
1572
+ num_bios = ti->num_discard_bios;
1573
+ break;
1574
+ case REQ_OP_SECURE_ERASE:
1575
+ num_bios = ti->num_secure_erase_bios;
1576
+ break;
1577
+ case REQ_OP_WRITE_SAME:
1578
+ num_bios = ti->num_write_same_bios;
1579
+ break;
1580
+ case REQ_OP_WRITE_ZEROES:
1581
+ num_bios = ti->num_write_zeroes_bios;
1582
+ break;
1583
+ default:
15411584 return false;
1585
+ }
15421586
1587
+ *result = __send_changing_extent_only(ci, ti, num_bios);
15431588 return true;
15441589 }
15451590
@@ -1548,23 +1593,18 @@
15481593 */
15491594 static int __split_and_process_non_flush(struct clone_info *ci)
15501595 {
1551
- struct bio *bio = ci->bio;
15521596 struct dm_target *ti;
15531597 unsigned len;
15541598 int r;
15551599
15561600 ti = dm_table_find_target(ci->map, ci->sector);
1557
- if (!dm_target_is_valid(ti))
1601
+ if (!ti)
15581602 return -EIO;
15591603
1560
- if (unlikely(__process_abnormal_io(ci, ti, &r)))
1604
+ if (__process_abnormal_io(ci, ti, &r))
15611605 return r;
15621606
1563
- if (bio_op(bio) == REQ_OP_ZONE_REPORT)
1564
- len = ci->sector_count;
1565
- else
1566
- len = min_t(sector_t, max_io_len(ci->sector, ti),
1567
- ci->sector_count);
1607
+ len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count);
15681608
15691609 r = __clone_and_map_data_bio(ci, ti, ci->sector, &len);
15701610 if (r < 0)
@@ -1584,6 +1624,9 @@
15841624 ci->sector = bio->bi_iter.bi_sector;
15851625 }
15861626
1627
+#define __dm_part_stat_sub(part, field, subnd) \
1628
+ (part_stat_get(part, field) -= (subnd))
1629
+
15871630 /*
15881631 * Entry point to split a bio into clones and submit them to the targets.
15891632 */
@@ -1594,21 +1637,12 @@
15941637 blk_qc_t ret = BLK_QC_T_NONE;
15951638 int error = 0;
15961639
1597
- if (unlikely(!map)) {
1598
- bio_io_error(bio);
1599
- return ret;
1600
- }
1601
-
1602
- blk_queue_split(md->queue, &bio);
1603
-
16041640 init_clone_info(&ci, md, map, bio);
16051641
16061642 if (bio->bi_opf & REQ_PREFLUSH) {
1607
- ci.bio = &ci.io->md->flush_bio;
1608
- ci.sector_count = 0;
16091643 error = __send_empty_flush(&ci);
16101644 /* dec_pending submits any data associated with flush */
1611
- } else if (bio_op(bio) == REQ_OP_ZONE_RESET) {
1645
+ } else if (op_is_zone_mgmt(bio_op(bio))) {
16121646 ci.bio = bio;
16131647 ci.sector_count = 0;
16141648 error = __split_and_process_non_flush(&ci);
....@@ -1619,21 +1653,32 @@
16191653 error = __split_and_process_non_flush(&ci);
16201654 if (current->bio_list && ci.sector_count && !error) {
16211655 /*
1622
- * Remainder must be passed to generic_make_request()
1656
+ * Remainder must be passed to submit_bio_noacct()
16231657 * so that it gets handled *after* bios already submitted
16241658 * have been completely processed.
16251659 * We take a clone of the original to store in
16261660 * ci.io->orig_bio to be used by end_io_acct() and
16271661 * for dec_pending to use for completion handling.
1628
- * As this path is not used for REQ_OP_ZONE_REPORT,
1629
- * the usage of io->orig_bio in dm_remap_zone_report()
1630
- * won't be affected by this reassignment.
16311662 */
16321663 struct bio *b = bio_split(bio, bio_sectors(bio) - ci.sector_count,
16331664 GFP_NOIO, &md->queue->bio_split);
16341665 ci.io->orig_bio = b;
1666
+
1667
+ /*
1668
+ * Adjust IO stats for each split, otherwise upon queue
1669
+ * reentry there will be redundant IO accounting.
1670
+ * NOTE: this is a stop-gap fix, a proper fix involves
1671
+ * significant refactoring of DM core's bio splitting
1672
+ * (by eliminating DM's splitting and just using bio_split)
1673
+ */
1674
+ part_stat_lock();
1675
+ __dm_part_stat_sub(&dm_disk(md)->part0,
1676
+ sectors[op_stat_group(bio_op(bio))], ci.sector_count);
1677
+ part_stat_unlock();
1678
+
16351679 bio_chain(b, bio);
1636
- ret = generic_make_request(bio);
1680
+ trace_block_split(md->queue, b, bio->bi_iter.bi_sector);
1681
+ ret = submit_bio_noacct(bio);
16371682 break;
16381683 }
16391684 }
@@ -1644,121 +1689,38 @@
16441689 return ret;
16451690 }
16461691
1647
-/*
1648
- * Optimized variant of __split_and_process_bio that leverages the
1649
- * fact that targets that use it do _not_ have a need to split bios.
1650
- */
1651
-static blk_qc_t __process_bio(struct mapped_device *md,
1652
- struct dm_table *map, struct bio *bio)
1692
+static blk_qc_t dm_submit_bio(struct bio *bio)
16531693 {
1654
- struct clone_info ci;
1655
- blk_qc_t ret = BLK_QC_T_NONE;
1656
- int error = 0;
1657
-
1658
- if (unlikely(!map)) {
1659
- bio_io_error(bio);
1660
- return ret;
1661
- }
1662
-
1663
- init_clone_info(&ci, md, map, bio);
1664
-
1665
- if (bio->bi_opf & REQ_PREFLUSH) {
1666
- ci.bio = &ci.io->md->flush_bio;
1667
- ci.sector_count = 0;
1668
- error = __send_empty_flush(&ci);
1669
- /* dec_pending submits any data associated with flush */
1670
- } else {
1671
- struct dm_target *ti = md->immutable_target;
1672
- struct dm_target_io *tio;
1673
-
1674
- /*
1675
- * Defend against IO still getting in during teardown
1676
- * - as was seen for a time with nvme-fcloop
1677
- */
1678
- if (unlikely(WARN_ON_ONCE(!ti || !dm_target_is_valid(ti)))) {
1679
- error = -EIO;
1680
- goto out;
1681
- }
1682
-
1683
- ci.bio = bio;
1684
- ci.sector_count = bio_sectors(bio);
1685
- if (unlikely(__process_abnormal_io(&ci, ti, &error)))
1686
- goto out;
1687
-
1688
- tio = alloc_tio(&ci, ti, 0, GFP_NOIO);
1689
- ret = __clone_and_map_simple_bio(&ci, tio, NULL);
1690
- }
1691
-out:
1692
- /* drop the extra reference count */
1693
- dec_pending(ci.io, errno_to_blk_status(error));
1694
- return ret;
1695
-}
1696
-
1697
-typedef blk_qc_t (process_bio_fn)(struct mapped_device *, struct dm_table *, struct bio *);
1698
-
1699
-static blk_qc_t __dm_make_request(struct request_queue *q, struct bio *bio,
1700
- process_bio_fn process_bio)
1701
-{
1702
- struct mapped_device *md = q->queuedata;
1694
+ struct mapped_device *md = bio->bi_disk->private_data;
17031695 blk_qc_t ret = BLK_QC_T_NONE;
17041696 int srcu_idx;
17051697 struct dm_table *map;
17061698
17071699 map = dm_get_live_table(md, &srcu_idx);
17081700
1709
- /* if we're suspended, we have to queue this io for later */
1710
- if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
1711
- dm_put_live_table(md, srcu_idx);
1712
-
1713
- if (!(bio->bi_opf & REQ_RAHEAD))
1714
- queue_io(md, bio);
1715
- else
1701
+ /* If suspended, or map not yet available, queue this IO for later */
1702
+ if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) ||
1703
+ unlikely(!map)) {
1704
+ if (bio->bi_opf & REQ_NOWAIT)
1705
+ bio_wouldblock_error(bio);
1706
+ else if (bio->bi_opf & REQ_RAHEAD)
17161707 bio_io_error(bio);
1717
- return ret;
1708
+ else
1709
+ queue_io(md, bio);
1710
+ goto out;
17181711 }
17191712
1720
- ret = process_bio(md, map, bio);
1713
+ /*
1714
+ * Use blk_queue_split() for abnormal IO (e.g. discard, writesame, etc)
1715
+ * otherwise associated queue_limits won't be imposed.
1716
+ */
1717
+ if (is_abnormal_io(bio))
1718
+ blk_queue_split(&bio);
17211719
1720
+ ret = __split_and_process_bio(md, map, bio);
1721
+out:
17221722 dm_put_live_table(md, srcu_idx);
17231723 return ret;
1724
-}
1725
-
1726
-/*
1727
- * The request function that remaps the bio to one target and
1728
- * splits off any remainder.
1729
- */
1730
-static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio)
1731
-{
1732
- return __dm_make_request(q, bio, __split_and_process_bio);
1733
-}
1734
-
1735
-static blk_qc_t dm_make_request_nvme(struct request_queue *q, struct bio *bio)
1736
-{
1737
- return __dm_make_request(q, bio, __process_bio);
1738
-}
1739
-
1740
-static int dm_any_congested(void *congested_data, int bdi_bits)
1741
-{
1742
- int r = bdi_bits;
1743
- struct mapped_device *md = congested_data;
1744
- struct dm_table *map;
1745
-
1746
- if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
1747
- if (dm_request_based(md)) {
1748
- /*
1749
- * With request-based DM we only need to check the
1750
- * top-level queue for congestion.
1751
- */
1752
- r = md->queue->backing_dev_info->wb.state & bdi_bits;
1753
- } else {
1754
- map = dm_get_live_table_fast(md);
1755
- if (map)
1756
- r = dm_table_any_congested(map, bdi_bits);
1757
- dm_put_live_table_fast(md);
1758
- }
1759
- }
1760
-
1761
- return r;
17621724 }
17631725
17641726 /*-----------------------------------------------------------------
....@@ -1811,29 +1773,28 @@
18111773 }
18121774
18131775 static const struct block_device_operations dm_blk_dops;
1776
+static const struct block_device_operations dm_rq_blk_dops;
18141777 static const struct dax_operations dm_dax_ops;
18151778
18161779 static void dm_wq_work(struct work_struct *work);
18171780
1818
-static void dm_init_normal_md_queue(struct mapped_device *md)
1781
+#ifdef CONFIG_BLK_INLINE_ENCRYPTION
1782
+static void dm_queue_destroy_keyslot_manager(struct request_queue *q)
18191783 {
1820
- md->use_blk_mq = false;
1821
-
1822
- /*
1823
- * Initialize aspects of queue that aren't relevant for blk-mq
1824
- */
1825
- md->queue->backing_dev_info->congested_data = md;
1826
- md->queue->backing_dev_info->congested_fn = dm_any_congested;
1784
+ dm_destroy_keyslot_manager(q->ksm);
18271785 }
18281786
1829
-static void dm_destroy_inline_encryption(struct request_queue *q);
1787
+#else /* CONFIG_BLK_INLINE_ENCRYPTION */
1788
+
1789
+static inline void dm_queue_destroy_keyslot_manager(struct request_queue *q)
1790
+{
1791
+}
1792
+#endif /* !CONFIG_BLK_INLINE_ENCRYPTION */
18301793
18311794 static void cleanup_mapped_device(struct mapped_device *md)
18321795 {
18331796 if (md->wq)
18341797 destroy_workqueue(md->wq);
1835
- if (md->kworker_task)
1836
- kthread_stop(md->kworker_task);
18371798 bioset_exit(&md->bs);
18381799 bioset_exit(&md->io_bs);
18391800
@@ -1852,7 +1813,7 @@
18521813 }
18531814
18541815 if (md->queue) {
1855
- dm_destroy_inline_encryption(md->queue);
1816
+ dm_queue_destroy_keyslot_manager(md->queue);
18561817 blk_cleanup_queue(md->queue);
18571818 }
18581819
@@ -1866,6 +1827,7 @@
18661827 mutex_destroy(&md->suspend_lock);
18671828 mutex_destroy(&md->type_lock);
18681829 mutex_destroy(&md->table_devices_lock);
1830
+ mutex_destroy(&md->swap_bios_lock);
18691831
18701832 dm_mq_cleanup_mapped_device(md);
18711833 }
@@ -1876,7 +1838,6 @@
18761838 static struct mapped_device *alloc_dev(int minor)
18771839 {
18781840 int r, numa_node_id = dm_get_numa_node();
1879
- struct dax_device *dax_dev = NULL;
18801841 struct mapped_device *md;
18811842 void *old_md;
18821843
@@ -1902,7 +1863,6 @@
19021863 goto bad_io_barrier;
19031864
19041865 md->numa_node_id = numa_node_id;
1905
- md->use_blk_mq = dm_use_blk_mq_default();
19061866 md->init_tio_pdu = false;
19071867 md->type = DM_TYPE_NONE;
19081868 mutex_init(&md->suspend_lock);
@@ -1917,28 +1877,27 @@
19171877 INIT_LIST_HEAD(&md->table_devices);
19181878 spin_lock_init(&md->uevent_lock);
19191879
1920
- md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id, NULL);
1880
+ /*
1881
+ * default to bio-based until DM table is loaded and md->type
1882
+ * established. If request-based table is loaded: blk-mq will
1883
+ * override accordingly.
1884
+ */
1885
+ md->queue = blk_alloc_queue(numa_node_id);
19211886 if (!md->queue)
19221887 goto bad;
1923
- md->queue->queuedata = md;
1924
- /*
1925
- * default to bio-based required ->make_request_fn until DM
1926
- * table is loaded and md->type established. If request-based
1927
- * table is loaded: blk-mq will override accordingly.
1928
- */
1929
- blk_queue_make_request(md->queue, dm_make_request);
19301888
19311889 md->disk = alloc_disk_node(1, md->numa_node_id);
19321890 if (!md->disk)
19331891 goto bad;
19341892
1935
- atomic_set(&md->pending[0], 0);
1936
- atomic_set(&md->pending[1], 0);
19371893 init_waitqueue_head(&md->wait);
19381894 INIT_WORK(&md->work, dm_wq_work);
19391895 init_waitqueue_head(&md->eventq);
19401896 init_completion(&md->kobj_holder.completion);
1941
- md->kworker_task = NULL;
1897
+
1898
+ md->swap_bios = get_swap_bios();
1899
+ sema_init(&md->swap_bios_semaphore, md->swap_bios);
1900
+ mutex_init(&md->swap_bios_lock);
19421901
19431902 md->disk->major = _major;
19441903 md->disk->first_minor = minor;
@@ -1948,11 +1907,13 @@
19481907 sprintf(md->disk->disk_name, "dm-%d", minor);
19491908
19501909 if (IS_ENABLED(CONFIG_DAX_DRIVER)) {
1951
- dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops);
1952
- if (!dax_dev)
1910
+ md->dax_dev = alloc_dax(md, md->disk->disk_name,
1911
+ &dm_dax_ops, 0);
1912
+ if (IS_ERR(md->dax_dev)) {
1913
+ md->dax_dev = NULL;
19531914 goto bad;
1915
+ }
19541916 }
1955
- md->dax_dev = dax_dev;
19561917
19571918 add_disk_no_queue_reg(md->disk);
19581919 format_dev_t(md->name, MKDEV(_major, minor));
@@ -1965,11 +1926,9 @@
19651926 if (!md->bdev)
19661927 goto bad;
19671928
1968
- bio_init(&md->flush_bio, NULL, 0);
1969
- bio_set_dev(&md->flush_bio, md->bdev);
1970
- md->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
1971
-
1972
- dm_stats_init(&md->stats);
1929
+ r = dm_stats_init(&md->stats);
1930
+ if (r < 0)
1931
+ goto bad;
19731932
19741933 /* Populate the mapping, nobody knows we exist yet */
19751934 spin_lock(&_minor_lock);
@@ -2072,18 +2031,6 @@
20722031 }
20732032
20742033 /*
2075
- * Protected by md->suspend_lock obtained by dm_swap_table().
2076
- */
2077
-static void __set_size(struct mapped_device *md, sector_t size)
2078
-{
2079
- lockdep_assert_held(&md->suspend_lock);
2080
-
2081
- set_capacity(md->disk, size);
2082
-
2083
- i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
2084
-}
2085
-
2086
-/*
20872034 * Returns old map, which caller must destroy.
20882035 */
20892036 static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
@@ -2105,7 +2052,8 @@
21052052 if (size != dm_get_size(md))
21062053 memset(&md->geometry, 0, sizeof(md->geometry));
21072054
2108
- __set_size(md, size);
2055
+ set_capacity(md->disk, size);
2056
+ bd_set_nr_sectors(md->bdev, size);
21092057
21102058 dm_table_event_callback(t, event_callback, md);
21112059
@@ -2119,12 +2067,10 @@
21192067 if (request_based)
21202068 dm_stop_queue(q);
21212069
2122
- if (request_based || md->type == DM_TYPE_NVME_BIO_BASED) {
2070
+ if (request_based) {
21232071 /*
2124
- * Leverage the fact that request-based DM targets and
2125
- * NVMe bio based targets are immutable singletons
2126
- * - used to optimize both dm_request_fn and dm_mq_queue_rq;
2127
- * and __process_bio.
2072
+ * Leverage the fact that request-based DM targets are
2073
+ * immutable singletons - used to optimize dm_mq_queue_rq.
21282074 */
21292075 md->immutable_target = dm_table_get_immutable_target(t);
21302076 }
@@ -2227,166 +2173,6 @@
22272173 }
22282174 EXPORT_SYMBOL_GPL(dm_get_queue_limits);
22292175
2230
-#ifdef CONFIG_BLK_INLINE_ENCRYPTION
2231
-struct dm_keyslot_evict_args {
2232
- const struct blk_crypto_key *key;
2233
- int err;
2234
-};
2235
-
2236
-static int dm_keyslot_evict_callback(struct dm_target *ti, struct dm_dev *dev,
2237
- sector_t start, sector_t len, void *data)
2238
-{
2239
- struct dm_keyslot_evict_args *args = data;
2240
- int err;
2241
-
2242
- err = blk_crypto_evict_key(dev->bdev->bd_queue, args->key);
2243
- if (!args->err)
2244
- args->err = err;
2245
- /* Always try to evict the key from all devices. */
2246
- return 0;
2247
-}
2248
-
2249
-/*
2250
- * When an inline encryption key is evicted from a device-mapper device, evict
2251
- * it from all the underlying devices.
2252
- */
2253
-static int dm_keyslot_evict(struct keyslot_manager *ksm,
2254
- const struct blk_crypto_key *key, unsigned int slot)
2255
-{
2256
- struct mapped_device *md = keyslot_manager_private(ksm);
2257
- struct dm_keyslot_evict_args args = { key };
2258
- struct dm_table *t;
2259
- int srcu_idx;
2260
- int i;
2261
- struct dm_target *ti;
2262
-
2263
- t = dm_get_live_table(md, &srcu_idx);
2264
- if (!t)
2265
- return 0;
2266
- for (i = 0; i < dm_table_get_num_targets(t); i++) {
2267
- ti = dm_table_get_target(t, i);
2268
- if (!ti->type->iterate_devices)
2269
- continue;
2270
- ti->type->iterate_devices(ti, dm_keyslot_evict_callback, &args);
2271
- }
2272
- dm_put_live_table(md, srcu_idx);
2273
- return args.err;
2274
-}
2275
-
2276
-struct dm_derive_raw_secret_args {
2277
- const u8 *wrapped_key;
2278
- unsigned int wrapped_key_size;
2279
- u8 *secret;
2280
- unsigned int secret_size;
2281
- int err;
2282
-};
2283
-
2284
-static int dm_derive_raw_secret_callback(struct dm_target *ti,
2285
- struct dm_dev *dev, sector_t start,
2286
- sector_t len, void *data)
2287
-{
2288
- struct dm_derive_raw_secret_args *args = data;
2289
- struct request_queue *q = dev->bdev->bd_queue;
2290
-
2291
- if (!args->err)
2292
- return 0;
2293
-
2294
- if (!q->ksm) {
2295
- args->err = -EOPNOTSUPP;
2296
- return 0;
2297
- }
2298
-
2299
- args->err = keyslot_manager_derive_raw_secret(q->ksm, args->wrapped_key,
2300
- args->wrapped_key_size,
2301
- args->secret,
2302
- args->secret_size);
2303
- /* Try another device in case this fails. */
2304
- return 0;
2305
-}
2306
-
2307
-/*
2308
- * Retrieve the raw_secret from the underlying device. Given that
2309
- * only only one raw_secret can exist for a particular wrappedkey,
2310
- * retrieve it only from the first device that supports derive_raw_secret()
2311
- */
2312
-static int dm_derive_raw_secret(struct keyslot_manager *ksm,
2313
- const u8 *wrapped_key,
2314
- unsigned int wrapped_key_size,
2315
- u8 *secret, unsigned int secret_size)
2316
-{
2317
- struct mapped_device *md = keyslot_manager_private(ksm);
2318
- struct dm_derive_raw_secret_args args = {
2319
- .wrapped_key = wrapped_key,
2320
- .wrapped_key_size = wrapped_key_size,
2321
- .secret = secret,
2322
- .secret_size = secret_size,
2323
- .err = -EOPNOTSUPP,
2324
- };
2325
- struct dm_table *t;
2326
- int srcu_idx;
2327
- int i;
2328
- struct dm_target *ti;
2329
-
2330
- t = dm_get_live_table(md, &srcu_idx);
2331
- if (!t)
2332
- return -EOPNOTSUPP;
2333
- for (i = 0; i < dm_table_get_num_targets(t); i++) {
2334
- ti = dm_table_get_target(t, i);
2335
- if (!ti->type->iterate_devices)
2336
- continue;
2337
- ti->type->iterate_devices(ti, dm_derive_raw_secret_callback,
2338
- &args);
2339
- if (!args.err)
2340
- break;
2341
- }
2342
- dm_put_live_table(md, srcu_idx);
2343
- return args.err;
2344
-}
2345
-
2346
-static struct keyslot_mgmt_ll_ops dm_ksm_ll_ops = {
2347
- .keyslot_evict = dm_keyslot_evict,
2348
- .derive_raw_secret = dm_derive_raw_secret,
2349
-};
2350
-
2351
-static int dm_init_inline_encryption(struct mapped_device *md)
2352
-{
2353
- unsigned int features;
2354
- unsigned int mode_masks[BLK_ENCRYPTION_MODE_MAX];
2355
-
2356
- /*
2357
- * Initially declare support for all crypto settings. Anything
2358
- * unsupported by a child device will be removed later when calculating
2359
- * the device restrictions.
2360
- */
2361
- features = BLK_CRYPTO_FEATURE_STANDARD_KEYS |
2362
- BLK_CRYPTO_FEATURE_WRAPPED_KEYS;
2363
- memset(mode_masks, 0xFF, sizeof(mode_masks));
2364
-
2365
- md->queue->ksm = keyslot_manager_create_passthrough(NULL,
2366
- &dm_ksm_ll_ops,
2367
- features,
2368
- mode_masks, md);
2369
- if (!md->queue->ksm)
2370
- return -ENOMEM;
2371
- return 0;
2372
-}
2373
-
2374
-static void dm_destroy_inline_encryption(struct request_queue *q)
2375
-{
2376
- keyslot_manager_destroy(q->ksm);
2377
- q->ksm = NULL;
2378
-}
2379
-#else /* CONFIG_BLK_INLINE_ENCRYPTION */
2380
-static inline int dm_init_inline_encryption(struct mapped_device *md)
2381
-{
2382
- return 0;
2383
-}
2384
-
2385
-static inline void dm_destroy_inline_encryption(struct request_queue *q)
2386
-{
2387
-}
2388
-#endif /* !CONFIG_BLK_INLINE_ENCRYPTION */
2389
-
23902176 /*
23912177 * Setup the DM device's queue based on md's type
23922178 */
@@ -2398,27 +2184,15 @@
23982184
23992185 switch (type) {
24002186 case DM_TYPE_REQUEST_BASED:
2401
- dm_init_normal_md_queue(md);
2402
- r = dm_old_init_request_queue(md, t);
2403
- if (r) {
2404
- DMERR("Cannot initialize queue for request-based mapped device");
2405
- return r;
2406
- }
2407
- break;
2408
- case DM_TYPE_MQ_REQUEST_BASED:
2187
+ md->disk->fops = &dm_rq_blk_dops;
24092188 r = dm_mq_init_request_queue(md, t);
24102189 if (r) {
2411
- DMERR("Cannot initialize queue for request-based dm-mq mapped device");
2190
+ DMERR("Cannot initialize queue for request-based dm mapped device");
24122191 return r;
24132192 }
24142193 break;
24152194 case DM_TYPE_BIO_BASED:
24162195 case DM_TYPE_DAX_BIO_BASED:
2417
- dm_init_normal_md_queue(md);
2418
- break;
2419
- case DM_TYPE_NVME_BIO_BASED:
2420
- dm_init_normal_md_queue(md);
2421
- blk_queue_make_request(md->queue, dm_make_request_nvme);
24222196 break;
24232197 case DM_TYPE_NONE:
24242198 WARN_ON_ONCE(true);
@@ -2430,13 +2204,6 @@
24302204 DMERR("Cannot calculate initial queue limits");
24312205 return r;
24322206 }
2433
-
2434
- r = dm_init_inline_encryption(md);
2435
- if (r) {
2436
- DMERR("Cannot initialize inline encryption");
2437
- return r;
2438
- }
2439
-
24402207 dm_table_set_restrictions(t, md->queue, &limits);
24412208 blk_register_queue(md->disk);
24422209
@@ -2516,9 +2283,6 @@
25162283
25172284 blk_set_queue_dying(md->queue);
25182285
2519
- if (dm_request_based(md) && md->kworker_task)
2520
- kthread_flush_worker(&md->kworker);
2521
-
25222286 /*
25232287 * Take suspend_lock so that presuspend and postsuspend methods
25242288 * do not race with internal suspend.
@@ -2569,15 +2333,29 @@
25692333 }
25702334 EXPORT_SYMBOL_GPL(dm_put);
25712335
2572
-static int dm_wait_for_completion(struct mapped_device *md, long task_state)
2336
+static bool md_in_flight_bios(struct mapped_device *md)
2337
+{
2338
+ int cpu;
2339
+ struct hd_struct *part = &dm_disk(md)->part0;
2340
+ long sum = 0;
2341
+
2342
+ for_each_possible_cpu(cpu) {
2343
+ sum += part_stat_local_read_cpu(part, in_flight[0], cpu);
2344
+ sum += part_stat_local_read_cpu(part, in_flight[1], cpu);
2345
+ }
2346
+
2347
+ return sum != 0;
2348
+}
2349
+
2350
+static int dm_wait_for_bios_completion(struct mapped_device *md, long task_state)
25732351 {
25742352 int r = 0;
25752353 DEFINE_WAIT(wait);
25762354
2577
- while (1) {
2355
+ while (true) {
25782356 prepare_to_wait(&md->wait, &wait, task_state);
25792357
2580
- if (!md_in_flight(md))
2358
+ if (!md_in_flight_bios(md))
25812359 break;
25822360
25832361 if (signal_pending_state(task_state, current)) {
@@ -2589,6 +2367,30 @@
25892367 }
25902368 finish_wait(&md->wait, &wait);
25912369
2370
+ smp_rmb();
2371
+
2372
+ return r;
2373
+}
2374
+
2375
+static int dm_wait_for_completion(struct mapped_device *md, long task_state)
2376
+{
2377
+ int r = 0;
2378
+
2379
+ if (!queue_is_mq(md->queue))
2380
+ return dm_wait_for_bios_completion(md, task_state);
2381
+
2382
+ while (true) {
2383
+ if (!blk_mq_queue_inflight(md->queue))
2384
+ break;
2385
+
2386
+ if (signal_pending_state(task_state, current)) {
2387
+ r = -EINTR;
2388
+ break;
2389
+ }
2390
+
2391
+ msleep(5);
2392
+ }
2393
+
25922394 return r;
25932395 }
25942396
@@ -2597,29 +2399,20 @@
25972399 */
25982400 static void dm_wq_work(struct work_struct *work)
25992401 {
2600
- struct mapped_device *md = container_of(work, struct mapped_device,
2601
- work);
2602
- struct bio *c;
2603
- int srcu_idx;
2604
- struct dm_table *map;
2605
-
2606
- map = dm_get_live_table(md, &srcu_idx);
2402
+ struct mapped_device *md = container_of(work, struct mapped_device, work);
2403
+ struct bio *bio;
26072404
26082405 while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
26092406 spin_lock_irq(&md->deferred_lock);
2610
- c = bio_list_pop(&md->deferred);
2407
+ bio = bio_list_pop(&md->deferred);
26112408 spin_unlock_irq(&md->deferred_lock);
26122409
2613
- if (!c)
2410
+ if (!bio)
26142411 break;
26152412
2616
- if (dm_request_based(md))
2617
- generic_make_request(c);
2618
- else
2619
- __split_and_process_bio(md, map, c);
2413
+ submit_bio_noacct(bio);
2414
+ cond_resched();
26202415 }
2621
-
2622
- dm_put_live_table(md, srcu_idx);
26232416 }
26242417
26252418 static void dm_queue_flush(struct mapped_device *md)
@@ -2681,27 +2474,19 @@
26812474 {
26822475 int r;
26832476
2684
- WARN_ON(md->frozen_sb);
2477
+ WARN_ON(test_bit(DMF_FROZEN, &md->flags));
26852478
2686
- md->frozen_sb = freeze_bdev(md->bdev);
2687
- if (IS_ERR(md->frozen_sb)) {
2688
- r = PTR_ERR(md->frozen_sb);
2689
- md->frozen_sb = NULL;
2690
- return r;
2691
- }
2692
-
2693
- set_bit(DMF_FROZEN, &md->flags);
2694
-
2695
- return 0;
2479
+ r = freeze_bdev(md->bdev);
2480
+ if (!r)
2481
+ set_bit(DMF_FROZEN, &md->flags);
2482
+ return r;
26962483 }
26972484
26982485 static void unlock_fs(struct mapped_device *md)
26992486 {
27002487 if (!test_bit(DMF_FROZEN, &md->flags))
27012488 return;
2702
-
2703
- thaw_bdev(md->bdev, md->frozen_sb);
2704
- md->frozen_sb = NULL;
2489
+ thaw_bdev(md->bdev);
27052490 clear_bit(DMF_FROZEN, &md->flags);
27062491 }
27072492
@@ -2731,7 +2516,7 @@
27312516 if (noflush)
27322517 set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
27332518 else
2734
- pr_debug("%s: suspending with flush\n", dm_device_name(md));
2519
+ DMDEBUG("%s: suspending with flush", dm_device_name(md));
27352520
27362521 /*
27372522 * This gets reverted if there's an error later and the targets
@@ -2756,13 +2541,12 @@
27562541 /*
27572542 * Here we must make sure that no processes are submitting requests
27582543 * to target drivers i.e. no one may be executing
2759
- * __split_and_process_bio. This is called from dm_request and
2760
- * dm_wq_work.
2544
+ * __split_and_process_bio from dm_submit_bio.
27612545 *
2762
- * To get all processes out of __split_and_process_bio in dm_request,
2546
+ * To get all processes out of __split_and_process_bio in dm_submit_bio,
27632547 * we take the write lock. To prevent any process from reentering
2764
- * __split_and_process_bio from dm_request and quiesce the thread
2765
- * (dm_wq_work), we set BMF_BLOCK_IO_FOR_SUSPEND and call
2548
+ * __split_and_process_bio from dm_submit_bio and quiesce the thread
2549
+ * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call
27662550 * flush_workqueue(md->wq).
27672551 */
27682552 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
@@ -2773,11 +2557,8 @@
27732557 * Stop md->queue before flushing md->wq in case request-based
27742558 * dm defers requests to md->wq from md->queue.
27752559 */
2776
- if (dm_request_based(md)) {
2560
+ if (dm_request_based(md))
27772561 dm_stop_queue(md->queue);
2778
- if (md->kworker_task)
2779
- kthread_flush_worker(&md->kworker);
2780
- }
27812562
27822563 flush_workqueue(md->wq);
27832564
@@ -3133,19 +2914,19 @@
31332914
31342915 int dm_suspended(struct dm_target *ti)
31352916 {
3136
- return dm_suspended_md(dm_table_get_md(ti->table));
2917
+ return dm_suspended_md(ti->table->md);
31372918 }
31382919 EXPORT_SYMBOL_GPL(dm_suspended);
31392920
31402921 int dm_post_suspending(struct dm_target *ti)
31412922 {
3142
- return dm_post_suspending_md(dm_table_get_md(ti->table));
2923
+ return dm_post_suspending_md(ti->table->md);
31432924 }
31442925 EXPORT_SYMBOL_GPL(dm_post_suspending);
31452926
31462927 int dm_noflush_suspending(struct dm_target *ti)
31472928 {
3148
- return __noflush_suspending(dm_table_get_md(ti->table));
2929
+ return __noflush_suspending(ti->table->md);
31492930 }
31502931 EXPORT_SYMBOL_GPL(dm_noflush_suspending);
31512932
@@ -3164,7 +2945,6 @@
31642945 switch (type) {
31652946 case DM_TYPE_BIO_BASED:
31662947 case DM_TYPE_DAX_BIO_BASED:
3167
- case DM_TYPE_NVME_BIO_BASED:
31682948 pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size);
31692949 front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
31702950 io_front_pad = roundup(front_pad, __alignof__(struct dm_io)) + offsetof(struct dm_io, tio);
@@ -3175,7 +2955,6 @@
31752955 goto out;
31762956 break;
31772957 case DM_TYPE_REQUEST_BASED:
3178
- case DM_TYPE_MQ_REQUEST_BASED:
31792958 pool_size = max(dm_get_reserved_rq_based_ios(), min_pool_size);
31802959 front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
31812960 /* per_io_data_size is used for blk-mq pdu at queue allocation */
@@ -3233,6 +3012,11 @@
32333012 if (dm_table_get_num_targets(table) != 1)
32343013 goto out;
32353014 ti = dm_table_get_target(table, 0);
3015
+
3016
+ if (dm_suspended_md(md)) {
3017
+ ret = -EAGAIN;
3018
+ goto out;
3019
+ }
32363020
32373021 ret = -EINVAL;
32383022 if (!ti->type->iterate_devices)
@@ -3373,6 +3157,17 @@
33733157 };
33743158
33753159 static const struct block_device_operations dm_blk_dops = {
3160
+ .submit_bio = dm_submit_bio,
3161
+ .open = dm_blk_open,
3162
+ .release = dm_blk_close,
3163
+ .ioctl = dm_blk_ioctl,
3164
+ .getgeo = dm_blk_getgeo,
3165
+ .report_zones = dm_blk_report_zones,
3166
+ .pr_ops = &dm_pr_ops,
3167
+ .owner = THIS_MODULE
3168
+};
3169
+
3170
+static const struct block_device_operations dm_rq_blk_dops = {
33763171 .open = dm_blk_open,
33773172 .release = dm_blk_close,
33783173 .ioctl = dm_blk_ioctl,
@@ -3383,8 +3178,10 @@
33833178
33843179 static const struct dax_operations dm_dax_ops = {
33853180 .direct_access = dm_dax_direct_access,
3181
+ .dax_supported = dm_dax_supported,
33863182 .copy_from_iter = dm_dax_copy_from_iter,
33873183 .copy_to_iter = dm_dax_copy_to_iter,
3184
+ .zero_page_range = dm_dax_zero_page_range,
33883185 };
33893186
33903187 /*
@@ -3402,6 +3199,9 @@
34023199 module_param(dm_numa_node, int, S_IRUGO | S_IWUSR);
34033200 MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations");
34043201
3202
+module_param(swap_bios, int, S_IRUGO | S_IWUSR);
3203
+MODULE_PARM_DESC(swap_bios, "Maximum allowed inflight swap IOs");
3204
+
34053205 MODULE_DESCRIPTION(DM_NAME " driver");
34063206 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
34073207 MODULE_LICENSE("GPL");