
hc
2023-12-09 95099d4622f8cb224d94e314c7a8e0df60b13f87
kernel/drivers/md/dm.c
@@ -26,6 +26,7 @@
 #include <linux/wait.h>
 #include <linux/pr.h>
 #include <linux/refcount.h>
+#include <linux/part_stat.h>
 #include <linux/blk-crypto.h>
 #include <linux/keyslot-manager.h>
 
@@ -148,6 +149,16 @@
 #define DM_NUMA_NODE NUMA_NO_NODE
 static int dm_numa_node = DM_NUMA_NODE;
 
+#define DEFAULT_SWAP_BIOS	(8 * 1048576 / PAGE_SIZE)
+static int swap_bios = DEFAULT_SWAP_BIOS;
+static int get_swap_bios(void)
+{
+	int latch = READ_ONCE(swap_bios);
+	if (unlikely(latch <= 0))
+		latch = DEFAULT_SWAP_BIOS;
+	return latch;
+}
+
 /*
  * For mempools pre-allocation at the table loading time.
  */
@@ -161,9 +172,6 @@
 	refcount_t count;
 	struct dm_dev dm_dev;
 };
-
-static struct kmem_cache *_rq_tio_cache;
-static struct kmem_cache *_rq_cache;
 
 /*
  * Bio-based DM's mempools' reserved IOs set by the user.
@@ -226,20 +234,11 @@
 
 static int __init local_init(void)
 {
-	int r = -ENOMEM;
-
-	_rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
-	if (!_rq_tio_cache)
-		return r;
-
-	_rq_cache = kmem_cache_create("dm_old_clone_request", sizeof(struct request),
-				      __alignof__(struct request), 0, NULL);
-	if (!_rq_cache)
-		goto out_free_rq_tio_cache;
+	int r;
 
 	r = dm_uevent_init();
 	if (r)
-		goto out_free_rq_cache;
+		return r;
 
 	deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
 	if (!deferred_remove_workqueue) {
@@ -261,10 +260,6 @@
 	destroy_workqueue(deferred_remove_workqueue);
 out_uevent_exit:
 	dm_uevent_exit();
-out_free_rq_cache:
-	kmem_cache_destroy(_rq_cache);
-out_free_rq_tio_cache:
-	kmem_cache_destroy(_rq_tio_cache);
 
 	return r;
 }
@@ -274,8 +269,6 @@
 	flush_scheduled_work();
 	destroy_workqueue(deferred_remove_workqueue);
 
-	kmem_cache_destroy(_rq_cache);
-	kmem_cache_destroy(_rq_tio_cache);
 	unregister_blkdev(_major, _name);
 	dm_uevent_exit();
 
@@ -440,27 +433,90 @@
 	dm_deferred_remove();
 }
 
-sector_t dm_get_size(struct mapped_device *md)
-{
-	return get_capacity(md->disk);
-}
-
-struct request_queue *dm_get_md_queue(struct mapped_device *md)
-{
-	return md->queue;
-}
-
-struct dm_stats *dm_get_stats(struct mapped_device *md)
-{
-	return &md->stats;
-}
-
 static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 {
 	struct mapped_device *md = bdev->bd_disk->private_data;
 
 	return dm_get_geometry(md, geo);
 }
+
+#ifdef CONFIG_BLK_DEV_ZONED
+int dm_report_zones_cb(struct blk_zone *zone, unsigned int idx, void *data)
+{
+	struct dm_report_zones_args *args = data;
+	sector_t sector_diff = args->tgt->begin - args->start;
+
+	/*
+	 * Ignore zones beyond the target range.
+	 */
+	if (zone->start >= args->start + args->tgt->len)
+		return 0;
+
+	/*
+	 * Remap the start sector and write pointer position of the zone
+	 * to match its position in the target range.
+	 */
+	zone->start += sector_diff;
+	if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) {
+		if (zone->cond == BLK_ZONE_COND_FULL)
+			zone->wp = zone->start + zone->len;
+		else if (zone->cond == BLK_ZONE_COND_EMPTY)
+			zone->wp = zone->start;
+		else
+			zone->wp += sector_diff;
+	}
+
+	args->next_sector = zone->start + zone->len;
+	return args->orig_cb(zone, args->zone_idx++, args->orig_data);
+}
+EXPORT_SYMBOL_GPL(dm_report_zones_cb);
+
+static int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
+			       unsigned int nr_zones, report_zones_cb cb, void *data)
+{
+	struct mapped_device *md = disk->private_data;
+	struct dm_table *map;
+	int srcu_idx, ret;
+	struct dm_report_zones_args args = {
+		.next_sector = sector,
+		.orig_data = data,
+		.orig_cb = cb,
+	};
+
+	if (dm_suspended_md(md))
+		return -EAGAIN;
+
+	map = dm_get_live_table(md, &srcu_idx);
+	if (!map) {
+		ret = -EIO;
+		goto out;
+	}
+
+	do {
+		struct dm_target *tgt;
+
+		tgt = dm_table_find_target(map, args.next_sector);
+		if (WARN_ON_ONCE(!tgt->type->report_zones)) {
+			ret = -EIO;
+			goto out;
+		}
+
+		args.tgt = tgt;
+		ret = tgt->type->report_zones(tgt, &args,
+					      nr_zones - args.zone_idx);
+		if (ret < 0)
+			goto out;
+	} while (args.zone_idx < nr_zones &&
+		 args.next_sector < get_capacity(disk));
+
+	ret = args.zone_idx;
+out:
+	dm_put_live_table(md, srcu_idx);
+	return ret;
+}
+#else
+#define dm_blk_report_zones	NULL
+#endif /* CONFIG_BLK_DEV_ZONED */
 
 static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx,
 			    struct block_device **bdev)
@@ -531,7 +587,45 @@
 	return r;
 }
 
-static void start_io_acct(struct dm_io *io);
+u64 dm_start_time_ns_from_clone(struct bio *bio)
+{
+	struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
+	struct dm_io *io = tio->io;
+
+	return jiffies_to_nsecs(io->start_time);
+}
+EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone);
+
+static void start_io_acct(struct dm_io *io)
+{
+	struct mapped_device *md = io->md;
+	struct bio *bio = io->orig_bio;
+
+	io->start_time = bio_start_io_acct(bio);
+	if (unlikely(dm_stats_used(&md->stats)))
+		dm_stats_account_io(&md->stats, bio_data_dir(bio),
+				    bio->bi_iter.bi_sector, bio_sectors(bio),
+				    false, 0, &io->stats_aux);
+}
+
+static void end_io_acct(struct mapped_device *md, struct bio *bio,
+			unsigned long start_time, struct dm_stats_aux *stats_aux)
+{
+	unsigned long duration = jiffies - start_time;
+
+	if (unlikely(dm_stats_used(&md->stats)))
+		dm_stats_account_io(&md->stats, bio_data_dir(bio),
+				    bio->bi_iter.bi_sector, bio_sectors(bio),
+				    true, duration, stats_aux);
+
+	smp_wmb();
+
+	bio_end_io_acct(bio, start_time);
+
+	/* nudge anyone waiting on suspend queue */
+	if (unlikely(wq_has_sleeper(&md->wait)))
+		wake_up(&md->wait);
+}
 
 static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
 {
@@ -595,61 +689,6 @@
 	if (tio->inside_dm_io)
 		return;
 	bio_put(&tio->clone);
-}
-
-int md_in_flight(struct mapped_device *md)
-{
-	return atomic_read(&md->pending[READ]) +
-	       atomic_read(&md->pending[WRITE]);
-}
-
-static void start_io_acct(struct dm_io *io)
-{
-	struct mapped_device *md = io->md;
-	struct bio *bio = io->orig_bio;
-	int rw = bio_data_dir(bio);
-
-	io->start_time = jiffies;
-
-	generic_start_io_acct(md->queue, bio_op(bio), bio_sectors(bio),
-			      &dm_disk(md)->part0);
-
-	atomic_set(&dm_disk(md)->part0.in_flight[rw],
-		   atomic_inc_return(&md->pending[rw]));
-
-	if (unlikely(dm_stats_used(&md->stats)))
-		dm_stats_account_io(&md->stats, bio_data_dir(bio),
-				    bio->bi_iter.bi_sector, bio_sectors(bio),
-				    false, 0, &io->stats_aux);
-}
-
-static void end_io_acct(struct dm_io *io)
-{
-	struct mapped_device *md = io->md;
-	struct bio *bio = io->orig_bio;
-	unsigned long duration = jiffies - io->start_time;
-	int pending;
-	int rw = bio_data_dir(bio);
-
-	generic_end_io_acct(md->queue, bio_op(bio), &dm_disk(md)->part0,
-			    io->start_time);
-
-	if (unlikely(dm_stats_used(&md->stats)))
-		dm_stats_account_io(&md->stats, bio_data_dir(bio),
-				    bio->bi_iter.bi_sector, bio_sectors(bio),
-				    true, duration, &io->stats_aux);
-
-	/*
-	 * After this is decremented the bio must not be touched if it is
-	 * a flush.
-	 */
-	pending = atomic_dec_return(&md->pending[rw]);
-	atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
-	pending += atomic_read(&md->pending[rw^0x1]);
-
-	/* nudge anyone waiting on suspend queue */
-	if (!pending)
-		wake_up(&md->wait);
 }
 
 /*
....@@ -748,7 +787,8 @@
748787 }
749788
750789 static struct table_device *find_table_device(struct list_head *l, dev_t dev,
751
- fmode_t mode) {
790
+ fmode_t mode)
791
+{
752792 struct table_device *td;
753793
754794 list_for_each_entry(td, l, list)
....@@ -759,7 +799,8 @@
759799 }
760800
761801 int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
762
- struct dm_dev **result) {
802
+ struct dm_dev **result)
803
+{
763804 int r;
764805 struct table_device *td;
765806
....@@ -864,6 +905,8 @@
864905 blk_status_t io_error;
865906 struct bio *bio;
866907 struct mapped_device *md = io->md;
908
+ unsigned long start_time = 0;
909
+ struct dm_stats_aux stats_aux;
867910
868911 /* Push-back supersedes any I/O errors */
869912 if (unlikely(error)) {
....@@ -890,8 +933,10 @@
890933
891934 io_error = io->status;
892935 bio = io->orig_bio;
893
- end_io_acct(io);
936
+ start_time = io->start_time;
937
+ stats_aux = io->stats_aux;
894938 free_io(md, io);
939
+ end_io_acct(md, bio, start_time, &stats_aux);
895940
896941 if (io_error == BLK_STS_DM_REQUEUE)
897942 return;
....@@ -937,6 +982,11 @@
937982 limits->max_write_zeroes_sectors = 0;
938983 }
939984
985
+static bool swap_bios_limit(struct dm_target *ti, struct bio *bio)
986
+{
987
+ return unlikely((bio->bi_opf & REQ_SWAP) != 0) && unlikely(ti->limit_swap_bios);
988
+}
989
+
940990 static void clone_endio(struct bio *bio)
941991 {
942992 blk_status_t error = bio->bi_status;
....@@ -944,8 +994,9 @@
944994 struct dm_io *io = tio->io;
945995 struct mapped_device *md = tio->io->md;
946996 dm_endio_fn endio = tio->ti->type->end_io;
997
+ struct bio *orig_bio = io->orig_bio;
947998
948
- if (unlikely(error == BLK_STS_TARGET) && md->type != DM_TYPE_NVME_BIO_BASED) {
999
+ if (unlikely(error == BLK_STS_TARGET)) {
9491000 if (bio_op(bio) == REQ_OP_DISCARD &&
9501001 !bio->bi_disk->queue->limits.max_discard_sectors)
9511002 disable_discard(md);
....@@ -957,12 +1008,24 @@
9571008 disable_write_zeroes(md);
9581009 }
9591010
1011
+ /*
1012
+ * For zone-append bios get offset in zone of the written
1013
+ * sector and add that to the original bio sector pos.
1014
+ */
1015
+ if (bio_op(orig_bio) == REQ_OP_ZONE_APPEND) {
1016
+ sector_t written_sector = bio->bi_iter.bi_sector;
1017
+ struct request_queue *q = orig_bio->bi_disk->queue;
1018
+ u64 mask = (u64)blk_queue_zone_sectors(q) - 1;
1019
+
1020
+ orig_bio->bi_iter.bi_sector += written_sector & mask;
1021
+ }
1022
+
9601023 if (endio) {
9611024 int r = endio(tio->ti, bio, &error);
9621025 switch (r) {
9631026 case DM_ENDIO_REQUEUE:
9641027 error = BLK_STS_DM_REQUEUE;
965
- /*FALLTHRU*/
1028
+ fallthrough;
9661029 case DM_ENDIO_DONE:
9671030 break;
9681031 case DM_ENDIO_INCOMPLETE:
....@@ -974,6 +1037,11 @@
9741037 }
9751038 }
9761039
1040
+ if (unlikely(swap_bios_limit(tio->ti, bio))) {
1041
+ struct mapped_device *md = io->md;
1042
+ up(&md->swap_bios_semaphore);
1043
+ }
1044
+
9771045 free_tio(tio);
9781046 dec_pending(io, error);
9791047 }
....@@ -982,29 +1050,28 @@
9821050 * Return maximum size of I/O possible at the supplied sector up to the current
9831051 * target boundary.
9841052 */
985
-static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti)
1053
+static inline sector_t max_io_len_target_boundary(struct dm_target *ti,
1054
+ sector_t target_offset)
9861055 {
987
- sector_t target_offset = dm_target_offset(ti, sector);
988
-
9891056 return ti->len - target_offset;
9901057 }
9911058
992
-static sector_t max_io_len(sector_t sector, struct dm_target *ti)
1059
+static sector_t max_io_len(struct dm_target *ti, sector_t sector)
9931060 {
994
- sector_t len = max_io_len_target_boundary(sector, ti);
995
- sector_t offset, max_len;
1061
+ sector_t target_offset = dm_target_offset(ti, sector);
1062
+ sector_t len = max_io_len_target_boundary(ti, target_offset);
1063
+ sector_t max_len;
9961064
9971065 /*
998
- * Does the target need to split even further?
1066
+ * Does the target need to split IO even further?
1067
+ * - varied (per target) IO splitting is a tenet of DM; this
1068
+ * explains why stacked chunk_sectors based splitting via
1069
+ * blk_max_size_offset() isn't possible here. So pass in
1070
+ * ti->max_io_len to override stacked chunk_sectors.
9991071 */
10001072 if (ti->max_io_len) {
1001
- offset = dm_target_offset(ti, sector);
1002
- if (unlikely(ti->max_io_len & (ti->max_io_len - 1)))
1003
- max_len = sector_div(offset, ti->max_io_len);
1004
- else
1005
- max_len = offset & (ti->max_io_len - 1);
1006
- max_len = ti->max_io_len - max_len;
1007
-
1073
+ max_len = blk_max_size_offset(ti->table->md->queue,
1074
+ target_offset, ti->max_io_len);
10081075 if (len > max_len)
10091076 len = max_len;
10101077 }
....@@ -1039,7 +1106,7 @@
10391106 return NULL;
10401107
10411108 ti = dm_table_find_target(map, sector);
1042
- if (!dm_target_is_valid(ti))
1109
+ if (!ti)
10431110 return NULL;
10441111
10451112 return ti;
....@@ -1060,13 +1127,33 @@
10601127 goto out;
10611128 if (!ti->type->direct_access)
10621129 goto out;
1063
- len = max_io_len(sector, ti) / PAGE_SECTORS;
1130
+ len = max_io_len(ti, sector) / PAGE_SECTORS;
10641131 if (len < 1)
10651132 goto out;
10661133 nr_pages = min(len, nr_pages);
10671134 ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn);
10681135
10691136 out:
1137
+ dm_put_live_table(md, srcu_idx);
1138
+
1139
+ return ret;
1140
+}
1141
+
1142
+static bool dm_dax_supported(struct dax_device *dax_dev, struct block_device *bdev,
1143
+ int blocksize, sector_t start, sector_t len)
1144
+{
1145
+ struct mapped_device *md = dax_get_private(dax_dev);
1146
+ struct dm_table *map;
1147
+ bool ret = false;
1148
+ int srcu_idx;
1149
+
1150
+ map = dm_get_live_table(md, &srcu_idx);
1151
+ if (!map)
1152
+ goto out;
1153
+
1154
+ ret = dm_table_supports_dax(map, device_not_dax_capable, &blocksize);
1155
+
1156
+out:
10701157 dm_put_live_table(md, srcu_idx);
10711158
10721159 return ret;
....@@ -1120,9 +1207,37 @@
11201207 return ret;
11211208 }
11221209
1210
+static int dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
1211
+ size_t nr_pages)
1212
+{
1213
+ struct mapped_device *md = dax_get_private(dax_dev);
1214
+ sector_t sector = pgoff * PAGE_SECTORS;
1215
+ struct dm_target *ti;
1216
+ int ret = -EIO;
1217
+ int srcu_idx;
1218
+
1219
+ ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1220
+
1221
+ if (!ti)
1222
+ goto out;
1223
+ if (WARN_ON(!ti->type->dax_zero_page_range)) {
1224
+ /*
1225
+ * ->zero_page_range() is mandatory dax operation. If we are
1226
+ * here, something is wrong.
1227
+ */
1228
+ goto out;
1229
+ }
1230
+ ret = ti->type->dax_zero_page_range(ti, pgoff, nr_pages);
1231
+ out:
1232
+ dm_put_live_table(md, srcu_idx);
1233
+
1234
+ return ret;
1235
+}
1236
+
11231237 /*
11241238 * A target may call dm_accept_partial_bio only from the map routine. It is
1125
- * allowed for all bio types except REQ_PREFLUSH and REQ_OP_ZONE_RESET.
1239
+ * allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_* zone management
1240
+ * operations and REQ_OP_ZONE_APPEND (zone append writes).
11261241 *
11271242 * dm_accept_partial_bio informs the dm that the target only wants to process
11281243 * additional n_sectors sectors of the bio and the rest of the data should be
....@@ -1152,105 +1267,33 @@
11521267 {
11531268 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
11541269 unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT;
1270
+
11551271 BUG_ON(bio->bi_opf & REQ_PREFLUSH);
1272
+ BUG_ON(op_is_zone_mgmt(bio_op(bio)));
1273
+ BUG_ON(bio_op(bio) == REQ_OP_ZONE_APPEND);
11561274 BUG_ON(bi_size > *tio->len_ptr);
11571275 BUG_ON(n_sectors > bi_size);
1276
+
11581277 *tio->len_ptr -= bi_size - n_sectors;
11591278 bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;
11601279 }
11611280 EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
11621281
1163
-/*
1164
- * The zone descriptors obtained with a zone report indicate zone positions
1165
- * within the target backing device, regardless of that device is a partition
1166
- * and regardless of the target mapping start sector on the device or partition.
1167
- * The zone descriptors start sector and write pointer position must be adjusted
1168
- * to match their relative position within the dm device.
1169
- * A target may call dm_remap_zone_report() after completion of a
1170
- * REQ_OP_ZONE_REPORT bio to remap the zone descriptors obtained from the
1171
- * backing device.
1172
- */
1173
-void dm_remap_zone_report(struct dm_target *ti, struct bio *bio, sector_t start)
1282
+static noinline void __set_swap_bios_limit(struct mapped_device *md, int latch)
11741283 {
1175
-#ifdef CONFIG_BLK_DEV_ZONED
1176
- struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
1177
- struct bio *report_bio = tio->io->orig_bio;
1178
- struct blk_zone_report_hdr *hdr = NULL;
1179
- struct blk_zone *zone;
1180
- unsigned int nr_rep = 0;
1181
- unsigned int ofst;
1182
- sector_t part_offset;
1183
- struct bio_vec bvec;
1184
- struct bvec_iter iter;
1185
- void *addr;
1186
-
1187
- if (bio->bi_status)
1188
- return;
1189
-
1190
- /*
1191
- * bio sector was incremented by the request size on completion. Taking
1192
- * into account the original request sector, the target start offset on
1193
- * the backing device and the target mapping offset (ti->begin), the
1194
- * start sector of the backing device. The partition offset is always 0
1195
- * if the target uses a whole device.
1196
- */
1197
- part_offset = bio->bi_iter.bi_sector + ti->begin - (start + bio_end_sector(report_bio));
1198
-
1199
- /*
1200
- * Remap the start sector of the reported zones. For sequential zones,
1201
- * also remap the write pointer position.
1202
- */
1203
- bio_for_each_segment(bvec, report_bio, iter) {
1204
- addr = kmap_atomic(bvec.bv_page);
1205
-
1206
- /* Remember the report header in the first page */
1207
- if (!hdr) {
1208
- hdr = addr;
1209
- ofst = sizeof(struct blk_zone_report_hdr);
1210
- } else
1211
- ofst = 0;
1212
-
1213
- /* Set zones start sector */
1214
- while (hdr->nr_zones && ofst < bvec.bv_len) {
1215
- zone = addr + ofst;
1216
- zone->start -= part_offset;
1217
- if (zone->start >= start + ti->len) {
1218
- hdr->nr_zones = 0;
1219
- break;
1220
- }
1221
- zone->start = zone->start + ti->begin - start;
1222
- if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) {
1223
- if (zone->cond == BLK_ZONE_COND_FULL)
1224
- zone->wp = zone->start + zone->len;
1225
- else if (zone->cond == BLK_ZONE_COND_EMPTY)
1226
- zone->wp = zone->start;
1227
- else
1228
- zone->wp = zone->wp + ti->begin - start - part_offset;
1229
- }
1230
- ofst += sizeof(struct blk_zone);
1231
- hdr->nr_zones--;
1232
- nr_rep++;
1233
- }
1234
-
1235
- if (addr != hdr)
1236
- kunmap_atomic(addr);
1237
-
1238
- if (!hdr->nr_zones)
1239
- break;
1284
+ mutex_lock(&md->swap_bios_lock);
1285
+ while (latch < md->swap_bios) {
1286
+ cond_resched();
1287
+ down(&md->swap_bios_semaphore);
1288
+ md->swap_bios--;
12401289 }
1241
-
1242
- if (hdr) {
1243
- hdr->nr_zones = nr_rep;
1244
- kunmap_atomic(hdr);
1290
+ while (latch > md->swap_bios) {
1291
+ cond_resched();
1292
+ up(&md->swap_bios_semaphore);
1293
+ md->swap_bios++;
12451294 }
1246
-
1247
- bio_advance(report_bio, report_bio->bi_iter.bi_size);
1248
-
1249
-#else /* !CONFIG_BLK_DEV_ZONED */
1250
- bio->bi_status = BLK_STS_NOTSUPP;
1251
-#endif
1295
+ mutex_unlock(&md->swap_bios_lock);
12521296 }
1253
-EXPORT_SYMBOL_GPL(dm_remap_zone_report);
12541297
12551298 static blk_qc_t __map_bio(struct dm_target_io *tio)
12561299 {
....@@ -1258,7 +1301,6 @@
12581301 sector_t sector;
12591302 struct bio *clone = &tio->clone;
12601303 struct dm_io *io = tio->io;
1261
- struct mapped_device *md = io->md;
12621304 struct dm_target *ti = tio->ti;
12631305 blk_qc_t ret = BLK_QC_T_NONE;
12641306
....@@ -1272,6 +1314,14 @@
12721314 atomic_inc(&io->io_count);
12731315 sector = clone->bi_iter.bi_sector;
12741316
1317
+ if (unlikely(swap_bios_limit(ti, clone))) {
1318
+ struct mapped_device *md = io->md;
1319
+ int latch = get_swap_bios();
1320
+ if (unlikely(latch != md->swap_bios))
1321
+ __set_swap_bios_limit(md, latch);
1322
+ down(&md->swap_bios_semaphore);
1323
+ }
1324
+
12751325 r = ti->type->map(ti, clone);
12761326 switch (r) {
12771327 case DM_MAPIO_SUBMITTED:
....@@ -1280,16 +1330,21 @@
12801330 /* the bio has been remapped so dispatch it */
12811331 trace_block_bio_remap(clone->bi_disk->queue, clone,
12821332 bio_dev(io->orig_bio), sector);
1283
- if (md->type == DM_TYPE_NVME_BIO_BASED)
1284
- ret = direct_make_request(clone);
1285
- else
1286
- ret = generic_make_request(clone);
1333
+ ret = submit_bio_noacct(clone);
12871334 break;
12881335 case DM_MAPIO_KILL:
1336
+ if (unlikely(swap_bios_limit(ti, clone))) {
1337
+ struct mapped_device *md = io->md;
1338
+ up(&md->swap_bios_semaphore);
1339
+ }
12891340 free_tio(tio);
12901341 dec_pending(io, BLK_STS_IOERR);
12911342 break;
12921343 case DM_MAPIO_REQUEUE:
1344
+ if (unlikely(swap_bios_limit(ti, clone))) {
1345
+ struct mapped_device *md = io->md;
1346
+ up(&md->swap_bios_semaphore);
1347
+ }
12931348 free_tio(tio);
12941349 dec_pending(io, BLK_STS_DM_REQUEUE);
12951350 break;
....@@ -1314,13 +1369,15 @@
13141369 sector_t sector, unsigned len)
13151370 {
13161371 struct bio *clone = &tio->clone;
1372
+ int r;
13171373
13181374 __bio_clone_fast(clone, bio);
13191375
1320
- bio_crypt_clone(clone, bio, GFP_NOIO);
1376
+ r = bio_crypt_clone(clone, bio, GFP_NOIO);
1377
+ if (r < 0)
1378
+ return r;
13211379
1322
- if (unlikely(bio_integrity(bio) != NULL)) {
1323
- int r;
1380
+ if (bio_integrity(bio)) {
13241381 if (unlikely(!dm_target_has_integrity(tio->ti->type) &&
13251382 !dm_target_passes_integrity(tio->ti->type))) {
13261383 DMWARN("%s: the target %s doesn't support integrity data.",
....@@ -1334,11 +1391,10 @@
13341391 return r;
13351392 }
13361393
1337
- if (bio_op(bio) != REQ_OP_ZONE_REPORT)
1338
- bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
1394
+ bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
13391395 clone->bi_iter.bi_size = to_bytes(len);
13401396
1341
- if (unlikely(bio_integrity(bio) != NULL))
1397
+ if (bio_integrity(bio))
13421398 bio_integrity_trim(clone);
13431399
13441400 return 0;
....@@ -1417,11 +1473,32 @@
14171473 {
14181474 unsigned target_nr = 0;
14191475 struct dm_target *ti;
1476
+ struct bio flush_bio;
1477
+
1478
+ /*
1479
+ * Use an on-stack bio for this, it's safe since we don't
1480
+ * need to reference it after submit. It's just used as
1481
+ * the basis for the clone(s).
1482
+ */
1483
+ bio_init(&flush_bio, NULL, 0);
1484
+ flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
1485
+ ci->bio = &flush_bio;
1486
+ ci->sector_count = 0;
1487
+
1488
+ /*
1489
+ * Empty flush uses a statically initialized bio, as the base for
1490
+ * cloning. However, blkg association requires that a bdev is
1491
+ * associated with a gendisk, which doesn't happen until the bdev is
1492
+ * opened. So, blkg association is done at issue time of the flush
1493
+ * rather than when the device is created in alloc_dev().
1494
+ */
1495
+ bio_set_dev(ci->bio, ci->io->md->bdev);
14201496
14211497 BUG_ON(bio_has_data(ci->bio));
14221498 while ((ti = dm_table_get_target(ci->map, target_nr++)))
14231499 __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
14241500
1501
+ bio_uninit(ci->bio);
14251502 return 0;
14261503 }
14271504
....@@ -1444,41 +1521,10 @@
14441521 return 0;
14451522 }
14461523
1447
-typedef unsigned (*get_num_bios_fn)(struct dm_target *ti);
1448
-
1449
-static unsigned get_num_discard_bios(struct dm_target *ti)
1450
-{
1451
- return ti->num_discard_bios;
1452
-}
1453
-
1454
-static unsigned get_num_secure_erase_bios(struct dm_target *ti)
1455
-{
1456
- return ti->num_secure_erase_bios;
1457
-}
1458
-
1459
-static unsigned get_num_write_same_bios(struct dm_target *ti)
1460
-{
1461
- return ti->num_write_same_bios;
1462
-}
1463
-
1464
-static unsigned get_num_write_zeroes_bios(struct dm_target *ti)
1465
-{
1466
- return ti->num_write_zeroes_bios;
1467
-}
1468
-
1469
-typedef bool (*is_split_required_fn)(struct dm_target *ti);
1470
-
1471
-static bool is_split_required_for_discard(struct dm_target *ti)
1472
-{
1473
- return ti->split_discard_bios;
1474
-}
1475
-
14761524 static int __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti,
1477
- get_num_bios_fn get_num_bios,
1478
- is_split_required_fn is_split_required)
1525
+ unsigned num_bios)
14791526 {
14801527 unsigned len;
1481
- unsigned num_bios;
14821528
14831529 /*
14841530 * Even though the device advertised support for this type of
....@@ -1486,14 +1532,11 @@
14861532 * reconfiguration might also have changed that since the
14871533 * check was performed.
14881534 */
1489
- num_bios = get_num_bios ? get_num_bios(ti) : 0;
14901535 if (!num_bios)
14911536 return -EOPNOTSUPP;
14921537
1493
- if (is_split_required && !is_split_required(ti))
1494
- len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
1495
- else
1496
- len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti));
1538
+ len = min_t(sector_t, ci->sector_count,
1539
+ max_io_len_target_boundary(ti, dm_target_offset(ti, ci->sector)));
14971540
14981541 __send_duplicate_bios(ci, ti, num_bios, &len);
14991542
....@@ -1503,43 +1546,46 @@
15031546 return 0;
15041547 }
15051548
1506
-static int __send_discard(struct clone_info *ci, struct dm_target *ti)
1549
+static bool is_abnormal_io(struct bio *bio)
15071550 {
1508
- return __send_changing_extent_only(ci, ti, get_num_discard_bios,
1509
- is_split_required_for_discard);
1510
-}
1551
+ bool r = false;
15111552
1512
-static int __send_secure_erase(struct clone_info *ci, struct dm_target *ti)
1513
-{
1514
- return __send_changing_extent_only(ci, ti, get_num_secure_erase_bios, NULL);
1515
-}
1553
+ switch (bio_op(bio)) {
1554
+ case REQ_OP_DISCARD:
1555
+ case REQ_OP_SECURE_ERASE:
1556
+ case REQ_OP_WRITE_SAME:
1557
+ case REQ_OP_WRITE_ZEROES:
1558
+ r = true;
1559
+ break;
1560
+ }
15161561
1517
-static int __send_write_same(struct clone_info *ci, struct dm_target *ti)
1518
-{
1519
- return __send_changing_extent_only(ci, ti, get_num_write_same_bios, NULL);
1520
-}
1521
-
1522
-static int __send_write_zeroes(struct clone_info *ci, struct dm_target *ti)
1523
-{
1524
- return __send_changing_extent_only(ci, ti, get_num_write_zeroes_bios, NULL);
1562
+ return r;
15251563 }
15261564
15271565 static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti,
15281566 int *result)
15291567 {
15301568 struct bio *bio = ci->bio;
1569
+ unsigned num_bios = 0;
15311570
1532
- if (bio_op(bio) == REQ_OP_DISCARD)
1533
- *result = __send_discard(ci, ti);
1534
- else if (bio_op(bio) == REQ_OP_SECURE_ERASE)
1535
- *result = __send_secure_erase(ci, ti);
1536
- else if (bio_op(bio) == REQ_OP_WRITE_SAME)
1537
- *result = __send_write_same(ci, ti);
1538
- else if (bio_op(bio) == REQ_OP_WRITE_ZEROES)
1539
- *result = __send_write_zeroes(ci, ti);
1540
- else
1571
+ switch (bio_op(bio)) {
1572
+ case REQ_OP_DISCARD:
1573
+ num_bios = ti->num_discard_bios;
1574
+ break;
1575
+ case REQ_OP_SECURE_ERASE:
1576
+ num_bios = ti->num_secure_erase_bios;
1577
+ break;
1578
+ case REQ_OP_WRITE_SAME:
1579
+ num_bios = ti->num_write_same_bios;
1580
+ break;
1581
+ case REQ_OP_WRITE_ZEROES:
1582
+ num_bios = ti->num_write_zeroes_bios;
1583
+ break;
1584
+ default:
15411585 return false;
1586
+ }
15421587
1588
+ *result = __send_changing_extent_only(ci, ti, num_bios);
15431589 return true;
15441590 }
15451591
....@@ -1548,23 +1594,18 @@
15481594 */
15491595 static int __split_and_process_non_flush(struct clone_info *ci)
15501596 {
1551
- struct bio *bio = ci->bio;
15521597 struct dm_target *ti;
15531598 unsigned len;
15541599 int r;
15551600
15561601 ti = dm_table_find_target(ci->map, ci->sector);
1557
- if (!dm_target_is_valid(ti))
1602
+ if (!ti)
15581603 return -EIO;
15591604
1560
- if (unlikely(__process_abnormal_io(ci, ti, &r)))
1605
+ if (__process_abnormal_io(ci, ti, &r))
15611606 return r;
15621607
1563
- if (bio_op(bio) == REQ_OP_ZONE_REPORT)
1564
- len = ci->sector_count;
1565
- else
1566
- len = min_t(sector_t, max_io_len(ci->sector, ti),
1567
- ci->sector_count);
1608
+ len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count);
15681609
15691610 r = __clone_and_map_data_bio(ci, ti, ci->sector, &len);
15701611 if (r < 0)
....@@ -1584,6 +1625,9 @@
15841625 ci->sector = bio->bi_iter.bi_sector;
15851626 }
15861627
1628
+#define __dm_part_stat_sub(part, field, subnd) \
1629
+ (part_stat_get(part, field) -= (subnd))
1630
+
15871631 /*
15881632 * Entry point to split a bio into clones and submit them to the targets.
15891633 */
....@@ -1594,21 +1638,12 @@
15941638 blk_qc_t ret = BLK_QC_T_NONE;
15951639 int error = 0;
15961640
1597
- if (unlikely(!map)) {
1598
- bio_io_error(bio);
1599
- return ret;
1600
- }
1601
-
1602
- blk_queue_split(md->queue, &bio);
1603
-
16041641 init_clone_info(&ci, md, map, bio);
16051642
16061643 if (bio->bi_opf & REQ_PREFLUSH) {
1607
- ci.bio = &ci.io->md->flush_bio;
1608
- ci.sector_count = 0;
16091644 error = __send_empty_flush(&ci);
16101645 /* dec_pending submits any data associated with flush */
1611
- } else if (bio_op(bio) == REQ_OP_ZONE_RESET) {
1646
+ } else if (op_is_zone_mgmt(bio_op(bio))) {
16121647 ci.bio = bio;
16131648 ci.sector_count = 0;
16141649 error = __split_and_process_non_flush(&ci);
....@@ -1619,21 +1654,32 @@
16191654 error = __split_and_process_non_flush(&ci);
16201655 if (current->bio_list && ci.sector_count && !error) {
16211656 /*
1622
- * Remainder must be passed to generic_make_request()
1657
+ * Remainder must be passed to submit_bio_noacct()
16231658 * so that it gets handled *after* bios already submitted
16241659 * have been completely processed.
16251660 * We take a clone of the original to store in
16261661 * ci.io->orig_bio to be used by end_io_acct() and
16271662 * for dec_pending to use for completion handling.
1628
- * As this path is not used for REQ_OP_ZONE_REPORT,
1629
- * the usage of io->orig_bio in dm_remap_zone_report()
1630
- * won't be affected by this reassignment.
16311663 */
16321664 struct bio *b = bio_split(bio, bio_sectors(bio) - ci.sector_count,
16331665 GFP_NOIO, &md->queue->bio_split);
16341666 ci.io->orig_bio = b;
1667
+
1668
+ /*
1669
+ * Adjust IO stats for each split, otherwise upon queue
1670
+ * reentry there will be redundant IO accounting.
1671
+ * NOTE: this is a stop-gap fix, a proper fix involves
1672
+ * significant refactoring of DM core's bio splitting
1673
+ * (by eliminating DM's splitting and just using bio_split)
1674
+ */
1675
+ part_stat_lock();
1676
+ __dm_part_stat_sub(&dm_disk(md)->part0,
1677
+ sectors[op_stat_group(bio_op(bio))], ci.sector_count);
1678
+ part_stat_unlock();
1679
+
16351680 bio_chain(b, bio);
1636
- ret = generic_make_request(bio);
1681
+ trace_block_split(md->queue, b, bio->bi_iter.bi_sector);
1682
+ ret = submit_bio_noacct(bio);
16371683 break;
16381684 }
16391685 }
....@@ -1644,121 +1690,38 @@
16441690 return ret;
16451691 }
16461692
1647
-/*
1648
- * Optimized variant of __split_and_process_bio that leverages the
1649
- * fact that targets that use it do _not_ have a need to split bios.
1650
- */
1651
-static blk_qc_t __process_bio(struct mapped_device *md,
1652
- struct dm_table *map, struct bio *bio)
1693
+static blk_qc_t dm_submit_bio(struct bio *bio)
16531694 {
1654
- struct clone_info ci;
1655
- blk_qc_t ret = BLK_QC_T_NONE;
1656
- int error = 0;
1657
-
1658
- if (unlikely(!map)) {
1659
- bio_io_error(bio);
1660
- return ret;
1661
- }
1662
-
1663
- init_clone_info(&ci, md, map, bio);
1664
-
1665
- if (bio->bi_opf & REQ_PREFLUSH) {
1666
- ci.bio = &ci.io->md->flush_bio;
1667
- ci.sector_count = 0;
1668
- error = __send_empty_flush(&ci);
1669
- /* dec_pending submits any data associated with flush */
1670
- } else {
1671
- struct dm_target *ti = md->immutable_target;
1672
- struct dm_target_io *tio;
1673
-
1674
- /*
1675
- * Defend against IO still getting in during teardown
1676
- * - as was seen for a time with nvme-fcloop
1677
- */
1678
- if (unlikely(WARN_ON_ONCE(!ti || !dm_target_is_valid(ti)))) {
1679
- error = -EIO;
1680
- goto out;
1681
- }
1682
-
1683
- ci.bio = bio;
1684
- ci.sector_count = bio_sectors(bio);
1685
- if (unlikely(__process_abnormal_io(&ci, ti, &error)))
1686
- goto out;
1687
-
1688
- tio = alloc_tio(&ci, ti, 0, GFP_NOIO);
1689
- ret = __clone_and_map_simple_bio(&ci, tio, NULL);
1690
- }
1691
-out:
1692
- /* drop the extra reference count */
1693
- dec_pending(ci.io, errno_to_blk_status(error));
1694
- return ret;
1695
-}
1696
-
1697
-typedef blk_qc_t (process_bio_fn)(struct mapped_device *, struct dm_table *, struct bio *);
1698
-
1699
-static blk_qc_t __dm_make_request(struct request_queue *q, struct bio *bio,
1700
- process_bio_fn process_bio)
1701
-{
1702
- struct mapped_device *md = q->queuedata;
1695
+ struct mapped_device *md = bio->bi_disk->private_data;
17031696 blk_qc_t ret = BLK_QC_T_NONE;
17041697 int srcu_idx;
17051698 struct dm_table *map;
17061699
17071700 map = dm_get_live_table(md, &srcu_idx);
17081701
1709
- /* if we're suspended, we have to queue this io for later */
1710
- if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
1711
- dm_put_live_table(md, srcu_idx);
1712
-
1713
- if (!(bio->bi_opf & REQ_RAHEAD))
1714
- queue_io(md, bio);
1715
- else
1702
+ /* If suspended, or map not yet available, queue this IO for later */
1703
+ if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) ||
1704
+ unlikely(!map)) {
1705
+ if (bio->bi_opf & REQ_NOWAIT)
1706
+ bio_wouldblock_error(bio);
1707
+ else if (bio->bi_opf & REQ_RAHEAD)
17161708 bio_io_error(bio);
1717
- return ret;
1709
+ else
1710
+ queue_io(md, bio);
1711
+ goto out;
17181712 }
17191713
1720
- ret = process_bio(md, map, bio);
1714
+ /*
1715
+ * Use blk_queue_split() for abnormal IO (e.g. discard, writesame, etc)
1716
+ * otherwise associated queue_limits won't be imposed.
1717
+ */
1718
+ if (is_abnormal_io(bio))
1719
+ blk_queue_split(&bio);
17211720
1721
+ ret = __split_and_process_bio(md, map, bio);
1722
+out:
17221723 dm_put_live_table(md, srcu_idx);
17231724 return ret;
1724
-}
1725
-
1726
-/*
1727
- * The request function that remaps the bio to one target and
1728
- * splits off any remainder.
1729
- */
1730
-static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio)
1731
-{
1732
- return __dm_make_request(q, bio, __split_and_process_bio);
1733
-}
1734
-
1735
-static blk_qc_t dm_make_request_nvme(struct request_queue *q, struct bio *bio)
1736
-{
1737
- return __dm_make_request(q, bio, __process_bio);
1738
-}
1739
-
1740
-static int dm_any_congested(void *congested_data, int bdi_bits)
1741
-{
1742
- int r = bdi_bits;
1743
- struct mapped_device *md = congested_data;
1744
- struct dm_table *map;
1745
-
1746
- if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
1747
- if (dm_request_based(md)) {
1748
- /*
1749
- * With request-based DM we only need to check the
1750
- * top-level queue for congestion.
1751
- */
1752
- r = md->queue->backing_dev_info->wb.state & bdi_bits;
1753
- } else {
1754
- map = dm_get_live_table_fast(md);
1755
- if (map)
1756
- r = dm_table_any_congested(map, bdi_bits);
1757
- dm_put_live_table_fast(md);
1758
- }
1759
- }
1760
-
1761
- return r;
17621725 }
17631726
17641727 /*-----------------------------------------------------------------
....@@ -1811,29 +1774,28 @@
18111774 }
18121775
18131776 static const struct block_device_operations dm_blk_dops;
1777
+static const struct block_device_operations dm_rq_blk_dops;
18141778 static const struct dax_operations dm_dax_ops;
18151779
18161780 static void dm_wq_work(struct work_struct *work);
18171781
1818
-static void dm_init_normal_md_queue(struct mapped_device *md)
1782
+#ifdef CONFIG_BLK_INLINE_ENCRYPTION
1783
+static void dm_queue_destroy_keyslot_manager(struct request_queue *q)
18191784 {
1820
- md->use_blk_mq = false;
1821
-
1822
- /*
1823
- * Initialize aspects of queue that aren't relevant for blk-mq
1824
- */
1825
- md->queue->backing_dev_info->congested_data = md;
1826
- md->queue->backing_dev_info->congested_fn = dm_any_congested;
1785
+ dm_destroy_keyslot_manager(q->ksm);
18271786 }
18281787
1829
-static void dm_destroy_inline_encryption(struct request_queue *q);
1788
+#else /* CONFIG_BLK_INLINE_ENCRYPTION */
1789
+
1790
+static inline void dm_queue_destroy_keyslot_manager(struct request_queue *q)
1791
+{
1792
+}
1793
+#endif /* !CONFIG_BLK_INLINE_ENCRYPTION */
18301794
18311795 static void cleanup_mapped_device(struct mapped_device *md)
18321796 {
18331797 if (md->wq)
18341798 destroy_workqueue(md->wq);
1835
- if (md->kworker_task)
1836
- kthread_stop(md->kworker_task);
18371799 bioset_exit(&md->bs);
18381800 bioset_exit(&md->io_bs);
18391801
....@@ -1852,7 +1814,7 @@
18521814 }
18531815
18541816 if (md->queue) {
1855
- dm_destroy_inline_encryption(md->queue);
1817
+ dm_queue_destroy_keyslot_manager(md->queue);
18561818 blk_cleanup_queue(md->queue);
18571819 }
18581820
....@@ -1866,6 +1828,7 @@
18661828 mutex_destroy(&md->suspend_lock);
18671829 mutex_destroy(&md->type_lock);
18681830 mutex_destroy(&md->table_devices_lock);
1831
+ mutex_destroy(&md->swap_bios_lock);
18691832
18701833 dm_mq_cleanup_mapped_device(md);
18711834 }
....@@ -1876,7 +1839,6 @@
18761839 static struct mapped_device *alloc_dev(int minor)
18771840 {
18781841 int r, numa_node_id = dm_get_numa_node();
1879
- struct dax_device *dax_dev = NULL;
18801842 struct mapped_device *md;
18811843 void *old_md;
18821844
....@@ -1902,7 +1864,6 @@
19021864 goto bad_io_barrier;
19031865
19041866 md->numa_node_id = numa_node_id;
1905
- md->use_blk_mq = dm_use_blk_mq_default();
19061867 md->init_tio_pdu = false;
19071868 md->type = DM_TYPE_NONE;
19081869 mutex_init(&md->suspend_lock);
....@@ -1917,28 +1878,27 @@
19171878 INIT_LIST_HEAD(&md->table_devices);
19181879 spin_lock_init(&md->uevent_lock);
19191880
1920
- md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id, NULL);
1881
+ /*
1882
+ * default to bio-based until DM table is loaded and md->type
1883
+ * established. If request-based table is loaded: blk-mq will
1884
+ * override accordingly.
1885
+ */
1886
+ md->queue = blk_alloc_queue(numa_node_id);
19211887 if (!md->queue)
19221888 goto bad;
1923
- md->queue->queuedata = md;
1924
- /*
1925
- * default to bio-based required ->make_request_fn until DM
1926
- * table is loaded and md->type established. If request-based
1927
- * table is loaded: blk-mq will override accordingly.
1928
- */
1929
- blk_queue_make_request(md->queue, dm_make_request);
19301889
19311890 md->disk = alloc_disk_node(1, md->numa_node_id);
19321891 if (!md->disk)
19331892 goto bad;
19341893
1935
- atomic_set(&md->pending[0], 0);
1936
- atomic_set(&md->pending[1], 0);
19371894 init_waitqueue_head(&md->wait);
19381895 INIT_WORK(&md->work, dm_wq_work);
19391896 init_waitqueue_head(&md->eventq);
19401897 init_completion(&md->kobj_holder.completion);
1941
- md->kworker_task = NULL;
1898
+
1899
+ md->swap_bios = get_swap_bios();
1900
+ sema_init(&md->swap_bios_semaphore, md->swap_bios);
1901
+ mutex_init(&md->swap_bios_lock);
19421902
19431903 md->disk->major = _major;
19441904 md->disk->first_minor = minor;
....@@ -1948,11 +1908,13 @@
19481908 sprintf(md->disk->disk_name, "dm-%d", minor);
19491909
19501910 if (IS_ENABLED(CONFIG_DAX_DRIVER)) {
1951
- dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops);
1952
- if (!dax_dev)
1911
+ md->dax_dev = alloc_dax(md, md->disk->disk_name,
1912
+ &dm_dax_ops, 0);
1913
+ if (IS_ERR(md->dax_dev)) {
1914
+ md->dax_dev = NULL;
19531915 goto bad;
1916
+ }
19541917 }
1955
- md->dax_dev = dax_dev;
19561918
19571919 add_disk_no_queue_reg(md->disk);
19581920 format_dev_t(md->name, MKDEV(_major, minor));
....@@ -1964,10 +1926,6 @@
19641926 md->bdev = bdget_disk(md->disk, 0);
19651927 if (!md->bdev)
19661928 goto bad;
1967
-
1968
- bio_init(&md->flush_bio, NULL, 0);
1969
- bio_set_dev(&md->flush_bio, md->bdev);
1970
- md->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
19711929
19721930 dm_stats_init(&md->stats);
19731931
....@@ -2072,18 +2030,6 @@
20722030 }
20732031
20742032 /*
2075
- * Protected by md->suspend_lock obtained by dm_swap_table().
2076
- */
2077
-static void __set_size(struct mapped_device *md, sector_t size)
2078
-{
2079
- lockdep_assert_held(&md->suspend_lock);
2080
-
2081
- set_capacity(md->disk, size);
2082
-
2083
- i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
2084
-}
2085
-
2086
-/*
20872033 * Returns old map, which caller must destroy.
20882034 */
20892035 static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
....@@ -2105,7 +2051,8 @@
21052051 if (size != dm_get_size(md))
21062052 memset(&md->geometry, 0, sizeof(md->geometry));
21072053
2108
- __set_size(md, size);
2054
+ set_capacity(md->disk, size);
2055
+ bd_set_nr_sectors(md->bdev, size);
21092056
21102057 dm_table_event_callback(t, event_callback, md);
21112058
....@@ -2119,12 +2066,10 @@
21192066 if (request_based)
21202067 dm_stop_queue(q);
21212068
2122
- if (request_based || md->type == DM_TYPE_NVME_BIO_BASED) {
2069
+ if (request_based) {
21232070 /*
2124
- * Leverage the fact that request-based DM targets and
2125
- * NVMe bio based targets are immutable singletons
2126
- * - used to optimize both dm_request_fn and dm_mq_queue_rq;
2127
- * and __process_bio.
2071
+ * Leverage the fact that request-based DM targets are
2072
+ * immutable singletons - used to optimize dm_mq_queue_rq.
21282073 */
21292074 md->immutable_target = dm_table_get_immutable_target(t);
21302075 }
....@@ -2227,166 +2172,6 @@
22272172 }
22282173 EXPORT_SYMBOL_GPL(dm_get_queue_limits);
22292174
2230
-#ifdef CONFIG_BLK_INLINE_ENCRYPTION
2231
-struct dm_keyslot_evict_args {
2232
- const struct blk_crypto_key *key;
2233
- int err;
2234
-};
2235
-
2236
-static int dm_keyslot_evict_callback(struct dm_target *ti, struct dm_dev *dev,
2237
- sector_t start, sector_t len, void *data)
2238
-{
2239
- struct dm_keyslot_evict_args *args = data;
2240
- int err;
2241
-
2242
- err = blk_crypto_evict_key(dev->bdev->bd_queue, args->key);
2243
- if (!args->err)
2244
- args->err = err;
2245
- /* Always try to evict the key from all devices. */
2246
- return 0;
2247
-}
2248
-
2249
-/*
2250
- * When an inline encryption key is evicted from a device-mapper device, evict
2251
- * it from all the underlying devices.
2252
- */
2253
-static int dm_keyslot_evict(struct keyslot_manager *ksm,
2254
- const struct blk_crypto_key *key, unsigned int slot)
2255
-{
2256
- struct mapped_device *md = keyslot_manager_private(ksm);
2257
- struct dm_keyslot_evict_args args = { key };
2258
- struct dm_table *t;
2259
- int srcu_idx;
2260
- int i;
2261
- struct dm_target *ti;
2262
-
2263
- t = dm_get_live_table(md, &srcu_idx);
2264
- if (!t)
2265
- return 0;
2266
- for (i = 0; i < dm_table_get_num_targets(t); i++) {
2267
- ti = dm_table_get_target(t, i);
2268
- if (!ti->type->iterate_devices)
2269
- continue;
2270
- ti->type->iterate_devices(ti, dm_keyslot_evict_callback, &args);
2271
- }
2272
- dm_put_live_table(md, srcu_idx);
2273
- return args.err;
2274
-}
2275
-
2276
-struct dm_derive_raw_secret_args {
2277
- const u8 *wrapped_key;
2278
- unsigned int wrapped_key_size;
2279
- u8 *secret;
2280
- unsigned int secret_size;
2281
- int err;
2282
-};
2283
-
2284
-static int dm_derive_raw_secret_callback(struct dm_target *ti,
2285
- struct dm_dev *dev, sector_t start,
2286
- sector_t len, void *data)
2287
-{
2288
- struct dm_derive_raw_secret_args *args = data;
2289
- struct request_queue *q = dev->bdev->bd_queue;
2290
-
2291
- if (!args->err)
2292
- return 0;
2293
-
2294
- if (!q->ksm) {
2295
- args->err = -EOPNOTSUPP;
2296
- return 0;
2297
- }
2298
-
2299
- args->err = keyslot_manager_derive_raw_secret(q->ksm, args->wrapped_key,
2300
- args->wrapped_key_size,
2301
- args->secret,
2302
- args->secret_size);
2303
- /* Try another device in case this fails. */
2304
- return 0;
2305
-}
2306
-
2307
-/*
2308
- * Retrieve the raw_secret from the underlying device. Given that
2309
- * only only one raw_secret can exist for a particular wrappedkey,
2310
- * retrieve it only from the first device that supports derive_raw_secret()
2311
- */
2312
-static int dm_derive_raw_secret(struct keyslot_manager *ksm,
2313
- const u8 *wrapped_key,
2314
- unsigned int wrapped_key_size,
2315
- u8 *secret, unsigned int secret_size)
2316
-{
2317
- struct mapped_device *md = keyslot_manager_private(ksm);
2318
- struct dm_derive_raw_secret_args args = {
2319
- .wrapped_key = wrapped_key,
2320
- .wrapped_key_size = wrapped_key_size,
2321
- .secret = secret,
2322
- .secret_size = secret_size,
2323
- .err = -EOPNOTSUPP,
2324
- };
2325
- struct dm_table *t;
2326
- int srcu_idx;
2327
- int i;
2328
- struct dm_target *ti;
2329
-
2330
- t = dm_get_live_table(md, &srcu_idx);
2331
- if (!t)
2332
- return -EOPNOTSUPP;
2333
- for (i = 0; i < dm_table_get_num_targets(t); i++) {
2334
- ti = dm_table_get_target(t, i);
2335
- if (!ti->type->iterate_devices)
2336
- continue;
2337
- ti->type->iterate_devices(ti, dm_derive_raw_secret_callback,
2338
- &args);
2339
- if (!args.err)
2340
- break;
2341
- }
2342
- dm_put_live_table(md, srcu_idx);
2343
- return args.err;
2344
-}
2345
-
2346
-static struct keyslot_mgmt_ll_ops dm_ksm_ll_ops = {
2347
- .keyslot_evict = dm_keyslot_evict,
2348
- .derive_raw_secret = dm_derive_raw_secret,
2349
-};
2350
-
2351
-static int dm_init_inline_encryption(struct mapped_device *md)
2352
-{
2353
- unsigned int features;
2354
- unsigned int mode_masks[BLK_ENCRYPTION_MODE_MAX];
2355
-
2356
- /*
2357
- * Initially declare support for all crypto settings. Anything
2358
- * unsupported by a child device will be removed later when calculating
2359
- * the device restrictions.
2360
- */
2361
- features = BLK_CRYPTO_FEATURE_STANDARD_KEYS |
2362
- BLK_CRYPTO_FEATURE_WRAPPED_KEYS;
2363
- memset(mode_masks, 0xFF, sizeof(mode_masks));
2364
-
2365
- md->queue->ksm = keyslot_manager_create_passthrough(NULL,
2366
- &dm_ksm_ll_ops,
2367
- features,
2368
- mode_masks, md);
2369
- if (!md->queue->ksm)
2370
- return -ENOMEM;
2371
- return 0;
2372
-}
2373
-
2374
-static void dm_destroy_inline_encryption(struct request_queue *q)
2375
-{
2376
- keyslot_manager_destroy(q->ksm);
2377
- q->ksm = NULL;
2378
-}
2379
-#else /* CONFIG_BLK_INLINE_ENCRYPTION */
2380
-static inline int dm_init_inline_encryption(struct mapped_device *md)
2381
-{
2382
- return 0;
2383
-}
2384
-
2385
-static inline void dm_destroy_inline_encryption(struct request_queue *q)
2386
-{
2387
-}
2388
-#endif /* !CONFIG_BLK_INLINE_ENCRYPTION */
2389
-
23902175 /*
23912176 * Setup the DM device's queue based on md's type
23922177 */
....@@ -2398,27 +2183,15 @@
23982183
23992184 switch (type) {
24002185 case DM_TYPE_REQUEST_BASED:
2401
- dm_init_normal_md_queue(md);
2402
- r = dm_old_init_request_queue(md, t);
2403
- if (r) {
2404
- DMERR("Cannot initialize queue for request-based mapped device");
2405
- return r;
2406
- }
2407
- break;
2408
- case DM_TYPE_MQ_REQUEST_BASED:
2186
+ md->disk->fops = &dm_rq_blk_dops;
24092187 r = dm_mq_init_request_queue(md, t);
24102188 if (r) {
2411
- DMERR("Cannot initialize queue for request-based dm-mq mapped device");
2189
+ DMERR("Cannot initialize queue for request-based dm mapped device");
24122190 return r;
24132191 }
24142192 break;
24152193 case DM_TYPE_BIO_BASED:
24162194 case DM_TYPE_DAX_BIO_BASED:
2417
- dm_init_normal_md_queue(md);
2418
- break;
2419
- case DM_TYPE_NVME_BIO_BASED:
2420
- dm_init_normal_md_queue(md);
2421
- blk_queue_make_request(md->queue, dm_make_request_nvme);
24222195 break;
24232196 case DM_TYPE_NONE:
24242197 WARN_ON_ONCE(true);
....@@ -2430,13 +2203,6 @@
24302203 DMERR("Cannot calculate initial queue limits");
24312204 return r;
24322205 }
2433
-
2434
- r = dm_init_inline_encryption(md);
2435
- if (r) {
2436
- DMERR("Cannot initialize inline encryption");
2437
- return r;
2438
- }
2439
-
24402206 dm_table_set_restrictions(t, md->queue, &limits);
24412207 blk_register_queue(md->disk);
24422208
....@@ -2516,9 +2282,6 @@
25162282
25172283 blk_set_queue_dying(md->queue);
25182284
2519
- if (dm_request_based(md) && md->kworker_task)
2520
- kthread_flush_worker(&md->kworker);
2521
-
25222285 /*
25232286 * Take suspend_lock so that presuspend and postsuspend methods
25242287 * do not race with internal suspend.
....@@ -2569,15 +2332,29 @@
25692332 }
25702333 EXPORT_SYMBOL_GPL(dm_put);
25712334
2572
-static int dm_wait_for_completion(struct mapped_device *md, long task_state)
2335
+static bool md_in_flight_bios(struct mapped_device *md)
2336
+{
2337
+ int cpu;
2338
+ struct hd_struct *part = &dm_disk(md)->part0;
2339
+ long sum = 0;
2340
+
2341
+ for_each_possible_cpu(cpu) {
2342
+ sum += part_stat_local_read_cpu(part, in_flight[0], cpu);
2343
+ sum += part_stat_local_read_cpu(part, in_flight[1], cpu);
2344
+ }
2345
+
2346
+ return sum != 0;
2347
+}
2348
+
2349
+static int dm_wait_for_bios_completion(struct mapped_device *md, long task_state)
25732350 {
25742351 int r = 0;
25752352 DEFINE_WAIT(wait);
25762353
2577
- while (1) {
2354
+ while (true) {
25782355 prepare_to_wait(&md->wait, &wait, task_state);
25792356
2580
- if (!md_in_flight(md))
2357
+ if (!md_in_flight_bios(md))
25812358 break;
25822359
25832360 if (signal_pending_state(task_state, current)) {
....@@ -2589,6 +2366,30 @@
25892366 }
25902367 finish_wait(&md->wait, &wait);
25912368
2369
+ smp_rmb();
2370
+
2371
+ return r;
2372
+}
2373
+
2374
+static int dm_wait_for_completion(struct mapped_device *md, long task_state)
2375
+{
2376
+ int r = 0;
2377
+
2378
+ if (!queue_is_mq(md->queue))
2379
+ return dm_wait_for_bios_completion(md, task_state);
2380
+
2381
+ while (true) {
2382
+ if (!blk_mq_queue_inflight(md->queue))
2383
+ break;
2384
+
2385
+ if (signal_pending_state(task_state, current)) {
2386
+ r = -EINTR;
2387
+ break;
2388
+ }
2389
+
2390
+ msleep(5);
2391
+ }
2392
+
25922393 return r;
25932394 }
25942395
....@@ -2597,29 +2398,19 @@
25972398 */
25982399 static void dm_wq_work(struct work_struct *work)
25992400 {
2600
- struct mapped_device *md = container_of(work, struct mapped_device,
2601
- work);
2602
- struct bio *c;
2603
- int srcu_idx;
2604
- struct dm_table *map;
2605
-
2606
- map = dm_get_live_table(md, &srcu_idx);
2401
+ struct mapped_device *md = container_of(work, struct mapped_device, work);
2402
+ struct bio *bio;
26072403
26082404 while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
26092405 spin_lock_irq(&md->deferred_lock);
2610
- c = bio_list_pop(&md->deferred);
2406
+ bio = bio_list_pop(&md->deferred);
26112407 spin_unlock_irq(&md->deferred_lock);
26122408
2613
- if (!c)
2409
+ if (!bio)
26142410 break;
26152411
2616
- if (dm_request_based(md))
2617
- generic_make_request(c);
2618
- else
2619
- __split_and_process_bio(md, map, c);
2412
+ submit_bio_noacct(bio);
26202413 }
2621
-
2622
- dm_put_live_table(md, srcu_idx);
26232414 }
26242415
26252416 static void dm_queue_flush(struct mapped_device *md)
....@@ -2681,27 +2472,19 @@
26812472 {
26822473 int r;
26832474
2684
- WARN_ON(md->frozen_sb);
2475
+ WARN_ON(test_bit(DMF_FROZEN, &md->flags));
26852476
2686
- md->frozen_sb = freeze_bdev(md->bdev);
2687
- if (IS_ERR(md->frozen_sb)) {
2688
- r = PTR_ERR(md->frozen_sb);
2689
- md->frozen_sb = NULL;
2690
- return r;
2691
- }
2692
-
2693
- set_bit(DMF_FROZEN, &md->flags);
2694
-
2695
- return 0;
2477
+ r = freeze_bdev(md->bdev);
2478
+ if (!r)
2479
+ set_bit(DMF_FROZEN, &md->flags);
2480
+ return r;
26962481 }
26972482
26982483 static void unlock_fs(struct mapped_device *md)
26992484 {
27002485 if (!test_bit(DMF_FROZEN, &md->flags))
27012486 return;
2702
-
2703
- thaw_bdev(md->bdev, md->frozen_sb);
2704
- md->frozen_sb = NULL;
2487
+ thaw_bdev(md->bdev);
27052488 clear_bit(DMF_FROZEN, &md->flags);
27062489 }
27072490
....@@ -2731,7 +2514,7 @@
27312514 if (noflush)
27322515 set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
27332516 else
2734
- pr_debug("%s: suspending with flush\n", dm_device_name(md));
2517
+ DMDEBUG("%s: suspending with flush", dm_device_name(md));
27352518
27362519 /*
27372520 * This gets reverted if there's an error later and the targets
....@@ -2756,13 +2539,12 @@
27562539 /*
27572540 * Here we must make sure that no processes are submitting requests
27582541 * to target drivers i.e. no one may be executing
2759
- * __split_and_process_bio. This is called from dm_request and
2760
- * dm_wq_work.
2542
+ * __split_and_process_bio from dm_submit_bio.
27612543 *
2762
- * To get all processes out of __split_and_process_bio in dm_request,
2544
+ * To get all processes out of __split_and_process_bio in dm_submit_bio,
27632545 * we take the write lock. To prevent any process from reentering
2764
- * __split_and_process_bio from dm_request and quiesce the thread
2765
- * (dm_wq_work), we set BMF_BLOCK_IO_FOR_SUSPEND and call
2546
+ * __split_and_process_bio from dm_submit_bio and quiesce the thread
2547
+ * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call
27662548 * flush_workqueue(md->wq).
27672549 */
27682550 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
....@@ -2773,11 +2555,8 @@
27732555 * Stop md->queue before flushing md->wq in case request-based
27742556 * dm defers requests to md->wq from md->queue.
27752557 */
2776
- if (dm_request_based(md)) {
2558
+ if (dm_request_based(md))
27772559 dm_stop_queue(md->queue);
2778
- if (md->kworker_task)
2779
- kthread_flush_worker(&md->kworker);
2780
- }
27812560
27822561 flush_workqueue(md->wq);
27832562
....@@ -3133,19 +2912,19 @@
31332912
31342913 int dm_suspended(struct dm_target *ti)
31352914 {
3136
- return dm_suspended_md(dm_table_get_md(ti->table));
2915
+ return dm_suspended_md(ti->table->md);
31372916 }
31382917 EXPORT_SYMBOL_GPL(dm_suspended);
31392918
31402919 int dm_post_suspending(struct dm_target *ti)
31412920 {
3142
- return dm_post_suspending_md(dm_table_get_md(ti->table));
2921
+ return dm_post_suspending_md(ti->table->md);
31432922 }
31442923 EXPORT_SYMBOL_GPL(dm_post_suspending);
31452924
31462925 int dm_noflush_suspending(struct dm_target *ti)
31472926 {
3148
- return __noflush_suspending(dm_table_get_md(ti->table));
2927
+ return __noflush_suspending(ti->table->md);
31492928 }
31502929 EXPORT_SYMBOL_GPL(dm_noflush_suspending);
31512930
....@@ -3164,7 +2943,6 @@
31642943 switch (type) {
31652944 case DM_TYPE_BIO_BASED:
31662945 case DM_TYPE_DAX_BIO_BASED:
3167
- case DM_TYPE_NVME_BIO_BASED:
31682946 pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size);
31692947 front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
31702948 io_front_pad = roundup(front_pad, __alignof__(struct dm_io)) + offsetof(struct dm_io, tio);
....@@ -3175,7 +2953,6 @@
31752953 goto out;
31762954 break;
31772955 case DM_TYPE_REQUEST_BASED:
3178
- case DM_TYPE_MQ_REQUEST_BASED:
31792956 pool_size = max(dm_get_reserved_rq_based_ios(), min_pool_size);
31802957 front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
31812958 /* per_io_data_size is used for blk-mq pdu at queue allocation */
....@@ -3233,6 +3010,11 @@
32333010 if (dm_table_get_num_targets(table) != 1)
32343011 goto out;
32353012 ti = dm_table_get_target(table, 0);
3013
+
3014
+ if (dm_suspended_md(md)) {
3015
+ ret = -EAGAIN;
3016
+ goto out;
3017
+ }
32363018
32373019 ret = -EINVAL;
32383020 if (!ti->type->iterate_devices)
....@@ -3373,6 +3155,17 @@
33733155 };
33743156
33753157 static const struct block_device_operations dm_blk_dops = {
3158
+ .submit_bio = dm_submit_bio,
3159
+ .open = dm_blk_open,
3160
+ .release = dm_blk_close,
3161
+ .ioctl = dm_blk_ioctl,
3162
+ .getgeo = dm_blk_getgeo,
3163
+ .report_zones = dm_blk_report_zones,
3164
+ .pr_ops = &dm_pr_ops,
3165
+ .owner = THIS_MODULE
3166
+};
3167
+
3168
+static const struct block_device_operations dm_rq_blk_dops = {
33763169 .open = dm_blk_open,
33773170 .release = dm_blk_close,
33783171 .ioctl = dm_blk_ioctl,
....@@ -3383,8 +3176,10 @@
33833176
33843177 static const struct dax_operations dm_dax_ops = {
33853178 .direct_access = dm_dax_direct_access,
3179
+ .dax_supported = dm_dax_supported,
33863180 .copy_from_iter = dm_dax_copy_from_iter,
33873181 .copy_to_iter = dm_dax_copy_to_iter,
3182
+ .zero_page_range = dm_dax_zero_page_range,
33883183 };
33893184
33903185 /*
@@ -3402,6 +3197,9 @@
 module_param(dm_numa_node, int, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations");
 
+module_param(swap_bios, int, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(swap_bios, "Maximum allowed inflight swap IOs");
+
 MODULE_DESCRIPTION(DM_NAME " driver");
 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
 MODULE_LICENSE("GPL");