2023-12-11 d2ccde1c8e90d38cee87a1b0309ad2827f3fd30d
kernel/drivers/md/dm-zoned-metadata.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) 2017 Western Digital Corporation or its affiliates.
  *
@@ -8,13 +9,14 @@
 
 #include <linux/module.h>
 #include <linux/crc32.h>
+#include <linux/sched/mm.h>
 
 #define DM_MSG_PREFIX		"zoned metadata"
 
 /*
  * Metadata version.
  */
-#define DMZ_META_VER	1
+#define DMZ_META_VER	2
 
 /*
  * On-disk super block magic.
@@ -33,7 +35,7 @@
  * (1) Super block (1 block)
  * (2) Chunk mapping table (nr_map_blocks)
  * (3) Bitmap blocks (nr_bitmap_blocks)
- * All metadata blocks are stored in conventional zones, starting from the
+ * All metadata blocks are stored in conventional zones, starting from
  * the first conventional zone found on disk.
  */
 struct dmz_super {
@@ -67,8 +69,17 @@
	/* Checksum */
	__le32 crc;			/*  48 */
 
+	/* DM-Zoned label */
+	u8 dmz_label[32];		/*  80 */
+
+	/* DM-Zoned UUID */
+	u8 dmz_uuid[16];		/*  96 */
+
+	/* Device UUID */
+	u8 dev_uuid[16];		/* 112 */
+
	/* Padding to full 512B sector */
-	u8 reserved[464];		/* 512 */
+	u8 reserved[400];		/* 512 */
 };
 
 /*
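The new fields line up with the existing 48-byte header, which is why the padding shrinks from 464 to 400 bytes (464 - 32 - 16 - 16 = 400). A minimal userspace sketch, not part of the patch (the struct name and use of static_assert are illustrative), that checks the arithmetic:

#include <assert.h>
#include <stdint.h>

/* Illustrative mirror of the v2 tail of struct dmz_super. */
struct dmz_super_v2_tail {
	uint8_t dmz_label[32];	/* ends at byte  80 */
	uint8_t dmz_uuid[16];	/* ends at byte  96 */
	uint8_t dev_uuid[16];	/* ends at byte 112 */
	uint8_t reserved[400];	/* ends at byte 512 */
};

/* The fields up to and including crc occupy 48 bytes. */
static_assert(48 + sizeof(struct dmz_super_v2_tail) == 512,
	      "v2 super block must still fill one 512B sector");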
@@ -120,8 +131,10 @@
  */
 struct dmz_sb {
	sector_t block;
+	struct dmz_dev *dev;
	struct dmz_mblock *mblk;
	struct dmz_super *sb;
+	struct dm_zone *zone;
 };
 
 /*
@@ -129,28 +142,41 @@
  */
 struct dmz_metadata {
	struct dmz_dev *dev;
+	unsigned int nr_devs;
+
+	char devname[BDEVNAME_SIZE];
+	char label[BDEVNAME_SIZE];
+	uuid_t uuid;
 
	sector_t zone_bitmap_size;
	unsigned int zone_nr_bitmap_blocks;
	unsigned int zone_bits_per_mblk;
 
+	sector_t zone_nr_blocks;
+	sector_t zone_nr_blocks_shift;
+
+	sector_t zone_nr_sectors;
+	sector_t zone_nr_sectors_shift;
+
	unsigned int nr_bitmap_blocks;
	unsigned int nr_map_blocks;
 
+	unsigned int nr_zones;
	unsigned int nr_useable_zones;
	unsigned int nr_meta_blocks;
	unsigned int nr_meta_zones;
	unsigned int nr_data_zones;
+	unsigned int nr_cache_zones;
	unsigned int nr_rnd_zones;
	unsigned int nr_reserved_seq;
	unsigned int nr_chunks;
 
	/* Zone information array */
-	struct dm_zone *zones;
+	struct xarray zones;
 
-	struct dm_zone *sb_zone;
	struct dmz_sb sb[2];
	unsigned int mblk_primary;
+	unsigned int sb_version;
	u64 sb_gen;
	unsigned int min_nr_mblks;
	unsigned int max_nr_mblks;
@@ -166,15 +192,11 @@
	/* Zone allocation management */
	struct mutex map_lock;
	struct dmz_mblock **map_mblk;
-	unsigned int nr_rnd;
-	atomic_t unmap_nr_rnd;
-	struct list_head unmap_rnd_list;
-	struct list_head map_rnd_list;
 
-	unsigned int nr_seq;
-	atomic_t unmap_nr_seq;
-	struct list_head unmap_seq_list;
-	struct list_head map_seq_list;
+	unsigned int nr_cache;
+	atomic_t unmap_nr_cache;
+	struct list_head unmap_cache_list;
+	struct list_head map_cache_list;
 
	atomic_t nr_reserved_seq_zones;
	struct list_head reserved_seq_zones_list;
@@ -182,22 +204,65 @@
	wait_queue_head_t free_wq;
 };
 
+#define dmz_zmd_info(zmd, format, args...)	\
+	DMINFO("(%s): " format, (zmd)->label, ## args)
+
+#define dmz_zmd_err(zmd, format, args...)	\
+	DMERR("(%s): " format, (zmd)->label, ## args)
+
+#define dmz_zmd_warn(zmd, format, args...)	\
+	DMWARN("(%s): " format, (zmd)->label, ## args)
+
+#define dmz_zmd_debug(zmd, format, args...)	\
+	DMDEBUG("(%s): " format, (zmd)->label, ## args)
 /*
  * Various accessors
  */
-unsigned int dmz_id(struct dmz_metadata *zmd, struct dm_zone *zone)
+static unsigned int dmz_dev_zone_id(struct dmz_metadata *zmd, struct dm_zone *zone)
 {
-	return ((unsigned int)(zone - zmd->zones));
+	if (WARN_ON(!zone))
+		return 0;
+
+	return zone->id - zone->dev->zone_offset;
 }
 
 sector_t dmz_start_sect(struct dmz_metadata *zmd, struct dm_zone *zone)
 {
-	return (sector_t)dmz_id(zmd, zone) << zmd->dev->zone_nr_sectors_shift;
+	unsigned int zone_id = dmz_dev_zone_id(zmd, zone);
+
+	return (sector_t)zone_id << zmd->zone_nr_sectors_shift;
 }
 
 sector_t dmz_start_block(struct dmz_metadata *zmd, struct dm_zone *zone)
 {
-	return (sector_t)dmz_id(zmd, zone) << zmd->dev->zone_nr_blocks_shift;
+	unsigned int zone_id = dmz_dev_zone_id(zmd, zone);
+
+	return (sector_t)zone_id << zmd->zone_nr_blocks_shift;
+}
+
+unsigned int dmz_zone_nr_blocks(struct dmz_metadata *zmd)
+{
+	return zmd->zone_nr_blocks;
+}
+
+unsigned int dmz_zone_nr_blocks_shift(struct dmz_metadata *zmd)
+{
+	return zmd->zone_nr_blocks_shift;
+}
+
+unsigned int dmz_zone_nr_sectors(struct dmz_metadata *zmd)
+{
+	return zmd->zone_nr_sectors;
+}
+
+unsigned int dmz_zone_nr_sectors_shift(struct dmz_metadata *zmd)
+{
+	return zmd->zone_nr_sectors_shift;
+}
+
+unsigned int dmz_nr_zones(struct dmz_metadata *zmd)
+{
+	return zmd->nr_zones;
 }
 
 unsigned int dmz_nr_chunks(struct dmz_metadata *zmd)
@@ -205,14 +270,88 @@
	return zmd->nr_chunks;
 }
 
-unsigned int dmz_nr_rnd_zones(struct dmz_metadata *zmd)
+unsigned int dmz_nr_rnd_zones(struct dmz_metadata *zmd, int idx)
 {
-	return zmd->nr_rnd;
+	return zmd->dev[idx].nr_rnd;
 }
 
-unsigned int dmz_nr_unmap_rnd_zones(struct dmz_metadata *zmd)
+unsigned int dmz_nr_unmap_rnd_zones(struct dmz_metadata *zmd, int idx)
 {
-	return atomic_read(&zmd->unmap_nr_rnd);
+	return atomic_read(&zmd->dev[idx].unmap_nr_rnd);
+}
+
+unsigned int dmz_nr_cache_zones(struct dmz_metadata *zmd)
+{
+	return zmd->nr_cache;
+}
+
+unsigned int dmz_nr_unmap_cache_zones(struct dmz_metadata *zmd)
+{
+	return atomic_read(&zmd->unmap_nr_cache);
+}
+
+unsigned int dmz_nr_seq_zones(struct dmz_metadata *zmd, int idx)
+{
+	return zmd->dev[idx].nr_seq;
+}
+
+unsigned int dmz_nr_unmap_seq_zones(struct dmz_metadata *zmd, int idx)
+{
+	return atomic_read(&zmd->dev[idx].unmap_nr_seq);
+}
+
+static struct dm_zone *dmz_get(struct dmz_metadata *zmd, unsigned int zone_id)
+{
+	return xa_load(&zmd->zones, zone_id);
+}
+
+static struct dm_zone *dmz_insert(struct dmz_metadata *zmd,
+				  unsigned int zone_id, struct dmz_dev *dev)
+{
+	struct dm_zone *zone = kzalloc(sizeof(struct dm_zone), GFP_KERNEL);
+
+	if (!zone)
+		return ERR_PTR(-ENOMEM);
+
+	if (xa_insert(&zmd->zones, zone_id, zone, GFP_KERNEL)) {
+		kfree(zone);
+		return ERR_PTR(-EBUSY);
+	}
+
+	INIT_LIST_HEAD(&zone->link);
+	atomic_set(&zone->refcount, 0);
+	zone->id = zone_id;
+	zone->chunk = DMZ_MAP_UNMAPPED;
+	zone->dev = dev;
+
+	return zone;
+}
+
+const char *dmz_metadata_label(struct dmz_metadata *zmd)
+{
+	return (const char *)zmd->label;
+}
+
+bool dmz_check_dev(struct dmz_metadata *zmd)
+{
+	unsigned int i;
+
+	for (i = 0; i < zmd->nr_devs; i++) {
+		if (!dmz_check_bdev(&zmd->dev[i]))
+			return false;
+	}
+	return true;
+}
+
+bool dmz_dev_is_dying(struct dmz_metadata *zmd)
+{
+	unsigned int i;
+
+	for (i = 0; i < zmd->nr_devs; i++) {
+		if (dmz_bdev_is_dying(&zmd->dev[i]))
+			return true;
+	}
	return false;
 }
 
 /*
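The zone array moves from a flat kcalloc() array to an xarray. A short sketch of the two xarray calls relied on above (the demo names are illustrative, not from the patch): xa_insert() claims an index exclusively and fails with -EBUSY if it is already occupied, while xa_load() simply returns NULL for absent indices, which is why lookups elsewhere in this patch gain NULL checks.

#include <linux/slab.h>
#include <linux/xarray.h>

static DEFINE_XARRAY(demo_zones);

static int demo_insert_zone(unsigned int zone_id)
{
	struct dm_zone *zone = kzalloc(sizeof(*zone), GFP_KERNEL);

	if (!zone)
		return -ENOMEM;
	/* Exclusive insertion: an occupied index is reported, not overwritten. */
	if (xa_insert(&demo_zones, zone_id, zone, GFP_KERNEL)) {
		kfree(zone);
		return -EBUSY;
	}
	/* Readers can look up locklessly; NULL means "no such zone". */
	return xa_load(&demo_zones, zone_id) ? 0 : -ENXIO;
}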
@@ -233,7 +372,7 @@
 * Lock/unlock metadata access. This is a "read" lock on a semaphore
 * that prevents metadata flush from running while metadata are being
 * modified. The actual metadata write mutual exclusion is achieved with
- * the map lock and zone styate management (active and reclaim state are
+ * the map lock and zone state management (active and reclaim state are
 * mutually exclusive).
 */
 void dmz_lock_metadata(struct dmz_metadata *zmd)
@@ -400,9 +539,10 @@
 {
	struct dmz_mblock *mblk, *m;
	sector_t block = zmd->sb[zmd->mblk_primary].block + mblk_no;
+	struct dmz_dev *dev = zmd->sb[zmd->mblk_primary].dev;
	struct bio *bio;
 
-	if (dmz_bdev_is_dying(zmd->dev))
+	if (dmz_bdev_is_dying(dev))
		return ERR_PTR(-EIO);
 
	/* Get a new block and a BIO to read it */
@@ -438,7 +578,7 @@
 
	/* Submit read BIO */
	bio->bi_iter.bi_sector = dmz_blk2sect(block);
-	bio_set_dev(bio, zmd->dev->bdev);
+	bio_set_dev(bio, dev->bdev);
	bio->bi_private = mblk;
	bio->bi_end_io = dmz_mblock_bio_end_io;
	bio_set_op_attrs(bio, REQ_OP_READ, REQ_META | REQ_PRIO);
@@ -535,6 +675,7 @@
			    sector_t mblk_no)
 {
	struct dmz_mblock *mblk;
+	struct dmz_dev *dev = zmd->sb[zmd->mblk_primary].dev;
 
	/* Check rbtree */
	spin_lock(&zmd->mblk_lock);
@@ -553,7 +694,7 @@
			       TASK_UNINTERRUPTIBLE);
		if (test_bit(DMZ_META_ERROR, &mblk->state)) {
			dmz_release_mblock(zmd, mblk);
-			dmz_check_bdev(zmd->dev);
+			dmz_check_bdev(dev);
			return ERR_PTR(-EIO);
		}
 
@@ -577,10 +718,11 @@
 static int dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk,
			    unsigned int set)
 {
+	struct dmz_dev *dev = zmd->sb[set].dev;
	sector_t block = zmd->sb[set].block + mblk->no;
	struct bio *bio;
 
-	if (dmz_bdev_is_dying(zmd->dev))
+	if (dmz_bdev_is_dying(dev))
		return -EIO;
 
	bio = bio_alloc(GFP_NOIO, 1);
@@ -592,7 +734,7 @@
	set_bit(DMZ_META_WRITING, &mblk->state);
 
	bio->bi_iter.bi_sector = dmz_blk2sect(block);
-	bio_set_dev(bio, zmd->dev->bdev);
+	bio_set_dev(bio, dev->bdev);
	bio->bi_private = mblk;
	bio->bi_end_io = dmz_mblock_bio_end_io;
	bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_META | REQ_PRIO);
@@ -605,13 +747,16 @@
 /*
 * Read/write a metadata block.
 */
-static int dmz_rdwr_block(struct dmz_metadata *zmd, int op, sector_t block,
-			  struct page *page)
+static int dmz_rdwr_block(struct dmz_dev *dev, int op,
+			  sector_t block, struct page *page)
 {
	struct bio *bio;
	int ret;
 
-	if (dmz_bdev_is_dying(zmd->dev))
+	if (WARN_ON(!dev))
+		return -EIO;
+
+	if (dmz_bdev_is_dying(dev))
		return -EIO;
 
	bio = bio_alloc(GFP_NOIO, 1);
@@ -619,14 +764,14 @@
		return -ENOMEM;
 
	bio->bi_iter.bi_sector = dmz_blk2sect(block);
-	bio_set_dev(bio, zmd->dev->bdev);
+	bio_set_dev(bio, dev->bdev);
	bio_set_op_attrs(bio, op, REQ_SYNC | REQ_META | REQ_PRIO);
	bio_add_page(bio, page, DMZ_BLOCK_SIZE, 0);
	ret = submit_bio_wait(bio);
	bio_put(bio);
 
	if (ret)
-		dmz_check_bdev(zmd->dev);
+		dmz_check_bdev(dev);
	return ret;
 }
 
@@ -635,18 +780,32 @@
 */
 static int dmz_write_sb(struct dmz_metadata *zmd, unsigned int set)
 {
-	sector_t block = zmd->sb[set].block;
	struct dmz_mblock *mblk = zmd->sb[set].mblk;
	struct dmz_super *sb = zmd->sb[set].sb;
+	struct dmz_dev *dev = zmd->sb[set].dev;
+	sector_t sb_block;
	u64 sb_gen = zmd->sb_gen + 1;
	int ret;
 
	sb->magic = cpu_to_le32(DMZ_MAGIC);
-	sb->version = cpu_to_le32(DMZ_META_VER);
+
+	sb->version = cpu_to_le32(zmd->sb_version);
+	if (zmd->sb_version > 1) {
+		BUILD_BUG_ON(UUID_SIZE != 16);
+		export_uuid(sb->dmz_uuid, &zmd->uuid);
+		memcpy(sb->dmz_label, zmd->label, BDEVNAME_SIZE);
+		export_uuid(sb->dev_uuid, &dev->uuid);
+	}
 
	sb->gen = cpu_to_le64(sb_gen);
 
-	sb->sb_block = cpu_to_le64(block);
+	/*
+	 * The metadata always references the absolute block address,
+	 * ie relative to the entire block range, not the per-device
+	 * block address.
+	 */
+	sb_block = zmd->sb[set].zone->id << zmd->zone_nr_blocks_shift;
+	sb->sb_block = cpu_to_le64(sb_block);
	sb->nr_meta_blocks = cpu_to_le32(zmd->nr_meta_blocks);
	sb->nr_reserved_seq = cpu_to_le32(zmd->nr_reserved_seq);
	sb->nr_chunks = cpu_to_le32(zmd->nr_chunks);
@@ -657,9 +816,10 @@
	sb->crc = 0;
	sb->crc = cpu_to_le32(crc32_le(sb_gen, (unsigned char *)sb, DMZ_BLOCK_SIZE));
 
-	ret = dmz_rdwr_block(zmd, REQ_OP_WRITE, block, mblk->page);
+	ret = dmz_rdwr_block(dev, REQ_OP_WRITE, zmd->sb[set].block,
+			     mblk->page);
	if (ret == 0)
-		ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL);
+		ret = blkdev_issue_flush(dev->bdev, GFP_NOIO);
 
	return ret;
 }
@@ -672,6 +832,7 @@
				 unsigned int set)
 {
	struct dmz_mblock *mblk;
+	struct dmz_dev *dev = zmd->sb[set].dev;
	struct blk_plug plug;
	int ret = 0, nr_mblks_submitted = 0;
 
@@ -693,7 +854,7 @@
			       TASK_UNINTERRUPTIBLE);
		if (test_bit(DMZ_META_ERROR, &mblk->state)) {
			clear_bit(DMZ_META_ERROR, &mblk->state);
-			dmz_check_bdev(zmd->dev);
+			dmz_check_bdev(dev);
			ret = -EIO;
		}
		nr_mblks_submitted--;
@@ -701,7 +862,7 @@
 
	/* Flush drive cache (this will also sync data) */
	if (ret == 0)
-		ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL);
+		ret = blkdev_issue_flush(dev->bdev, GFP_NOIO);
 
	return ret;
 }
@@ -738,6 +899,7 @@
 {
	struct dmz_mblock *mblk;
	struct list_head write_list;
+	struct dmz_dev *dev;
	int ret;
 
	if (WARN_ON(!zmd))
@@ -751,6 +913,7 @@
	 * from modifying metadata.
	 */
	down_write(&zmd->mblk_sem);
+	dev = zmd->sb[zmd->mblk_primary].dev;
 
	/*
	 * This is called from the target flush work and reclaim work.
@@ -758,7 +921,7 @@
	 */
	dmz_lock_flush(zmd);
 
-	if (dmz_bdev_is_dying(zmd->dev)) {
+	if (dmz_bdev_is_dying(dev)) {
		ret = -EIO;
		goto out;
	}
@@ -770,7 +933,7 @@
 
	/* If there are no dirty metadata blocks, just flush the device cache */
	if (list_empty(&write_list)) {
-		ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL);
+		ret = blkdev_issue_flush(dev->bdev, GFP_NOIO);
		goto err;
	}
 
@@ -819,7 +982,7 @@
		list_splice(&write_list, &zmd->mblk_dirty_list);
		spin_unlock(&zmd->mblk_lock);
	}
-	if (!dmz_check_bdev(zmd->dev))
+	if (!dmz_check_bdev(dev))
		ret = -EIO;
	goto out;
 }
@@ -827,12 +990,31 @@
 /*
 * Check super block.
 */
-static int dmz_check_sb(struct dmz_metadata *zmd, struct dmz_super *sb)
+static int dmz_check_sb(struct dmz_metadata *zmd, struct dmz_sb *dsb,
+			bool tertiary)
 {
+	struct dmz_super *sb = dsb->sb;
+	struct dmz_dev *dev = dsb->dev;
	unsigned int nr_meta_zones, nr_data_zones;
-	struct dmz_dev *dev = zmd->dev;
	u32 crc, stored_crc;
-	u64 gen;
+	u64 gen, sb_block;
+
+	if (le32_to_cpu(sb->magic) != DMZ_MAGIC) {
+		dmz_dev_err(dev, "Invalid meta magic (needed 0x%08x, got 0x%08x)",
+			    DMZ_MAGIC, le32_to_cpu(sb->magic));
+		return -ENXIO;
+	}
+
+	zmd->sb_version = le32_to_cpu(sb->version);
+	if (zmd->sb_version > DMZ_META_VER) {
+		dmz_dev_err(dev, "Invalid meta version (needed %d, got %d)",
+			    DMZ_META_VER, zmd->sb_version);
+		return -EINVAL;
+	}
+	if (zmd->sb_version < 2 && tertiary) {
+		dmz_dev_err(dev, "Tertiary superblocks are not supported");
+		return -EINVAL;
+	}
 
	gen = le64_to_cpu(sb->gen);
	stored_crc = le32_to_cpu(sb->crc);
@@ -844,22 +1026,60 @@
		return -ENXIO;
	}
 
-	if (le32_to_cpu(sb->magic) != DMZ_MAGIC) {
-		dmz_dev_err(dev, "Invalid meta magic (needed 0x%08x, got 0x%08x)",
-			    DMZ_MAGIC, le32_to_cpu(sb->magic));
-		return -ENXIO;
+	sb_block = le64_to_cpu(sb->sb_block);
+	if (sb_block != (u64)dsb->zone->id << zmd->zone_nr_blocks_shift) {
+		dmz_dev_err(dev, "Invalid superblock position "
+			    "(is %llu expected %llu)",
+			    sb_block,
+			    (u64)dsb->zone->id << zmd->zone_nr_blocks_shift);
+		return -EINVAL;
+	}
+	if (zmd->sb_version > 1) {
+		uuid_t sb_uuid;
+
+		import_uuid(&sb_uuid, sb->dmz_uuid);
+		if (uuid_is_null(&sb_uuid)) {
+			dmz_dev_err(dev, "NULL DM-Zoned uuid");
+			return -ENXIO;
+		} else if (uuid_is_null(&zmd->uuid)) {
+			uuid_copy(&zmd->uuid, &sb_uuid);
+		} else if (!uuid_equal(&zmd->uuid, &sb_uuid)) {
+			dmz_dev_err(dev, "mismatching DM-Zoned uuid, "
+				    "is %pUl expected %pUl",
+				    &sb_uuid, &zmd->uuid);
+			return -ENXIO;
+		}
+		if (!strlen(zmd->label))
+			memcpy(zmd->label, sb->dmz_label, BDEVNAME_SIZE);
+		else if (memcmp(zmd->label, sb->dmz_label, BDEVNAME_SIZE)) {
+			dmz_dev_err(dev, "mismatching DM-Zoned label, "
+				    "is %s expected %s",
+				    sb->dmz_label, zmd->label);
+			return -ENXIO;
+		}
+		import_uuid(&dev->uuid, sb->dev_uuid);
+		if (uuid_is_null(&dev->uuid)) {
+			dmz_dev_err(dev, "NULL device uuid");
+			return -ENXIO;
+		}
+
+		if (tertiary) {
+			/*
+			 * Generation number should be 0, but it doesn't
+			 * really matter if it isn't.
+			 */
+			if (gen != 0)
+				dmz_dev_warn(dev, "Invalid generation %llu",
+					     gen);
+			return 0;
+		}
	}
 
-	if (le32_to_cpu(sb->version) != DMZ_META_VER) {
-		dmz_dev_err(dev, "Invalid meta version (needed %d, got %d)",
-			    DMZ_META_VER, le32_to_cpu(sb->version));
-		return -ENXIO;
-	}
-
-	nr_meta_zones = (le32_to_cpu(sb->nr_meta_blocks) + dev->zone_nr_blocks - 1)
-			>> dev->zone_nr_blocks_shift;
+	nr_meta_zones = (le32_to_cpu(sb->nr_meta_blocks) + zmd->zone_nr_blocks - 1)
+			>> zmd->zone_nr_blocks_shift;
	if (!nr_meta_zones ||
-	    nr_meta_zones >= zmd->nr_rnd_zones) {
+	    (zmd->nr_devs <= 1 && nr_meta_zones >= zmd->nr_rnd_zones) ||
+	    (zmd->nr_devs > 1 && nr_meta_zones >= zmd->nr_cache_zones)) {
		dmz_dev_err(dev, "Invalid number of metadata blocks");
		return -ENXIO;
	}
8931113 /*
8941114 * Read the first or second super block from disk.
8951115 */
896
-static int dmz_read_sb(struct dmz_metadata *zmd, unsigned int set)
1116
+static int dmz_read_sb(struct dmz_metadata *zmd, struct dmz_sb *sb, int set)
8971117 {
898
- return dmz_rdwr_block(zmd, REQ_OP_READ, zmd->sb[set].block,
899
- zmd->sb[set].mblk->page);
1118
+ dmz_zmd_debug(zmd, "read superblock set %d dev %s block %llu",
1119
+ set, sb->dev->name, sb->block);
1120
+
1121
+ return dmz_rdwr_block(sb->dev, REQ_OP_READ,
1122
+ sb->block, sb->mblk->page);
9001123 }
9011124
9021125 /*
....@@ -906,8 +1129,9 @@
9061129 */
9071130 static int dmz_lookup_secondary_sb(struct dmz_metadata *zmd)
9081131 {
909
- unsigned int zone_nr_blocks = zmd->dev->zone_nr_blocks;
1132
+ unsigned int zone_nr_blocks = zmd->zone_nr_blocks;
9101133 struct dmz_mblock *mblk;
1134
+ unsigned int zone_id = zmd->sb[0].zone->id;
9111135 int i;
9121136
9131137 /* Allocate a block */
....@@ -920,24 +1144,29 @@
9201144
9211145 /* Bad first super block: search for the second one */
9221146 zmd->sb[1].block = zmd->sb[0].block + zone_nr_blocks;
923
- for (i = 0; i < zmd->nr_rnd_zones - 1; i++) {
924
- if (dmz_read_sb(zmd, 1) != 0)
1147
+ zmd->sb[1].zone = dmz_get(zmd, zone_id + 1);
1148
+ zmd->sb[1].dev = zmd->sb[0].dev;
1149
+ for (i = 1; i < zmd->nr_rnd_zones; i++) {
1150
+ if (dmz_read_sb(zmd, &zmd->sb[1], 1) != 0)
9251151 break;
9261152 if (le32_to_cpu(zmd->sb[1].sb->magic) == DMZ_MAGIC)
9271153 return 0;
9281154 zmd->sb[1].block += zone_nr_blocks;
1155
+ zmd->sb[1].zone = dmz_get(zmd, zone_id + i);
9291156 }
9301157
9311158 dmz_free_mblock(zmd, mblk);
9321159 zmd->sb[1].mblk = NULL;
1160
+ zmd->sb[1].zone = NULL;
1161
+ zmd->sb[1].dev = NULL;
9331162
9341163 return -EIO;
9351164 }
9361165
9371166 /*
938
- * Read the first or second super block from disk.
1167
+ * Read a super block from disk.
9391168 */
940
-static int dmz_get_sb(struct dmz_metadata *zmd, unsigned int set)
1169
+static int dmz_get_sb(struct dmz_metadata *zmd, struct dmz_sb *sb, int set)
9411170 {
9421171 struct dmz_mblock *mblk;
9431172 int ret;
....@@ -947,14 +1176,14 @@
9471176 if (!mblk)
9481177 return -ENOMEM;
9491178
950
- zmd->sb[set].mblk = mblk;
951
- zmd->sb[set].sb = mblk->data;
1179
+ sb->mblk = mblk;
1180
+ sb->sb = mblk->data;
9521181
9531182 /* Read super block */
954
- ret = dmz_read_sb(zmd, set);
1183
+ ret = dmz_read_sb(zmd, sb, set);
9551184 if (ret) {
9561185 dmz_free_mblock(zmd, mblk);
957
- zmd->sb[set].mblk = NULL;
1186
+ sb->mblk = NULL;
9581187 return ret;
9591188 }
9601189
....@@ -970,14 +1199,13 @@
9701199 struct page *page;
9711200 int i, ret;
9721201
973
- dmz_dev_warn(zmd->dev, "Metadata set %u invalid: recovering", dst_set);
1202
+ dmz_dev_warn(zmd->sb[dst_set].dev,
1203
+ "Metadata set %u invalid: recovering", dst_set);
9741204
9751205 if (dst_set == 0)
976
- zmd->sb[0].block = dmz_start_block(zmd, zmd->sb_zone);
977
- else {
978
- zmd->sb[1].block = zmd->sb[0].block +
979
- (zmd->nr_meta_zones << zmd->dev->zone_nr_blocks_shift);
980
- }
1206
+ zmd->sb[0].block = dmz_start_block(zmd, zmd->sb[0].zone);
1207
+ else
1208
+ zmd->sb[1].block = dmz_start_block(zmd, zmd->sb[1].zone);
9811209
9821210 page = alloc_page(GFP_NOIO);
9831211 if (!page)
....@@ -985,11 +1213,11 @@
9851213
9861214 /* Copy metadata blocks */
9871215 for (i = 1; i < zmd->nr_meta_blocks; i++) {
988
- ret = dmz_rdwr_block(zmd, REQ_OP_READ,
1216
+ ret = dmz_rdwr_block(zmd->sb[src_set].dev, REQ_OP_READ,
9891217 zmd->sb[src_set].block + i, page);
9901218 if (ret)
9911219 goto out;
992
- ret = dmz_rdwr_block(zmd, REQ_OP_WRITE,
1220
+ ret = dmz_rdwr_block(zmd->sb[dst_set].dev, REQ_OP_WRITE,
9931221 zmd->sb[dst_set].block + i, page);
9941222 if (ret)
9951223 goto out;
....@@ -1021,53 +1249,73 @@
10211249 u64 sb_gen[2] = {0, 0};
10221250 int ret;
10231251
1252
+ if (!zmd->sb[0].zone) {
1253
+ dmz_zmd_err(zmd, "Primary super block zone not set");
1254
+ return -ENXIO;
1255
+ }
1256
+
10241257 /* Read and check the primary super block */
1025
- zmd->sb[0].block = dmz_start_block(zmd, zmd->sb_zone);
1026
- ret = dmz_get_sb(zmd, 0);
1258
+ zmd->sb[0].block = dmz_start_block(zmd, zmd->sb[0].zone);
1259
+ zmd->sb[0].dev = zmd->sb[0].zone->dev;
1260
+ ret = dmz_get_sb(zmd, &zmd->sb[0], 0);
10271261 if (ret) {
1028
- dmz_dev_err(zmd->dev, "Read primary super block failed");
1262
+ dmz_dev_err(zmd->sb[0].dev, "Read primary super block failed");
10291263 return ret;
10301264 }
10311265
1032
- ret = dmz_check_sb(zmd, zmd->sb[0].sb);
1266
+ ret = dmz_check_sb(zmd, &zmd->sb[0], false);
10331267
10341268 /* Read and check secondary super block */
10351269 if (ret == 0) {
10361270 sb_good[0] = true;
1037
- zmd->sb[1].block = zmd->sb[0].block +
1038
- (zmd->nr_meta_zones << zmd->dev->zone_nr_blocks_shift);
1039
- ret = dmz_get_sb(zmd, 1);
1271
+ if (!zmd->sb[1].zone) {
1272
+ unsigned int zone_id =
1273
+ zmd->sb[0].zone->id + zmd->nr_meta_zones;
1274
+
1275
+ zmd->sb[1].zone = dmz_get(zmd, zone_id);
1276
+ }
1277
+ zmd->sb[1].block = dmz_start_block(zmd, zmd->sb[1].zone);
1278
+ zmd->sb[1].dev = zmd->sb[0].dev;
1279
+ ret = dmz_get_sb(zmd, &zmd->sb[1], 1);
10401280 } else
10411281 ret = dmz_lookup_secondary_sb(zmd);
10421282
10431283 if (ret) {
1044
- dmz_dev_err(zmd->dev, "Read secondary super block failed");
1284
+ dmz_dev_err(zmd->sb[1].dev, "Read secondary super block failed");
10451285 return ret;
10461286 }
10471287
1048
- ret = dmz_check_sb(zmd, zmd->sb[1].sb);
1288
+ ret = dmz_check_sb(zmd, &zmd->sb[1], false);
10491289 if (ret == 0)
10501290 sb_good[1] = true;
10511291
10521292 /* Use highest generation sb first */
10531293 if (!sb_good[0] && !sb_good[1]) {
1054
- dmz_dev_err(zmd->dev, "No valid super block found");
1294
+ dmz_zmd_err(zmd, "No valid super block found");
10551295 return -EIO;
10561296 }
10571297
10581298 if (sb_good[0])
10591299 sb_gen[0] = le64_to_cpu(zmd->sb[0].sb->gen);
1060
- else
1300
+ else {
10611301 ret = dmz_recover_mblocks(zmd, 0);
1302
+ if (ret) {
1303
+ dmz_dev_err(zmd->sb[0].dev,
1304
+ "Recovery of superblock 0 failed");
1305
+ return -EIO;
1306
+ }
1307
+ }
10621308
10631309 if (sb_good[1])
10641310 sb_gen[1] = le64_to_cpu(zmd->sb[1].sb->gen);
1065
- else
1311
+ else {
10661312 ret = dmz_recover_mblocks(zmd, 1);
10671313
1068
- if (ret) {
1069
- dmz_dev_err(zmd->dev, "Recovery failed");
1070
- return -EIO;
1314
+ if (ret) {
1315
+ dmz_dev_err(zmd->sb[1].dev,
1316
+ "Recovery of superblock 1 failed");
1317
+ return -EIO;
1318
+ }
10711319 }
10721320
10731321 if (sb_gen[0] >= sb_gen[1]) {
....@@ -1078,60 +1326,141 @@
10781326 zmd->mblk_primary = 1;
10791327 }
10801328
1081
- dmz_dev_debug(zmd->dev, "Using super block %u (gen %llu)",
1329
+ dmz_dev_debug(zmd->sb[zmd->mblk_primary].dev,
1330
+ "Using super block %u (gen %llu)",
10821331 zmd->mblk_primary, zmd->sb_gen);
10831332
1084
- return 0;
1333
+ if (zmd->sb_version > 1) {
1334
+ int i;
1335
+ struct dmz_sb *sb;
1336
+
1337
+ sb = kzalloc(sizeof(struct dmz_sb), GFP_KERNEL);
1338
+ if (!sb)
1339
+ return -ENOMEM;
1340
+ for (i = 1; i < zmd->nr_devs; i++) {
1341
+ sb->block = 0;
1342
+ sb->zone = dmz_get(zmd, zmd->dev[i].zone_offset);
1343
+ sb->dev = &zmd->dev[i];
1344
+ if (!dmz_is_meta(sb->zone)) {
1345
+ dmz_dev_err(sb->dev,
1346
+ "Tertiary super block zone %u not marked as metadata zone",
1347
+ sb->zone->id);
1348
+ ret = -EINVAL;
1349
+ goto out_kfree;
1350
+ }
1351
+ ret = dmz_get_sb(zmd, sb, i + 1);
1352
+ if (ret) {
1353
+ dmz_dev_err(sb->dev,
1354
+ "Read tertiary super block failed");
1355
+ dmz_free_mblock(zmd, sb->mblk);
1356
+ goto out_kfree;
1357
+ }
1358
+ ret = dmz_check_sb(zmd, sb, true);
1359
+ dmz_free_mblock(zmd, sb->mblk);
1360
+ if (ret == -EINVAL)
1361
+ goto out_kfree;
1362
+ }
1363
+ out_kfree:
1364
+ kfree(sb);
1365
+ }
1366
+ return ret;
10851367 }
10861368
10871369 /*
10881370 * Initialize a zone descriptor.
10891371 */
1090
-static int dmz_init_zone(struct dmz_metadata *zmd, struct dm_zone *zone,
1091
- struct blk_zone *blkz)
1372
+static int dmz_init_zone(struct blk_zone *blkz, unsigned int num, void *data)
10921373 {
1093
- struct dmz_dev *dev = zmd->dev;
1374
+ struct dmz_dev *dev = data;
1375
+ struct dmz_metadata *zmd = dev->metadata;
1376
+ int idx = num + dev->zone_offset;
1377
+ struct dm_zone *zone;
10941378
1095
- /* Ignore the eventual last runt (smaller) zone */
1096
- if (blkz->len != dev->zone_nr_sectors) {
1097
- if (blkz->start + blkz->len == dev->capacity)
1379
+ zone = dmz_insert(zmd, idx, dev);
1380
+ if (IS_ERR(zone))
1381
+ return PTR_ERR(zone);
1382
+
1383
+ if (blkz->len != zmd->zone_nr_sectors) {
1384
+ if (zmd->sb_version > 1) {
1385
+ /* Ignore the eventual runt (smaller) zone */
1386
+ set_bit(DMZ_OFFLINE, &zone->flags);
1387
+ return 0;
1388
+ } else if (blkz->start + blkz->len == dev->capacity)
10981389 return 0;
10991390 return -ENXIO;
11001391 }
11011392
1102
- INIT_LIST_HEAD(&zone->link);
1103
- atomic_set(&zone->refcount, 0);
1104
- zone->chunk = DMZ_MAP_UNMAPPED;
1105
-
1106
- if (blkz->type == BLK_ZONE_TYPE_CONVENTIONAL) {
1107
- set_bit(DMZ_RND, &zone->flags);
1108
- } else if (blkz->type == BLK_ZONE_TYPE_SEQWRITE_REQ ||
1109
- blkz->type == BLK_ZONE_TYPE_SEQWRITE_PREF) {
1110
- set_bit(DMZ_SEQ, &zone->flags);
1111
- } else
1393
+ /*
1394
+ * Devices that have zones with a capacity smaller than the zone size
1395
+ * (e.g. NVMe zoned namespaces) are not supported.
1396
+ */
1397
+ if (blkz->capacity != blkz->len)
11121398 return -ENXIO;
11131399
1114
- if (blkz->cond == BLK_ZONE_COND_OFFLINE)
1115
- set_bit(DMZ_OFFLINE, &zone->flags);
1116
- else if (blkz->cond == BLK_ZONE_COND_READONLY)
1117
- set_bit(DMZ_READ_ONLY, &zone->flags);
1400
+ switch (blkz->type) {
1401
+ case BLK_ZONE_TYPE_CONVENTIONAL:
1402
+ set_bit(DMZ_RND, &zone->flags);
1403
+ break;
1404
+ case BLK_ZONE_TYPE_SEQWRITE_REQ:
1405
+ case BLK_ZONE_TYPE_SEQWRITE_PREF:
1406
+ set_bit(DMZ_SEQ, &zone->flags);
1407
+ break;
1408
+ default:
1409
+ return -ENXIO;
1410
+ }
11181411
11191412 if (dmz_is_rnd(zone))
11201413 zone->wp_block = 0;
11211414 else
11221415 zone->wp_block = dmz_sect2blk(blkz->wp - blkz->start);
11231416
1124
- if (!dmz_is_offline(zone) && !dmz_is_readonly(zone)) {
1417
+ if (blkz->cond == BLK_ZONE_COND_OFFLINE)
1418
+ set_bit(DMZ_OFFLINE, &zone->flags);
1419
+ else if (blkz->cond == BLK_ZONE_COND_READONLY)
1420
+ set_bit(DMZ_READ_ONLY, &zone->flags);
1421
+ else {
11251422 zmd->nr_useable_zones++;
11261423 if (dmz_is_rnd(zone)) {
11271424 zmd->nr_rnd_zones++;
1128
- if (!zmd->sb_zone) {
1129
- /* Super block zone */
1130
- zmd->sb_zone = zone;
1425
+ if (zmd->nr_devs == 1 && !zmd->sb[0].zone) {
1426
+ /* Primary super block zone */
1427
+ zmd->sb[0].zone = zone;
11311428 }
11321429 }
1430
+ if (zmd->nr_devs > 1 && num == 0) {
1431
+ /*
1432
+ * Tertiary superblock zones are always at the
1433
+ * start of the zoned devices, so mark them
1434
+ * as metadata zone.
1435
+ */
1436
+ set_bit(DMZ_META, &zone->flags);
1437
+ }
11331438 }
1439
+ return 0;
1440
+}
11341441
1442
+static int dmz_emulate_zones(struct dmz_metadata *zmd, struct dmz_dev *dev)
1443
+{
1444
+ int idx;
1445
+ sector_t zone_offset = 0;
1446
+
1447
+ for(idx = 0; idx < dev->nr_zones; idx++) {
1448
+ struct dm_zone *zone;
1449
+
1450
+ zone = dmz_insert(zmd, idx, dev);
1451
+ if (IS_ERR(zone))
1452
+ return PTR_ERR(zone);
1453
+ set_bit(DMZ_CACHE, &zone->flags);
1454
+ zone->wp_block = 0;
1455
+ zmd->nr_cache_zones++;
1456
+ zmd->nr_useable_zones++;
1457
+ if (dev->capacity - zone_offset < zmd->zone_nr_sectors) {
1458
+ /* Disable runt zone */
1459
+ set_bit(DMZ_OFFLINE, &zone->flags);
1460
+ break;
1461
+ }
1462
+ zone_offset += zmd->zone_nr_sectors;
1463
+ }
11351464 return 0;
11361465 }
11371466
@@ -1140,15 +1469,16 @@
 */
 static void dmz_drop_zones(struct dmz_metadata *zmd)
 {
-	kfree(zmd->zones);
-	zmd->zones = NULL;
-}
+	int idx;
 
-/*
- * The size of a zone report in number of zones.
- * This results in 4096*64B=256KB report zones commands.
- */
-#define DMZ_REPORT_NR_ZONES	4096
+	for (idx = 0; idx < zmd->nr_zones; idx++) {
+		struct dm_zone *zone = xa_load(&zmd->zones, idx);
+
+		kfree(zone);
+		xa_erase(&zmd->zones, idx);
+	}
+	xa_destroy(&zmd->zones);
+}
 
 /*
 * Allocate and initialize zone descriptors using the zone
@@ -1156,77 +1486,111 @@
 */
 static int dmz_init_zones(struct dmz_metadata *zmd)
 {
-	struct dmz_dev *dev = zmd->dev;
-	struct dm_zone *zone;
-	struct blk_zone *blkz;
-	unsigned int nr_blkz;
-	sector_t sector = 0;
-	int i, ret = 0;
+	int i, ret;
+	struct dmz_dev *zoned_dev = &zmd->dev[0];
 
	/* Init */
-	zmd->zone_bitmap_size = dev->zone_nr_blocks >> 3;
+	zmd->zone_nr_sectors = zmd->dev[0].zone_nr_sectors;
+	zmd->zone_nr_sectors_shift = ilog2(zmd->zone_nr_sectors);
+	zmd->zone_nr_blocks = dmz_sect2blk(zmd->zone_nr_sectors);
+	zmd->zone_nr_blocks_shift = ilog2(zmd->zone_nr_blocks);
+	zmd->zone_bitmap_size = zmd->zone_nr_blocks >> 3;
	zmd->zone_nr_bitmap_blocks =
		max_t(sector_t, 1, zmd->zone_bitmap_size >> DMZ_BLOCK_SHIFT);
-	zmd->zone_bits_per_mblk = min_t(sector_t, dev->zone_nr_blocks,
+	zmd->zone_bits_per_mblk = min_t(sector_t, zmd->zone_nr_blocks,
					DMZ_BLOCK_SIZE_BITS);
 
	/* Allocate zone array */
-	zmd->zones = kcalloc(dev->nr_zones, sizeof(struct dm_zone), GFP_KERNEL);
-	if (!zmd->zones)
-		return -ENOMEM;
+	zmd->nr_zones = 0;
+	for (i = 0; i < zmd->nr_devs; i++) {
+		struct dmz_dev *dev = &zmd->dev[i];
 
-	dmz_dev_info(dev, "Using %zu B for zone information",
-		     sizeof(struct dm_zone) * dev->nr_zones);
+		dev->metadata = zmd;
+		zmd->nr_zones += dev->nr_zones;
 
-	/* Get zone information */
-	nr_blkz = DMZ_REPORT_NR_ZONES;
-	blkz = kcalloc(nr_blkz, sizeof(struct blk_zone), GFP_KERNEL);
-	if (!blkz) {
-		ret = -ENOMEM;
-		goto out;
+		atomic_set(&dev->unmap_nr_rnd, 0);
+		INIT_LIST_HEAD(&dev->unmap_rnd_list);
+		INIT_LIST_HEAD(&dev->map_rnd_list);
+
+		atomic_set(&dev->unmap_nr_seq, 0);
+		INIT_LIST_HEAD(&dev->unmap_seq_list);
+		INIT_LIST_HEAD(&dev->map_seq_list);
+	}
+
+	if (!zmd->nr_zones) {
+		DMERR("(%s): No zones found", zmd->devname);
+		return -ENXIO;
+	}
+	xa_init(&zmd->zones);
+
+	DMDEBUG("(%s): Using %zu B for zone information",
+		zmd->devname, sizeof(struct dm_zone) * zmd->nr_zones);
+
+	if (zmd->nr_devs > 1) {
+		ret = dmz_emulate_zones(zmd, &zmd->dev[0]);
+		if (ret < 0) {
+			DMDEBUG("(%s): Failed to emulate zones, error %d",
+				zmd->devname, ret);
+			dmz_drop_zones(zmd);
+			return ret;
+		}
+
+		/*
+		 * Primary superblock zone is always at zone 0 when multiple
+		 * drives are present.
+		 */
+		zmd->sb[0].zone = dmz_get(zmd, 0);
+
+		for (i = 1; i < zmd->nr_devs; i++) {
+			zoned_dev = &zmd->dev[i];
+
+			ret = blkdev_report_zones(zoned_dev->bdev, 0,
+						  BLK_ALL_ZONES,
+						  dmz_init_zone, zoned_dev);
+			if (ret < 0) {
+				DMDEBUG("(%s): Failed to report zones, error %d",
+					zmd->devname, ret);
+				dmz_drop_zones(zmd);
+				return ret;
+			}
+		}
+		return 0;
	}
 
	/*
-	 * Get zone information and initialize zone descriptors.
-	 * At the same time, determine where the super block
-	 * should be: first block of the first randomly writable
-	 * zone.
+	 * Get zone information and initialize zone descriptors. At the same
+	 * time, determine where the super block should be: first block of the
+	 * first randomly writable zone.
	 */
-	zone = zmd->zones;
-	while (sector < dev->capacity) {
-		/* Get zone information */
-		nr_blkz = DMZ_REPORT_NR_ZONES;
-		ret = blkdev_report_zones(dev->bdev, sector, blkz,
-					  &nr_blkz, GFP_KERNEL);
-		if (ret) {
-			dmz_dev_err(dev, "Report zones failed %d", ret);
-			goto out;
-		}
-
-		if (!nr_blkz)
-			break;
-
-		/* Process report */
-		for (i = 0; i < nr_blkz; i++) {
-			ret = dmz_init_zone(zmd, zone, &blkz[i]);
-			if (ret)
-				goto out;
-			sector += dev->zone_nr_sectors;
-			zone++;
-		}
-	}
-
-	/* The entire zone configuration of the disk should now be known */
-	if (sector < dev->capacity) {
-		dmz_dev_err(dev, "Failed to get correct zone information");
-		ret = -ENXIO;
-	}
-out:
-	kfree(blkz);
-	if (ret)
+	ret = blkdev_report_zones(zoned_dev->bdev, 0, BLK_ALL_ZONES,
+				  dmz_init_zone, zoned_dev);
+	if (ret < 0) {
+		DMDEBUG("(%s): Failed to report zones, error %d",
+			zmd->devname, ret);
		dmz_drop_zones(zmd);
+		return ret;
+	}
 
-	return ret;
+	return 0;
+}
+
+static int dmz_update_zone_cb(struct blk_zone *blkz, unsigned int idx,
+			      void *data)
+{
+	struct dm_zone *zone = data;
+
+	clear_bit(DMZ_OFFLINE, &zone->flags);
+	clear_bit(DMZ_READ_ONLY, &zone->flags);
+	if (blkz->cond == BLK_ZONE_COND_OFFLINE)
+		set_bit(DMZ_OFFLINE, &zone->flags);
+	else if (blkz->cond == BLK_ZONE_COND_READONLY)
+		set_bit(DMZ_READ_ONLY, &zone->flags);
+
+	if (dmz_is_seq(zone))
+		zone->wp_block = dmz_sect2blk(blkz->wp - blkz->start);
+	else
+		zone->wp_block = 0;
+	return 0;
 }
 
 /*
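Both dmz_init_zone() and dmz_update_zone_cb() now follow the report_zones_cb contract of the kernel's zone-report iterator: the callback runs once per reported zone, any non-zero return aborts the walk, and blkdev_report_zones() itself returns the number of zones processed or a negative errno. A minimal sketch (the demo names are illustrative, not from the patch):

#include <linux/blkdev.h>

static int demo_count_cb(struct blk_zone *blkz, unsigned int idx, void *data)
{
	unsigned int *nr = data;

	(*nr)++;
	return 0;		/* non-zero would stop the iteration */
}

static int demo_count_zones(struct block_device *bdev, unsigned int *nr)
{
	int ret = blkdev_report_zones(bdev, 0, BLK_ALL_ZONES,
				      demo_count_cb, nr);

	return ret < 0 ? ret : 0;	/* ret >= 0: zones reported */
}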
@@ -1234,33 +1598,32 @@
 */
 static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
 {
-	unsigned int nr_blkz = 1;
-	struct blk_zone blkz;
+	struct dmz_dev *dev = zone->dev;
+	unsigned int noio_flag;
	int ret;
 
-	/* Get zone information from disk */
-	ret = blkdev_report_zones(zmd->dev->bdev, dmz_start_sect(zmd, zone),
-				  &blkz, &nr_blkz, GFP_NOIO);
-	if (!nr_blkz)
+	if (dev->flags & DMZ_BDEV_REGULAR)
+		return 0;
+
+	/*
+	 * Get zone information from disk. Since blkdev_report_zones() uses
+	 * GFP_KERNEL by default for memory allocations, set the per-task
+	 * PF_MEMALLOC_NOIO flag so that all allocations are done as if
+	 * GFP_NOIO was specified.
+	 */
+	noio_flag = memalloc_noio_save();
+	ret = blkdev_report_zones(dev->bdev, dmz_start_sect(zmd, zone), 1,
+				  dmz_update_zone_cb, zone);
+	memalloc_noio_restore(noio_flag);
+
+	if (ret == 0)
		ret = -EIO;
-	if (ret) {
-		dmz_dev_err(zmd->dev, "Get zone %u report failed",
-			    dmz_id(zmd, zone));
-		dmz_check_bdev(zmd->dev);
+	if (ret < 0) {
+		dmz_dev_err(dev, "Get zone %u report failed",
+			    zone->id);
+		dmz_check_bdev(dev);
		return ret;
	}
-
-	clear_bit(DMZ_OFFLINE, &zone->flags);
-	clear_bit(DMZ_READ_ONLY, &zone->flags);
-	if (blkz.cond == BLK_ZONE_COND_OFFLINE)
-		set_bit(DMZ_OFFLINE, &zone->flags);
-	else if (blkz.cond == BLK_ZONE_COND_READONLY)
-		set_bit(DMZ_READ_ONLY, &zone->flags);
-
-	if (dmz_is_seq(zone))
-		zone->wp_block = dmz_sect2blk(blkz.wp - blkz.start);
-	else
-		zone->wp_block = 0;
 
	return 0;
 }
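This is also what the new #include <linux/sched/mm.h> at the top of the file is for. A memalloc_noio scope turns every allocation in the window into an implicit GFP_NOIO one, so the zone report issued from the I/O path cannot recurse into the block layer. A sketch of the pattern (the wrapper name is illustrative):

#include <linux/sched/mm.h>

/* Run fn() with PF_MEMALLOC_NOIO set, as dmz_update_zone() does above. */
static int demo_call_noio(int (*fn)(void *arg), void *arg)
{
	unsigned int noio_flag = memalloc_noio_save();
	int ret = fn(arg);

	memalloc_noio_restore(noio_flag);
	return ret;
}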
@@ -1272,6 +1635,7 @@
 static int dmz_handle_seq_write_err(struct dmz_metadata *zmd,
				    struct dm_zone *zone)
 {
+	struct dmz_dev *dev = zone->dev;
	unsigned int wp = 0;
	int ret;
 
@@ -1280,8 +1644,8 @@
	if (ret)
		return ret;
 
-	dmz_dev_warn(zmd->dev, "Processing zone %u write error (zone wp %u/%u)",
-		     dmz_id(zmd, zone), zone->wp_block, wp);
+	dmz_dev_warn(dev, "Processing zone %u write error (zone wp %u/%u)",
+		     zone->id, zone->wp_block, wp);
 
	if (zone->wp_block < wp) {
		dmz_invalidate_blocks(zmd, zone, zone->wp_block,
@@ -1289,11 +1653,6 @@
	}
 
	return 0;
-}
-
-static struct dm_zone *dmz_get(struct dmz_metadata *zmd, unsigned int zone_id)
-{
-	return &zmd->zones[zone_id];
 }
 
 /*
@@ -1313,14 +1672,14 @@
		return 0;
 
	if (!dmz_is_empty(zone) || dmz_seq_write_err(zone)) {
-		struct dmz_dev *dev = zmd->dev;
+		struct dmz_dev *dev = zone->dev;
 
-		ret = blkdev_reset_zones(dev->bdev,
-					 dmz_start_sect(zmd, zone),
-					 dev->zone_nr_sectors, GFP_NOIO);
+		ret = blkdev_zone_mgmt(dev->bdev, REQ_OP_ZONE_RESET,
+				       dmz_start_sect(zmd, zone),
+				       zmd->zone_nr_sectors, GFP_NOIO);
		if (ret) {
			dmz_dev_err(dev, "Reset zone %u failed %d",
-				    dmz_id(zmd, zone), ret);
+				    zone->id, ret);
			return ret;
		}
	}
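blkdev_reset_zones() is replaced here by the generic zone-management entry point: one request opcode plus a zone-aligned sector range. A sketch of the call the patch switches to (signature as of the kernels this patch targets):

#include <linux/blkdev.h>

/* Sketch: reset a single zone via the generic zone management op. */
static int demo_reset_zone(struct block_device *bdev, sector_t zone_start,
			   sector_t zone_nr_sectors)
{
	return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, zone_start,
				zone_nr_sectors, GFP_NOIO);
}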
@@ -1339,7 +1698,6 @@
 */
 static int dmz_load_mapping(struct dmz_metadata *zmd)
 {
-	struct dmz_dev *dev = zmd->dev;
	struct dm_zone *dzone, *bzone;
	struct dmz_mblock *dmap_mblk = NULL;
	struct dmz_map *dmap;
@@ -1371,36 +1729,48 @@
		if (dzone_id == DMZ_MAP_UNMAPPED)
			goto next;
 
-		if (dzone_id >= dev->nr_zones) {
-			dmz_dev_err(dev, "Chunk %u mapping: invalid data zone ID %u",
+		if (dzone_id >= zmd->nr_zones) {
+			dmz_zmd_err(zmd, "Chunk %u mapping: invalid data zone ID %u",
				    chunk, dzone_id);
			return -EIO;
		}
 
		dzone = dmz_get(zmd, dzone_id);
+		if (!dzone) {
+			dmz_zmd_err(zmd, "Chunk %u mapping: data zone %u not present",
+				    chunk, dzone_id);
+			return -EIO;
+		}
		set_bit(DMZ_DATA, &dzone->flags);
		dzone->chunk = chunk;
		dmz_get_zone_weight(zmd, dzone);
 
-		if (dmz_is_rnd(dzone))
-			list_add_tail(&dzone->link, &zmd->map_rnd_list);
+		if (dmz_is_cache(dzone))
+			list_add_tail(&dzone->link, &zmd->map_cache_list);
+		else if (dmz_is_rnd(dzone))
+			list_add_tail(&dzone->link, &dzone->dev->map_rnd_list);
		else
-			list_add_tail(&dzone->link, &zmd->map_seq_list);
+			list_add_tail(&dzone->link, &dzone->dev->map_seq_list);
 
		/* Check buffer zone */
		bzone_id = le32_to_cpu(dmap[e].bzone_id);
		if (bzone_id == DMZ_MAP_UNMAPPED)
			goto next;
 
-		if (bzone_id >= dev->nr_zones) {
-			dmz_dev_err(dev, "Chunk %u mapping: invalid buffer zone ID %u",
+		if (bzone_id >= zmd->nr_zones) {
+			dmz_zmd_err(zmd, "Chunk %u mapping: invalid buffer zone ID %u",
				    chunk, bzone_id);
			return -EIO;
		}
 
		bzone = dmz_get(zmd, bzone_id);
-		if (!dmz_is_rnd(bzone)) {
-			dmz_dev_err(dev, "Chunk %u mapping: invalid buffer zone %u",
+		if (!bzone) {
+			dmz_zmd_err(zmd, "Chunk %u mapping: buffer zone %u not present",
+				    chunk, bzone_id);
+			return -EIO;
+		}
+		if (!dmz_is_rnd(bzone) && !dmz_is_cache(bzone)) {
+			dmz_zmd_err(zmd, "Chunk %u mapping: invalid buffer zone %u",
				    chunk, bzone_id);
			return -EIO;
		}
@@ -1411,7 +1781,10 @@
		bzone->bzone = dzone;
		dzone->bzone = bzone;
		dmz_get_zone_weight(zmd, bzone);
-		list_add_tail(&bzone->link, &zmd->map_rnd_list);
+		if (dmz_is_cache(bzone))
+			list_add_tail(&bzone->link, &zmd->map_cache_list);
+		else
+			list_add_tail(&bzone->link, &bzone->dev->map_rnd_list);
 next:
		chunk++;
		e++;
@@ -1424,15 +1797,21 @@
	 * fully initialized. All remaining zones are unmapped data
	 * zones. Finish initializing those here.
	 */
-	for (i = 0; i < dev->nr_zones; i++) {
+	for (i = 0; i < zmd->nr_zones; i++) {
		dzone = dmz_get(zmd, i);
+		if (!dzone)
+			continue;
		if (dmz_is_meta(dzone))
			continue;
+		if (dmz_is_offline(dzone))
+			continue;
 
-		if (dmz_is_rnd(dzone))
-			zmd->nr_rnd++;
+		if (dmz_is_cache(dzone))
+			zmd->nr_cache++;
+		else if (dmz_is_rnd(dzone))
+			dzone->dev->nr_rnd++;
		else
-			zmd->nr_seq++;
+			dzone->dev->nr_seq++;
 
		if (dmz_is_data(dzone)) {
			/* Already initialized */
@@ -1442,16 +1821,22 @@
		/* Unmapped data zone */
		set_bit(DMZ_DATA, &dzone->flags);
		dzone->chunk = DMZ_MAP_UNMAPPED;
-		if (dmz_is_rnd(dzone)) {
-			list_add_tail(&dzone->link, &zmd->unmap_rnd_list);
-			atomic_inc(&zmd->unmap_nr_rnd);
+		if (dmz_is_cache(dzone)) {
+			list_add_tail(&dzone->link, &zmd->unmap_cache_list);
+			atomic_inc(&zmd->unmap_nr_cache);
+		} else if (dmz_is_rnd(dzone)) {
+			list_add_tail(&dzone->link,
+				      &dzone->dev->unmap_rnd_list);
+			atomic_inc(&dzone->dev->unmap_nr_rnd);
		} else if (atomic_read(&zmd->nr_reserved_seq_zones) < zmd->nr_reserved_seq) {
			list_add_tail(&dzone->link, &zmd->reserved_seq_zones_list);
+			set_bit(DMZ_RESERVED, &dzone->flags);
			atomic_inc(&zmd->nr_reserved_seq_zones);
-			zmd->nr_seq--;
+			dzone->dev->nr_seq--;
		} else {
-			list_add_tail(&dzone->link, &zmd->unmap_seq_list);
-			atomic_inc(&zmd->unmap_nr_seq);
+			list_add_tail(&dzone->link,
+				      &dzone->dev->unmap_seq_list);
+			atomic_inc(&dzone->dev->unmap_nr_seq);
		}
	}
 
@@ -1485,10 +1870,13 @@
	list_del_init(&zone->link);
	if (dmz_is_seq(zone)) {
		/* LRU rotate sequential zone */
-		list_add_tail(&zone->link, &zmd->map_seq_list);
+		list_add_tail(&zone->link, &zone->dev->map_seq_list);
+	} else if (dmz_is_cache(zone)) {
+		/* LRU rotate cache zone */
+		list_add_tail(&zone->link, &zmd->map_cache_list);
	} else {
		/* LRU rotate random zone */
-		list_add_tail(&zone->link, &zmd->map_rnd_list);
+		list_add_tail(&zone->link, &zone->dev->map_rnd_list);
	}
 }
 
@@ -1555,26 +1943,64 @@
 {
	dmz_unlock_map(zmd);
	dmz_unlock_metadata(zmd);
+	set_bit(DMZ_RECLAIM_TERMINATE, &zone->flags);
	wait_on_bit_timeout(&zone->flags, DMZ_RECLAIM, TASK_UNINTERRUPTIBLE, HZ);
+	clear_bit(DMZ_RECLAIM_TERMINATE, &zone->flags);
	dmz_lock_metadata(zmd);
	dmz_lock_map(zmd);
 }
 
 /*
- * Select a random write zone for reclaim.
+ * Select a cache or random write zone for reclaim.
 */
-static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd)
+static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd,
+						    unsigned int idx, bool idle)
 {
	struct dm_zone *dzone = NULL;
-	struct dm_zone *zone;
+	struct dm_zone *zone, *maxw_z = NULL;
+	struct list_head *zone_list;
 
-	if (list_empty(&zmd->map_rnd_list))
-		return ERR_PTR(-EBUSY);
+	/* If we have cache zones select from the cache zone list */
+	if (zmd->nr_cache) {
+		zone_list = &zmd->map_cache_list;
+		/* Try to reclaim random zones, too, when idle */
+		if (idle && list_empty(zone_list))
+			zone_list = &zmd->dev[idx].map_rnd_list;
+	} else
+		zone_list = &zmd->dev[idx].map_rnd_list;
 
-	list_for_each_entry(zone, &zmd->map_rnd_list, link) {
-		if (dmz_is_buf(zone))
+	/*
+	 * Find the buffer zone with the heaviest weight or the first (oldest)
+	 * data zone that can be reclaimed.
+	 */
+	list_for_each_entry(zone, zone_list, link) {
+		if (dmz_is_buf(zone)) {
			dzone = zone->bzone;
-		else
+			if (dmz_is_rnd(dzone) && dzone->dev->dev_idx != idx)
+				continue;
+			if (!maxw_z || maxw_z->weight < dzone->weight)
+				maxw_z = dzone;
+		} else {
+			dzone = zone;
+			if (dmz_lock_zone_reclaim(dzone))
+				return dzone;
+		}
+	}
+
+	if (maxw_z && dmz_lock_zone_reclaim(maxw_z))
+		return maxw_z;
+
+	/*
+	 * If we come here, none of the zones inspected could be locked for
+	 * reclaim. Try again, being more aggressive, that is, find the
+	 * first zone that can be reclaimed regardless of its weight.
+	 */
+	list_for_each_entry(zone, zone_list, link) {
+		if (dmz_is_buf(zone)) {
+			dzone = zone->bzone;
+			if (dmz_is_rnd(dzone) && dzone->dev->dev_idx != idx)
+				continue;
+		} else
			dzone = zone;
		if (dmz_lock_zone_reclaim(dzone))
			return dzone;
@@ -1586,14 +2012,12 @@
 /*
 * Select a buffered sequential zone for reclaim.
 */
-static struct dm_zone *dmz_get_seq_zone_for_reclaim(struct dmz_metadata *zmd)
+static struct dm_zone *dmz_get_seq_zone_for_reclaim(struct dmz_metadata *zmd,
+						    unsigned int idx)
 {
	struct dm_zone *zone;
 
-	if (list_empty(&zmd->map_seq_list))
-		return ERR_PTR(-EBUSY);
-
-	list_for_each_entry(zone, &zmd->map_seq_list, link) {
+	list_for_each_entry(zone, &zmd->dev[idx].map_seq_list, link) {
		if (!zone->bzone)
			continue;
		if (dmz_lock_zone_reclaim(zone))
@@ -1606,9 +2030,10 @@
 /*
 * Select a zone for reclaim.
 */
-struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd)
+struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd,
+					 unsigned int dev_idx, bool idle)
 {
-	struct dm_zone *zone;
+	struct dm_zone *zone = NULL;
 
	/*
	 * Search for a zone candidate to reclaim: 2 cases are possible.
@@ -1620,9 +2045,9 @@
	 */
	dmz_lock_map(zmd);
	if (list_empty(&zmd->reserved_seq_zones_list))
-		zone = dmz_get_seq_zone_for_reclaim(zmd);
-	else
-		zone = dmz_get_rnd_zone_for_reclaim(zmd);
+		zone = dmz_get_seq_zone_for_reclaim(zmd, dev_idx);
+	if (!zone)
+		zone = dmz_get_rnd_zone_for_reclaim(zmd, dev_idx, idle);
	dmz_unlock_map(zmd);
 
	return zone;
@@ -1642,6 +2067,7 @@
	unsigned int dzone_id;
	struct dm_zone *dzone = NULL;
	int ret = 0;
+	int alloc_flags = zmd->nr_cache ? DMZ_ALLOC_CACHE : DMZ_ALLOC_RND;
 
	dmz_lock_map(zmd);
 again:
@@ -1655,10 +2081,10 @@
	if (op != REQ_OP_WRITE)
		goto out;
 
-	/* Alloate a random zone */
-	dzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND);
+	/* Allocate a random zone */
+	dzone = dmz_alloc_zone(zmd, 0, alloc_flags);
	if (!dzone) {
-		if (dmz_bdev_is_dying(zmd->dev)) {
+		if (dmz_dev_is_dying(zmd)) {
			dzone = ERR_PTR(-EIO);
			goto out;
		}
@@ -1671,6 +2097,10 @@
	} else {
		/* The chunk is already mapped: get the mapping zone */
		dzone = dmz_get(zmd, dzone_id);
+		if (!dzone) {
+			dzone = ERR_PTR(-EIO);
+			goto out;
+		}
		if (dzone->chunk != chunk) {
			dzone = ERR_PTR(-EIO);
			goto out;
@@ -1749,6 +2179,7 @@
				     struct dm_zone *dzone)
 {
	struct dm_zone *bzone;
+	int alloc_flags = zmd->nr_cache ? DMZ_ALLOC_CACHE : DMZ_ALLOC_RND;
 
	dmz_lock_map(zmd);
 again:
@@ -1756,10 +2187,10 @@
	if (bzone)
		goto out;
 
-	/* Alloate a random zone */
-	bzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND);
+	/* Allocate a random zone */
+	bzone = dmz_alloc_zone(zmd, 0, alloc_flags);
	if (!bzone) {
-		if (dmz_bdev_is_dying(zmd->dev)) {
+		if (dmz_dev_is_dying(zmd)) {
			bzone = ERR_PTR(-EIO);
			goto out;
		}
@@ -1768,14 +2199,16 @@
	}
 
	/* Update the chunk mapping */
-	dmz_set_chunk_mapping(zmd, dzone->chunk, dmz_id(zmd, dzone),
-			      dmz_id(zmd, bzone));
+	dmz_set_chunk_mapping(zmd, dzone->chunk, dzone->id, bzone->id);
 
	set_bit(DMZ_BUF, &bzone->flags);
	bzone->chunk = dzone->chunk;
	bzone->bzone = dzone;
	dzone->bzone = bzone;
-	list_add_tail(&bzone->link, &zmd->map_rnd_list);
+	if (dmz_is_cache(bzone))
+		list_add_tail(&bzone->link, &zmd->map_cache_list);
+	else
+		list_add_tail(&bzone->link, &bzone->dev->map_rnd_list);
 out:
	dmz_unlock_map(zmd);
 
@@ -1786,46 +2219,75 @@
 * Get an unmapped (free) zone.
 * This must be called with the mapping lock held.
 */
-struct dm_zone *dmz_alloc_zone(struct dmz_metadata *zmd, unsigned long flags)
+struct dm_zone *dmz_alloc_zone(struct dmz_metadata *zmd, unsigned int dev_idx,
+			       unsigned long flags)
 {
	struct list_head *list;
	struct dm_zone *zone;
+	int i;
 
-	if (flags & DMZ_ALLOC_RND)
-		list = &zmd->unmap_rnd_list;
-	else
-		list = &zmd->unmap_seq_list;
+	/* Schedule reclaim to ensure free zones are available */
+	if (!(flags & DMZ_ALLOC_RECLAIM)) {
+		for (i = 0; i < zmd->nr_devs; i++)
+			dmz_schedule_reclaim(zmd->dev[i].reclaim);
+	}
+
+	i = 0;
 again:
+	if (flags & DMZ_ALLOC_CACHE)
+		list = &zmd->unmap_cache_list;
+	else if (flags & DMZ_ALLOC_RND)
+		list = &zmd->dev[dev_idx].unmap_rnd_list;
+	else
+		list = &zmd->dev[dev_idx].unmap_seq_list;
+
	if (list_empty(list)) {
		/*
-		 * No free zone: if this is for reclaim, allow using the
-		 * reserved sequential zones.
+		 * No free zone: return NULL if this is not for reclaim.
		 */
-		if (!(flags & DMZ_ALLOC_RECLAIM) ||
-		    list_empty(&zmd->reserved_seq_zones_list))
+		if (!(flags & DMZ_ALLOC_RECLAIM))
			return NULL;
+		/*
+		 * Try to allocate from other devices
+		 */
+		if (i < zmd->nr_devs) {
+			dev_idx = (dev_idx + 1) % zmd->nr_devs;
+			i++;
+			goto again;
+		}
 
-		zone = list_first_entry(&zmd->reserved_seq_zones_list,
-					struct dm_zone, link);
-		list_del_init(&zone->link);
-		atomic_dec(&zmd->nr_reserved_seq_zones);
+		/*
+		 * Fallback to the reserved sequential zones
+		 */
+		zone = list_first_entry_or_null(&zmd->reserved_seq_zones_list,
+						struct dm_zone, link);
+		if (zone) {
+			list_del_init(&zone->link);
+			atomic_dec(&zmd->nr_reserved_seq_zones);
+		}
		return zone;
	}
 
	zone = list_first_entry(list, struct dm_zone, link);
	list_del_init(&zone->link);
 
-	if (dmz_is_rnd(zone))
-		atomic_dec(&zmd->unmap_nr_rnd);
+	if (dmz_is_cache(zone))
+		atomic_dec(&zmd->unmap_nr_cache);
+	else if (dmz_is_rnd(zone))
+		atomic_dec(&zone->dev->unmap_nr_rnd);
	else
-		atomic_dec(&zmd->unmap_nr_seq);
+		atomic_dec(&zone->dev->unmap_nr_seq);
 
	if (dmz_is_offline(zone)) {
-		dmz_dev_warn(zmd->dev, "Zone %u is offline", dmz_id(zmd, zone));
+		dmz_zmd_warn(zmd, "Zone %u is offline", zone->id);
		zone = NULL;
		goto again;
	}
-
+	if (dmz_is_meta(zone)) {
+		dmz_zmd_warn(zmd, "Zone %u has metadata", zone->id);
+		zone = NULL;
+		goto again;
+	}
	return zone;
 }
 
@@ -1840,16 +2302,18 @@
	dmz_reset_zone(zmd, zone);
 
	/* Return the zone to its type unmap list */
-	if (dmz_is_rnd(zone)) {
-		list_add_tail(&zone->link, &zmd->unmap_rnd_list);
-		atomic_inc(&zmd->unmap_nr_rnd);
-	} else if (atomic_read(&zmd->nr_reserved_seq_zones) <
-		   zmd->nr_reserved_seq) {
+	if (dmz_is_cache(zone)) {
+		list_add_tail(&zone->link, &zmd->unmap_cache_list);
+		atomic_inc(&zmd->unmap_nr_cache);
+	} else if (dmz_is_rnd(zone)) {
+		list_add_tail(&zone->link, &zone->dev->unmap_rnd_list);
+		atomic_inc(&zone->dev->unmap_nr_rnd);
+	} else if (dmz_is_reserved(zone)) {
		list_add_tail(&zone->link, &zmd->reserved_seq_zones_list);
		atomic_inc(&zmd->nr_reserved_seq_zones);
	} else {
-		list_add_tail(&zone->link, &zmd->unmap_seq_list);
-		atomic_inc(&zmd->unmap_nr_seq);
+		list_add_tail(&zone->link, &zone->dev->unmap_seq_list);
+		atomic_inc(&zone->dev->unmap_nr_seq);
	}
 
	wake_up_all(&zmd->free_wq);
@@ -1863,13 +2327,15 @@
			  unsigned int chunk)
 {
	/* Set the chunk mapping */
-	dmz_set_chunk_mapping(zmd, chunk, dmz_id(zmd, dzone),
+	dmz_set_chunk_mapping(zmd, chunk, dzone->id,
			      DMZ_MAP_UNMAPPED);
	dzone->chunk = chunk;
-	if (dmz_is_rnd(dzone))
-		list_add_tail(&dzone->link, &zmd->map_rnd_list);
+	if (dmz_is_cache(dzone))
+		list_add_tail(&dzone->link, &zmd->map_cache_list);
+	else if (dmz_is_rnd(dzone))
+		list_add_tail(&dzone->link, &dzone->dev->map_rnd_list);
	else
-		list_add_tail(&dzone->link, &zmd->map_seq_list);
+		list_add_tail(&dzone->link, &dzone->dev->map_seq_list);
 }
 
 /*
@@ -1891,7 +2357,7 @@
		 * Unmapping the chunk buffer zone: clear only
		 * the chunk buffer mapping
		 */
-		dzone_id = dmz_id(zmd, zone->bzone);
+		dzone_id = zone->bzone->id;
		zone->bzone->bzone = NULL;
		zone->bzone = NULL;
 
@@ -1953,7 +2419,7 @@
			      sector_t chunk_block)
 {
	sector_t bitmap_block = 1 + zmd->nr_map_blocks +
-		(sector_t)(dmz_id(zmd, zone) * zmd->zone_nr_bitmap_blocks) +
+		(sector_t)(zone->id * zmd->zone_nr_bitmap_blocks) +
		(chunk_block >> DMZ_BLOCK_SHIFT_BITS);
 
	return dmz_get_mblock(zmd, bitmap_block);
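With zone->id now global across devices, the bitmap address is pure arithmetic over the metadata layout (super block, then mapping table, then bitmaps). A worked example with assumed geometry: for nr_map_blocks = 8, zone_nr_bitmap_blocks = 2, zone 5, and chunk block 40000, the bitmap block is 1 + 8 + 5*2 + (40000 >> 15) = 20, since one 4 KiB block holds 2^15 bits:

/* Mirrors the computation in dmz_get_bitmap() above; values assumed. */
static sector_t demo_bitmap_block(sector_t nr_map_blocks, unsigned int zone_id,
				  unsigned int zone_nr_bitmap_blocks,
				  sector_t chunk_block)
{
	const unsigned int block_shift_bits = 15;	/* 4 KiB = 2^15 bits */

	return 1 + nr_map_blocks +
		(sector_t)(zone_id * zone_nr_bitmap_blocks) +
		(chunk_block >> block_shift_bits);
	/* demo_bitmap_block(8, 5, 2, 40000) == 20 */
}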
@@ -1969,7 +2435,7 @@
 	sector_t chunk_block = 0;
 
 	/* Get the zones bitmap blocks */
-	while (chunk_block < zmd->dev->zone_nr_blocks) {
+	while (chunk_block < zmd->zone_nr_blocks) {
 		from_mblk = dmz_get_bitmap(zmd, from_zone, chunk_block);
 		if (IS_ERR(from_mblk))
 			return PTR_ERR(from_mblk);
@@ -2004,7 +2470,7 @@
 	int ret;
 
 	/* Get the zones bitmap blocks */
-	while (chunk_block < zmd->dev->zone_nr_blocks) {
+	while (chunk_block < zmd->zone_nr_blocks) {
 		/* Get a valid region from the source zone */
 		ret = dmz_first_valid_block(zmd, from_zone, &chunk_block);
 		if (ret <= 0)
@@ -2028,12 +2494,12 @@
 			sector_t chunk_block, unsigned int nr_blocks)
 {
 	unsigned int count, bit, nr_bits;
-	unsigned int zone_nr_blocks = zmd->dev->zone_nr_blocks;
+	unsigned int zone_nr_blocks = zmd->zone_nr_blocks;
 	struct dmz_mblock *mblk;
 	unsigned int n = 0;
 
-	dmz_dev_debug(zmd->dev, "=> VALIDATE zone %u, block %llu, %u blocks",
-		      dmz_id(zmd, zone), (unsigned long long)chunk_block,
+	dmz_zmd_debug(zmd, "=> VALIDATE zone %u, block %llu, %u blocks",
+		      zone->id, (unsigned long long)chunk_block,
 		      nr_blocks);
 
 	WARN_ON(chunk_block + nr_blocks > zone_nr_blocks);
@@ -2062,8 +2528,8 @@
 	if (likely(zone->weight + n <= zone_nr_blocks))
 		zone->weight += n;
 	else {
-		dmz_dev_warn(zmd->dev, "Zone %u: weight %u should be <= %u",
-			     dmz_id(zmd, zone), zone->weight,
+		dmz_zmd_warn(zmd, "Zone %u: weight %u should be <= %u",
+			     zone->id, zone->weight,
 			     zone_nr_blocks - n);
 		zone->weight = zone_nr_blocks;
 	}
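zone->weight is an incrementally maintained count of the zone's valid blocks; the clamping above only recovers from a counter that drifted. A sketch of re-deriving it from the validity bitmap (hypothetical helper, assuming dmz_block_valid() returns a positive value for a valid block, as its callers in this file expect):

static unsigned int dmz_recount_weight(struct dmz_metadata *zmd,
				       struct dm_zone *zone)
{
	sector_t chunk_block;
	unsigned int n = 0;

	/* Count set bits the slow way instead of trusting the cache */
	for (chunk_block = 0; chunk_block < zmd->zone_nr_blocks; chunk_block++)
		if (dmz_block_valid(zmd, zone, chunk_block) > 0)
			n++;
	return n;
}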
@@ -2112,10 +2578,10 @@
 	struct dmz_mblock *mblk;
 	unsigned int n = 0;
 
-	dmz_dev_debug(zmd->dev, "=> INVALIDATE zone %u, block %llu, %u blocks",
-		      dmz_id(zmd, zone), (u64)chunk_block, nr_blocks);
+	dmz_zmd_debug(zmd, "=> INVALIDATE zone %u, block %llu, %u blocks",
+		      zone->id, (u64)chunk_block, nr_blocks);
 
-	WARN_ON(chunk_block + nr_blocks > zmd->dev->zone_nr_blocks);
+	WARN_ON(chunk_block + nr_blocks > zmd->zone_nr_blocks);
 
 	while (nr_blocks) {
 		/* Get bitmap block */
@@ -2142,8 +2608,8 @@
 	if (zone->weight >= n)
 		zone->weight -= n;
 	else {
-		dmz_dev_warn(zmd->dev, "Zone %u: weight %u should be >= %u",
-			     dmz_id(zmd, zone), zone->weight, n);
+		dmz_zmd_warn(zmd, "Zone %u: weight %u should be >= %u",
+			     zone->id, zone->weight, n);
 		zone->weight = 0;
 	}
 
@@ -2159,7 +2625,7 @@
 	struct dmz_mblock *mblk;
 	int ret;
 
-	WARN_ON(chunk_block >= zmd->dev->zone_nr_blocks);
+	WARN_ON(chunk_block >= zmd->zone_nr_blocks);
 
 	/* Get bitmap block */
 	mblk = dmz_get_bitmap(zmd, zone, chunk_block);
@@ -2189,7 +2655,7 @@
 	unsigned long *bitmap;
 	int n = 0;
 
-	WARN_ON(chunk_block + nr_blocks > zmd->dev->zone_nr_blocks);
+	WARN_ON(chunk_block + nr_blocks > zmd->zone_nr_blocks);
 
 	while (nr_blocks) {
 		/* Get bitmap block */
@@ -2233,7 +2699,7 @@
 
 	/* The block is valid: get the number of valid blocks from block */
 	return dmz_to_next_set_block(zmd, zone, chunk_block,
-				     zmd->dev->zone_nr_blocks - chunk_block, 0);
+				     zmd->zone_nr_blocks - chunk_block, 0);
 }
 
 /*
@@ -2249,7 +2715,7 @@
 	int ret;
 
 	ret = dmz_to_next_set_block(zmd, zone, start_block,
-				    zmd->dev->zone_nr_blocks - start_block, 1);
+				    zmd->zone_nr_blocks - start_block, 1);
 	if (ret < 0)
 		return ret;
 
@@ -2257,7 +2723,7 @@
 	*chunk_block = start_block;
 
 	return dmz_to_next_set_block(zmd, zone, start_block,
-				     zmd->dev->zone_nr_blocks - start_block, 0);
+				     zmd->zone_nr_blocks - start_block, 0);
 }
 
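The function above is a find/measure pair: the first dmz_to_next_set_block() call (last argument 1) skips ahead to the next valid block and stores it in *chunk_block, the second (last argument 0) returns the length of the valid run starting there. A usage sketch mirroring the copy loop earlier in this patch (hypothetical caller):

static int dmz_walk_valid_regions(struct dmz_metadata *zmd,
				  struct dm_zone *zone)
{
	sector_t chunk_block = 0;
	int nr_blocks;

	while (chunk_block < zmd->zone_nr_blocks) {
		/* nr_blocks == 0 means no more valid blocks in the zone */
		nr_blocks = dmz_first_valid_block(zmd, zone, &chunk_block);
		if (nr_blocks <= 0)
			return nr_blocks;
		/* ... process [chunk_block, chunk_block + nr_blocks) ... */
		chunk_block += nr_blocks;
	}
	return 0;
}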
 /*
@@ -2296,7 +2762,7 @@
 	struct dmz_mblock *mblk;
 	sector_t chunk_block = 0;
 	unsigned int bit, nr_bits;
-	unsigned int nr_blocks = zmd->dev->zone_nr_blocks;
+	unsigned int nr_blocks = zmd->zone_nr_blocks;
 	void *bitmap;
 	int n = 0;
 
@@ -2352,7 +2818,7 @@
 	while (!list_empty(&zmd->mblk_dirty_list)) {
 		mblk = list_first_entry(&zmd->mblk_dirty_list,
 					struct dmz_mblock, link);
-		dmz_dev_warn(zmd->dev, "mblock %llu still in dirty list (ref %u)",
+		dmz_zmd_warn(zmd, "mblock %llu still in dirty list (ref %u)",
 			     (u64)mblk->no, mblk->ref);
 		list_del_init(&mblk->link);
 		rb_erase(&mblk->node, &zmd->mblk_rbtree);
@@ -2370,7 +2836,7 @@
 	/* Sanity checks: the mblock rbtree should now be empty */
 	root = &zmd->mblk_rbtree;
 	rbtree_postorder_for_each_entry_safe(mblk, next, root, node) {
-		dmz_dev_warn(zmd->dev, "mblock %llu ref %u still in rbtree",
+		dmz_zmd_warn(zmd, "mblock %llu ref %u still in rbtree",
 			     (u64)mblk->no, mblk->ref);
 		mblk->ref = 0;
 		dmz_free_mblock(zmd, mblk);
@@ -2383,13 +2849,42 @@
 	mutex_destroy(&zmd->map_lock);
 }
 
+static void dmz_print_dev(struct dmz_metadata *zmd, int num)
+{
+	struct dmz_dev *dev = &zmd->dev[num];
+
+	if (bdev_zoned_model(dev->bdev) == BLK_ZONED_NONE)
+		dmz_dev_info(dev, "Regular block device");
+	else
+		dmz_dev_info(dev, "Host-%s zoned block device",
+			     bdev_zoned_model(dev->bdev) == BLK_ZONED_HA ?
+			     "aware" : "managed");
+	if (zmd->sb_version > 1) {
+		sector_t sector_offset =
+			dev->zone_offset << zmd->zone_nr_sectors_shift;
+
+		dmz_dev_info(dev, "  %llu 512-byte logical sectors (offset %llu)",
+			     (u64)dev->capacity, (u64)sector_offset);
+		dmz_dev_info(dev, "  %u zones of %llu 512-byte logical sectors (offset %llu)",
+			     dev->nr_zones, (u64)zmd->zone_nr_sectors,
+			     (u64)dev->zone_offset);
+	} else {
+		dmz_dev_info(dev, "  %llu 512-byte logical sectors",
+			     (u64)dev->capacity);
+		dmz_dev_info(dev, "  %u zones of %llu 512-byte logical sectors",
+			     dev->nr_zones, (u64)zmd->zone_nr_sectors);
+	}
+}
+
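For a metadata v2 setup, dmz_print_dev() emits per-device log lines of roughly this shape; the exact message prefix depends on the dmz_dev_info() definition, and all values below are invented for illustration:

/*
 * Illustrative output for one host-managed device under a v2
 * superblock (prefix and numbers made up):
 *
 *   Host-managed zoned block device
 *     29297520640 512-byte logical sectors (offset 0)
 *     55880 zones of 524288 512-byte logical sectors (offset 0)
 */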
 /*
  * Initialize the zoned metadata.
  */
-int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **metadata)
+int dmz_ctr_metadata(struct dmz_dev *dev, int num_dev,
+		     struct dmz_metadata **metadata,
+		     const char *devname)
 {
 	struct dmz_metadata *zmd;
-	unsigned int i, zid;
+	unsigned int i;
 	struct dm_zone *zone;
 	int ret;
 
@@ -2397,7 +2892,9 @@
 	if (!zmd)
 		return -ENOMEM;
 
+	strcpy(zmd->devname, devname);
 	zmd->dev = dev;
+	zmd->nr_devs = num_dev;
 	zmd->mblk_rbtree = RB_ROOT;
 	init_rwsem(&zmd->mblk_sem);
 	mutex_init(&zmd->mblk_flush_lock);
@@ -2406,13 +2903,10 @@
 	INIT_LIST_HEAD(&zmd->mblk_dirty_list);
 
 	mutex_init(&zmd->map_lock);
-	atomic_set(&zmd->unmap_nr_rnd, 0);
-	INIT_LIST_HEAD(&zmd->unmap_rnd_list);
-	INIT_LIST_HEAD(&zmd->map_rnd_list);
 
-	atomic_set(&zmd->unmap_nr_seq, 0);
-	INIT_LIST_HEAD(&zmd->unmap_seq_list);
-	INIT_LIST_HEAD(&zmd->map_seq_list);
+	atomic_set(&zmd->unmap_nr_cache, 0);
+	INIT_LIST_HEAD(&zmd->unmap_cache_list);
+	INIT_LIST_HEAD(&zmd->map_cache_list);
 
 	atomic_set(&zmd->nr_reserved_seq_zones, 0);
 	INIT_LIST_HEAD(&zmd->reserved_seq_zones_list);
@@ -2430,14 +2924,22 @@
 		goto err;
 
 	/* Set metadata zones starting from sb_zone */
-	zid = dmz_id(zmd, zmd->sb_zone);
 	for (i = 0; i < zmd->nr_meta_zones << 1; i++) {
-		zone = dmz_get(zmd, zid + i);
-		if (!dmz_is_rnd(zone))
+		zone = dmz_get(zmd, zmd->sb[0].zone->id + i);
+		if (!zone) {
+			dmz_zmd_err(zmd,
+				    "metadata zone %u not present", i);
+			ret = -ENXIO;
 			goto err;
+		}
+		if (!dmz_is_rnd(zone) && !dmz_is_cache(zone)) {
+			dmz_zmd_err(zmd,
+				    "metadata zone %d is not random", i);
+			ret = -ENXIO;
+			goto err;
+		}
 		set_bit(DMZ_META, &zone->flags);
 	}
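Unlike the old flat zones array, dmz_get() can now return NULL for a hole in the zone space, which is why the loop above must check it. Given the xarray in struct dmz_metadata, the lookup is presumably a plain xa_load(); a sketch only, since the actual definition falls outside this hunk:

static struct dm_zone *dmz_get(struct dmz_metadata *zmd, unsigned int zone_id)
{
	/* Returns NULL if no zone was inserted at this index */
	return xa_load(&zmd->zones, zone_id);
}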
-
 	/* Load mapping table */
 	ret = dmz_load_mapping(zmd);
 	if (ret)
@@ -2458,34 +2960,38 @@
 	/* Metadata cache shrinker */
 	ret = register_shrinker(&zmd->mblk_shrinker);
 	if (ret) {
-		dmz_dev_err(dev, "Register metadata cache shrinker failed");
+		dmz_zmd_err(zmd, "Register metadata cache shrinker failed");
 		goto err;
 	}
 
-	dmz_dev_info(dev, "Host-%s zoned block device",
-		     bdev_zoned_model(dev->bdev) == BLK_ZONED_HA ?
-		     "aware" : "managed");
-	dmz_dev_info(dev, "  %llu 512-byte logical sectors",
-		     (u64)dev->capacity);
-	dmz_dev_info(dev, "  %u zones of %llu 512-byte logical sectors",
-		     dev->nr_zones, (u64)dev->zone_nr_sectors);
-	dmz_dev_info(dev, "  %u metadata zones",
-		     zmd->nr_meta_zones * 2);
-	dmz_dev_info(dev, "  %u data zones for %u chunks",
-		     zmd->nr_data_zones, zmd->nr_chunks);
-	dmz_dev_info(dev, "  %u random zones (%u unmapped)",
-		     zmd->nr_rnd, atomic_read(&zmd->unmap_nr_rnd));
-	dmz_dev_info(dev, "  %u sequential zones (%u unmapped)",
-		     zmd->nr_seq, atomic_read(&zmd->unmap_nr_seq));
-	dmz_dev_info(dev, "  %u reserved sequential data zones",
-		     zmd->nr_reserved_seq);
+	dmz_zmd_info(zmd, "DM-Zoned metadata version %d", zmd->sb_version);
+	for (i = 0; i < zmd->nr_devs; i++)
+		dmz_print_dev(zmd, i);
 
-	dmz_dev_debug(dev, "Format:");
-	dmz_dev_debug(dev, "%u metadata blocks per set (%u max cache)",
+	dmz_zmd_info(zmd, "  %u zones of %llu 512-byte logical sectors",
+		     zmd->nr_zones, (u64)zmd->zone_nr_sectors);
+	dmz_zmd_debug(zmd, "  %u metadata zones",
+		      zmd->nr_meta_zones * 2);
+	dmz_zmd_debug(zmd, "  %u data zones for %u chunks",
+		      zmd->nr_data_zones, zmd->nr_chunks);
+	dmz_zmd_debug(zmd, "  %u cache zones (%u unmapped)",
+		      zmd->nr_cache, atomic_read(&zmd->unmap_nr_cache));
+	for (i = 0; i < zmd->nr_devs; i++) {
+		dmz_zmd_debug(zmd, "  %u random zones (%u unmapped)",
+			      dmz_nr_rnd_zones(zmd, i),
+			      dmz_nr_unmap_rnd_zones(zmd, i));
+		dmz_zmd_debug(zmd, "  %u sequential zones (%u unmapped)",
+			      dmz_nr_seq_zones(zmd, i),
+			      dmz_nr_unmap_seq_zones(zmd, i));
+	}
+	dmz_zmd_debug(zmd, "  %u reserved sequential data zones",
+		      zmd->nr_reserved_seq);
+	dmz_zmd_debug(zmd, "Format:");
+	dmz_zmd_debug(zmd, "%u metadata blocks per set (%u max cache)",
 		      zmd->nr_meta_blocks, zmd->max_nr_mblks);
-	dmz_dev_debug(dev, "  %u data zone mapping blocks",
+	dmz_zmd_debug(zmd, "  %u data zone mapping blocks",
 		      zmd->nr_map_blocks);
-	dmz_dev_debug(dev, "  %u bitmap blocks",
+	dmz_zmd_debug(zmd, "  %u bitmap blocks",
 		      zmd->nr_bitmap_blocks);
 
 	*metadata = zmd;
25143020 */
25153021 int dmz_resume_metadata(struct dmz_metadata *zmd)
25163022 {
2517
- struct dmz_dev *dev = zmd->dev;
25183023 struct dm_zone *zone;
25193024 sector_t wp_block;
25203025 unsigned int i;
25213026 int ret;
25223027
25233028 /* Check zones */
2524
- for (i = 0; i < dev->nr_zones; i++) {
3029
+ for (i = 0; i < zmd->nr_zones; i++) {
25253030 zone = dmz_get(zmd, i);
25263031 if (!zone) {
2527
- dmz_dev_err(dev, "Unable to get zone %u", i);
3032
+ dmz_zmd_err(zmd, "Unable to get zone %u", i);
25283033 return -EIO;
25293034 }
2530
-
25313035 wp_block = zone->wp_block;
25323036
25333037 ret = dmz_update_zone(zmd, zone);
25343038 if (ret) {
2535
- dmz_dev_err(dev, "Broken zone %u", i);
3039
+ dmz_zmd_err(zmd, "Broken zone %u", i);
25363040 return ret;
25373041 }
25383042
25393043 if (dmz_is_offline(zone)) {
2540
- dmz_dev_warn(dev, "Zone %u is offline", i);
3044
+ dmz_zmd_warn(zmd, "Zone %u is offline", i);
25413045 continue;
25423046 }
25433047
@@ -2545,11 +3049,11 @@
 		if (!dmz_is_seq(zone))
 			zone->wp_block = 0;
 		else if (zone->wp_block != wp_block) {
-			dmz_dev_err(dev, "Zone %u: Invalid wp (%llu / %llu)",
+			dmz_zmd_err(zmd, "Zone %u: Invalid wp (%llu / %llu)",
 				    i, (u64)zone->wp_block, (u64)wp_block);
 			zone->wp_block = wp_block;
 			dmz_invalidate_blocks(zmd, zone, zone->wp_block,
-					      dev->zone_nr_blocks - zone->wp_block);
+					      zmd->zone_nr_blocks - zone->wp_block);
 		}
 	}
 
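To recap the recovery rule above: wp_block saves the write pointer recorded in the metadata, dmz_update_zone() then refreshes zone->wp_block from a fresh zone report, and on a mismatch the metadata value is restored while every block from it to the end of the zone is invalidated, since those blocks can no longer be trusted. The device-side value is derived roughly as follows (a sketch; dmz_sect2blk() is the driver's sector-to-block conversion):

/*
 * Sketch: dmz_update_zone() derives the device-side write pointer as
 * roughly zone->wp_block = dmz_sect2blk(blkz->wp - blkz->start), so a
 * mismatch with the saved wp_block means the zone was written or reset
 * outside the metadata's knowledge.
 */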