| .. | .. |
|---|
| 1 | +// SPDX-License-Identifier: GPL-2.0 |
|---|
| 1 | 2 | /* |
|---|
| 2 | 3 | * Zoned block device handling |
|---|
| 3 | 4 | * |
|---|
| .. | .. |
|---|
| 12 | 13 | #include <linux/module.h> |
|---|
| 13 | 14 | #include <linux/rbtree.h> |
|---|
| 14 | 15 | #include <linux/blkdev.h> |
|---|
| 16 | +#include <linux/blk-mq.h> |
|---|
| 17 | +#include <linux/mm.h> |
|---|
| 18 | +#include <linux/vmalloc.h> |
|---|
| 19 | +#include <linux/sched/mm.h> |
|---|
| 20 | + |
|---|
| 21 | +#include "blk.h" |
|---|
| 22 | + |
|---|
| 23 | +#define ZONE_COND_NAME(name) [BLK_ZONE_COND_##name] = #name |
|---|
| 24 | +static const char *const zone_cond_name[] = { |
|---|
| 25 | + ZONE_COND_NAME(NOT_WP), |
|---|
| 26 | + ZONE_COND_NAME(EMPTY), |
|---|
| 27 | + ZONE_COND_NAME(IMP_OPEN), |
|---|
| 28 | + ZONE_COND_NAME(EXP_OPEN), |
|---|
| 29 | + ZONE_COND_NAME(CLOSED), |
|---|
| 30 | + ZONE_COND_NAME(READONLY), |
|---|
| 31 | + ZONE_COND_NAME(FULL), |
|---|
| 32 | + ZONE_COND_NAME(OFFLINE), |
|---|
| 33 | +}; |
|---|
| 34 | +#undef ZONE_COND_NAME |
|---|
| 35 | + |
|---|
| 36 | +/** |
|---|
| 37 | + * blk_zone_cond_str - Return string XXX in BLK_ZONE_COND_XXX. |
|---|
| 38 | + * @zone_cond: BLK_ZONE_COND_XXX. |
|---|
| 39 | + * |
|---|
| 40 | + * Description: Centralized block layer function to convert BLK_ZONE_COND_XXX |
|---|
| 41 | + * into string format. Useful for debugging and tracing zone conditions. For |
|---|
| 42 | + * invalid BLK_ZONE_COND_XXX it returns string "UNKNOWN". |
|---|
| 43 | + */ |
|---|
| 44 | +const char *blk_zone_cond_str(enum blk_zone_cond zone_cond) |
|---|
| 45 | +{ |
|---|
| 46 | + static const char *zone_cond_str = "UNKNOWN"; |
|---|
| 47 | + |
|---|
| 48 | + if (zone_cond < ARRAY_SIZE(zone_cond_name) && zone_cond_name[zone_cond]) |
|---|
| 49 | + zone_cond_str = zone_cond_name[zone_cond]; |
|---|
| 50 | + |
|---|
| 51 | + return zone_cond_str; |
|---|
| 52 | +} |
|---|
| 53 | +EXPORT_SYMBOL_GPL(blk_zone_cond_str); |
|---|
| 15 | 54 | |
|---|
| 16 | 55 | static inline sector_t blk_zone_start(struct request_queue *q, |
|---|
| 17 | 56 | sector_t sector) |
|---|
| .. | .. |
|---|
| 43 | 82 | } |
|---|
| 44 | 83 | EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock); |
|---|
| 45 | 84 | |
|---|
| 85 | +bool blk_req_zone_write_trylock(struct request *rq) |
|---|
| 86 | +{ |
|---|
| 87 | + unsigned int zno = blk_rq_zone_no(rq); |
|---|
| 88 | + |
|---|
| 89 | + if (test_and_set_bit(zno, rq->q->seq_zones_wlock)) |
|---|
| 90 | + return false; |
|---|
| 91 | + |
|---|
| 92 | + WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED); |
|---|
| 93 | + rq->rq_flags |= RQF_ZONE_WRITE_LOCKED; |
|---|
| 94 | + |
|---|
| 95 | + return true; |
|---|
| 96 | +} |
|---|
| 97 | +EXPORT_SYMBOL_GPL(blk_req_zone_write_trylock); |
|---|
| 98 | + |
|---|
| 46 | 99 | void __blk_req_zone_write_lock(struct request *rq) |
|---|
| 47 | 100 | { |
|---|
| 48 | 101 | if (WARN_ON_ONCE(test_and_set_bit(blk_rq_zone_no(rq), |
|---|
| .. | .. |
|---|
| 63 | 116 | } |
|---|
| 64 | 117 | EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock); |
|---|
| 65 | 118 | |
|---|
| 66 | | -/* |
|---|
| 67 | | - * Check that a zone report belongs to the partition. |
|---|
| 68 | | - * If yes, fix its start sector and write pointer, copy it in the |
|---|
| 69 | | - * zone information array and return true. Return false otherwise. |
|---|
| 119 | +/** |
|---|
| 120 | + * blkdev_nr_zones - Get number of zones |
|---|
| 121 | + * @disk: Target gendisk |
|---|
| 122 | + * |
|---|
| 123 | + * Return the total number of zones of a zoned block device. For a block |
|---|
| 124 | + * device without zone capabilities, the number of zones is always 0. |
|---|
| 70 | 125 | */ |
|---|
| 71 | | -static bool blkdev_report_zone(struct block_device *bdev, |
|---|
| 72 | | - struct blk_zone *rep, |
|---|
| 73 | | - struct blk_zone *zone) |
|---|
| 126 | +unsigned int blkdev_nr_zones(struct gendisk *disk) |
|---|
| 74 | 127 | { |
|---|
| 75 | | - sector_t offset = get_start_sect(bdev); |
|---|
| 128 | + sector_t zone_sectors = blk_queue_zone_sectors(disk->queue); |
|---|
| 76 | 129 | |
|---|
| 77 | | - if (rep->start < offset) |
|---|
| 78 | | - return false; |
|---|
| 79 | | - |
|---|
| 80 | | - rep->start -= offset; |
|---|
| 81 | | - if (rep->start + rep->len > bdev->bd_part->nr_sects) |
|---|
| 82 | | - return false; |
|---|
| 83 | | - |
|---|
| 84 | | - if (rep->type == BLK_ZONE_TYPE_CONVENTIONAL) |
|---|
| 85 | | - rep->wp = rep->start + rep->len; |
|---|
| 86 | | - else |
|---|
| 87 | | - rep->wp -= offset; |
|---|
| 88 | | - memcpy(zone, rep, sizeof(struct blk_zone)); |
|---|
| 89 | | - |
|---|
| 90 | | - return true; |
|---|
| 130 | + if (!blk_queue_is_zoned(disk->queue)) |
|---|
| 131 | + return 0; |
|---|
| 132 | + return (get_capacity(disk) + zone_sectors - 1) >> ilog2(zone_sectors); |
|---|
| 91 | 133 | } |
|---|
| 134 | +EXPORT_SYMBOL_GPL(blkdev_nr_zones); |
|---|
| 92 | 135 | |
|---|
| 93 | 136 | /** |
|---|
| 94 | 137 | * blkdev_report_zones - Get zones information |
|---|
| 95 | 138 | * @bdev: Target block device |
|---|
| 96 | 139 | * @sector: Sector from which to report zones |
|---|
| 97 | | - * @zones: Array of zone structures where to return the zones information |
|---|
| 98 | | - * @nr_zones: Number of zone structures in the zone array |
|---|
| 99 | | - * @gfp_mask: Memory allocation flags (for bio_alloc) |
|---|
| 140 | + * @nr_zones: Maximum number of zones to report |
|---|
| 141 | + * @cb: Callback function called for each reported zone |
|---|
| 142 | + * @data: Private data for the callback |
|---|
| 100 | 143 | * |
|---|
| 101 | 144 | * Description: |
|---|
| 102 | | - * Get zone information starting from the zone containing @sector. |
|---|
| 103 | | - * The number of zone information reported may be less than the number |
|---|
| 104 | | - * requested by @nr_zones. The number of zones actually reported is |
|---|
| 105 | | - * returned in @nr_zones. |
|---|
| 145 | + * Get zone information starting from the zone containing @sector for at most |
|---|
| 146 | + * @nr_zones, and call @cb for each zone reported by the device. |
|---|
| 147 | + * To report all zones in a device starting from @sector, the BLK_ALL_ZONES |
|---|
| 148 | + * constant can be passed to @nr_zones. |
|---|
| 149 | + * Returns the number of zones reported by the device, or a negative errno |
|---|
| 150 | + * value in case of failure. |
|---|
| 151 | + * |
|---|
| 152 | + * Note: The caller must use memalloc_noXX_save/restore() calls to control |
|---|
| 153 | + * memory allocations done within this function. |
|---|
| 106 | 154 | */ |
|---|
| 107 | | -int blkdev_report_zones(struct block_device *bdev, |
|---|
| 108 | | - sector_t sector, |
|---|
| 109 | | - struct blk_zone *zones, |
|---|
| 110 | | - unsigned int *nr_zones, |
|---|
| 111 | | - gfp_t gfp_mask) |
|---|
| 155 | +int blkdev_report_zones(struct block_device *bdev, sector_t sector, |
|---|
| 156 | + unsigned int nr_zones, report_zones_cb cb, void *data) |
|---|
| 112 | 157 | { |
|---|
| 113 | | - struct request_queue *q = bdev_get_queue(bdev); |
|---|
| 114 | | - struct blk_zone_report_hdr *hdr; |
|---|
| 115 | | - unsigned int nrz = *nr_zones; |
|---|
| 116 | | - struct page *page; |
|---|
| 117 | | - unsigned int nr_rep; |
|---|
| 118 | | - size_t rep_bytes; |
|---|
| 119 | | - unsigned int nr_pages; |
|---|
| 120 | | - struct bio *bio; |
|---|
| 121 | | - struct bio_vec *bv; |
|---|
| 122 | | - unsigned int i, n, nz; |
|---|
| 123 | | - unsigned int ofst; |
|---|
| 124 | | - void *addr; |
|---|
| 125 | | - int ret; |
|---|
| 158 | + struct gendisk *disk = bdev->bd_disk; |
|---|
| 159 | + sector_t capacity = get_capacity(disk); |
|---|
| 126 | 160 | |
|---|
| 127 | | - if (!q) |
|---|
| 128 | | - return -ENXIO; |
|---|
| 129 | | - |
|---|
| 130 | | - if (!blk_queue_is_zoned(q)) |
|---|
| 161 | + if (!blk_queue_is_zoned(bdev_get_queue(bdev)) || |
|---|
| 162 | + WARN_ON_ONCE(!disk->fops->report_zones)) |
|---|
| 131 | 163 | return -EOPNOTSUPP; |
|---|
| 132 | 164 | |
|---|
| 133 | | - if (!nrz) |
|---|
| 165 | + if (!nr_zones || sector >= capacity) |
|---|
| 134 | 166 | return 0; |
|---|
| 135 | 167 | |
|---|
| 136 | | - if (sector > bdev->bd_part->nr_sects) { |
|---|
| 137 | | - *nr_zones = 0; |
|---|
| 138 | | - return 0; |
|---|
| 139 | | - } |
|---|
| 140 | | - |
|---|
| 141 | | - /* |
|---|
| 142 | | - * The zone report has a header. So make room for it in the |
|---|
| 143 | | - * payload. Also make sure that the report fits in a single BIO |
|---|
| 144 | | - * that will not be split down the stack. |
|---|
| 145 | | - */ |
|---|
| 146 | | - rep_bytes = sizeof(struct blk_zone_report_hdr) + |
|---|
| 147 | | - sizeof(struct blk_zone) * nrz; |
|---|
| 148 | | - rep_bytes = (rep_bytes + PAGE_SIZE - 1) & PAGE_MASK; |
|---|
| 149 | | - if (rep_bytes > (queue_max_sectors(q) << 9)) |
|---|
| 150 | | - rep_bytes = queue_max_sectors(q) << 9; |
|---|
| 151 | | - |
|---|
| 152 | | - nr_pages = min_t(unsigned int, BIO_MAX_PAGES, |
|---|
| 153 | | - rep_bytes >> PAGE_SHIFT); |
|---|
| 154 | | - nr_pages = min_t(unsigned int, nr_pages, |
|---|
| 155 | | - queue_max_segments(q)); |
|---|
| 156 | | - |
|---|
| 157 | | - bio = bio_alloc(gfp_mask, nr_pages); |
|---|
| 158 | | - if (!bio) |
|---|
| 159 | | - return -ENOMEM; |
|---|
| 160 | | - |
|---|
| 161 | | - bio_set_dev(bio, bdev); |
|---|
| 162 | | - bio->bi_iter.bi_sector = blk_zone_start(q, sector); |
|---|
| 163 | | - bio_set_op_attrs(bio, REQ_OP_ZONE_REPORT, 0); |
|---|
| 164 | | - |
|---|
| 165 | | - for (i = 0; i < nr_pages; i++) { |
|---|
| 166 | | - page = alloc_page(gfp_mask); |
|---|
| 167 | | - if (!page) { |
|---|
| 168 | | - ret = -ENOMEM; |
|---|
| 169 | | - goto out; |
|---|
| 170 | | - } |
|---|
| 171 | | - if (!bio_add_page(bio, page, PAGE_SIZE, 0)) { |
|---|
| 172 | | - __free_page(page); |
|---|
| 173 | | - break; |
|---|
| 174 | | - } |
|---|
| 175 | | - } |
|---|
| 176 | | - |
|---|
| 177 | | - if (i == 0) |
|---|
| 178 | | - ret = -ENOMEM; |
|---|
| 179 | | - else |
|---|
| 180 | | - ret = submit_bio_wait(bio); |
|---|
| 181 | | - if (ret) |
|---|
| 182 | | - goto out; |
|---|
| 183 | | - |
|---|
| 184 | | - /* |
|---|
| 185 | | - * Process the report result: skip the header and go through the |
|---|
| 186 | | - * reported zones to fixup and fixup the zone information for |
|---|
| 187 | | - * partitions. At the same time, return the zone information into |
|---|
| 188 | | - * the zone array. |
|---|
| 189 | | - */ |
|---|
| 190 | | - n = 0; |
|---|
| 191 | | - nz = 0; |
|---|
| 192 | | - nr_rep = 0; |
|---|
| 193 | | - bio_for_each_segment_all(bv, bio, i) { |
|---|
| 194 | | - |
|---|
| 195 | | - if (!bv->bv_page) |
|---|
| 196 | | - break; |
|---|
| 197 | | - |
|---|
| 198 | | - addr = kmap_atomic(bv->bv_page); |
|---|
| 199 | | - |
|---|
| 200 | | - /* Get header in the first page */ |
|---|
| 201 | | - ofst = 0; |
|---|
| 202 | | - if (!nr_rep) { |
|---|
| 203 | | - hdr = addr; |
|---|
| 204 | | - nr_rep = hdr->nr_zones; |
|---|
| 205 | | - ofst = sizeof(struct blk_zone_report_hdr); |
|---|
| 206 | | - } |
|---|
| 207 | | - |
|---|
| 208 | | - /* Fixup and report zones */ |
|---|
| 209 | | - while (ofst < bv->bv_len && |
|---|
| 210 | | - n < nr_rep && nz < nrz) { |
|---|
| 211 | | - if (blkdev_report_zone(bdev, addr + ofst, &zones[nz])) |
|---|
| 212 | | - nz++; |
|---|
| 213 | | - ofst += sizeof(struct blk_zone); |
|---|
| 214 | | - n++; |
|---|
| 215 | | - } |
|---|
| 216 | | - |
|---|
| 217 | | - kunmap_atomic(addr); |
|---|
| 218 | | - |
|---|
| 219 | | - if (n >= nr_rep || nz >= nrz) |
|---|
| 220 | | - break; |
|---|
| 221 | | - |
|---|
| 222 | | - } |
|---|
| 223 | | - |
|---|
| 224 | | - *nr_zones = nz; |
|---|
| 225 | | -out: |
|---|
| 226 | | - bio_for_each_segment_all(bv, bio, i) |
|---|
| 227 | | - __free_page(bv->bv_page); |
|---|
| 228 | | - bio_put(bio); |
|---|
| 229 | | - |
|---|
| 230 | | - return ret; |
|---|
| 168 | + return disk->fops->report_zones(disk, sector, nr_zones, cb, data); |
|---|
| 231 | 169 | } |
|---|
| 232 | 170 | EXPORT_SYMBOL_GPL(blkdev_report_zones); |
|---|
| 233 | 171 | |
|---|
| 172 | +static inline bool blkdev_allow_reset_all_zones(struct block_device *bdev, |
|---|
| 173 | + sector_t sector, |
|---|
| 174 | + sector_t nr_sectors) |
|---|
| 175 | +{ |
|---|
| 176 | + if (!blk_queue_zone_resetall(bdev_get_queue(bdev))) |
|---|
| 177 | + return false; |
|---|
| 178 | + |
|---|
| 179 | + /* |
|---|
| 180 | + * REQ_OP_ZONE_RESET_ALL can be executed only if the number of sectors |
|---|
| 181 | + * of the applicable zone range is the entire disk. |
|---|
| 182 | + */ |
|---|
| 183 | + return !sector && nr_sectors == get_capacity(bdev->bd_disk); |
|---|
| 184 | +} |
|---|
| 185 | + |
|---|
| 234 | 186 | /** |
|---|
| 235 | | - * blkdev_reset_zones - Reset zones write pointer |
|---|
| 187 | + * blkdev_zone_mgmt - Execute a zone management operation on a range of zones |
|---|
| 236 | 188 | * @bdev: Target block device |
|---|
| 237 | | - * @sector: Start sector of the first zone to reset |
|---|
| 238 | | - * @nr_sectors: Number of sectors, at least the length of one zone |
|---|
| 189 | + * @op: Operation to be performed on the zones |
|---|
| 190 | + * @sector: Start sector of the first zone to operate on |
|---|
| 191 | + * @nr_sectors: Number of sectors, should be at least the length of one zone and |
|---|
| 192 | + * must be zone size aligned. |
|---|
| 239 | 193 | * @gfp_mask: Memory allocation flags (for bio_alloc) |
|---|
| 240 | 194 | * |
|---|
| 241 | 195 | * Description: |
|---|
| 242 | | - * Reset the write pointer of the zones contained in the range |
|---|
| 196 | + * Perform the specified operation on the range of zones specified by |
|---|
| 243 | 197 | * @sector..@sector+@nr_sectors. Specifying the entire disk sector range |
|---|
| 244 | 198 | * is valid, but the specified range should not contain conventional zones. |
|---|
| 199 | + * The operation to execute on each zone can be a zone reset, open, close |
|---|
| 200 | + * or finish request. |
|---|
| 245 | 201 | */ |
|---|
| 246 | | -int blkdev_reset_zones(struct block_device *bdev, |
|---|
| 247 | | - sector_t sector, sector_t nr_sectors, |
|---|
| 248 | | - gfp_t gfp_mask) |
|---|
| 202 | +int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op, |
|---|
| 203 | + sector_t sector, sector_t nr_sectors, |
|---|
| 204 | + gfp_t gfp_mask) |
|---|
| 249 | 205 | { |
|---|
| 250 | 206 | struct request_queue *q = bdev_get_queue(bdev); |
|---|
| 251 | | - sector_t zone_sectors; |
|---|
| 207 | + sector_t zone_sectors = blk_queue_zone_sectors(q); |
|---|
| 208 | + sector_t capacity = get_capacity(bdev->bd_disk); |
|---|
| 252 | 209 | sector_t end_sector = sector + nr_sectors; |
|---|
| 253 | | - struct bio *bio; |
|---|
| 210 | + struct bio *bio = NULL; |
|---|
| 254 | 211 | int ret; |
|---|
| 255 | | - |
|---|
| 256 | | - if (!q) |
|---|
| 257 | | - return -ENXIO; |
|---|
| 258 | 212 | |
|---|
| 259 | 213 | if (!blk_queue_is_zoned(q)) |
|---|
| 260 | 214 | return -EOPNOTSUPP; |
|---|
| 261 | 215 | |
|---|
| 262 | | - if (end_sector > bdev->bd_part->nr_sects) |
|---|
| 216 | + if (bdev_read_only(bdev)) |
|---|
| 217 | + return -EPERM; |
|---|
| 218 | + |
|---|
| 219 | + if (!op_is_zone_mgmt(op)) |
|---|
| 220 | + return -EOPNOTSUPP; |
|---|
| 221 | + |
|---|
| 222 | + if (end_sector <= sector || end_sector > capacity) |
|---|
| 263 | 223 | /* Out of range */ |
|---|
| 264 | 224 | return -EINVAL; |
|---|
| 265 | 225 | |
|---|
| 266 | 226 | /* Check alignment (handle a possibly smaller last zone) */ |
|---|
| 267 | | - zone_sectors = blk_queue_zone_sectors(q); |
|---|
| 268 | 227 | if (sector & (zone_sectors - 1)) |
|---|
| 269 | 228 | return -EINVAL; |
|---|
| 270 | 229 | |
|---|
| 271 | | - if ((nr_sectors & (zone_sectors - 1)) && |
|---|
| 272 | | - end_sector != bdev->bd_part->nr_sects) |
|---|
| 230 | + if ((nr_sectors & (zone_sectors - 1)) && end_sector != capacity) |
|---|
| 273 | 231 | return -EINVAL; |
|---|
| 274 | 232 | |
|---|
| 275 | 233 | while (sector < end_sector) { |
|---|
| 276 | | - |
|---|
| 277 | | - bio = bio_alloc(gfp_mask, 0); |
|---|
| 278 | | - bio->bi_iter.bi_sector = sector; |
|---|
| 234 | + bio = blk_next_bio(bio, 0, gfp_mask); |
|---|
| 279 | 235 | bio_set_dev(bio, bdev); |
|---|
| 280 | | - bio_set_op_attrs(bio, REQ_OP_ZONE_RESET, 0); |
|---|
| 281 | 236 | |
|---|
| 282 | | - ret = submit_bio_wait(bio); |
|---|
| 283 | | - bio_put(bio); |
|---|
| 237 | + /* |
|---|
| 238 | + * Special case for the zone reset operation that resets all |
|---|
| 239 | + * zones; this is useful for applications like mkfs. |
|---|
| 240 | + */ |
|---|
| 241 | + if (op == REQ_OP_ZONE_RESET && |
|---|
| 242 | + blkdev_allow_reset_all_zones(bdev, sector, nr_sectors)) { |
|---|
| 243 | + bio->bi_opf = REQ_OP_ZONE_RESET_ALL | REQ_SYNC; |
|---|
| 244 | + break; |
|---|
| 245 | + } |
|---|
| 284 | 246 | |
|---|
| 285 | | - if (ret) |
|---|
| 286 | | - return ret; |
|---|
| 287 | | - |
|---|
| 247 | + bio->bi_opf = op | REQ_SYNC; |
|---|
| 248 | + bio->bi_iter.bi_sector = sector; |
|---|
| 288 | 249 | sector += zone_sectors; |
|---|
| 289 | 250 | |
|---|
| 290 | 251 | /* This may take a while, so be nice to others */ |
|---|
| 291 | 252 | cond_resched(); |
|---|
| 292 | | - |
|---|
| 293 | 253 | } |
|---|
| 294 | 254 | |
|---|
| 255 | + ret = submit_bio_wait(bio); |
|---|
| 256 | + bio_put(bio); |
|---|
| 257 | + |
|---|
| 258 | + return ret; |
|---|
| 259 | +} |
|---|
| 260 | +EXPORT_SYMBOL_GPL(blkdev_zone_mgmt); |
|---|
| 261 | + |
|---|
| 262 | +struct zone_report_args { |
|---|
| 263 | + struct blk_zone __user *zones; |
|---|
| 264 | +}; |
|---|
| 265 | + |
|---|
| 266 | +static int blkdev_copy_zone_to_user(struct blk_zone *zone, unsigned int idx, |
|---|
| 267 | + void *data) |
|---|
| 268 | +{ |
|---|
| 269 | + struct zone_report_args *args = data; |
|---|
| 270 | + |
|---|
| 271 | + if (copy_to_user(&args->zones[idx], zone, sizeof(struct blk_zone))) |
|---|
| 272 | + return -EFAULT; |
|---|
| 295 | 273 | return 0; |
|---|
| 296 | 274 | } |
|---|
| 297 | | -EXPORT_SYMBOL_GPL(blkdev_reset_zones); |
|---|
| 298 | 275 | |
|---|
| 299 | 276 | /* |
|---|
| 300 | 277 | * BLKREPORTZONE ioctl processing. |
|---|
| .. | .. |
|---|
| 304 | 281 | unsigned int cmd, unsigned long arg) |
|---|
| 305 | 282 | { |
|---|
| 306 | 283 | void __user *argp = (void __user *)arg; |
|---|
| 284 | + struct zone_report_args args; |
|---|
| 307 | 285 | struct request_queue *q; |
|---|
| 308 | 286 | struct blk_zone_report rep; |
|---|
| 309 | | - struct blk_zone *zones; |
|---|
| 310 | 287 | int ret; |
|---|
| 311 | 288 | |
|---|
| 312 | 289 | if (!argp) |
|---|
| .. | .. |
|---|
| 325 | 302 | if (!rep.nr_zones) |
|---|
| 326 | 303 | return -EINVAL; |
|---|
| 327 | 304 | |
|---|
| 328 | | - if (rep.nr_zones > INT_MAX / sizeof(struct blk_zone)) |
|---|
| 329 | | - return -ERANGE; |
|---|
| 305 | + args.zones = argp + sizeof(struct blk_zone_report); |
|---|
| 306 | + ret = blkdev_report_zones(bdev, rep.sector, rep.nr_zones, |
|---|
| 307 | + blkdev_copy_zone_to_user, &args); |
|---|
| 308 | + if (ret < 0) |
|---|
| 309 | + return ret; |
|---|
| 330 | 310 | |
|---|
| 331 | | - zones = kvmalloc_array(rep.nr_zones, sizeof(struct blk_zone), |
|---|
| 332 | | - GFP_KERNEL | __GFP_ZERO); |
|---|
| 333 | | - if (!zones) |
|---|
| 334 | | - return -ENOMEM; |
|---|
| 311 | + rep.nr_zones = ret; |
|---|
| 312 | + rep.flags = BLK_ZONE_REP_CAPACITY; |
|---|
| 313 | + if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report))) |
|---|
| 314 | + return -EFAULT; |
|---|
| 315 | + return 0; |
|---|
| 316 | +} |
|---|
| 335 | 317 | |
|---|
| 336 | | - ret = blkdev_report_zones(bdev, rep.sector, |
|---|
| 337 | | - zones, &rep.nr_zones, |
|---|
| 338 | | - GFP_KERNEL); |
|---|
| 339 | | - if (ret) |
|---|
| 340 | | - goto out; |
|---|
| 318 | +static int blkdev_truncate_zone_range(struct block_device *bdev, fmode_t mode, |
|---|
| 319 | + const struct blk_zone_range *zrange) |
|---|
| 320 | +{ |
|---|
| 321 | + loff_t start, end; |
|---|
| 341 | 322 | |
|---|
| 342 | | - if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report))) { |
|---|
| 343 | | - ret = -EFAULT; |
|---|
| 344 | | - goto out; |
|---|
| 345 | | - } |
|---|
| 323 | + if (zrange->sector + zrange->nr_sectors <= zrange->sector || |
|---|
| 324 | + zrange->sector + zrange->nr_sectors > get_capacity(bdev->bd_disk)) |
|---|
| 325 | + /* Out of range */ |
|---|
| 326 | + return -EINVAL; |
|---|
| 346 | 327 | |
|---|
| 347 | | - if (rep.nr_zones) { |
|---|
| 348 | | - if (copy_to_user(argp + sizeof(struct blk_zone_report), zones, |
|---|
| 349 | | - sizeof(struct blk_zone) * rep.nr_zones)) |
|---|
| 350 | | - ret = -EFAULT; |
|---|
| 351 | | - } |
|---|
| 328 | + start = zrange->sector << SECTOR_SHIFT; |
|---|
| 329 | + end = ((zrange->sector + zrange->nr_sectors) << SECTOR_SHIFT) - 1; |
|---|
| 352 | 330 | |
|---|
| 353 | | - out: |
|---|
| 354 | | - kvfree(zones); |
|---|
| 355 | | - |
|---|
| 356 | | - return ret; |
|---|
| 331 | + return truncate_bdev_range(bdev, mode, start, end); |
|---|
| 357 | 332 | } |
|---|
| 358 | 333 | |
|---|
| 359 | 334 | /* |
|---|
| 360 | | - * BLKRESETZONE ioctl processing. |
|---|
| 335 | + * BLKRESETZONE, BLKOPENZONE, BLKCLOSEZONE and BLKFINISHZONE ioctl processing. |
|---|
| 361 | 336 | * Called from blkdev_ioctl. |
|---|
| 362 | 337 | */ |
|---|
| 363 | | -int blkdev_reset_zones_ioctl(struct block_device *bdev, fmode_t mode, |
|---|
| 364 | | - unsigned int cmd, unsigned long arg) |
|---|
| 338 | +int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode, |
|---|
| 339 | + unsigned int cmd, unsigned long arg) |
|---|
| 365 | 340 | { |
|---|
| 366 | 341 | void __user *argp = (void __user *)arg; |
|---|
| 367 | 342 | struct request_queue *q; |
|---|
| 368 | 343 | struct blk_zone_range zrange; |
|---|
| 344 | + enum req_opf op; |
|---|
| 345 | + int ret; |
|---|
| 369 | 346 | |
|---|
| 370 | 347 | if (!argp) |
|---|
| 371 | 348 | return -EINVAL; |
|---|
| .. | .. |
|---|
| 383 | 360 | if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range))) |
|---|
| 384 | 361 | return -EFAULT; |
|---|
| 385 | 362 | |
|---|
| 386 | | - return blkdev_reset_zones(bdev, zrange.sector, zrange.nr_sectors, |
|---|
| 387 | | - GFP_KERNEL); |
|---|
| 363 | + switch (cmd) { |
|---|
| 364 | + case BLKRESETZONE: |
|---|
| 365 | + op = REQ_OP_ZONE_RESET; |
|---|
| 366 | + |
|---|
| 367 | + /* Invalidate the page cache, including dirty pages. */ |
|---|
| 368 | + ret = blkdev_truncate_zone_range(bdev, mode, &zrange); |
|---|
| 369 | + if (ret) |
|---|
| 370 | + return ret; |
|---|
| 371 | + break; |
|---|
| 372 | + case BLKOPENZONE: |
|---|
| 373 | + op = REQ_OP_ZONE_OPEN; |
|---|
| 374 | + break; |
|---|
| 375 | + case BLKCLOSEZONE: |
|---|
| 376 | + op = REQ_OP_ZONE_CLOSE; |
|---|
| 377 | + break; |
|---|
| 378 | + case BLKFINISHZONE: |
|---|
| 379 | + op = REQ_OP_ZONE_FINISH; |
|---|
| 380 | + break; |
|---|
| 381 | + default: |
|---|
| 382 | + return -ENOTTY; |
|---|
| 383 | + } |
|---|
| 384 | + |
|---|
| 385 | + ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors, |
|---|
| 386 | + GFP_KERNEL); |
|---|
| 387 | + |
|---|
| 388 | + /* |
|---|
| 389 | + * Invalidate the page cache again for zone reset: writes can only be |
|---|
| 390 | + * direct for zoned devices so concurrent writes would not add any page |
|---|
| 391 | + * to the page cache after/during reset. The page cache may be filled |
|---|
| 392 | + * again due to concurrent reads though and dropping the pages for |
|---|
| 393 | + * these is fine. |
|---|
| 394 | + */ |
|---|
| 395 | + if (!ret && cmd == BLKRESETZONE) |
|---|
| 396 | + ret = blkdev_truncate_zone_range(bdev, mode, &zrange); |
|---|
| 397 | + |
|---|
| 398 | + return ret; |
|---|
| 388 | 399 | } |
|---|
| 400 | + |
|---|
| 401 | +static inline unsigned long *blk_alloc_zone_bitmap(int node, |
|---|
| 402 | + unsigned int nr_zones) |
|---|
| 403 | +{ |
|---|
| 404 | + return kcalloc_node(BITS_TO_LONGS(nr_zones), sizeof(unsigned long), |
|---|
| 405 | + GFP_NOIO, node); |
|---|
| 406 | +} |
|---|
| 407 | + |
|---|
| 408 | +void blk_queue_free_zone_bitmaps(struct request_queue *q) |
|---|
| 409 | +{ |
|---|
| 410 | + kfree(q->conv_zones_bitmap); |
|---|
| 411 | + q->conv_zones_bitmap = NULL; |
|---|
| 412 | + kfree(q->seq_zones_wlock); |
|---|
| 413 | + q->seq_zones_wlock = NULL; |
|---|
| 414 | +} |
|---|
| 415 | + |
|---|
| 416 | +struct blk_revalidate_zone_args { |
|---|
| 417 | + struct gendisk *disk; |
|---|
| 418 | + unsigned long *conv_zones_bitmap; |
|---|
| 419 | + unsigned long *seq_zones_wlock; |
|---|
| 420 | + unsigned int nr_zones; |
|---|
| 421 | + sector_t zone_sectors; |
|---|
| 422 | + sector_t sector; |
|---|
| 423 | +}; |
|---|
| 424 | + |
|---|
| 425 | +/* |
|---|
| 426 | + * Helper function to check the validity of zones of a zoned block device. |
|---|
| 427 | + */ |
|---|
| 428 | +static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, |
|---|
| 429 | + void *data) |
|---|
| 430 | +{ |
|---|
| 431 | + struct blk_revalidate_zone_args *args = data; |
|---|
| 432 | + struct gendisk *disk = args->disk; |
|---|
| 433 | + struct request_queue *q = disk->queue; |
|---|
| 434 | + sector_t capacity = get_capacity(disk); |
|---|
| 435 | + |
|---|
| 436 | + /* |
|---|
| 437 | + * All zones must have the same size, with the exception of a possibly |
|---|
| 438 | + * smaller last zone. |
|---|
| 439 | + */ |
|---|
| 440 | + if (zone->start == 0) { |
|---|
| 441 | + if (zone->len == 0 || !is_power_of_2(zone->len)) { |
|---|
| 442 | + pr_warn("%s: Invalid zoned device with non power of two zone size (%llu)\n", |
|---|
| 443 | + disk->disk_name, zone->len); |
|---|
| 444 | + return -ENODEV; |
|---|
| 445 | + } |
|---|
| 446 | + |
|---|
| 447 | + args->zone_sectors = zone->len; |
|---|
| 448 | + args->nr_zones = (capacity + zone->len - 1) >> ilog2(zone->len); |
|---|
| 449 | + } else if (zone->start + args->zone_sectors < capacity) { |
|---|
| 450 | + if (zone->len != args->zone_sectors) { |
|---|
| 451 | + pr_warn("%s: Invalid zoned device with non constant zone size\n", |
|---|
| 452 | + disk->disk_name); |
|---|
| 453 | + return -ENODEV; |
|---|
| 454 | + } |
|---|
| 455 | + } else { |
|---|
| 456 | + if (zone->len > args->zone_sectors) { |
|---|
| 457 | + pr_warn("%s: Invalid zoned device with larger last zone size\n", |
|---|
| 458 | + disk->disk_name); |
|---|
| 459 | + return -ENODEV; |
|---|
| 460 | + } |
|---|
| 461 | + } |
|---|
| 462 | + |
|---|
| 463 | + /* Check for holes in the zone report */ |
|---|
| 464 | + if (zone->start != args->sector) { |
|---|
| 465 | + pr_warn("%s: Zone gap at sectors %llu..%llu\n", |
|---|
| 466 | + disk->disk_name, args->sector, zone->start); |
|---|
| 467 | + return -ENODEV; |
|---|
| 468 | + } |
|---|
| 469 | + |
|---|
| 470 | + /* Check zone type */ |
|---|
| 471 | + switch (zone->type) { |
|---|
| 472 | + case BLK_ZONE_TYPE_CONVENTIONAL: |
|---|
| 473 | + if (!args->conv_zones_bitmap) { |
|---|
| 474 | + args->conv_zones_bitmap = |
|---|
| 475 | + blk_alloc_zone_bitmap(q->node, args->nr_zones); |
|---|
| 476 | + if (!args->conv_zones_bitmap) |
|---|
| 477 | + return -ENOMEM; |
|---|
| 478 | + } |
|---|
| 479 | + set_bit(idx, args->conv_zones_bitmap); |
|---|
| 480 | + break; |
|---|
| 481 | + case BLK_ZONE_TYPE_SEQWRITE_REQ: |
|---|
| 482 | + case BLK_ZONE_TYPE_SEQWRITE_PREF: |
|---|
| 483 | + if (!args->seq_zones_wlock) { |
|---|
| 484 | + args->seq_zones_wlock = |
|---|
| 485 | + blk_alloc_zone_bitmap(q->node, args->nr_zones); |
|---|
| 486 | + if (!args->seq_zones_wlock) |
|---|
| 487 | + return -ENOMEM; |
|---|
| 488 | + } |
|---|
| 489 | + break; |
|---|
| 490 | + default: |
|---|
| 491 | + pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n", |
|---|
| 492 | + disk->disk_name, (int)zone->type, zone->start); |
|---|
| 493 | + return -ENODEV; |
|---|
| 494 | + } |
|---|
| 495 | + |
|---|
| 496 | + args->sector += zone->len; |
|---|
| 497 | + return 0; |
|---|
| 498 | +} |
|---|
| 499 | + |
|---|
| 500 | +/** |
|---|
| 501 | + * blk_revalidate_disk_zones - (re)allocate and initialize zone bitmaps |
|---|
| 502 | + * @disk: Target disk |
|---|
| 503 | + * @update_driver_data: Callback to update driver data on the frozen disk |
|---|
| 504 | + * |
|---|
| 505 | + * Helper function for low-level device drivers to (re)allocate and initialize |
|---|
| 506 | + * a disk's request queue zone bitmaps. This function should normally be called |
|---|
| 507 | + * within the disk ->revalidate method for blk-mq based drivers. For BIO based |
|---|
| 508 | + * drivers only q->nr_zones needs to be updated so that the sysfs exposed value |
|---|
| 509 | + * is correct. |
|---|
| 510 | + * If the @update_driver_data callback function is not NULL, the callback is |
|---|
| 511 | + * executed with the device request queue frozen after all zones have been |
|---|
| 512 | + * checked. |
|---|
| 513 | + */ |
|---|
| 514 | +int blk_revalidate_disk_zones(struct gendisk *disk, |
|---|
| 515 | + void (*update_driver_data)(struct gendisk *disk)) |
|---|
| 516 | +{ |
|---|
| 517 | + struct request_queue *q = disk->queue; |
|---|
| 518 | + struct blk_revalidate_zone_args args = { |
|---|
| 519 | + .disk = disk, |
|---|
| 520 | + }; |
|---|
| 521 | + unsigned int noio_flag; |
|---|
| 522 | + int ret; |
|---|
| 523 | + |
|---|
| 524 | + if (WARN_ON_ONCE(!blk_queue_is_zoned(q))) |
|---|
| 525 | + return -EIO; |
|---|
| 526 | + if (WARN_ON_ONCE(!queue_is_mq(q))) |
|---|
| 527 | + return -EIO; |
|---|
| 528 | + |
|---|
| 529 | + if (!get_capacity(disk)) |
|---|
| 530 | + return -EIO; |
|---|
| 531 | + |
|---|
| 532 | + /* |
|---|
| 533 | + * Ensure that all memory allocations in this context are done as if |
|---|
| 534 | + * GFP_NOIO was specified. |
|---|
| 535 | + */ |
|---|
| 536 | + noio_flag = memalloc_noio_save(); |
|---|
| 537 | + ret = disk->fops->report_zones(disk, 0, UINT_MAX, |
|---|
| 538 | + blk_revalidate_zone_cb, &args); |
|---|
| 539 | + memalloc_noio_restore(noio_flag); |
|---|
| 540 | + |
|---|
| 541 | + /* |
|---|
| 542 | + * Install the new bitmaps and update nr_zones only once the queue is |
|---|
| 543 | + * stopped and all I/Os are completed (i.e. a scheduler is not |
|---|
| 544 | + * referencing the bitmaps). |
|---|
| 545 | + */ |
|---|
| 546 | + blk_mq_freeze_queue(q); |
|---|
| 547 | + if (ret >= 0) { |
|---|
| 548 | + blk_queue_chunk_sectors(q, args.zone_sectors); |
|---|
| 549 | + q->nr_zones = args.nr_zones; |
|---|
| 550 | + swap(q->seq_zones_wlock, args.seq_zones_wlock); |
|---|
| 551 | + swap(q->conv_zones_bitmap, args.conv_zones_bitmap); |
|---|
| 552 | + if (update_driver_data) |
|---|
| 553 | + update_driver_data(disk); |
|---|
| 554 | + ret = 0; |
|---|
| 555 | + } else { |
|---|
| 556 | + pr_warn("%s: failed to revalidate zones\n", disk->disk_name); |
|---|
| 557 | + blk_queue_free_zone_bitmaps(q); |
|---|
| 558 | + } |
|---|
| 559 | + blk_mq_unfreeze_queue(q); |
|---|
| 560 | + |
|---|
| 561 | + kfree(args.seq_zones_wlock); |
|---|
| 562 | + kfree(args.conv_zones_bitmap); |
|---|
| 563 | + return ret; |
|---|
| 564 | +} |
|---|
| 565 | +EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones); |
|---|