2023-12-06 08f87f769b595151be1afeff53e144f543faa614
kernel/drivers/block/null_blk_main.c
....@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0-only
12 /*
23 * Add configfs and memory store: Kyungchan Koh <kkc6196@fb.com> and
34 * Shaohua Li <shli@fb.com>
....@@ -22,6 +23,7 @@
2223 #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
2324 static DECLARE_FAULT_ATTR(null_timeout_attr);
2425 static DECLARE_FAULT_ATTR(null_requeue_attr);
26
+static DECLARE_FAULT_ATTR(null_init_hctx_attr);
2527 #endif
2628
2729 static inline u64 mb_per_tick(int mbps)
....@@ -95,11 +97,21 @@
9597 MODULE_PARM_DESC(home_node, "Home node for the device");
9698
9799 #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
100
+/*
101
+ * For more details about fault injection, please refer to
102
+ * Documentation/fault-injection/fault-injection.rst.
103
+ */
98104 static char g_timeout_str[80];
99105 module_param_string(timeout, g_timeout_str, sizeof(g_timeout_str), 0444);
106
+MODULE_PARM_DESC(timeout, "Fault injection. timeout=<interval>,<probability>,<space>,<times>");
100107
101108 static char g_requeue_str[80];
102109 module_param_string(requeue, g_requeue_str, sizeof(g_requeue_str), 0444);
110
+MODULE_PARM_DESC(requeue, "Fault injection. requeue=<interval>,<probability>,<space>,<times>");
111
+
112
+static char g_init_hctx_str[80];
113
+module_param_string(init_hctx, g_init_hctx_str, sizeof(g_init_hctx_str), 0444);
114
+MODULE_PARM_DESC(init_hctx, "Fault injection to fail hctx init. init_hctx=<interval>,<probability>,<space>,<times>");
103115 #endif
104116
105117 static int g_queue_mode = NULL_Q_MQ;
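
The three fault-injection knobs above (timeout=, requeue= and the new init_hctx=) all take the string format described in Documentation/fault-injection/fault-injection.rst. A minimal userspace sketch of that <interval>,<probability>,<space>,<times> format — names invented, not driver code; the driver itself hands the raw string to the kernel's fault-injection library:

#include <stdio.h>

int main(void)
{
        const char *arg = "1,100,0,-1";   /* every request, 100%, no space limit, no count limit */
        unsigned long interval, probability, space;
        int times;

        if (sscanf(arg, "%lu,%lu,%lu,%d", &interval, &probability, &space, &times) < 4) {
                fprintf(stderr, "cannot parse '%s'\n", arg);
                return 1;
        }
        printf("interval=%lu probability=%lu%% space=%lu times=%d\n",
               interval, probability, space, times);
        return 0;
}
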
....@@ -140,8 +152,8 @@
140152 module_param_named(bs, g_bs, int, 0444);
141153 MODULE_PARM_DESC(bs, "Block size (in bytes)");
142154
143
-static int nr_devices = 1;
144
-module_param(nr_devices, int, 0444);
155
+static unsigned int nr_devices = 1;
156
+module_param(nr_devices, uint, 0444);
145157 MODULE_PARM_DESC(nr_devices, "Number of devices to register");
146158
147159 static bool g_blocking;
....@@ -151,6 +163,10 @@
151163 static bool shared_tags;
152164 module_param(shared_tags, bool, 0444);
153165 MODULE_PARM_DESC(shared_tags, "Share tag set between devices for blk-mq");
166
+
167
+static bool g_shared_tag_bitmap;
168
+module_param_named(shared_tag_bitmap, g_shared_tag_bitmap, bool, 0444);
169
+MODULE_PARM_DESC(shared_tag_bitmap, "Use shared tag bitmap for all submission queues for blk-mq");
154170
155171 static int g_irqmode = NULL_IRQ_SOFTIRQ;
156172
....@@ -188,6 +204,22 @@
188204 module_param_named(zone_size, g_zone_size, ulong, S_IRUGO);
189205 MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Must be power-of-two: Default: 256");
190206
207
+static unsigned long g_zone_capacity;
208
+module_param_named(zone_capacity, g_zone_capacity, ulong, 0444);
209
+MODULE_PARM_DESC(zone_capacity, "Zone capacity in MB when block device is zoned. Can be less than or equal to zone size. Default: Zone size");
210
+
211
+static unsigned int g_zone_nr_conv;
212
+module_param_named(zone_nr_conv, g_zone_nr_conv, uint, 0444);
213
+MODULE_PARM_DESC(zone_nr_conv, "Number of conventional zones when block device is zoned. Default: 0");
214
+
215
+static unsigned int g_zone_max_open;
216
+module_param_named(zone_max_open, g_zone_max_open, uint, 0444);
217
+MODULE_PARM_DESC(zone_max_open, "Maximum number of open zones when block device is zoned. Default: 0 (no limit)");
218
+
219
+static unsigned int g_zone_max_active;
220
+module_param_named(zone_max_active, g_zone_max_active, uint, 0444);
221
+MODULE_PARM_DESC(zone_max_active, "Maximum number of active zones when block device is zoned. Default: 0 (no limit)");
222
+
191223 static struct nullb_device *null_alloc_dev(void);
192224 static void null_free_dev(struct nullb_device *dev);
193225 static void null_del_dev(struct nullb *nullb);
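
A rough worked example of how the new zoned knobs relate (userspace, made-up values; it assumes conventional zones expose their full size as capacity while sequential zones expose zone_capacity, which is the intent of the parameter descriptions above):

#include <stdio.h>

int main(void)
{
        unsigned long size_mb      = 1024;  /* total device size (size=) */
        unsigned long zone_size_mb = 256;   /* zone_size=, must be a power of two */
        unsigned long zone_cap_mb  = 192;   /* zone_capacity=, 0 means same as zone_size */
        unsigned int  nr_conv      = 1;     /* zone_nr_conv= */
        unsigned long nr_zones, nr_seq, usable_mb;

        if (!zone_cap_mb)
                zone_cap_mb = zone_size_mb;

        nr_zones  = size_mb / zone_size_mb;                     /* 4 zones */
        nr_seq    = nr_zones - nr_conv;                         /* 3 sequential zones */
        usable_mb = nr_conv * zone_size_mb + nr_seq * zone_cap_mb;

        printf("%lu zones (%u conventional), usable capacity %lu MB\n",
               nr_zones, nr_conv, usable_mb);                   /* 4 zones, 832 MB */
        return 0;
}
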
....@@ -222,7 +254,7 @@
222254 int result;
223255
224256 result = kstrtouint(page, 0, &tmp);
225
- if (result)
257
+ if (result < 0)
226258 return result;
227259
228260 *val = tmp;
....@@ -236,7 +268,7 @@
236268 unsigned long tmp;
237269
238270 result = kstrtoul(page, 0, &tmp);
239
- if (result)
271
+ if (result < 0)
240272 return result;
241273
242274 *val = tmp;
....@@ -250,7 +282,7 @@
250282 int result;
251283
252284 result = kstrtobool(page, &tmp);
253
- if (result)
285
+ if (result < 0)
254286 return result;
255287
256288 *val = tmp;
....@@ -258,41 +290,77 @@
258290 }
259291
260292 /* The following macro should only be used with TYPE = {uint, ulong, bool}. */
261
-#define NULLB_DEVICE_ATTR(NAME, TYPE) \
262
-static ssize_t \
263
-nullb_device_##NAME##_show(struct config_item *item, char *page) \
264
-{ \
265
- return nullb_device_##TYPE##_attr_show( \
266
- to_nullb_device(item)->NAME, page); \
267
-} \
268
-static ssize_t \
269
-nullb_device_##NAME##_store(struct config_item *item, const char *page, \
270
- size_t count) \
271
-{ \
272
- if (test_bit(NULLB_DEV_FL_CONFIGURED, &to_nullb_device(item)->flags)) \
273
- return -EBUSY; \
274
- return nullb_device_##TYPE##_attr_store( \
275
- &to_nullb_device(item)->NAME, page, count); \
276
-} \
293
+#define NULLB_DEVICE_ATTR(NAME, TYPE, APPLY) \
294
+static ssize_t \
295
+nullb_device_##NAME##_show(struct config_item *item, char *page) \
296
+{ \
297
+ return nullb_device_##TYPE##_attr_show( \
298
+ to_nullb_device(item)->NAME, page); \
299
+} \
300
+static ssize_t \
301
+nullb_device_##NAME##_store(struct config_item *item, const char *page, \
302
+ size_t count) \
303
+{ \
304
+ int (*apply_fn)(struct nullb_device *dev, TYPE new_value) = APPLY;\
305
+ struct nullb_device *dev = to_nullb_device(item); \
306
+ TYPE new_value = 0; \
307
+ int ret; \
308
+ \
309
+ ret = nullb_device_##TYPE##_attr_store(&new_value, page, count);\
310
+ if (ret < 0) \
311
+ return ret; \
312
+ if (apply_fn) \
313
+ ret = apply_fn(dev, new_value); \
314
+ else if (test_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags)) \
315
+ ret = -EBUSY; \
316
+ if (ret < 0) \
317
+ return ret; \
318
+ dev->NAME = new_value; \
319
+ return count; \
320
+} \
277321 CONFIGFS_ATTR(nullb_device_, NAME);
278322
279
-NULLB_DEVICE_ATTR(size, ulong);
280
-NULLB_DEVICE_ATTR(completion_nsec, ulong);
281
-NULLB_DEVICE_ATTR(submit_queues, uint);
282
-NULLB_DEVICE_ATTR(home_node, uint);
283
-NULLB_DEVICE_ATTR(queue_mode, uint);
284
-NULLB_DEVICE_ATTR(blocksize, uint);
285
-NULLB_DEVICE_ATTR(irqmode, uint);
286
-NULLB_DEVICE_ATTR(hw_queue_depth, uint);
287
-NULLB_DEVICE_ATTR(index, uint);
288
-NULLB_DEVICE_ATTR(blocking, bool);
289
-NULLB_DEVICE_ATTR(use_per_node_hctx, bool);
290
-NULLB_DEVICE_ATTR(memory_backed, bool);
291
-NULLB_DEVICE_ATTR(discard, bool);
292
-NULLB_DEVICE_ATTR(mbps, uint);
293
-NULLB_DEVICE_ATTR(cache_size, ulong);
294
-NULLB_DEVICE_ATTR(zoned, bool);
295
-NULLB_DEVICE_ATTR(zone_size, ulong);
323
+static int nullb_apply_submit_queues(struct nullb_device *dev,
324
+ unsigned int submit_queues)
325
+{
326
+ struct nullb *nullb = dev->nullb;
327
+ struct blk_mq_tag_set *set;
328
+
329
+ if (!nullb)
330
+ return 0;
331
+
332
+ /*
333
+ * Make sure that null_init_hctx() does not access nullb->queues[] past
334
+ * the end of that array.
335
+ */
336
+ if (submit_queues > nr_cpu_ids)
337
+ return -EINVAL;
338
+ set = nullb->tag_set;
339
+ blk_mq_update_nr_hw_queues(set, submit_queues);
340
+ return set->nr_hw_queues == submit_queues ? 0 : -ENOMEM;
341
+}
342
+
343
+NULLB_DEVICE_ATTR(size, ulong, NULL);
344
+NULLB_DEVICE_ATTR(completion_nsec, ulong, NULL);
345
+NULLB_DEVICE_ATTR(submit_queues, uint, nullb_apply_submit_queues);
346
+NULLB_DEVICE_ATTR(home_node, uint, NULL);
347
+NULLB_DEVICE_ATTR(queue_mode, uint, NULL);
348
+NULLB_DEVICE_ATTR(blocksize, uint, NULL);
349
+NULLB_DEVICE_ATTR(irqmode, uint, NULL);
350
+NULLB_DEVICE_ATTR(hw_queue_depth, uint, NULL);
351
+NULLB_DEVICE_ATTR(index, uint, NULL);
352
+NULLB_DEVICE_ATTR(blocking, bool, NULL);
353
+NULLB_DEVICE_ATTR(use_per_node_hctx, bool, NULL);
354
+NULLB_DEVICE_ATTR(memory_backed, bool, NULL);
355
+NULLB_DEVICE_ATTR(discard, bool, NULL);
356
+NULLB_DEVICE_ATTR(mbps, uint, NULL);
357
+NULLB_DEVICE_ATTR(cache_size, ulong, NULL);
358
+NULLB_DEVICE_ATTR(zoned, bool, NULL);
359
+NULLB_DEVICE_ATTR(zone_size, ulong, NULL);
360
+NULLB_DEVICE_ATTR(zone_capacity, ulong, NULL);
361
+NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL);
362
+NULLB_DEVICE_ATTR(zone_max_open, uint, NULL);
363
+NULLB_DEVICE_ATTR(zone_max_active, uint, NULL);
296364
297365 static ssize_t nullb_device_power_show(struct config_item *item, char *page)
298366 {
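
A hedged userspace illustration of the reworked NULLB_DEVICE_ATTR() pattern above: the generated store() now parses into a local value, runs an optional per-attribute apply() hook (as nullb_apply_submit_queues() does via blk_mq_update_nr_hw_queues()), and only commits the field when that succeeds. All demo_* names below are invented for the sketch:

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>

struct demo_dev {
        unsigned int submit_queues;
        int configured;
};

static int demo_apply_submit_queues(struct demo_dev *dev, unsigned int v)
{
        if (v == 0 || v > 64)
                return -EINVAL;         /* reject before the field is updated */
        return 0;
}

#define DEMO_DEVICE_ATTR(NAME, APPLY)                                          \
static int demo_##NAME##_store(struct demo_dev *dev, const char *page)         \
{                                                                              \
        int (*apply_fn)(struct demo_dev *, unsigned int) = APPLY;              \
        unsigned int new_value = strtoul(page, NULL, 0);                       \
        int ret = 0;                                                           \
                                                                               \
        if (apply_fn)                                                          \
                ret = apply_fn(dev, new_value);                                \
        else if (dev->configured)                                              \
                ret = -EBUSY;                                                  \
        if (ret < 0)                                                           \
                return ret;                                                    \
        dev->NAME = new_value;                                                 \
        return 0;                                                              \
}

DEMO_DEVICE_ATTR(submit_queues, demo_apply_submit_queues)

int main(void)
{
        struct demo_dev dev = { .submit_queues = 1, .configured = 1 };

        printf("store 8  -> %d, submit_queues=%u\n",
               demo_submit_queues_store(&dev, "8"), dev.submit_queues);
        printf("store 99 -> %d, submit_queues=%u\n",
               demo_submit_queues_store(&dev, "99"), dev.submit_queues);
        return 0;
}
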
....@@ -408,6 +476,10 @@
408476 &nullb_device_attr_badblocks,
409477 &nullb_device_attr_zoned,
410478 &nullb_device_attr_zone_size,
479
+ &nullb_device_attr_zone_capacity,
480
+ &nullb_device_attr_zone_nr_conv,
481
+ &nullb_device_attr_zone_max_open,
482
+ &nullb_device_attr_zone_max_active,
411483 NULL,
412484 };
413485
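
A usage sketch for the new configfs attributes (userspace C; assumes configfs is mounted at /sys/kernel/config, the null_blk module is loaded, and it runs as root — the nullb1 name and the values are arbitrary):

#include <errno.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>

static int wr(const char *dir, const char *attr, const char *val)
{
        char path[256];
        FILE *f;

        snprintf(path, sizeof(path), "%s/%s", dir, attr);
        f = fopen(path, "w");
        if (!f) {
                perror(path);
                return -1;
        }
        fputs(val, f);
        return fclose(f);
}

int main(void)
{
        const char *dir = "/sys/kernel/config/nullb/nullb1";

        if (mkdir(dir, 0755) && errno != EEXIST) {
                perror(dir);
                return 1;
        }
        wr(dir, "size", "1024");                /* MB */
        wr(dir, "zoned", "1");
        wr(dir, "zone_size", "256");            /* MB, power of two */
        wr(dir, "zone_capacity", "192");        /* MB, <= zone_size */
        wr(dir, "zone_nr_conv", "1");
        wr(dir, "zone_max_open", "2");
        wr(dir, "zone_max_active", "3");
        return wr(dir, "power", "1") ? 1 : 0;   /* instantiate the device */
}
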
....@@ -460,7 +532,8 @@
460532
461533 static ssize_t memb_group_features_show(struct config_item *item, char *page)
462534 {
463
- return snprintf(page, PAGE_SIZE, "memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size\n");
535
+ return snprintf(page, PAGE_SIZE,
536
+ "memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size,zone_capacity,zone_nr_conv,zone_max_open,zone_max_active\n");
464537 }
465538
466539 CONFIGFS_ATTR_RO(memb_group_, features);
....@@ -521,6 +594,10 @@
521594 dev->use_per_node_hctx = g_use_per_node_hctx;
522595 dev->zoned = g_zoned;
523596 dev->zone_size = g_zone_size;
597
+ dev->zone_capacity = g_zone_capacity;
598
+ dev->zone_nr_conv = g_zone_nr_conv;
599
+ dev->zone_max_open = g_zone_max_open;
600
+ dev->zone_max_active = g_zone_max_active;
524601 return dev;
525602 }
526603
....@@ -529,7 +606,7 @@
529606 if (!dev)
530607 return;
531608
532
- null_zone_exit(dev);
609
+ null_free_zoned_dev(dev);
533610 badblocks_exit(&dev->badblocks);
534611 kfree(dev);
535612 }
....@@ -608,20 +685,12 @@
608685
609686 static void end_cmd(struct nullb_cmd *cmd)
610687 {
611
- struct request_queue *q = NULL;
612688 int queue_mode = cmd->nq->dev->queue_mode;
613
-
614
- if (cmd->rq)
615
- q = cmd->rq->q;
616689
617690 switch (queue_mode) {
618691 case NULL_Q_MQ:
619692 blk_mq_end_request(cmd->rq, cmd->error);
620693 return;
621
- case NULL_Q_RQ:
622
- INIT_LIST_HEAD(&cmd->rq->queuelist);
623
- blk_end_request_all(cmd->rq, cmd->error);
624
- break;
625694 case NULL_Q_BIO:
626695 cmd->bio->bi_status = cmd->error;
627696 bio_endio(cmd->bio);
....@@ -629,15 +698,6 @@
629698 }
630699
631700 free_cmd(cmd);
632
-
633
- /* Restart queue if needed, as we are freeing a tag */
634
- if (queue_mode == NULL_Q_RQ && blk_queue_stopped(q)) {
635
- unsigned long flags;
636
-
637
- spin_lock_irqsave(q->queue_lock, flags);
638
- blk_start_queue_async(q);
639
- spin_unlock_irqrestore(q->queue_lock, flags);
640
- }
641701 }
642702
643703 static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer)
....@@ -654,14 +714,9 @@
654714 hrtimer_start(&cmd->timer, kt, HRTIMER_MODE_REL);
655715 }
656716
657
-static void null_softirq_done_fn(struct request *rq)
717
+static void null_complete_rq(struct request *rq)
658718 {
659
- struct nullb *nullb = rq->q->queuedata;
660
-
661
- if (nullb->dev->queue_mode == NULL_Q_MQ)
662
- end_cmd(blk_mq_rq_to_pdu(rq));
663
- else
664
- end_cmd(rq->special);
719
+ end_cmd(blk_mq_rq_to_pdu(rq));
665720 }
666721
667722 static struct nullb_page *null_alloc_page(gfp_t gfp_flags)
....@@ -1011,6 +1066,16 @@
10111066 return 0;
10121067 }
10131068
1069
+static void nullb_fill_pattern(struct nullb *nullb, struct page *page,
1070
+ unsigned int len, unsigned int off)
1071
+{
1072
+ void *dst;
1073
+
1074
+ dst = kmap_atomic(page);
1075
+ memset(dst + off, 0xFF, len);
1076
+ kunmap_atomic(dst);
1077
+}
1078
+
10141079 static void null_handle_discard(struct nullb *nullb, sector_t sector, size_t n)
10151080 {
10161081 size_t temp;
....@@ -1051,10 +1116,24 @@
10511116 unsigned int len, unsigned int off, bool is_write, sector_t sector,
10521117 bool is_fua)
10531118 {
1119
+ struct nullb_device *dev = nullb->dev;
1120
+ unsigned int valid_len = len;
10541121 int err = 0;
10551122
10561123 if (!is_write) {
1057
- err = copy_from_nullb(nullb, page, off, sector, len);
1124
+ if (dev->zoned)
1125
+ valid_len = null_zone_valid_read_len(nullb,
1126
+ sector, len);
1127
+
1128
+ if (valid_len) {
1129
+ err = copy_from_nullb(nullb, page, off,
1130
+ sector, valid_len);
1131
+ off += valid_len;
1132
+ len -= valid_len;
1133
+ }
1134
+
1135
+ if (len)
1136
+ nullb_fill_pattern(nullb, page, len, off);
10581137 flush_dcache_page(page);
10591138 } else {
10601139 flush_dcache_page(page);
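
An illustrative userspace sketch of the zoned read path above: only the part of the read that lies below the zone write pointer is copied, and the remainder of the buffer is filled with 0xFF the way nullb_fill_pattern() does. Buffer sizes are arbitrary:

#include <stdio.h>
#include <string.h>

int main(void)
{
        unsigned char page[32];
        unsigned char backing[16];
        unsigned int len = sizeof(page);
        unsigned int valid_len = sizeof(backing);   /* e.g. capped at the write pointer */
        unsigned int i;

        for (i = 0; i < valid_len; i++)
                backing[i] = i;                     /* pretend this was written earlier */

        memcpy(page, backing, valid_len);           /* the copy_from_nullb() step */
        if (len - valid_len)
                memset(page + valid_len, 0xFF, len - valid_len); /* fill-pattern step */

        for (i = 0; i < len; i++)
                printf("%02x%c", page[i], (i + 1) % 16 ? ' ' : '\n');
        return 0;
}
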
....@@ -1121,7 +1200,7 @@
11211200 len = bvec.bv_len;
11221201 err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset,
11231202 op_is_write(bio_op(bio)), sector,
1124
- bio_op(bio) & REQ_FUA);
1203
+ bio->bi_opf & REQ_FUA);
11251204 if (err) {
11261205 spin_unlock_irq(&nullb->lock);
11271206 return err;
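
Why the one-line change above is a real fix, shown with invented bit values: bio_op() masks bi_opf down to the operation bits, so ANDing its result with a flag that lives above that mask is always zero; the flag has to be tested against bi_opf itself. The bit positions below are illustrative, not the kernel's:

#include <stdio.h>

#define DEMO_OP_MASK    0xffu           /* low bits: operation number */
#define DEMO_FUA        (1u << 17)      /* a flag bit above the op mask */
#define demo_op(opf)    ((opf) & DEMO_OP_MASK)

int main(void)
{
        unsigned int opf = 1u /* WRITE */ | DEMO_FUA;

        printf("bio_op(opf) & FUA = %u  (always 0 -- the bug)\n", demo_op(opf) & DEMO_FUA);
        printf("opf & FUA         = %u  (non-zero -- the fix)\n", opf & DEMO_FUA);
        return 0;
}
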
....@@ -1138,150 +1217,100 @@
11381217
11391218 if (nullb->dev->queue_mode == NULL_Q_MQ)
11401219 blk_mq_stop_hw_queues(q);
1141
- else {
1142
- spin_lock_irq(q->queue_lock);
1143
- blk_stop_queue(q);
1144
- spin_unlock_irq(q->queue_lock);
1145
- }
11461220 }
11471221
11481222 static void null_restart_queue_async(struct nullb *nullb)
11491223 {
11501224 struct request_queue *q = nullb->q;
1151
- unsigned long flags;
11521225
11531226 if (nullb->dev->queue_mode == NULL_Q_MQ)
11541227 blk_mq_start_stopped_hw_queues(q, true);
1155
- else {
1156
- spin_lock_irqsave(q->queue_lock, flags);
1157
- blk_start_queue_async(q);
1158
- spin_unlock_irqrestore(q->queue_lock, flags);
1159
- }
11601228 }
11611229
1162
-static bool cmd_report_zone(struct nullb *nullb, struct nullb_cmd *cmd)
1163
-{
1164
- struct nullb_device *dev = cmd->nq->dev;
1165
-
1166
- if (dev->queue_mode == NULL_Q_BIO) {
1167
- if (bio_op(cmd->bio) == REQ_OP_ZONE_REPORT) {
1168
- cmd->error = null_zone_report(nullb, cmd->bio);
1169
- return true;
1170
- }
1171
- } else {
1172
- if (req_op(cmd->rq) == REQ_OP_ZONE_REPORT) {
1173
- cmd->error = null_zone_report(nullb, cmd->rq->bio);
1174
- return true;
1175
- }
1176
- }
1177
-
1178
- return false;
1179
-}
1180
-
1181
-static blk_status_t null_handle_cmd(struct nullb_cmd *cmd)
1230
+static inline blk_status_t null_handle_throttled(struct nullb_cmd *cmd)
11821231 {
11831232 struct nullb_device *dev = cmd->nq->dev;
11841233 struct nullb *nullb = dev->nullb;
1185
- int err = 0;
1234
+ blk_status_t sts = BLK_STS_OK;
1235
+ struct request *rq = cmd->rq;
11861236
1187
- if (cmd_report_zone(nullb, cmd))
1188
- goto out;
1237
+ if (!hrtimer_active(&nullb->bw_timer))
1238
+ hrtimer_restart(&nullb->bw_timer);
11891239
1190
- if (test_bit(NULLB_DEV_FL_THROTTLED, &dev->flags)) {
1191
- struct request *rq = cmd->rq;
1192
-
1193
- if (!hrtimer_active(&nullb->bw_timer))
1194
- hrtimer_restart(&nullb->bw_timer);
1195
-
1196
- if (atomic_long_sub_return(blk_rq_bytes(rq),
1197
- &nullb->cur_bytes) < 0) {
1198
- null_stop_queue(nullb);
1199
- /* race with timer */
1200
- if (atomic_long_read(&nullb->cur_bytes) > 0)
1201
- null_restart_queue_async(nullb);
1202
- if (dev->queue_mode == NULL_Q_RQ) {
1203
- struct request_queue *q = nullb->q;
1204
-
1205
- spin_lock_irq(q->queue_lock);
1206
- rq->rq_flags |= RQF_DONTPREP;
1207
- blk_requeue_request(q, rq);
1208
- spin_unlock_irq(q->queue_lock);
1209
- return BLK_STS_OK;
1210
- } else
1211
- /* requeue request */
1212
- return BLK_STS_DEV_RESOURCE;
1213
- }
1240
+ if (atomic_long_sub_return(blk_rq_bytes(rq), &nullb->cur_bytes) < 0) {
1241
+ null_stop_queue(nullb);
1242
+ /* race with timer */
1243
+ if (atomic_long_read(&nullb->cur_bytes) > 0)
1244
+ null_restart_queue_async(nullb);
1245
+ /* requeue request */
1246
+ sts = BLK_STS_DEV_RESOURCE;
12141247 }
1248
+ return sts;
1249
+}
12151250
1216
- if (nullb->dev->badblocks.shift != -1) {
1217
- int bad_sectors;
1218
- sector_t sector, size, first_bad;
1219
- bool is_flush = true;
1251
+static inline blk_status_t null_handle_badblocks(struct nullb_cmd *cmd,
1252
+ sector_t sector,
1253
+ sector_t nr_sectors)
1254
+{
1255
+ struct badblocks *bb = &cmd->nq->dev->badblocks;
1256
+ sector_t first_bad;
1257
+ int bad_sectors;
12201258
1221
- if (dev->queue_mode == NULL_Q_BIO &&
1222
- bio_op(cmd->bio) != REQ_OP_FLUSH) {
1223
- is_flush = false;
1224
- sector = cmd->bio->bi_iter.bi_sector;
1225
- size = bio_sectors(cmd->bio);
1226
- }
1227
- if (dev->queue_mode != NULL_Q_BIO &&
1228
- req_op(cmd->rq) != REQ_OP_FLUSH) {
1229
- is_flush = false;
1230
- sector = blk_rq_pos(cmd->rq);
1231
- size = blk_rq_sectors(cmd->rq);
1232
- }
1233
- if (!is_flush && badblocks_check(&nullb->dev->badblocks, sector,
1234
- size, &first_bad, &bad_sectors)) {
1235
- cmd->error = BLK_STS_IOERR;
1236
- goto out;
1237
- }
1259
+ if (badblocks_check(bb, sector, nr_sectors, &first_bad, &bad_sectors))
1260
+ return BLK_STS_IOERR;
1261
+
1262
+ return BLK_STS_OK;
1263
+}
1264
+
1265
+static inline blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd,
1266
+ enum req_opf op)
1267
+{
1268
+ struct nullb_device *dev = cmd->nq->dev;
1269
+ int err;
1270
+
1271
+ if (dev->queue_mode == NULL_Q_BIO)
1272
+ err = null_handle_bio(cmd);
1273
+ else
1274
+ err = null_handle_rq(cmd);
1275
+
1276
+ return errno_to_blk_status(err);
1277
+}
1278
+
1279
+static void nullb_zero_read_cmd_buffer(struct nullb_cmd *cmd)
1280
+{
1281
+ struct nullb_device *dev = cmd->nq->dev;
1282
+ struct bio *bio;
1283
+
1284
+ if (dev->memory_backed)
1285
+ return;
1286
+
1287
+ if (dev->queue_mode == NULL_Q_BIO && bio_op(cmd->bio) == REQ_OP_READ) {
1288
+ zero_fill_bio(cmd->bio);
1289
+ } else if (req_op(cmd->rq) == REQ_OP_READ) {
1290
+ __rq_for_each_bio(bio, cmd->rq)
1291
+ zero_fill_bio(bio);
12381292 }
1293
+}
12391294
1240
- if (dev->memory_backed) {
1241
- if (dev->queue_mode == NULL_Q_BIO) {
1242
- if (bio_op(cmd->bio) == REQ_OP_FLUSH)
1243
- err = null_handle_flush(nullb);
1244
- else
1245
- err = null_handle_bio(cmd);
1246
- } else {
1247
- if (req_op(cmd->rq) == REQ_OP_FLUSH)
1248
- err = null_handle_flush(nullb);
1249
- else
1250
- err = null_handle_rq(cmd);
1251
- }
1252
- }
1253
- cmd->error = errno_to_blk_status(err);
1295
+static inline void nullb_complete_cmd(struct nullb_cmd *cmd)
1296
+{
1297
+ /*
1298
+ * Since root privileges are required to configure the null_blk
1299
+ * driver, it is fine that this driver does not initialize the
1300
+ * data buffers of read commands. Zero-initialize these buffers
1301
+ * anyway if KMSAN is enabled to prevent that KMSAN complains
1302
+ * about null_blk not initializing read data buffers.
1303
+ */
1304
+ if (IS_ENABLED(CONFIG_KMSAN))
1305
+ nullb_zero_read_cmd_buffer(cmd);
12541306
1255
- if (!cmd->error && dev->zoned) {
1256
- sector_t sector;
1257
- unsigned int nr_sectors;
1258
- int op;
1259
-
1260
- if (dev->queue_mode == NULL_Q_BIO) {
1261
- op = bio_op(cmd->bio);
1262
- sector = cmd->bio->bi_iter.bi_sector;
1263
- nr_sectors = cmd->bio->bi_iter.bi_size >> 9;
1264
- } else {
1265
- op = req_op(cmd->rq);
1266
- sector = blk_rq_pos(cmd->rq);
1267
- nr_sectors = blk_rq_sectors(cmd->rq);
1268
- }
1269
-
1270
- if (op == REQ_OP_WRITE)
1271
- null_zone_write(cmd, sector, nr_sectors);
1272
- else if (op == REQ_OP_ZONE_RESET)
1273
- null_zone_reset(cmd, sector);
1274
- }
1275
-out:
12761307 /* Complete IO by inline, softirq or timer */
1277
- switch (dev->irqmode) {
1308
+ switch (cmd->nq->dev->irqmode) {
12781309 case NULL_IRQ_SOFTIRQ:
1279
- switch (dev->queue_mode) {
1310
+ switch (cmd->nq->dev->queue_mode) {
12801311 case NULL_Q_MQ:
1281
- blk_mq_complete_request(cmd->rq);
1282
- break;
1283
- case NULL_Q_RQ:
1284
- blk_complete_request(cmd->rq);
1312
+ if (likely(!blk_should_fake_timeout(cmd->rq->q)))
1313
+ blk_mq_complete_request(cmd->rq);
12851314 break;
12861315 case NULL_Q_BIO:
12871316 /*
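
A hedged userspace sketch of the throttling logic factored into null_handle_throttled() above: each request debits a byte budget and, when the budget goes negative, the queue is stopped and the request is asked to be requeued (BLK_STS_DEV_RESOURCE in the driver). The real driver refills the budget from the bw_timer at mb_per_tick() and uses atomic operations; this sketch skips the refill and uses arbitrary numbers:

#include <stdio.h>

static long cur_bytes = 64 * 1024;      /* budget for the current tick */

static int handle_throttled(long rq_bytes)
{
        cur_bytes -= rq_bytes;
        if (cur_bytes < 0)
                return -1;              /* stands in for BLK_STS_DEV_RESOURCE */
        return 0;                       /* stands in for BLK_STS_OK */
}

int main(void)
{
        long rq = 20 * 1024;
        int i;

        for (i = 0; i < 5; i++)
                printf("request %d (%ld bytes): %s\n", i, rq,
                       handle_throttled(rq) ? "stop queue, requeue" : "ok");
        return 0;
}
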
....@@ -1298,6 +1327,56 @@
12981327 null_cmd_end_timer(cmd);
12991328 break;
13001329 }
1330
+}
1331
+
1332
+blk_status_t null_process_cmd(struct nullb_cmd *cmd,
1333
+ enum req_opf op, sector_t sector,
1334
+ unsigned int nr_sectors)
1335
+{
1336
+ struct nullb_device *dev = cmd->nq->dev;
1337
+ blk_status_t ret;
1338
+
1339
+ if (dev->badblocks.shift != -1) {
1340
+ ret = null_handle_badblocks(cmd, sector, nr_sectors);
1341
+ if (ret != BLK_STS_OK)
1342
+ return ret;
1343
+ }
1344
+
1345
+ if (dev->memory_backed)
1346
+ return null_handle_memory_backed(cmd, op);
1347
+
1348
+ return BLK_STS_OK;
1349
+}
1350
+
1351
+static blk_status_t null_handle_cmd(struct nullb_cmd *cmd, sector_t sector,
1352
+ sector_t nr_sectors, enum req_opf op)
1353
+{
1354
+ struct nullb_device *dev = cmd->nq->dev;
1355
+ struct nullb *nullb = dev->nullb;
1356
+ blk_status_t sts;
1357
+
1358
+ if (test_bit(NULLB_DEV_FL_THROTTLED, &dev->flags)) {
1359
+ sts = null_handle_throttled(cmd);
1360
+ if (sts != BLK_STS_OK)
1361
+ return sts;
1362
+ }
1363
+
1364
+ if (op == REQ_OP_FLUSH) {
1365
+ cmd->error = errno_to_blk_status(null_handle_flush(nullb));
1366
+ goto out;
1367
+ }
1368
+
1369
+ if (dev->zoned)
1370
+ sts = null_process_zoned_cmd(cmd, op, sector, nr_sectors);
1371
+ else
1372
+ sts = null_process_cmd(cmd, op, sector, nr_sectors);
1373
+
1374
+ /* Do not overwrite errors (e.g. timeout errors) */
1375
+ if (cmd->error == BLK_STS_OK)
1376
+ cmd->error = sts;
1377
+
1378
+out:
1379
+ nullb_complete_cmd(cmd);
13011380 return BLK_STS_OK;
13021381 }
13031382
....@@ -1338,41 +1417,19 @@
13381417 return &nullb->queues[index];
13391418 }
13401419
1341
-static blk_qc_t null_queue_bio(struct request_queue *q, struct bio *bio)
1420
+static blk_qc_t null_submit_bio(struct bio *bio)
13421421 {
1343
- struct nullb *nullb = q->queuedata;
1422
+ sector_t sector = bio->bi_iter.bi_sector;
1423
+ sector_t nr_sectors = bio_sectors(bio);
1424
+ struct nullb *nullb = bio->bi_disk->private_data;
13441425 struct nullb_queue *nq = nullb_to_queue(nullb);
13451426 struct nullb_cmd *cmd;
13461427
13471428 cmd = alloc_cmd(nq, 1);
13481429 cmd->bio = bio;
13491430
1350
- null_handle_cmd(cmd);
1431
+ null_handle_cmd(cmd, sector, nr_sectors, bio_op(bio));
13511432 return BLK_QC_T_NONE;
1352
-}
1353
-
1354
-static enum blk_eh_timer_return null_rq_timed_out_fn(struct request *rq)
1355
-{
1356
- pr_info("null: rq %p timed out\n", rq);
1357
- __blk_complete_request(rq);
1358
- return BLK_EH_DONE;
1359
-}
1360
-
1361
-static int null_rq_prep_fn(struct request_queue *q, struct request *req)
1362
-{
1363
- struct nullb *nullb = q->queuedata;
1364
- struct nullb_queue *nq = nullb_to_queue(nullb);
1365
- struct nullb_cmd *cmd;
1366
-
1367
- cmd = alloc_cmd(nq, 0);
1368
- if (cmd) {
1369
- cmd->rq = req;
1370
- req->special = cmd;
1371
- return BLKPREP_OK;
1372
- }
1373
- blk_stop_queue(q);
1374
-
1375
- return BLKPREP_DEFER;
13761433 }
13771434
13781435 static bool should_timeout_request(struct request *rq)
....@@ -1393,31 +1450,22 @@
13931450 return false;
13941451 }
13951452
1396
-static void null_request_fn(struct request_queue *q)
1397
-{
1398
- struct request *rq;
1399
-
1400
- while ((rq = blk_fetch_request(q)) != NULL) {
1401
- struct nullb_cmd *cmd = rq->special;
1402
-
1403
- /* just ignore the request */
1404
- if (should_timeout_request(rq))
1405
- continue;
1406
- if (should_requeue_request(rq)) {
1407
- blk_requeue_request(q, rq);
1408
- continue;
1409
- }
1410
-
1411
- spin_unlock_irq(q->queue_lock);
1412
- null_handle_cmd(cmd);
1413
- spin_lock_irq(q->queue_lock);
1414
- }
1415
-}
1416
-
14171453 static enum blk_eh_timer_return null_timeout_rq(struct request *rq, bool res)
14181454 {
1419
- pr_info("null: rq %p timed out\n", rq);
1420
- blk_mq_complete_request(rq);
1455
+ struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq);
1456
+
1457
+ pr_info("rq %p timed out\n", rq);
1458
+
1459
+ /*
1460
+ * If the device is marked as blocking (i.e. memory backed or zoned
1461
+ * device), the submission path may be blocked waiting for resources
1462
+ * and cause real timeouts. For these real timeouts, the submission
1463
+ * path will complete the request using blk_mq_complete_request().
1464
+ * Only fake timeouts need to execute blk_mq_complete_request() here.
1465
+ */
1466
+ cmd->error = BLK_STS_TIMEOUT;
1467
+ if (cmd->fake_timeout)
1468
+ blk_mq_complete_request(rq);
14211469 return BLK_EH_DONE;
14221470 }
14231471
....@@ -1426,6 +1474,8 @@
14261474 {
14271475 struct nullb_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
14281476 struct nullb_queue *nq = hctx->driver_data;
1477
+ sector_t nr_sectors = blk_rq_sectors(bd->rq);
1478
+ sector_t sector = blk_rq_pos(bd->rq);
14291479
14301480 might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
14311481
....@@ -1436,6 +1486,7 @@
14361486 cmd->rq = bd->rq;
14371487 cmd->error = BLK_STS_OK;
14381488 cmd->nq = nq;
1489
+ cmd->fake_timeout = should_timeout_request(bd->rq);
14391490
14401491 blk_mq_start_request(bd->rq);
14411492
....@@ -1452,17 +1503,11 @@
14521503 return BLK_STS_OK;
14531504 }
14541505 }
1455
- if (should_timeout_request(bd->rq))
1506
+ if (cmd->fake_timeout)
14561507 return BLK_STS_OK;
14571508
1458
- return null_handle_cmd(cmd);
1509
+ return null_handle_cmd(cmd, sector, nr_sectors, req_op(bd->rq));
14591510 }
1460
-
1461
-static const struct blk_mq_ops null_mq_ops = {
1462
- .queue_rq = null_queue_rq,
1463
- .complete = null_softirq_done_fn,
1464
- .timeout = null_timeout_rq,
1465
-};
14661511
14671512 static void cleanup_queue(struct nullb_queue *nq)
14681513 {
....@@ -1479,6 +1524,48 @@
14791524
14801525 kfree(nullb->queues);
14811526 }
1527
+
1528
+static void null_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
1529
+{
1530
+ struct nullb_queue *nq = hctx->driver_data;
1531
+ struct nullb *nullb = nq->dev->nullb;
1532
+
1533
+ nullb->nr_queues--;
1534
+}
1535
+
1536
+static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq)
1537
+{
1538
+ init_waitqueue_head(&nq->wait);
1539
+ nq->queue_depth = nullb->queue_depth;
1540
+ nq->dev = nullb->dev;
1541
+}
1542
+
1543
+static int null_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
1544
+ unsigned int hctx_idx)
1545
+{
1546
+ struct nullb *nullb = hctx->queue->queuedata;
1547
+ struct nullb_queue *nq;
1548
+
1549
+#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
1550
+ if (g_init_hctx_str[0] && should_fail(&null_init_hctx_attr, 1))
1551
+ return -EFAULT;
1552
+#endif
1553
+
1554
+ nq = &nullb->queues[hctx_idx];
1555
+ hctx->driver_data = nq;
1556
+ null_init_queue(nullb, nq);
1557
+ nullb->nr_queues++;
1558
+
1559
+ return 0;
1560
+}
1561
+
1562
+static const struct blk_mq_ops null_mq_ops = {
1563
+ .queue_rq = null_queue_rq,
1564
+ .complete = null_complete_rq,
1565
+ .timeout = null_timeout_rq,
1566
+ .init_hctx = null_init_hctx,
1567
+ .exit_hctx = null_exit_hctx,
1568
+};
14821569
14831570 static void null_del_dev(struct nullb *nullb)
14841571 {
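
A hedged userspace sketch of the init_hctx fault hook added above: the fault attribute is consulted only when the init_hctx= string was set, and a hit fails hardware-context setup with -EFAULT. should_fail() below is a simplified stand-in for the kernel's fault-injection helper, and the probability handling is invented for the demo:

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>

static char g_init_hctx_str[80] = "1,50,0,-1";  /* 50% failure probability */

static int should_fail(unsigned int probability)
{
        return (unsigned int)(rand() % 100) < probability;
}

static int demo_init_hctx(unsigned int hctx_idx)
{
        if (g_init_hctx_str[0] && should_fail(50))
                return -EFAULT;
        printf("hctx %u initialized\n", hctx_idx);
        return 0;
}

int main(void)
{
        unsigned int i;

        srand(0);
        for (i = 0; i < 4; i++)
                if (demo_init_hctx(i))
                        printf("hctx %u: injected failure (-EFAULT)\n", i);
        return 0;
}
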
....@@ -1517,53 +1604,29 @@
15171604 {
15181605 if (nullb->dev->discard == false)
15191606 return;
1607
+
1608
+ if (nullb->dev->zoned) {
1609
+ nullb->dev->discard = false;
1610
+ pr_info("discard option is ignored in zoned mode\n");
1611
+ return;
1612
+ }
1613
+
15201614 nullb->q->limits.discard_granularity = nullb->dev->blocksize;
15211615 nullb->q->limits.discard_alignment = nullb->dev->blocksize;
15221616 blk_queue_max_discard_sectors(nullb->q, UINT_MAX >> 9);
15231617 blk_queue_flag_set(QUEUE_FLAG_DISCARD, nullb->q);
15241618 }
15251619
1526
-static int null_open(struct block_device *bdev, fmode_t mode)
1527
-{
1528
- return 0;
1529
-}
1530
-
1531
-static void null_release(struct gendisk *disk, fmode_t mode)
1532
-{
1533
-}
1534
-
1535
-static const struct block_device_operations null_fops = {
1536
- .owner = THIS_MODULE,
1537
- .open = null_open,
1538
- .release = null_release,
1620
+static const struct block_device_operations null_bio_ops = {
1621
+ .owner = THIS_MODULE,
1622
+ .submit_bio = null_submit_bio,
1623
+ .report_zones = null_report_zones,
15391624 };
15401625
1541
-static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq)
1542
-{
1543
- BUG_ON(!nullb);
1544
- BUG_ON(!nq);
1545
-
1546
- init_waitqueue_head(&nq->wait);
1547
- nq->queue_depth = nullb->queue_depth;
1548
- nq->dev = nullb->dev;
1549
-}
1550
-
1551
-static void null_init_queues(struct nullb *nullb)
1552
-{
1553
- struct request_queue *q = nullb->q;
1554
- struct blk_mq_hw_ctx *hctx;
1555
- struct nullb_queue *nq;
1556
- int i;
1557
-
1558
- queue_for_each_hw_ctx(q, hctx, i) {
1559
- if (!hctx->nr_ctx || !hctx->tags)
1560
- continue;
1561
- nq = &nullb->queues[i];
1562
- hctx->driver_data = nq;
1563
- null_init_queue(nullb, nq);
1564
- nullb->nr_queues++;
1565
- }
1566
-}
1626
+static const struct block_device_operations null_rq_ops = {
1627
+ .owner = THIS_MODULE,
1628
+ .report_zones = null_report_zones,
1629
+};
15671630
15681631 static int setup_commands(struct nullb_queue *nq)
15691632 {
....@@ -1583,8 +1646,6 @@
15831646
15841647 for (i = 0; i < nq->queue_depth; i++) {
15851648 cmd = &nq->cmds[i];
1586
- INIT_LIST_HEAD(&cmd->list);
1587
- cmd->ll_list.next = NULL;
15881649 cmd->tag = -1U;
15891650 }
15901651
....@@ -1593,13 +1654,11 @@
15931654
15941655 static int setup_queues(struct nullb *nullb)
15951656 {
1596
- nullb->queues = kcalloc(nullb->dev->submit_queues,
1597
- sizeof(struct nullb_queue),
1657
+ nullb->queues = kcalloc(nr_cpu_ids, sizeof(struct nullb_queue),
15981658 GFP_KERNEL);
15991659 if (!nullb->queues)
16001660 return -ENOMEM;
16011661
1602
- nullb->nr_queues = 0;
16031662 nullb->queue_depth = nullb->dev->hw_queue_depth;
16041663
16051664 return 0;
....@@ -1625,22 +1684,31 @@
16251684
16261685 static int null_gendisk_register(struct nullb *nullb)
16271686 {
1687
+ sector_t size = ((sector_t)nullb->dev->size * SZ_1M) >> SECTOR_SHIFT;
16281688 struct gendisk *disk;
1629
- sector_t size;
16301689
16311690 disk = nullb->disk = alloc_disk_node(1, nullb->dev->home_node);
16321691 if (!disk)
16331692 return -ENOMEM;
1634
- size = (sector_t)nullb->dev->size * 1024 * 1024ULL;
1635
- set_capacity(disk, size >> 9);
1693
+ set_capacity(disk, size);
16361694
16371695 disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO;
16381696 disk->major = null_major;
16391697 disk->first_minor = nullb->index;
1640
- disk->fops = &null_fops;
1698
+ if (queue_is_mq(nullb->q))
1699
+ disk->fops = &null_rq_ops;
1700
+ else
1701
+ disk->fops = &null_bio_ops;
16411702 disk->private_data = nullb;
16421703 disk->queue = nullb->q;
16431704 strncpy(disk->disk_name, nullb->disk_name, DISK_NAME_LEN);
1705
+
1706
+ if (nullb->dev->zoned) {
1707
+ int ret = null_register_zoned_dev(nullb);
1708
+
1709
+ if (ret)
1710
+ return ret;
1711
+ }
16441712
16451713 add_disk(disk);
16461714 return 0;
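
A worked example of the capacity conversion now done at the top of null_gendisk_register(): the configured size is in MB and set_capacity() takes 512-byte sectors, so the new code shifts by SECTOR_SHIFT instead of open-coding the multiply and divide (SZ_1M and SECTOR_SHIFT are re-defined here for a userspace illustration):

#include <stdio.h>

#define SZ_1M           (1ULL << 20)
#define SECTOR_SHIFT    9

int main(void)
{
        unsigned long long size_mb = 250;
        unsigned long long sectors = (size_mb * SZ_1M) >> SECTOR_SHIFT;

        printf("%llu MB -> %llu sectors of 512 bytes\n", size_mb, sectors);
        /* 250 MB -> 512000 sectors */
        return 0;
}
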
....@@ -1658,6 +1726,8 @@
16581726 set->flags = BLK_MQ_F_SHOULD_MERGE;
16591727 if (g_no_sched)
16601728 set->flags |= BLK_MQ_F_NO_SCHED;
1729
+ if (g_shared_tag_bitmap)
1730
+ set->flags |= BLK_MQ_F_TAG_HCTX_SHARED;
16611731 set->driver_data = NULL;
16621732
16631733 if ((nullb && nullb->dev->blocking) || g_blocking)
....@@ -1666,7 +1736,7 @@
16661736 return blk_mq_alloc_tag_set(set);
16671737 }
16681738
1669
-static void null_validate_conf(struct nullb_device *dev)
1739
+static int null_validate_conf(struct nullb_device *dev)
16701740 {
16711741 dev->blocksize = round_down(dev->blocksize, 512);
16721742 dev->blocksize = clamp_t(unsigned int, dev->blocksize, 512, 4096);
....@@ -1693,6 +1763,14 @@
16931763 /* can not stop a queue */
16941764 if (dev->queue_mode == NULL_Q_BIO)
16951765 dev->mbps = 0;
1766
+
1767
+ if (dev->zoned &&
1768
+ (!dev->zone_size || !is_power_of_2(dev->zone_size))) {
1769
+ pr_err("zone_size must be power-of-two\n");
1770
+ return -EINVAL;
1771
+ }
1772
+
1773
+ return 0;
16961774 }
16971775
16981776 #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
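
The power-of-two check added to null_validate_conf() is the usual bit trick; a quick userspace illustration, where is_pow2() mirrors what the kernel's is_power_of_2() computes:

#include <stdio.h>

static int is_pow2(unsigned long n)
{
        return n != 0 && (n & (n - 1)) == 0;
}

int main(void)
{
        unsigned long sizes[] = { 0, 1, 96, 128, 256, 257 };
        unsigned int i;

        for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
                printf("zone_size=%lu -> %s\n", sizes[i],
                       is_pow2(sizes[i]) ? "ok" : "rejected (-EINVAL)");
        return 0;
}
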
....@@ -1716,6 +1794,8 @@
17161794 return false;
17171795 if (!__null_setup_fault(&null_requeue_attr, g_requeue_str))
17181796 return false;
1797
+ if (!__null_setup_fault(&null_init_hctx_attr, g_init_hctx_str))
1798
+ return false;
17191799 #endif
17201800 return true;
17211801 }
....@@ -1725,7 +1805,9 @@
17251805 struct nullb *nullb;
17261806 int rv;
17271807
1728
- null_validate_conf(dev);
1808
+ rv = null_validate_conf(dev);
1809
+ if (rv)
1810
+ return rv;
17291811
17301812 nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, dev->home_node);
17311813 if (!nullb) {
....@@ -1757,38 +1839,17 @@
17571839 goto out_cleanup_queues;
17581840
17591841 nullb->tag_set->timeout = 5 * HZ;
1760
- nullb->q = blk_mq_init_queue(nullb->tag_set);
1842
+ nullb->q = blk_mq_init_queue_data(nullb->tag_set, nullb);
17611843 if (IS_ERR(nullb->q)) {
17621844 rv = -ENOMEM;
17631845 goto out_cleanup_tags;
17641846 }
1765
- null_init_queues(nullb);
17661847 } else if (dev->queue_mode == NULL_Q_BIO) {
1767
- nullb->q = blk_alloc_queue_node(GFP_KERNEL, dev->home_node,
1768
- NULL);
1848
+ nullb->q = blk_alloc_queue(dev->home_node);
17691849 if (!nullb->q) {
17701850 rv = -ENOMEM;
17711851 goto out_cleanup_queues;
17721852 }
1773
- blk_queue_make_request(nullb->q, null_queue_bio);
1774
- rv = init_driver_queues(nullb);
1775
- if (rv)
1776
- goto out_cleanup_blk_queue;
1777
- } else {
1778
- nullb->q = blk_init_queue_node(null_request_fn, &nullb->lock,
1779
- dev->home_node);
1780
- if (!nullb->q) {
1781
- rv = -ENOMEM;
1782
- goto out_cleanup_queues;
1783
- }
1784
-
1785
- if (!null_setup_fault())
1786
- goto out_cleanup_blk_queue;
1787
-
1788
- blk_queue_prep_rq(nullb->q, null_rq_prep_fn);
1789
- blk_queue_softirq_done(nullb->q, null_softirq_done_fn);
1790
- blk_queue_rq_timed_out(nullb->q, null_rq_timed_out_fn);
1791
- nullb->q->rq_timeout = 5 * HZ;
17921853 rv = init_driver_queues(nullb);
17931854 if (rv)
17941855 goto out_cleanup_blk_queue;
....@@ -1802,16 +1863,12 @@
18021863 if (dev->cache_size > 0) {
18031864 set_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags);
18041865 blk_queue_write_cache(nullb->q, true, true);
1805
- blk_queue_flush_queueable(nullb->q, true);
18061866 }
18071867
18081868 if (dev->zoned) {
1809
- rv = null_zone_init(dev);
1869
+ rv = null_init_zoned_dev(dev, nullb->q);
18101870 if (rv)
18111871 goto out_cleanup_blk_queue;
1812
-
1813
- blk_queue_chunk_sectors(nullb->q, dev->zone_size_sects);
1814
- nullb->q->limits.zoned = BLK_ZONED_HM;
18151872 }
18161873
18171874 nullb->q->queuedata = nullb;
....@@ -1819,8 +1876,13 @@
18191876 blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, nullb->q);
18201877
18211878 mutex_lock(&lock);
1822
- nullb->index = ida_simple_get(&nullb_indexes, 0, 0, GFP_KERNEL);
1823
- dev->index = nullb->index;
1879
+ rv = ida_simple_get(&nullb_indexes, 0, 0, GFP_KERNEL);
1880
+ if (rv < 0) {
1881
+ mutex_unlock(&lock);
1882
+ goto out_cleanup_zone;
1883
+ }
1884
+ nullb->index = rv;
1885
+ dev->index = rv;
18241886 mutex_unlock(&lock);
18251887
18261888 blk_queue_logical_block_size(nullb->q, dev->blocksize);
....@@ -1832,16 +1894,18 @@
18321894
18331895 rv = null_gendisk_register(nullb);
18341896 if (rv)
1835
- goto out_cleanup_zone;
1897
+ goto out_ida_free;
18361898
18371899 mutex_lock(&lock);
18381900 list_add_tail(&nullb->list, &nullb_list);
18391901 mutex_unlock(&lock);
18401902
18411903 return 0;
1904
+
1905
+out_ida_free:
1906
+ ida_free(&nullb_indexes, nullb->index);
18421907 out_cleanup_zone:
1843
- if (dev->zoned)
1844
- null_zone_exit(dev);
1908
+ null_free_zoned_dev(dev);
18451909 out_cleanup_blk_queue:
18461910 blk_cleanup_queue(nullb->q);
18471911 out_cleanup_tags:
....@@ -1864,19 +1928,23 @@
18641928 struct nullb_device *dev;
18651929
18661930 if (g_bs > PAGE_SIZE) {
1867
- pr_warn("null_blk: invalid block size\n");
1868
- pr_warn("null_blk: defaults block size to %lu\n", PAGE_SIZE);
1931
+ pr_warn("invalid block size\n");
1932
+ pr_warn("defaults block size to %lu\n", PAGE_SIZE);
18691933 g_bs = PAGE_SIZE;
18701934 }
18711935
1872
- if (!is_power_of_2(g_zone_size)) {
1873
- pr_err("null_blk: zone_size must be power-of-two\n");
1874
- return -EINVAL;
1936
+ if (g_home_node != NUMA_NO_NODE && g_home_node >= nr_online_nodes) {
1937
+ pr_err("invalid home_node value\n");
1938
+ g_home_node = NUMA_NO_NODE;
18751939 }
18761940
1941
+ if (g_queue_mode == NULL_Q_RQ) {
1942
+ pr_err("legacy IO path no longer available\n");
1943
+ return -EINVAL;
1944
+ }
18771945 if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) {
18781946 if (g_submit_queues != nr_online_nodes) {
1879
- pr_warn("null_blk: submit_queues param is set to %u.\n",
1947
+ pr_warn("submit_queues param is set to %u.\n",
18801948 nr_online_nodes);
18811949 g_submit_queues = nr_online_nodes;
18821950 }
....@@ -1919,7 +1987,7 @@
19191987 }
19201988 }
19211989
1922
- pr_info("null: module loaded\n");
1990
+ pr_info("module loaded\n");
19231991 return 0;
19241992
19251993 err_dev: