hc
2024-10-22 8ac6c7a54ed1b98d142dce24b11c6de6a1e239a5
kernel/drivers/nvme/host/core.c
....@@ -1,39 +1,31 @@
1
+// SPDX-License-Identifier: GPL-2.0
12 /*
23 * NVM Express device driver
34 * Copyright (c) 2011-2014, Intel Corporation.
4
- *
5
- * This program is free software; you can redistribute it and/or modify it
6
- * under the terms and conditions of the GNU General Public License,
7
- * version 2, as published by the Free Software Foundation.
8
- *
9
- * This program is distributed in the hope it will be useful, but WITHOUT
10
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12
- * more details.
135 */
146
157 #include <linux/blkdev.h>
168 #include <linux/blk-mq.h>
9
+#include <linux/compat.h>
1710 #include <linux/delay.h>
1811 #include <linux/errno.h>
1912 #include <linux/hdreg.h>
2013 #include <linux/kernel.h>
2114 #include <linux/module.h>
22
-#include <linux/list_sort.h>
15
+#include <linux/backing-dev.h>
2316 #include <linux/slab.h>
2417 #include <linux/types.h>
2518 #include <linux/pr.h>
2619 #include <linux/ptrace.h>
2720 #include <linux/nvme_ioctl.h>
28
-#include <linux/t10-pi.h>
2921 #include <linux/pm_qos.h>
3022 #include <asm/unaligned.h>
3123
32
-#define CREATE_TRACE_POINTS
33
-#include "trace.h"
34
-
3524 #include "nvme.h"
3625 #include "fabrics.h"
26
+
27
+#define CREATE_TRACE_POINTS
28
+#include "trace.h"
3729
3830 #define NVME_MINORS (1U << MINORBITS)
3931
....@@ -73,8 +65,8 @@
7365 * nvme_reset_wq - hosts nvme reset works
7466 * nvme_delete_wq - hosts nvme delete works
7567 *
76
- * nvme_wq will host works such are scan, aen handling, fw activation,
77
- * keep-alive error recovery, periodic reconnects etc. nvme_reset_wq
68
+ * nvme_wq will host works such as scan, aen handling, fw activation,
69
+ * keep-alive, periodic reconnects etc. nvme_reset_wq
7870 * runs reset works which also flush works hosted on nvme_wq for
7971 * serialization purposes. nvme_delete_wq host controller deletion
8072 * works which flush reset works for serialization.
....@@ -88,7 +80,6 @@
8880 struct workqueue_struct *nvme_delete_wq;
8981 EXPORT_SYMBOL_GPL(nvme_delete_wq);
9082
91
-static DEFINE_IDA(nvme_subsystems_ida);
9283 static LIST_HEAD(nvme_subsystems);
9384 static DEFINE_MUTEX(nvme_subsystems_lock);
9485
....@@ -97,27 +88,38 @@
9788 static struct class *nvme_class;
9889 static struct class *nvme_subsys_class;
9990
100
-static void nvme_ns_remove(struct nvme_ns *ns);
101
-static int nvme_revalidate_disk(struct gendisk *disk);
10291 static void nvme_put_subsystem(struct nvme_subsystem *subsys);
10392 static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
10493 unsigned nsid);
10594
95
+static void nvme_update_bdev_size(struct gendisk *disk)
96
+{
97
+ struct block_device *bdev = bdget_disk(disk, 0);
98
+
99
+ if (bdev) {
100
+ bd_set_nr_sectors(bdev, get_capacity(disk));
101
+ bdput(bdev);
102
+ }
103
+}
104
+
105
+/*
106
+ * Prepare a queue for teardown.
107
+ *
108
+ * This must forcibly unquiesce queues to avoid blocking dispatch, and only set
109
+ * the capacity to 0 after that to avoid blocking dispatchers that may be
110
+ * holding bd_mutex. This will end buffered writers dirtying pages that can't
111
+ * be synced.
112
+ */
106113 static void nvme_set_queue_dying(struct nvme_ns *ns)
107114 {
108
- /*
109
- * Revalidating a dead namespace sets capacity to 0. This will end
110
- * buffered writers dirtying pages that can't be synced.
111
- */
112
- if (!ns->disk || test_and_set_bit(NVME_NS_DEAD, &ns->flags))
115
+ if (test_and_set_bit(NVME_NS_DEAD, &ns->flags))
113116 return;
117
+
114118 blk_set_queue_dying(ns->queue);
115
- /* Forcibly unquiesce queues to avoid blocking dispatch */
116119 blk_mq_unquiesce_queue(ns->queue);
117
- /*
118
- * Revalidate after unblocking dispatchers that may be holding bd_butex
119
- */
120
- revalidate_disk(ns->disk);
120
+
121
+ set_capacity(ns->disk, 0);
122
+ nvme_update_bdev_size(ns->disk);
121123 }
122124
123125 static void nvme_queue_scan(struct nvme_ctrl *ctrl)
....@@ -125,9 +127,25 @@
125127 /*
126128 * Only new queue scan work when admin and IO queues are both alive
127129 */
128
- if (ctrl->state == NVME_CTRL_LIVE)
130
+ if (ctrl->state == NVME_CTRL_LIVE && ctrl->tagset)
129131 queue_work(nvme_wq, &ctrl->scan_work);
130132 }
133
+
134
+/*
135
+ * Use this function to proceed with scheduling reset_work for a controller
136
+ * that had previously been set to the resetting state. This is intended for
137
+ * code paths that can't be interrupted by other reset attempts. A hot removal
138
+ * may prevent this from succeeding.
139
+ */
140
+int nvme_try_sched_reset(struct nvme_ctrl *ctrl)
141
+{
142
+ if (ctrl->state != NVME_CTRL_RESETTING)
143
+ return -EBUSY;
144
+ if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
145
+ return -EBUSY;
146
+ return 0;
147
+}
148
+EXPORT_SYMBOL_GPL(nvme_try_sched_reset);
131149
132150 int nvme_reset_ctrl(struct nvme_ctrl *ctrl)
133151 {
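The comment on nvme_try_sched_reset() above describes the intended call pattern; a hypothetical transport error handler (my sketch, not part of this patch, and the function name is invented) would first win the transition to RESETTING and only then hand the work off to nvme_reset_wq:

static void nvme_foo_error_recovery(struct nvme_ctrl *ctrl)
{
	/* Lose here and the controller is already resetting or being removed. */
	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
		return;

	/* -EBUSY means a hot removal (or a competing teardown) got in first. */
	if (nvme_try_sched_reset(ctrl) != 0)
		dev_warn(ctrl->device, "failed to schedule reset work\n");
}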
....@@ -146,8 +164,7 @@
146164 ret = nvme_reset_ctrl(ctrl);
147165 if (!ret) {
148166 flush_work(&ctrl->reset_work);
149
- if (ctrl->state != NVME_CTRL_LIVE &&
150
- ctrl->state != NVME_CTRL_ADMIN_ONLY)
167
+ if (ctrl->state != NVME_CTRL_LIVE)
151168 ret = -ENETRESET;
152169 }
153170
....@@ -155,11 +172,8 @@
155172 }
156173 EXPORT_SYMBOL_GPL(nvme_reset_ctrl_sync);
157174
158
-static void nvme_delete_ctrl_work(struct work_struct *work)
175
+static void nvme_do_delete_ctrl(struct nvme_ctrl *ctrl)
159176 {
160
- struct nvme_ctrl *ctrl =
161
- container_of(work, struct nvme_ctrl, delete_work);
162
-
163177 dev_info(ctrl->device,
164178 "Removing ctrl: NQN \"%s\"\n", ctrl->opts->subsysnqn);
165179
....@@ -168,7 +182,14 @@
168182 nvme_remove_namespaces(ctrl);
169183 ctrl->ops->delete_ctrl(ctrl);
170184 nvme_uninit_ctrl(ctrl);
171
- nvme_put_ctrl(ctrl);
185
+}
186
+
187
+static void nvme_delete_ctrl_work(struct work_struct *work)
188
+{
189
+ struct nvme_ctrl *ctrl =
190
+ container_of(work, struct nvme_ctrl, delete_work);
191
+
192
+ nvme_do_delete_ctrl(ctrl);
172193 }
173194
174195 int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
....@@ -181,36 +202,28 @@
181202 }
182203 EXPORT_SYMBOL_GPL(nvme_delete_ctrl);
183204
184
-int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl)
205
+static void nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl)
185206 {
186
- int ret = 0;
187
-
188207 /*
189
- * Keep a reference until the work is flushed since ->delete_ctrl
190
- * can free the controller.
208
+ * Keep a reference until nvme_do_delete_ctrl() complete,
209
+ * since ->delete_ctrl can free the controller.
191210 */
192211 nvme_get_ctrl(ctrl);
193
- ret = nvme_delete_ctrl(ctrl);
194
- if (!ret)
195
- flush_work(&ctrl->delete_work);
212
+ if (nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
213
+ nvme_do_delete_ctrl(ctrl);
196214 nvme_put_ctrl(ctrl);
197
- return ret;
198
-}
199
-EXPORT_SYMBOL_GPL(nvme_delete_ctrl_sync);
200
-
201
-static inline bool nvme_ns_has_pi(struct nvme_ns *ns)
202
-{
203
- return ns->pi_type && ns->ms == sizeof(struct t10_pi_tuple);
204215 }
205216
206
-static blk_status_t nvme_error_status(struct request *req)
217
+static blk_status_t nvme_error_status(u16 status)
207218 {
208
- switch (nvme_req(req)->status & 0x7ff) {
219
+ switch (status & 0x7ff) {
209220 case NVME_SC_SUCCESS:
210221 return BLK_STS_OK;
211222 case NVME_SC_CAP_EXCEEDED:
212223 return BLK_STS_NOSPC;
213224 case NVME_SC_LBA_RANGE:
225
+ case NVME_SC_CMD_INTERRUPTED:
226
+ case NVME_SC_NS_NOT_READY:
214227 return BLK_STS_TARGET;
215228 case NVME_SC_BAD_ATTRIBUTES:
216229 case NVME_SC_ONCS_NOT_SUPPORTED:
....@@ -232,52 +245,131 @@
232245 return BLK_STS_PROTECTION;
233246 case NVME_SC_RESERVATION_CONFLICT:
234247 return BLK_STS_NEXUS;
248
+ case NVME_SC_HOST_PATH_ERROR:
249
+ return BLK_STS_TRANSPORT;
250
+ case NVME_SC_ZONE_TOO_MANY_ACTIVE:
251
+ return BLK_STS_ZONE_ACTIVE_RESOURCE;
252
+ case NVME_SC_ZONE_TOO_MANY_OPEN:
253
+ return BLK_STS_ZONE_OPEN_RESOURCE;
235254 default:
236255 return BLK_STS_IOERR;
237256 }
238257 }
239258
240
-static inline bool nvme_req_needs_retry(struct request *req)
259
+static void nvme_retry_req(struct request *req)
241260 {
242
- if (blk_noretry_request(req))
243
- return false;
244
- if (nvme_req(req)->status & NVME_SC_DNR)
245
- return false;
246
- if (nvme_req(req)->retries >= nvme_max_retries)
247
- return false;
248
- return true;
261
+ struct nvme_ns *ns = req->q->queuedata;
262
+ unsigned long delay = 0;
263
+ u16 crd;
264
+
265
+ /* The mask and shift result must be <= 3 */
266
+ crd = (nvme_req(req)->status & NVME_SC_CRD) >> 11;
267
+ if (ns && crd)
268
+ delay = ns->ctrl->crdt[crd - 1] * 100;
269
+
270
+ nvme_req(req)->retries++;
271
+ blk_mq_requeue_request(req, false);
272
+ blk_mq_delay_kick_requeue_list(req->q, delay);
273
+}
274
+
275
+enum nvme_disposition {
276
+ COMPLETE,
277
+ RETRY,
278
+ FAILOVER,
279
+};
280
+
281
+static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
282
+{
283
+ if (likely(nvme_req(req)->status == 0))
284
+ return COMPLETE;
285
+
286
+ if (blk_noretry_request(req) ||
287
+ (nvme_req(req)->status & NVME_SC_DNR) ||
288
+ nvme_req(req)->retries >= nvme_max_retries)
289
+ return COMPLETE;
290
+
291
+ if (req->cmd_flags & REQ_NVME_MPATH) {
292
+ if (nvme_is_path_error(nvme_req(req)->status) ||
293
+ blk_queue_dying(req->q))
294
+ return FAILOVER;
295
+ } else {
296
+ if (blk_queue_dying(req->q))
297
+ return COMPLETE;
298
+ }
299
+
300
+ return RETRY;
301
+}
302
+
303
+static inline void nvme_end_req(struct request *req)
304
+{
305
+ blk_status_t status = nvme_error_status(nvme_req(req)->status);
306
+
307
+ if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
308
+ req_op(req) == REQ_OP_ZONE_APPEND)
309
+ req->__sector = nvme_lba_to_sect(req->q->queuedata,
310
+ le64_to_cpu(nvme_req(req)->result.u64));
311
+
312
+ nvme_trace_bio_complete(req, status);
313
+ blk_mq_end_request(req, status);
249314 }
250315
251316 void nvme_complete_rq(struct request *req)
252317 {
253
- blk_status_t status = nvme_error_status(req);
254
-
255318 trace_nvme_complete_rq(req);
319
+ nvme_cleanup_cmd(req);
256320
257
- if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
258
- if ((req->cmd_flags & REQ_NVME_MPATH) && nvme_failover_req(req))
259
- return;
321
+ if (nvme_req(req)->ctrl->kas)
322
+ nvme_req(req)->ctrl->comp_seen = true;
260323
261
- if (!blk_queue_dying(req->q)) {
262
- nvme_req(req)->retries++;
263
- blk_mq_requeue_request(req, true);
264
- return;
265
- }
324
+ switch (nvme_decide_disposition(req)) {
325
+ case COMPLETE:
326
+ nvme_end_req(req);
327
+ return;
328
+ case RETRY:
329
+ nvme_retry_req(req);
330
+ return;
331
+ case FAILOVER:
332
+ nvme_failover_req(req);
333
+ return;
266334 }
267
- blk_mq_end_request(req, status);
268335 }
269336 EXPORT_SYMBOL_GPL(nvme_complete_rq);
270337
271
-void nvme_cancel_request(struct request *req, void *data, bool reserved)
338
+bool nvme_cancel_request(struct request *req, void *data, bool reserved)
272339 {
273340 dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device,
274341 "Cancelling I/O %d", req->tag);
275342
276
- nvme_req(req)->status = NVME_SC_ABORT_REQ;
277
- blk_mq_complete_request(req);
343
+ /* don't abort one completed request */
344
+ if (blk_mq_request_completed(req))
345
+ return true;
278346
347
+ nvme_req(req)->status = NVME_SC_HOST_ABORTED_CMD;
348
+ nvme_req(req)->flags |= NVME_REQ_CANCELLED;
349
+ blk_mq_complete_request(req);
350
+ return true;
279351 }
280352 EXPORT_SYMBOL_GPL(nvme_cancel_request);
353
+
354
+void nvme_cancel_tagset(struct nvme_ctrl *ctrl)
355
+{
356
+ if (ctrl->tagset) {
357
+ blk_mq_tagset_busy_iter(ctrl->tagset,
358
+ nvme_cancel_request, ctrl);
359
+ blk_mq_tagset_wait_completed_request(ctrl->tagset);
360
+ }
361
+}
362
+EXPORT_SYMBOL_GPL(nvme_cancel_tagset);
363
+
364
+void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl)
365
+{
366
+ if (ctrl->admin_tagset) {
367
+ blk_mq_tagset_busy_iter(ctrl->admin_tagset,
368
+ nvme_cancel_request, ctrl);
369
+ blk_mq_tagset_wait_completed_request(ctrl->admin_tagset);
370
+ }
371
+}
372
+EXPORT_SYMBOL_GPL(nvme_cancel_admin_tagset);
281373
282374 bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
283375 enum nvme_ctrl_state new_state)
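The new nvme_retry_req() above derives the requeue delay from the Command Retry Delay (CRD) field of the completion status: bits 12:11 of the driver's status word (NVME_SC_CRD, 0x1800) select one of the three CRDT values the controller reported in Identify Controller, each in units of 100 milliseconds. A small standalone illustration of that arithmetic (the status and CRDT values are made up; only the mask, shift and scaling mirror the driver code):

#include <stdio.h>
#include <stdint.h>

#define NVME_SC_CRD	0x1800	/* bits 12:11 of the translated status word */

int main(void)
{
	uint16_t status = 0x0800;		/* hypothetical status with CRD = 1 */
	uint16_t crdt[3] = { 15, 0, 0 };	/* hypothetical CRDT1..CRDT3 values */
	uint16_t crd = (status & NVME_SC_CRD) >> 11;
	unsigned long delay_ms = crd ? crdt[crd - 1] * 100 : 0;

	printf("requeue delay: %lu ms\n", delay_ms);	/* prints 1500 */
	return 0;
}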
....@@ -290,22 +382,13 @@
290382
291383 old_state = ctrl->state;
292384 switch (new_state) {
293
- case NVME_CTRL_ADMIN_ONLY:
294
- switch (old_state) {
295
- case NVME_CTRL_CONNECTING:
296
- changed = true;
297
- /* FALLTHRU */
298
- default:
299
- break;
300
- }
301
- break;
302385 case NVME_CTRL_LIVE:
303386 switch (old_state) {
304387 case NVME_CTRL_NEW:
305388 case NVME_CTRL_RESETTING:
306389 case NVME_CTRL_CONNECTING:
307390 changed = true;
308
- /* FALLTHRU */
391
+ fallthrough;
309392 default:
310393 break;
311394 }
....@@ -314,9 +397,8 @@
314397 switch (old_state) {
315398 case NVME_CTRL_NEW:
316399 case NVME_CTRL_LIVE:
317
- case NVME_CTRL_ADMIN_ONLY:
318400 changed = true;
319
- /* FALLTHRU */
401
+ fallthrough;
320402 default:
321403 break;
322404 }
....@@ -326,7 +408,7 @@
326408 case NVME_CTRL_NEW:
327409 case NVME_CTRL_RESETTING:
328410 changed = true;
329
- /* FALLTHRU */
411
+ fallthrough;
330412 default:
331413 break;
332414 }
....@@ -334,11 +416,20 @@
334416 case NVME_CTRL_DELETING:
335417 switch (old_state) {
336418 case NVME_CTRL_LIVE:
337
- case NVME_CTRL_ADMIN_ONLY:
338419 case NVME_CTRL_RESETTING:
339420 case NVME_CTRL_CONNECTING:
340421 changed = true;
341
- /* FALLTHRU */
422
+ fallthrough;
423
+ default:
424
+ break;
425
+ }
426
+ break;
427
+ case NVME_CTRL_DELETING_NOIO:
428
+ switch (old_state) {
429
+ case NVME_CTRL_DELETING:
430
+ case NVME_CTRL_DEAD:
431
+ changed = true;
432
+ fallthrough;
342433 default:
343434 break;
344435 }
....@@ -347,7 +438,7 @@
347438 switch (old_state) {
348439 case NVME_CTRL_DELETING:
349440 changed = true;
350
- /* FALLTHRU */
441
+ fallthrough;
351442 default:
352443 break;
353444 }
....@@ -356,8 +447,10 @@
356447 break;
357448 }
358449
359
- if (changed)
450
+ if (changed) {
360451 ctrl->state = new_state;
452
+ wake_up_all(&ctrl->state_wq);
453
+ }
361454
362455 spin_unlock_irqrestore(&ctrl->lock, flags);
363456 if (changed && ctrl->state == NVME_CTRL_LIVE)
....@@ -366,6 +459,40 @@
366459 }
367460 EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);
368461
462
+/*
463
+ * Returns true for sink states that can't ever transition back to live.
464
+ */
465
+static bool nvme_state_terminal(struct nvme_ctrl *ctrl)
466
+{
467
+ switch (ctrl->state) {
468
+ case NVME_CTRL_NEW:
469
+ case NVME_CTRL_LIVE:
470
+ case NVME_CTRL_RESETTING:
471
+ case NVME_CTRL_CONNECTING:
472
+ return false;
473
+ case NVME_CTRL_DELETING:
474
+ case NVME_CTRL_DELETING_NOIO:
475
+ case NVME_CTRL_DEAD:
476
+ return true;
477
+ default:
478
+ WARN_ONCE(1, "Unhandled ctrl state:%d", ctrl->state);
479
+ return true;
480
+ }
481
+}
482
+
483
+/*
484
+ * Waits for the controller state to be resetting, or returns false if it is
485
+ * not possible to ever transition to that state.
486
+ */
487
+bool nvme_wait_reset(struct nvme_ctrl *ctrl)
488
+{
489
+ wait_event(ctrl->state_wq,
490
+ nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING) ||
491
+ nvme_state_terminal(ctrl));
492
+ return ctrl->state == NVME_CTRL_RESETTING;
493
+}
494
+EXPORT_SYMBOL_GPL(nvme_wait_reset);
495
+
369496 static void nvme_free_ns_head(struct kref *ref)
370497 {
371498 struct nvme_ns_head *head =
....@@ -373,8 +500,7 @@
373500
374501 nvme_mpath_remove_disk(head);
375502 ida_simple_remove(&head->subsys->ns_ida, head->instance);
376
- list_del_init(&head->entry);
377
- cleanup_srcu_struct_quiesced(&head->srcu);
503
+ cleanup_srcu_struct(&head->srcu);
378504 nvme_put_subsystem(head->subsys);
379505 kfree(head);
380506 }
....@@ -397,42 +523,61 @@
397523 kfree(ns);
398524 }
399525
400
-static void nvme_put_ns(struct nvme_ns *ns)
526
+void nvme_put_ns(struct nvme_ns *ns)
401527 {
402528 kref_put(&ns->kref, nvme_free_ns);
403529 }
530
+EXPORT_SYMBOL_NS_GPL(nvme_put_ns, NVME_TARGET_PASSTHRU);
404531
405532 static inline void nvme_clear_nvme_request(struct request *req)
406533 {
407
- if (!(req->rq_flags & RQF_DONTPREP)) {
408
- nvme_req(req)->retries = 0;
409
- nvme_req(req)->flags = 0;
410
- req->rq_flags |= RQF_DONTPREP;
411
- }
534
+ nvme_req(req)->retries = 0;
535
+ nvme_req(req)->flags = 0;
536
+ req->rq_flags |= RQF_DONTPREP;
412537 }
413538
414
-struct request *nvme_alloc_request(struct request_queue *q,
415
- struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid)
539
+static inline unsigned int nvme_req_op(struct nvme_command *cmd)
416540 {
417
- unsigned op = nvme_is_write(cmd) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN;
418
- struct request *req;
541
+ return nvme_is_write(cmd) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN;
542
+}
419543
420
- if (qid == NVME_QID_ANY) {
421
- req = blk_mq_alloc_request(q, op, flags);
422
- } else {
423
- req = blk_mq_alloc_request_hctx(q, op, flags,
424
- qid ? qid - 1 : 0);
425
- }
426
- if (IS_ERR(req))
427
- return req;
544
+static inline void nvme_init_request(struct request *req,
545
+ struct nvme_command *cmd)
546
+{
547
+ if (req->q->queuedata)
548
+ req->timeout = NVME_IO_TIMEOUT;
549
+ else /* no queuedata implies admin queue */
550
+ req->timeout = ADMIN_TIMEOUT;
428551
429552 req->cmd_flags |= REQ_FAILFAST_DRIVER;
430553 nvme_clear_nvme_request(req);
431554 nvme_req(req)->cmd = cmd;
555
+}
432556
557
+struct request *nvme_alloc_request(struct request_queue *q,
558
+ struct nvme_command *cmd, blk_mq_req_flags_t flags)
559
+{
560
+ struct request *req;
561
+
562
+ req = blk_mq_alloc_request(q, nvme_req_op(cmd), flags);
563
+ if (!IS_ERR(req))
564
+ nvme_init_request(req, cmd);
433565 return req;
434566 }
435567 EXPORT_SYMBOL_GPL(nvme_alloc_request);
568
+
569
+struct request *nvme_alloc_request_qid(struct request_queue *q,
570
+ struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid)
571
+{
572
+ struct request *req;
573
+
574
+ req = blk_mq_alloc_request_hctx(q, nvme_req_op(cmd), flags,
575
+ qid ? qid - 1 : 0);
576
+ if (!IS_ERR(req))
577
+ nvme_init_request(req, cmd);
578
+ return req;
579
+}
580
+EXPORT_SYMBOL_GPL(nvme_alloc_request_qid);
436581
437582 static int nvme_toggle_streams(struct nvme_ctrl *ctrl, bool enable)
438583 {
....@@ -470,7 +615,7 @@
470615
471616 c.directive.opcode = nvme_admin_directive_recv;
472617 c.directive.nsid = cpu_to_le32(nsid);
473
- c.directive.numd = cpu_to_le32((sizeof(*s) >> 2) - 1);
618
+ c.directive.numd = cpu_to_le32(nvme_bytes_to_numd(sizeof(*s)));
474619 c.directive.doper = NVME_DIR_RCV_ST_OP_PARAM;
475620 c.directive.dtype = NVME_DIR_STREAMS;
476621
....@@ -493,19 +638,22 @@
493638
494639 ret = nvme_get_stream_params(ctrl, &s, NVME_NSID_ALL);
495640 if (ret)
496
- return ret;
641
+ goto out_disable_stream;
497642
498643 ctrl->nssa = le16_to_cpu(s.nssa);
499644 if (ctrl->nssa < BLK_MAX_WRITE_HINTS - 1) {
500645 dev_info(ctrl->device, "too few streams (%u) available\n",
501646 ctrl->nssa);
502
- nvme_disable_streams(ctrl);
503
- return 0;
647
+ goto out_disable_stream;
504648 }
505649
506
- ctrl->nr_streams = min_t(unsigned, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1);
650
+ ctrl->nr_streams = min_t(u16, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1);
507651 dev_info(ctrl->device, "Using %u streams\n", ctrl->nr_streams);
508652 return 0;
653
+
654
+out_disable_stream:
655
+ nvme_disable_streams(ctrl);
656
+ return ret;
509657 }
510658
511659 /*
....@@ -533,10 +681,17 @@
533681 req->q->write_hints[streamid] += blk_rq_bytes(req) >> 9;
534682 }
535683
684
+static inline void nvme_setup_passthrough(struct request *req,
685
+ struct nvme_command *cmd)
686
+{
687
+ memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd));
688
+ /* passthru commands should let the driver set the SGL flags */
689
+ cmd->common.flags &= ~NVME_CMD_SGL_ALL;
690
+}
691
+
536692 static inline void nvme_setup_flush(struct nvme_ns *ns,
537693 struct nvme_command *cmnd)
538694 {
539
- memset(cmnd, 0, sizeof(*cmnd));
540695 cmnd->common.opcode = nvme_cmd_flush;
541696 cmnd->common.nsid = cpu_to_le32(ns->head->ns_id);
542697 }
....@@ -568,16 +723,26 @@
568723 range = page_address(ns->ctrl->discard_page);
569724 }
570725
571
- __rq_for_each_bio(bio, req) {
572
- u64 slba = nvme_block_nr(ns, bio->bi_iter.bi_sector);
573
- u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift;
726
+ if (queue_max_discard_segments(req->q) == 1) {
727
+ u64 slba = nvme_sect_to_lba(ns, blk_rq_pos(req));
728
+ u32 nlb = blk_rq_sectors(req) >> (ns->lba_shift - 9);
574729
575
- if (n < segments) {
576
- range[n].cattr = cpu_to_le32(0);
577
- range[n].nlb = cpu_to_le32(nlb);
578
- range[n].slba = cpu_to_le64(slba);
730
+ range[0].cattr = cpu_to_le32(0);
731
+ range[0].nlb = cpu_to_le32(nlb);
732
+ range[0].slba = cpu_to_le64(slba);
733
+ n = 1;
734
+ } else {
735
+ __rq_for_each_bio(bio, req) {
736
+ u64 slba = nvme_sect_to_lba(ns, bio->bi_iter.bi_sector);
737
+ u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift;
738
+
739
+ if (n < segments) {
740
+ range[n].cattr = cpu_to_le32(0);
741
+ range[n].nlb = cpu_to_le32(nlb);
742
+ range[n].slba = cpu_to_le64(slba);
743
+ }
744
+ n++;
579745 }
580
- n++;
581746 }
582747
583748 if (WARN_ON_ONCE(n != segments)) {
....@@ -588,7 +753,6 @@
588753 return BLK_STS_IOERR;
589754 }
590755
591
- memset(cmnd, 0, sizeof(*cmnd));
592756 cmnd->dsm.opcode = nvme_cmd_dsm;
593757 cmnd->dsm.nsid = cpu_to_le32(ns->head->ns_id);
594758 cmnd->dsm.nr = cpu_to_le32(segments - 1);
....@@ -602,8 +766,28 @@
602766 return BLK_STS_OK;
603767 }
604768
605
-static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
769
+static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
606770 struct request *req, struct nvme_command *cmnd)
771
+{
772
+ if (ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
773
+ return nvme_setup_discard(ns, req, cmnd);
774
+
775
+ cmnd->write_zeroes.opcode = nvme_cmd_write_zeroes;
776
+ cmnd->write_zeroes.nsid = cpu_to_le32(ns->head->ns_id);
777
+ cmnd->write_zeroes.slba =
778
+ cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
779
+ cmnd->write_zeroes.length =
780
+ cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
781
+ if (nvme_ns_has_pi(ns))
782
+ cmnd->write_zeroes.control = cpu_to_le16(NVME_RW_PRINFO_PRACT);
783
+ else
784
+ cmnd->write_zeroes.control = 0;
785
+ return BLK_STS_OK;
786
+}
787
+
788
+static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
789
+ struct request *req, struct nvme_command *cmnd,
790
+ enum nvme_opcode op)
607791 {
608792 struct nvme_ctrl *ctrl = ns->ctrl;
609793 u16 control = 0;
....@@ -617,10 +801,9 @@
617801 if (req->cmd_flags & REQ_RAHEAD)
618802 dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
619803
620
- memset(cmnd, 0, sizeof(*cmnd));
621
- cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
804
+ cmnd->rw.opcode = op;
622805 cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
623
- cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
806
+ cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
624807 cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
625808
626809 if (req_op(req) == REQ_OP_WRITE && ctrl->nr_streams)
....@@ -637,8 +820,6 @@
637820 if (WARN_ON_ONCE(!nvme_ns_has_pi(ns)))
638821 return BLK_STS_NOTSUPP;
639822 control |= NVME_RW_PRINFO_PRACT;
640
- } else if (req_op(req) == REQ_OP_WRITE) {
641
- t10_pi_prepare(req, ns->pi_type);
642823 }
643824
644825 switch (ns->pi_type) {
....@@ -649,6 +830,8 @@
649830 case NVME_NS_DPS_PI_TYPE2:
650831 control |= NVME_RW_PRINFO_PRCHK_GUARD |
651832 NVME_RW_PRINFO_PRCHK_REF;
833
+ if (op == nvme_cmd_zone_append)
834
+ control |= NVME_RW_APPEND_PIREMAP;
652835 cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
653836 break;
654837 }
....@@ -661,13 +844,6 @@
661844
662845 void nvme_cleanup_cmd(struct request *req)
663846 {
664
- if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ &&
665
- nvme_req(req)->status == 0) {
666
- struct nvme_ns *ns = req->rq_disk->private_data;
667
-
668
- t10_pi_complete(req, ns->pi_type,
669
- blk_rq_bytes(req) >> ns->lba_shift);
670
- }
671847 if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
672848 struct nvme_ns *ns = req->rq_disk->private_data;
673849 struct page *page = req->special_vec.bv_page;
....@@ -683,37 +859,86 @@
683859 blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
684860 struct nvme_command *cmd)
685861 {
862
+ struct nvme_ctrl *ctrl = nvme_req(req)->ctrl;
686863 blk_status_t ret = BLK_STS_OK;
687864
688
- nvme_clear_nvme_request(req);
865
+ if (!(req->rq_flags & RQF_DONTPREP))
866
+ nvme_clear_nvme_request(req);
689867
868
+ memset(cmd, 0, sizeof(*cmd));
690869 switch (req_op(req)) {
691870 case REQ_OP_DRV_IN:
692871 case REQ_OP_DRV_OUT:
693
- memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd));
872
+ nvme_setup_passthrough(req, cmd);
694873 break;
695874 case REQ_OP_FLUSH:
696875 nvme_setup_flush(ns, cmd);
697876 break;
877
+ case REQ_OP_ZONE_RESET_ALL:
878
+ case REQ_OP_ZONE_RESET:
879
+ ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_RESET);
880
+ break;
881
+ case REQ_OP_ZONE_OPEN:
882
+ ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OPEN);
883
+ break;
884
+ case REQ_OP_ZONE_CLOSE:
885
+ ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_CLOSE);
886
+ break;
887
+ case REQ_OP_ZONE_FINISH:
888
+ ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH);
889
+ break;
698890 case REQ_OP_WRITE_ZEROES:
699
- /* currently only aliased to deallocate for a few ctrls: */
891
+ ret = nvme_setup_write_zeroes(ns, req, cmd);
892
+ break;
700893 case REQ_OP_DISCARD:
701894 ret = nvme_setup_discard(ns, req, cmd);
702895 break;
703896 case REQ_OP_READ:
897
+ ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read);
898
+ break;
704899 case REQ_OP_WRITE:
705
- ret = nvme_setup_rw(ns, req, cmd);
900
+ ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write);
901
+ break;
902
+ case REQ_OP_ZONE_APPEND:
903
+ ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append);
706904 break;
707905 default:
708906 WARN_ON_ONCE(1);
709907 return BLK_STS_IOERR;
710908 }
711909
712
- cmd->common.command_id = req->tag;
910
+ if (!(ctrl->quirks & NVME_QUIRK_SKIP_CID_GEN))
911
+ nvme_req(req)->genctr++;
912
+ cmd->common.command_id = nvme_cid(req);
713913 trace_nvme_setup_cmd(req, cmd);
714914 return ret;
715915 }
716916 EXPORT_SYMBOL_GPL(nvme_setup_cmd);
917
+
918
+static void nvme_end_sync_rq(struct request *rq, blk_status_t error)
919
+{
920
+ struct completion *waiting = rq->end_io_data;
921
+
922
+ rq->end_io_data = NULL;
923
+ complete(waiting);
924
+}
925
+
926
+static void nvme_execute_rq_polled(struct request_queue *q,
927
+ struct gendisk *bd_disk, struct request *rq, int at_head)
928
+{
929
+ DECLARE_COMPLETION_ONSTACK(wait);
930
+
931
+ WARN_ON_ONCE(!test_bit(QUEUE_FLAG_POLL, &q->queue_flags));
932
+
933
+ rq->cmd_flags |= REQ_HIPRI;
934
+ rq->end_io_data = &wait;
935
+ blk_execute_rq_nowait(q, bd_disk, rq, at_head, nvme_end_sync_rq);
936
+
937
+ while (!completion_done(&wait)) {
938
+ blk_poll(q, request_to_qc_t(rq->mq_hctx, rq), true);
939
+ cond_resched();
940
+ }
941
+}
717942
718943 /*
719944 * Returns 0 on success. If the result is negative, it's a Linux error code;
....@@ -722,16 +947,20 @@
722947 int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
723948 union nvme_result *result, void *buffer, unsigned bufflen,
724949 unsigned timeout, int qid, int at_head,
725
- blk_mq_req_flags_t flags)
950
+ blk_mq_req_flags_t flags, bool poll)
726951 {
727952 struct request *req;
728953 int ret;
729954
730
- req = nvme_alloc_request(q, cmd, flags, qid);
955
+ if (qid == NVME_QID_ANY)
956
+ req = nvme_alloc_request(q, cmd, flags);
957
+ else
958
+ req = nvme_alloc_request_qid(q, cmd, flags, qid);
731959 if (IS_ERR(req))
732960 return PTR_ERR(req);
733961
734
- req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
962
+ if (timeout)
963
+ req->timeout = timeout;
735964
736965 if (buffer && bufflen) {
737966 ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL);
....@@ -739,7 +968,10 @@
739968 goto out;
740969 }
741970
742
- blk_execute_rq(req->q, NULL, req, at_head);
971
+ if (poll)
972
+ nvme_execute_rq_polled(req->q, NULL, req, at_head);
973
+ else
974
+ blk_execute_rq(req->q, NULL, req, at_head);
743975 if (result)
744976 *result = nvme_req(req)->result;
745977 if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
....@@ -756,7 +988,7 @@
756988 void *buffer, unsigned bufflen)
757989 {
758990 return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0,
759
- NVME_QID_ANY, 0, 0);
991
+ NVME_QID_ANY, 0, 0, false);
760992 }
761993 EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd);
762994
....@@ -794,10 +1026,97 @@
7941026 return ERR_PTR(ret);
7951027 }
7961028
1029
+static u32 nvme_known_admin_effects(u8 opcode)
1030
+{
1031
+ switch (opcode) {
1032
+ case nvme_admin_format_nvm:
1033
+ return NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_NCC |
1034
+ NVME_CMD_EFFECTS_CSE_MASK;
1035
+ case nvme_admin_sanitize_nvm:
1036
+ return NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK;
1037
+ default:
1038
+ break;
1039
+ }
1040
+ return 0;
1041
+}
1042
+
1043
+u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode)
1044
+{
1045
+ u32 effects = 0;
1046
+
1047
+ if (ns) {
1048
+ if (ns->head->effects)
1049
+ effects = le32_to_cpu(ns->head->effects->iocs[opcode]);
1050
+ if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC))
1051
+ dev_warn(ctrl->device,
1052
+ "IO command:%02x has unhandled effects:%08x\n",
1053
+ opcode, effects);
1054
+ return 0;
1055
+ }
1056
+
1057
+ if (ctrl->effects)
1058
+ effects = le32_to_cpu(ctrl->effects->acs[opcode]);
1059
+ effects |= nvme_known_admin_effects(opcode);
1060
+
1061
+ return effects;
1062
+}
1063
+EXPORT_SYMBOL_NS_GPL(nvme_command_effects, NVME_TARGET_PASSTHRU);
1064
+
1065
+static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
1066
+ u8 opcode)
1067
+{
1068
+ u32 effects = nvme_command_effects(ctrl, ns, opcode);
1069
+
1070
+ /*
1071
+ * For simplicity, IO to all namespaces is quiesced even if the command
1072
+ * effects say only one namespace is affected.
1073
+ */
1074
+ if (effects & NVME_CMD_EFFECTS_CSE_MASK) {
1075
+ mutex_lock(&ctrl->scan_lock);
1076
+ mutex_lock(&ctrl->subsys->lock);
1077
+ nvme_mpath_start_freeze(ctrl->subsys);
1078
+ nvme_mpath_wait_freeze(ctrl->subsys);
1079
+ nvme_start_freeze(ctrl);
1080
+ nvme_wait_freeze(ctrl);
1081
+ }
1082
+ return effects;
1083
+}
1084
+
1085
+static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
1086
+{
1087
+ if (effects & NVME_CMD_EFFECTS_CSE_MASK) {
1088
+ nvme_unfreeze(ctrl);
1089
+ nvme_mpath_unfreeze(ctrl->subsys);
1090
+ mutex_unlock(&ctrl->subsys->lock);
1091
+ nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL);
1092
+ mutex_unlock(&ctrl->scan_lock);
1093
+ }
1094
+ if (effects & NVME_CMD_EFFECTS_CCC)
1095
+ nvme_init_identify(ctrl);
1096
+ if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) {
1097
+ nvme_queue_scan(ctrl);
1098
+ flush_work(&ctrl->scan_work);
1099
+ }
1100
+}
1101
+
1102
+void nvme_execute_passthru_rq(struct request *rq)
1103
+{
1104
+ struct nvme_command *cmd = nvme_req(rq)->cmd;
1105
+ struct nvme_ctrl *ctrl = nvme_req(rq)->ctrl;
1106
+ struct nvme_ns *ns = rq->q->queuedata;
1107
+ struct gendisk *disk = ns ? ns->disk : NULL;
1108
+ u32 effects;
1109
+
1110
+ effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode);
1111
+ blk_execute_rq(rq->q, disk, rq, 0);
1112
+ nvme_passthru_end(ctrl, effects);
1113
+}
1114
+EXPORT_SYMBOL_NS_GPL(nvme_execute_passthru_rq, NVME_TARGET_PASSTHRU);
1115
+
7971116 static int nvme_submit_user_cmd(struct request_queue *q,
7981117 struct nvme_command *cmd, void __user *ubuffer,
7991118 unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
800
- u32 meta_seed, u32 *result, unsigned timeout)
1119
+ u32 meta_seed, u64 *result, unsigned timeout)
8011120 {
8021121 bool write = nvme_is_write(cmd);
8031122 struct nvme_ns *ns = q->queuedata;
....@@ -807,11 +1126,12 @@
8071126 void *meta = NULL;
8081127 int ret;
8091128
810
- req = nvme_alloc_request(q, cmd, 0, NVME_QID_ANY);
1129
+ req = nvme_alloc_request(q, cmd, 0);
8111130 if (IS_ERR(req))
8121131 return PTR_ERR(req);
8131132
814
- req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
1133
+ if (timeout)
1134
+ req->timeout = timeout;
8151135 nvme_req(req)->flags |= NVME_REQ_USERCMD;
8161136
8171137 if (ubuffer && bufflen) {
....@@ -832,13 +1152,13 @@
8321152 }
8331153 }
8341154
835
- blk_execute_rq(req->q, disk, req, 0);
1155
+ nvme_execute_passthru_rq(req);
8361156 if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
8371157 ret = -EINTR;
8381158 else
8391159 ret = nvme_req(req)->status;
8401160 if (result)
841
- *result = le32_to_cpu(nvme_req(req)->result.u32);
1161
+ *result = le64_to_cpu(nvme_req(req)->result.u64);
8421162 if (meta && !ret && !write) {
8431163 if (copy_to_user(meta_buffer, meta, meta_len))
8441164 ret = -EFAULT;
....@@ -867,21 +1187,22 @@
8671187 return;
8681188 }
8691189
1190
+ ctrl->comp_seen = false;
8701191 spin_lock_irqsave(&ctrl->lock, flags);
8711192 if (ctrl->state == NVME_CTRL_LIVE ||
8721193 ctrl->state == NVME_CTRL_CONNECTING)
8731194 startka = true;
8741195 spin_unlock_irqrestore(&ctrl->lock, flags);
8751196 if (startka)
876
- schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
1197
+ queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ);
8771198 }
8781199
8791200 static int nvme_keep_alive(struct nvme_ctrl *ctrl)
8801201 {
8811202 struct request *rq;
8821203
883
- rq = nvme_alloc_request(ctrl->admin_q, &ctrl->ka_cmd, BLK_MQ_REQ_RESERVED,
884
- NVME_QID_ANY);
1204
+ rq = nvme_alloc_request(ctrl->admin_q, &ctrl->ka_cmd,
1205
+ BLK_MQ_REQ_RESERVED);
8851206 if (IS_ERR(rq))
8861207 return PTR_ERR(rq);
8871208
....@@ -897,6 +1218,15 @@
8971218 {
8981219 struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
8991220 struct nvme_ctrl, ka_work);
1221
+ bool comp_seen = ctrl->comp_seen;
1222
+
1223
+ if ((ctrl->ctratt & NVME_CTRL_ATTR_TBKAS) && comp_seen) {
1224
+ dev_dbg(ctrl->device,
1225
+ "reschedule traffic based keep-alive timer\n");
1226
+ ctrl->comp_seen = false;
1227
+ queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ);
1228
+ return;
1229
+ }
9001230
9011231 if (nvme_keep_alive(ctrl)) {
9021232 /* allocation failure, reset the controller */
....@@ -911,7 +1241,7 @@
9111241 if (unlikely(ctrl->kato == 0))
9121242 return;
9131243
914
- schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
1244
+ queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ);
9151245 }
9161246
9171247 void nvme_stop_keep_alive(struct nvme_ctrl *ctrl)
....@@ -956,14 +1286,75 @@
9561286 return error;
9571287 }
9581288
1289
+static bool nvme_multi_css(struct nvme_ctrl *ctrl)
1290
+{
1291
+ return (ctrl->ctrl_config & NVME_CC_CSS_MASK) == NVME_CC_CSS_CSI;
1292
+}
1293
+
1294
+static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids,
1295
+ struct nvme_ns_id_desc *cur, bool *csi_seen)
1296
+{
1297
+ const char *warn_str = "ctrl returned bogus length:";
1298
+ void *data = cur;
1299
+
1300
+ switch (cur->nidt) {
1301
+ case NVME_NIDT_EUI64:
1302
+ if (cur->nidl != NVME_NIDT_EUI64_LEN) {
1303
+ dev_warn(ctrl->device, "%s %d for NVME_NIDT_EUI64\n",
1304
+ warn_str, cur->nidl);
1305
+ return -1;
1306
+ }
1307
+ if (ctrl->quirks & NVME_QUIRK_BOGUS_NID)
1308
+ return NVME_NIDT_EUI64_LEN;
1309
+ memcpy(ids->eui64, data + sizeof(*cur), NVME_NIDT_EUI64_LEN);
1310
+ return NVME_NIDT_EUI64_LEN;
1311
+ case NVME_NIDT_NGUID:
1312
+ if (cur->nidl != NVME_NIDT_NGUID_LEN) {
1313
+ dev_warn(ctrl->device, "%s %d for NVME_NIDT_NGUID\n",
1314
+ warn_str, cur->nidl);
1315
+ return -1;
1316
+ }
1317
+ if (ctrl->quirks & NVME_QUIRK_BOGUS_NID)
1318
+ return NVME_NIDT_NGUID_LEN;
1319
+ memcpy(ids->nguid, data + sizeof(*cur), NVME_NIDT_NGUID_LEN);
1320
+ return NVME_NIDT_NGUID_LEN;
1321
+ case NVME_NIDT_UUID:
1322
+ if (cur->nidl != NVME_NIDT_UUID_LEN) {
1323
+ dev_warn(ctrl->device, "%s %d for NVME_NIDT_UUID\n",
1324
+ warn_str, cur->nidl);
1325
+ return -1;
1326
+ }
1327
+ if (ctrl->quirks & NVME_QUIRK_BOGUS_NID)
1328
+ return NVME_NIDT_UUID_LEN;
1329
+ uuid_copy(&ids->uuid, data + sizeof(*cur));
1330
+ return NVME_NIDT_UUID_LEN;
1331
+ case NVME_NIDT_CSI:
1332
+ if (cur->nidl != NVME_NIDT_CSI_LEN) {
1333
+ dev_warn(ctrl->device, "%s %d for NVME_NIDT_CSI\n",
1334
+ warn_str, cur->nidl);
1335
+ return -1;
1336
+ }
1337
+ memcpy(&ids->csi, data + sizeof(*cur), NVME_NIDT_CSI_LEN);
1338
+ *csi_seen = true;
1339
+ return NVME_NIDT_CSI_LEN;
1340
+ default:
1341
+ /* Skip unknown types */
1342
+ return cur->nidl;
1343
+ }
1344
+}
1345
+
9591346 static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
9601347 struct nvme_ns_ids *ids)
9611348 {
9621349 struct nvme_command c = { };
963
- int status;
1350
+ bool csi_seen = false;
1351
+ int status, pos, len;
9641352 void *data;
965
- int pos;
966
- int len;
1353
+
1354
+ if (ctrl->vs < NVME_VS(1, 3, 0) && !nvme_multi_css(ctrl))
1355
+ return 0;
1356
+ if (ctrl->quirks & NVME_QUIRK_NO_NS_DESC_LIST)
1357
+ return 0;
9671358
9681359 c.identify.opcode = nvme_admin_identify;
9691360 c.identify.nsid = cpu_to_le32(nsid);
....@@ -975,8 +1366,11 @@
9751366
9761367 status = nvme_submit_sync_cmd(ctrl->admin_q, &c, data,
9771368 NVME_IDENTIFY_DATA_SIZE);
978
- if (status)
1369
+ if (status) {
1370
+ dev_warn(ctrl->device,
1371
+ "Identify Descriptors failed (%d)\n", status);
9791372 goto free_data;
1373
+ }
9801374
9811375 for (pos = 0; pos < NVME_IDENTIFY_DATA_SIZE; pos += len) {
9821376 struct nvme_ns_id_desc *cur = data + pos;
....@@ -984,65 +1378,27 @@
9841378 if (cur->nidl == 0)
9851379 break;
9861380
987
- switch (cur->nidt) {
988
- case NVME_NIDT_EUI64:
989
- if (cur->nidl != NVME_NIDT_EUI64_LEN) {
990
- dev_warn(ctrl->device,
991
- "ctrl returned bogus length: %d for NVME_NIDT_EUI64\n",
992
- cur->nidl);
993
- goto free_data;
994
- }
995
- len = NVME_NIDT_EUI64_LEN;
996
- memcpy(ids->eui64, data + pos + sizeof(*cur), len);
1381
+ len = nvme_process_ns_desc(ctrl, ids, cur, &csi_seen);
1382
+ if (len < 0)
9971383 break;
998
- case NVME_NIDT_NGUID:
999
- if (cur->nidl != NVME_NIDT_NGUID_LEN) {
1000
- dev_warn(ctrl->device,
1001
- "ctrl returned bogus length: %d for NVME_NIDT_NGUID\n",
1002
- cur->nidl);
1003
- goto free_data;
1004
- }
1005
- len = NVME_NIDT_NGUID_LEN;
1006
- memcpy(ids->nguid, data + pos + sizeof(*cur), len);
1007
- break;
1008
- case NVME_NIDT_UUID:
1009
- if (cur->nidl != NVME_NIDT_UUID_LEN) {
1010
- dev_warn(ctrl->device,
1011
- "ctrl returned bogus length: %d for NVME_NIDT_UUID\n",
1012
- cur->nidl);
1013
- goto free_data;
1014
- }
1015
- len = NVME_NIDT_UUID_LEN;
1016
- uuid_copy(&ids->uuid, data + pos + sizeof(*cur));
1017
- break;
1018
- default:
1019
- /* Skip unnkown types */
1020
- len = cur->nidl;
1021
- break;
1022
- }
10231384
10241385 len += sizeof(*cur);
10251386 }
1387
+
1388
+ if (nvme_multi_css(ctrl) && !csi_seen) {
1389
+ dev_warn(ctrl->device, "Command set not reported for nsid:%d\n",
1390
+ nsid);
1391
+ status = -EINVAL;
1392
+ }
1393
+
10261394 free_data:
10271395 kfree(data);
10281396 return status;
10291397 }
10301398
1031
-static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *ns_list)
1399
+static int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
1400
+ struct nvme_ns_ids *ids, struct nvme_id_ns **id)
10321401 {
1033
- struct nvme_command c = { };
1034
-
1035
- c.identify.opcode = nvme_admin_identify;
1036
- c.identify.cns = NVME_ID_CNS_NS_ACTIVE_LIST;
1037
- c.identify.nsid = cpu_to_le32(nsid);
1038
- return nvme_submit_sync_cmd(dev->admin_q, &c, ns_list,
1039
- NVME_IDENTIFY_DATA_SIZE);
1040
-}
1041
-
1042
-static struct nvme_id_ns *nvme_identify_ns(struct nvme_ctrl *ctrl,
1043
- unsigned nsid)
1044
-{
1045
- struct nvme_id_ns *id;
10461402 struct nvme_command c = { };
10471403 int error;
10481404
....@@ -1051,38 +1407,76 @@
10511407 c.identify.nsid = cpu_to_le32(nsid);
10521408 c.identify.cns = NVME_ID_CNS_NS;
10531409
1054
- id = kmalloc(sizeof(*id), GFP_KERNEL);
1055
- if (!id)
1056
- return NULL;
1410
+ *id = kmalloc(sizeof(**id), GFP_KERNEL);
1411
+ if (!*id)
1412
+ return -ENOMEM;
10571413
1058
- error = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
1414
+ error = nvme_submit_sync_cmd(ctrl->admin_q, &c, *id, sizeof(**id));
10591415 if (error) {
1060
- dev_warn(ctrl->device, "Identify namespace failed\n");
1061
- kfree(id);
1062
- return NULL;
1416
+ dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error);
1417
+ goto out_free_id;
10631418 }
10641419
1065
- return id;
1420
+ error = NVME_SC_INVALID_NS | NVME_SC_DNR;
1421
+ if ((*id)->ncap == 0) /* namespace not allocated or attached */
1422
+ goto out_free_id;
1423
+
1424
+
1425
+ if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) {
1426
+ dev_info(ctrl->device,
1427
+ "Ignoring bogus Namespace Identifiers\n");
1428
+ } else {
1429
+ if (ctrl->vs >= NVME_VS(1, 1, 0) &&
1430
+ !memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
1431
+ memcpy(ids->eui64, (*id)->eui64, sizeof(ids->eui64));
1432
+ if (ctrl->vs >= NVME_VS(1, 2, 0) &&
1433
+ !memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
1434
+ memcpy(ids->nguid, (*id)->nguid, sizeof(ids->nguid));
1435
+ }
1436
+
1437
+ return 0;
1438
+
1439
+out_free_id:
1440
+ kfree(*id);
1441
+ return error;
10661442 }
10671443
1068
-static int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11,
1069
- void *buffer, size_t buflen, u32 *result)
1444
+static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid,
1445
+ unsigned int dword11, void *buffer, size_t buflen, u32 *result)
10701446 {
10711447 union nvme_result res = { 0 };
10721448 struct nvme_command c;
10731449 int ret;
10741450
10751451 memset(&c, 0, sizeof(c));
1076
- c.features.opcode = nvme_admin_set_features;
1452
+ c.features.opcode = op;
10771453 c.features.fid = cpu_to_le32(fid);
10781454 c.features.dword11 = cpu_to_le32(dword11);
10791455
10801456 ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res,
1081
- buffer, buflen, 0, NVME_QID_ANY, 0, 0);
1457
+ buffer, buflen, 0, NVME_QID_ANY, 0, 0, false);
10821458 if (ret >= 0 && result)
10831459 *result = le32_to_cpu(res.u32);
10841460 return ret;
10851461 }
1462
+
1463
+int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid,
1464
+ unsigned int dword11, void *buffer, size_t buflen,
1465
+ u32 *result)
1466
+{
1467
+ return nvme_features(dev, nvme_admin_set_features, fid, dword11, buffer,
1468
+ buflen, result);
1469
+}
1470
+EXPORT_SYMBOL_GPL(nvme_set_features);
1471
+
1472
+int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid,
1473
+ unsigned int dword11, void *buffer, size_t buflen,
1474
+ u32 *result)
1475
+{
1476
+ return nvme_features(dev, nvme_admin_get_features, fid, dword11, buffer,
1477
+ buflen, result);
1478
+}
1479
+EXPORT_SYMBOL_GPL(nvme_get_features);
10861480
10871481 int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
10881482 {
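With nvme_set_features()/nvme_get_features() split out of the old set-features-only helper and exported, code outside core.c can issue Get/Set Features directly. A hypothetical caller sketch (the function name is mine; NVME_FEAT_TEMP_THRESH is the standard composite temperature threshold feature, FID 0x04):

static int nvme_foo_read_temp_thresh(struct nvme_ctrl *ctrl, u32 *thresh)
{
	/* No payload buffer is needed; the value comes back in completion Dword 0. */
	return nvme_get_features(ctrl, NVME_FEAT_TEMP_THRESH, 0, NULL, 0,
				 thresh);
}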
....@@ -1113,7 +1507,8 @@
11131507 EXPORT_SYMBOL_GPL(nvme_set_queue_count);
11141508
11151509 #define NVME_AEN_SUPPORTED \
1116
- (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT | NVME_AEN_CFG_ANA_CHANGE)
1510
+ (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT | \
1511
+ NVME_AEN_CFG_ANA_CHANGE | NVME_AEN_CFG_DISC_CHANGE)
11171512
11181513 static void nvme_enable_aen(struct nvme_ctrl *ctrl)
11191514 {
....@@ -1128,6 +1523,20 @@
11281523 if (status)
11291524 dev_warn(ctrl->device, "Failed to configure AEN (cfg %x)\n",
11301525 supported_aens);
1526
+
1527
+ queue_work(nvme_wq, &ctrl->async_event_work);
1528
+}
1529
+
1530
+/*
1531
+ * Convert integer values from ioctl structures to user pointers, silently
1532
+ * ignoring the upper bits in the compat case to match behaviour of 32-bit
1533
+ * kernels.
1534
+ */
1535
+static void __user *nvme_to_user_ptr(uintptr_t ptrval)
1536
+{
1537
+ if (in_compat_syscall())
1538
+ ptrval = (compat_uptr_t)ptrval;
1539
+ return (void __user *)ptrval;
11311540 }
11321541
11331542 static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
....@@ -1152,10 +1561,23 @@
11521561 }
11531562
11541563 length = (io.nblocks + 1) << ns->lba_shift;
1155
- meta_len = (io.nblocks + 1) * ns->ms;
1156
- metadata = (void __user *)(uintptr_t)io.metadata;
11571564
1158
- if (ns->ext) {
1565
+ if ((io.control & NVME_RW_PRINFO_PRACT) &&
1566
+ ns->ms == sizeof(struct t10_pi_tuple)) {
1567
+ /*
1568
+ * Protection information is stripped/inserted by the
1569
+ * controller.
1570
+ */
1571
+ if (nvme_to_user_ptr(io.metadata))
1572
+ return -EINVAL;
1573
+ meta_len = 0;
1574
+ metadata = NULL;
1575
+ } else {
1576
+ meta_len = (io.nblocks + 1) * ns->ms;
1577
+ metadata = nvme_to_user_ptr(io.metadata);
1578
+ }
1579
+
1580
+ if (ns->features & NVME_NS_EXT_LBAS) {
11591581 length += meta_len;
11601582 meta_len = 0;
11611583 } else if (meta_len) {
....@@ -1176,91 +1598,8 @@
11761598 c.rw.appmask = cpu_to_le16(io.appmask);
11771599
11781600 return nvme_submit_user_cmd(ns->queue, &c,
1179
- (void __user *)(uintptr_t)io.addr, length,
1180
- metadata, meta_len, io.slba, NULL, 0);
1181
-}
1182
-
1183
-static u32 nvme_known_admin_effects(u8 opcode)
1184
-{
1185
- switch (opcode) {
1186
- case nvme_admin_format_nvm:
1187
- return NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC |
1188
- NVME_CMD_EFFECTS_CSE_MASK;
1189
- case nvme_admin_sanitize_nvm:
1190
- return NVME_CMD_EFFECTS_CSE_MASK;
1191
- default:
1192
- break;
1193
- }
1194
- return 0;
1195
-}
1196
-
1197
-static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
1198
- u8 opcode)
1199
-{
1200
- u32 effects = 0;
1201
-
1202
- if (ns) {
1203
- if (ctrl->effects)
1204
- effects = le32_to_cpu(ctrl->effects->iocs[opcode]);
1205
- if (effects & ~NVME_CMD_EFFECTS_CSUPP)
1206
- dev_warn(ctrl->device,
1207
- "IO command:%02x has unhandled effects:%08x\n",
1208
- opcode, effects);
1209
- return 0;
1210
- }
1211
-
1212
- if (ctrl->effects)
1213
- effects = le32_to_cpu(ctrl->effects->acs[opcode]);
1214
- else
1215
- effects = nvme_known_admin_effects(opcode);
1216
-
1217
- /*
1218
- * For simplicity, IO to all namespaces is quiesced even if the command
1219
- * effects say only one namespace is affected.
1220
- */
1221
- if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
1222
- mutex_lock(&ctrl->scan_lock);
1223
- mutex_lock(&ctrl->subsys->lock);
1224
- nvme_mpath_start_freeze(ctrl->subsys);
1225
- nvme_mpath_wait_freeze(ctrl->subsys);
1226
- nvme_start_freeze(ctrl);
1227
- nvme_wait_freeze(ctrl);
1228
- }
1229
- return effects;
1230
-}
1231
-
1232
-static void nvme_update_formats(struct nvme_ctrl *ctrl)
1233
-{
1234
- struct nvme_ns *ns;
1235
-
1236
- down_read(&ctrl->namespaces_rwsem);
1237
- list_for_each_entry(ns, &ctrl->namespaces, list)
1238
- if (ns->disk && nvme_revalidate_disk(ns->disk))
1239
- nvme_set_queue_dying(ns);
1240
- up_read(&ctrl->namespaces_rwsem);
1241
-
1242
- nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL);
1243
-}
1244
-
1245
-static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
1246
-{
1247
- /*
1248
- * Revalidate LBA changes prior to unfreezing. This is necessary to
1249
- * prevent memory corruption if a logical block size was changed by
1250
- * this command.
1251
- */
1252
- if (effects & NVME_CMD_EFFECTS_LBCC)
1253
- nvme_update_formats(ctrl);
1254
- if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
1255
- nvme_unfreeze(ctrl);
1256
- nvme_mpath_unfreeze(ctrl->subsys);
1257
- mutex_unlock(&ctrl->subsys->lock);
1258
- mutex_unlock(&ctrl->scan_lock);
1259
- }
1260
- if (effects & NVME_CMD_EFFECTS_CCC)
1261
- nvme_init_identify(ctrl);
1262
- if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC))
1263
- nvme_queue_scan(ctrl);
1601
+ nvme_to_user_ptr(io.addr), length,
1602
+ metadata, meta_len, lower_32_bits(io.slba), NULL, 0);
12641603 }
12651604
12661605 static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
....@@ -1269,7 +1608,7 @@
12691608 struct nvme_passthru_cmd cmd;
12701609 struct nvme_command c;
12711610 unsigned timeout = 0;
1272
- u32 effects;
1611
+ u64 result;
12731612 int status;
12741613
12751614 if (!capable(CAP_SYS_ADMIN))
....@@ -1285,22 +1624,64 @@
12851624 c.common.nsid = cpu_to_le32(cmd.nsid);
12861625 c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
12871626 c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
1288
- c.common.cdw10[0] = cpu_to_le32(cmd.cdw10);
1289
- c.common.cdw10[1] = cpu_to_le32(cmd.cdw11);
1290
- c.common.cdw10[2] = cpu_to_le32(cmd.cdw12);
1291
- c.common.cdw10[3] = cpu_to_le32(cmd.cdw13);
1292
- c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
1293
- c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);
1627
+ c.common.cdw10 = cpu_to_le32(cmd.cdw10);
1628
+ c.common.cdw11 = cpu_to_le32(cmd.cdw11);
1629
+ c.common.cdw12 = cpu_to_le32(cmd.cdw12);
1630
+ c.common.cdw13 = cpu_to_le32(cmd.cdw13);
1631
+ c.common.cdw14 = cpu_to_le32(cmd.cdw14);
1632
+ c.common.cdw15 = cpu_to_le32(cmd.cdw15);
12941633
12951634 if (cmd.timeout_ms)
12961635 timeout = msecs_to_jiffies(cmd.timeout_ms);
12971636
1298
- effects = nvme_passthru_start(ctrl, ns, cmd.opcode);
12991637 status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
1300
- (void __user *)(uintptr_t)cmd.addr, cmd.data_len,
1301
- (void __user *)(uintptr_t)cmd.metadata, cmd.metadata_len,
1638
+ nvme_to_user_ptr(cmd.addr), cmd.data_len,
1639
+ nvme_to_user_ptr(cmd.metadata), cmd.metadata_len,
1640
+ 0, &result, timeout);
1641
+
1642
+ if (status >= 0) {
1643
+ if (put_user(result, &ucmd->result))
1644
+ return -EFAULT;
1645
+ }
1646
+
1647
+ return status;
1648
+}
1649
+
1650
+static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
1651
+ struct nvme_passthru_cmd64 __user *ucmd)
1652
+{
1653
+ struct nvme_passthru_cmd64 cmd;
1654
+ struct nvme_command c;
1655
+ unsigned timeout = 0;
1656
+ int status;
1657
+
1658
+ if (!capable(CAP_SYS_ADMIN))
1659
+ return -EACCES;
1660
+ if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
1661
+ return -EFAULT;
1662
+ if (cmd.flags)
1663
+ return -EINVAL;
1664
+
1665
+ memset(&c, 0, sizeof(c));
1666
+ c.common.opcode = cmd.opcode;
1667
+ c.common.flags = cmd.flags;
1668
+ c.common.nsid = cpu_to_le32(cmd.nsid);
1669
+ c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
1670
+ c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
1671
+ c.common.cdw10 = cpu_to_le32(cmd.cdw10);
1672
+ c.common.cdw11 = cpu_to_le32(cmd.cdw11);
1673
+ c.common.cdw12 = cpu_to_le32(cmd.cdw12);
1674
+ c.common.cdw13 = cpu_to_le32(cmd.cdw13);
1675
+ c.common.cdw14 = cpu_to_le32(cmd.cdw14);
1676
+ c.common.cdw15 = cpu_to_le32(cmd.cdw15);
1677
+
1678
+ if (cmd.timeout_ms)
1679
+ timeout = msecs_to_jiffies(cmd.timeout_ms);
1680
+
1681
+ status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
1682
+ nvme_to_user_ptr(cmd.addr), cmd.data_len,
1683
+ nvme_to_user_ptr(cmd.metadata), cmd.metadata_len,
13021684 0, &cmd.result, timeout);
1303
- nvme_passthru_end(ctrl, effects);
13041685
13051686 if (status >= 0) {
13061687 if (put_user(cmd.result, &ucmd->result))
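The new nvme_user_cmd64() handler backs the 64-bit passthrough ioctls (NVME_IOCTL_ADMIN64_CMD here and NVME_IOCTL_IO64_CMD further down), whose struct nvme_passthru_cmd64 carries a 64-bit result field instead of the 32-bit one. A minimal user-space sketch, assuming a kernel that exposes these definitions in <linux/nvme_ioctl.h>; the device path /dev/nvme0 and the choice of Identify Controller are mine, for illustration only:

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/nvme_ioctl.h>

int main(void)
{
	struct nvme_passthru_cmd64 cmd;
	void *buf;
	int fd, ret;

	fd = open("/dev/nvme0", O_RDONLY);
	if (fd < 0 || posix_memalign(&buf, 4096, 4096))
		return 1;

	memset(&cmd, 0, sizeof(cmd));
	memset(buf, 0, 4096);
	cmd.opcode = 0x06;			/* Identify */
	cmd.cdw10 = 1;				/* CNS 1: Identify Controller */
	cmd.addr = (unsigned long)buf;
	cmd.data_len = 4096;

	ret = ioctl(fd, NVME_IOCTL_ADMIN64_CMD, &cmd);
	printf("ioctl %d, result 0x%llx, model \"%.40s\"\n",
	       ret, (unsigned long long)cmd.result, (char *)buf + 24);

	free(buf);
	close(fd);
	return ret ? 1 : 0;
}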
....@@ -1314,7 +1695,7 @@
13141695 * Issue ioctl requests on the first available path. Note that unlike normal
13151696 * block layer requests we will not retry failed request on another controller.
13161697 */
1317
-static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
1698
+struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
13181699 struct nvme_ns_head **head, int *srcu_idx)
13191700 {
13201701 #ifdef CONFIG_NVME_MULTIPATH
....@@ -1334,10 +1715,45 @@
13341715 return disk->private_data;
13351716 }
13361717
1337
-static void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
1718
+void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
13381719 {
13391720 if (head)
13401721 srcu_read_unlock(&head->srcu, idx);
1722
+}
1723
+
1724
+static bool is_ctrl_ioctl(unsigned int cmd)
1725
+{
1726
+ if (cmd == NVME_IOCTL_ADMIN_CMD || cmd == NVME_IOCTL_ADMIN64_CMD)
1727
+ return true;
1728
+ if (is_sed_ioctl(cmd))
1729
+ return true;
1730
+ return false;
1731
+}
1732
+
1733
+static int nvme_handle_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd,
1734
+ void __user *argp,
1735
+ struct nvme_ns_head *head,
1736
+ int srcu_idx)
1737
+{
1738
+ struct nvme_ctrl *ctrl = ns->ctrl;
1739
+ int ret;
1740
+
1741
+ nvme_get_ctrl(ns->ctrl);
1742
+ nvme_put_ns_from_disk(head, srcu_idx);
1743
+
1744
+ switch (cmd) {
1745
+ case NVME_IOCTL_ADMIN_CMD:
1746
+ ret = nvme_user_cmd(ctrl, NULL, argp);
1747
+ break;
1748
+ case NVME_IOCTL_ADMIN64_CMD:
1749
+ ret = nvme_user_cmd64(ctrl, NULL, argp);
1750
+ break;
1751
+ default:
1752
+ ret = sed_ioctl(ctrl->opal_dev, cmd, argp);
1753
+ break;
1754
+ }
1755
+ nvme_put_ctrl(ctrl);
1756
+ return ret;
13411757 }
13421758
13431759 static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
....@@ -1357,20 +1773,8 @@
13571773 * seperately and drop the ns SRCU reference early. This avoids a
13581774 * deadlock when deleting namespaces using the passthrough interface.
13591775 */
1360
- if (cmd == NVME_IOCTL_ADMIN_CMD || is_sed_ioctl(cmd)) {
1361
- struct nvme_ctrl *ctrl = ns->ctrl;
1362
-
1363
- nvme_get_ctrl(ns->ctrl);
1364
- nvme_put_ns_from_disk(head, srcu_idx);
1365
-
1366
- if (cmd == NVME_IOCTL_ADMIN_CMD)
1367
- ret = nvme_user_cmd(ctrl, NULL, argp);
1368
- else
1369
- ret = sed_ioctl(ctrl->opal_dev, cmd, argp);
1370
-
1371
- nvme_put_ctrl(ctrl);
1372
- return ret;
1373
- }
1776
+ if (is_ctrl_ioctl(cmd))
1777
+ return nvme_handle_ctrl_ioctl(ns, cmd, argp, head, srcu_idx);
13741778
13751779 switch (cmd) {
13761780 case NVME_IOCTL_ID:
....@@ -1383,6 +1787,9 @@
13831787 case NVME_IOCTL_SUBMIT_IO:
13841788 ret = nvme_submit_io(ns, argp);
13851789 break;
1790
+ case NVME_IOCTL_IO64_CMD:
1791
+ ret = nvme_user_cmd64(ns->ctrl, ns, argp);
1792
+ break;
13861793 default:
13871794 if (ns->ndev)
13881795 ret = nvme_nvm_ioctl(ns, cmd, arg);
....@@ -1393,6 +1800,47 @@
13931800 nvme_put_ns_from_disk(head, srcu_idx);
13941801 return ret;
13951802 }
1803
+
1804
+#ifdef CONFIG_COMPAT
1805
+struct nvme_user_io32 {
1806
+ __u8 opcode;
1807
+ __u8 flags;
1808
+ __u16 control;
1809
+ __u16 nblocks;
1810
+ __u16 rsvd;
1811
+ __u64 metadata;
1812
+ __u64 addr;
1813
+ __u64 slba;
1814
+ __u32 dsmgmt;
1815
+ __u32 reftag;
1816
+ __u16 apptag;
1817
+ __u16 appmask;
1818
+} __attribute__((__packed__));
1819
+
1820
+#define NVME_IOCTL_SUBMIT_IO32 _IOW('N', 0x42, struct nvme_user_io32)
1821
+
1822
+static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode,
1823
+ unsigned int cmd, unsigned long arg)
1824
+{
1825
+ /*
1826
+ * Corresponds to the difference of NVME_IOCTL_SUBMIT_IO
1827
+ * between 32 bit programs and 64 bit kernel.
1828
+ * The cause is that the results of sizeof(struct nvme_user_io),
1829
+ * which is used to define NVME_IOCTL_SUBMIT_IO,
1830
+ * are not same between 32 bit compiler and 64 bit compiler.
1831
+ * NVME_IOCTL_SUBMIT_IO32 is for 64 bit kernel handling
1832
+ * NVME_IOCTL_SUBMIT_IO issued from 32 bit programs.
1833
+ * Other IOCTL numbers are same between 32 bit and 64 bit.
1834
+ * So there is nothing to do regarding to other IOCTL numbers.
1835
+ */
1836
+ if (cmd == NVME_IOCTL_SUBMIT_IO32)
1837
+ return nvme_ioctl(bdev, mode, NVME_IOCTL_SUBMIT_IO, arg);
1838
+
1839
+ return nvme_ioctl(bdev, mode, cmd, arg);
1840
+}
1841
+#else
1842
+#define nvme_compat_ioctl NULL
1843
+#endif /* CONFIG_COMPAT */
13961844
13971845 static int nvme_open(struct block_device *bdev, fmode_t mode)
13981846 {
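The long comment in nvme_compat_ioctl() above can be pinned down with the actual sizes: the fields of struct nvme_user_io add up to 8 + 3*8 + 2*4 + 2*2 = 44 bytes. A 32-bit build, where __u64 only needs 4-byte alignment, therefore has sizeof(struct nvme_user_io) == 44, while a 64-bit build pads the struct to 48. Because _IOW() encodes that size into the ioctl number, a 32-bit program's NVME_IOCTL_SUBMIT_IO differs from the 64-bit kernel's, and the packed nvme_user_io32 (always 44 bytes) is what lets the 64-bit kernel recognise the 32-bit encoding. (The worked numbers are mine, derived from the struct layout shown in this hunk.)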
....@@ -1434,7 +1882,8 @@
14341882 }
14351883
14361884 #ifdef CONFIG_BLK_DEV_INTEGRITY
1437
-static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type)
1885
+static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type,
1886
+ u32 max_integrity_segments)
14381887 {
14391888 struct blk_integrity integrity;
14401889
....@@ -1457,24 +1906,19 @@
14571906 }
14581907 integrity.tuple_size = ms;
14591908 blk_integrity_register(disk, &integrity);
1460
- blk_queue_max_integrity_segments(disk->queue, 1);
1909
+ blk_queue_max_integrity_segments(disk->queue, max_integrity_segments);
14611910 }
14621911 #else
1463
-static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type)
1912
+static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type,
1913
+ u32 max_integrity_segments)
14641914 {
14651915 }
14661916 #endif /* CONFIG_BLK_DEV_INTEGRITY */
14671917
1468
-static void nvme_set_chunk_size(struct nvme_ns *ns)
1469
-{
1470
- u32 chunk_size = (((u32)ns->noiob) << (ns->lba_shift - 9));
1471
- blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(chunk_size));
1472
-}
1473
-
1474
-static void nvme_config_discard(struct nvme_ns *ns)
1918
+static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns)
14751919 {
14761920 struct nvme_ctrl *ctrl = ns->ctrl;
1477
- struct request_queue *queue = ns->queue;
1921
+ struct request_queue *queue = disk->queue;
14781922 u32 size = queue_logical_block_size(queue);
14791923
14801924 if (!(ctrl->oncs & NVME_CTRL_ONCS_DSM)) {
....@@ -1502,23 +1946,18 @@
15021946 blk_queue_max_write_zeroes_sectors(queue, UINT_MAX);
15031947 }
15041948
1505
-static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid,
1506
- struct nvme_id_ns *id, struct nvme_ns_ids *ids)
1949
+/*
1950
+ * Even though the NVMe spec explicitly states that MDTS is not applicable to
1951
+ * the write-zeroes command, we are cautious and limit the size to the
1952
+ * controller's max_hw_sectors value, which is based on the MDTS field and
1953
+ * possibly other limiting factors.
1954
+ */
1955
+static void nvme_config_write_zeroes(struct request_queue *q,
1956
+ struct nvme_ctrl *ctrl)
15071957 {
1508
- memset(ids, 0, sizeof(*ids));
1509
-
1510
- if (ctrl->vs >= NVME_VS(1, 1, 0))
1511
- memcpy(ids->eui64, id->eui64, sizeof(id->eui64));
1512
- if (ctrl->vs >= NVME_VS(1, 2, 0))
1513
- memcpy(ids->nguid, id->nguid, sizeof(id->nguid));
1514
- if (ctrl->vs >= NVME_VS(1, 3, 0)) {
1515
- /* Don't treat error as fatal we potentially
1516
- * already have a NGUID or EUI-64
1517
- */
1518
- if (nvme_identify_ns_descs(ctrl, nsid, ids))
1519
- dev_warn(ctrl->device,
1520
- "%s: Identify Descriptors failed\n", __func__);
1521
- }
1958
+ if ((ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES) &&
1959
+ !(ctrl->quirks & NVME_QUIRK_DISABLE_WRITE_ZEROES))
1960
+ blk_queue_max_write_zeroes_sectors(q, ctrl->max_hw_sectors);
15221961 }
15231962
15241963 static bool nvme_ns_ids_valid(struct nvme_ns_ids *ids)
....@@ -1532,110 +1971,250 @@
15321971 {
15331972 return uuid_equal(&a->uuid, &b->uuid) &&
15341973 memcmp(&a->nguid, &b->nguid, sizeof(a->nguid)) == 0 &&
1535
- memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0;
1974
+ memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0 &&
1975
+ a->csi == b->csi;
15361976 }
15371977
1538
-static void nvme_update_disk_info(struct gendisk *disk,
1539
- struct nvme_ns *ns, struct nvme_id_ns *id)
1978
+static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
1979
+ u32 *phys_bs, u32 *io_opt)
15401980 {
1541
- sector_t capacity = le64_to_cpup(&id->nsze) << (ns->lba_shift - 9);
1542
- unsigned short bs = 1 << ns->lba_shift;
1981
+ struct streams_directive_params s;
1982
+ int ret;
15431983
1544
- if (ns->lba_shift > PAGE_SHIFT) {
1545
- /* unsupported block size, set capacity to 0 later */
1546
- bs = (1 << 9);
1984
+ if (!ctrl->nr_streams)
1985
+ return 0;
1986
+
1987
+ ret = nvme_get_stream_params(ctrl, &s, ns->head->ns_id);
1988
+ if (ret)
1989
+ return ret;
1990
+
1991
+ ns->sws = le32_to_cpu(s.sws);
1992
+ ns->sgs = le16_to_cpu(s.sgs);
1993
+
1994
+ if (ns->sws) {
1995
+ *phys_bs = ns->sws * (1 << ns->lba_shift);
1996
+ if (ns->sgs)
1997
+ *io_opt = *phys_bs * ns->sgs;
15471998 }
1548
- blk_mq_freeze_queue(disk->queue);
1549
- blk_integrity_unregister(disk);
15501999
1551
- blk_queue_logical_block_size(disk->queue, bs);
1552
- blk_queue_physical_block_size(disk->queue, bs);
1553
- blk_queue_io_min(disk->queue, bs);
1554
-
1555
- if (ns->ms && !ns->ext &&
1556
- (ns->ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
1557
- nvme_init_integrity(disk, ns->ms, ns->pi_type);
1558
- if ((ns->ms && !nvme_ns_has_pi(ns) && !blk_get_integrity(disk)) ||
1559
- ns->lba_shift > PAGE_SHIFT)
1560
- capacity = 0;
1561
-
1562
- set_capacity(disk, capacity);
1563
- nvme_config_discard(ns);
1564
-
1565
- if (id->nsattr & (1 << 0))
1566
- set_disk_ro(disk, true);
1567
- else
1568
- set_disk_ro(disk, false);
1569
-
1570
- blk_mq_unfreeze_queue(disk->queue);
2000
+ return 0;
15712001 }
15722002
1573
-static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
2003
+static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
15742004 {
1575
- struct nvme_ns *ns = disk->private_data;
2005
+ struct nvme_ctrl *ctrl = ns->ctrl;
15762006
15772007 /*
1578
- * If identify namespace failed, use default 512 byte block size so
1579
- * block layer can use before failing read/write for 0 capacity.
2008
+ * The PI implementation requires the metadata size to be equal to the
2009
+ * t10 pi tuple size.
15802010 */
1581
- ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds;
1582
- if (ns->lba_shift == 0)
1583
- ns->lba_shift = 9;
1584
- ns->noiob = le16_to_cpu(id->noiob);
15852011 ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms);
1586
- ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT);
1587
- /* the PI implementation requires metadata equal t10 pi tuple size */
15882012 if (ns->ms == sizeof(struct t10_pi_tuple))
15892013 ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
15902014 else
15912015 ns->pi_type = 0;
15922016
1593
- if (ns->noiob)
1594
- nvme_set_chunk_size(ns);
1595
- nvme_update_disk_info(disk, ns, id);
1596
- if (ns->ndev)
1597
- nvme_nvm_update_nvm_info(ns);
1598
-#ifdef CONFIG_NVME_MULTIPATH
1599
- if (ns->head->disk) {
1600
- nvme_update_disk_info(ns->head->disk, ns, id);
1601
- blk_queue_stack_limits(ns->head->disk->queue, ns->queue);
1602
- nvme_mpath_update_disk_size(ns->head->disk);
2017
+ ns->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS);
2018
+ if (!ns->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
2019
+ return 0;
2020
+ if (ctrl->ops->flags & NVME_F_FABRICS) {
2021
+ /*
2022
+ * The NVMe over Fabrics specification only supports metadata as
2023
+ * part of the extended data LBA. We rely on HCA/HBA support to
2024
+ * remap the separate metadata buffer from the block layer.
2025
+ */
2026
+ if (WARN_ON_ONCE(!(id->flbas & NVME_NS_FLBAS_META_EXT)))
2027
+ return -EINVAL;
2028
+ if (ctrl->max_integrity_segments)
2029
+ ns->features |=
2030
+ (NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS);
2031
+ } else {
2032
+ /*
2033
+ * For PCIe controllers, we can't easily remap the separate
2034
+ * metadata buffer from the block layer and thus require a
2035
+ * separate metadata buffer for block layer metadata/PI support.
2036
+ * We allow extended LBAs for the passthrough interface, though.
2037
+ */
2038
+ if (id->flbas & NVME_NS_FLBAS_META_EXT)
2039
+ ns->features |= NVME_NS_EXT_LBAS;
2040
+ else
2041
+ ns->features |= NVME_NS_METADATA_SUPPORTED;
16032042 }
1604
-#endif
2043
+
2044
+ return 0;
16052045 }
16062046
1607
-static int nvme_revalidate_disk(struct gendisk *disk)
2047
+static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
2048
+ struct request_queue *q)
16082049 {
1609
- struct nvme_ns *ns = disk->private_data;
2050
+ bool vwc = ctrl->vwc & NVME_CTRL_VWC_PRESENT;
2051
+
2052
+ if (ctrl->max_hw_sectors) {
2053
+ u32 max_segments =
2054
+ (ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> 9)) + 1;
2055
+
2056
+ max_segments = min_not_zero(max_segments, ctrl->max_segments);
2057
+ blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
2058
+ blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
2059
+ }
2060
+ blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1);
2061
+ blk_queue_dma_alignment(q, 3);
2062
+ blk_queue_write_cache(q, vwc, vwc);
2063
+}
2064
+
2065
+static void nvme_update_disk_info(struct gendisk *disk,
2066
+ struct nvme_ns *ns, struct nvme_id_ns *id)
2067
+{
2068
+ sector_t capacity = nvme_lba_to_sect(ns, le64_to_cpu(id->nsze));
2069
+ unsigned short bs = 1 << ns->lba_shift;
2070
+ u32 atomic_bs, phys_bs, io_opt = 0;
2071
+
2072
+ /*
2073
+ * The block layer can't support LBA sizes larger than the page size
2074
+ * yet, so catch this early and don't allow block I/O.
2075
+ */
2076
+ if (ns->lba_shift > PAGE_SHIFT) {
2077
+ capacity = 0;
2078
+ bs = (1 << 9);
2079
+ }
2080
+
2081
+ blk_integrity_unregister(disk);
2082
+
2083
+ atomic_bs = phys_bs = bs;
2084
+ nvme_setup_streams_ns(ns->ctrl, ns, &phys_bs, &io_opt);
2085
+ if (id->nabo == 0) {
2086
+ /*
2087
+ * Bit 1 indicates whether NAWUPF is defined for this namespace
2088
+ * and whether it should be used instead of AWUPF. If NAWUPF ==
2089
+ * 0 then AWUPF must be used instead.
2090
+ */
2091
+ if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf)
2092
+ atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs;
2093
+ else
2094
+ atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs;
2095
+ }
2096
+
2097
+ if (id->nsfeat & NVME_NS_FEAT_IO_OPT) {
2098
+ /* NPWG = Namespace Preferred Write Granularity */
2099
+ phys_bs = bs * (1 + le16_to_cpu(id->npwg));
2100
+ /* NOWS = Namespace Optimal Write Size */
2101
+ io_opt = bs * (1 + le16_to_cpu(id->nows));
2102
+ }
2103
+
2104
+ blk_queue_logical_block_size(disk->queue, bs);
2105
+ /*
2106
+ * Linux filesystems assume writing a single physical block is
2107
+ * an atomic operation. Hence limit the physical block size to the
2108
+ * value of the Atomic Write Unit Power Fail parameter.
2109
+ */
2110
+ blk_queue_physical_block_size(disk->queue, min(phys_bs, atomic_bs));
2111
+ blk_queue_io_min(disk->queue, phys_bs);
2112
+ blk_queue_io_opt(disk->queue, io_opt);
2113
+
2114
+ /*
2115
+ * Register a metadata profile for PI, or the plain non-integrity NVMe
2116
+ * metadata masquerading as Type 0 if supported, otherwise reject block
2117
+ * I/O to namespaces with metadata except when the namespace supports
2118
+ * PI, as it can strip/insert in that case.
2119
+ */
2120
+ if (ns->ms) {
2121
+ if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) &&
2122
+ (ns->features & NVME_NS_METADATA_SUPPORTED))
2123
+ nvme_init_integrity(disk, ns->ms, ns->pi_type,
2124
+ ns->ctrl->max_integrity_segments);
2125
+ else if (!nvme_ns_has_pi(ns))
2126
+ capacity = 0;
2127
+ }
2128
+
2129
+ set_capacity_revalidate_and_notify(disk, capacity, false);
2130
+
2131
+ nvme_config_discard(disk, ns);
2132
+ nvme_config_write_zeroes(disk->queue, ns->ctrl);
2133
+
2134
+ if (id->nsattr & NVME_NS_ATTR_RO)
2135
+ set_disk_ro(disk, true);
2136
+}
2137
+
2138
+static inline bool nvme_first_scan(struct gendisk *disk)
2139
+{
2140
+ /* nvme_alloc_ns() scans the disk prior to adding it */
2141
+ return !(disk->flags & GENHD_FL_UP);
2142
+}
2143
+
2144
+static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id)
2145
+{
16102146 struct nvme_ctrl *ctrl = ns->ctrl;
1611
- struct nvme_id_ns *id;
1612
- struct nvme_ns_ids ids;
1613
- int ret = 0;
2147
+ u32 iob;
16142148
1615
- if (test_bit(NVME_NS_DEAD, &ns->flags)) {
1616
- set_capacity(disk, 0);
1617
- return -ENODEV;
2149
+ if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
2150
+ is_power_of_2(ctrl->max_hw_sectors))
2151
+ iob = ctrl->max_hw_sectors;
2152
+ else
2153
+ iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob));
2154
+
2155
+ if (!iob)
2156
+ return;
2157
+
2158
+ if (!is_power_of_2(iob)) {
2159
+ if (nvme_first_scan(ns->disk))
2160
+ pr_warn("%s: ignoring unaligned IO boundary:%u\n",
2161
+ ns->disk->disk_name, iob);
2162
+ return;
16182163 }
16192164
1620
- id = nvme_identify_ns(ctrl, ns->head->ns_id);
1621
- if (!id)
1622
- return -ENODEV;
1623
-
1624
- if (id->ncap == 0) {
1625
- ret = -ENODEV;
1626
- goto out;
2165
+ if (blk_queue_is_zoned(ns->disk->queue)) {
2166
+ if (nvme_first_scan(ns->disk))
2167
+ pr_warn("%s: ignoring zoned namespace IO boundary\n",
2168
+ ns->disk->disk_name);
2169
+ return;
16272170 }
16282171
1629
- __nvme_revalidate_disk(disk, id);
1630
- nvme_report_ns_ids(ctrl, ns->head->ns_id, id, &ids);
1631
- if (!nvme_ns_ids_equal(&ns->head->ids, &ids)) {
1632
- dev_err(ctrl->device,
1633
- "identifiers changed for nsid %d\n", ns->head->ns_id);
1634
- ret = -ENODEV;
2172
+ blk_queue_chunk_sectors(ns->queue, iob);
2173
+}
2174
+
2175
+static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id)
2176
+{
2177
+ unsigned lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
2178
+ int ret;
2179
+
2180
+ blk_mq_freeze_queue(ns->disk->queue);
2181
+ ns->lba_shift = id->lbaf[lbaf].ds;
2182
+ nvme_set_queue_limits(ns->ctrl, ns->queue);
2183
+
2184
+ if (ns->head->ids.csi == NVME_CSI_ZNS) {
2185
+ ret = nvme_update_zone_info(ns, lbaf);
2186
+ if (ret)
2187
+ goto out_unfreeze;
16352188 }
16362189
1637
-out:
1638
- kfree(id);
2190
+ ret = nvme_configure_metadata(ns, id);
2191
+ if (ret)
2192
+ goto out_unfreeze;
2193
+ nvme_set_chunk_sectors(ns, id);
2194
+ nvme_update_disk_info(ns->disk, ns, id);
2195
+ blk_mq_unfreeze_queue(ns->disk->queue);
2196
+
2197
+ if (blk_queue_is_zoned(ns->queue)) {
2198
+ ret = nvme_revalidate_zones(ns);
2199
+ if (ret && !nvme_first_scan(ns->disk))
2200
+ return ret;
2201
+ }
2202
+
2203
+#ifdef CONFIG_NVME_MULTIPATH
2204
+ if (ns->head->disk) {
2205
+ blk_mq_freeze_queue(ns->head->disk->queue);
2206
+ nvme_update_disk_info(ns->head->disk, ns, id);
2207
+ blk_stack_limits(&ns->head->disk->queue->limits,
2208
+ &ns->queue->limits, 0);
2209
+ blk_queue_update_readahead(ns->head->disk->queue);
2210
+ nvme_update_bdev_size(ns->head->disk);
2211
+ blk_mq_unfreeze_queue(ns->head->disk->queue);
2212
+ }
2213
+#endif
2214
+ return 0;
2215
+
2216
+out_unfreeze:
2217
+ blk_mq_unfreeze_queue(ns->disk->queue);
16392218 return ret;
16402219 }
16412220
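
The NAWUPF/NPWG/NOWS fields consumed by nvme_update_disk_info() above are 0's based block counts, so each is incremented by one before being scaled by the logical block size, and the exposed physical block size is then capped at the atomic write unit. A short sketch with hypothetical identify values shows the resulting queue limits:

/* Hypothetical identify values; mirrors the arithmetic in nvme_update_disk_info(). */
#include <stdio.h>

int main(void)
{
	unsigned int bs = 1u << 12;	/* 4096 B logical block (lba_shift = 12) */
	unsigned int nawupf = 7;	/* 0's based: 8 blocks atomic under power fail */
	unsigned int npwg = 15;		/* 0's based: 16 block preferred write granularity */
	unsigned int nows = 63;		/* 0's based: 64 block optimal write size */

	unsigned int atomic_bs = (1 + nawupf) * bs;	/* 32768 */
	unsigned int phys_bs = (1 + npwg) * bs;		/* 65536 */
	unsigned int io_opt = (1 + nows) * bs;		/* 262144 */

	printf("atomic_bs=%u phys_bs=%u io_opt=%u\n", atomic_bs, phys_bs, io_opt);
	printf("physical block size exposed = %u\n",
	       phys_bs < atomic_bs ? phys_bs : atomic_bs);	/* min(), here 32768 */
	return 0;
}
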
....@@ -1678,7 +2257,7 @@
16782257 memset(&c, 0, sizeof(c));
16792258 c.common.opcode = op;
16802259 c.common.nsid = cpu_to_le32(ns->head->ns_id);
1681
- c.common.cdw10[0] = cpu_to_le32(cdw10);
2260
+ c.common.cdw10 = cpu_to_le32(cdw10);
16822261
16832262 ret = nvme_submit_sync_cmd(ns->queue, &c, data, 16);
16842263 nvme_put_ns_from_disk(head, srcu_idx);
....@@ -1716,18 +2295,21 @@
17162295 enum pr_type type, bool abort)
17172296 {
17182297 u32 cdw10 = nvme_pr_type(type) << 8 | (abort ? 2 : 1);
2298
+
17192299 return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire);
17202300 }
17212301
17222302 static int nvme_pr_clear(struct block_device *bdev, u64 key)
17232303 {
1724
- u32 cdw10 = 1 | (key ? 1 << 3 : 0);
1725
- return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register);
2304
+ u32 cdw10 = 1 | (key ? 0 : 1 << 3);
2305
+
2306
+ return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
17262307 }
17272308
17282309 static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
17292310 {
1730
- u32 cdw10 = nvme_pr_type(type) << 8 | (key ? 1 << 3 : 0);
2311
+ u32 cdw10 = nvme_pr_type(type) << 8 | (key ? 0 : 1 << 3);
2312
+
17312313 return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
17322314 }
17332315
....@@ -1752,11 +2334,11 @@
17522334 else
17532335 cmd.common.opcode = nvme_admin_security_recv;
17542336 cmd.common.nsid = 0;
1755
- cmd.common.cdw10[0] = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8);
1756
- cmd.common.cdw10[1] = cpu_to_le32(len);
2337
+ cmd.common.cdw10 = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8);
2338
+ cmd.common.cdw11 = cpu_to_le32(len);
17572339
17582340 return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len,
1759
- ADMIN_TIMEOUT, NVME_QID_ANY, 1, 0);
2341
+ ADMIN_TIMEOUT, NVME_QID_ANY, 1, 0, false);
17602342 }
17612343 EXPORT_SYMBOL_GPL(nvme_sec_submit);
17622344 #endif /* CONFIG_BLK_SED_OPAL */
....@@ -1764,11 +2346,11 @@
17642346 static const struct block_device_operations nvme_fops = {
17652347 .owner = THIS_MODULE,
17662348 .ioctl = nvme_ioctl,
1767
- .compat_ioctl = nvme_ioctl,
2349
+ .compat_ioctl = nvme_compat_ioctl,
17682350 .open = nvme_open,
17692351 .release = nvme_release,
17702352 .getgeo = nvme_getgeo,
1771
- .revalidate_disk= nvme_revalidate_disk,
2353
+ .report_zones = nvme_report_zones,
17722354 .pr_ops = &nvme_pr_ops,
17732355 };
17742356
....@@ -1789,11 +2371,13 @@
17892371
17902372 const struct block_device_operations nvme_ns_head_ops = {
17912373 .owner = THIS_MODULE,
2374
+ .submit_bio = nvme_ns_head_submit_bio,
17922375 .open = nvme_ns_head_open,
17932376 .release = nvme_ns_head_release,
17942377 .ioctl = nvme_ioctl,
1795
- .compat_ioctl = nvme_ioctl,
2378
+ .compat_ioctl = nvme_compat_ioctl,
17962379 .getgeo = nvme_getgeo,
2380
+ .report_zones = nvme_report_zones,
17972381 .pr_ops = &nvme_pr_ops,
17982382 };
17992383 #endif /* CONFIG_NVME_MULTIPATH */
....@@ -1811,13 +2395,13 @@
18112395 if ((csts & NVME_CSTS_RDY) == bit)
18122396 break;
18132397
1814
- msleep(100);
2398
+ usleep_range(1000, 2000);
18152399 if (fatal_signal_pending(current))
18162400 return -EINTR;
18172401 if (time_after(jiffies, timeout)) {
18182402 dev_err(ctrl->device,
1819
- "Device not ready; aborting %s\n", enabled ?
1820
- "initialisation" : "reset");
2403
+ "Device not ready; aborting %s, CSTS=0x%x\n",
2404
+ enabled ? "initialisation" : "reset", csts);
18212405 return -ENODEV;
18222406 }
18232407 }
....@@ -1831,7 +2415,7 @@
18312415 * bits', but doing so may cause the device to complete commands to the
18322416 * admin queue ... and we don't know what memory that might be pointing at!
18332417 */
1834
-int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
2418
+int nvme_disable_ctrl(struct nvme_ctrl *ctrl)
18352419 {
18362420 int ret;
18372421
....@@ -1845,31 +2429,34 @@
18452429 if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY)
18462430 msleep(NVME_QUIRK_DELAY_AMOUNT);
18472431
1848
- return nvme_wait_ready(ctrl, cap, false);
2432
+ return nvme_wait_ready(ctrl, ctrl->cap, false);
18492433 }
18502434 EXPORT_SYMBOL_GPL(nvme_disable_ctrl);
18512435
1852
-int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
2436
+int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
18532437 {
1854
- /*
1855
- * Default to a 4K page size, with the intention to update this
1856
- * path in the future to accomodate architectures with differing
1857
- * kernel and IO page sizes.
1858
- */
1859
- unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12, page_shift = 12;
2438
+ unsigned dev_page_min;
18602439 int ret;
18612440
1862
- if (page_shift < dev_page_min) {
2441
+ ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
2442
+ if (ret) {
2443
+ dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret);
2444
+ return ret;
2445
+ }
2446
+ dev_page_min = NVME_CAP_MPSMIN(ctrl->cap) + 12;
2447
+
2448
+ if (NVME_CTRL_PAGE_SHIFT < dev_page_min) {
18632449 dev_err(ctrl->device,
18642450 "Minimum device page size %u too large for host (%u)\n",
1865
- 1 << dev_page_min, 1 << page_shift);
2451
+ 1 << dev_page_min, 1 << NVME_CTRL_PAGE_SHIFT);
18662452 return -ENODEV;
18672453 }
18682454
1869
- ctrl->page_size = 1 << page_shift;
1870
-
1871
- ctrl->ctrl_config = NVME_CC_CSS_NVM;
1872
- ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT;
2455
+ if (NVME_CAP_CSS(ctrl->cap) & NVME_CAP_CSS_CSI)
2456
+ ctrl->ctrl_config = NVME_CC_CSS_CSI;
2457
+ else
2458
+ ctrl->ctrl_config = NVME_CC_CSS_NVM;
2459
+ ctrl->ctrl_config |= (NVME_CTRL_PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
18732460 ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE;
18742461 ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
18752462 ctrl->ctrl_config |= NVME_CC_ENABLE;
....@@ -1877,7 +2464,7 @@
18772464 ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
18782465 if (ret)
18792466 return ret;
1880
- return nvme_wait_ready(ctrl, cap, true);
2467
+ return nvme_wait_ready(ctrl, ctrl->cap, true);
18812468 }
18822469 EXPORT_SYMBOL_GPL(nvme_enable_ctrl);
18832470
....@@ -1912,28 +2499,6 @@
19122499 }
19132500 EXPORT_SYMBOL_GPL(nvme_shutdown_ctrl);
19142501
1915
-static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
1916
- struct request_queue *q)
1917
-{
1918
- bool vwc = false;
1919
-
1920
- if (ctrl->max_hw_sectors) {
1921
- u32 max_segments =
1922
- (ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1;
1923
-
1924
- max_segments = min_not_zero(max_segments, ctrl->max_segments);
1925
- blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
1926
- blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
1927
- }
1928
- if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
1929
- is_power_of_2(ctrl->max_hw_sectors))
1930
- blk_queue_chunk_sectors(q, ctrl->max_hw_sectors);
1931
- blk_queue_virt_boundary(q, ctrl->page_size - 1);
1932
- if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
1933
- vwc = true;
1934
- blk_queue_write_cache(q, vwc, vwc);
1935
-}
1936
-
19372502 static int nvme_configure_timestamp(struct nvme_ctrl *ctrl)
19382503 {
19392504 __le64 ts;
....@@ -1948,6 +2513,26 @@
19482513 if (ret)
19492514 dev_warn_once(ctrl->device,
19502515 "could not set timestamp (%d)\n", ret);
2516
+ return ret;
2517
+}
2518
+
2519
+static int nvme_configure_acre(struct nvme_ctrl *ctrl)
2520
+{
2521
+ struct nvme_feat_host_behavior *host;
2522
+ int ret;
2523
+
2524
+ /* Don't bother enabling the feature if retry delay is not reported */
2525
+ if (!ctrl->crdt[0])
2526
+ return 0;
2527
+
2528
+ host = kzalloc(sizeof(*host), GFP_KERNEL);
2529
+ if (!host)
2530
+ return 0;
2531
+
2532
+ host->acre = NVME_ENABLE_ACRE;
2533
+ ret = nvme_set_features(ctrl, NVME_FEAT_HOST_BEHAVIOR, 0,
2534
+ host, sizeof(*host), NULL);
2535
+ kfree(host);
19512536 return ret;
19522537 }
19532538
....@@ -2117,6 +2702,44 @@
21172702 .vid = 0x1179,
21182703 .mn = "THNSF5256GPUK TOSHIBA",
21192704 .quirks = NVME_QUIRK_NO_APST,
2705
+ },
2706
+ {
2707
+ /*
2708
+ * This LiteON CL1-3D*-Q11 firmware version has a race
2709
+ * condition associated with actions related to suspend to idle.
2710
+ * LiteON has resolved the problem in future firmware.
2711
+ */
2712
+ .vid = 0x14a4,
2713
+ .fr = "22301111",
2714
+ .quirks = NVME_QUIRK_SIMPLE_SUSPEND,
2715
+ },
2716
+ {
2717
+ /*
2718
+ * This Kioxia CD6-V Series / HPE PE8030 device times out and
2719
+ * aborts I/O during any load, but is more easily reproducible
2720
+ * with discards (fstrim).
2721
+ *
2722
+ * The device is left in a state where it is also not possible
2723
+ * to use "nvme set-feature" to disable APST, but booting with
2724
+ * nvme_core.default_ps_max_latency=0 works.
2725
+ */
2726
+ .vid = 0x1e0f,
2727
+ .mn = "KCD6XVUL6T40",
2728
+ .quirks = NVME_QUIRK_NO_APST,
2729
+ },
2730
+ {
2731
+ /*
2732
+ * The external Samsung X5 SSD fails initialization without a
2733
+ * delay before checking if it is ready and has a whole set of
2734
+ * other problems. To make this even more interesting, it
2735
+ * shares the PCI ID with internal Samsung 970 Evo Plus that
2736
+ * does not need or want these quirks.
2737
+ */
2738
+ .vid = 0x144d,
2739
+ .mn = "Samsung Portable SSD X5",
2740
+ .quirks = NVME_QUIRK_DELAY_BEFORE_CHK_RDY |
2741
+ NVME_QUIRK_NO_DEEPEST_PS |
2742
+ NVME_QUIRK_IGNORE_DEV_SUBNQN,
21202743 }
21212744 };
21222745
....@@ -2155,14 +2778,16 @@
21552778 size_t nqnlen;
21562779 int off;
21572780
2158
- nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE);
2159
- if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) {
2160
- strncpy(subsys->subnqn, id->subnqn, NVMF_NQN_SIZE);
2161
- return;
2162
- }
2781
+ if (!(ctrl->quirks & NVME_QUIRK_IGNORE_DEV_SUBNQN)) {
2782
+ nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE);
2783
+ if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) {
2784
+ strlcpy(subsys->subnqn, id->subnqn, NVMF_NQN_SIZE);
2785
+ return;
2786
+ }
21632787
2164
- if (ctrl->vs >= NVME_VS(1, 2, 1))
2165
- dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n");
2788
+ if (ctrl->vs >= NVME_VS(1, 2, 1))
2789
+ dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n");
2790
+ }
21662791
21672792 /* Generate a "fake" NQN per Figure 254 in NVMe 1.3 + ECN 001 */
21682793 off = snprintf(subsys->subnqn, NVMF_NQN_SIZE,
....@@ -2175,15 +2800,14 @@
21752800 memset(subsys->subnqn + off, 0, sizeof(subsys->subnqn) - off);
21762801 }
21772802
2178
-static void __nvme_release_subsystem(struct nvme_subsystem *subsys)
2179
-{
2180
- ida_simple_remove(&nvme_subsystems_ida, subsys->instance);
2181
- kfree(subsys);
2182
-}
2183
-
21842803 static void nvme_release_subsystem(struct device *dev)
21852804 {
2186
- __nvme_release_subsystem(container_of(dev, struct nvme_subsystem, dev));
2805
+ struct nvme_subsystem *subsys =
2806
+ container_of(dev, struct nvme_subsystem, dev);
2807
+
2808
+ if (subsys->instance >= 0)
2809
+ ida_simple_remove(&nvme_instance_ida, subsys->instance);
2810
+ kfree(subsys);
21872811 }
21882812
21892813 static void nvme_destroy_subsystem(struct kref *ref)
....@@ -2254,8 +2878,8 @@
22542878 { \
22552879 struct nvme_subsystem *subsys = \
22562880 container_of(dev, struct nvme_subsystem, dev); \
2257
- return sprintf(buf, "%.*s\n", \
2258
- (int)sizeof(subsys->field), subsys->field); \
2881
+ return sysfs_emit(buf, "%.*s\n", \
2882
+ (int)sizeof(subsys->field), subsys->field); \
22592883 } \
22602884 static SUBSYS_ATTR_RO(field, S_IRUGO, subsys_##field##_show);
22612885
....@@ -2268,6 +2892,9 @@
22682892 &subsys_attr_serial.attr,
22692893 &subsys_attr_firmware_rev.attr,
22702894 &subsys_attr_subsysnqn.attr,
2895
+#ifdef CONFIG_NVME_MULTIPATH
2896
+ &subsys_attr_iopolicy.attr,
2897
+#endif
22712898 NULL,
22722899 };
22732900
....@@ -2280,20 +2907,39 @@
22802907 NULL,
22812908 };
22822909
2283
-static int nvme_active_ctrls(struct nvme_subsystem *subsys)
2910
+static inline bool nvme_discovery_ctrl(struct nvme_ctrl *ctrl)
22842911 {
2285
- int count = 0;
2286
- struct nvme_ctrl *ctrl;
2912
+ return ctrl->opts && ctrl->opts->discovery_nqn;
2913
+}
22872914
2288
- mutex_lock(&subsys->lock);
2289
- list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
2290
- if (ctrl->state != NVME_CTRL_DELETING &&
2291
- ctrl->state != NVME_CTRL_DEAD)
2292
- count++;
2915
+static bool nvme_validate_cntlid(struct nvme_subsystem *subsys,
2916
+ struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
2917
+{
2918
+ struct nvme_ctrl *tmp;
2919
+
2920
+ lockdep_assert_held(&nvme_subsystems_lock);
2921
+
2922
+ list_for_each_entry(tmp, &subsys->ctrls, subsys_entry) {
2923
+ if (nvme_state_terminal(tmp))
2924
+ continue;
2925
+
2926
+ if (tmp->cntlid == ctrl->cntlid) {
2927
+ dev_err(ctrl->device,
2928
+ "Duplicate cntlid %u with %s, rejecting\n",
2929
+ ctrl->cntlid, dev_name(tmp->device));
2930
+ return false;
2931
+ }
2932
+
2933
+ if ((id->cmic & NVME_CTRL_CMIC_MULTI_CTRL) ||
2934
+ nvme_discovery_ctrl(ctrl))
2935
+ continue;
2936
+
2937
+ dev_err(ctrl->device,
2938
+ "Subsystem does not support multiple controllers\n");
2939
+ return false;
22932940 }
2294
- mutex_unlock(&subsys->lock);
22952941
2296
- return count;
2942
+ return true;
22972943 }
22982944
22992945 static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
....@@ -2304,12 +2950,8 @@
23042950 subsys = kzalloc(sizeof(*subsys), GFP_KERNEL);
23052951 if (!subsys)
23062952 return -ENOMEM;
2307
- ret = ida_simple_get(&nvme_subsystems_ida, 0, 0, GFP_KERNEL);
2308
- if (ret < 0) {
2309
- kfree(subsys);
2310
- return ret;
2311
- }
2312
- subsys->instance = ret;
2953
+
2954
+ subsys->instance = -1;
23132955 mutex_init(&subsys->lock);
23142956 kref_init(&subsys->ref);
23152957 INIT_LIST_HEAD(&subsys->ctrls);
....@@ -2317,74 +2959,68 @@
23172959 nvme_init_subnqn(subsys, ctrl, id);
23182960 memcpy(subsys->serial, id->sn, sizeof(subsys->serial));
23192961 memcpy(subsys->model, id->mn, sizeof(subsys->model));
2320
- memcpy(subsys->firmware_rev, id->fr, sizeof(subsys->firmware_rev));
23212962 subsys->vendor_id = le16_to_cpu(id->vid);
23222963 subsys->cmic = id->cmic;
2964
+ subsys->awupf = le16_to_cpu(id->awupf);
2965
+#ifdef CONFIG_NVME_MULTIPATH
2966
+ subsys->iopolicy = NVME_IOPOLICY_NUMA;
2967
+#endif
23232968
23242969 subsys->dev.class = nvme_subsys_class;
23252970 subsys->dev.release = nvme_release_subsystem;
23262971 subsys->dev.groups = nvme_subsys_attrs_groups;
2327
- dev_set_name(&subsys->dev, "nvme-subsys%d", subsys->instance);
2972
+ dev_set_name(&subsys->dev, "nvme-subsys%d", ctrl->instance);
23282973 device_initialize(&subsys->dev);
23292974
23302975 mutex_lock(&nvme_subsystems_lock);
23312976 found = __nvme_find_get_subsystem(subsys->subnqn);
23322977 if (found) {
2333
- /*
2334
- * Verify that the subsystem actually supports multiple
2335
- * controllers, else bail out.
2336
- */
2337
- if (!(ctrl->opts && ctrl->opts->discovery_nqn) &&
2338
- nvme_active_ctrls(found) && !(id->cmic & (1 << 1))) {
2339
- dev_err(ctrl->device,
2340
- "ignoring ctrl due to duplicate subnqn (%s).\n",
2341
- found->subnqn);
2342
- nvme_put_subsystem(found);
2343
- ret = -EINVAL;
2344
- goto out_unlock;
2345
- }
2346
-
2347
- __nvme_release_subsystem(subsys);
2978
+ put_device(&subsys->dev);
23482979 subsys = found;
2980
+
2981
+ if (!nvme_validate_cntlid(subsys, ctrl, id)) {
2982
+ ret = -EINVAL;
2983
+ goto out_put_subsystem;
2984
+ }
23492985 } else {
23502986 ret = device_add(&subsys->dev);
23512987 if (ret) {
23522988 dev_err(ctrl->device,
23532989 "failed to register subsystem device.\n");
2990
+ put_device(&subsys->dev);
23542991 goto out_unlock;
23552992 }
23562993 ida_init(&subsys->ns_ida);
23572994 list_add_tail(&subsys->entry, &nvme_subsystems);
23582995 }
23592996
2360
- ctrl->subsys = subsys;
2361
- mutex_unlock(&nvme_subsystems_lock);
2362
-
2363
- if (sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj,
2364
- dev_name(ctrl->device))) {
2997
+ ret = sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj,
2998
+ dev_name(ctrl->device));
2999
+ if (ret) {
23653000 dev_err(ctrl->device,
23663001 "failed to create sysfs link from subsystem.\n");
2367
- /* the transport driver will eventually put the subsystem */
2368
- return -EINVAL;
3002
+ goto out_put_subsystem;
23693003 }
23703004
2371
- mutex_lock(&subsys->lock);
3005
+ if (!found)
3006
+ subsys->instance = ctrl->instance;
3007
+ ctrl->subsys = subsys;
23723008 list_add_tail(&ctrl->subsys_entry, &subsys->ctrls);
2373
- mutex_unlock(&subsys->lock);
2374
-
3009
+ mutex_unlock(&nvme_subsystems_lock);
23753010 return 0;
23763011
3012
+out_put_subsystem:
3013
+ nvme_put_subsystem(subsys);
23773014 out_unlock:
23783015 mutex_unlock(&nvme_subsystems_lock);
2379
- put_device(&subsys->dev);
23803016 return ret;
23813017 }
23823018
2383
-int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp,
3019
+int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
23843020 void *log, size_t size, u64 offset)
23853021 {
23863022 struct nvme_command c = { };
2387
- unsigned long dwlen = size / 4 - 1;
3023
+ u32 dwlen = nvme_bytes_to_numd(size);
23883024
23893025 c.get_log_page.opcode = nvme_admin_get_log_page;
23903026 c.get_log_page.nsid = cpu_to_le32(nsid);
....@@ -2394,27 +3030,35 @@
23943030 c.get_log_page.numdu = cpu_to_le16(dwlen >> 16);
23953031 c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset));
23963032 c.get_log_page.lpou = cpu_to_le32(upper_32_bits(offset));
3033
+ c.get_log_page.csi = csi;
23973034
23983035 return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size);
23993036 }
24003037
2401
-static int nvme_get_effects_log(struct nvme_ctrl *ctrl)
3038
+static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi,
3039
+ struct nvme_effects_log **log)
24023040 {
3041
+ struct nvme_effects_log *cel = xa_load(&ctrl->cels, csi);
24033042 int ret;
24043043
2405
- if (!ctrl->effects)
2406
- ctrl->effects = kzalloc(sizeof(*ctrl->effects), GFP_KERNEL);
3044
+ if (cel)
3045
+ goto out;
24073046
2408
- if (!ctrl->effects)
2409
- return 0;
3047
+ cel = kzalloc(sizeof(*cel), GFP_KERNEL);
3048
+ if (!cel)
3049
+ return -ENOMEM;
24103050
2411
- ret = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CMD_EFFECTS, 0,
2412
- ctrl->effects, sizeof(*ctrl->effects), 0);
3051
+ ret = nvme_get_log(ctrl, 0x00, NVME_LOG_CMD_EFFECTS, 0, csi,
3052
+ cel, sizeof(*cel), 0);
24133053 if (ret) {
2414
- kfree(ctrl->effects);
2415
- ctrl->effects = NULL;
3054
+ kfree(cel);
3055
+ return ret;
24163056 }
2417
- return ret;
3057
+
3058
+ xa_store(&ctrl->cels, csi, cel, GFP_KERNEL);
3059
+out:
3060
+ *log = cel;
3061
+ return 0;
24183062 }
24193063
24203064 /*
....@@ -2425,7 +3069,6 @@
24253069 int nvme_init_identify(struct nvme_ctrl *ctrl)
24263070 {
24273071 struct nvme_id_ctrl *id;
2428
- u64 cap;
24293072 int ret, page_shift;
24303073 u32 max_hw_sectors;
24313074 bool prev_apst_enabled;
....@@ -2435,16 +3078,11 @@
24353078 dev_err(ctrl->device, "Reading VS failed (%d)\n", ret);
24363079 return ret;
24373080 }
2438
-
2439
- ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &cap);
2440
- if (ret) {
2441
- dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret);
2442
- return ret;
2443
- }
2444
- page_shift = NVME_CAP_MPSMIN(cap) + 12;
3081
+ page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12;
3082
+ ctrl->sqsize = min_t(u16, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize);
24453083
24463084 if (ctrl->vs >= NVME_VS(1, 1, 0))
2447
- ctrl->subsystem = NVME_CAP_NSSRC(cap);
3085
+ ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap);
24483086
24493087 ret = nvme_identify_ctrl(ctrl, &id);
24503088 if (ret) {
....@@ -2453,17 +3091,16 @@
24533091 }
24543092
24553093 if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) {
2456
- ret = nvme_get_effects_log(ctrl);
3094
+ ret = nvme_get_effects_log(ctrl, NVME_CSI_NVM, &ctrl->effects);
24573095 if (ret < 0)
24583096 goto out_free;
24593097 }
24603098
3099
+ if (!(ctrl->ops->flags & NVME_F_FABRICS))
3100
+ ctrl->cntlid = le16_to_cpu(id->cntlid);
3101
+
24613102 if (!ctrl->identified) {
24623103 int i;
2463
-
2464
- ret = nvme_init_subsystem(ctrl, id);
2465
- if (ret)
2466
- goto out_free;
24673104
24683105 /*
24693106 * Check for quirks. Quirk can depend on firmware version,
....@@ -2477,19 +3114,32 @@
24773114 if (quirk_matches(id, &core_quirks[i]))
24783115 ctrl->quirks |= core_quirks[i].quirks;
24793116 }
3117
+
3118
+ ret = nvme_init_subsystem(ctrl, id);
3119
+ if (ret)
3120
+ goto out_free;
24803121 }
3122
+ memcpy(ctrl->subsys->firmware_rev, id->fr,
3123
+ sizeof(ctrl->subsys->firmware_rev));
24813124
24823125 if (force_apst && (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) {
24833126 dev_warn(ctrl->device, "forcibly allowing all power states due to nvme_core.force_apst -- use at your own risk\n");
24843127 ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS;
24853128 }
24863129
3130
+ ctrl->crdt[0] = le16_to_cpu(id->crdt1);
3131
+ ctrl->crdt[1] = le16_to_cpu(id->crdt2);
3132
+ ctrl->crdt[2] = le16_to_cpu(id->crdt3);
3133
+
24873134 ctrl->oacs = le16_to_cpu(id->oacs);
2488
- ctrl->oncs = le16_to_cpup(&id->oncs);
3135
+ ctrl->oncs = le16_to_cpu(id->oncs);
3136
+ ctrl->mtfa = le16_to_cpu(id->mtfa);
24893137 ctrl->oaes = le32_to_cpu(id->oaes);
3138
+ ctrl->wctemp = le16_to_cpu(id->wctemp);
3139
+ ctrl->cctemp = le16_to_cpu(id->cctemp);
3140
+
24903141 atomic_set(&ctrl->abort_limit, id->acl + 1);
24913142 ctrl->vwc = id->vwc;
2492
- ctrl->cntlid = le16_to_cpup(&id->cntlid);
24933143 if (id->mdts)
24943144 max_hw_sectors = 1 << (id->mdts + page_shift - 9);
24953145 else
....@@ -2501,10 +3151,11 @@
25013151 ctrl->sgls = le32_to_cpu(id->sgls);
25023152 ctrl->kas = le16_to_cpu(id->kas);
25033153 ctrl->max_namespaces = le32_to_cpu(id->mnan);
3154
+ ctrl->ctratt = le32_to_cpu(id->ctratt);
25043155
25053156 if (id->rtd3e) {
25063157 /* us -> s */
2507
- u32 transition_time = le32_to_cpu(id->rtd3e) / 1000000;
3158
+ u32 transition_time = le32_to_cpu(id->rtd3e) / USEC_PER_SEC;
25083159
25093160 ctrl->shutdown_timeout = clamp_t(unsigned int, transition_time,
25103161 shutdown_timeout, 60);
....@@ -2542,25 +3193,28 @@
25423193 * admin connect
25433194 */
25443195 if (ctrl->cntlid != le16_to_cpu(id->cntlid)) {
3196
+ dev_err(ctrl->device,
3197
+ "Mismatching cntlid: Connect %u vs Identify "
3198
+ "%u, rejecting\n",
3199
+ ctrl->cntlid, le16_to_cpu(id->cntlid));
25453200 ret = -EINVAL;
25463201 goto out_free;
25473202 }
25483203
2549
- if (!ctrl->opts->discovery_nqn && !ctrl->kas) {
3204
+ if (!nvme_discovery_ctrl(ctrl) && !ctrl->kas) {
25503205 dev_err(ctrl->device,
25513206 "keep-alive support is mandatory for fabrics\n");
25523207 ret = -EINVAL;
25533208 goto out_free;
25543209 }
25553210 } else {
2556
- ctrl->cntlid = le16_to_cpu(id->cntlid);
25573211 ctrl->hmpre = le32_to_cpu(id->hmpre);
25583212 ctrl->hmmin = le32_to_cpu(id->hmmin);
25593213 ctrl->hmminds = le32_to_cpu(id->hmminds);
25603214 ctrl->hmmaxd = le16_to_cpu(id->hmmaxd);
25613215 }
25623216
2563
- ret = nvme_mpath_init(ctrl, id);
3217
+ ret = nvme_mpath_init_identify(ctrl, id);
25643218 kfree(id);
25653219
25663220 if (ret < 0)
....@@ -2583,6 +3237,20 @@
25833237 if (ret < 0)
25843238 return ret;
25853239
3240
+ ret = nvme_configure_acre(ctrl);
3241
+ if (ret < 0)
3242
+ return ret;
3243
+
3244
+ if (!ctrl->identified && !nvme_discovery_ctrl(ctrl)) {
3245
+ /*
3246
+ * Do not return errors unless we are in a controller reset,
3247
+ * the controller works perfectly fine without hwmon.
3248
+ */
3249
+ ret = nvme_hwmon_init(ctrl);
3250
+ if (ret == -EINTR)
3251
+ return ret;
3252
+ }
3253
+
25863254 ctrl->identified = true;
25873255
25883256 return 0;
....@@ -2600,7 +3268,6 @@
26003268
26013269 switch (ctrl->state) {
26023270 case NVME_CTRL_LIVE:
2603
- case NVME_CTRL_ADMIN_ONLY:
26043271 break;
26053272 default:
26063273 return -EWOULDBLOCK;
....@@ -2668,14 +3335,22 @@
26683335 switch (cmd) {
26693336 case NVME_IOCTL_ADMIN_CMD:
26703337 return nvme_user_cmd(ctrl, NULL, argp);
3338
+ case NVME_IOCTL_ADMIN64_CMD:
3339
+ return nvme_user_cmd64(ctrl, NULL, argp);
26713340 case NVME_IOCTL_IO_CMD:
26723341 return nvme_dev_user_cmd(ctrl, argp);
26733342 case NVME_IOCTL_RESET:
3343
+ if (!capable(CAP_SYS_ADMIN))
3344
+ return -EACCES;
26743345 dev_warn(ctrl->device, "resetting controller\n");
26753346 return nvme_reset_ctrl_sync(ctrl);
26763347 case NVME_IOCTL_SUBSYS_RESET:
3348
+ if (!capable(CAP_SYS_ADMIN))
3349
+ return -EACCES;
26773350 return nvme_reset_subsystem(ctrl);
26783351 case NVME_IOCTL_RESCAN:
3352
+ if (!capable(CAP_SYS_ADMIN))
3353
+ return -EACCES;
26793354 nvme_queue_scan(ctrl);
26803355 return 0;
26813356 default:
....@@ -2688,7 +3363,7 @@
26883363 .open = nvme_dev_open,
26893364 .release = nvme_dev_release,
26903365 .unlocked_ioctl = nvme_dev_ioctl,
2691
- .compat_ioctl = nvme_dev_ioctl,
3366
+ .compat_ioctl = compat_ptr_ioctl,
26923367 };
26933368
26943369 static ssize_t nvme_sysfs_reset(struct device *dev,
....@@ -2736,13 +3411,13 @@
27363411 int model_len = sizeof(subsys->model);
27373412
27383413 if (!uuid_is_null(&ids->uuid))
2739
- return sprintf(buf, "uuid.%pU\n", &ids->uuid);
3414
+ return sysfs_emit(buf, "uuid.%pU\n", &ids->uuid);
27403415
27413416 if (memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
2742
- return sprintf(buf, "eui.%16phN\n", ids->nguid);
3417
+ return sysfs_emit(buf, "eui.%16phN\n", ids->nguid);
27433418
27443419 if (memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
2745
- return sprintf(buf, "eui.%8phN\n", ids->eui64);
3420
+ return sysfs_emit(buf, "eui.%8phN\n", ids->eui64);
27463421
27473422 while (serial_len > 0 && (subsys->serial[serial_len - 1] == ' ' ||
27483423 subsys->serial[serial_len - 1] == '\0'))
....@@ -2751,7 +3426,7 @@
27513426 subsys->model[model_len - 1] == '\0'))
27523427 model_len--;
27533428
2754
- return sprintf(buf, "nvme.%04x-%*phN-%*phN-%08x\n", subsys->vendor_id,
3429
+ return sysfs_emit(buf, "nvme.%04x-%*phN-%*phN-%08x\n", subsys->vendor_id,
27553430 serial_len, subsys->serial, model_len, subsys->model,
27563431 head->ns_id);
27573432 }
....@@ -2760,7 +3435,7 @@
27603435 static ssize_t nguid_show(struct device *dev, struct device_attribute *attr,
27613436 char *buf)
27623437 {
2763
- return sprintf(buf, "%pU\n", dev_to_ns_head(dev)->ids.nguid);
3438
+ return sysfs_emit(buf, "%pU\n", dev_to_ns_head(dev)->ids.nguid);
27643439 }
27653440 static DEVICE_ATTR_RO(nguid);
27663441
....@@ -2773,25 +3448,25 @@
27733448 * we have no UUID set
27743449 */
27753450 if (uuid_is_null(&ids->uuid)) {
2776
- printk_ratelimited(KERN_WARNING
2777
- "No UUID available providing old NGUID\n");
2778
- return sprintf(buf, "%pU\n", ids->nguid);
3451
+ dev_warn_ratelimited(dev,
3452
+ "No UUID available providing old NGUID\n");
3453
+ return sysfs_emit(buf, "%pU\n", ids->nguid);
27793454 }
2780
- return sprintf(buf, "%pU\n", &ids->uuid);
3455
+ return sysfs_emit(buf, "%pU\n", &ids->uuid);
27813456 }
27823457 static DEVICE_ATTR_RO(uuid);
27833458
27843459 static ssize_t eui_show(struct device *dev, struct device_attribute *attr,
27853460 char *buf)
27863461 {
2787
- return sprintf(buf, "%8ph\n", dev_to_ns_head(dev)->ids.eui64);
3462
+ return sysfs_emit(buf, "%8ph\n", dev_to_ns_head(dev)->ids.eui64);
27883463 }
27893464 static DEVICE_ATTR_RO(eui);
27903465
27913466 static ssize_t nsid_show(struct device *dev, struct device_attribute *attr,
27923467 char *buf)
27933468 {
2794
- return sprintf(buf, "%d\n", dev_to_ns_head(dev)->ns_id);
3469
+ return sysfs_emit(buf, "%d\n", dev_to_ns_head(dev)->ns_id);
27953470 }
27963471 static DEVICE_ATTR_RO(nsid);
27973472
....@@ -2838,9 +3513,17 @@
28383513 return a->mode;
28393514 }
28403515
2841
-const struct attribute_group nvme_ns_id_attr_group = {
3516
+static const struct attribute_group nvme_ns_id_attr_group = {
28423517 .attrs = nvme_ns_id_attrs,
28433518 .is_visible = nvme_ns_id_attrs_are_visible,
3519
+};
3520
+
3521
+const struct attribute_group *nvme_ns_id_attr_groups[] = {
3522
+ &nvme_ns_id_attr_group,
3523
+#ifdef CONFIG_NVM
3524
+ &nvme_nvm_attr_group,
3525
+#endif
3526
+ NULL,
28443527 };
28453528
28463529 #define nvme_show_str_function(field) \
....@@ -2848,7 +3531,7 @@
28483531 struct device_attribute *attr, char *buf) \
28493532 { \
28503533 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \
2851
- return sprintf(buf, "%.*s\n", \
3534
+ return sysfs_emit(buf, "%.*s\n", \
28523535 (int)sizeof(ctrl->subsys->field), ctrl->subsys->field); \
28533536 } \
28543537 static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
....@@ -2862,21 +3545,20 @@
28623545 struct device_attribute *attr, char *buf) \
28633546 { \
28643547 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \
2865
- return sprintf(buf, "%d\n", ctrl->field); \
3548
+ return sysfs_emit(buf, "%d\n", ctrl->field); \
28663549 } \
28673550 static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
28683551
28693552 nvme_show_int_function(cntlid);
3553
+nvme_show_int_function(numa_node);
3554
+nvme_show_int_function(queue_count);
3555
+nvme_show_int_function(sqsize);
28703556
28713557 static ssize_t nvme_sysfs_delete(struct device *dev,
28723558 struct device_attribute *attr, const char *buf,
28733559 size_t count)
28743560 {
28753561 struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2876
-
2877
- /* Can't delete non-created controllers */
2878
- if (!ctrl->created)
2879
- return -EBUSY;
28803562
28813563 if (device_remove_file_self(dev, attr))
28823564 nvme_delete_ctrl_sync(ctrl);
....@@ -2902,18 +3584,18 @@
29023584 static const char *const state_name[] = {
29033585 [NVME_CTRL_NEW] = "new",
29043586 [NVME_CTRL_LIVE] = "live",
2905
- [NVME_CTRL_ADMIN_ONLY] = "only-admin",
29063587 [NVME_CTRL_RESETTING] = "resetting",
29073588 [NVME_CTRL_CONNECTING] = "connecting",
29083589 [NVME_CTRL_DELETING] = "deleting",
3590
+ [NVME_CTRL_DELETING_NOIO]= "deleting (no IO)",
29093591 [NVME_CTRL_DEAD] = "dead",
29103592 };
29113593
29123594 if ((unsigned)ctrl->state < ARRAY_SIZE(state_name) &&
29133595 state_name[ctrl->state])
2914
- return sprintf(buf, "%s\n", state_name[ctrl->state]);
3596
+ return sysfs_emit(buf, "%s\n", state_name[ctrl->state]);
29153597
2916
- return sprintf(buf, "unknown state\n");
3598
+ return sysfs_emit(buf, "unknown state\n");
29173599 }
29183600
29193601 static DEVICE_ATTR(state, S_IRUGO, nvme_sysfs_show_state, NULL);
....@@ -2928,6 +3610,26 @@
29283610 }
29293611 static DEVICE_ATTR(subsysnqn, S_IRUGO, nvme_sysfs_show_subsysnqn, NULL);
29303612
3613
+static ssize_t nvme_sysfs_show_hostnqn(struct device *dev,
3614
+ struct device_attribute *attr,
3615
+ char *buf)
3616
+{
3617
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3618
+
3619
+ return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->opts->host->nqn);
3620
+}
3621
+static DEVICE_ATTR(hostnqn, S_IRUGO, nvme_sysfs_show_hostnqn, NULL);
3622
+
3623
+static ssize_t nvme_sysfs_show_hostid(struct device *dev,
3624
+ struct device_attribute *attr,
3625
+ char *buf)
3626
+{
3627
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3628
+
3629
+ return snprintf(buf, PAGE_SIZE, "%pU\n", &ctrl->opts->host->id);
3630
+}
3631
+static DEVICE_ATTR(hostid, S_IRUGO, nvme_sysfs_show_hostid, NULL);
3632
+
29313633 static ssize_t nvme_sysfs_show_address(struct device *dev,
29323634 struct device_attribute *attr,
29333635 char *buf)
....@@ -2937,6 +3639,66 @@
29373639 return ctrl->ops->get_address(ctrl, buf, PAGE_SIZE);
29383640 }
29393641 static DEVICE_ATTR(address, S_IRUGO, nvme_sysfs_show_address, NULL);
3642
+
3643
+static ssize_t nvme_ctrl_loss_tmo_show(struct device *dev,
3644
+ struct device_attribute *attr, char *buf)
3645
+{
3646
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3647
+ struct nvmf_ctrl_options *opts = ctrl->opts;
3648
+
3649
+ if (ctrl->opts->max_reconnects == -1)
3650
+ return sysfs_emit(buf, "off\n");
3651
+ return sysfs_emit(buf, "%d\n",
3652
+ opts->max_reconnects * opts->reconnect_delay);
3653
+}
3654
+
3655
+static ssize_t nvme_ctrl_loss_tmo_store(struct device *dev,
3656
+ struct device_attribute *attr, const char *buf, size_t count)
3657
+{
3658
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3659
+ struct nvmf_ctrl_options *opts = ctrl->opts;
3660
+ int ctrl_loss_tmo, err;
3661
+
3662
+ err = kstrtoint(buf, 10, &ctrl_loss_tmo);
3663
+ if (err)
3664
+ return -EINVAL;
3665
+
3666
+ else if (ctrl_loss_tmo < 0)
3667
+ opts->max_reconnects = -1;
3668
+ else
3669
+ opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo,
3670
+ opts->reconnect_delay);
3671
+ return count;
3672
+}
3673
+static DEVICE_ATTR(ctrl_loss_tmo, S_IRUGO | S_IWUSR,
3674
+ nvme_ctrl_loss_tmo_show, nvme_ctrl_loss_tmo_store);
3675
+
3676
+static ssize_t nvme_ctrl_reconnect_delay_show(struct device *dev,
3677
+ struct device_attribute *attr, char *buf)
3678
+{
3679
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3680
+
3681
+ if (ctrl->opts->reconnect_delay == -1)
3682
+ return sysfs_emit(buf, "off\n");
3683
+ return sysfs_emit(buf, "%d\n", ctrl->opts->reconnect_delay);
3684
+}
3685
+
3686
+static ssize_t nvme_ctrl_reconnect_delay_store(struct device *dev,
3687
+ struct device_attribute *attr, const char *buf, size_t count)
3688
+{
3689
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3690
+ unsigned int v;
3691
+ int err;
3692
+
3693
+ err = kstrtou32(buf, 10, &v);
3694
+ if (err)
3695
+ return err;
3696
+
3697
+ ctrl->opts->reconnect_delay = v;
3698
+ return count;
3699
+}
3700
+static DEVICE_ATTR(reconnect_delay, S_IRUGO | S_IWUSR,
3701
+ nvme_ctrl_reconnect_delay_show, nvme_ctrl_reconnect_delay_store);
29403702
29413703 static struct attribute *nvme_dev_attrs[] = {
29423704 &dev_attr_reset_controller.attr,
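
The ctrl_loss_tmo attribute added above is stored internally as a reconnect count rather than in seconds: the store path divides by reconnect_delay (rounding up) and the show path multiplies back. A sketch with hypothetical values:

/* Hypothetical values; mirrors the ctrl_loss_tmo store/show round trip. */
#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	int reconnect_delay = 10;	/* seconds between reconnect attempts */
	int ctrl_loss_tmo = 600;	/* value written by the admin, in seconds */

	int max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo, reconnect_delay);
	printf("max_reconnects = %d\n", max_reconnects);	/* 60 */
	printf("ctrl_loss_tmo  = %d\n",
	       max_reconnects * reconnect_delay);		/* reported back: 600 */
	return 0;
}
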
....@@ -2950,6 +3712,13 @@
29503712 &dev_attr_subsysnqn.attr,
29513713 &dev_attr_address.attr,
29523714 &dev_attr_state.attr,
3715
+ &dev_attr_numa_node.attr,
3716
+ &dev_attr_queue_count.attr,
3717
+ &dev_attr_sqsize.attr,
3718
+ &dev_attr_hostnqn.attr,
3719
+ &dev_attr_hostid.attr,
3720
+ &dev_attr_ctrl_loss_tmo.attr,
3721
+ &dev_attr_reconnect_delay.attr,
29533722 NULL
29543723 };
29553724
....@@ -2962,6 +3731,14 @@
29623731 if (a == &dev_attr_delete_controller.attr && !ctrl->ops->delete_ctrl)
29633732 return 0;
29643733 if (a == &dev_attr_address.attr && !ctrl->ops->get_address)
3734
+ return 0;
3735
+ if (a == &dev_attr_hostnqn.attr && !ctrl->opts)
3736
+ return 0;
3737
+ if (a == &dev_attr_hostid.attr && !ctrl->opts)
3738
+ return 0;
3739
+ if (a == &dev_attr_ctrl_loss_tmo.attr && !ctrl->opts)
3740
+ return 0;
3741
+ if (a == &dev_attr_reconnect_delay.attr && !ctrl->opts)
29653742 return 0;
29663743
29673744 return a->mode;
....@@ -2977,7 +3754,7 @@
29773754 NULL,
29783755 };
29793756
2980
-static struct nvme_ns_head *__nvme_find_ns_head(struct nvme_subsystem *subsys,
3757
+static struct nvme_ns_head *nvme_find_ns_head(struct nvme_subsystem *subsys,
29813758 unsigned nsid)
29823759 {
29833760 struct nvme_ns_head *h;
....@@ -2992,17 +3769,15 @@
29923769 return NULL;
29933770 }
29943771
2995
-static int __nvme_check_ids(struct nvme_subsystem *subsys,
2996
- struct nvme_ns_head *new)
3772
+static int nvme_subsys_check_duplicate_ids(struct nvme_subsystem *subsys,
3773
+ struct nvme_ns_ids *ids)
29973774 {
29983775 struct nvme_ns_head *h;
29993776
30003777 lockdep_assert_held(&subsys->lock);
30013778
30023779 list_for_each_entry(h, &subsys->nsheads, entry) {
3003
- if (nvme_ns_ids_valid(&new->ids) &&
3004
- !list_empty(&h->list) &&
3005
- nvme_ns_ids_equal(&new->ids, &h->ids))
3780
+ if (nvme_ns_ids_valid(ids) && nvme_ns_ids_equal(ids, &h->ids))
30063781 return -EINVAL;
30073782 }
30083783
....@@ -3010,12 +3785,17 @@
30103785 }
30113786
30123787 static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
3013
- unsigned nsid, struct nvme_id_ns *id)
3788
+ unsigned nsid, struct nvme_ns_ids *ids)
30143789 {
30153790 struct nvme_ns_head *head;
3791
+ size_t size = sizeof(*head);
30163792 int ret = -ENOMEM;
30173793
3018
- head = kzalloc(sizeof(*head), GFP_KERNEL);
3794
+#ifdef CONFIG_NVME_MULTIPATH
3795
+ size += num_possible_nodes() * sizeof(struct nvme_ns *);
3796
+#endif
3797
+
3798
+ head = kzalloc(size, GFP_KERNEL);
30193799 if (!head)
30203800 goto out;
30213801 ret = ida_simple_get(&ctrl->subsys->ns_ida, 1, 0, GFP_KERNEL);
....@@ -3028,16 +3808,22 @@
30283808 goto out_ida_remove;
30293809 head->subsys = ctrl->subsys;
30303810 head->ns_id = nsid;
3811
+ head->ids = *ids;
30313812 kref_init(&head->ref);
30323813
3033
- nvme_report_ns_ids(ctrl, nsid, id, &head->ids);
3034
-
3035
- ret = __nvme_check_ids(ctrl->subsys, head);
3814
+ ret = nvme_subsys_check_duplicate_ids(ctrl->subsys, &head->ids);
30363815 if (ret) {
30373816 dev_err(ctrl->device,
30383817 "duplicate IDs for nsid %d\n", nsid);
30393818 goto out_cleanup_srcu;
30403819 }
3820
+
3821
+ if (head->ids.csi) {
3822
+ ret = nvme_get_effects_log(ctrl, head->ids.csi, &head->effects);
3823
+ if (ret)
3824
+ goto out_cleanup_srcu;
3825
+ } else
3826
+ head->effects = ctrl->effects;
30413827
30423828 ret = nvme_mpath_alloc_disk(ctrl, head);
30433829 if (ret)
....@@ -3055,56 +3841,55 @@
30553841 out_free_head:
30563842 kfree(head);
30573843 out:
3844
+ if (ret > 0)
3845
+ ret = blk_status_to_errno(nvme_error_status(ret));
30583846 return ERR_PTR(ret);
30593847 }
30603848
30613849 static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid,
3062
- struct nvme_id_ns *id)
3850
+ struct nvme_ns_ids *ids, bool is_shared)
30633851 {
30643852 struct nvme_ctrl *ctrl = ns->ctrl;
3065
- bool is_shared = id->nmic & (1 << 0);
30663853 struct nvme_ns_head *head = NULL;
30673854 int ret = 0;
30683855
30693856 mutex_lock(&ctrl->subsys->lock);
3070
- if (is_shared)
3071
- head = __nvme_find_ns_head(ctrl->subsys, nsid);
3857
+ head = nvme_find_ns_head(ctrl->subsys, nsid);
30723858 if (!head) {
3073
- head = nvme_alloc_ns_head(ctrl, nsid, id);
3859
+ head = nvme_alloc_ns_head(ctrl, nsid, ids);
30743860 if (IS_ERR(head)) {
30753861 ret = PTR_ERR(head);
30763862 goto out_unlock;
30773863 }
3864
+ head->shared = is_shared;
30783865 } else {
3079
- struct nvme_ns_ids ids;
3080
-
3081
- nvme_report_ns_ids(ctrl, nsid, id, &ids);
3082
- if (!nvme_ns_ids_equal(&head->ids, &ids)) {
3866
+ ret = -EINVAL;
3867
+ if (!is_shared || !head->shared) {
3868
+ dev_err(ctrl->device,
3869
+ "Duplicate unshared namespace %d\n", nsid);
3870
+ goto out_put_ns_head;
3871
+ }
3872
+ if (!nvme_ns_ids_equal(&head->ids, ids)) {
30833873 dev_err(ctrl->device,
30843874 "IDs don't match for shared namespace %d\n",
30853875 nsid);
3086
- ret = -EINVAL;
3087
- goto out_unlock;
3876
+ goto out_put_ns_head;
30883877 }
30893878 }
30903879
30913880 list_add_tail(&ns->siblings, &head->list);
30923881 ns->head = head;
3882
+ mutex_unlock(&ctrl->subsys->lock);
3883
+ return 0;
30933884
3885
+out_put_ns_head:
3886
+ nvme_put_ns_head(head);
30943887 out_unlock:
30953888 mutex_unlock(&ctrl->subsys->lock);
30963889 return ret;
30973890 }
30983891
3099
-static int ns_cmp(void *priv, struct list_head *a, struct list_head *b)
3100
-{
3101
- struct nvme_ns *nsa = container_of(a, struct nvme_ns, list);
3102
- struct nvme_ns *nsb = container_of(b, struct nvme_ns, list);
3103
-
3104
- return nsa->head->ns_id - nsb->head->ns_id;
3105
-}
3106
-
3107
-static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
3892
+struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
31083893 {
31093894 struct nvme_ns *ns, *ret = NULL;
31103895
....@@ -3122,76 +3907,59 @@
31223907 up_read(&ctrl->namespaces_rwsem);
31233908 return ret;
31243909 }
3910
+EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, NVME_TARGET_PASSTHRU);
31253911
3126
-static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns)
3912
+/*
3913
+ * Add the namespace to the controller list while keeping the list ordered.
3914
+ */
3915
+static void nvme_ns_add_to_ctrl_list(struct nvme_ns *ns)
31273916 {
3128
- struct streams_directive_params s;
3129
- int ret;
3917
+ struct nvme_ns *tmp;
31303918
3131
- if (!ctrl->nr_streams)
3132
- return 0;
3133
-
3134
- ret = nvme_get_stream_params(ctrl, &s, ns->head->ns_id);
3135
- if (ret)
3136
- return ret;
3137
-
3138
- ns->sws = le32_to_cpu(s.sws);
3139
- ns->sgs = le16_to_cpu(s.sgs);
3140
-
3141
- if (ns->sws) {
3142
- unsigned int bs = 1 << ns->lba_shift;
3143
-
3144
- blk_queue_io_min(ns->queue, bs * ns->sws);
3145
- if (ns->sgs)
3146
- blk_queue_io_opt(ns->queue, bs * ns->sws * ns->sgs);
3919
+ list_for_each_entry_reverse(tmp, &ns->ctrl->namespaces, list) {
3920
+ if (tmp->head->ns_id < ns->head->ns_id) {
3921
+ list_add(&ns->list, &tmp->list);
3922
+ return;
3923
+ }
31473924 }
3148
-
3149
- return 0;
3925
+ list_add(&ns->list, &ns->ctrl->namespaces);
31503926 }
31513927
3152
-static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
3928
+static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid,
3929
+ struct nvme_ns_ids *ids)
31533930 {
31543931 struct nvme_ns *ns;
31553932 struct gendisk *disk;
31563933 struct nvme_id_ns *id;
31573934 char disk_name[DISK_NAME_LEN];
3158
- int node = dev_to_node(ctrl->dev), flags = GENHD_FL_EXT_DEVT;
3935
+ int node = ctrl->numa_node, flags = GENHD_FL_EXT_DEVT, ret;
3936
+
3937
+ if (nvme_identify_ns(ctrl, nsid, ids, &id))
3938
+ return;
31593939
31603940 ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
31613941 if (!ns)
3162
- return;
3942
+ goto out_free_id;
31633943
31643944 ns->queue = blk_mq_init_queue(ctrl->tagset);
31653945 if (IS_ERR(ns->queue))
31663946 goto out_free_ns;
3947
+
3948
+ if (ctrl->opts && ctrl->opts->data_digest)
3949
+ blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, ns->queue);
3950
+
31673951 blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue);
3952
+ if (ctrl->ops->flags & NVME_F_PCI_P2PDMA)
3953
+ blk_queue_flag_set(QUEUE_FLAG_PCI_P2PDMA, ns->queue);
3954
+
31683955 ns->queue->queuedata = ns;
31693956 ns->ctrl = ctrl;
3170
-
31713957 kref_init(&ns->kref);
3172
- ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */
31733958
3174
- blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
3175
- nvme_set_queue_limits(ctrl, ns->queue);
3176
-
3177
- id = nvme_identify_ns(ctrl, nsid);
3178
- if (!id)
3959
+ ret = nvme_init_ns_head(ns, nsid, ids, id->nmic & NVME_NS_NMIC_SHARED);
3960
+ if (ret)
31793961 goto out_free_queue;
3180
-
3181
- if (id->ncap == 0)
3182
- goto out_free_id;
3183
-
3184
- if (nvme_init_ns_head(ns, nsid, id))
3185
- goto out_free_id;
3186
- nvme_setup_streams_ns(ctrl, ns);
31873962 nvme_set_disk_name(disk_name, ns, ctrl, &flags);
3188
-
3189
- if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) {
3190
- if (nvme_nvm_register(ns, disk_name, node)) {
3191
- dev_warn(ctrl->device, "LightNVM init failure\n");
3192
- goto out_unlink_ns;
3193
- }
3194
- }
31953963
31963964 disk = alloc_disk_node(0, node);
31973965 if (!disk)
@@ -3204,38 +3972,46 @@
 	memcpy(disk->disk_name, disk_name, DISK_NAME_LEN);
 	ns->disk = disk;
 
-	__nvme_revalidate_disk(disk, id);
+	if (nvme_update_ns_info(ns, id))
+		goto out_put_disk;
+
+	if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) {
+		ret = nvme_nvm_register(ns, disk_name, node);
+		if (ret) {
+			dev_warn(ctrl->device, "LightNVM init failure\n");
+			goto out_put_disk;
+		}
+	}
 
 	down_write(&ctrl->namespaces_rwsem);
-	list_add_tail(&ns->list, &ctrl->namespaces);
+	nvme_ns_add_to_ctrl_list(ns);
 	up_write(&ctrl->namespaces_rwsem);
-
 	nvme_get_ctrl(ctrl);
 
-	device_add_disk(ctrl->device, ns->disk);
-	if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj,
-					&nvme_ns_id_attr_group))
-		pr_warn("%s: failed to create sysfs group for identification\n",
-			ns->disk->disk_name);
-	if (ns->ndev && nvme_nvm_register_sysfs(ns))
-		pr_warn("%s: failed to register lightnvm sysfs group for identification\n",
-			ns->disk->disk_name);
+	device_add_disk(ctrl->device, ns->disk, nvme_ns_id_attr_groups);
 
 	nvme_mpath_add_disk(ns, id);
-	nvme_fault_inject_init(ns);
+	nvme_fault_inject_init(&ns->fault_inject, ns->disk->disk_name);
 	kfree(id);
 
 	return;
+ out_put_disk:
+	/* prevent double queue cleanup */
+	ns->disk->queue = NULL;
+	put_disk(ns->disk);
  out_unlink_ns:
 	mutex_lock(&ctrl->subsys->lock);
 	list_del_rcu(&ns->siblings);
+	if (list_empty(&ns->head->list))
+		list_del_init(&ns->head->entry);
 	mutex_unlock(&ctrl->subsys->lock);
- out_free_id:
-	kfree(id);
+	nvme_put_ns_head(ns->head);
  out_free_queue:
 	blk_cleanup_queue(ns->queue);
  out_free_ns:
 	kfree(ns);
+ out_free_id:
+	kfree(id);
 }
 
 static void nvme_ns_remove(struct nvme_ns *ns)
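
The error labels above follow the usual kernel unwind idiom: each failure jumps to the label that releases only what was acquired before it, in reverse order, and the new out_put_disk path clears ns->disk->queue first so releasing the gendisk cannot tear the request queue down a second time. A minimal user-space sketch of the same unwind pattern, with made-up resources standing in for the driver's:

#include <stdio.h>
#include <stdlib.h>

/* Illustrative only: acquire three resources, unwinding in reverse on failure. */
static int setup(void)
{
	char *id = malloc(64);
	if (!id)
		return -1;

	char *queue = malloc(128);
	if (!queue)
		goto out_free_id;

	char *ns = malloc(256);
	if (!ns)
		goto out_free_queue;

	printf("all resources acquired\n");
	free(ns);
	free(queue);
	free(id);
	return 0;

out_free_queue:
	free(queue);
out_free_id:
	free(id);
	return -1;
}

int main(void)
{
	return setup() ? EXIT_FAILURE : EXIT_SUCCESS;
}
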
@@ -3243,20 +4019,20 @@
 	if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
 		return;
 
-	nvme_fault_inject_fini(ns);
+	set_capacity(ns->disk, 0);
+	nvme_fault_inject_fini(&ns->fault_inject);
 
 	mutex_lock(&ns->ctrl->subsys->lock);
 	list_del_rcu(&ns->siblings);
+	if (list_empty(&ns->head->list))
+		list_del_init(&ns->head->entry);
 	mutex_unlock(&ns->ctrl->subsys->lock);
+
 	synchronize_rcu(); /* guarantee not available in head->list */
 	nvme_mpath_clear_current_path(ns);
 	synchronize_srcu(&ns->head->srcu); /* wait for concurrent submissions */
 
-	if (ns->disk && ns->disk->flags & GENHD_FL_UP) {
-		sysfs_remove_group(&disk_to_dev(ns->disk)->kobj,
-					&nvme_ns_id_attr_group);
-		if (ns->ndev)
-			nvme_nvm_unregister_sysfs(ns);
+	if (ns->disk->flags & GENHD_FL_UP) {
 		del_gendisk(ns->disk);
 		blk_cleanup_queue(ns->queue);
 		if (blk_get_integrity(ns->disk))
@@ -3271,17 +4047,91 @@
 	nvme_put_ns(ns);
 }
 
-static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid)
+static void nvme_ns_remove_by_nsid(struct nvme_ctrl *ctrl, u32 nsid)
 {
+	struct nvme_ns *ns = nvme_find_get_ns(ctrl, nsid);
+
+	if (ns) {
+		nvme_ns_remove(ns);
+		nvme_put_ns(ns);
+	}
+}
+
+static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_ids *ids)
+{
+	struct nvme_id_ns *id;
+	int ret = NVME_SC_INVALID_NS | NVME_SC_DNR;
+
+	if (test_bit(NVME_NS_DEAD, &ns->flags))
+		goto out;
+
+	ret = nvme_identify_ns(ns->ctrl, ns->head->ns_id, ids, &id);
+	if (ret)
+		goto out;
+
+	ret = NVME_SC_INVALID_NS | NVME_SC_DNR;
+	if (!nvme_ns_ids_equal(&ns->head->ids, ids)) {
+		dev_err(ns->ctrl->device,
+			"identifiers changed for nsid %d\n", ns->head->ns_id);
+		goto out_free_id;
+	}
+
+	ret = nvme_update_ns_info(ns, id);
+
+out_free_id:
+	kfree(id);
+out:
+	/*
+	 * Only remove the namespace if we got a fatal error back from the
+	 * device, otherwise ignore the error and just move on.
+	 *
+	 * TODO: we should probably schedule a delayed retry here.
+	 */
+	if (ret > 0 && (ret & NVME_SC_DNR))
+		nvme_ns_remove(ns);
+	else
+		revalidate_disk_size(ns->disk, true);
+}
+
+static void nvme_validate_or_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
+{
+	struct nvme_ns_ids ids = { };
 	struct nvme_ns *ns;
+
+	if (nvme_identify_ns_descs(ctrl, nsid, &ids))
+		return;
 
 	ns = nvme_find_get_ns(ctrl, nsid);
 	if (ns) {
-		if (ns->disk && revalidate_disk(ns->disk))
-			nvme_ns_remove(ns);
+		nvme_validate_ns(ns, &ids);
 		nvme_put_ns(ns);
-	} else
-		nvme_alloc_ns(ctrl, nsid);
+		return;
+	}
+
+	switch (ids.csi) {
+	case NVME_CSI_NVM:
+		nvme_alloc_ns(ctrl, nsid, &ids);
+		break;
+	case NVME_CSI_ZNS:
+		if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
+			dev_warn(ctrl->device,
+				"nsid %u not supported without CONFIG_BLK_DEV_ZONED\n",
+				nsid);
+			break;
+		}
+		if (!nvme_multi_css(ctrl)) {
+			dev_warn(ctrl->device,
				"command set not reported for nsid: %d\n",
+				nsid);
+			break;
+		}
+		nvme_alloc_ns(ctrl, nsid, &ids);
+		break;
+	default:
+		dev_warn(ctrl->device, "unknown csi %u for nsid %u\n",
+			ids.csi, nsid);
+		break;
+	}
 }
 
 static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
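
In nvme_validate_ns() above, a positive return value is an NVMe status code coming back from the device, while a negative one is a Linux errno; only a status with the DNR (Do Not Retry) bit set tears the namespace down. A standalone sketch of that test; the 0x4000 constant mirrors the kernel's NVME_SC_DNR and the sample status value is made up:

#include <stdbool.h>
#include <stdio.h>

#define NVME_SC_DNR 0x4000	/* assumed value, mirrors include/linux/nvme.h */

/* Positive values model NVMe status codes, negative values model errnos. */
static bool is_fatal_ns_error(int ret)
{
	return ret > 0 && (ret & NVME_SC_DNR);
}

int main(void)
{
	/* e.g. "Invalid Namespace or Format" with DNR set -> fatal */
	printf("%d\n", is_fatal_ns_error(0x0b | NVME_SC_DNR));
	/* a transport errno such as -EIO -> keep the namespace */
	printf("%d\n", is_fatal_ns_error(-5));
	return 0;
}
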
@@ -3302,39 +4152,41 @@
 
 }
 
-static int nvme_scan_ns_list(struct nvme_ctrl *ctrl, unsigned nn)
+static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
 {
-	struct nvme_ns *ns;
+	const int nr_entries = NVME_IDENTIFY_DATA_SIZE / sizeof(__le32);
 	__le32 *ns_list;
-	unsigned i, j, nsid, prev = 0;
-	unsigned num_lists = DIV_ROUND_UP_ULL((u64)nn, 1024);
-	int ret = 0;
+	u32 prev = 0;
+	int ret = 0, i;
+
+	if (nvme_ctrl_limited_cns(ctrl))
+		return -EOPNOTSUPP;
 
 	ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
 	if (!ns_list)
 		return -ENOMEM;
 
-	for (i = 0; i < num_lists; i++) {
-		ret = nvme_identify_ns_list(ctrl, prev, ns_list);
+	for (;;) {
+		struct nvme_command cmd = {
+			.identify.opcode = nvme_admin_identify,
+			.identify.cns = NVME_ID_CNS_NS_ACTIVE_LIST,
+			.identify.nsid = cpu_to_le32(prev),
+		};
+
+		ret = nvme_submit_sync_cmd(ctrl->admin_q, &cmd, ns_list,
+				NVME_IDENTIFY_DATA_SIZE);
 		if (ret)
 			goto free;
 
-		for (j = 0; j < min(nn, 1024U); j++) {
-			nsid = le32_to_cpu(ns_list[j]);
-			if (!nsid)
+		for (i = 0; i < nr_entries; i++) {
+			u32 nsid = le32_to_cpu(ns_list[i]);
+
+			if (!nsid)	/* end of the list? */
				goto out;
-
-			nvme_validate_ns(ctrl, nsid);
-
-			while (++prev < nsid) {
-				ns = nvme_find_get_ns(ctrl, prev);
-				if (ns) {
-					nvme_ns_remove(ns);
-					nvme_put_ns(ns);
-				}
-			}
+			nvme_validate_or_alloc_ns(ctrl, nsid);
+			while (++prev < nsid)
+				nvme_ns_remove_by_nsid(ctrl, prev);
 		}
-		nn -= j;
 	}
 out:
 	nvme_remove_invalid_namespaces(ctrl, prev);
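
The rewritten nvme_scan_ns_list() walks the Active Namespace ID list a page at a time: each Identify returns up to 1024 NSIDs strictly greater than the cursor passed in, a zero entry ends the list, and advancing prev past any gaps removes namespaces that have disappeared. A small user-space model of that cursor walk; the active[] contents and page size are invented for illustration:

#include <stdint.h>
#include <stdio.h>

static const uint32_t active[] = { 1, 2, 5, 9 };	/* pretend active NSIDs */
#define PAGE_ENTRIES 3	/* the real list holds 1024 entries per page */

/* Return the NSIDs greater than cursor, zero-padded like the Identify data. */
static void identify_active_list(uint32_t cursor, uint32_t *page)
{
	int n = 0;

	for (unsigned int i = 0; i < sizeof(active) / sizeof(active[0]); i++)
		if (active[i] > cursor && n < PAGE_ENTRIES)
			page[n++] = active[i];
	while (n < PAGE_ENTRIES)
		page[n++] = 0;
}

int main(void)
{
	uint32_t page[PAGE_ENTRIES], prev = 0;

	for (;;) {
		identify_active_list(prev, page);
		for (int i = 0; i < PAGE_ENTRIES; i++) {
			uint32_t nsid = page[i];

			if (!nsid)	/* end of the list */
				return 0;
			printf("validate or allocate nsid %u\n", nsid);
			while (++prev < nsid)
				printf("remove stale nsid %u\n", prev);
		}
	}
}
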
@@ -3343,12 +4195,18 @@
 	return ret;
 }
 
-static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl, unsigned nn)
+static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl)
 {
-	unsigned i;
+	struct nvme_id_ctrl *id;
+	u32 nn, i;
+
+	if (nvme_identify_ctrl(ctrl, &id))
+		return;
+	nn = le32_to_cpu(id->nn);
+	kfree(id);
 
 	for (i = 1; i <= nn; i++)
-		nvme_validate_ns(ctrl, i);
+		nvme_validate_or_alloc_ns(ctrl, i);
 
 	nvme_remove_invalid_namespaces(ctrl, nn);
 }
@@ -3369,8 +4227,8 @@
 	 * raced with us in reading the log page, which could cause us to miss
 	 * updates.
 	 */
-	error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0, log,
-			log_size, 0);
+	error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0,
+			NVME_CSI_NVM, log, log_size, 0);
 	if (error)
 		dev_warn(ctrl->device,
 			"reading changed ns log failed: %d\n", error);
@@ -3382,35 +4240,20 @@
 {
 	struct nvme_ctrl *ctrl =
 		container_of(work, struct nvme_ctrl, scan_work);
-	struct nvme_id_ctrl *id;
-	unsigned nn;
 
-	if (ctrl->state != NVME_CTRL_LIVE)
+	/* No tagset on a live ctrl means IO queues could not be created */
+	if (ctrl->state != NVME_CTRL_LIVE || !ctrl->tagset)
 		return;
-
-	WARN_ON_ONCE(!ctrl->tagset);
 
 	if (test_and_clear_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events)) {
 		dev_info(ctrl->device, "rescanning namespaces.\n");
 		nvme_clear_changed_ns_log(ctrl);
 	}
 
-	if (nvme_identify_ctrl(ctrl, &id))
-		return;
-
 	mutex_lock(&ctrl->scan_lock);
-	nn = le32_to_cpu(id->nn);
-	if (!nvme_ctrl_limited_cns(ctrl)) {
-		if (!nvme_scan_ns_list(ctrl, nn))
-			goto out_free_id;
-	}
-	nvme_scan_ns_sequential(ctrl, nn);
-out_free_id:
+	if (nvme_scan_ns_list(ctrl) != 0)
+		nvme_scan_ns_sequential(ctrl);
 	mutex_unlock(&ctrl->scan_lock);
-	kfree(id);
-	down_write(&ctrl->namespaces_rwsem);
-	list_sort(NULL, &ctrl->namespaces, ns_cmp);
-	up_write(&ctrl->namespaces_rwsem);
 }
 
 /*
@@ -3422,6 +4265,13 @@
 {
 	struct nvme_ns *ns, *next;
 	LIST_HEAD(ns_list);
+
+	/*
+	 * Make sure to requeue I/O to all namespaces, as this I/O may
+	 * result from the scan itself and must complete for scan_work
+	 * to make progress.
+	 */
+	nvme_mpath_clear_ctrl_paths(ctrl);
 
 	/* prevent racing with ns scanning */
 	flush_work(&ctrl->scan_work);
@@ -3435,6 +4285,9 @@
 	if (ctrl->state == NVME_CTRL_DEAD)
 		nvme_kill_queues(ctrl);
 
+	/* this is a no-op when called from the controller reset handler */
+	nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO);
+
 	down_write(&ctrl->namespaces_rwsem);
 	list_splice_init(&ctrl->namespaces, &ns_list);
 	up_write(&ctrl->namespaces_rwsem);
@@ -3443,6 +4296,33 @@
 		nvme_ns_remove(ns);
 }
 EXPORT_SYMBOL_GPL(nvme_remove_namespaces);
+
+static int nvme_class_uevent(struct device *dev, struct kobj_uevent_env *env)
+{
+	struct nvme_ctrl *ctrl =
+		container_of(dev, struct nvme_ctrl, ctrl_device);
+	struct nvmf_ctrl_options *opts = ctrl->opts;
+	int ret;
+
+	ret = add_uevent_var(env, "NVME_TRTYPE=%s", ctrl->ops->name);
+	if (ret)
+		return ret;
+
+	if (opts) {
+		ret = add_uevent_var(env, "NVME_TRADDR=%s", opts->traddr);
+		if (ret)
+			return ret;
+
+		ret = add_uevent_var(env, "NVME_TRSVCID=%s",
+				opts->trsvcid ?: "none");
+		if (ret)
+			return ret;
+
+		ret = add_uevent_var(env, "NVME_HOST_TRADDR=%s",
+				opts->host_traddr ?: "none");
+	}
+	return ret;
+}
 
 static void nvme_aen_uevent(struct nvme_ctrl *ctrl)
 {
@@ -3498,8 +4378,8 @@
 	if (!log)
 		return;
 
-	if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, log,
-			sizeof(*log), 0))
+	if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, NVME_CSI_NVM,
+			log, sizeof(*log), 0))
 		dev_warn(ctrl->device, "Get FW SLOT INFO log error\n");
 	kfree(log);
 }
@@ -3522,13 +4402,13 @@
 		if (time_after(jiffies, fw_act_timeout)) {
 			dev_warn(ctrl->device,
 				"Fw activation timeout, reset controller\n");
-			nvme_reset_ctrl(ctrl);
-			break;
+			nvme_try_sched_reset(ctrl);
+			return;
 		}
 		msleep(100);
 	}
 
-	if (ctrl->state != NVME_CTRL_LIVE)
+	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE))
 		return;
 
 	nvme_start_queues(ctrl);
@@ -3536,15 +4416,33 @@
 	nvme_get_fw_slot_info(ctrl);
 }
 
+static u32 nvme_aer_type(u32 result)
+{
+	return result & 0x7;
+}
+
+static u32 nvme_aer_subtype(u32 result)
+{
+	return (result & 0xff00) >> 8;
+}
+
 static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
 {
-	switch ((result & 0xff00) >> 8) {
+	u32 aer_notice_type = nvme_aer_subtype(result);
+
+	switch (aer_notice_type) {
 	case NVME_AER_NOTICE_NS_CHANGED:
 		set_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events);
 		nvme_queue_scan(ctrl);
 		break;
 	case NVME_AER_NOTICE_FW_ACT_STARTING:
-		queue_work(nvme_wq, &ctrl->fw_act_work);
+		/*
+		 * We are (ab)using the RESETTING state to prevent subsequent
+		 * recovery actions from interfering with the controller's
+		 * firmware activation.
+		 */
+		if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
+			queue_work(nvme_wq, &ctrl->fw_act_work);
 		break;
 #ifdef CONFIG_NVME_MULTIPATH
 	case NVME_AER_NOTICE_ANA:
@@ -3553,24 +4451,45 @@
 		queue_work(nvme_wq, &ctrl->ana_work);
 		break;
 #endif
+	case NVME_AER_NOTICE_DISC_CHANGED:
+		ctrl->aen_result = result;
+		break;
 	default:
 		dev_warn(ctrl->device, "async event result %08x\n", result);
 	}
+}
+
+static void nvme_handle_aer_persistent_error(struct nvme_ctrl *ctrl)
+{
+	dev_warn(ctrl->device, "resetting controller due to AER\n");
+	nvme_reset_ctrl(ctrl);
 }
 
 void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
 		volatile union nvme_result *res)
 {
 	u32 result = le32_to_cpu(res->u32);
+	u32 aer_type = nvme_aer_type(result);
+	u32 aer_subtype = nvme_aer_subtype(result);
 
 	if (le16_to_cpu(status) >> 1 != NVME_SC_SUCCESS)
 		return;
 
-	switch (result & 0x7) {
+	trace_nvme_async_event(ctrl, result);
+	switch (aer_type) {
 	case NVME_AER_NOTICE:
 		nvme_handle_aen_notice(ctrl, result);
 		break;
 	case NVME_AER_ERROR:
+		/*
+		 * For a persistent internal error, don't run async_event_work
+		 * to submit a new AER. The controller reset will do it.
+		 */
+		if (aer_subtype == NVME_AER_ERROR_PERSIST_INT_ERR) {
+			nvme_handle_aer_persistent_error(ctrl);
+			return;
+		}
+		fallthrough;
 	case NVME_AER_SMART:
 	case NVME_AER_CSS:
 	case NVME_AER_VS:
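
The helpers added above split the AER completion dword the way the NVMe spec lays it out: bits 2:0 carry the event type and bits 15:8 the notice subtype. A standalone parse of a sample value; the constants mirror the kernel's NVME_AER_* definitions and the result value itself is made up:

#include <stdint.h>
#include <stdio.h>

#define NVME_AER_NOTICE			0x2	/* event type: notice */
#define NVME_AER_NOTICE_NS_CHANGED	0x0	/* notice subtype */

int main(void)
{
	uint32_t result = 0x00000002;	/* type = notice, subtype = ns changed */

	uint32_t aer_type = result & 0x7;		/* bits 2:0 */
	uint32_t aer_subtype = (result & 0xff00) >> 8;	/* bits 15:8 */

	if (aer_type == NVME_AER_NOTICE && aer_subtype == NVME_AER_NOTICE_NS_CHANGED)
		printf("namespace attributes changed -> schedule a rescan\n");
	return 0;
}
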
@@ -3596,25 +4515,40 @@
 
 void nvme_start_ctrl(struct nvme_ctrl *ctrl)
 {
-	if (ctrl->kato)
-		nvme_start_keep_alive(ctrl);
+	nvme_start_keep_alive(ctrl);
+
+	nvme_enable_aen(ctrl);
 
 	if (ctrl->queue_count > 1) {
 		nvme_queue_scan(ctrl);
-		nvme_enable_aen(ctrl);
-		queue_work(nvme_wq, &ctrl->async_event_work);
 		nvme_start_queues(ctrl);
+		nvme_mpath_update(ctrl);
 	}
-	ctrl->created = true;
 }
 EXPORT_SYMBOL_GPL(nvme_start_ctrl);
 
 void nvme_uninit_ctrl(struct nvme_ctrl *ctrl)
 {
+	nvme_hwmon_exit(ctrl);
+	nvme_fault_inject_fini(&ctrl->fault_inject);
 	dev_pm_qos_hide_latency_tolerance(ctrl->device);
 	cdev_device_del(&ctrl->cdev, ctrl->device);
+	nvme_put_ctrl(ctrl);
 }
 EXPORT_SYMBOL_GPL(nvme_uninit_ctrl);
+
+static void nvme_free_cels(struct nvme_ctrl *ctrl)
+{
+	struct nvme_effects_log *cel;
+	unsigned long i;
+
+	xa_for_each(&ctrl->cels, i, cel) {
+		xa_erase(&ctrl->cels, i);
+		kfree(cel);
+	}
+
+	xa_destroy(&ctrl->cels);
+}
 
 static void nvme_free_ctrl(struct device *dev)
 {
@@ -3622,16 +4556,18 @@
 		container_of(dev, struct nvme_ctrl, ctrl_device);
 	struct nvme_subsystem *subsys = ctrl->subsys;
 
-	ida_simple_remove(&nvme_instance_ida, ctrl->instance);
-	kfree(ctrl->effects);
+	if (!subsys || ctrl->instance != subsys->instance)
+		ida_simple_remove(&nvme_instance_ida, ctrl->instance);
+
+	nvme_free_cels(ctrl);
 	nvme_mpath_uninit(ctrl);
 	__free_page(ctrl->discard_page);
 
 	if (subsys) {
-		mutex_lock(&subsys->lock);
+		mutex_lock(&nvme_subsystems_lock);
 		list_del(&ctrl->subsys_entry);
-		mutex_unlock(&subsys->lock);
 		sysfs_remove_link(&subsys->dev.kobj, dev_name(ctrl->device));
+		mutex_unlock(&nvme_subsystems_lock);
 	}
 
 	ctrl->ops->free_ctrl(ctrl);
@@ -3654,14 +4590,17 @@
 	spin_lock_init(&ctrl->lock);
 	mutex_init(&ctrl->scan_lock);
 	INIT_LIST_HEAD(&ctrl->namespaces);
+	xa_init(&ctrl->cels);
 	init_rwsem(&ctrl->namespaces_rwsem);
 	ctrl->dev = dev;
 	ctrl->ops = ops;
 	ctrl->quirks = quirks;
+	ctrl->numa_node = NUMA_NO_NODE;
 	INIT_WORK(&ctrl->scan_work, nvme_scan_work);
 	INIT_WORK(&ctrl->async_event_work, nvme_async_event_work);
 	INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work);
 	INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work);
+	init_waitqueue_head(&ctrl->state_wq);
 
 	INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
 	memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
@@ -3692,6 +4631,7 @@
 	if (ret)
 		goto out_release_instance;
 
+	nvme_get_ctrl(ctrl);
 	cdev_init(&ctrl->cdev, &nvme_dev_fops);
 	ctrl->cdev.owner = ops->module;
 	ret = cdev_device_add(&ctrl->cdev, ctrl->device);
@@ -3706,8 +4646,12 @@
 	dev_pm_qos_update_user_latency_tolerance(ctrl->device,
 		min(default_ps_max_latency_us, (unsigned long)S32_MAX));
 
+	nvme_fault_inject_init(&ctrl->fault_inject, dev_name(ctrl->device));
+	nvme_mpath_init_ctrl(ctrl);
+
 	return 0;
 out_free_name:
+	nvme_put_ctrl(ctrl);
 	kfree_const(ctrl->device->kobj.name);
 out_release_instance:
 	ida_simple_remove(&nvme_instance_ida, ctrl->instance);
@@ -3753,7 +4697,7 @@
 }
 EXPORT_SYMBOL_GPL(nvme_unfreeze);
 
-void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
+int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
 {
 	struct nvme_ns *ns;
 
@@ -3764,6 +4708,7 @@
 			break;
 	}
 	up_read(&ctrl->namespaces_rwsem);
+	return timeout;
 }
 EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
 
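
With the return type change above, nvme_wait_freeze_timeout() hands back the unused part of the timeout, so a caller can tell a freeze that completed from one that ran out of time. A toy model of that caller-side check; the wait function here is a stand-in, not the driver's:

#include <stdio.h>

/* Pretend the wait consumed a fixed 30 units of the budget. */
static long wait_freeze_timeout(long timeout)
{
	return timeout - 30;
}

int main(void)
{
	if (wait_freeze_timeout(25) <= 0)
		printf("freeze timed out, take the recovery path\n");
	else
		printf("all queues frozen in time\n");
	return 0;
}
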
@@ -3811,9 +4756,65 @@
 }
 EXPORT_SYMBOL_GPL(nvme_start_queues);
 
-int __init nvme_core_init(void)
+void nvme_sync_io_queues(struct nvme_ctrl *ctrl)
+{
+	struct nvme_ns *ns;
+
+	down_read(&ctrl->namespaces_rwsem);
+	list_for_each_entry(ns, &ctrl->namespaces, list)
+		blk_sync_queue(ns->queue);
+	up_read(&ctrl->namespaces_rwsem);
+}
+EXPORT_SYMBOL_GPL(nvme_sync_io_queues);
+
+void nvme_sync_queues(struct nvme_ctrl *ctrl)
+{
+	nvme_sync_io_queues(ctrl);
+	if (ctrl->admin_q)
+		blk_sync_queue(ctrl->admin_q);
+}
+EXPORT_SYMBOL_GPL(nvme_sync_queues);
+
+struct nvme_ctrl *nvme_ctrl_from_file(struct file *file)
+{
+	if (file->f_op != &nvme_dev_fops)
+		return NULL;
+	return file->private_data;
+}
+EXPORT_SYMBOL_NS_GPL(nvme_ctrl_from_file, NVME_TARGET_PASSTHRU);
+
+/*
+ * Check we didn't inadvertently grow the command structure sizes:
+ */
+static inline void _nvme_check_size(void)
+{
+	BUILD_BUG_ON(sizeof(struct nvme_common_command) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_identify) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_download_firmware) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_dsm_cmd) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_write_zeroes_cmd) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_get_log_page_command) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE);
+	BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE);
+	BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) != NVME_IDENTIFY_DATA_SIZE);
+	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) != NVME_IDENTIFY_DATA_SIZE);
+	BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
+	BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_directive_cmd) != 64);
+}
+
+static int __init nvme_core_init(void)
 {
 	int result = -ENOMEM;
+
+	_nvme_check_size();
 
 	nvme_wq = alloc_workqueue("nvme-wq",
 			WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
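
BUILD_BUG_ON() in _nvme_check_size() turns a wrong structure size into a compile error, so these checks cost nothing at run time. A user-space sketch of the same idea using C11 static_assert; the structure below is a stand-in, not a real NVMe command:

#include <assert.h>
#include <stdint.h>

/* A made-up 64-byte command layout used only to show the size check. */
struct example_command {
	uint8_t  opcode;
	uint8_t  flags;
	uint16_t command_id;
	uint32_t nsid;
	uint8_t  pad[56];
};

/* Fails the build, not the run, if the layout drifts from 64 bytes. */
static_assert(sizeof(struct example_command) == 64,
	      "example_command must stay 64 bytes");

int main(void)
{
	return 0;
}
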
@@ -3839,6 +4840,7 @@
 		result = PTR_ERR(nvme_class);
 		goto unregister_chrdev;
 	}
+	nvme_class->dev_uevent = nvme_class_uevent;
 
 	nvme_subsys_class = class_create(THIS_MODULE, "nvme-subsystem");
38444846 if (IS_ERR(nvme_subsys_class)) {
....@@ -3861,15 +4863,15 @@
38614863 return result;
38624864 }
38634865
3864
-void nvme_core_exit(void)
4866
+static void __exit nvme_core_exit(void)
38654867 {
3866
- ida_destroy(&nvme_subsystems_ida);
38674868 class_destroy(nvme_subsys_class);
38684869 class_destroy(nvme_class);
38694870 unregister_chrdev_region(nvme_chr_devt, NVME_MINORS);
38704871 destroy_workqueue(nvme_delete_wq);
38714872 destroy_workqueue(nvme_reset_wq);
38724873 destroy_workqueue(nvme_wq);
4874
+ ida_destroy(&nvme_instance_ida);
38734875 }
38744876
38754877 MODULE_LICENSE("GPL");